1Upstream-Status: Inappropriate [OE-specific: carries downstream RPI-Distro changes not intended for upstream FFmpeg]
2
3The RPI-Distro repo clones the original FFmpeg and applies patches to enable
4Raspberry Pi support.
5
6--- a/configure
7+++ b/configure
8@@ -207,6 +207,7 @@ External library support:
9   --disable-bzlib          disable bzlib [autodetect]
10   --disable-coreimage      disable Apple CoreImage framework [autodetect]
11   --enable-chromaprint     enable audio fingerprinting with chromaprint [no]
12+  --disable-epoxy          disable epoxy [autodetect]
13   --enable-frei0r          enable frei0r video filtering [no]
14   --enable-gcrypt          enable gcrypt, needed for rtmp(t)e support
15                            if openssl, librtmp or gmp is not used [no]
16@@ -274,6 +275,7 @@ External library support:
17   --enable-libtls          enable LibreSSL (via libtls), needed for https support
18                            if openssl, gnutls or mbedtls is not used [no]
19   --enable-libtwolame      enable MP2 encoding via libtwolame [no]
20+  --disable-libudev        disable libudev [autodetect]
21   --enable-libv4l2         enable libv4l2/v4l-utils [no]
22   --enable-libvidstab      enable video stabilization using vid.stab [no]
23   --enable-libvmaf         enable vmaf filter via libvmaf [no]
24@@ -336,12 +338,17 @@ External library support:
25   --enable-libmfx          enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
26   --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
27   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
28+  --enable-rpi             enable other rpi specific stuff [no]
29+  --enable-sand            enable sand video formats [rpi]
30+  --enable-vout-drm        enable the vout_drm module - for internal testing only [no]
31+  --enable-vout-egl        enable the vout_egl module - for internal testing only [no]
32   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
33   --disable-nvenc          disable Nvidia video encoding code [autodetect]
34   --enable-omx             enable OpenMAX IL code [no]
35   --enable-omx-rpi         enable OpenMAX IL code for Raspberry Pi [no]
36   --enable-rkmpp           enable Rockchip Media Process Platform code [no]
37   --disable-v4l2-m2m       disable V4L2 mem2mem code [autodetect]
38+  --enable-v4l2-request    enable V4L2 request API code [no]
39   --disable-vaapi          disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
40   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
41   --disable-videotoolbox   disable VideoToolbox code [autodetect]
42@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
43     avfoundation
44     bzlib
45     coreimage
46+    epoxy
47     iconv
48+    libudev
49     libxcb
50     libxcb_shm
51     libxcb_shape
52@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST="
53     mmal
54     omx
55     opencl
56+    v4l2_request
57     vulkan
58+    rpi4_8
59+    rpi4_10
60 "
61
62 DOCUMENT_LIST="
63@@ -1877,12 +1889,16 @@ FEATURE_LIST="
64     gray
65     hardcoded_tables
66     omx_rpi
67+    rpi
68     runtime_cpudetect
69     safe_bitstream_reader
70+    sand
71     shared
72     small
73     static
74     swscale_alpha
75+    vout_drm
76+    vout_egl
77 "
78
79 # this list should be kept in linking order
80@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST="
81     pixelutils
82     network
83     rdft
84+    rpi
85 "
86
87 # COMPONENT_LIST needs to come last to ensure correct dependency checking
88@@ -2405,9 +2422,11 @@ CONFIG_EXTRA="
89     rangecoder
90     riffdec
91     riffenc
92+    rpi
93     rtpdec
94     rtpenc_chain
95     rv34dsp
96+    sand
97     scene_sad
98     sinewin
99     snappy
100@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp"
101 hap_encoder_deps="libsnappy"
102 hap_encoder_select="texturedspenc"
103 hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
104+hevc_rpi_decoder_deps="rpi"
105+hevc_rpi_decoder_select="hevc_decoder sand"
106 huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
107 huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
108 hymt_decoder_select="huffyuv_decoder"
109@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder
110 dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
111 ffnvcodec_deps_any="libdl LoadLibrary"
112 nvdec_deps="ffnvcodec"
113+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
114 vaapi_x11_deps="xlib"
115 videotoolbox_hwaccel_deps="videotoolbox pthreads"
116 videotoolbox_hwaccel_extralibs="-framework QuartzCore"
117@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP
118 hevc_dxva2_hwaccel_select="hevc_decoder"
119 hevc_nvdec_hwaccel_deps="nvdec"
120 hevc_nvdec_hwaccel_select="hevc_decoder"
121+hevc_v4l2request_hwaccel_deps="v4l2_request"
122+hevc_v4l2request_hwaccel_select="hevc_decoder"
123+hevc_rpi4_10_hwaccel_deps="rpi"
124+hevc_rpi4_10_hwaccel_select="hevc_decoder"
125+hevc_rpi4_8_hwaccel_deps="rpi"
126+hevc_rpi4_8_hwaccel_select="hevc_decoder"
127 hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
128 hevc_vaapi_hwaccel_select="hevc_decoder"
129 hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
130@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio"
131 sndio_outdev_deps="sndio"
132 v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
133 v4l2_indev_suggest="libv4l2"
134+v4l2_outdev_deps="libdrm"
135 v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
136 v4l2_outdev_suggest="libv4l2"
137+vout_drm_outdev_deps="libdrm"
138+vout_egl_outdev_deps="xlib epoxy"
139+vout_rpi_outdev_deps="rpi"
140+vout_rpi_outdev_select="sand"
141 vfwcap_indev_deps="vfw32 vfwcap_defines"
142 xcbgrab_indev_deps="libxcb"
143 xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
144@@ -3618,6 +3651,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcF
145 tonemap_opencl_filter_deps="opencl const_nan"
146 transpose_opencl_filter_deps="opencl"
147 transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
148+unsand_filter_select="sand"
149 unsharp_opencl_filter_deps="opencl"
150 uspp_filter_deps="gpl avcodec"
151 vaguedenoiser_filter_deps="gpl"
152@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob
153 enabled xlib &&
154     check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
155
156+enabled libudev &&
157+    check_pkg_config libudev libudev libudev.h udev_new
158+
159+enabled epoxy &&
160+    check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
161+
162 check_headers direct.h
163 check_headers dirent.h
164 check_headers dxgidebug.h
165@@ -6430,11 +6470,12 @@ enabled mbedtls           && { check_pkg
166                                check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
167                                die "ERROR: mbedTLS not found"; }
168 enabled mediacodec        && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
169-enabled mmal              && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
170+( enabled rpi ||
171+  enabled mmal )          && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
172                                { ! enabled cross_compile &&
173                                  add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
174                                  add_ldflags -L/opt/vc/lib/ &&
175-                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
176+                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
177                                die "ERROR: mmal not found" &&
178                                check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
179 enabled openal            && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
180@@ -6475,8 +6516,16 @@ enabled rkmpp             && { require_p
181                                { enabled libdrm ||
182                                  die "ERROR: rkmpp requires --enable-libdrm"; }
183                              }
184+enabled v4l2_request      && { enabled libdrm ||
185+                               die "ERROR: v4l2-request requires --enable-libdrm"; } &&
186+                             { enabled libudev ||
187+                               die "ERROR: v4l2-request requires libudev"; }
188 enabled vapoursynth       && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
189
190+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
191+
192+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
193+                    { enabled xlib  || die "ERROR: vout_egl requires xlib"; }
194
195 if enabled gcrypt; then
196     GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
197@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then
198     check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
199 fi
200
201+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
202+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
203 check_headers sys/videoio.h
204 test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
205
206--- a/fftools/ffmpeg.c
207+++ b/fftools/ffmpeg.c
208@@ -2119,8 +2119,8 @@ static int ifilter_send_frame(InputFilte
209                        ifilter->channel_layout != frame->channel_layout;
210         break;
211     case AVMEDIA_TYPE_VIDEO:
212-        need_reinit |= ifilter->width  != frame->width ||
213-                       ifilter->height != frame->height;
214+        need_reinit |= ifilter->width  != av_frame_cropped_width(frame) ||
215+                       ifilter->height != av_frame_cropped_height(frame);
216         break;
217     }
218
219@@ -2131,6 +2131,9 @@ static int ifilter_send_frame(InputFilte
220         (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
221         need_reinit = 1;
222
223+    if (no_cvt_hw && fg->graph)
224+        need_reinit = 0;
225+
226     if (need_reinit) {
227         ret = ifilter_parameters_from_frame(ifilter, frame);
228         if (ret < 0)
229@@ -2401,8 +2404,7 @@ static int decode_video(InputStream *ist
230         decoded_frame->top_field_first = ist->top_field_first;
231
232     ist->frames_decoded++;
233-
234-    if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
235+    if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
236         err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
237         if (err < 0)
238             goto fail;
239@@ -2600,7 +2602,12 @@ static int process_input_packet(InputStr
240         case AVMEDIA_TYPE_VIDEO:
241             ret = decode_video    (ist, repeating ? NULL : &avpkt, &got_output, &duration_pts, !pkt,
242                                    &decode_failed);
243-            if (!repeating || !pkt || got_output) {
244+            // Pi: Do not inc dts if no_cvt_hw set
245+            // V4L2 H264 decode has long latency and sometimes spits out a long
246+            // stream of output without input. In this case incrementing DTS is wrong.
247+            // There may be cases where the condition as written is correct so only
248+            // "fix" in the cases which cause problems
249+            if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
250                 if (pkt && pkt->duration) {
251                     duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
252                 } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
253@@ -2820,6 +2827,16 @@ static enum AVPixelFormat get_format(AVC
254         } else {
255             const HWAccel *hwaccel = NULL;
256             int i;
257+
258+            if (no_cvt_hw) {
259+                config = avcodec_get_hw_config(s->codec, 0);
260+                if (config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
261+                    av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p);
262+                    ist->hwaccel_pix_fmt = *p;
263+                    break;
264+                }
265+            }
266+
267             for (i = 0; hwaccels[i].name; i++) {
268                 if (hwaccels[i].pix_fmt == *p) {
269                     hwaccel = &hwaccels[i];
270@@ -2914,6 +2931,15 @@ static int init_input_stream(int ist_ind
271             return ret;
272         }
273
274+#if CONFIG_HEVC_RPI_DECODER
275+        ret = -1;
276+        if (strcmp(codec->name, "hevc_rpi") == 0 &&
277+            (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
278+            ist->dec = codec = avcodec_find_decoder_by_name("hevc");
279+            av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n");
280+        }
281+        if (ret < 0)
282+#endif
283         if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
284             if (ret == AVERROR_EXPERIMENTAL)
285                 abort_codec_experimental(codec, 0);
286--- a/fftools/ffmpeg.h
287+++ b/fftools/ffmpeg.h
288@@ -61,6 +61,7 @@ enum HWAccelID {
289     HWACCEL_GENERIC,
290     HWACCEL_VIDEOTOOLBOX,
291     HWACCEL_QSV,
292+    HWACCEL_RPI,
293 };
294
295 typedef struct HWAccel {
296@@ -590,6 +591,7 @@ extern int video_sync_method;
297 extern float frame_drop_threshold;
298 extern int do_benchmark;
299 extern int do_benchmark_all;
300+extern int no_cvt_hw;
301 extern int do_deinterlace;
302 extern int do_hex_dump;
303 extern int do_pkt_dump;
304--- a/fftools/ffmpeg_filter.c
305+++ b/fftools/ffmpeg_filter.c
306@@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputF
307
308     ifilter->format = frame->format;
309
310-    ifilter->width               = frame->width;
311-    ifilter->height              = frame->height;
312+    ifilter->width               = av_frame_cropped_width(frame);
313+    ifilter->height              = av_frame_cropped_height(frame);
314     ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
315
316     ifilter->sample_rate         = frame->sample_rate;
317--- a/fftools/ffmpeg_hw.c
318+++ b/fftools/ffmpeg_hw.c
319@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum
320     char *name;
321     size_t index_pos;
322     int index, index_limit = 1000;
323+    if (!type_name)
324+        return NULL;
325     index_pos = strlen(type_name);
326     name = av_malloc(index_pos + 4);
327     if (!name)
328--- a/fftools/ffmpeg_opt.c
329+++ b/fftools/ffmpeg_opt.c
330@@ -130,6 +130,12 @@ static const char *opt_name_enc_time_bas
331     }\
332 }
333
334+#if CONFIG_RPI
335+static int rpi_init(AVCodecContext *avctx) {
336+    return 0;
337+}
338+#endif
339+
340 const HWAccel hwaccels[] = {
341 #if CONFIG_VIDEOTOOLBOX
342     { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX },
343@@ -137,6 +143,10 @@ const HWAccel hwaccels[] = {
344 #if CONFIG_LIBMFX
345     { "qsv",   qsv_init,   HWACCEL_QSV,   AV_PIX_FMT_QSV },
346 #endif
347+#if CONFIG_RPI
348+    {  "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 },
349+    {  "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 },
350+#endif
351     { 0 },
352 };
353 HWDevice *filter_hw_device;
354@@ -155,6 +165,7 @@ float frame_drop_threshold = 0;
355 int do_deinterlace    = 0;
356 int do_benchmark      = 0;
357 int do_benchmark_all  = 0;
358+int no_cvt_hw         = 0;
359 int do_hex_dump       = 0;
360 int do_pkt_dump       = 0;
361 int copy_ts           = 0;
362@@ -3460,6 +3471,8 @@ const OptionDef options[] = {
363         "add timings for benchmarking" },
364     { "benchmark_all",  OPT_BOOL | OPT_EXPERT,                       { &do_benchmark_all },
365       "add timings for each task" },
366+    { "no_cvt_hw",      OPT_BOOL | OPT_EXPERT,                       { &no_cvt_hw },
367+      "do not auto-convert hw frames to sw" },
368     { "progress",       HAS_ARG | OPT_EXPERT,                        { .func_arg = opt_progress },
369       "write program-readable progress information", "url" },
370     { "stdin",          OPT_BOOL | OPT_EXPERT,                       { &stdin_interaction },
371--- a/libavcodec/Makefile
372+++ b/libavcodec/Makefile
373@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h
374           mediacodec.h                                                  \
375           packet.h                                                      \
376           qsv.h                                                         \
377+          rpi_zc.h                                                      \
378           vaapi.h                                                       \
379           vdpau.h                                                       \
380           version.h                                                     \
381@@ -138,6 +139,7 @@ OBJS-$(CONFIG_QSVDEC)                  +
382 OBJS-$(CONFIG_QSVENC)                  += qsvenc.o
383 OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
384 OBJS-$(CONFIG_RDFT)                    += rdft.o
385+OBJS-$(CONFIG_RPI)                     += rpi_qpu.o rpi_mailbox.o rpi_zc.o
386 OBJS-$(CONFIG_RV34DSP)                 += rv34dsp.o
387 OBJS-$(CONFIG_SHARED)                  += log2_tab.o reverse.o
388 OBJS-$(CONFIG_SINEWIN)                 += sinewin.o sinewin_fixed.o
389@@ -152,7 +154,10 @@ OBJS-$(CONFIG_VIDEODSP)                +
390 OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
391 OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
392 OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
393-OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
394+OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
395+                                          weak_link.o
396+OBJS-$(CONFIG_V4L2_REQUEST)            += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
397+					  v4l2_req_devscan.o weak_link.o
398 OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
399 OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
400
401@@ -391,6 +396,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER)        +
402 OBJS-$(CONFIG_HEVC_QSV_ENCODER)        += qsvenc_hevc.o hevc_ps_enc.o       \
403                                           hevc_data.o
404 OBJS-$(CONFIG_HEVC_RKMPP_DECODER)      += rkmppdec.o
405+OBJS-$(CONFIG_RPI)                     += rpi_mem.o \
406+                                          rpi_mailbox.o rpi_zc.o
407+OBJS-$(CONFIG_HEVC_RPI_DECODER)        += rpi_hevcdec.o rpi_hevc_mvs.o \
408+                                          rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o    \
409+                                          rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o    \
410+                                          rpi_hevc_shader.o rpi_hevc_shader_template.o       \
411+                                          rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
412+                                          rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o
413 OBJS-$(CONFIG_HEVC_VAAPI_ENCODER)      += vaapi_encode_h265.o h265_profile_level.o
414 OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER)    += v4l2_m2m_dec.o
415 OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER)    += v4l2_m2m_enc.o
416@@ -909,6 +922,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)
417 OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
418 OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
419 OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec_h2645.o
420+OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL)        += rpivid_hevc.o
421+OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL)       += rpivid_hevc.o
422+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
423+                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o  v4l2_req_hevc_v4.o
424 OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
425 OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o
426 OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
427@@ -1261,3 +1278,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
428 $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
429 $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
430 endif
431+
432+ifdef CONFIG_HEVC_RPI_DECODER
433+QASM_PY := ../local/bin/qasm.py
434+VASMVIDCORE := ../local/bin/vasmvidcore_std
435+
436+ifneq ("$(wildcard $(QASM_PY))","")
437+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
438+	$(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
439+
440+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
441+	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
442+endif
443+
444+ifneq ("$(wildcard $(VASMVIDCORE))","")
445+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
446+	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
447+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
448+	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
449+
450+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
451+	python pi-util/make_array.py $<
452+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
453+	python pi-util/make_array.py $<
454+endif
455+
456+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
457+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
458+endif
459--- a/libavcodec/aarch64/Makefile
460+++ b/libavcodec/aarch64/Makefile
461@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED)
462 NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
463                                            aarch64/hpeldsp_neon.o
464 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
465-NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
466+NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
467+                                           aarch64/simple_idct_neon.o
468 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
469 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
470 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
471+NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
472 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
473
474 # decoders/encoders
475--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
476+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
477@@ -27,19 +27,29 @@
478 #include "libavcodec/idctdsp.h"
479 #include "idct.h"
480
481+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
482+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
483+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
484+
485 av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
486                                      unsigned high_bit_depth)
487 {
488     int cpu_flags = av_get_cpu_flags();
489
490-    if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
491-        if (avctx->idct_algo == FF_IDCT_AUTO ||
492-            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
493-            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
494-            c->idct_put  = ff_simple_idct_put_neon;
495-            c->idct_add  = ff_simple_idct_add_neon;
496-            c->idct      = ff_simple_idct_neon;
497-            c->perm_type = FF_IDCT_PERM_PARTTRANS;
498+    if (have_neon(cpu_flags)) {
499+        if (!avctx->lowres && !high_bit_depth) {
500+            if (avctx->idct_algo == FF_IDCT_AUTO ||
501+                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
502+                avctx->idct_algo == FF_IDCT_SIMPLENEON) {
503+                c->idct_put  = ff_simple_idct_put_neon;
504+                c->idct_add  = ff_simple_idct_add_neon;
505+                c->idct      = ff_simple_idct_neon;
506+                c->perm_type = FF_IDCT_PERM_PARTTRANS;
507+            }
508         }
509+
510+        c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
511+        c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
512+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
513     }
514 }
515--- /dev/null
516+++ b/libavcodec/aarch64/idctdsp_neon.S
517@@ -0,0 +1,130 @@
518+/*
519+ * IDCT AArch64 NEON optimisations
520+ *
521+ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
522+ *
523+ * This file is part of FFmpeg.
524+ *
525+ * FFmpeg is free software; you can redistribute it and/or
526+ * modify it under the terms of the GNU Lesser General Public
527+ * License as published by the Free Software Foundation; either
528+ * version 2.1 of the License, or (at your option) any later version.
529+ *
530+ * FFmpeg is distributed in the hope that it will be useful,
531+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
532+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
533+ * Lesser General Public License for more details.
534+ *
535+ * You should have received a copy of the GNU Lesser General Public
536+ * License along with FFmpeg; if not, write to the Free Software
537+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
538+ */
539+
540+#include "libavutil/aarch64/asm.S"
541+
542+// Clamp 16-bit signed block coefficients to unsigned 8-bit
543+// On entry:
544+//   x0 -> array of 64x 16-bit coefficients
545+//   x1 -> 8-bit results
546+//   x2 = row stride for results, bytes
547+function ff_put_pixels_clamped_neon, export=1
548+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
549+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]
550+        sqxtun          v0.8b, v0.8h
551+        sqxtun          v1.8b, v1.8h
552+        sqxtun          v2.8b, v2.8h
553+        sqxtun          v3.8b, v3.8h
554+        sqxtun          v4.8b, v4.8h
555+        st1             {v0.8b}, [x1], x2
556+        sqxtun          v0.8b, v5.8h
557+        st1             {v1.8b}, [x1], x2
558+        sqxtun          v1.8b, v6.8h
559+        st1             {v2.8b}, [x1], x2
560+        sqxtun          v2.8b, v7.8h
561+        st1             {v3.8b}, [x1], x2
562+        st1             {v4.8b}, [x1], x2
563+        st1             {v0.8b}, [x1], x2
564+        st1             {v1.8b}, [x1], x2
565+        st1             {v2.8b}, [x1]
566+        ret
567+endfunc
568+
569+// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
570+// On entry:
571+//   x0 -> array of 64x 16-bit coefficients
572+//   x1 -> 8-bit results
573+//   x2 = row stride for results, bytes
574+function ff_put_signed_pixels_clamped_neon, export=1
575+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
576+        movi            v4.8b, #128
577+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
578+        sqxtn           v0.8b, v0.8h
579+        sqxtn           v1.8b, v1.8h
580+        sqxtn           v2.8b, v2.8h
581+        sqxtn           v3.8b, v3.8h
582+        sqxtn           v5.8b, v16.8h
583+        add             v0.8b, v0.8b, v4.8b
584+        sqxtn           v6.8b, v17.8h
585+        add             v1.8b, v1.8b, v4.8b
586+        sqxtn           v7.8b, v18.8h
587+        add             v2.8b, v2.8b, v4.8b
588+        sqxtn           v16.8b, v19.8h
589+        add             v3.8b, v3.8b, v4.8b
590+        st1             {v0.8b}, [x1], x2
591+        add             v0.8b, v5.8b, v4.8b
592+        st1             {v1.8b}, [x1], x2
593+        add             v1.8b, v6.8b, v4.8b
594+        st1             {v2.8b}, [x1], x2
595+        add             v2.8b, v7.8b, v4.8b
596+        st1             {v3.8b}, [x1], x2
597+        add             v3.8b, v16.8b, v4.8b
598+        st1             {v0.8b}, [x1], x2
599+        st1             {v1.8b}, [x1], x2
600+        st1             {v2.8b}, [x1], x2
601+        st1             {v3.8b}, [x1]
602+        ret
603+endfunc
604+
605+// Add 16-bit signed block coefficients to unsigned 8-bit
606+// On entry:
607+//   x0 -> array of 64x 16-bit coefficients
608+//   x1 -> 8-bit input and results
609+//   x2 = row stride for 8-bit input and results, bytes
610+function ff_add_pixels_clamped_neon, export=1
611+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
612+        mov             x3, x1
613+        ld1             {v4.8b}, [x1], x2
614+        ld1             {v5.8b}, [x1], x2
615+        ld1             {v6.8b}, [x1], x2
616+        ld1             {v7.8b}, [x1], x2
617+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
618+        uaddw           v0.8h, v0.8h, v4.8b
619+        uaddw           v1.8h, v1.8h, v5.8b
620+        uaddw           v2.8h, v2.8h, v6.8b
621+        ld1             {v4.8b}, [x1], x2
622+        uaddw           v3.8h, v3.8h, v7.8b
623+        ld1             {v5.8b}, [x1], x2
624+        sqxtun          v0.8b, v0.8h
625+        ld1             {v6.8b}, [x1], x2
626+        sqxtun          v1.8b, v1.8h
627+        ld1             {v7.8b}, [x1]
628+        sqxtun          v2.8b, v2.8h
629+        sqxtun          v3.8b, v3.8h
630+        uaddw           v4.8h, v16.8h, v4.8b
631+        st1             {v0.8b}, [x3], x2
632+        uaddw           v0.8h, v17.8h, v5.8b
633+        st1             {v1.8b}, [x3], x2
634+        uaddw           v1.8h, v18.8h, v6.8b
635+        st1             {v2.8b}, [x3], x2
636+        uaddw           v2.8h, v19.8h, v7.8b
637+        sqxtun          v4.8b, v4.8h
638+        sqxtun          v0.8b, v0.8h
639+        st1             {v3.8b}, [x3], x2
640+        sqxtun          v1.8b, v1.8h
641+        sqxtun          v2.8b, v2.8h
642+        st1             {v4.8b}, [x3], x2
643+        st1             {v0.8b}, [x3], x2
644+        st1             {v1.8b}, [x3], x2
645+        st1             {v2.8b}, [x3]
646+        ret
647+endfunc
648--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
649+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
650@@ -21,10 +21,28 @@
651 #include "libavutil/attributes.h"
652 #include "libavutil/cpu.h"
653 #include "libavutil/aarch64/cpu.h"
654+#include "libavutil/intreadwrite.h"
655 #include "libavcodec/vc1dsp.h"
656
657 #include "config.h"
658
659+void ff_vc1_inv_trans_8x8_neon(int16_t *block);
660+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
661+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
662+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
663+
664+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
665+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
666+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
667+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
668+
669+void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
670+void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
671+void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
672+void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
673+void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
674+void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
675+
676 void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
677                                 int h, int x, int y);
678 void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
679@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
680 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
681                                 int h, int x, int y);
682
683+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
684+
685+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
686+{
687+    /* Dealing with starting and stopping, and removing escape bytes, are
688+     * comparatively less time-sensitive, so are more clearly expressed using
689+     * a C wrapper around the assembly inner loop. Note that we assume a
690+     * little-endian machine that supports unaligned loads. */
691+    int dsize = 0;                              // bytes written to dst so far
692+    while (size >= 4)
693+    {
694+        int found = 0;
695+        while (!found && (((uintptr_t) dst) & 7) && size >= 4)  // byte-copy until dst is 8-byte aligned for the NEON helper
696+        {
697+            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;  // true for escape sequence 00 00 03 {00..03} (little-endian load)
698+            if (!found)
699+            {
700+                *dst++ = *src++;
701+                --size;
702+                ++dsize;
703+            }
704+        }
705+        if (!found)
706+        {
707+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);  // NOTE(review): helper appears to return the unprocessed remainder, so skip = bytes it copied — confirm against the asm
708+            dst += skip;
709+            src += skip;
710+            size -= skip;
711+            dsize += skip;
712+            while (!found && size >= 4)         // scalar re-check where the helper stopped
713+            {
714+                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;  // same 00 00 03 0x escape test as above
715+                if (!found)
716+                {
717+                    *dst++ = *src++;
718+                    --size;
719+                    ++dsize;
720+                }
721+            }
722+        }
723+        if (found)
724+        {
725+            *dst++ = *src++;                    // keep the two 0x00 bytes...
726+            *dst++ = *src++;
727+            ++src;                              // ...and drop the 0x03 escape byte
728+            size -= 3;
729+            dsize += 2;
730+        }
731+    }
732+    while (size > 0)                            // copy the trailing < 4 bytes verbatim
733+    {
734+        *dst++ = *src++;
735+        --size;
736+        ++dsize;
737+    }
738+    return dsize;                               // length of the unescaped output
739+}
740+
741 av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
742 {
743     int cpu_flags = av_get_cpu_flags();
744
745     if (have_neon(cpu_flags)) {
746+        dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;     // NEON inverse transforms
747+        dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
748+        dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
749+        dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
750+        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;  // DC-only fast paths
751+        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
752+        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
753+        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
754+
755+        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;  // NEON in-loop deblocking filters
756+        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
757+        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
758+        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
759+        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
760+        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
761+
762         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
763         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
764         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
765         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
766+
767+        dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;    // NEON-assisted startcode unescape (wrapper above)
768     }
769 }
770--- /dev/null
771+++ b/libavcodec/aarch64/vc1dsp_neon.S
772@@ -0,0 +1,1546 @@
773+/*
774+ * VC1 AArch64 NEON optimisations
775+ *
776+ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
777+ *
778+ * This file is part of FFmpeg.
779+ *
780+ * FFmpeg is free software; you can redistribute it and/or
781+ * modify it under the terms of the GNU Lesser General Public
782+ * License as published by the Free Software Foundation; either
783+ * version 2.1 of the License, or (at your option) any later version.
784+ *
785+ * FFmpeg is distributed in the hope that it will be useful,
786+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
787+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
788+ * Lesser General Public License for more details.
789+ *
790+ * You should have received a copy of the GNU Lesser General Public
791+ * License along with FFmpeg; if not, write to the Free Software
792+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
793+ */
794+
795+#include "libavutil/aarch64/asm.S"
796+
797+// VC-1 8x8 inverse transform
798+// On entry:
799+//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
800+// On exit:
801+//   array at x0 updated to hold transformed block; also now held in row-major order
802+function ff_vc1_inv_trans_8x8_neon, export=1
803+        ld1             {v1.16b, v2.16b}, [x0], #32     // v1 = src[0..7], v2 = src[8..15]
804+        ld1             {v3.16b, v4.16b}, [x0], #32     // v3 = src[16..23], v4 = src[24..31]
805+        ld1             {v5.16b, v6.16b}, [x0], #32     // v5 = src[32..39], v6 = src[40..47]
806+        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
807+        sub             x1, x0, #3*32           // x1 -> start of coefficient array (for writeback)
808+        ld1             {v16.16b, v17.16b}, [x0]        // v16 = src[48..55], v17 = src[56..63]
809+        shl             v7.8h, v2.8h, #4        //          16 * src[8]
810+        shl             v18.8h, v2.8h, #2       //           4 * src[8]
811+        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
812+        ldr             d0, .Lcoeffs_it8        // v0.h[0..2] = 3 (6/2), 9, 15
813+        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
814+        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
815+        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
816+        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
817+        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
818+        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
819+        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
820+        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
821+        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
822+        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
823+        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
824+        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
825+        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
826+        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
827+        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
828+        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
829+        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
830+        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
831+        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
832+        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
833+        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
834+        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
835+        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
836+        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
837+        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
838+        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
839+        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
840+        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
841+        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
842+        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
843+        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
844+        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
845+        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
846+        neg             v3.8h, v7.8h            // -t1
847+        neg             v4.8h, v20.8h           // +t2
848+        neg             v6.8h, v19.8h           // +t3
849+        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
850+        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
851+        neg             v7.8h, v18.8h           // +t4
852+        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
853+        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
854+        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
855+        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
856+        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
857+        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
858+        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
859+        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
860+        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
861+        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
862+        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
863+        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
864+        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
865+        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
866+        trn2            v17.8h, v3.8h, v4.8h    // 8x8 transpose (rows <-> columns) over the following trn1/trn2 sequence
867+        trn2            v18.8h, v5.8h, v6.8h
868+        trn2            v19.8h, v2.8h, v1.8h
869+        trn2            v20.8h, v7.8h, v16.8h
870+        trn1            v21.4s, v17.4s, v18.4s
871+        trn2            v17.4s, v17.4s, v18.4s
872+        trn1            v18.4s, v19.4s, v20.4s
873+        trn2            v19.4s, v19.4s, v20.4s
874+        trn1            v3.8h, v3.8h, v4.8h
875+        trn2            v4.2d, v21.2d, v18.2d
876+        trn1            v20.2d, v17.2d, v19.2d
877+        trn1            v5.8h, v5.8h, v6.8h
878+        trn1            v1.8h, v2.8h, v1.8h
879+        trn1            v2.8h, v7.8h, v16.8h
880+        trn1            v6.2d, v21.2d, v18.2d
881+        trn2            v7.2d, v17.2d, v19.2d
882+        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
883+        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
884+        trn1            v18.4s, v3.4s, v5.4s
885+        trn1            v19.4s, v1.4s, v2.4s
886+        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
887+        shl             v22.8h, v6.8h, #2       //           4 * src[8]
888+        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
889+        trn2            v3.4s, v3.4s, v5.4s
890+        trn2            v1.4s, v1.4s, v2.4s
891+        shl             v2.8h, v6.8h, #4        //          16 * src[8]
892+        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
893+        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
894+        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
895+        trn1            v22.2d, v18.2d, v19.2d
896+        trn2            v18.2d, v18.2d, v19.2d
897+        trn1            v19.2d, v3.2d, v1.2d
898+        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
899+        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
900+        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
901+        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
902+        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
903+        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
904+        trn2            v1.2d, v3.2d, v1.2d     // final transpose step: v1 = column of src[48..]
905+        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
906+        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
907+        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
908+        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
909+        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
910+        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
911+        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
912+        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
913+        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
914+        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
915+        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
916+        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
917+        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
918+        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
919+        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
920+        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
921+        neg             v21.8h, v17.8h          // +t2
922+        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
923+        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
924+        neg             v4.8h, v5.8h            // +t3
925+        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
926+        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
927+        neg             v24.8h, v16.8h          // +t4
928+        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
929+        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
930+        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
931+        neg             v3.8h, v2.8h            // -t1
932+        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
933+        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
934+        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
935+        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
936+        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
937+        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
938+        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
939+        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
940+        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
941+        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
942+        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
943+        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
944+        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
945+        st1             {v2.16b, v3.16b}, [x1], #32     // store rows 0-1 back over the input block
946+        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
947+        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
948+        st1             {v4.16b, v5.16b}, [x1], #32     // store rows 2-3
949+        st1             {v16.16b, v17.16b}, [x1], #32   // store rows 4-5
950+        st1             {v0.16b, v1.16b}, [x1]          // store rows 6-7
951+        ret
952+endfunc
953+
954+// VC-1 8x4 inverse transform
955+// On entry:
956+//   x0 -> array of 8-bit samples, in row-major order
957+//   x1 = row stride for 8-bit sample array
958+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order
959+// On exit:
960+//   array at x0 updated by saturated addition of (narrowed) transformed block
961+function ff_vc1_inv_trans_8x4_neon, export=1
962+        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32        // v1..v4 = src[0..15] (rows 0-1)
963+        mov             x3, x0                  // x3 = dest pointer for the final stores
964+        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]         // v16..v19 = src[16..31] (rows 2-3)
965+        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
966+        ld1             {v5.8b}, [x0], x1       // dest row 0
967+        trn2            v6.4h, v1.4h, v3.4h     // transpose coefficients into column vectors for the 8-point pass
968+        trn2            v7.4h, v2.4h, v4.4h
969+        trn1            v1.4h, v1.4h, v3.4h
970+        trn1            v2.4h, v2.4h, v4.4h
971+        trn2            v3.4h, v16.4h, v18.4h
972+        trn2            v4.4h, v17.4h, v19.4h
973+        trn1            v16.4h, v16.4h, v18.4h
974+        trn1            v17.4h, v17.4h, v19.4h
975+        ld1             {v18.8b}, [x0], x1      // dest row 1
976+        trn1            v19.2s, v6.2s, v3.2s
977+        trn2            v3.2s, v6.2s, v3.2s
978+        trn1            v6.2s, v7.2s, v4.2s
979+        trn2            v4.2s, v7.2s, v4.2s
980+        trn1            v7.2s, v1.2s, v16.2s
981+        trn1            v20.2s, v2.2s, v17.2s
982+        shl             v21.4h, v19.4h, #4      //          16 * src[1]
983+        trn2            v1.2s, v1.2s, v16.2s
984+        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
985+        trn2            v2.2s, v2.2s, v17.2s
986+        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
987+        ld1             {v22.8b}, [x0], x1      // dest row 2
988+        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
989+        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
990+        ld1             {v25.8b}, [x0]          // dest row 3
991+        shl             v26.4h, v19.4h, #2      //           4 * src[1]
992+        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
993+        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
994+        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
995+        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
996+        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
997+        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
998+        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
999+        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
1000+        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
1001+        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
1002+        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
1003+        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
1004+        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
1005+        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
1006+        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
1007+        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
1008+        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
1009+        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
1010+        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
1011+        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
1012+        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
1013+        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
1014+        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
1015+        neg             v6.4h, v21.4h           // -t1
1016+        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
1017+        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
1018+        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
1019+        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
1020+        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
1021+        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
1022+        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
1023+        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
1024+        neg             v3.4h, v17.4h           // +t2
1025+        neg             v4.4h, v16.4h           // +t3
1026+        neg             v28.4h, v23.4h          // +t4
1027+        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
1028+        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
1029+        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
1030+        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
1031+        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
1032+        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
1033+        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
1034+        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
1035+        trn1            v1.2d, v7.2d, v1.2d     // pack result pairs so one srshr rounds two sets at once
1036+        trn1            v2.2d, v20.2d, v2.2d
1037+        trn1            v3.2d, v24.2d, v27.2d
1038+        trn1            v4.2d, v19.2d, v26.2d
1039+        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
1040+        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
1041+        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
1042+        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
1043+        trn2            v6.8h, v1.8h, v2.8h     // transpose back for the vertical 4-point pass
1044+        trn1            v1.8h, v1.8h, v2.8h
1045+        trn2            v2.8h, v3.8h, v4.8h
1046+        trn1            v3.8h, v3.8h, v4.8h
1047+        trn2            v4.4s, v6.4s, v2.4s
1048+        trn1            v7.4s, v1.4s, v3.4s
1049+        trn2            v1.4s, v1.4s, v3.4s
1050+        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
1051+        trn1            v2.4s, v6.4s, v2.4s
1052+        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
1053+        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
1054+        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
1055+        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
1056+        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
1057+        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
1058+        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
1059+        neg             v2.8h, v3.8h            // -t4/2
1060+        neg             v6.8h, v4.8h            // -t3/2
1061+        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
1062+        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
1063+        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
1064+        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
1065+        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
1066+        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
1067+        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
1068+        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
1069+        uaddw           v0.8h, v0.8h, v5.8b     // add destination samples (widened to 16-bit)
1070+        uaddw           v1.8h, v1.8h, v18.8b
1071+        uaddw           v2.8h, v2.8h, v22.8b
1072+        uaddw           v3.8h, v3.8h, v25.8b
1073+        sqxtun          v0.8b, v0.8h            // saturate back to unsigned 8-bit
1074+        sqxtun          v1.8b, v1.8h
1075+        sqxtun          v2.8b, v2.8h
1076+        sqxtun          v3.8b, v3.8h
1077+        st1             {v0.8b}, [x3], x1       // store row 0
1078+        st1             {v1.8b}, [x3], x1       // store row 1
1079+        st1             {v2.8b}, [x3], x1       // store row 2
1080+        st1             {v3.8b}, [x3]           // store row 3
1081+        ret
1082+endfunc
1083+
1084+// VC-1 4x8 inverse transform
1085+// On entry:
1086+//   x0 -> array of 8-bit samples, in row-major order
1087+//   x1 = row stride for 8-bit sample array
1088+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
1089+// On exit:
1090+//   array at x0 updated by saturated addition of (narrowed) transformed block
1091+function ff_vc1_inv_trans_4x8_neon, export=1
1092+        mov             x3, #16
1093+        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
1094+        mov             x4, x0
1095+        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
1096+        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
1097+        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
1098+        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
1099+        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
1100+        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
1101+        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
1102+        ld1             {v4.d}[1], [x2]         // 70 71 72 73
1103+        ld1             {v5.s}[0], [x0], x1
1104+        ld1             {v6.s}[0], [x0], x1
1105+        ld1             {v7.s}[0], [x0], x1
1106+        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
1107+        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
1108+        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
1109+        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
1110+        ld1             {v4.s}[0], [x0], x1
1111+        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
1112+        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
1113+        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
1114+        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
1115+        ld1             {v5.s}[1], [x0], x1
1116+        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
1117+        ld1             {v6.s}[1], [x0], x1
1118+        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
1119+        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
1120+        ld1             {v7.s}[1], [x0], x1
1121+        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
1122+        ld1             {v4.s}[1], [x0]
1123+        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
1124+        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
1125+        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
1126+        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
1127+        neg             v3.8h, v16.8h           // -t3/2
1128+        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
1129+        neg             v18.8h, v17.8h          // -t4/2
1130+        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
1131+        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
1132+        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
1133+        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 64) >> 3
1134+        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 64) >> 3
1135+        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 64) >> 3
1136+        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 64) >> 3
1137+        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
1138+        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
1139+        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
1140+        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
1141+        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
1142+        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
1143+        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
1144+        mov             d18, v3.d[1]            // 50 51 52 53
1145+        shl             v19.4h, v3.4h, #4       //          16 * src[8]
1146+        mov             d20, v16.d[1]           // 70 71 72 73
1147+        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
1148+        mov             d22, v17.d[1]           // 40 41 42 43
1149+        shl             v23.4h, v3.4h, #2       //           4 * src[8]
1150+        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
1151+        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
1152+        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
1153+        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
1154+        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
1155+        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
1156+        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
1157+        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
1158+        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
1159+        mov             d23, v1.d[1]            // 60 61 62 63
1160+        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
1161+        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
1162+        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
1163+        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
1164+        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
1165+        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
1166+        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
1167+        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
1168+        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
1169+        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
1170+        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
1171+        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
1172+        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
1173+        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
1174+        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
1175+        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
1176+        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
1177+        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
1178+        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
1179+        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
1180+        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
1181+        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
1182+        neg             v23.4h, v24.4h          // +t2
1183+        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
1184+        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
1185+        neg             v17.4h, v21.4h          // +t3
1186+        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
1187+        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
1188+        neg             v16.4h, v19.4h          // -t1
1189+        neg             v27.4h, v2.4h           // +t4
1190+        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
1191+        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
1192+        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
1193+        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
1194+        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
1195+        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
1196+        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
1197+        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
1198+        trn1            v0.2d, v20.2d, v0.2d
1199+        trn1            v2.2d, v18.2d, v22.2d
1200+        trn1            v3.2d, v25.2d, v3.2d
1201+        trn1            v1.2d, v26.2d, v1.2d
1202+        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
1203+        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
1204+        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
1205+        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
1206+        uaddw           v0.8h, v0.8h, v5.8b
1207+        uaddw           v2.8h, v2.8h, v6.8b
1208+        uaddw           v3.8h, v3.8h, v7.8b
1209+        uaddw           v1.8h, v1.8h, v4.8b
1210+        sqxtun          v0.8b, v0.8h
1211+        sqxtun          v2.8b, v2.8h
1212+        sqxtun          v3.8b, v3.8h
1213+        sqxtun          v1.8b, v1.8h
1214+        st1             {v0.s}[0], [x4], x1
1215+        st1             {v2.s}[0], [x4], x1
1216+        st1             {v3.s}[0], [x4], x1
1217+        st1             {v1.s}[0], [x4], x1
1218+        st1             {v0.s}[1], [x4], x1
1219+        st1             {v2.s}[1], [x4], x1
1220+        st1             {v3.s}[1], [x4], x1
1221+        st1             {v1.s}[1], [x4]
1222+        ret
1223+endfunc
1224+
1225+// VC-1 4x4 inverse transform
1226+// On entry:
1227+//   x0 -> array of 8-bit samples, in row-major order
1228+//   x1 = row stride for 8-bit sample array
1229+//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
1230+// On exit:
1231+//   array at x0 updated by saturated addition of (narrowed) transformed block
1232+function ff_vc1_inv_trans_4x4_neon, export=1
1233+        mov             x3, #16                 // row stride of coefficient array, in bytes
1234+        ldr             d0, .Lcoeffs_it4        // d0 = 16-bit lanes {5, 11, 17}
1235+        mov             x4, x0                  // preserve destination base; x0 advances during loads
1236+        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
1237+        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
1238+        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
1239+        ld1             {v4.d}[0], [x2]         // 30 31 32 33
1240+        ld1             {v5.s}[0], [x0], x1     // destination rows 0-3, 4 bytes each
1241+        ld1             {v5.s}[1], [x0], x1
1242+        ld1             {v6.s}[0], [x0], x1
1243+        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
1244+        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
1245+        ld1             {v6.s}[1], [x0]
1246+        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
1247+        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
1248+        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
1249+        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
1250+        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
1251+        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
1252+        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
1253+        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
1254+        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
1255+        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
1256+        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
1257+        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
1258+        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
1259+        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
1260+        neg             v7.4h, v3.4h            // -t3/2
1261+        neg             v16.4h, v4.4h           // -t4/2
1262+        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
1263+        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
1264+        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
1265+        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
1266+        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 64) >> 3
1267+        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 64) >> 3
1268+        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 64) >> 3
1269+        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 64) >> 3
1270+        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
1271+        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
1272+        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
1273+        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
1274+        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
1275+        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
1276+        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
1277+        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
1278+        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
1279+        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
1280+        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
1281+        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
1282+        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
1283+        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
1284+        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
1285+        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
1286+        neg             v3.4h, v2.4h            // -t4/2
1287+        neg             v7.4h, v4.4h            // -t3/2
1288+        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
1289+        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
1290+        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
1291+        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
1292+        trn1            v0.2d, v4.2d, v3.2d     // pack two 4h half-results so one shift handles both
1293+        trn1            v1.2d, v2.2d, v7.2d
1294+        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
1295+        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
1296+        uaddw           v0.8h, v0.8h, v5.8b     // add destination rows 0-1
1297+        uaddw           v1.8h, v1.8h, v6.8b     // add destination rows 2-3
1298+        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
1299+        sqxtun          v1.8b, v1.8h
1300+        st1             {v0.s}[0], [x4], x1     // store updated rows 0..3
1301+        st1             {v0.s}[1], [x4], x1
1302+        st1             {v1.s}[0], [x4], x1
1303+        st1             {v1.s}[1], [x4]
1304+        ret
1305+endfunc
1306+
1307+// VC-1 8x8 inverse transform, DC case
1308+// On entry:
1309+//   x0 -> array of 8-bit samples, in row-major order
1310+//   x1 = row stride for 8-bit sample array
1311+//   x2 -> 16-bit inverse transform DC coefficient
1312+// On exit:
1313+//   array at x0 updated by saturated addition of (narrowed) transformed block
1314+function ff_vc1_inv_trans_8x8_dc_neon, export=1
1315+        ldrsh           w2, [x2]                // w2 = dc (sign-extended)
1316+        mov             x3, x0                  // preserve destination base for the stores
1317+        ld1             {v0.8b}, [x0], x1       // load the 8 destination rows, interleaved
1318+        ld1             {v1.8b}, [x0], x1       // with the scalar DC rounding below
1319+        ld1             {v2.8b}, [x0], x1
1320+        add             w2, w2, w2, lsl #1      // w2 = 3 * dc
1321+        ld1             {v3.8b}, [x0], x1
1322+        ld1             {v4.8b}, [x0], x1
1323+        add             w2, w2, #1
1324+        ld1             {v5.8b}, [x0], x1
1325+        asr             w2, w2, #1              // dc = (3 * dc + 1) >> 1
1326+        ld1             {v6.8b}, [x0], x1
1327+        add             w2, w2, w2, lsl #1      // w2 = 3 * dc
1328+        ld1             {v7.8b}, [x0]
1329+        add             w0, w2, #16
1330+        asr             w0, w0, #5              // dc = (3 * dc + 16) >> 5
1331+        dup             v16.8h, w0              // broadcast rounded DC to all 8 lanes
1332+        uaddw           v0.8h, v16.8h, v0.8b    // add DC to each widened destination row
1333+        uaddw           v1.8h, v16.8h, v1.8b
1334+        uaddw           v2.8h, v16.8h, v2.8b
1335+        uaddw           v3.8h, v16.8h, v3.8b
1336+        uaddw           v4.8h, v16.8h, v4.8b
1337+        uaddw           v5.8h, v16.8h, v5.8b
1338+        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
1339+        uaddw           v6.8h, v16.8h, v6.8b
1340+        sqxtun          v1.8b, v1.8h
1341+        uaddw           v7.8h, v16.8h, v7.8b
1342+        sqxtun          v2.8b, v2.8h
1343+        sqxtun          v3.8b, v3.8h
1344+        sqxtun          v4.8b, v4.8h
1345+        st1             {v0.8b}, [x3], x1       // store updated rows 0..7
1346+        sqxtun          v0.8b, v5.8h
1347+        st1             {v1.8b}, [x3], x1
1348+        sqxtun          v1.8b, v6.8h
1349+        st1             {v2.8b}, [x3], x1
1350+        sqxtun          v2.8b, v7.8h
1351+        st1             {v3.8b}, [x3], x1
1352+        st1             {v4.8b}, [x3], x1
1353+        st1             {v0.8b}, [x3], x1
1354+        st1             {v1.8b}, [x3], x1
1355+        st1             {v2.8b}, [x3]
1356+        ret
1357+endfunc
1358+
1359+// VC-1 8x4 inverse transform, DC case
1360+// On entry:
1361+//   x0 -> array of 8-bit samples, in row-major order
1362+//   x1 = row stride for 8-bit sample array
1363+//   x2 -> 16-bit inverse transform DC coefficient
1364+// On exit:
1365+//   array at x0 updated by saturated addition of (narrowed) transformed block
1366+function ff_vc1_inv_trans_8x4_dc_neon, export=1
1367+        ldrsh           w2, [x2]                // w2 = dc (sign-extended)
1368+        mov             x3, x0                  // preserve destination base for the stores
1369+        ld1             {v0.8b}, [x0], x1       // load the 4 destination rows
1370+        ld1             {v1.8b}, [x0], x1
1371+        ld1             {v2.8b}, [x0], x1
1372+        add             w2, w2, w2, lsl #1      // w2 = 3 * dc
1373+        ld1             {v3.8b}, [x0]
1374+        add             w0, w2, #1
1375+        asr             w0, w0, #1              // dc = (3 * dc + 1) >> 1
1376+        add             w0, w0, w0, lsl #4      // w0 = 17 * dc
1377+        add             w0, w0, #64
1378+        asr             w0, w0, #7              // dc = (17 * dc + 64) >> 7
1379+        dup             v4.8h, w0               // broadcast rounded DC to all 8 lanes
1380+        uaddw           v0.8h, v4.8h, v0.8b     // add DC to each widened destination row
1381+        uaddw           v1.8h, v4.8h, v1.8b
1382+        uaddw           v2.8h, v4.8h, v2.8b
1383+        uaddw           v3.8h, v4.8h, v3.8b
1384+        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
1385+        sqxtun          v1.8b, v1.8h
1386+        sqxtun          v2.8b, v2.8h
1387+        sqxtun          v3.8b, v3.8h
1388+        st1             {v0.8b}, [x3], x1       // store updated rows 0..3
1389+        st1             {v1.8b}, [x3], x1
1390+        st1             {v2.8b}, [x3], x1
1391+        st1             {v3.8b}, [x3]
1392+        ret
1393+endfunc
1394+
1395+// VC-1 4x8 inverse transform, DC case
1396+// On entry:
1397+//   x0 -> array of 8-bit samples, in row-major order
1398+//   x1 = row stride for 8-bit sample array
1399+//   x2 -> 16-bit inverse transform DC coefficient
1400+// On exit:
1401+//   array at x0 updated by saturated addition of (narrowed) transformed block
1402+function ff_vc1_inv_trans_4x8_dc_neon, export=1
1403+        ldrsh           w2, [x2]                // w2 = dc (sign-extended)
1404+        mov             x3, x0                  // preserve destination base for the stores
1405+        ld1             {v0.s}[0], [x0], x1     // load the 8 destination rows, 4 bytes each
1406+        ld1             {v1.s}[0], [x0], x1
1407+        ld1             {v2.s}[0], [x0], x1
1408+        add             w2, w2, w2, lsl #4      // w2 = 17 * dc
1409+        ld1             {v3.s}[0], [x0], x1
1410+        add             w2, w2, #4
1411+        asr             w2, w2, #3              // dc = (17 * dc + 4) >> 3
1412+        add             w2, w2, w2, lsl #1      // w2 = 3 * dc
1413+        ld1             {v0.s}[1], [x0], x1
1414+        add             w2, w2, #16
1415+        asr             w2, w2, #5              // dc = (3 * dc + 16) >> 5
1416+        dup             v4.8h, w2               // broadcast rounded DC to all 8 lanes
1417+        ld1             {v1.s}[1], [x0], x1
1418+        ld1             {v2.s}[1], [x0], x1
1419+        ld1             {v3.s}[1], [x0]
1420+        uaddw           v0.8h, v4.8h, v0.8b     // add DC to each widened destination row pair
1421+        uaddw           v1.8h, v4.8h, v1.8b
1422+        uaddw           v2.8h, v4.8h, v2.8b
1423+        uaddw           v3.8h, v4.8h, v3.8b
1424+        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
1425+        sqxtun          v1.8b, v1.8h
1426+        sqxtun          v2.8b, v2.8h
1427+        sqxtun          v3.8b, v3.8h
1428+        st1             {v0.s}[0], [x3], x1     // store updated rows 0..7
1429+        st1             {v1.s}[0], [x3], x1
1430+        st1             {v2.s}[0], [x3], x1
1431+        st1             {v3.s}[0], [x3], x1
1432+        st1             {v0.s}[1], [x3], x1
1433+        st1             {v1.s}[1], [x3], x1
1434+        st1             {v2.s}[1], [x3], x1
1435+        st1             {v3.s}[1], [x3]
1436+        ret
1437+endfunc
1438+
1439+// VC-1 4x4 inverse transform, DC case
1440+// On entry:
1441+//   x0 -> array of 8-bit samples, in row-major order
1442+//   x1 = row stride for 8-bit sample array
1443+//   x2 -> 16-bit inverse transform DC coefficient
1444+// On exit:
1445+//   array at x0 updated by saturated addition of (narrowed) transformed block
1446+function ff_vc1_inv_trans_4x4_dc_neon, export=1
1447+        ldrsh           w2, [x2]                // w2 = dc (sign-extended)
1448+        mov             x3, x0                  // preserve destination base for the stores
1449+        ld1             {v0.s}[0], [x0], x1     // load the 4 destination rows, 4 bytes each
1450+        ld1             {v1.s}[0], [x0], x1
1451+        ld1             {v0.s}[1], [x0], x1
1452+        add             w2, w2, w2, lsl #4      // w2 = 17 * dc
1453+        ld1             {v1.s}[1], [x0]
1454+        add             w0, w2, #4
1455+        asr             w0, w0, #3              // dc = (17 * dc + 4) >> 3
1456+        add             w0, w0, w0, lsl #4      // w0 = 17 * dc
1457+        add             w0, w0, #64
1458+        asr             w0, w0, #7              // dc = (17 * dc + 64) >> 7
1459+        dup             v2.8h, w0               // broadcast rounded DC to all 8 lanes
1460+        uaddw           v0.8h, v2.8h, v0.8b     // add DC to each widened destination row pair
1461+        uaddw           v1.8h, v2.8h, v1.8b
1462+        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
1463+        sqxtun          v1.8b, v1.8h
1464+        st1             {v0.s}[0], [x3], x1     // store updated rows 0..3
1465+        st1             {v1.s}[0], [x3], x1
1466+        st1             {v0.s}[1], [x3], x1
1467+        st1             {v1.s}[1], [x3]
1468+        ret
1469+endfunc
1470+
1471+.align  5
1472+.Lcoeffs_it8:                           // 8x8/8x4/4x8 transform constants, 16-bit lanes {3, 9, 15}
1473+.quad   0x000F00090003
1474+.Lcoeffs_it4:                           // 4x4 transform constants, 16-bit lanes {5, 11, 17}
1475+.quad   0x0011000B0005
1476+.Lcoeffs:                               // loop-filter multipliers, 16-bit lanes {2, 5}
1477+.quad   0x00050002
1478+
1479+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
1480+// On entry:
1481+//   x0 -> top-left pel of lower block
1482+//   x1 = row stride, bytes
1483+//   w2 = PQUANT bitstream parameter
1484+function ff_vc1_v_loop_filter4_neon, export=1
1485+        sub             x3, x0, w1, sxtw #2     // x3 -> P1 (4 rows above the block boundary)
1486+        ldr             d0, .Lcoeffs            // d0 = 16-bit lanes {2, 5}
1487+        ld1             {v1.s}[0], [x0], x1     // P5
1488+        ld1             {v2.s}[0], [x3], x1     // P1
1489+        ld1             {v3.s}[0], [x3], x1     // P2
1490+        ld1             {v4.s}[0], [x0], x1     // P6
1491+        ld1             {v5.s}[0], [x3], x1     // P3
1492+        ld1             {v6.s}[0], [x0], x1     // P7
1493+        ld1             {v7.s}[0], [x3]         // P4
1494+        ld1             {v16.s}[0], [x0]        // P8
1495+        ushll           v17.8h, v1.8b, #1       // 2*P5
1496+        dup             v18.8h, w2              // pq
1497+        ushll           v2.8h, v2.8b, #1        // 2*P1
1498+        uxtl            v3.8h, v3.8b            // P2
1499+        uxtl            v4.8h, v4.8b            // P6
1500+        uxtl            v19.8h, v5.8b           // P3
1501+        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
1502+        uxtl            v3.8h, v6.8b            // P7
1503+        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
1504+        ushll           v5.8h, v5.8b, #1        // 2*P3
1505+        uxtl            v6.8h, v7.8b            // P4
1506+        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
1507+        uxtl            v3.8h, v16.8b           // P8
1508+        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
1509+        uxtl            v1.8h, v1.8b            // P5
1510+        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
1511+        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
1512+        sub             v3.4h, v6.4h, v1.4h     // P4-P5
1513+        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
1514+        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
1515+        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
1516+        abs             v4.4h, v3.4h            // |P4-P5|
1517+        srshr           v7.4h, v17.4h, #3       // (2*P5-5*P6+5*P7-2*P8+4)>>3
1518+        srshr           v2.4h, v2.4h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
1519+        sshr            v4.4h, v4.4h, #1        // clip
1520+        srshr           v5.4h, v5.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
1521+        abs             v7.4h, v7.4h            // a2
1522+        sshr            v3.4h, v3.4h, #8        // clip_sign
1523+        abs             v2.4h, v2.4h            // a1
1524+        cmeq            v16.4h, v4.4h, #0       // test clip == 0
1525+        abs             v17.4h, v5.4h           // a0
1526+        sshr            v5.4h, v5.4h, #8        // a0_sign
1527+        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
1528+        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
1529+        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
1530+        bsl             v19.8b, v7.8b, v2.8b    // a3
1531+        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
1532+        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1533+        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
1534+        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
1535+        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
1536+        mov             w0, v5.s[1]             // move to gp reg
1537+        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
1538+        cmhs            v5.4h, v0.4h, v4.4h     // test d >= clip
1539+        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
1540+        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
1541+        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1542+        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1543+        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1544+        sqxtun          v0.8b, v6.8h            // narrow updated P4 with unsigned saturation
1545+        sqxtun          v1.8b, v1.8h            // narrow updated P5 with unsigned saturation
1546+        st1             {v0.s}[0], [x3], x1     // store updated P4
1547+        st1             {v1.s}[0], [x3]         // store updated P5
1548+1:      ret
1549+endfunc
1550+
1551+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
1552+// On entry:
1553+//   x0 -> top-left pel of right block
1554+//   x1 = row stride, bytes
1555+//   w2 = PQUANT bitstream parameter
1556+function ff_vc1_h_loop_filter4_neon, export=1
1557+        sub             x3, x0, #4              // where to start reading
1558+        ldr             d0, .Lcoeffs            // d0 = 16-bit lanes {2, 5}
1559+        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
1560+        sub             x0, x0, #1              // where to start writing
1561+        ld1             {v2.8b}, [x3], x1
1562+        ld1             {v3.8b}, [x3], x1
1563+        ld1             {v4.8b}, [x3]
1564+        dup             v5.8h, w2               // pq
1565+        trn1            v6.8b, v1.8b, v2.8b     // transpose rows to columns: P1/P3 pairs
1566+        trn2            v1.8b, v1.8b, v2.8b     // P2/P4 pairs
1567+        trn1            v2.8b, v3.8b, v4.8b
1568+        trn2            v3.8b, v3.8b, v4.8b
1569+        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
1570+        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
1571+        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
1572+        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
1573+        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
1574+        uxtl            v6.8h, v7.8b            // P2, P6
1575+        uxtl            v7.8h, v2.8b            // P3, P7
1576+        uxtl            v1.8h, v1.8b            // P4, P8
1577+        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
1578+        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
1579+        uxtl            v4.8h, v4.8b            // P1, P5
1580+        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
1581+        mov             d6, v6.d[1]             // P6
1582+        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
1583+        mov             d4, v4.d[1]             // P5
1584+        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
1585+        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
1586+        sub             v7.4h, v1.4h, v4.4h     // P4-P5
1587+        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
1588+        srshr           v3.8h, v3.8h, #3        // rounded by 4, shifted down 3
1589+        abs             v6.4h, v7.4h            // |P4-P5|
1590+        sshr            v7.4h, v7.4h, #8        // clip_sign
1591+        srshr           v2.4h, v2.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
1592+        abs             v3.8h, v3.8h            // a1, a2
1593+        sshr            v6.4h, v6.4h, #1        // clip
1594+        mov             d16, v3.d[1]            // a2
1595+        abs             v17.4h, v2.4h           // a0
1596+        cmeq            v18.4h, v6.4h, #0       // test clip == 0
1597+        sshr            v2.4h, v2.4h, #8        // a0_sign
1598+        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
1599+        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
1600+        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
1601+        bsl             v19.8b, v16.8b, v3.8b   // a3
1602+        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
1603+        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1604+        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
1605+        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
1606+        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
1607+        mov             w2, v5.s[1]             // move to gp reg
1608+        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
1609+        cmhs            v5.4h, v0.4h, v6.4h     // test d >= clip
1610+        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
1611+        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
1612+        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1613+        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1614+        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1615+        sqxtun          v3.8b, v4.8h            // narrow updated P5 with unsigned saturation
1616+        sqxtun          v2.8b, v1.8h            // narrow updated P4 with unsigned saturation
1617+        st2             {v2.b, v3.b}[0], [x0], x1       // interleaved store of updated P4/P5 columns
1618+        st2             {v2.b, v3.b}[1], [x0], x1
1619+        st2             {v2.b, v3.b}[2], [x0], x1
1620+        st2             {v2.b, v3.b}[3], [x0]
1621+1:      ret
1622+endfunc
1623+
1624+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
1625+// On entry:
1626+//   x0 -> top-left pel of lower block
1627+//   x1 = row stride, bytes
1628+//   w2 = PQUANT bitstream parameter
1629+function ff_vc1_v_loop_filter8_neon, export=1
1630+        sub             x3, x0, w1, sxtw #2     // x3 -> P1 (4 rows above the block boundary)
1631+        ldr             d0, .Lcoeffs            // d0 = 16-bit lanes {2, 5}
1632+        ld1             {v1.8b}, [x0], x1       // P5
1633+        movi            v2.2d, #0x0000ffff00000000      // mask selecting the 2nd pair of each group of 4
1634+        ld1             {v3.8b}, [x3], x1       // P1
1635+        ld1             {v4.8b}, [x3], x1       // P2
1636+        ld1             {v5.8b}, [x0], x1       // P6
1637+        ld1             {v6.8b}, [x3], x1       // P3
1638+        ld1             {v7.8b}, [x0], x1       // P7
1639+        ushll           v16.8h, v1.8b, #1       // 2*P5
1640+        ushll           v3.8h, v3.8b, #1        // 2*P1
1641+        ld1             {v17.8b}, [x3]          // P4
1642+        uxtl            v4.8h, v4.8b            // P2
1643+        ld1             {v18.8b}, [x0]          // P8
1644+        uxtl            v5.8h, v5.8b            // P6
1645+        dup             v19.8h, w2              // pq
1646+        uxtl            v20.8h, v6.8b           // P3
1647+        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
1648+        uxtl            v4.8h, v7.8b            // P7
1649+        ushll           v6.8h, v6.8b, #1        // 2*P3
1650+        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
1651+        uxtl            v7.8h, v17.8b           // P4
1652+        uxtl            v17.8h, v18.8b          // P8
1653+        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
1654+        uxtl            v1.8h, v1.8b            // P5
1655+        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
1656+        sub             v4.8h, v7.8h, v1.8h     // P4-P5
1657+        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
1658+        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
1659+        abs             v17.8h, v4.8h           // |P4-P5|
1660+        sshr            v4.8h, v4.8h, #8        // clip_sign
1661+        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
1662+        sshr            v17.8h, v17.8h, #1      // clip
1663+        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
1664+        srshr           v16.8h, v16.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
1665+        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
1666+        cmeq            v5.8h, v17.8h, #0       // test clip == 0
1667+        srshr           v3.8h, v3.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
1668+        abs             v16.8h, v16.8h          // a2
1669+        abs             v3.8h, v3.8h            // a1
1670+        srshr           v6.8h, v6.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
1671+        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
1672+        abs             v20.8h, v6.8h           // a0
1673+        sshr            v6.8h, v6.8h, #8        // a0_sign
1674+        bsl             v18.16b, v16.16b, v3.16b // a3
1675+        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
1676+        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
1677+        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1678+        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
1679+        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
1680+        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
1681+        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
1682+        cmtst           v2.2d, v5.2d, v2.2d     // if the 2nd pair of each group of 4 is not filtered, then none of the others in the group should be either
1683+        mov             w0, v5.s[1]             // move to gp reg
1684+        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
1685+        mov             w2, v5.s[3]             // gating flag for the second group of 4
1686+        orr             v2.16b, v3.16b, v2.16b
1687+        cmhs            v3.8h, v0.8h, v17.8h    // test d >= clip
1688+        and             w0, w0, w2              // both groups must be filtered to proceed
1689+        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
1690+        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
1691+        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
1692+        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1693+        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1694+        sqxtun          v0.8b, v7.8h            // narrow updated P4 with unsigned saturation
1695+        sqxtun          v1.8b, v1.8h            // narrow updated P5 with unsigned saturation
1696+        st1             {v0.8b}, [x3], x1       // store updated P4
1697+        st1             {v1.8b}, [x3]           // store updated P5
1698+1:      ret
1699+endfunc
1700+
1701+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
1702+// On entry:
1703+//   x0 -> top-left pel of right block
1704+//   x1 = row stride, bytes
1705+//   w2 = PQUANT bitstream parameter
1706+function ff_vc1_h_loop_filter8_neon, export=1
1707+        sub             x3, x0, #4              // where to start reading
1708+        ldr             d0, .Lcoeffs
1709+        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
1710+        sub             x0, x0, #1              // where to start writing
1711+        ld1             {v2.8b}, [x3], x1
1712+        add             x4, x0, x1, lsl #2
1713+        ld1             {v3.8b}, [x3], x1
1714+        ld1             {v4.8b}, [x3], x1
1715+        ld1             {v5.8b}, [x3], x1
1716+        ld1             {v6.8b}, [x3], x1
1717+        ld1             {v7.8b}, [x3], x1
1718+        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
1719+        ld1             {v17.8b}, [x3]
1720+        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
1721+        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
1722+        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
1723+        dup             v4.8h, w2               // pq
1724+        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
1725+        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
1726+        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
1727+        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
1728+        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
1729+        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
1730+        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
1731+        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
1732+        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
1733+        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
1734+        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
1735+        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
1736+        trn1            v7.2s, v6.2s, v3.2s     // P1
1737+        trn1            v18.2s, v19.2s, v16.2s  // P2
1738+        trn2            v3.2s, v6.2s, v3.2s     // P5
1739+        trn2            v6.2s, v19.2s, v16.2s   // P6
1740+        trn1            v16.2s, v2.2s, v17.2s   // P3
1741+        trn2            v2.2s, v2.2s, v17.2s    // P7
1742+        ushll           v7.8h, v7.8b, #1        // 2*P1
1743+        trn1            v17.2s, v1.2s, v5.2s    // P4
1744+        ushll           v19.8h, v3.8b, #1       // 2*P5
1745+        trn2            v1.2s, v1.2s, v5.2s     // P8
1746+        uxtl            v5.8h, v18.8b           // P2
1747+        uxtl            v6.8h, v6.8b            // P6
1748+        uxtl            v18.8h, v16.8b          // P3
1749+        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
1750+        uxtl            v2.8h, v2.8b            // P7
1751+        ushll           v5.8h, v16.8b, #1       // 2*P3
1752+        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
1753+        uxtl            v16.8h, v17.8b          // P4
1754+        uxtl            v1.8h, v1.8b            // P8
1755+        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
1756+        uxtl            v2.8h, v3.8b            // P5
1757+        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
1758+        sub             v3.8h, v16.8h, v2.8h    // P4-P5
1759+        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
1760+        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
1761+        abs             v1.8h, v3.8h
1762+        sshr            v3.8h, v3.8h, #8        // clip_sign
1763+        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
1764+        sshr            v1.8h, v1.8h, #1        // clip
1765+        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
1766+        srshr           v17.8h, v19.8h, #3      // rounded >>3 of 2*P5-5*P6+5*P7-2*P8
1767+        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
1768+        cmeq            v6.8h, v1.8h, #0        // test clip == 0
1769+        srshr           v7.8h, v7.8h, #3        // rounded >>3 of 2*P1-5*P2+5*P3-2*P4
1770+        abs             v17.8h, v17.8h          // a2
1771+        abs             v7.8h, v7.8h            // a1
1772+        srshr           v5.8h, v5.8h, #3        // rounded >>3 of 2*P3-5*P4+5*P5-2*P6
1773+        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
1774+        abs             v19.8h, v5.8h           // a0
1775+        sshr            v5.8h, v5.8h, #8        // a0_sign
1776+        bsl             v18.16b, v17.16b, v7.16b // a3
1777+        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
1778+        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
1779+        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1780+        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
1781+        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
1782+        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
1783+        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
1784+        mov             w2, v5.s[1]             // move to gp reg
1785+        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
1786+        mov             w3, v5.s[3]
1787+        cmhs            v5.8h, v0.8h, v1.8h     // test d >= clip
1788+        and             w5, w2, w3
1789+        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
1790+        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
1791+        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
1792+        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1793+        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1794+        sqxtun          v1.8b, v2.8h
1795+        sqxtun          v0.8b, v16.8h
1796+        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
1797+        st2             {v0.b, v1.b}[0], [x0], x1
1798+        st2             {v0.b, v1.b}[1], [x0], x1
1799+        st2             {v0.b, v1.b}[2], [x0], x1
1800+        st2             {v0.b, v1.b}[3], [x0]
1801+1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
1802+        st2             {v0.b, v1.b}[4], [x4], x1
1803+        st2             {v0.b, v1.b}[5], [x4], x1
1804+        st2             {v0.b, v1.b}[6], [x4], x1
1805+        st2             {v0.b, v1.b}[7], [x4]
1806+2:      ret
1807+endfunc
1808+
1809+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
1810+// On entry:
1811+//   x0 -> top-left pel of lower block
1812+//   x1 = row stride, bytes
1813+//   w2 = PQUANT bitstream parameter
1814+function ff_vc1_v_loop_filter16_neon, export=1
1815+        sub             x3, x0, w1, sxtw #2
1816+        ldr             d0, .Lcoeffs
1817+        ld1             {v1.16b}, [x0], x1      // P5
1818+        movi            v2.2d, #0x0000ffff00000000
1819+        ld1             {v3.16b}, [x3], x1      // P1
1820+        ld1             {v4.16b}, [x3], x1      // P2
1821+        ld1             {v5.16b}, [x0], x1      // P6
1822+        ld1             {v6.16b}, [x3], x1      // P3
1823+        ld1             {v7.16b}, [x0], x1      // P7
1824+        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
1825+        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
1826+        ld1             {v18.16b}, [x3]         // P4
1827+        uxtl            v19.8h, v4.8b           // P2[0..7]
1828+        ld1             {v20.16b}, [x0]         // P8
1829+        uxtl            v21.8h, v5.8b           // P6[0..7]
1830+        dup             v22.8h, w2              // pq
1831+        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
1832+        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
1833+        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
1834+        uxtl2           v4.8h, v4.16b           // P2[8..15]
1835+        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
1836+        uxtl2           v5.8h, v5.16b           // P6[8..15]
1837+        uxtl            v23.8h, v6.8b           // P3[0..7]
1838+        uxtl            v24.8h, v7.8b           // P7[0..7]
1839+        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
1840+        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
1841+        uxtl            v25.8h, v18.8b          // P4[0..7]
1842+        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
1843+        uxtl2           v26.8h, v6.16b          // P3[8..15]
1844+        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
1845+        uxtl2           v7.8h, v7.16b           // P7[8..15]
1846+        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
1847+        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1848+        uxtl2           v18.8h, v18.16b         // P4[8..15]
1849+        uxtl            v23.8h, v20.8b          // P8[0..7]
1850+        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
1851+        uxtl            v24.8h, v1.8b           // P5[0..7]
1852+        uxtl2           v20.8h, v20.16b         // P8[8..15]
1853+        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
1854+        uxtl2           v1.8h, v1.16b           // P5[8..15]
1855+        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
1856+        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
1857+        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
1858+        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
1859+        abs             v27.8h, v26.8h
1860+        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
1861+        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
1862+        abs             v28.8h, v7.8h
1863+        sshr            v27.8h, v27.8h, #1      // clip[0..7]
1864+        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
1865+        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
1866+        sshr            v23.8h, v28.8h, #1      // clip[8..15]
1867+        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
1868+        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
1869+        srshr           v17.8h, v17.8h, #3
1870+        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
1871+        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
1872+        srshr           v16.8h, v16.8h, #3
1873+        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
1874+        abs             v17.8h, v17.8h          // a1[0..7]
1875+        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
1876+        srshr           v3.8h, v3.8h, #3
1877+        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
1878+        abs             v16.8h, v16.8h          // a2[0..7]
1879+        srshr           v19.8h, v19.8h, #3
1880+        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
1881+        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
1882+        abs             v3.8h, v3.8h            // a1[8..15]
1883+        srshr           v4.8h, v4.8h, #3
1884+        abs             v19.8h, v19.8h          // a2[8..15]
1885+        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
1886+        srshr           v6.8h, v6.8h, #3
1887+        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8..15]
1888+        abs             v17.8h, v4.8h           // a0[0..7]
1889+        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
1890+        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
1891+        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1892+        abs             v19.8h, v6.8h           // a0[8..15]
1893+        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
1894+        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
1895+        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
1896+        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
1897+        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
1898+        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1899+        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
1900+        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
1901+        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
1902+        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
1903+        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
1904+        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
1905+        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
1906+        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
1907+        cmtst           v17.2d, v5.2d, v2.2d    // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
1908+        mov             w0, v5.s[1]             // move to gp reg
1909+        cmhs            v19.8h, v3.8h, v27.8h   // test d[0..7] >= clip[0..7]
1910+        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
1911+        mov             w2, v5.s[3]
1912+        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
1913+        orr             v16.16b, v20.16b, v17.16b
1914+        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
1915+        cmtst           v2.2d, v5.2d, v2.2d     // same per-group-of-4 propagation for [8..15]
1916+        cmhs            v3.8h, v0.8h, v23.8h    // test d[8..15] >= clip[8..15]
1917+        mov             w4, v5.s[1]
1918+        mov             w5, v5.s[3]
1919+        and             w0, w0, w2
1920+        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
1921+        orr             v2.16b, v7.16b, v2.16b
1922+        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
1923+        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
1924+        and             w2, w4, w5
1925+        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
1926+        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
1927+        and             w0, w0, w2
1928+        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
1929+        sqxtun          v2.8b, v25.8h
1930+        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
1931+        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
1932+        sqxtun          v0.8b, v24.8h
1933+        sqxtun2         v2.16b, v18.8h
1934+        sqxtun2         v0.16b, v1.8h
1935+        st1             {v2.16b}, [x3], x1
1936+        st1             {v0.16b}, [x3]
1937+1:      ret
1938+endfunc
1939+
1940+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
1941+// On entry:
1942+//   x0 -> top-left pel of right block
1943+//   x1 = row stride, bytes
1944+//   w2 = PQUANT bitstream parameter
1945+function ff_vc1_h_loop_filter16_neon, export=1
1946+        sub             x3, x0, #4              // where to start reading
1947+        ldr             d0, .Lcoeffs
1948+        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
1949+        sub             x0, x0, #1              // where to start writing
1950+        ld1             {v2.8b}, [x3], x1
1951+        add             x4, x0, x1, lsl #3
1952+        ld1             {v3.8b}, [x3], x1
1953+        add             x5, x0, x1, lsl #2
1954+        ld1             {v4.8b}, [x3], x1
1955+        add             x6, x4, x1, lsl #2
1956+        ld1             {v5.8b}, [x3], x1
1957+        ld1             {v6.8b}, [x3], x1
1958+        ld1             {v7.8b}, [x3], x1
1959+        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
1960+        ld1             {v17.8b}, [x3], x1
1961+        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
1962+        ld1             {v2.8b}, [x3], x1
1963+        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
1964+        ld1             {v19.8b}, [x3], x1
1965+        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
1966+        ld1             {v4.8b}, [x3], x1
1967+        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
1968+        ld1             {v21.8b}, [x3], x1
1969+        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
1970+        ld1             {v6.8b}, [x3], x1
1971+        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
1972+        ld1             {v23.8b}, [x3], x1
1973+        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
1974+        ld1             {v17.8b}, [x3], x1
1975+        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
1976+        ld1             {v25.8b}, [x3]
1977+        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
1978+        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
1979+        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
1980+        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
1981+        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
1982+        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
1983+        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
1984+        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
1985+        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
1986+        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
1987+        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
1988+        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
1989+        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
1990+        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
1991+        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
1992+        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
1993+        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
1994+        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
1995+        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
1996+        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
1997+        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
1998+        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
1999+        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
2000+        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
2001+        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
2002+        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
2003+        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
2004+        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
2005+        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
2006+        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
2007+        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
2008+        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
2009+        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
2010+        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
2011+        uxtl            v17.8h, v27.8b          // P2[0..7]
2012+        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
2013+        uxtl            v20.8h, v21.8b          // P6[0..7]
2014+        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
2015+        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
2016+        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
2017+        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
2018+        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
2019+        uxtl            v26.8h, v26.8b          // P2[8..15]
2020+        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
2021+        uxtl            v17.8h, v18.8b          // P6[8..15]
2022+        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
2023+        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
2024+        uxtl            v28.8h, v7.8b           // P3[0..7]
2025+        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
2026+        uxtl            v16.8h, v16.8b          // P7[0..7]
2027+        uxtl            v26.8h, v21.8b          // P3[8..15]
2028+        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
2029+        uxtl            v22.8h, v22.8b          // P7[8..15]
2030+        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
2031+        uxtl            v27.8h, v27.8b          // P4[0..7]
2032+        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
2033+        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
2034+        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
2035+        uxtl            v4.8h, v18.8b           // P4[8..15]
2036+        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
2037+        uxtl            v1.8h, v1.8b            // P8[0..7]
2038+        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
2039+        uxtl            v2.8h, v2.8b            // P8[8..15]
2040+        uxtl            v16.8h, v19.8b          // P5[0..7]
2041+        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
2042+        uxtl            v18.8h, v23.8b          // P5[8..15]
2043+        dup             v19.8h, w2              // pq
2044+        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
2045+        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
2046+        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
2047+        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
2048+        abs             v23.8h, v21.8h
2049+        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
2050+        abs             v26.8h, v22.8h
2051+        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
2052+        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
2053+        sshr            v23.8h, v23.8h, #1      // clip[0..7]
2054+        sshr            v26.8h, v26.8h, #1      // clip[8..15]
2055+        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
2056+        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
2057+        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
2058+        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
2059+        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
2060+        srshr           v5.8h, v5.8h, #3
2061+        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
2062+        srshr           v2.8h, v6.8h, #3
2063+        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
2064+        srshr           v6.8h, v24.8h, #3
2065+        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
2066+        abs             v5.8h, v5.8h            // a1[0..7]
2067+        srshr           v24.8h, v25.8h, #3
2068+        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
2069+        abs             v2.8h, v2.8h            // a2[0..7]
2070+        abs             v6.8h, v6.8h            // a1[8..15]
2071+        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
2072+        abs             v17.8h, v24.8h          // a2[8..15]
2073+        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
2074+        srshr           v3.8h, v3.8h, #3
2075+        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
2076+        srshr           v7.8h, v7.8h, #3
2077+        bsl             v20.16b, v2.16b, v5.16b // a3[0..7]
2078+        abs             v2.8h, v3.8h            // a0[8..15]
2079+        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
2080+        bsl             v24.16b, v17.16b, v6.16b // a3[8..15]
2081+        abs             v5.8h, v7.8h            // a0[0..7]
2082+        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
2083+        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
2084+        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
2085+        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
2086+        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
2087+        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
2088+        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
2089+        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
2090+        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
2091+        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
2092+        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
2093+        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
2094+        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
2095+        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
2096+        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
2097+        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
2098+        mov             w7, v2.s[1]
2099+        mov             w8, v2.s[3]
2100+        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
2101+        mov             w2, v5.s[1]             // move to gp reg
2102+        cmhs            v2.8h, v3.8h, v26.8h    // test d[8..15] >= clip[8..15]
2103+        mov             w3, v5.s[3]
2104+        cmhs            v5.8h, v0.8h, v23.8h    // test d[0..7] >= clip[0..7]
2105+        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
2106+        and             w9, w7, w8
2107+        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
2108+        and             w10, w2, w3
2109+        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
2110+        and             w9, w10, w9
2111+        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
2112+        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
2113+        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
2114+        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
2115+        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
2116+        sqxtun          v2.8b, v4.8h
2117+        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
2118+        sqxtun          v0.8b, v27.8h
2119+        sqxtun          v1.8b, v16.8h
2120+        sqxtun          v3.8b, v18.8h
2121+        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
2122+        st2             {v0.b, v1.b}[0], [x0], x1
2123+        st2             {v0.b, v1.b}[1], [x0], x1
2124+        st2             {v0.b, v1.b}[2], [x0], x1
2125+        st2             {v0.b, v1.b}[3], [x0]
2126+1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
2127+        st2             {v0.b, v1.b}[4], [x5], x1
2128+        st2             {v0.b, v1.b}[5], [x5], x1
2129+        st2             {v0.b, v1.b}[6], [x5], x1
2130+        st2             {v0.b, v1.b}[7], [x5]
2131+2:      tbnz            w7, #0, 3f              // none of the third 4 pixel pairs should be updated if so
2132+        st2             {v2.b, v3.b}[0], [x4], x1
2133+        st2             {v2.b, v3.b}[1], [x4], x1
2134+        st2             {v2.b, v3.b}[2], [x4], x1
2135+        st2             {v2.b, v3.b}[3], [x4]
2136+3:      tbnz            w8, #0, 4f              // none of the fourth 4 pixel pairs should be updated if so
2137+        st2             {v2.b, v3.b}[4], [x6], x1
2138+        st2             {v2.b, v3.b}[5], [x6], x1
2139+        st2             {v2.b, v3.b}[6], [x6], x1
2140+        st2             {v2.b, v3.b}[7], [x6]
2141+4:      ret
2142+endfunc
2143+
2144+// Copy at most the specified number of bytes from source to destination buffer,
2145+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
2146+// On entry:
2147+//   x0 -> source buffer
2148+//   w1 = max number of bytes to copy
2149+//   x2 -> destination buffer, optimally 8-byte aligned
2150+// On exit:
2151+//   w0 = number of bytes not copied
2152+function ff_vc1_unescape_buffer_helper_neon, export=1
2153+        // Offset by 80 to screen out cases that are too short for us to handle,
2154+        // and also make it easy to test for loop termination, or to determine
2155+        // whether we need an odd number of half-iterations of the loop.
2156+        subs            w1, w1, #80
2157+        b.mi            90f
2158+
2159+        // Set up useful constants
2160+        movi            v20.4s, #3, lsl #24
2161+        movi            v21.4s, #3, lsl #16
2162+
2163+        tst             w1, #32
2164+        b.ne            1f
2165+
2166+          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
2167+          ext             v25.16b, v0.16b, v1.16b, #1
2168+          ext             v26.16b, v0.16b, v1.16b, #2
2169+          ext             v27.16b, v0.16b, v1.16b, #3
2170+          ext             v29.16b, v1.16b, v2.16b, #1
2171+          ext             v30.16b, v1.16b, v2.16b, #2
2172+          ext             v31.16b, v1.16b, v2.16b, #3
2173+          bic             v24.16b, v0.16b, v20.16b
2174+          bic             v25.16b, v25.16b, v20.16b
2175+          bic             v26.16b, v26.16b, v20.16b
2176+          bic             v27.16b, v27.16b, v20.16b
2177+          bic             v28.16b, v1.16b, v20.16b
2178+          bic             v29.16b, v29.16b, v20.16b
2179+          bic             v30.16b, v30.16b, v20.16b
2180+          bic             v31.16b, v31.16b, v20.16b
2181+          eor             v24.16b, v24.16b, v21.16b
2182+          eor             v25.16b, v25.16b, v21.16b
2183+          eor             v26.16b, v26.16b, v21.16b
2184+          eor             v27.16b, v27.16b, v21.16b
2185+          eor             v28.16b, v28.16b, v21.16b
2186+          eor             v29.16b, v29.16b, v21.16b
2187+          eor             v30.16b, v30.16b, v21.16b
2188+          eor             v31.16b, v31.16b, v21.16b
2189+          cmeq            v24.4s, v24.4s, #0
2190+          cmeq            v25.4s, v25.4s, #0
2191+          cmeq            v26.4s, v26.4s, #0
2192+          cmeq            v27.4s, v27.4s, #0
2193+          add             w1, w1, #32
2194+          b               3f
2195+
2196+1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
2197+        ext             v25.16b, v3.16b, v4.16b, #1
2198+        ext             v26.16b, v3.16b, v4.16b, #2
2199+        ext             v27.16b, v3.16b, v4.16b, #3
2200+        ext             v29.16b, v4.16b, v5.16b, #1
2201+        ext             v30.16b, v4.16b, v5.16b, #2
2202+        ext             v31.16b, v4.16b, v5.16b, #3
2203+        bic             v24.16b, v3.16b, v20.16b
2204+        bic             v25.16b, v25.16b, v20.16b
2205+        bic             v26.16b, v26.16b, v20.16b
2206+        bic             v27.16b, v27.16b, v20.16b
2207+        bic             v28.16b, v4.16b, v20.16b
2208+        bic             v29.16b, v29.16b, v20.16b
2209+        bic             v30.16b, v30.16b, v20.16b
2210+        bic             v31.16b, v31.16b, v20.16b
2211+        eor             v24.16b, v24.16b, v21.16b
2212+        eor             v25.16b, v25.16b, v21.16b
2213+        eor             v26.16b, v26.16b, v21.16b
2214+        eor             v27.16b, v27.16b, v21.16b
2215+        eor             v28.16b, v28.16b, v21.16b
2216+        eor             v29.16b, v29.16b, v21.16b
2217+        eor             v30.16b, v30.16b, v21.16b
2218+        eor             v31.16b, v31.16b, v21.16b
2219+        cmeq            v24.4s, v24.4s, #0
2220+        cmeq            v25.4s, v25.4s, #0
2221+        cmeq            v26.4s, v26.4s, #0
2222+        cmeq            v27.4s, v27.4s, #0
2223+        // Drop through...
2224+2:        mov             v0.16b, v5.16b
2225+          ld1             {v1.16b, v2.16b}, [x0], #32
2226+        cmeq            v28.4s, v28.4s, #0
2227+        cmeq            v29.4s, v29.4s, #0
2228+        cmeq            v30.4s, v30.4s, #0
2229+        cmeq            v31.4s, v31.4s, #0
2230+        orr             v24.16b, v24.16b, v25.16b
2231+        orr             v26.16b, v26.16b, v27.16b
2232+        orr             v28.16b, v28.16b, v29.16b
2233+        orr             v30.16b, v30.16b, v31.16b
2234+          ext             v25.16b, v0.16b, v1.16b, #1
2235+        orr             v22.16b, v24.16b, v26.16b
2236+          ext             v26.16b, v0.16b, v1.16b, #2
2237+          ext             v27.16b, v0.16b, v1.16b, #3
2238+          ext             v29.16b, v1.16b, v2.16b, #1
2239+        orr             v23.16b, v28.16b, v30.16b
2240+          ext             v30.16b, v1.16b, v2.16b, #2
2241+          ext             v31.16b, v1.16b, v2.16b, #3
2242+          bic             v24.16b, v0.16b, v20.16b
2243+          bic             v25.16b, v25.16b, v20.16b
2244+          bic             v26.16b, v26.16b, v20.16b
2245+        orr             v22.16b, v22.16b, v23.16b
2246+          bic             v27.16b, v27.16b, v20.16b
2247+          bic             v28.16b, v1.16b, v20.16b
2248+          bic             v29.16b, v29.16b, v20.16b
2249+          bic             v30.16b, v30.16b, v20.16b
2250+          bic             v31.16b, v31.16b, v20.16b
2251+        addv            s22, v22.4s
2252+          eor             v24.16b, v24.16b, v21.16b
2253+          eor             v25.16b, v25.16b, v21.16b
2254+          eor             v26.16b, v26.16b, v21.16b
2255+          eor             v27.16b, v27.16b, v21.16b
2256+          eor             v28.16b, v28.16b, v21.16b
2257+        mov             w3, v22.s[0]
2258+          eor             v29.16b, v29.16b, v21.16b
2259+          eor             v30.16b, v30.16b, v21.16b
2260+          eor             v31.16b, v31.16b, v21.16b
2261+          cmeq            v24.4s, v24.4s, #0
2262+          cmeq            v25.4s, v25.4s, #0
2263+          cmeq            v26.4s, v26.4s, #0
2264+          cmeq            v27.4s, v27.4s, #0
2265+        cbnz            w3, 90f
2266+        st1             {v3.16b, v4.16b}, [x2], #32
2267+3:          mov             v3.16b, v2.16b
2268+            ld1             {v4.16b, v5.16b}, [x0], #32
2269+          cmeq            v28.4s, v28.4s, #0
2270+          cmeq            v29.4s, v29.4s, #0
2271+          cmeq            v30.4s, v30.4s, #0
2272+          cmeq            v31.4s, v31.4s, #0
2273+          orr             v24.16b, v24.16b, v25.16b
2274+          orr             v26.16b, v26.16b, v27.16b
2275+          orr             v28.16b, v28.16b, v29.16b
2276+          orr             v30.16b, v30.16b, v31.16b
2277+            ext             v25.16b, v3.16b, v4.16b, #1
2278+          orr             v22.16b, v24.16b, v26.16b
2279+            ext             v26.16b, v3.16b, v4.16b, #2
2280+            ext             v27.16b, v3.16b, v4.16b, #3
2281+            ext             v29.16b, v4.16b, v5.16b, #1
2282+          orr             v23.16b, v28.16b, v30.16b
2283+            ext             v30.16b, v4.16b, v5.16b, #2
2284+            ext             v31.16b, v4.16b, v5.16b, #3
2285+            bic             v24.16b, v3.16b, v20.16b
2286+            bic             v25.16b, v25.16b, v20.16b
2287+            bic             v26.16b, v26.16b, v20.16b
2288+          orr             v22.16b, v22.16b, v23.16b
2289+            bic             v27.16b, v27.16b, v20.16b
2290+            bic             v28.16b, v4.16b, v20.16b
2291+            bic             v29.16b, v29.16b, v20.16b
2292+            bic             v30.16b, v30.16b, v20.16b
2293+            bic             v31.16b, v31.16b, v20.16b
2294+          addv            s22, v22.4s
2295+            eor             v24.16b, v24.16b, v21.16b
2296+            eor             v25.16b, v25.16b, v21.16b
2297+            eor             v26.16b, v26.16b, v21.16b
2298+            eor             v27.16b, v27.16b, v21.16b
2299+            eor             v28.16b, v28.16b, v21.16b
2300+          mov             w3, v22.s[0]
2301+            eor             v29.16b, v29.16b, v21.16b
2302+            eor             v30.16b, v30.16b, v21.16b
2303+            eor             v31.16b, v31.16b, v21.16b
2304+            cmeq            v24.4s, v24.4s, #0
2305+            cmeq            v25.4s, v25.4s, #0
2306+            cmeq            v26.4s, v26.4s, #0
2307+            cmeq            v27.4s, v27.4s, #0
2308+          cbnz            w3, 91f
2309+          st1             {v0.16b, v1.16b}, [x2], #32
2310+        subs            w1, w1, #64
2311+        b.pl            2b
2312+
2313+90:     add             w0, w1, #80
2314+        ret
2315+
2316+91:     sub             w1, w1, #32
2317+        b               90b
2318+endfunc
2319--- a/libavcodec/allcodecs.c
2320+++ b/libavcodec/allcodecs.c
2321@@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder;
2322 extern AVCodec ff_hevc_decoder;
2323 extern AVCodec ff_hevc_qsv_decoder;
2324 extern AVCodec ff_hevc_rkmpp_decoder;
2325+extern AVCodec ff_hevc_rpi_decoder;
2326 extern AVCodec ff_hevc_v4l2m2m_decoder;
2327 extern AVCodec ff_hnm4_video_decoder;
2328 extern AVCodec ff_hq_hqa_decoder;
2329@@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_c
2330     }
2331 }
2332
2333+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
2334+{
2335+    const enum AVPixelFormat *pf = p->pix_fmts;
2336+
2337+    // Assume good if we lack info
2338+    if (pf == NULL)
2339+        return 1;
2340+    if (fmt == AV_PIX_FMT_NONE)
2341+        return 0;
2342+
2343+    for (; *pf != AV_PIX_FMT_NONE; ++pf) {
2344+        if (*pf == fmt)
2345+            return 1;
2346+    }
2347+    return 0;
2348+}
2349+
2350+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
2351+{
2352+    const AVCodec *p, *experimental = NULL;
2353+    void *i = 0;
2354+
2355+    id= remap_deprecated_codec_id(id);
2356+    while ((p = av_codec_iterate(&i))) {
2357+        if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
2358+            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
2359+                experimental = p;
2360+            } else
2361+                return (AVCodec *)p;
2362+        }
2363+        p = p->next;
2364+    }
2365+    return (AVCodec *)experimental;
2366+}
2367+
2368 static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
2369 {
2370     const AVCodec *p, *experimental = NULL;
2371--- a/libavcodec/arm/Makefile
2372+++ b/libavcodec/arm/Makefile
2373@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER)             +
2374                                           arm/sbrdsp_init_arm.o
2375 OBJS-$(CONFIG_DCA_DECODER)             += arm/synth_filter_init_arm.o
2376 OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
2377+OBJS-$(CONFIG_HEVC_RPI_DECODER)        += arm/rpi_hevcdsp_init_arm.o    \
2378+                                          arm/rpi_hevcpred_init_arm.o
2379 OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
2380 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
2381 OBJS-$(CONFIG_SBC_ENCODER)             += arm/sbcdsp_init_arm.o
2382@@ -140,10 +142,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        +
2383 NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
2384 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
2385 NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
2386+                                          arm/hevcdsp_idct_neon.o    \
2387                                           arm/hevcdsp_deblock_neon.o    \
2388                                           arm/hevcdsp_idct_neon.o       \
2389                                           arm/hevcdsp_qpel_neon.o       \
2390                                           arm/hevcdsp_sao_neon.o
2391+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER)   += arm/rpi_hevcdsp_init_neon.o    \
2392+                                          arm/rpi_hevc_misc_neon.o       \
2393+                                          arm/rpi_hevcdsp_deblock_neon.o \
2394+                                          arm/rpi_hevcdsp_idct_neon.o    \
2395+                                          arm/rpi_hevcdsp_res8_neon.o    \
2396+                                          arm/rpi_hevcdsp_res16_neon.o   \
2397+                                          arm/rpi_hevcdsp_sao_neon.o     \
2398+                                          arm/rpi_hevcpred_init_neon.o   \
2399+                                          arm/rpi_hevcpred_intra_angular_neon.o \
2400+                                          arm/rpi_hevcpred_intra_dc_neon.o \
2401+                                          arm/rpi_hevcpred_intra_filter_neon.o \
2402+                                          arm/rpi_hevcpred_intra_hv_neon.o \
2403+                                          arm/rpi_hevcpred_intra_planar_neon.o
2404 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
2405 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
2406                                           arm/rv40dsp_neon.o
2407--- a/libavcodec/arm/cabac.h
2408+++ b/libavcodec/arm/cabac.h
2409@@ -26,83 +26,209 @@
2410 #include "libavutil/internal.h"
2411 #include "libavcodec/cabac.h"
2412
2413+
2414 #define get_cabac_inline get_cabac_inline_arm
2415 static av_always_inline int get_cabac_inline_arm(CABACContext *c,
2416-                                                 uint8_t *const state)
2417+                                                 uint8_t *state)
2418 {
2419-    int bit;
2420-    void *reg_b, *reg_c, *tmp;
2421+    const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
2422+    int bit, ptr, low, tmp1, tmp2;
2423+    __asm__ volatile (
2424+        "ldr     %[bit], [%[c], %[range_off]]             \n\t"
2425+        "ldrb    %[ptr], [%[state]]                       \n\t"
2426+        "sub     %[tmp1], %[mlps_tables], %[lps_off]      \n\t"
2427+        "and     %[tmp2], %[bit], #0xc0                   \n\t"
2428+        "add     %[tmp1], %[tmp1], %[ptr]                 \n\t"
2429+        "ldr     %[low], [%[c], %[low_off]]               \n\t"
2430+        "ldrb    %[tmp2], [%[tmp1], %[tmp2], lsl #1]      \n\t"
2431+        "sub     %[bit], %[bit], %[tmp2]                  \n\t"
2432+        "mov     %[tmp1], %[bit]                          \n\t"
2433+        "cmp     %[low], %[bit], lsl #17                  \n\t"
2434+        "itt     ge                                       \n\t"
2435+        "movge   %[tmp1], %[tmp2]                         \n\t"
2436+        "mvnge   %[ptr], %[ptr]                           \n\t"
2437+        "clz     %[tmp2], %[tmp1]                         \n\t"
2438+        "it      ge                                       \n\t"
2439+        "subge   %[low], %[low], %[bit], lsl #17          \n\t"
2440+        "sub     %[tmp2], %[tmp2], #23                    \n\t"
2441+        "and     %[bit], %[ptr], #1                       \n\t"
2442+        "ldrb    %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
2443+        "lsl     %[low], %[low], %[tmp2]                  \n\t"
2444+        "lsls    %[ptr], %[low], #16                      \n\t"
2445+        "bne     1f                                       \n\t"
2446+        "ldr     %[ptr], [%[c], %[ptr_off]]               \n\t"
2447+        "lsl     %[tmp2], %[tmp1], %[tmp2]                \n\t"
2448+#if UNCHECKED_BITSTREAM_READER
2449+        "strb    %[mlps_tables], [%[state]]               \n\t"
2450+        "rbit    %[state], %[low]                         \n\t"
2451+        "ldrh    %[tmp1], [%[ptr]], #2                    \n\t"
2452+#else
2453+        "ldr     %[tmp1], [%[c], %[end_off]]              \n\t"
2454+        "strb    %[mlps_tables], [%[state]]               \n\t"
2455+        "rbit    %[state], %[low]                         \n\t"
2456+        "cmp     %[tmp1], %[ptr]                          \n\t"
2457+#if CONFIG_THUMB
2458+        "it      cs                                       \n\t"
2459+        "ldrhcs  %[tmp1], [%[ptr]], #2                    \n\t"
2460+#else
2461+        "ldrcsh  %[tmp1], [%[ptr]], #2                    \n\t"
2462+#endif
2463+#endif
2464+        "clz     %[state], %[state]                       \n\t"
2465+        "movw    %[mlps_tables], #0xffff                  \n\t"
2466+        "sub     %[state], %[state], #16                  \n\t"
2467+        "str     %[tmp2], [%[c], %[range_off]]            \n\t"
2468+        "rev     %[tmp1], %[tmp1]                         \n\t"
2469+        "str     %[ptr], [%[c], %[ptr_off]]               \n\t"
2470+        "lsr     %[tmp1], %[tmp1], #15                    \n\t"
2471+        "sub     %[tmp1], %[tmp1], %[mlps_tables]         \n\t"
2472+#if CONFIG_THUMB
2473+        "lsl     %[tmp1], %[tmp1], %[state]               \n\t"
2474+        "add     %[low], %[low], %[tmp1]                  \n\t"
2475+#else
2476+        "add     %[low], %[low], %[tmp1], lsl %[state]    \n\t"
2477+#endif
2478+        "str     %[low], [%[c], %[low_off]]               \n\t"
2479+        "b       2f                                       \n\t"
2480+        "1:                                               \n\t"
2481+        "strb    %[mlps_tables], [%[state]]               \n\t"
2482+        "lsl     %[tmp1], %[tmp1], %[tmp2]                \n\t"
2483+        "str     %[low], [%[c], %[low_off]]               \n\t"
2484+        "str     %[tmp1], [%[c], %[range_off]]            \n\t"
2485+        "2:                                               \n\t"
2486+    :  // Outputs
2487+             [state]"+r"(state),
2488+       [mlps_tables]"+r"(mlps_tables),
2489+               [bit]"=&r"(bit),
2490+               [ptr]"=&r"(ptr),
2491+               [low]"=&r"(low),
2492+              [tmp1]"=&r"(tmp1),
2493+              [tmp2]"=&r"(tmp2)
2494+    :  // Inputs
2495+               [c]"r"(c),
2496+         [low_off]"J"(offsetof(CABACContext, low)),
2497+       [range_off]"J"(offsetof(CABACContext, range)),
2498+         [ptr_off]"J"(offsetof(CABACContext, bytestream)),
2499+         [end_off]"J"(offsetof(CABACContext, bytestream_end)),
2500+         [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
2501+    :  // Clobbers
2502+       "cc", "memory"
2503+    );
2504+    return bit;
2505+}
2506
2507-    __asm__ volatile(
2508-        "ldrb       %[bit]        , [%[state]]                  \n\t"
2509-        "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
2510-        "mov        %[tmp]        , %[range]                    \n\t"
2511-        "and        %[range]      , %[range]    , #0xC0         \n\t"
2512-        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
2513-        "ldrb       %[range]      , [%[r_b], %[range], lsl #1]  \n\t"
2514-        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
2515-        "sub        %[r_c]        , %[tmp]      , %[range]      \n\t"
2516-        "lsl        %[tmp]        , %[r_c]      , #17           \n\t"
2517-        "cmp        %[tmp]        , %[low]                      \n\t"
2518-        "it         gt                                          \n\t"
2519-        "movgt      %[range]      , %[r_c]                      \n\t"
2520-        "itt        cc                                          \n\t"
2521-        "mvncc      %[bit]        , %[bit]                      \n\t"
2522-        "subcc      %[low]        , %[low]      , %[tmp]        \n\t"
2523-        "add        %[r_c]        , %[tables]   , %[mlps_off]   \n\t"
2524-        "ldrb       %[tmp]        , [%[r_b], %[range]]          \n\t"
2525-        "ldrb       %[r_b]        , [%[r_c], %[bit]]            \n\t"
2526-        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
2527-        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
2528-        "uxth       %[r_c]        , %[low]                      \n\t"
2529-        "strb       %[r_b]        , [%[state]]                  \n\t"
2530-        "tst        %[r_c]        , %[r_c]                      \n\t"
2531-        "bne        2f                                          \n\t"
2532-        "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
2533+#define get_cabac_bypass get_cabac_bypass_arm
2534+static inline int get_cabac_bypass_arm(CABACContext * const c)
2535+{
2536+    uint32_t low = c->low, range, ptr, tmp;
2537+    int rv;
2538+    __asm volatile (
2539+        "ldr        %[range] , [%[c], %[range_off]] \n\t"
2540+        "mov        %[rv]    , #0                   \n\t"
2541+        "ldr        %[ptr]   , [%[c], %[ptr_off]]   \n\t"
2542+        "lsl        %[low]   , #1                   \n\t"
2543+#if !UNCHECKED_BITSTREAM_READER
2544+        "ldr        %[tmp]   , [%[c], %[end_off]]   \n\t"
2545+#endif
2546+        "cmp        %[low]   , %[range], lsl #17    \n\t"
2547+        "itt         cs                              \n\t"
2548+        "subcs      %[low]   , %[low], %[range], lsl #17 \n\t"
2549+        "movcs      %[rv]    , #1                   \n\t"
2550 #if UNCHECKED_BITSTREAM_READER
2551-        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
2552-        "add        %[r_c]        , %[r_c]      , #2            \n\t"
2553-        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
2554-#else
2555-        "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
2556-        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
2557-        "cmp        %[r_c]        , %[r_b]                      \n\t"
2558-        "itt        lt                                          \n\t"
2559-        "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
2560-        "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
2561-#endif
2562-        "sub        %[r_c]        , %[low]      , #1            \n\t"
2563-        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
2564-        "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
2565-        "rev        %[tmp]        , %[tmp]                      \n\t"
2566-        "lsr        %[r_c]        , %[r_c]      , #15           \n\t"
2567-        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
2568-        "ldrb       %[r_c]        , [%[r_b], %[r_c]]            \n\t"
2569-        "movw       %[r_b]        , #0xFFFF                     \n\t"
2570-        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
2571-        "rsb        %[r_c]        , %[r_c]      , #7            \n\t"
2572-        "lsl        %[tmp]        , %[tmp]      , %[r_c]        \n\t"
2573-        "add        %[low]        , %[low]      , %[tmp]        \n\t"
2574-        "2:                                                     \n\t"
2575-        :    [bit]"=&r"(bit),
2576-             [low]"+&r"(c->low),
2577-           [range]"+&r"(c->range),
2578-             [r_b]"=&r"(reg_b),
2579-             [r_c]"=&r"(reg_c),
2580-             [tmp]"=&r"(tmp)
2581-        :        [c]"r"(c),
2582-             [state]"r"(state),
2583-            [tables]"r"(ff_h264_cabac_tables),
2584-              [byte]"M"(offsetof(CABACContext, bytestream)),
2585-               [end]"M"(offsetof(CABACContext, bytestream_end)),
2586-          [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
2587-           [lps_off]"I"(H264_LPS_RANGE_OFFSET),
2588-          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
2589-        : "memory", "cc"
2590-        );
2591+        "ldrh       %[tmp]   , [%[ptr]], #2         \n\t"
2592+#else
2593+        "cmp        %[tmp]   , %[ptr]               \n\t"
2594+#if CONFIG_THUMB
2595+        "it         cs                              \n\t"
2596+        "ldrhcs     %[tmp]   , [%[ptr]], #2         \n\t"
2597+#else
2598+        "ldrcsh     %[tmp]   , [%[ptr]], #2         \n\t"
2599+#endif
2600+#endif
2601+        "lsls       %[range] , %[low], #16          \n\t"
2602+        "bne        1f                              \n\t"
2603
2604-    return bit & 1;
2605+        "str        %[ptr]   , [%[c], %[ptr_off]]   \n\t"
2606+        "rev        %[tmp]   , %[tmp]               \n\t"
2607+        "add        %[low]   , %[low], %[tmp], lsr #15 \n\t"
2608+        "movw       %[tmp]   , 0xFFFF               \n\t"
2609+        "sub        %[low]   , %[tmp]               \n\t"
2610+        "1:                                         \n\t"
2611+        "str        %[low]   , [%[c], %[low_off]]   \n\t"
2612+        : // Outputs
2613+               [rv]"=&r"(rv),
2614+              [low]"+r"(low),
2615+            [range]"=&r"(range),
2616+              [ptr]"=&r"(ptr),
2617+              [tmp]"=&r"(tmp)
2618+        : // Inputs
2619+                    [c]"r"(c),
2620+              [low_off]"J"(offsetof(CABACContext, low)),
2621+            [range_off]"J"(offsetof(CABACContext, range)),
2622+              [ptr_off]"J"(offsetof(CABACContext, bytestream)),
2623+              [end_off]"J"(offsetof(CABACContext, bytestream_end))
2624+        : // Clobbers
2625+            "memory", "cc"
2626+    );
2627+    return rv;
2628 }
2629+
2630+
2631+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
2632+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
2633+{
2634+    uint32_t low = c->low, range, ptr, tmp;
2635+    __asm volatile (
2636+        "ldr        %[range] , [%[c], %[range_off]] \n\t"
2637+        "ldr        %[ptr]   , [%[c], %[ptr_off]]   \n\t"
2638+        "lsl        %[low]   , #1                   \n\t"
2639+#if !UNCHECKED_BITSTREAM_READER
2640+        "ldr        %[tmp]   , [%[c], %[end_off]]   \n\t"
2641+#endif
2642+        "cmp        %[low]   , %[range], lsl #17    \n\t"
2643+        "it         cs                              \n\t"
2644+        "subcs      %[low]   , %[low], %[range], lsl #17 \n\t"
2645+        "it         cc                              \n\t"
2646+        "rsbcc      %[rv]    , %[rv], #0            \n\t"
2647+#if UNCHECKED_BITSTREAM_READER
2648+        "ldrh       %[tmp]   , [%[ptr]], #2         \n\t"
2649+#else
2650+        "cmp        %[tmp]   , %[ptr]               \n\t"
2651+#if CONFIG_THUMB
2652+        "it         cs                              \n\t"
2653+        "ldrhcs     %[tmp]   , [%[ptr]], #2         \n\t"
2654+#else
2655+        "ldrcsh     %[tmp]   , [%[ptr]], #2         \n\t"
2656+#endif
2657+#endif
2658+        "lsls       %[range] , %[low], #16          \n\t"
2659+        "bne        1f                              \n\t"
2660+
2661+        "str        %[ptr]   , [%[c], %[ptr_off]]   \n\t"
2662+        "rev        %[tmp]   , %[tmp]               \n\t"
2663+        "add        %[low]   , %[low], %[tmp], lsr #15 \n\t"
2664+        "movw       %[tmp]   , 0xFFFF               \n\t"
2665+        "sub        %[low]   , %[tmp]               \n\t"
2666+        "1:                                         \n\t"
2667+        "str        %[low]   , [%[c], %[low_off]]   \n\t"
2668+        : // Outputs
2669+               [rv]"+r"(rv),
2670+              [low]"+r"(low),
2671+            [range]"=&r"(range),
2672+              [ptr]"=&r"(ptr),
2673+              [tmp]"=&r"(tmp)
2674+        : // Inputs
2675+                    [c]"r"(c),
2676+              [low_off]"J"(offsetof(CABACContext, low)),
2677+            [range_off]"J"(offsetof(CABACContext, range)),
2678+              [ptr_off]"J"(offsetof(CABACContext, bytestream)),
2679+              [end_off]"J"(offsetof(CABACContext, bytestream_end))
2680+        : // Clobbers
2681+            "memory", "cc"
2682+    );
2683+    return rv;
2684+}
2685+
2686 #endif /* HAVE_ARMV6T2_INLINE */
2687
2688 #endif /* AVCODEC_ARM_CABAC_H */
2689--- /dev/null
2690+++ b/libavcodec/arm/rpi_hevc_cabac.h
2691@@ -0,0 +1,607 @@
2692+/*
2693+ * This file is part of FFmpeg.
2694+ *
2695+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
2696+ *
2697+ * FFmpeg is free software; you can redistribute it and/or
2698+ * modify it under the terms of the GNU Lesser General Public
2699+ * License as published by the Free Software Foundation; either
2700+ * version 2.1 of the License, or (at your option) any later version.
2701+ *
2702+ * FFmpeg is distributed in the hope that it will be useful,
2703+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2704+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2705+ * Lesser General Public License for more details.
2706+ *
2707+ * You should have received a copy of the GNU Lesser General Public
2708+ * License along with FFmpeg; if not, write to the Free Software
2709+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2710+ */
2711+
2712+#ifndef AVCODEC_ARM_HEVC_CABAC_H
2713+#define AVCODEC_ARM_HEVC_CABAC_H
2714+
2715+#include "config.h"
2716+#if HAVE_ARMV6T2_INLINE
2717+
2718+#define hevc_mem_bits32 hevc_mem_bits32_arm
2719+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
2720+{
2721+    unsigned int n;
2722+    __asm__ (
2723+        "rev        %[n], %[x]                     \n\t"
2724+        : [n]"=r"(n)
2725+        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
2726+        :
2727+        );
2728+    return n << (bits & 7);
2729+}
2730+
2731+
2732+// ---------------------------------------------------------------------------
2733+//
2734+// Helper fns - little bits of code where ARM has an instruction that the
2735+// compiler doesn't know about / use
2736+
2737+#define trans_scale_sat trans_scale_sat_arm
2738+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
2739+{
2740+    int rv;
2741+    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
2742+
2743+    __asm__ (
2744+    "ssat %[rv], #16, %[t], ASR #1 \n\t"
2745+    : [rv]"=r"(rv)
2746+    : [t]"r"(t)
2747+    :
2748+    );
2749+    return rv;
2750+}
2751+
2752+#define update_rice update_rice_arm
2753+static inline void update_rice_arm(uint8_t * const stat_coeff,
2754+    const unsigned int last_coeff_abs_level_remaining,
2755+    const unsigned int c_rice_param)
2756+{
2757+    int t = last_coeff_abs_level_remaining << 1;
2758+    __asm__ (
2759+    "lsrs  %[t], %[t], %[shift]             \n\t"
2760+
2761+    "it    eq                               \n\t"
2762+    "subeq %[stat], %[stat], #1             \n\t"
2763+    "cmp   %[t], #6                         \n\t"
2764+    "adc   %[stat], %[stat], #0             \n\t"
2765+    "usat  %[stat], #8, %[stat]             \n\t"
2766+    : [stat]"+r"(*stat_coeff),
2767+         [t]"+r"(t)
2768+    :  [shift]"r"(c_rice_param)
2769+    : "cc"
2770+    );
2771+}
2772+
2773+// ---------------------------------------------------------------------------
2774+//
2775+// CABAC get loops
2776+//
2777+// Where the loop is simple enough we can normally do 10-30% better than the
2778+// compiler
2779+
2780+// Get the residual greater than 1 bits
2781+
2782+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
2783+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
2784+    uint8_t * const state0)
2785+{
2786+    unsigned int i, reg_b, st, tmp, bit, rv;
2787+     __asm__ (
2788+         "mov        %[i]          , #0                          \n\t"
2789+         "mov        %[rv]         , #0                          \n\t"
2790+         "1:                                                     \n\t"
2791+         "add        %[i]          , %[i]        , #1            \n\t"
2792+         "cmp        %[rv]         , #0                          \n\t"
2793+         "ite        eq                                          \n\t"
2794+         "usateq     %[st]         , #2          , %[i]          \n\t"
2795+         "movne      %[st]         , #0                          \n\t"
2796+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
2797+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
2798+
2799+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
2800+         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
2801+         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
2802+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
2803+
2804+         "cmp        %[low]        , %[range], lsl #17           \n\t"
2805+         "ittt       ge                                          \n\t"
2806+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
2807+         "movge      %[range]      , %[tmp]                      \n\t"
2808+         "mvnge      %[bit]        , %[bit]                      \n\t"
2809+
2810+         "clz        %[tmp]        , %[range]                    \n\t"
2811+         "sub        %[tmp]        , #23                         \n\t"
2812+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
2813+         "and        %[bit]        , %[bit]      , #1            \n\t"
2814+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
2815+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
2816+         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
2817+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
2818+
2819+// There is a small speed gain from combining both conditions, using a single
2820+// branch and then working out what that meant later
2821+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
2822+         "it         ne                                          \n\t"
2823+         "cmpne      %[n]          , %[i]                        \n\t"
2824+         "bne        1b                                          \n\t"
2825+
2826+// If reload is not required then we must have run out of flags to decode
2827+         "tst        %[tmp]        , %[tmp]                      \n\t"
2828+         "bne        2f                                          \n\t"
2829+
2830+// Do reload
2831+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
2832+         "rbit       %[bit]        , %[low]                      \n\t"
2833+         "movw       %[r_b]        , #0xFFFF                     \n\t"
2834+         "clz        %[bit]        , %[bit]                      \n\t"
2835+         "rev        %[tmp]        , %[tmp]                      \n\t"
2836+         "sub        %[bit]        , %[bit]      , #16           \n\t"
2837+         "cmp        %[n]          , %[i]                        \n\t"
2838+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
2839+
2840+#if CONFIG_THUMB
2841+         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
2842+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
2843+#else
2844+         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t"
2845+#endif
2846+
2847+         "bne        1b                                          \n\t"
2848+         "2:                                                     \n\t"
2849+         :    [bit]"=&r"(bit),
2850+              [low]"+r"(c->low),
2851+            [range]"+r"(c->range),
2852+              [r_b]"=&r"(reg_b),
2853+             [bptr]"+r"(c->bytestream),
2854+                [i]"=&r"(i),
2855+              [tmp]"=&r"(tmp),
2856+               [st]"=&r"(st),
2857+               [rv]"=&r"(rv)
2858+          :  [state0]"r"(state0),
2859+                  [n]"r"(n),
2860+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
2861+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
2862+         : "memory", "cc"
2863+    );
2864+    return rv;
2865+}
2866+
2867+
2868+// n must be > 0 on entry
2869+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
2870+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
2871+    unsigned int n,
2872+    const uint8_t * ctx_map,
2873+    uint8_t * p)
2874+{
2875+    unsigned int reg_b, tmp, st, bit;
2876+     __asm__ (
2877+// Get bin from map
2878+#if CONFIG_THUMB
2879+         "add        %[ctx_map]    , %[n]                        \n\t"
2880+         "ldrb       %[st]         , [%[ctx_map]]                \n\t"
2881+#else
2882+         "ldrb       %[st]         , [%[ctx_map], %[n]]!         \n\t"
2883+#endif
2884+         "1:                                                     \n\t"
2885+
2886+// Load state & ranges
2887+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
2888+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
2889+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
2890+         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
2891+         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
2892+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
2893+
2894+         "cmp        %[low]        , %[range], lsl #17           \n\t"
2895+         "ittt       ge                                          \n\t"
2896+         "mvnge      %[bit]        , %[bit]                      \n\t"
2897+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
2898+         "movge      %[range]      , %[tmp]                      \n\t"
2899+
2900+// Renorm
2901+         "clz        %[tmp]        , %[range]                    \n\t"
2902+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
2903+         "sub        %[tmp]        , #23                         \n\t"
2904+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
2905+         "tst        %[bit]        , #1                          \n\t"
2906+         "ldrb       %[st]         , [%[ctx_map], #-1]!          \n\t"
2907+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
2908+// GCC asm seems to need strbne written differently for thumb and arm
2909+#if CONFIG_THUMB
2910+         "it         ne                                          \n\t"
2911+         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
2912+#else
2913+         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
2914+#endif
2915+
2916+// There is a small speed gain from combining both conditions, using a single
2917+// branch and then working out what that meant later
2918+         "subs       %[n]          , %[n]        , #1            \n\t"
2919+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
2920+#if CONFIG_THUMB
2921+         "itt        ne                                          \n\t"
2922+         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
2923+#else
2924+         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
2925+#endif
2926+         "bne        1b                                          \n\t"
2927+
2928+// If we have bits left then n must be 0 so give up now
2929+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
2930+         "bne        2f                                          \n\t"
2931+
2932+// Do reload
2933+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
2934+         "rbit       %[bit]        , %[low]                      \n\t"
2935+         "movw       %[r_b]        , #0xFFFF                     \n\t"
2936+         "clz        %[bit]        , %[bit]                      \n\t"
2937+         "cmp        %[n]          , #0                          \n\t"
2938+         "rev        %[tmp]        , %[tmp]                      \n\t"
2939+         "sub        %[bit]        , %[bit]      , #16           \n\t"
2940+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
2941+
2942+#if CONFIG_THUMB
2943+         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
2944+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
2945+#else
2946+         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t"
2947+#endif
2948+
2949+// Check to see if we still have more to do
2950+         "bne        1b                                          \n\t"
2951+         "2:                                                     \n\t"
2952+         :    [bit]"=&r"(bit),
2953+              [low]"+r"(c->low),
2954+            [range]"+r"(c->range),
2955+              [r_b]"=&r"(reg_b),
2956+             [bptr]"+r"(c->bytestream),
2957+              [idx]"+r"(p),
2958+                [n]"+r"(n),
2959+              [tmp]"=&r"(tmp),
2960+               [st]"=&r"(st),
2961+          [ctx_map]"+r"(ctx_map)
2962+          :  [state0]"r"(state0),
2963+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
2964+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
2965+         : "memory", "cc"
2966+    );
2967+
2968+    return p;
2969+}
2970+
2971+// ---------------------------------------------------------------------------
2972+//
2973+// CABAC_BY22 functions
2974+
2975+
2976+#define get_cabac_by22_start get_cabac_by22_start_arm
2977+static inline void get_cabac_by22_start_arm(CABACContext * const c)
2978+{
2979+    const uint8_t *ptr = c->bytestream;
2980+    register uint32_t low __asm__("r1"), range __asm__("r2");
2981+    uint32_t m, range8, bits;
2982+#if !USE_BY22_DIV
2983+    uintptr_t inv;
2984+#endif
2985+
2986+    av_assert2(offsetof (CABACContext, low) == 0);
2987+    av_assert2(offsetof (CABACContext, range) == 4);
2988+    av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2);
2989+    __asm__ volatile (
2990+        "ldmia   %[c], {%[low], %[range]}                         \n\t"
2991+        : // Outputs
2992+               [low]"=r"(low),
2993+             [range]"=r"(range)
2994+        : // Inputs
2995+                 [c]"r"(c)
2996+        : // Clobbers
2997+    );
2998+#if !USE_BY22_DIV
2999+    inv = (uintptr_t)cabac_by22_inv_range;
3000+#endif
3001+    __asm__ volatile (
3002+        "ldr     %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t"
3003+#if !USE_BY22_DIV
3004+        "uxtb    %[range8], %[range]                              \n\t"
3005+#endif
3006+        "rbit    %[bits], %[low]                                  \n\t"
3007+        "lsl     %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t"
3008+        "clz     %[bits], %[bits]                                 \n\t"
3009+        "str     %[ptr], [%[c], %[ptr_off]]                       \n\t"
3010+        "rev     %[m], %[m]                                       \n\t"
3011+        "rsb     %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t"
3012+        "eor     %[m], %[m], #0x80000000                          \n\t"
3013+#if !USE_BY22_DIV
3014+        "ldr     %[inv], [%[inv], %[range8], lsl #2]              \n\t"
3015+        "pkhbt   %[range], %[bits], %[range], lsl #16             \n\t"
3016+        "str     %[range], [%[c], %[bits_off]]                    \n\t"
3017+#else
3018+        "strh    %[bits], [%[c], %[bits_off]]                     \n\t"
3019+#endif
3020+#if CONFIG_THUMB
3021+        "lsr     %[m], %[ptr]                                     \n\t"
3022+        "eor     %[range], %[low], %[m]                           \n\t"
3023+#else
3024+        "eor     %[range], %[low], %[m], lsr %[ptr]               \n\t"
3025+#endif
3026+        : // Outputs
3027+               [ptr]"+&r"(ptr),
3028+               [low]"+&r"(low),
3029+             [range]"+&r"(range),
3030+#if !USE_BY22_DIV
3031+               [inv]"+&r"(inv),
3032+#endif
3033+                 [m]"=&r"(m),
3034+            [range8]"=&r"(range8),
3035+              [bits]"=&r"(bits)
3036+        : // Inputs
3037+                   [c]"r"(c),
3038+            [bits_off]"J"(offsetof (CABACContext, by22.bits)),
3039+             [ptr_off]"J"(offsetof (CABACContext, bytestream))
3040+        : // Clobbers
3041+            "memory"
3042+    );
3043+    c->low = range;
3044+#if !USE_BY22_DIV
3045+    c->range = inv;
3046+#endif
3047+}
3048+
3049+#define get_cabac_by22_peek get_cabac_by22_peek_arm
3050+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
3051+{
3052+    uint32_t rv = c->low &~ 1, tmp;
3053+    __asm__ (
3054+        "cmp      %[inv] , #0                    \n\t"
3055+        "it       ne                             \n\t"
3056+        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
3057+        :  // Outputs
3058+             [rv]"+r"(rv),
3059+             [tmp]"=r"(tmp)
3060+        :  // Inputs
3061+             [inv]"r"(c->range)
3062+        :  // Clobbers
3063+                "cc"
3064+    );
3065+    return rv << 1;
3066+}
3067+
3068+#define get_cabac_by22_flush get_cabac_by22_flush_arm
3069+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
3070+{
3071+    uint32_t bits, ptr, tmp1, tmp2;
3072+    __asm__ volatile (
3073+        "ldrh    %[bits], [%[cc], %[bits_off]]     \n\t"
3074+        "ldr     %[ptr], [%[cc], %[ptr_off]]       \n\t"
3075+        "rsb     %[tmp1], %[n], #32                \n\t"
3076+        "add     %[bits], %[bits], %[n]            \n\t"
3077+        "ldrh    %[tmp2], [%[cc], %[range_off]]    \n\t"
3078+        "lsr     %[tmp1], %[val], %[tmp1]          \n\t"
3079+        "ldr     %[val], [%[cc], %[low_off]]       \n\t"
3080+#if CONFIG_THUMB
3081+        "add     %[ptr], %[ptr], %[bits], lsr #3   \n\t"
3082+        "ldr     %[ptr], [%[ptr]]                  \n\t"
3083+#else
3084+        "ldr     %[ptr], [%[ptr], %[bits], lsr #3] \n\t"
3085+#endif
3086+        "mul     %[tmp1], %[tmp2], %[tmp1]         \n\t"
3087+        "and     %[tmp2], %[bits], #7              \n\t"
3088+        "strh    %[bits], [%[cc], %[bits_off]]     \n\t"
3089+        "rev     %[ptr], %[ptr]                    \n\t"
3090+        "lsl     %[tmp1], %[tmp1], #23             \n\t"
3091+#if CONFIG_THUMB
3092+        "lsl     %[val], %[n]                      \n\t"
3093+        "sub     %[val], %[tmp1]                   \n\t"
3094+#else
3095+        "rsb     %[val], %[tmp1], %[val], lsl %[n] \n\t"
3096+#endif
3097+        "lsl     %[ptr], %[ptr], %[tmp2]           \n\t"
3098+        "orr     %[val], %[val], %[ptr], lsr #9    \n\t"
3099+        "str     %[val], [%[cc], %[low_off]]       \n\t"
3100+        :  // Outputs
3101+            [val]"+r"(val),
3102+           [bits]"=&r"(bits),
3103+            [ptr]"=&r"(ptr),
3104+           [tmp1]"=&r"(tmp1),
3105+           [tmp2]"=&r"(tmp2)
3106+        :  // Inputs
3107+                  [cc]"r"(c),
3108+                   [n]"r"(n),
3109+            [bits_off]"J"(offsetof(CABACContext, by22.bits)),
3110+             [ptr_off]"J"(offsetof(CABACContext, bytestream)),
3111+           [range_off]"J"(offsetof(CABACContext, by22.range)),
3112+             [low_off]"J"(offsetof(CABACContext, low))
3113+        :  // Clobbers
3114+           "memory"
3115+    );
3116+}
3117+
3118+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
3119+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
3120+{
3121+    uint32_t last_coeff_abs_level_remaining;
3122+    uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
3123+    __asm__ volatile (
3124+        "ldr     %[remain], [%[cc], %[low_off]]               \n\t"
3125+        "ldr     %[prefix], [%[cc], %[range_off]]             \n\t"
3126+        "bic     %[remain], %[remain], #1                     \n\t"
3127+        "ldrh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
3128+        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t"
3129+        "cmp     %[prefix], #0                                \n\t"
3130+        "it      ne                                           \n\t"
3131+        "umullne %[prefix], %[remain], %[prefix], %[remain]   \n\t"
3132+        "ldrh    %[range], [%[cc], %[by22_range_off]]         \n\t"
3133+        "lsl     %[remain], %[remain], #1                     \n\t"
3134+        "mvn     %[prefix], %[remain]                         \n\t"
3135+        "clz     %[prefix], %[prefix]                         \n\t"
3136+        "rsbs    %[n1], %[prefix], #2                         \n\t"
3137+        "bcc     1f                                           \n\t"
3138+        "adc     %[n1], %[rice], %[prefix]                    \n\t"
3139+        "add     %[tmp2], %[tmp2], %[n1]                      \n\t"
3140+        "rsb     %[n2], %[n1], #32                            \n\t"
3141+        "and     %[tmp1], %[tmp2], #7                         \n\t"
3142+        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
3143+        "lsr     %[tmp2], %[tmp2], #3                         \n\t"
3144+        "lsr     %[n2], %[remain], %[n2]                      \n\t"
3145+        "mul     %[n2], %[range], %[n2]                       \n\t"
3146+        "ldr     %[range], [%[cc], %[low_off]]                \n\t"
3147+        "ldr     %[ptr], [%[ptr], %[tmp2]]                    \n\t"
3148+        "rsb     %[tmp2], %[rice], #31                        \n\t"
3149+        "lsl     %[remain], %[remain], %[prefix]              \n\t"
3150+        "lsl     %[n2], %[n2], #23                            \n\t"
3151+#if CONFIG_THUMB
3152+        "lsl     %[range], %[n1]                              \n\t"
3153+        "sub     %[range], %[n2]                              \n\t"
3154+#else
3155+        "rsb     %[range], %[n2], %[range], lsl %[n1]         \n\t"
3156+#endif
3157+        "rev     %[ptr], %[ptr]                               \n\t"
3158+        "lsl     %[n2], %[prefix], %[rice]                    \n\t"
3159+#if CONFIG_THUMB
3160+        "lsr     %[remain], %[tmp2]                           \n\t"
3161+        "add     %[remain], %[n2]                             \n\t"
3162+#else
3163+        "add     %[remain], %[n2], %[remain], lsr %[tmp2]     \n\t"
3164+#endif
3165+        "b       3f                                           \n\t"
3166+        "1:                                                   \n\t"
3167+        "add     %[n2], %[rice], %[prefix], lsl #1            \n\t"
3168+        "cmp     %[n2], %[peek_bits_plus_2]                   \n\t"
3169+        "bhi     2f                                           \n\t"
3170+        "sub     %[n1], %[n2], #2                             \n\t"
3171+        "add     %[tmp2], %[tmp2], %[n1]                      \n\t"
3172+        "rsb     %[n2], %[n1], #32                            \n\t"
3173+        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
3174+        "lsr     %[tmp1], %[tmp2], #3                         \n\t"
3175+        "lsr     %[n2], %[remain], %[n2]                      \n\t"
3176+        "mul     %[n2], %[range], %[n2]                       \n\t"
3177+        "rsb     %[range], %[rice], #34                       \n\t"
3178+        "ldr     %[ptr], [%[ptr], %[tmp1]]                    \n\t"
3179+        "and     %[tmp1], %[tmp2], #7                         \n\t"
3180+        "lsl     %[remain], %[remain], %[prefix]              \n\t"
3181+        "ldr     %[tmp2], [%[cc], %[low_off]]                 \n\t"
3182+        "rsb     %[prefix], %[prefix], %[range]               \n\t"
3183+        "orr     %[remain], %[remain], #0x80000000            \n\t"
3184+        "rev     %[ptr], %[ptr]                               \n\t"
3185+        "lsl     %[n2], %[n2], #23                            \n\t"
3186+        "mov     %[range], #2                                 \n\t"
3187+#if CONFIG_THUMB
3188+        "lsl     %[tmp2], %[n1]                               \n\t"
3189+        "sub     %[tmp2], %[n2]                               \n\t"
3190+#else
3191+        "rsb     %[tmp2], %[n2], %[tmp2], lsl %[n1]           \n\t"
3192+#endif
3193+        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t"
3194+        "lsl     %[rice], %[range], %[rice]                   \n\t"
3195+        "orr     %[range], %[tmp2], %[ptr], lsr #9            \n\t"
3196+#if CONFIG_THUMB
3197+        "lsr     %[remain], %[prefix]                         \n\t"
3198+        "add     %[remain], %[rice]                           \n\t"
3199+#else
3200+        "add     %[remain], %[rice], %[remain], lsr %[prefix] \n\t"
3201+#endif
3202+        "b       4f                                           \n\t"
3203+        "2:                                                   \n\t"
3204+        "add     %[n1], %[tmp2], %[prefix]                    \n\t"
3205+#if CONFIG_THUMB
3206+        "add     %[tmp2], %[ptr], %[n1], lsr #3               \n\t"
3207+        "ldr     %[tmp2], [%[tmp2]]                           \n\t"
3208+#else
3209+        "ldr     %[tmp2], [%[ptr], %[n1], lsr #3]             \n\t"
3210+#endif
3211+        "rsb     %[tmp1], %[prefix], #32                      \n\t"
3212+        "push    {%[rice]}                                    \n\t"
3213+        "and     %[rice], %[n1], #7                           \n\t"
3214+        "lsr     %[tmp1], %[remain], %[tmp1]                  \n\t"
3215+        "ldr     %[ptr], [%[cc], %[low_off]]                  \n\t"
3216+        "mul     %[remain], %[range], %[tmp1]                 \n\t"
3217+        "rev     %[tmp2], %[tmp2]                             \n\t"
3218+        "rsb     %[n2], %[prefix], %[n2]                      \n\t"
3219+        "ldr     %[tmp1], [%[cc], %[range_off]]               \n\t"
3220+        "lsl     %[rice], %[tmp2], %[rice]                    \n\t"
3221+        "sub     %[tmp2], %[n2], #2                           \n\t"
3222+        "lsl     %[remain], %[remain], #23                    \n\t"
3223+#if CONFIG_THUMB
3224+        "lsl     %[ptr], %[prefix]                            \n\t"
3225+        "rsb     %[remain], %[ptr]                            \n\t"
3226+#else
3227+        "rsb     %[remain], %[remain], %[ptr], lsl %[prefix]  \n\t"
3228+#endif
3229+        "orr     %[remain], %[remain], %[rice], lsr #9        \n\t"
3230+        "add     %[prefix], %[n1], %[tmp2]                    \n\t"
3231+        "bic     %[n1], %[remain], #1                         \n\t"
3232+        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t"
3233+        "cmp     %[tmp1], #0                                  \n\t"
3234+        "rsb     %[rice], %[tmp2], #32                        \n\t"
3235+        "it      ne                                           \n\t"
3236+        "umullne %[tmp1], %[n1], %[tmp1], %[n1]               \n\t"
3237+        "and     %[tmp1], %[prefix], #7                       \n\t"
3238+#if CONFIG_THUMB
3239+        "add     %[ptr], %[ptr], %[prefix], lsr #3            \n\t"
3240+        "ldr     %[ptr], [%[ptr]]                             \n\t"
3241+#else
3242+        "ldr     %[ptr], [%[ptr], %[prefix], lsr #3]          \n\t"
3243+#endif
3244+        "lsl     %[n1], %[n1], #1                             \n\t"
3245+        "lsr     %[rice], %[n1], %[rice]                      \n\t"
3246+        "rsb     %[n2], %[n2], #34                            \n\t"
3247+        "mul     %[range], %[range], %[rice]                  \n\t"
3248+        "pop     {%[rice]}                                    \n\t"
3249+        "rev     %[ptr], %[ptr]                               \n\t"
3250+        "orr     %[n1], %[n1], #0x80000000                    \n\t"
3251+        "strh    %[prefix], [%[cc], %[by22_bits_off]]         \n\t"
3252+        "mov     %[prefix], #2                                \n\t"
3253+        "lsl     %[range], %[range], #23                      \n\t"
3254+#if CONFIG_THUMB
3255+        "lsl     %[remain], %[tmp2]                           \n\t"
3256+        "rsb     %[range], %[remain]                          \n\t"
3257+#else
3258+        "rsb     %[range], %[range], %[remain], lsl %[tmp2]   \n\t"
3259+#endif
3260+        "lsl     %[remain], %[prefix], %[rice]                \n\t"
3261+#if CONFIG_THUMB
3262+        "lsr     %[n1], %[n2]                                 \n\t"
3263+        "add     %[remain], %[n1]                             \n\t"
3264+#else
3265+        "add     %[remain], %[remain], %[n1], lsr %[n2]       \n\t"
3266+#endif
3267+        "3:                                                   \n\t"
3268+        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t"
3269+        "orr     %[range], %[range], %[ptr], lsr #9           \n\t"
3270+        "4:                                                   \n\t"
3271+        "str     %[range], [%[cc], %[low_off]]                \n\t"
3272+        :  // Outputs
3273+            [remain]"=&r"(last_coeff_abs_level_remaining),
3274+              [rice]"+r"(rice_param),
3275+            [prefix]"=&r"(prefix),
3276+                [n1]"=&r"(n1),
3277+             [range]"=&r"(range),
3278+                [n2]"=&r"(n2),
3279+               [ptr]"=&r"(ptr),
3280+              [tmp1]"=&r"(tmp1),
3281+              [tmp2]"=&r"(tmp2)
3282+        :  // Inputs
3283+                          [cc]"r"(c),
3284+            [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
3285+                     [low_off]"J"(offsetof(CABACContext, low)),
3286+                   [range_off]"J"(offsetof(CABACContext, range)),
3287+               [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
3288+              [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
3289+                     [ptr_off]"J"(offsetof(CABACContext, bytestream))
3290+        :  // Clobbers
3291+           "cc", "memory"
3292+    );
3293+    return last_coeff_abs_level_remaining;
3294+}
3295+
3296+#endif /* HAVE_ARMV6T2_INLINE */
3297+
3298+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
3299--- /dev/null
3300+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
3301@@ -0,0 +1,183 @@
3302+/*
3303+ * ARM NEON optimised IDCT functions for HEVC decoding
3304+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
3305+ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading)
3306+ *
3307+ * This file is part of FFmpeg.
3308+ *
3309+ * FFmpeg is free software; you can redistribute it and/or
3310+ * modify it under the terms of the GNU Lesser General Public
3311+ * License as published by the Free Software Foundation; either
3312+ * version 2.1 of the License, or (at your option) any later version.
3313+ *
3314+ * FFmpeg is distributed in the hope that it will be useful,
3315+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3316+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3317+ * Lesser General Public License for more details.
3318+ *
3319+ * You should have received a copy of the GNU Lesser General Public
3320+ * License along with FFmpeg; if not, write to the Free Software
3321+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3322+ */
3323+
3324+@ Included multiple times from hevc_idct_neon.S
3325+@ Macros defined there
3326+
3327+#define DC_SHIFT  (15 - BIT_DEPTH)
3328+#define DC_ADD    (1 | (1 << (14 - BIT_DEPTH)))
3329+#define TRN_SHIFT (20 - BIT_DEPTH)
3330+
3331+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1
3332+        ldrsh       r1, [r0]
3333+        add         r1, #DC_ADD
3334+        asr         r1, #DC_SHIFT
3335+        vdup.16     q0, r1
3336+        vdup.16     q1, r1
3337+        vst1.16     {q0, q1}, [r0]
3338+        bx lr
3339+endfunc
3340+
3341+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1
3342+        ldrsh       r1, [r0]
3343+        add         r2, r0, #32
3344+        mov         r3, #64
3345+        add         r1, #DC_ADD
3346+        asr         r1, #DC_SHIFT
3347+        vdup.16     q8, r1
3348+        vdup.16     q9, r1
3349+        vst1.16     {q8, q9}, [r0], r3
3350+        vst1.16     {q8, q9}, [r2], r3
3351+        vst1.16     {q8, q9}, [r0]
3352+        vst1.16     {q8, q9}, [r2]
3353+        bx lr
3354+endfunc
3355+
3356+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1
3357+        ldrsh       r1, [r0]
3358+        add         r2, r0, #32
3359+        mov         r3, #64
3360+        add         r1, #DC_ADD
3361+        mov         ip, #16*16
3362+        asr         r1, #DC_SHIFT
3363+        vdup.16     q8, r1
3364+        vdup.16     q9, r1
3365+1:      vst1.16     {q8, q9}, [r0], r3
3366+        subs        ip, ip, #32
3367+        vst1.16     {q8, q9}, [r2], r3
3368+        bhi         1b
3369+        bx lr
3370+endfunc
3371+
3372+function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1
3373+        ldrsh       r1, [r0]
3374+        add         r2, r0, #32
3375+        mov         r3, #64
3376+        add         r1, #DC_ADD
3377+        mov         ip, #32*32
3378+        asr         r1, #DC_SHIFT
3379+        vdup.16     q8, r1
3380+        vdup.16     q9, r1
3381+1:      vst1.16     {q8, q9}, [r0], r3
3382+        subs        ip, ip, #32
3383+        vst1.16     {q8, q9}, [r2], r3
3384+        bhi         1b
3385+        bx lr
3386+endfunc
3387+
3388+
3389+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1
3390+        vldr.i32    s0, =0x00240053 // 36 and 83
3391+        vld1.16     {q14, q15}, [r0 :256]  // coeffs
3392+
3393+        tr4_shift   #7
3394+
3395+        vzip.16     d28, d29
3396+        vzip.16     d30, d31
3397+        vzip.32     q14, q15
3398+
3399+        tr4_shift   #TRN_SHIFT
3400+
3401+        vst4.16     {q14, q15}, [r0 :256]
3402+        bx lr
3403+
3404+        .ltorg
3405+endfunc
3406+
3407+
3408+
3409+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1
3410+        vmov.i32    d0, #0x4a  // 74
3411+        vld1.16     {q14, q15}, [r0 :256]  // coeffs
3412+        vmov.i32    d1, #0x1d  // 29
3413+        vmov.i32    d2, #0x37  // 55
3414+
3415+        tr4_luma_shift #7
3416+
3417+        vzip.16     d28, d29
3418+        vzip.16     d30, d31
3419+        vzip.32     q14, q15
3420+
3421+        tr4_luma_shift #TRN_SHIFT
3422+
3423+        vst4.16     {q14, q15}, [r0 :256]
3424+        bx lr
3425+endfunc
3426+
3427+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1
3428+        add      r2, r0, #16
3429+        adr      r3, tr4f
3430+        vpush    {d8-d15}
3431+        vld1.16  {d0, d1}, [r3]
3432+        mov      r3, #32
3433+
3434+        tr8_vert  d16, d17, d18, d19, d24, d25, d26, d27, q8,  q9,  \
3435+            "sub      r0, r0, #128-8",                              \
3436+            "sub      r2, r2, #128-8",                              \
3437+            "cmp      r1, #4"
3438+        ble      2f
3439+
3440+        tr8_vert  d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
3441+            "sub      r0, r0, #128+8",                              \
3442+            "sub      r2, r2, #128+8+16-32",                        \
3443+            "mov      r3, #64"
3444+
3445+        vzip.16  d16, d17
3446+        vzip.16  d18, d19
3447+
3448+        vzip.16  d20, d21
3449+        vzip.16  d22, d23
3450+        vzip.16  d28, d29
3451+        vzip.16  d30, d31
3452+        vzip.32  q10, q11
3453+        vzip.32  q14, q15
3454+1:
3455+        vzip.16  d24, d25
3456+        vzip.16  d26, d27
3457+        vzip.32  q8, q9
3458+        vzip.32  q12, q13
3459+
3460+        tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8,  q9,  TRN_SHIFT
3461+        tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
3462+
3463+        vpop     {d8-d15}
3464+        bx       lr
3465+
3466+2:      vmov.i64 q10, #0
3467+        sub      r0, r0, #8
3468+        vmov.i64 q11, #0
3469+        sub      r2, r2, #8+16-32
3470+        vmov.i64 q14, #0
3471+        mov      r3, #64
3472+        vmov.i64 q15, #0
3473+
3474+        vzip.16  d16, d17
3475+        vzip.16  d18, d19
3476+
3477+        b        1b
3478+
3479+endfunc
3480+
3481+#undef DC_SHIFT
3482+#undef DC_ADD
3483+#undef TRN_SHIFT
3484+
3485--- /dev/null
3486+++ b/libavcodec/arm/rpi_hevc_misc_neon.S
3487@@ -0,0 +1,267 @@
3488+/*
3489+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
3490+All rights reserved.
3491+
3492+Redistribution and use in source and binary forms, with or without
3493+modification, are permitted provided that the following conditions are met:
3494+    * Redistributions of source code must retain the above copyright
3495+      notice, this list of conditions and the following disclaimer.
3496+    * Redistributions in binary form must reproduce the above copyright
3497+      notice, this list of conditions and the following disclaimer in the
3498+      documentation and/or other materials provided with the distribution.
3499+    * Neither the name of the copyright holder nor the
3500+      names of its contributors may be used to endorse or promote products
3501+      derived from this software without specific prior written permission.
3502+
3503+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
3504+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3505+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
3506+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
3507+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
3508+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
3509+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
3510+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3511+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3512+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3513+
3514+Written by John Cox, Ben Avison
3515+*/
3516+
3517+#include "libavutil/arm/asm.S"
3518+#include "neon.S"
3519+
3520+@ rpi_zap_coeff_vals_neon(
3521+@   uint16_t * buf,          [r0]
3522+@   unsigned int log_n_m2)   [r1]
3523+
3524+function rpi_zap_coeff_vals_neon, export=1
3525+        mov      ip, #1
3526+        vmov.i64 q0, #0
3527+        teq      r1, #0
3528+        vmov.i64 q1, #0
3529+        beq      2f
3530+
3531+        lsl      ip, r1    @ 2, 4 or 8
3532+        add      r2, r0, #32
3533+        lsl      ip, r1    @ 4, 16 or 64 = number of 32-byte blocks to zero
3534+        mov      r3, #64
3535+1:      vst1.8   {q0,q1}, [r0:256], r3
3536+        subs     ip, #2
3537+        vst1.8   {q0,q1}, [r2:256], r3
3538+        bne      1b
3539+        bx       lr
3540+
3541+2:      vst1.8   {q0,q1}, [r0:256]
3542+        bx       lr
3543+endfunc
3544+
3545+@ PIC jump tables are more expensive than absolute for A32 code
3546+.set jent_pic, CONFIG_PIC || CONFIG_THUMB
3547+
3548+@ Jump table entry - if in Thumb mode the bottom bit must be set
3549+@ ? There is probably a real asm instruction to do this but I haven't found it
3550+.macro jent lab
3551+.if jent_pic
3552+T       .short ((0 + \lab) - (0 + 98b)) / 2
3553+A       .short (0 + \lab) - (4 + 98b)
3554+.else
3555+T       .word   1 + \lab
3556+A       .word   \lab
3557+.endif
3558+.endm
3559+
3560+.set expected_next, 0
3561+
3562+.macro cpy_compound val, p1, p2, drop_thru=0
3563+.if \p1 + \p2 != \val
3564+.error "Bad addition!  \p1 + \p2 != \val"
3565+.endif
3566+.if expected_next != 0 && expected_next != \val
3567+.error "Drop thru failure"
3568+.endif
3569+\val\():
3570+        push       {r0-r3}
3571+        bl          100\p1\()b
3572+        pop        {r0-r3}
3573+        add         r0, #\p1
3574+        add         r2, #\p1
3575+.if \drop_thru == 0
3576+        b           \p2\()b
3577+.set expected_next, 0
3578+.else
3579+.set expected_next, \p2
3580+.endif
3581+.endm
3582+
3583+@ ff_hevc_rpi_cpy_blks8x4_neon(
3584+@   dst         [r0]
3585+@   dst_stride  [r1]
3586+@   src         [r2]
3587+@   src_stride  [r3]
3588+@   width       [sp, #0] (bytes)
3589+@   height)     [sp, #4]
3590+@
3591+@ Power of 2 widths are directly coded, all others are done in stripes
3592+@ We expect the vast majority of calls to be power of 2
3593+@
3594+@ Currently has min width of 8, but we could make that 4 without issue
3595+@ Min height is 4
3596+
3597+function ff_hevc_rpi_cpy_blks8x4_neon, export=1
3598+        ldr         r12, [sp, #0]
3599+        push       {r11, lr}
3600+.if jent_pic
3601+A       adr         lr,  98f - 2
3602+.else
3603+A       adr         lr,  98f - 4
3604+.endif
3605+        lsr         r12, #3
3606+        ldr         r11, [sp, #(8 + 4)]
3607+.if jent_pic
3608+A       lsl         r12, #1
3609+A       ldrsh       lr,  [lr,  r12]
3610+A       add         pc,  lr
3611+T       tbh         [pc, r12, lsl #1]
3612+.else
3613+        @ A32 only, Thumb is always PIC
3614+        ldr         pc,  [lr,  r12, lsl #2]
3615+.endif
3616+
3617+98:
3618+T       .short      0 @ unused
3619+        jent        8f
3620+        jent        16f
3621+        jent        24f
3622+        jent        32f
3623+        jent        40f
3624+        jent        48f
3625+        jent        56f
3626+        jent        64f
3627+        jent        72f
3628+        jent        80f
3629+        jent        88f
3630+        jent        96f
3631+        jent        104f
3632+        jent        112f
3633+        jent        120f
3634+        jent        128f
3635+
3636+1008:
3637+        push       {r11, lr}
3638+8:
3639+        add         lr,  r2,  r3
3640+        lsl         r3,  #1
3641+        add         r12, r0,  r1
3642+        lsl         r1,  #1
3643+1:
3644+        vld1.32    {d0 }, [r2],  r3
3645+        vld1.32    {d1 }, [lr],  r3
3646+        vld1.32    {d2 }, [r2],  r3
3647+        vld1.32    {d3 }, [lr],  r3
3648+        subs        r11,  #4
3649+        vst1.32    {d0 }, [r0],  r1
3650+        vst1.32    {d1 }, [r12], r1
3651+        vst1.32    {d2 }, [r0],  r1
3652+        vst1.32    {d3 }, [r12], r1
3653+        bgt         1b
3654+        pop        {r11, pc}
3655+
3656+10016:
3657+        push       {r11, lr}
3658+16:
3659+        add         lr,  r2,  r3
3660+        lsl         r3,  #1
3661+        add         r12, r0,  r1
3662+        lsl         r1,  #1
3663+1:
3664+        vld1.32    {q0 }, [r2],  r3
3665+        vld1.32    {q1 }, [lr],  r3
3666+        vld1.32    {q2 }, [r2],  r3
3667+        vld1.32    {q3 }, [lr],  r3
3668+        subs        r11, #4
3669+        vst1.32    {q0 }, [r0],  r1
3670+        vst1.32    {q1 }, [r12], r1
3671+        vst1.32    {q2 }, [r0],  r1
3672+        vst1.32    {q3 }, [r12], r1
3673+        bgt         1b
3674+        pop        {r11, pc}
3675+
3676+10032:
3677+        push       {r11, lr}
3678+32:
3679+        add         lr,  r2,  r3
3680+        lsl         r3,  #1
3681+        add         r12, r0,  r1
3682+        lsl         r1,  #1
3683+1:
3684+        vld1.32    {q8,  q9 }, [r2],  r3
3685+        vld1.32    {q10, q11}, [lr],  r3
3686+        vld1.32    {q12, q13}, [r2],  r3
3687+        vld1.32    {q14, q15}, [lr],  r3
3688+        subs        r11, #4
3689+        vst1.32    {q8,  q9 }, [r0],  r1
3690+        vst1.32    {q10, q11}, [r12], r1
3691+        vst1.32    {q12, q13}, [r0],  r1
3692+        vst1.32    {q14, q15}, [r12], r1
3693+        bgt         1b
3694+        pop        {r11, pc}
3695+
3696+10064:
3697+        push       {r11, lr}
3698+64:
3699+        add         lr,  r2,  #32
3700+        add         r12, r0,  #32
3701+1:
3702+        vld1.32    {q8,  q9 }, [r2],  r3
3703+        vld1.32    {q10, q11}, [lr],  r3
3704+        vld1.32    {q12, q13}, [r2],  r3
3705+        vld1.32    {q14, q15}, [lr],  r3
3706+        subs        r11, #2
3707+        vst1.32    {q8,  q9 }, [r0],  r1
3708+        vst1.32    {q10, q11}, [r12], r1
3709+        vst1.32    {q12, q13}, [r0],  r1
3710+        vst1.32    {q14, q15}, [r12], r1
3711+        bgt         1b
3712+        pop        {r11, pc}
3713+
3714+128:
3715+        push       {r4, r5}
3716+        @ We could do this with fewer registers if we jump around but I
3717+        @ have a primitive urge to load sequentially
3718+        mov         r4,  #64
3719+        add         lr,  r2,  #32
3720+        add         r12, r0,  #32
3721+        sub         r3,  r4
3722+        sub         r1,  r4
3723+1:
3724+        vld1.32    {q8,  q9 }, [r2],  r4
3725+        vld1.32    {q10, q11}, [lr],  r4
3726+        vld1.32    {q12, q13}, [r2],  r3
3727+        vld1.32    {q14, q15}, [lr],  r3
3728+        subs        r11, #1
3729+        vst1.32    {q8,  q9 }, [r0],  r4
3730+        vst1.32    {q10, q11}, [r12], r4
3731+        vst1.32    {q12, q13}, [r0],  r1
3732+        vst1.32    {q14, q15}, [r12], r1
3733+        bgt         1b
3734+        pop        {r4, r5, r11, pc}
3735+
3736+@ Use drop_thru where we can
3737+cpy_compound 104, 64, 40, 1
3738+cpy_compound 40, 32, 8
3739+
3740+cpy_compound 112, 64, 48, 1
3741+cpy_compound 48, 32, 16
3742+
3743+cpy_compound 120, 64, 56, 1
3744+cpy_compound 56, 32, 24, 1
3745+cpy_compound 24, 16, 8
3746+
3747+cpy_compound 72, 64, 8
3748+cpy_compound 80, 64, 16
3749+cpy_compound 88, 64, 24
3750+cpy_compound 96, 64, 32
3751+
3752+
3753+endfunc
3754+
3755--- /dev/null
3756+++ b/libavcodec/arm/rpi_hevc_misc_neon.h
3757@@ -0,0 +1,438 @@
3758+/*
3759+ * This file is part of FFmpeg.
3760+ *
3761+ * FFmpeg is free software; you can redistribute it and/or
3762+ * modify it under the terms of the GNU Lesser General Public
3763+ * License as published by the Free Software Foundation; either
3764+ * version 2.1 of the License, or (at your option) any later version.
3765+ *
3766+ * FFmpeg is distributed in the hope that it will be useful,
3767+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3768+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3769+ * Lesser General Public License for more details.
3770+ *
3771+ * You should have received a copy of the GNU Lesser General Public
3772+ * License along with FFmpeg; if not, write to the Free Software
3773+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
3774+ */
3775+
3776+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
3777+#define AVCODEC_ARM_RPI_HEVC_MISC_H
3778+
3779+#include "config.h"
3780+#if HAVE_NEON_INLINE && !CONFIG_THUMB
3781+
3782+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
3783+                                                       int pixel_shift, int height,
3784+                                                       ptrdiff_t stride_src)
3785+{
3786+    const uint8_t *src2 = src + stride_src;
3787+    stride_src <<= 1;
3788+    switch (pixel_shift)
3789+    {
3790+        case 2:
3791+            __asm__ volatile (
3792+                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
3793+                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
3794+                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
3795+                "subs        %[height], #4                     \n\t"
3796+                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
3797+                "beq         2f                                \n\t"
3798+                "1:                                            \n\t"
3799+                "vld1.32     {d2[0]}, [%[src]], %[stride_src]  \n\t"
3800+                "vld1.32     {d2[1]}, [%[src2]], %[stride_src] \n\t"
3801+                "vld1.32     {d3[0]}, [%[src]], %[stride_src]  \n\t"
3802+                "vld1.32     {d3[1]}, [%[src2]], %[stride_src] \n\t"
3803+                "subs        %[height], #4                     \n\t"
3804+                "vst1.32     {q0}, [%[dst]]!                   \n\t"
3805+                "beq         3f                                \n\t"
3806+                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
3807+                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
3808+                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
3809+                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
3810+                "subs        %[height], #4                     \n\t"
3811+                "vst1.32     {q1}, [%[dst]]!                   \n\t"
3812+                "bne         1b                                \n\t"
3813+                "2:                                            \n\t"
3814+                "vst1.32     {q0}, [%[dst]]                    \n\t"
3815+                "b           4f                                \n\t"
3816+                "3:                                            \n\t"
3817+                "vst1.32     {q1}, [%[dst]]                    \n\t"
3818+                "4:                                            \n\t"
3819+                :  // Outputs
3820+                           [src]"+r"(src),
3821+                          [src2]"+r"(src2),
3822+                           [dst]"+r"(dst),
3823+                        [height]"+r"(height)
3824+                :  // Inputs
3825+                    [stride_src]"r"(stride_src)
3826+                :  // Clobbers
3827+                    "cc", "memory"
3828+            );
3829+            break;
3830+        case 1:
3831+            __asm__ volatile (
3832+                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
3833+                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
3834+                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
3835+                "subs        %[height], #4                     \n\t"
3836+                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
3837+                "beq         2f                                \n\t"
3838+                "1:                                            \n\t"
3839+                "vld1.16     {d2[0]}, [%[src]], %[stride_src]  \n\t"
3840+                "vld1.16     {d3[0]}, [%[src2]], %[stride_src] \n\t"
3841+                "vld1.16     {d2[1]}, [%[src]], %[stride_src]  \n\t"
3842+                "vld1.16     {d3[1]}, [%[src2]], %[stride_src] \n\t"
3843+                "vzip.16     d0, d1                            \n\t"
3844+                "subs        %[height], #4                     \n\t"
3845+                "vst1.16     {d0}, [%[dst]]!                   \n\t"
3846+                "beq         3f                                \n\t"
3847+                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
3848+                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
3849+                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
3850+                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
3851+                "vzip.16     d2, d3                            \n\t"
3852+                "subs        %[height], #4                     \n\t"
3853+                "vst1.16     {d2}, [%[dst]]!                   \n\t"
3854+                "bne         1b                                \n\t"
3855+                "2:                                            \n\t"
3856+                "vzip.16     d0, d1                            \n\t"
3857+                "vst1.16     {d0}, [%[dst]]                    \n\t"
3858+                "b           4f                                \n\t"
3859+                "3:                                            \n\t"
3860+                "vzip.16     d2, d3                            \n\t"
3861+                "vst1.16     {d2}, [%[dst]]                    \n\t"
3862+                "4:                                            \n\t"
3863+                :  // Outputs
3864+                           [src]"+r"(src),
3865+                          [src2]"+r"(src2),
3866+                           [dst]"+r"(dst),
3867+                        [height]"+r"(height)
3868+                :  // Inputs
3869+                    [stride_src]"r"(stride_src)
3870+                :  // Clobbers
3871+                    "cc", "memory"
3872+            );
3873+            break;
3874+        default:
3875+            __asm__ volatile (
3876+                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
3877+                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
3878+                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
3879+                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
3880+                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
3881+                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
3882+                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
3883+                "subs        %[height], #8                     \n\t"
3884+                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
3885+                "beq         2f                                \n\t"
3886+                "1:                                            \n\t"
3887+                "vld1.8      {d2[0]}, [%[src]], %[stride_src]  \n\t"
3888+                "vld1.8      {d3[0]}, [%[src2]], %[stride_src] \n\t"
3889+                "vld1.8      {d2[1]}, [%[src]], %[stride_src]  \n\t"
3890+                "vld1.8      {d3[1]}, [%[src2]], %[stride_src] \n\t"
3891+                "vld1.8      {d2[2]}, [%[src]], %[stride_src]  \n\t"
3892+                "vld1.8      {d3[2]}, [%[src2]], %[stride_src] \n\t"
3893+                "vld1.8      {d2[3]}, [%[src]], %[stride_src]  \n\t"
3894+                "vld1.8      {d3[3]}, [%[src2]], %[stride_src] \n\t"
3895+                "vzip.8      d0, d1                            \n\t"
3896+                "subs        %[height], #8                     \n\t"
3897+                "vst1.8      {d0}, [%[dst]]!                   \n\t"
3898+                "beq         3f                                \n\t"
3899+                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
3900+                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
3901+                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
3902+                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
3903+                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
3904+                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
3905+                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
3906+                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
3907+                "vzip.8      d2, d3                            \n\t"
3908+                "subs        %[height], #8                     \n\t"
3909+                "vst1.8      {d2}, [%[dst]]!                   \n\t"
3910+                "bne         1b                                \n\t"
3911+                "2:                                            \n\t"
3912+                "vzip.8      d0, d1                            \n\t"
3913+                "vst1.8      {d0}, [%[dst]]                    \n\t"
3914+                "b           4f                                \n\t"
3915+                "3:                                            \n\t"
3916+                "vzip.8      d2, d3                            \n\t"
3917+                "vst1.8      {d2}, [%[dst]]                    \n\t"
3918+                "4:                                            \n\t"
3919+                :  // Outputs
3920+                           [src]"+r"(src),
3921+                          [src2]"+r"(src2),
3922+                           [dst]"+r"(dst),
3923+                        [height]"+r"(height)
3924+                :  // Inputs
3925+                    [stride_src]"r"(stride_src)
3926+                :  // Clobbers
3927+                    "cc", "memory"
3928+            );
3929+            break;
3930+    }
3931+}
3932+
3933+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
3934+                                                       int pixel_shift, int height,
3935+                                                      ptrdiff_t stride_dst)
3936+{
3937+    uint8_t *dst2 = dst + stride_dst;
3938+    stride_dst <<= 1;
3939+    switch (pixel_shift)
3940+    {
3941+        case 2:
3942+            __asm__ volatile (
3943+                "subs        %[height], #4                     \n\t"
3944+                "vld1.32     {q0}, [%[src]]!                   \n\t"
3945+                "beq         2f                                \n\t"
3946+                "1:                                            \n\t"
3947+                "vld1.32     {q1}, [%[src]]!                   \n\t"
3948+                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
3949+                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
3950+                "vst1.32     {d1[0]}, [%[dst]], %[stride_dst]  \n\t"
3951+                "subs        %[height], #4                     \n\t"
3952+                "vst1.32     {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
3953+                "beq         3f                                \n\t"
3954+                "vld1.32     {q0}, [%[src]]!                   \n\t"
3955+                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
3956+                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
3957+                "vst1.32     {d3[0]}, [%[dst]], %[stride_dst]  \n\t"
3958+                "subs        %[height], #4                     \n\t"
3959+                "vst1.32     {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
3960+                "bne         1b                                \n\t"
3961+                "2:                                            \n\t"
3962+                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
3963+                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
3964+                "vst1.32     {d1[0]}, [%[dst]]                 \n\t"
3965+                "vst1.32     {d1[1]}, [%[dst2]]                \n\t"
3966+                "b           4f                                \n\t"
3967+                "3:                                            \n\t"
3968+                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
3969+                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
3970+                "vst1.32     {d3[0]}, [%[dst]]                 \n\t"
3971+                "vst1.32     {d3[1]}, [%[dst2]]                \n\t"
3972+                "4:                                            \n\t"
3973+                :  // Outputs
3974+                           [dst]"+r"(dst),
3975+                          [dst2]"+r"(dst2),
3976+                           [src]"+r"(src),
3977+                        [height]"+r"(height)
3978+                :  // Inputs
3979+                    [stride_dst]"r"(stride_dst)
3980+                :  // Clobbers
3981+                    "cc", "memory"
3982+            );
3983+            break;
3984+        case 1:
3985+            __asm__ volatile (
3986+                "subs        %[height], #4                     \n\t"
3987+                "vld1.16     {d0}, [%[src]]!                   \n\t"
3988+                "beq         2f                                \n\t"
3989+                "1:                                            \n\t"
3990+                "vld1.16     {d2}, [%[src]]!                   \n\t"
3991+                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
3992+                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
3993+                "vst1.16     {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
3994+                "subs        %[height], #4                     \n\t"
3995+                "vst1.16     {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
3996+                "beq         3f                                \n\t"
3997+                "vld1.16     {d0}, [%[src]]!                   \n\t"
3998+                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
3999+                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
4000+                "vst1.16     {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
4001+                "subs        %[height], #4                     \n\t"
4002+                "vst1.16     {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
4003+                "bne         1b                                \n\t"
4004+                "2:                                            \n\t"
4005+                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
4006+                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
4007+                "vst1.16     {d0[2]}, [%[dst]]                 \n\t"
4008+                "vst1.16     {d0[3]}, [%[dst2]]                \n\t"
4009+                "b           4f                                \n\t"
4010+                "3:                                            \n\t"
4011+                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
4012+                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
4013+                "vst1.16     {d2[2]}, [%[dst]]                 \n\t"
4014+                "vst1.16     {d2[3]}, [%[dst2]]                \n\t"
4015+                "4:                                            \n\t"
4016+                :  // Outputs
4017+                           [dst]"+r"(dst),
4018+                          [dst2]"+r"(dst2),
4019+                           [src]"+r"(src),
4020+                        [height]"+r"(height)
4021+                :  // Inputs
4022+                    [stride_dst]"r"(stride_dst)
4023+                :  // Clobbers
4024+                    "cc", "memory"
4025+            );
4026+            break;
4027+        default:
4028+            __asm__ volatile (
4029+                "subs        %[height], #8                     \n\t"
4030+                "vld1.8      {d0}, [%[src]]!                   \n\t"
4031+                "beq         2f                                \n\t"
4032+                "1:                                            \n\t"
4033+                "vld1.8      {d2}, [%[src]]!                   \n\t"
4034+                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
4035+                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
4036+                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
4037+                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
4038+                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
4039+                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
4040+                "vst1.8      {d0[6]}, [%[dst]], %[stride_dst]  \n\t"
4041+                "subs        %[height], #8                     \n\t"
4042+                "vst1.8      {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
4043+                "beq         3f                                \n\t"
4044+                "vld1.8      {d0}, [%[src]]!                   \n\t"
4045+                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
4046+                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
4047+                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
4048+                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
4049+                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
4050+                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
4051+                "vst1.8      {d2[6]}, [%[dst]], %[stride_dst]  \n\t"
4052+                "subs        %[height], #8                     \n\t"
4053+                "vst1.8      {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
4054+                "bne         1b                                \n\t"
4055+                "2:                                            \n\t"
4056+                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
4057+                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
4058+                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
4059+                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
4060+                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
4061+                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
4062+                "vst1.8      {d0[6]}, [%[dst]]                 \n\t"
4063+                "vst1.8      {d0[7]}, [%[dst2]]                \n\t"
4064+                "b           4f                                \n\t"
4065+                "3:                                            \n\t"
4066+                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
4067+                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
4068+                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
4069+                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
4070+                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
4071+                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
4072+                "vst1.8      {d2[6]}, [%[dst]]                 \n\t"
4073+                "vst1.8      {d2[7]}, [%[dst2]]                \n\t"
4074+                "4:                                            \n\t"
4075+                :  // Outputs
4076+                           [dst]"+r"(dst),
4077+                          [dst2]"+r"(dst2),
4078+                           [src]"+r"(src),
4079+                        [height]"+r"(height)
4080+                :  // Inputs
4081+                    [stride_dst]"r"(stride_dst)
4082+                :  // Clobbers
4083+                    "cc", "memory"
4084+            );
4085+            break;
4086+    }
4087+}
4088+
4089+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,
4090+                                                       int pixel_shift, int height,
4091+                                                       ptrdiff_t stride_dst, ptrdiff_t stride_src)
4092+{
4093+    int x, y;
4094+    switch (pixel_shift)
4095+    {
4096+        case 2:
4097+            __asm__ volatile (
4098+                "ldr         %[x], [%[src]], %[stride_src] \n\t"
4099+                "ldr         %[y], [%[src]], %[stride_src] \n\t"
4100+                "str         %[x], [%[dst]], %[stride_dst] \n\t"
4101+                "sub         %[height], #2                 \n\t"
4102+                "1:                                        \n\t"
4103+                "ldr         %[x], [%[src]], %[stride_src] \n\t"
4104+                "str         %[y], [%[dst]], %[stride_dst] \n\t"
4105+                "ldr         %[y], [%[src]], %[stride_src] \n\t"
4106+                "subs        %[height], #2                 \n\t"
4107+                "str         %[x], [%[dst]], %[stride_dst] \n\t"
4108+                "bne         1b                            \n\t"
4109+                "str         %[y], [%[dst]]                \n\t"
4110+                :  // Outputs
4111+                             [x]"=&r"(x),
4112+                             [y]"=&r"(y),
4113+                           [src]"+r"(src),
4114+                           [dst]"+r"(dst),
4115+                        [height]"+r"(height)
4116+                :  // Inputs
4117+                    [stride_src]"r"(stride_src),
4118+                    [stride_dst]"r"(stride_dst)
4119+                :  // Clobbers
4120+                    "cc", "memory"
4121+            );
4122+            break;
4123+        case 1:
4124+            __asm__ volatile (
4125+                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
4126+                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
4127+                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
4128+                "sub         %[height], #2                 \n\t"
4129+                "1:                                        \n\t"
4130+                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
4131+                "strh        %[y], [%[dst]], %[stride_dst] \n\t"
4132+                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
4133+                "subs        %[height], #2                 \n\t"
4134+                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
4135+                "bne         1b                            \n\t"
4136+                "strh        %[y], [%[dst]]                \n\t"
4137+                :  // Outputs
4138+                             [x]"=&r"(x),
4139+                             [y]"=&r"(y),
4140+                           [src]"+r"(src),
4141+                           [dst]"+r"(dst),
4142+                        [height]"+r"(height)
4143+                :  // Inputs
4144+                    [stride_src]"r"(stride_src),
4145+                    [stride_dst]"r"(stride_dst)
4146+                :  // Clobbers
4147+                    "cc", "memory"
4148+            );
4149+            break;
4150+        default:
4151+            __asm__ volatile (
4152+                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
4153+                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
4154+                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
4155+                "sub         %[height], #2                 \n\t"
4156+                "1:                                        \n\t"
4157+                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
4158+                "strb        %[y], [%[dst]], %[stride_dst] \n\t"
4159+                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
4160+                "subs        %[height], #2                 \n\t"
4161+                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
4162+                "bne         1b                            \n\t"
4163+                "strb        %[y], [%[dst]]                \n\t"
4164+                :  // Outputs
4165+                             [x]"=&r"(x),
4166+                             [y]"=&r"(y),
4167+                           [src]"+r"(src),
4168+                           [dst]"+r"(dst),
4169+                        [height]"+r"(height)
4170+                :  // Inputs
4171+                    [stride_src]"r"(stride_src),
4172+                    [stride_dst]"r"(stride_dst)
4173+                :  // Clobbers
4174+                    "cc", "memory"
4175+            );
4176+            break;
4177+    }
4178+}
4179+
4180+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
4181+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,
4182+                                              int pixel_shift, int height,
4183+                                              ptrdiff_t stride_dst, ptrdiff_t stride_src)
4184+{
4185+    if (stride_dst == 1 << pixel_shift)
4186+        ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
4187+    else if (stride_src == 1 << pixel_shift)
4188+        ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
4189+    else
4190+        ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
4191+}
4192+
4193+#endif /* HAVE_NEON_INLINE */
4194+
4195+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
4196--- /dev/null
4197+++ b/libavcodec/arm/rpi_hevc_mv_arm.h
4198@@ -0,0 +1,93 @@
4199+/*
4200+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
4201+All rights reserved.
4202+
4203+Redistribution and use in source and binary forms, with or without
4204+modification, are permitted provided that the following conditions are met:
4205+    * Redistributions of source code must retain the above copyright
4206+      notice, this list of conditions and the following disclaimer.
4207+    * Redistributions in binary form must reproduce the above copyright
4208+      notice, this list of conditions and the following disclaimer in the
4209+      documentation and/or other materials provided with the distribution.
4210+    * Neither the name of the copyright holder nor the
4211+      names of its contributors may be used to endorse or promote products
4212+      derived from this software without specific prior written permission.
4213+
4214+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
4215+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
4216+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
4217+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
4218+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
4219+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
4220+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
4221+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
4222+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4223+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4224+
4225+Written by John Cox, Ben Avison
4226+*/
4227+
4228+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
4229+#define AVCODEC_ARM_RPI_HEVC_MV_H
4230+
4231+#if HAVE_ARMV6T2_INLINE
4232+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b)
4233+{
4234+    MvXY r;
4235+    __asm__ (
4236+        "sadd16    %[r], %[a], %[b]        \n\t"
4237+        : [r]"=r"(r)
4238+        : [a]"r"(a),
4239+          [b]"r"(b)
4240+        :
4241+        );
4242+    return r;
4243+}
4244+#define mvxy_add mvxy_add_arm
4245+#endif
4246+
4247+#if HAVE_ARMV6T2_INLINE
4248+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
4249+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb)
4250+{
4251+    int t;
4252+    __asm__ (
4253+    "ssat   %[td], #8,    %[td]          \n\t"
4254+    "ssat   %[tb], #8,    %[tb]          \n\t"
4255+    "eor    %[t],  %[td], %[td], asr #31 \n\t"
4256+    "adds   %[t],  %[t],  %[td], lsr #31 \n\t"
4257+    "asr    %[t],  #1                    \n\t"
4258+    "add    %[t],  #0x4000               \n\t"
4259+    "it ne                               \n\t"
4260+    "sdivne %[t],  %[t],  %[td]          \n\t"
4261+    "mov    %[td], #32                   \n\t"
4262+    "smlabb %[td], %[t],  %[tb], %[td]   \n\t"
4263+    "ssat   %[td], #13,   %[td], asr #6  \n\t"
4264+    "mov    %[tb], #127                  \n\t"
4265+    "smlatb %[t],  %[xy], %[td], %[tb]   \n\t"
4266+    "smlabb %[tb], %[xy], %[td], %[tb]   \n\t"
4267+// This takes the sign of x & y for rounding at the "wrong" point
4268+// (i.e. after adding 127) but for the range of values (-1,-127)
4269+// where it does the wrong thing you get the right answer (0) anyway
4270+    "add    %[t],  %[t],  %[t],  lsr #31 \n\t"
4271+    "add    %[xy], %[tb], %[tb], lsr #31 \n\t"
4272+    "ssat   %[t],  #16,   %[t],  asr #8  \n\t"
4273+    "ssat   %[xy], #16,   %[xy], asr #8  \n\t"
4274+    "pkhbt  %[xy], %[xy], %[t],  lsl #16 \n\t"
4275+    :
4276+         [t]"=&r"(t),
4277+        [xy]"+r"(xy),
4278+        [td]"+r"(td),
4279+        [tb]"+r"(tb)
4280+    :
4281+    :
4282+        "cc"
4283+    );
4284+    return xy;
4285+}
4286+#define mv_scale_xy mv_scale_xy_arm
4287+#endif
4288+#endif
4289+
4290+#endif // AVCODEC_ARM_RPI_HEVC_MV_H
4291+
4292--- /dev/null
4293+++ b/libavcodec/arm/rpi_hevcdsp_arm.h
4294@@ -0,0 +1,26 @@
4295+/*
4296+ * This file is part of FFmpeg.
4297+ *
4298+ * FFmpeg is free software; you can redistribute it and/or
4299+ * modify it under the terms of the GNU Lesser General Public
4300+ * License as published by the Free Software Foundation; either
4301+ * version 2.1 of the License, or (at your option) any later version.
4302+ *
4303+ * FFmpeg is distributed in the hope that it will be useful,
4304+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4305+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4306+ * Lesser General Public License for more details.
4307+ *
4308+ * You should have received a copy of the GNU Lesser General Public
4309+ * License along with FFmpeg; if not, write to the Free Software
4310+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4311+ */
4312+
4313+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
4314+#define AVCODEC_ARM_HEVCDSP_ARM_H
4315+
4316+#include "libavcodec/rpi_hevcdsp.h"
4317+
4318+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);
4319+
4320+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
4321--- /dev/null
4322+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
4323@@ -0,0 +1,1634 @@
4324+/*
4325+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
4326+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
4327+ *
4328+ * This file is part of FFmpeg.
4329+ *
4330+ * FFmpeg is free software; you can redistribute it and/or
4331+ * modify it under the terms of the GNU Lesser General Public
4332+ * License as published by the Free Software Foundation; either
4333+ * version 2.1 of the License, or (at your option) any later version.
4334+ *
4335+ * FFmpeg is distributed in the hope that it will be useful,
4336+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4337+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
4338+ * Lesser General Public License for more details.
4339+ *
4340+ * You should have received a copy of the GNU Lesser General Public
4341+ * License along with FFmpeg; if not, write to the Free Software
4342+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
4343+ */
4344+
4345+
4346+#include "libavutil/arm/asm.S"
4347+#include "neon.S"
4348+
4349+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
4350+        vsubl.u8  q0, \Q0a, \P0a
4351+        vsubl.u8  q1, \P1a, \Q1a
4352+        vdup.16   d4, r2
4353+        \I1
4354+        vshl.i16  q0, #2
4355+        \I2
4356+        vadd.i16  q0, q1
4357+        \I3
4358+        vmovl.u8  q2, d4
4359+        \I4
4360+        vneg.s16  q1, q2
4361+        \I5
4362+        vrshr.s16 q0, #3
4363+        \I6
4364+        \I7
4365+        \I8
4366+        vmin.s16  q0, q2
4367+        vmovl.u8  q2, \Q0a
4368+        vmax.s16  q0, q1
4369+        vaddw.u8  q1, q0, \P0a
4370+        vsub.i16  q0, q2, q0
4371+        vqmovun.s16 \P0a, q1
4372+        vqmovun.s16 \Q0a, q0
4373+.endm
4374+
4375+
4376+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
4377+        vsubl.u8  q0, \Q0a, \P0a  @ q0a - p0a
4378+        lsr       r12, r2, #16
4379+        vsubl.u8  q1, \Q0b, \P0b  @ q0b - p0b
4380+        vsubl.u8  q2, \P1a, \Q1a  @ p1a - q1a
4381+        vsubl.u8  q3, \P1b, \Q1b  @ p1b - q1b
4382+        vshl.i16  q0, #2          @ (q0a - p0a) * 4
4383+        vshl.i16  q1, #2          @ (q0b - p0b) * 4
4384+        vadd.i16  q0, q2          @ ((q0a - p0a) * 4) + p1a - q1a
4385+        vadd.i16  q1, q3          @ ((q0b - p0b) * 4) + p1b - q1b
4386+        vdup.16   d4, r2          @ tc0a, tc0b
4387+        vdup.16   d6, r12         @ tc1a, tc1b
4388+        vrshr.s16 q0, #3          @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
4389+        \I1
4390+        vrshr.s16 q1, #3          @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
4391+        \I2
4392+        vmovl.u8  q2, d4          @ tc0a, tc0b
4393+        \I3
4394+        vmovl.u8  q3, d6          @ tc1a, tc1b
4395+        \I4
4396+        vmin.s16  q0, q2
4397+        \I5
4398+        vneg.s16  q2, q2          @ -tc0a, -tc0b
4399+        \I6
4400+        vmin.s16  q1, q3
4401+        \I7
4402+        vneg.s16  q3, q3          @ -tc1a, -tc1b
4403+        vmax.s16  q0, q2          @ delta0a
4404+        vmovl.u8  q2, \Q0a
4405+        vmax.s16  q1, q3          @ delta0b
4406+        vaddw.u8  q3, q0, \P0a    @ p0a + delta0a
4407+        vsub.i16  q0, q2, q0      @ q0a - delta0a
4408+        vmovl.u8  q2, \Q0b
4409+        vsub.i16  q2, q1          @ q0b - delta0b
4410+        vaddw.u8  q1, \P0b        @ p0b + delta0b
4411+        vqmovun.s16 \Q0a, q0
4412+        vqmovun.s16 \P0a, q3
4413+        vqmovun.s16 \Q0b, q2
4414+        vqmovun.s16 \P0b, q1
4415+.endm
4416+
4417+
4418+@ Preserves r12
4419+@ Clobbers r2
4420+@ P0a et al all contain UVUVUVUV
4421+@ r2 (tc4) contains
4422+@   [0..7]   tc U a
4423+@   [8..15]  tc V a
4424+
4425+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
4426+        vsub.i16  q0, \Q0a, \P0a
4427+        vsub.i16  q1, \P1a, \Q1a
4428+        vdup.16   d4, r2
4429+        \I1
4430+        vshl.i16  q0, #2
4431+        \I2
4432+        vadd.i16  q0, q1
4433+        \I3
4434+        vshll.u8  q2, d4, #\bit_depth - 8
4435+        \I4
4436+        vneg.s16  q1, q2
4437+        \I5
4438+        vrshr.s16 q0, #3
4439+        \I6
4440+        \I7
4441+        \I8
4442+        vmin.s16  q0, q2
4443+        vmov.i16  q2, #0
4444+        vmax.s16  q0, q1
4445+        vadd.i16  \P0a, q0
4446+        vsub.i16  \Q0a, q0
4447+        vmov.i16  q1, #(1 << \bit_depth) - 1
4448+        vmax.s16  \P0a, q2
4449+        vmax.s16  \Q0a, q2
4450+        vmin.s16  \P0a, q1
4451+        vmin.s16  \Q0a, q1
4452+.endm
4453+
4454+@ Clobbers r2, r12
4455+@ P0a et al all contain UVUVUVUV
4456+@ r2 (tc4) contains
4457+@   [0..7]   tc U a
4458+@   [8..15]  tc V a
4459+@  [16..23]  tc U b
4460+@  [24..31]  tc V b
4461+
4462+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
4463+        vsub.i16  q0, \Q0a, \P0a  @ q0a - p0a
4464+        lsr       r12, r2, #16
4465+        vsub.i16  q1, \Q0b, \P0b  @ q0b - p0b
4466+        vsub.i16  q2, \P1a, \Q1a  @ p1a - q1a
4467+        vsub.i16  q3, \P1b, \Q1b  @ p1b - q1b
4468+        vshl.i16  q0, #2          @ (q0a - p0a) * 4
4469+        vshl.i16  q1, #2          @ (q0b - p0b) * 4
4470+        vadd.i16  q0, q2          @ ((q0a - p0a) * 4) + p1a - q1a
4471+        vadd.i16  q1, q3          @ ((q0b - p0b) * 4) + p1b - q1b
4472+        vdup.16   d4, r2          @ tc0a, tc0b
4473+        vdup.16   d6, r12         @ tc1a, tc1b
4474+        vrshr.s16 q0, #3          @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
4475+        \I1
4476+        vrshr.s16 q1, #3          @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
4477+        \I2
4478+        vshll.u8  q2, d4, #\bit_depth - 8 @ tc0a, tc0b
4479+        \I3
4480+        vshll.u8  q3, d6, #\bit_depth - 8 @ tc1a, tc1b
4481+        \I4
4482+        vmin.s16  q0, q2
4483+        \I5
4484+        vneg.s16  q2, q2          @ -tc0a, -tc0b
4485+        \I6
4486+        vmin.s16  q1, q3
4487+        \I7
4488+        vneg.s16  q3, q3          @ -tc1a, -tc1b
4489+        vmax.s16  q0, q2          @ delta0a
4490+        vadd.i16  \P0a, q0        @ p0a + delta0a
4491+        vsub.i16  \Q0a, q0        @ q0a - delta0a
4492+        vmax.s16  q1, q3          @ delta0b
4493+        vadd.i16  \P0b, q1        @ p0b + delta0b
4494+        vsub.i16  \Q0b, q1        @ q0b - delta0b
4495+        vmov.i16  q2, #0
4496+        vmov.i16  q3, #(1 << \bit_depth) - 1
4497+        vmax.s16  \P0a, q2
4498+        vmax.s16  \Q0a, q2
4499+        vmax.s16  \P0b, q2
4500+        vmax.s16  \Q0b, q2
4501+        vmin.s16  \P0a, q3
4502+        vmin.s16  \Q0a, q3
4503+        vmin.s16  \P0b, q3
4504+        vmin.s16  \Q0b, q3
4505+.endm
4506+
4507+
4508+
4509+@   uint8_t *_no_p,     [sp+0]
4510+@   uint8_t *_no_q)     [sp+4]
4511+
4512+.macro hevc_loop_filter_luma_start
4513+        ldr     r12, [r3]
4514+        ldr      r3, [r3, #4]
4515+        orrs     r3, r12, r3, lsl #16
4516+        it       eq
4517+        bxeq     lr
4518+        push     {r4-r10,lr}            @ 32 bytes
4519+        ldrd     r4, r5, [sp, #32]      @ &_no_p
4520+        ldrb     r4, [r4]
4521+        ldrb     r5, [r5]
4522+        movs     r10, r4
4523+        it ne
4524+        movne    r10, #1
4525+        cmp      r5, #0
4526+        it ne
4527+        orrne    r10, #2
4528+.endm
4529+
4530+@ Input:
4531+@  r2          beta    (raw: needs shift for bitdepth > 8)
4532+@  r3[ 0:15]   tc[0]   (raw: needs shift for bitdepth > 8)
4533+@  r3[16:31]   tc[1]   (raw: needs shift for bitdepth > 8)
4534+@
4535+@ Input & output
4536+@  8-bit: d16-d23      (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
4537+@ 16-bit:  q8-q15
4538+@
4539+@  r1         -r1
4540+@  r10        b1->C, b0->N  (r10 junk)
4541+@
4542+@ Junks:
4543+@  r5, r6, r7, r8, r9
4544+
4545+.macro m_filter_luma bit_depth, Q11, Q15
4546+.if \bit_depth == 8
4547+        vmovl.u8    q14, d22      @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
4548+        vmovl.u8    q13, d21      @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
4549+        vmovl.u8    q12, d20      @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
4550+        vmovl.u8    \Q11, d19     @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
4551+        vmovl.u8    q10, d18      @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
4552+        vmovl.u8    q9, d17       @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
4553+.endif
4554+        vadd.i16    q0, q9, \Q11  @ P2 + P0
4555+.if \bit_depth > 8
4556+        lsl         r3, r3, #(\bit_depth - 8)
4557+.endif
4558+        vadd.i16    q1, q14, q12  @ Q2 + Q0
4559+.if \bit_depth > 8
4560+        lsl         r2, r2, #(\bit_depth - 8)
4561+.endif
4562+        vsub.i16    q0, q10       @ P2 - P1 + P0
4563+        lsr         r5, r3, #16
4564+        vsub.i16    q1, q13       @ Q2 - Q1 + Q0
4565+.if \bit_depth == 8
4566+        vmovl.u8    q8, d16       @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
4567+        vmovl.u8    \Q15, d23     @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
4568+.endif
4569+        vabd.s16    q0, q10       @ dp0 = abs(P2 - 2 * P1 + P0)
4570+        vabd.s16    q1, q13       @ dq0 = abs(Q2 - 2 * Q1 + Q0)
4571+        vmov.i64    q2, #0xffffffff0000
4572+        vbic        q0, q2        @ only dp0(') and dp3(')
4573+        vbic        q1, q2        @ only dq0(') and dq3(')
4574+        vsra.u64    q0, #16
4575+        vsra.u64    q1, #16
4576+        vdup.16     q3, r2        @ beta
4577+        vdup.16     d14, r3       @ tC[0]
4578+        vdup.16     d15, r5       @ tC[1]
4579+        vabd.s16    q4, q8, \Q11  @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
4580+        vmovn.i32   d0, q0        @ dp3' dp0' dp3 dp0
4581+        vmovn.i32   d1, q1        @ dq3' dq0' dq3 dq0
4582+        vadd.i16    d5, d0, d1    @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
4583+        vabd.s16    q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
4584+        vaba.s16    q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
4585+        vpadd.i16   d2, d5, d5    @ dontcare dontcare d0'+d3' d0+d3
4586+        vshl.s16    q6, q7, #2    @ tC[] * 4
4587+        vrhadd.s16  q6, q7        @ tc25 = (tc[] * 5 + 1) >> 1
4588+        vcgt.s16    d2, d6, d2    @ if (d0 + d3 < beta)
4589+        vmov        r7, s4        @ (d2) r7 = mask of blocks to apply filtering (16b/block)
4590+        vshr.s16    q1, q3, #3    @ beta_3 = beta >> 3
4591+        cmp         r7, #0
4592+        beq         .Lbypasswrite
4593+
4594+        vcgt.s16    q5, q6, q5    @ if < tc25
4595+        vcgt.s16    q4, q1, q4    @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
4596+        vand        q4, q5
4597+        vbic        d8, d4
4598+        vbic        d9, d4
4599+        vshr.s16    q3, #2        @ beta_2 = beta >> 2
4600+        vsra.u64    q4, #16
4601+        vshl.s16    d5, #1        @ d3'<<1 d0'<<1 d3<<1 d0<<1
4602+        vshl.i16    q7, #1        @ tc2 = tC[] << 1
4603+        vcgt.s16    d6, d5        @ if (d3'<<1 < beta_2) etc
4604+        vmovn.i32   d8, q4        @ beta_3 && tc25 tests, prime block in ms half
4605+        vand        d6, d8        @ && beta_2 tests, prime in ms half
4606+        vpadd.i16   d0, d1        @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
4607+        vneg.s16    q6, q7        @ -tc2
4608+        vmovn.i32   d8, q3
4609+        vshrn.i32   d6, q3, #16
4610+        vand        d6, d8
4611+        vmov        r5, r6, d0    @ r5 = dp0'+dp3' dp0+dp3  r6 = dq0'+dq3' dq0+dq3
4612+        vmov        r8, s12       @ (d6) r8 = mask of strong filtering blocks (16b/block)
4613+        vadd.i16    q0, \Q11, q12 @ p0 + q0
4614+        ands        r9, r7, r8
4615+        beq         1f
4616+
4617+        vadd.i16    q2, q0, q10   @ p1 + p0 + q0
4618+        vadd.i16    q3, q0, q13   @ p0 + q0 + q1
4619+        lsr         r3, r9, #16
4620+        vadd.i16    q1, q2, q9    @ p2 + p1 + p0 + q0 (new P1 before clipping)
4621+        vadd.i16    q4, q3, q14   @ p0 + q0 + q1 + q2 (new Q1 before clipping)
4622+        vadd.i16    q0, q8, q9    @ p3 + p2
4623+        vadd.i16    q5, \Q15, q14 @ q2 + q3
4624+        vadd.i16    q2, q1        @ p2 + 2 * p1 + 2 * p0 + 2 * q0
4625+        vadd.i16    q3, q4        @ 2 * p0 + 2 * q0 + 2 * q1 + q2
4626+        vshl.i16    q0, #1        @ 2 * p3 + 2 * p2
4627+        vshl.i16    q5, #1        @ 2 * q2 + 2 * q3
4628+        vadd.i16    q0, q1        @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
4629+        vadd.i16    q5, q4        @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
4630+        vadd.i16    q2, q13       @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
4631+        vadd.i16    q3, q10       @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
4632+        vrshr.s16   q0, #3        @ scale, with rounding
4633+        vrshr.s16   q5, #3
4634+        vrshr.s16   q1, #2
4635+        vrshr.s16   q4, #2
4636+        vrshr.s16   q2, #3
4637+        vrshr.s16   q3, #3
4638+        vsub.i16    q0, q9        @ find difference
4639+        vsub.i16    q5, q14
4640+        vsub.i16    q1, q10
4641+        vsub.i16    q4, q13
4642+        vsub.i16    q2, \Q11
4643+        vsub.i16    q3, q12
4644+        vmax.s16    q0, q6        @ clip difference to -tc2 .. tc2
4645+        vmax.s16    q5, q6
4646+        vmax.s16    q1, q6
4647+        vmax.s16    q4, q6
4648+        vmax.s16    q2, q6
4649+        vmax.s16    q3, q6
4650+        vdup.16     d12, r9       @ expand mask, reuse q6 due to register pressure
4651+        vdup.16     d13, r3
4652+        vmin.s16    q0, q7
4653+        vmin.s16    q5, q7
4654+        vmin.s16    q1, q7
4655+        vmin.s16    q4, q7
4656+        vmin.s16    q2, q7
4657+        vmin.s16    q3, q7
4658+        vadd.i16    q0, q9        @ apply difference
4659+        vadd.i16    q5, q14
4660+        vadd.i16    q1, q10
4661+        vadd.i16    q4, q13
4662+        vadd.i16    q2, \Q11
4663+        vadd.i16    q3, q12
4664+        vbit        q9, q0, q6    @ apply filtered values according to mask
4665+        vbit        q14, q5, q6
4666+        vbit        q10, q1, q6
4667+        vbit        q13, q4, q6
4668+        vbit        \Q11, q2, q6
4669+        vbit        q12, q3, q6
4670+        vneg.s16    q6, q7        @ restore -tc2
4671+
4672+1:
4673+        bics        r9, r7, r8
4674+        beq         2f
4675+
4676+        vsub.i16    q0, q12, \Q11 @ q0 - p0
4677+        vsub.i16    q1, q13, q10  @ q1 - p1
4678+        lsr         r3, r9, #16
4679+        vshl.i16    q2, q0, #3
4680+        lsr         r7, r5, #16
4681+        vadd.i16    q3, q0, q2    @ 9 * (q0 - p0)
4682+        lsr         r8, r6, #16
4683+        vshl.i16    q2, q1, #1
4684+        vadd.i16    q4, q1, q2    @ 3 * (q1 - p1)
4685+        vshr.s16    q6, #1        @ -tc = -tc2 >> 1
4686+        vsub.i16    q5, q3, q4
4687+        vrhadd.s16  q1, q9, \Q11  @ (p2 + p0 + 1) >> 1
4688+        vrhadd.s16  q3, q14, q12  @ (q2 + q0 + 1) >> 1
4689+        vrshr.s16   q5, #4        @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
4690+        vsub.i16    q1, q10       @ ((p2 + p0 + 1) >> 1) - p1
4691+        vsub.i16    q3, q13       @ ((q2 + q0 + 1) >> 1) - q1
4692+        vmax.s16    q6, q5        @
4693+        vshr.s16    q4, q7, #1    @ tc = tc2 >> 1
4694+        vdup.16     q0, r2        @ beta
4695+        vmin.s16    q6, q4        @ delta0 clamped to [-tc, tc]
4696+        vshr.s16    q4, #1        @ tc_2 = tc >> 1
4697+        vhadd.s16   q1, q6        @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
4698+        vhsub.s16   q3, q6        @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
4699+        vshr.s16    q2, q0, #1    @ beta >> 1
4700+        vadd.i16    q2, q0        @ beta + (beta >> 1)
4701+        vneg.s16    q0, q4        @ -tc_2
4702+        vabs.s16    q5, q5        @ abs(original delta0)
4703+        vshr.s16    q2, #3        @ (beta + (beta >> 1)) >> 3
4704+        vmax.s16    q1, q0
4705+        vmax.s16    q3, q0
4706+        vshl.s16    q0, q7, #2    @ 8 * tc
4707+        vadd.i16    q7, q0        @ 10 * tc
4708+        vdup.16     d0, r9
4709+        vdup.16     d1, r3        @ q0 = mask of blocks to apply filtering
4710+        vmin.s16    q1, q4        @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
4711+        vmin.s16    q3, q4        @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
4712+        vdup.16     d8, r5        @ dp0 + dp3
4713+        vdup.16     d9, r7        @ dp0' + dp3'
4714+        vcgt.s16    q7, q5        @ if ((10 * tc) > abs(delta0))
4715+        vdup.16     d10, r6       @ dq0 + dq3
4716+        vdup.16     d11, r8       @ dq0' + dq3'
4717+        vand        q7, q0        @ AND block and line masks
4718+        vcgt.s16    q4, q2, q4    @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
4719+        vadd.i16    q0, q1, q10   @ p1 + deltap1
4720+        vcgt.s16    q5, q2, q5    @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
4721+        vadd.i16    q3, q3, q13   @ q1 + deltaq1
4722+        vadd.i16    q1, \Q11, q6  @ p0 + delta0
4723+        vsub.i16    q2, q12, q6   @ q0 - delta0
4724+        vand        q4, q7        @ AND nd_p test with block/line masks
4725+        vand        q5, q7        @ AND nd_q test with block/line masks
4726+        vbit        q10, q0, q4
4727+        vbit        \Q11, q1, q7
4728+        vbit        q12, q2, q7
4729+        vbit        q13, q3, q5
4730+
4731+2:
4732+.if \bit_depth == 8
4733+        vmovn.i16 d16, q8
4734+        vmovn.i16 d23, \Q15
4735+        neg       r1, r1
4736+        vqmovun.s16 d17, q9
4737+        vqmovun.s16 d18, q10
4738+        vqmovun.s16 d19, \Q11
4739+        lsls      r10, #31
4740+        vqmovun.s16 d20, q12
4741+        vqmovun.s16 d21, q13
4742+        vqmovun.s16 d22, q14
4743+.else
4744+        vmov.i16  q0, #0
4745+        vmov.i16  q1, #(1 << \bit_depth - 1)
4746+        @ q8 & q15 should be unaltered and so don't require clipping
4747+        neg       r1, r1
4748+        vmax.s16  q9,  q0
4749+        vmax.s16  q10, q0
4750+        vmax.s16  q11, q0
4751+        vmax.s16  q12, q0
4752+        vmax.s16  q13, q0
4753+        vmax.s16  q14, q0
4754+        lsls      r10, #31
4755+        vmin.s16  q9,  q1
4756+        vmin.s16  q10, q1
4757+        vmin.s16  q11, q1
4758+        vmin.s16  q12, q1
4759+        vmin.s16  q13, q1
4760+        vmin.s16  q14, q1
4761+.endif
4762+        bx        lr
4763+.endm
4764+
4765+function hevc_loop_filter_luma_body
4766+        m_filter_luma 8, q15, q11
4767+endfunc
4768+
4769+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
4770+@   uint8_t *_pix,      [r0]
4771+@   ptrdiff_t _stride,  [r1]
4772+@   int _beta,          [r2]
4773+@   int *_tc,           [r3]
4774+@   uint8_t *_no_p,     [sp+0]
4775+@   uint8_t *_no_q)     [sp+4]
4776+
4777+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
4778+        hevc_loop_filter_luma_start
4779+
4780+        sub      r4, r0, #4
4781+        b        .Lv_loop_luma_common
4782+endfunc
4783+
4784+@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
4785+@   uint8_t * pix_r,    [r0]
4786+@   ptrdiff_t _stride,  [r1]
4787+@   int _beta,          [r2]
4788+@   int tc2,            [r3]
4789+@   int no_f,           [sp+0]
4790+@   uint8_t * pix_l)    [sp+4]
4791+
4792+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
4793+        cmp      r3, #0
4794+        it       eq
4795+        bxeq     lr
4796+        push     {r4-r10,lr}            @ 32 bytes
4797+        ldr      r4, [sp, #36]
4798+        ldr      r10, [sp, #32]
4799+
4800+.Lv_loop_luma_common:
4801+        vpush    {d8-d15}
4802+
4803+        @ It's slightly faster to do unlaned loads and transpose in the
4804+        @ 8-bit case, even though it needs more instructions, because
4805+        @ VLD4.8 is a really slow way to read from memory.
4806+        vld1.32 {d16[0]}, [r4:32], r1
4807+        vld1.32 {d20[0]}, [r0:32], r1
4808+        vld1.32 {d16[1]}, [r4:32], r1
4809+        vld1.32 {d20[1]}, [r0:32], r1
4810+        vld1.32 {d17[0]}, [r4:32], r1
4811+        vld1.32 {d21[0]}, [r0:32], r1
4812+        vld1.32 {d17[1]}, [r4:32], r1
4813+        vld1.32 {d21[1]}, [r0:32], r1
4814+        vld1.32 {d18[0]}, [r4:32], r1
4815+        vld1.32 {d22[0]}, [r0:32], r1
4816+        vld1.32 {d18[1]}, [r4:32], r1
4817+        vld1.32 {d22[1]}, [r0:32], r1
4818+        vld1.32 {d19[0]}, [r4:32], r1
4819+        vld1.32 {d23[0]}, [r0:32], r1
4820+        vld1.32 {d19[1]}, [r4:32]
4821+        vld1.32 {d23[1]}, [r0:32]
4822+        vuzp.16 q8, q9
4823+        vuzp.16 q10, q11
4824+        vuzp.8  q8, q9
4825+        vuzp.8  q10, q11
4826+        vswp    d17, d18
4827+        vswp    d21, d22
4828+
4829+        bl hevc_loop_filter_luma_body
4830+
4831+        add     r6, r4, r1
4832+        add     r2, r0, r1
4833+        lsl     r1, #1
4834+
4835+        vpop     {d8-d15}
4836+
4837+        @ no_p[1]
4838+        bmi     1f
4839+        vst4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
4840+        vst4.8  {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
4841+        vst4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
4842+        vst4.8  {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
4843+
4844+        vst4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
4845+        vst4.8  {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
4846+        vst4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
4847+        vst4.8  {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
4848+1:
4849+        @ no_q[1]
4850+        bcs     1f
4851+        vst4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
4852+        vst4.8  {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
4853+        vst4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
4854+        vst4.8  {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
4855+
4856+        vst4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
4857+        vst4.8  {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
4858+        vst4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
4859+        vst4.8  {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
4860+1:
4861+        pop      {r4-r10,pc}
4862+
4863+.Lbypasswrite:
4864+        vpop     {d8-d15}
4865+        pop      {r4-r10,pc}
4866+endfunc
4867+
4868+.macro m_filter_v_luma_16 bit_depth
4869+        vpush    {d8-d15}
4870+
4871+        @ Uses slightly fewer instructions to do laned loads than unlaned
4872+        @ and transpose.  This also means that we can use the same code for
4873+        @ both split & unsplit deblock
4874+        vld4.16  {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
4875+        vld4.16  {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
4876+
4877+        vld4.16  {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
4878+        vld4.16  {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
4879+
4880+        vld4.16  {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
4881+        vld4.16  {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
4882+
4883+        vld4.16  {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
4884+        vld4.16  {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
4885+
4886+        vld4.16  {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
4887+        vld4.16  {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
4888+
4889+        vld4.16  {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
4890+        vld4.16  {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
4891+
4892+        vld4.16  {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
4893+        vld4.16  {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
4894+
4895+        vld4.16  {d17[3], d19[3], d21[3], d23[3]}, [r4]
4896+        vld4.16  {d25[3], d27[3], d29[3], d31[3]}, [r0]
4897+
4898+        bl hevc_loop_filter_luma_body_\bit_depth
4899+
4900+        add      r6, r4, r1
4901+        add      r2, r0, r1
4902+        lsl      r1, #1
4903+
4904+        vpop     {d8-d15}
4905+
4906+        @ p[1]
4907+        bmi      1f
4908+        vst4.16  {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
4909+        vst4.16  {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
4910+        vst4.16  {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
4911+        vst4.16  {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
4912+        vst4.16  {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
4913+        vst4.16  {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
4914+        vst4.16  {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
4915+        vst4.16  {d16[0], d18[0], d20[0], d22[0]}, [r6]
4916+1:
4917+        @ q[1]
4918+        bcs      1f
4919+        vst4.16  {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
4920+        vst4.16  {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
4921+        vst4.16  {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
4922+        vst4.16  {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
4923+        vst4.16  {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
4924+        vst4.16  {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
4925+        vst4.16  {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
4926+        vst4.16  {d24[0], d26[0], d28[0], d30[0]}, [r2]
4927+1:
4928+        pop      {r4-r10,pc}
4929+.endm
4930+
4931+
4932+
4933+
4934+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,     [r0]
4935+@                                 ptrdiff_t stride, [r1]
4936+@                                 int beta,         [r2]
4937+@                                 int32_t *tc,      [r3]
4938+@                                 uint8_t *no_p,    sp[0]
4939+@                                 uint8_t *no_q);   sp[4]
4940+@
4941+@ Src should always be on 8 byte boundry & all in the same slice
4942+
4943+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
4944+        hevc_loop_filter_luma_start
4945+        b        .Lh_loop_filter_luma_common_8
4946+endfunc
4947+
4948+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
4949+        cmp      r3, #0
4950+        it       eq
4951+        bxeq     lr
4952+        push     {r4-r10,lr}            @ 32 bytes
4953+        ldr      r10, [sp, #32]
4954+
4955+.Lh_loop_filter_luma_common_8:
4956+        sub      r4, r0, r1, lsl #2
4957+        add      r0, r4, r1
4958+        lsl      r1, #1
4959+        vpush    {d8-d15}
4960+
4961+        vld1.8  {d16}, [r4], r1
4962+        vld1.8  {d17}, [r0], r1
4963+        vld1.8  {d18}, [r4], r1
4964+        vld1.8  {d19}, [r0], r1
4965+        vld1.8  {d20}, [r4], r1
4966+        vld1.8  {d21}, [r0], r1
4967+        vld1.8  {d22}, [r4]
4968+        vld1.8  {d23}, [r0]
4969+
4970+        bl hevc_loop_filter_luma_body
4971+
4972+        add      r0, r0, r1, lsl #1
4973+        add      r2, r4, r1, lsl #1
4974+        add      r6, r4, r1, asr #1
4975+        vpop     {d8-d15}
4976+
4977+        @ P2-P0
4978+        bcs      1f
4979+        vst1.8   {d22}, [r4], r1
4980+        vst1.8   {d21}, [r6]
4981+        vst1.8   {d20}, [r4]
4982+1:
4983+        @ Q0-Q2
4984+        bmi      1f
4985+        vst1.8   {d19}, [r0], r1
4986+        vst1.8   {d18}, [r2]
4987+        vst1.8   {d17}, [r0]
4988+1:
4989+        pop      {r4-r10,pc}
4990+endfunc
4991+
4992+
4993+.macro m_filter_h_luma_16 bit_depth
4994+        sub      r4, r0, r1, lsl #2
4995+        add      r0, r4, r1
4996+        lsl      r1, #1
4997+        vpush    {d8-d15}
4998+
4999+        vld1.16 { q8}, [r4], r1
5000+        vld1.16 { q9}, [r0], r1
5001+        vld1.16 {q10}, [r4], r1
5002+        vld1.16 {q11}, [r0], r1
5003+        vld1.16 {q12}, [r4], r1
5004+        vld1.16 {q13}, [r0], r1
5005+        vld1.16 {q14}, [r4]
5006+        vld1.16 {q15}, [r0]
5007+
5008+        bl hevc_loop_filter_luma_body_\bit_depth
5009+
5010+        add      r0, r0, r1, lsl #1
5011+        add      r2, r4, r1, lsl #1
5012+        add      r6, r4, r1, asr #1
5013+        vpop     {d8-d15}
5014+
5015+        @ P2-P0
5016+        bcs      1f
5017+        vst1.16  {q14}, [r4], r1
5018+        vst1.16  {q13}, [r6]
5019+        vst1.16  {q12}, [r4]
5020+1:
5021+        bmi      1f
5022+        vst1.16  {q11}, [r0], r1
5023+        vst1.16  {q10}, [r2]
5024+        vst1.16  { q9}, [r0]
5025+1:
5026+        pop      {r4-r10,pc}
5027+.endm
5028+
5029+
5030+@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r,        // r0
5031+@                                     unsigned int stride,   // r1
5032+@                                     uint32_t tc4,          // r2
5033+@                                     unsigned int no_f);    // r3
5034+@
5035+@ no_f
5036+@ 0  tl P0
5037+@ 1  tr P1
5038+@ 2  bl Q0
5039+@ 3  br Q1
5040+@
5041+@ Probably not worth having the P/Qa only special case in this direction
5042+@ Given layout we won't save any memory reads or avoid any cache dirtying
5043+@ We would save a bit of computation but I expect the partials to be less
5044+@ common in the H direction than V due to how we arrange deblock.
5045+
5046+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
5047+        sub      r12, r0, r1
5048+        cmp      r2, #0
5049+        it eq
5050+        bxeq     lr
5051+        vld1.8   {d26,d27}, [r0]
5052+        lsl      r1, #1
5053+        sub      r0, r1
5054+        vld1.8   {d18,d19}, [r12], r1
5055+        vld1.8   {d16,d17}, [r0], r1
5056+        vld1.8   {d28,d29}, [r12]
5057+
5058+        hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
5059+        "sub      r12, r0, r1, asr #1"
5060+
5061+        lsls     r3, #29                @ b2 -> N, b3 -> C
5062+        it pl
5063+        vstrpl   d26, [r0, #0]
5064+        it cc
5065+        vstrcc   d27, [r0, #8]
5066+        lsls     r3, #2                 @ b0 -> N, b1 -> C
5067+        it pl
5068+        vstrpl   d18, [r12, #0]
5069+        it cc
5070+        vstrcc   d19, [r12, #8]
5071+        bx       lr
5072+
5073+endfunc
5074+
5075+
5076+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r,     // r0
5077+@                                     unsigned int stride,   // r1
5078+@                                     uint32_t tc4,          // r2
5079+@                                     unsigned int no_f);    // r3
5080+@
5081+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
5082+@
5083+@ Macro here actual function near bottom
5084+
5085+.macro m_filter_h_uv_16 bit_depth
5086+        sub      r12, r0, r1
5087+        cmp      r2, #0
5088+        it eq
5089+        bxeq     lr
5090+        vld1.16  {q12, q13}, [r0]
5091+        lsl      r1, #1
5092+        sub      r0, r1
5093+        vld1.16  {q10, q11}, [r12], r1
5094+        vld1.16  {q8,  q9 }, [r0], r1
5095+        vld1.16  {q14, q15}, [r12]
5096+
5097+        hevc_loop_filter_uv_body2_16  q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
5098+        "sub      r12, r0, r1, asr #1", \
5099+        "cmp      r3, #0"
5100+
5101+        bne      1f
5102+        vst1.16  {q10, q11}, [r12]
5103+        vst1.16  {q12, q13}, [r0]
5104+        bx       lr
5105+
5106+        @ At least one no_f bit is set
5107+        @ Which means we need to break this apart in an ugly fashion
5108+1:
5109+        lsls     r3, #29                @ b2 -> N, b3 -> C
5110+        itt pl
5111+        vstrpl   d24, [r0, #0]
5112+        vstrpl   d25, [r0, #8]
5113+        itt cc
5114+        vstrcc   d26, [r0, #16]
5115+        vstrcc   d27, [r0, #24]
5116+        lsls     r3, #2                 @ b0 -> N, b1 -> C
5117+        itt pl
5118+        vstrpl   d20, [r12, #0]
5119+        vstrpl   d21, [r12, #8]
5120+        itt cc
5121+        vstrcc   d22, [r12, #16]
5122+        vstrcc   d23, [r12, #24]
5123+        bx       lr
5124+.endm
5125+
5126+
5127+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r,       // r0
5128+@                                     unsigned int stride,   // r1
5129+@                                     uint32_t tc4,          // r2
5130+@                                     uint8_t * src_l,       // r3
5131+@                                     unsigned int no_f);   // sp[0]
5132+@
5133+@ no_f:
5134+@ 0  tl P0
5135+@ 1  tr Q0
5136+@ 2  bl P1
5137+@ 3  br Q1
5138+
5139+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
5140+        cmp      r2, #0
5141+        it eq
5142+        bxeq     lr
5143+        push     {lr}
5144+        vld2.16  {d16[0], d18[0]}, [r3], r1
5145+        vld2.16  {d20[0], d22[0]}, [r0], r1
5146+
5147+        cmp      r2, #0x10000
5148+        vld2.16  {d16[1], d18[1]}, [r3], r1
5149+        vld2.16  {d20[1], d22[1]}, [r0], r1
5150+
5151+        vld2.16  {d16[2], d18[2]}, [r3], r1
5152+        vld2.16  {d20[2], d22[2]}, [r0], r1
5153+
5154+        vld2.16  {d16[3], d18[3]}, [r3], r1
5155+        vld2.16  {d20[3], d22[3]}, [r0], r1
5156+        blo      10f
5157+
5158+        vld2.16  {d17[0], d19[0]}, [r3], r1
5159+        vld2.16  {d21[0], d23[0]}, [r0], r1
5160+
5161+        sub      ip, r0, r3
5162+        vld2.16  {d17[1], d19[1]}, [r3], r1
5163+        vld2.16  {d21[1], d23[1]}, [r0], r1
5164+
5165+        cmp      ip, #4
5166+        vld2.16  {d17[2], d19[2]}, [r3], r1
5167+        vld2.16  {d21[2], d23[2]}, [r0], r1
5168+
5169+        vld2.16  {d17[3], d19[3]}, [r3]
5170+        vld2.16  {d21[3], d23[3]}, [r0]
5171+
5172+        hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
5173+        "ldr      lr, [sp, #4]", \
5174+        "neg      r1, r1",       \
5175+        "it eq; cmpeq lr, #0",   \
5176+        "add      r3, #2",       \
5177+        "add      ip, r3, r1",   \
5178+        "add      r2, r0, r1",   \
5179+        "lsl      r1, #1"
5180+
5181+        bne      1f
5182+
5183+@ Much/most of the time r0 == r3 + 4 and no_f == 0
5184+@ so it is worth having this special case
5185+        vst2.16   {d19[3], d21[3]}, [r3], r1    @ P0b, Q0b
5186+        vst2.16   {d19[2], d21[2]}, [ip], r1
5187+        vst2.16   {d19[1], d21[1]}, [r3], r1
5188+        vst2.16   {d19[0], d21[0]}, [ip], r1
5189+        vst2.16   {d18[3], d20[3]}, [r3], r1    @ P0a, Q0a
5190+        vst2.16   {d18[2], d20[2]}, [ip], r1
5191+        vst2.16   {d18[1], d20[1]}, [r3]
5192+        vst2.16   {d18[0], d20[0]}, [ip]
5193+        pop       {pc}
5194+
5195+@ Either split or partial
5196+1:
5197+        lsls     lr, #29               @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
5198+        ittt cs
5199+        addcs    r0, r0, r1, lsl #1
5200+        addcs    r2, r2, r1, lsl #1
5201+        bcs      1f
5202+        @ Q0b
5203+        vst1.16  {d21[3]}, [r0], r1
5204+        vst1.16  {d21[2]}, [r2], r1
5205+        vst1.16  {d21[1]}, [r0], r1
5206+        vst1.16  {d21[0]}, [r2], r1
5207+1:
5208+        ittt mi
5209+        addmi    r3, r3, r1, lsl #1
5210+        addmi    ip, ip, r1, lsl #1
5211+        bmi      1f
5212+        @ P0b
5213+        vst1.16  {d19[3]}, [r3], r1
5214+        vst1.16  {d19[2]}, [ip], r1
5215+        vst1.16  {d19[1]}, [r3], r1
5216+        vst1.16  {d19[0]}, [ip], r1
5217+1:
5218+        lsls     lr, #2                @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
5219+        bcs      1f
5220+        @ Q0a
5221+        vst1.16  {d20[3]}, [r0], r1
5222+        vst1.16  {d20[2]}, [r2], r1
5223+        vst1.16  {d20[1]}, [r0]
5224+        vst1.16  {d20[0]}, [r2]
5225+1:
5226+        it       mi
5227+        popmi    {pc}
5228+        @ P0a
5229+        vst1.16  {d18[3]}, [r3], r1
5230+        vst1.16  {d18[2]}, [ip], r1
5231+        vst1.16  {d18[1]}, [r3]
5232+        vst1.16  {d18[0]}, [ip]
5233+        pop      {pc}
5234+
5235+@ Single lump (rather than double)
5236+10:
5237+        @ As we have post inced r0/r3 in the load the easiest thing to do is
5238+        @ to subtract and write forwards, rather than backwards (as above)
5239+        @ b0 (P0a) -> N, b1 (Q0a) -> C
5240+
5241+        hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
5242+        "ldr      lr, [sp, #4]",       \
5243+        "add      r3, #2",             \
5244+        "sub      r0, r0, r1, lsl #2", \
5245+        "sub      r3, r3, r1, lsl #2", \
5246+        "lsls     lr, #31",            \
5247+        "add      r2, r0, r1",         \
5248+        "add      ip, r3, r1",         \
5249+        "lsl      r1, #1"
5250+
5251+        bcs      3f
5252+        @ Q0a
5253+        vst1.16  {d20[0]}, [r0], r1
5254+        vst1.16  {d20[1]}, [r2], r1
5255+        vst1.16  {d20[2]}, [r0]
5256+        vst1.16  {d20[3]}, [r2]
5257+3:
5258+        it       mi
5259+        popmi    {pc}
5260+        @ P0a
5261+        vst1.16  {d18[0]}, [r3], r1
5262+        vst1.16  {d18[1]}, [ip], r1
5263+        vst1.16  {d18[2]}, [r3]
5264+        vst1.16  {d18[3]}, [ip]
5265+        pop      {pc}
5266+
5267+endfunc
5268+
5269+
5270+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r,       // r0
5271+@                                     unsigned int stride,   // r1
5272+@                                     uint32_t tc4,          // r2
5273+@                                     uint8_t * src_l,       // r3
5274+@                                     unsigned int no_f);   // sp[0]
5275+@
5276+
5277+@ no_f
5278+@ 0  tl P0a
5279+@ 1  tr Q0a
5280+@ 2  bl P0b
5281+@ 3  br Q0b
5282+
5283+@ P1: q8,  q12
5284+@ P0: q9,  q13
5285+@ Q0: q10, q14
5286+@ Q1: q11, q15
5287+
5288+.macro m_filter_v_uv2_16 bit_depth
5289+        cmp      r2, #0
5290+        it eq
5291+        bxeq     lr
5292+        push     {lr}
5293+        vld2.32  {d16[0], d18[0]}, [r3], r1
5294+        vld2.32  {d20[0], d22[0]}, [r0], r1
5295+
5296+        cmp      r2, #0x10000
5297+        vld2.32  {d16[1], d18[1]}, [r3], r1
5298+        vld2.32  {d20[1], d22[1]}, [r0], r1
5299+
5300+        vld2.32  {d17[0], d19[0]}, [r3], r1
5301+        vld2.32  {d21[0], d23[0]}, [r0], r1
5302+
5303+        vld2.32  {d17[1], d19[1]}, [r3], r1
5304+        vld2.32  {d21[1], d23[1]}, [r0], r1
5305+        blo      10f
5306+
5307+        vld2.32  {d24[0], d26[0]}, [r3], r1
5308+        vld2.32  {d28[0], d30[0]}, [r0], r1
5309+
5310+        sub      ip, r0, r3
5311+        vld2.32  {d24[1], d26[1]}, [r3], r1
5312+        vld2.32  {d28[1], d30[1]}, [r0], r1
5313+
5314+        cmp      ip, #8
5315+        vld2.32  {d25[0], d27[0]}, [r3], r1
5316+        vld2.32  {d29[0], d31[0]}, [r0], r1
5317+
5318+        vld2.32  {d25[1], d27[1]}, [r3]
5319+        vld2.32  {d29[1], d31[1]}, [r0]
5320+
5321+        hevc_loop_filter_uv_body2_16  q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
5322+        "ldr      lr, [sp, #4]", \
5323+        "neg      r1, r1",       \
5324+        "it eq; cmpeq lr, #0",   \
5325+        "add      r3, #4",       \
5326+        "add      ip, r3, r1",   \
5327+        "add      r2, r0, r1",   \
5328+        "lsl      r1, #1"
5329+
5330+        bne      1f
5331+
5332+@ Much/most of the time r0 == r3 + 8 and no_f == 0
5333+@ so it is worth having this special case
5334+        vst2.32   {d27[1], d29[1]}, [r3], r1    @ P0b, Q0b
5335+        vst2.32   {d27[0], d29[0]}, [ip], r1
5336+        vst2.32   {d26[1], d28[1]}, [r3], r1
5337+        vst2.32   {d26[0], d28[0]}, [ip], r1
5338+        vst2.32   {d19[1], d21[1]}, [r3], r1    @ P0a, Q0a
5339+        vst2.32   {d19[0], d21[0]}, [ip], r1
5340+        vst2.32   {d18[1], d20[1]}, [r3]
5341+        vst2.32   {d18[0], d20[0]}, [ip]
5342+        pop       {pc}
5343+
5344+@ Either split or partial
5345+1:
5346+        lsls     lr, #29               @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
5347+        ittt cs
5348+        addcs    r0, r0, r1, lsl #1
5349+        addcs    r2, r2, r1, lsl #1
5350+        bcs      1f
5351+        @ Q0b
5352+        vst1.32  {d29[1]}, [r0], r1
5353+        vst1.32  {d29[0]}, [r2], r1
5354+        vst1.32  {d28[1]}, [r0], r1
5355+        vst1.32  {d28[0]}, [r2], r1
5356+1:
5357+        ittt mi
5358+        addmi    r3, r3, r1, lsl #1
5359+        addmi    ip, ip, r1, lsl #1
5360+        bmi      1f
5361+        @ P0b
5362+        vst1.32  {d27[1]}, [r3], r1
5363+        vst1.32  {d27[0]}, [ip], r1
5364+        vst1.32  {d26[1]}, [r3], r1
5365+        vst1.32  {d26[0]}, [ip], r1
5366+1:
5367+        lsls     lr, #2                @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
5368+        bcs      1f
5369+        @ Q0a
5370+        vst1.32  {d21[1]}, [r0], r1
5371+        vst1.32  {d21[0]}, [r2], r1
5372+        vst1.32  {d20[1]}, [r0]
5373+        vst1.32  {d20[0]}, [r2]
5374+1:
5375+        it       mi
5376+        popmi    {pc}
5377+        @ P0a
5378+        vst1.32  {d19[1]}, [r3], r1
5379+        vst1.32  {d19[0]}, [ip], r1
5380+        vst1.32  {d18[1]}, [r3]
5381+        vst1.32  {d18[0]}, [ip]
5382+        pop      {pc}
5383+
5384+@ Single lump (rather than double)
5385+10:
5386+        @ As we have post inced r0/r3 in the load the easiest thing to do is
5387+        @ to subtract and write forwards, rather than backwards (as above)
5388+        @ b0 (P0a) -> N, b1 (Q0a) -> C
5389+
5390+        hevc_loop_filter_uv_body1_16  q8, q9, q10, q11, \bit_depth, \
5391+        "ldr      lr, [sp, #4]",       \
5392+        "add      r3, #4",             \
5393+        "sub      r0, r0, r1, lsl #2", \
5394+        "sub      r3, r3, r1, lsl #2", \
5395+        "lsls     lr, #31",            \
5396+        "add      r2, r0, r1",         \
5397+        "add      ip, r3, r1",         \
5398+        "lsl      r1, #1"
5399+
5400+        bcs      3f
5401+        @ Q0a
5402+        vst1.32  {d20[0]}, [r0], r1
5403+        vst1.32  {d20[1]}, [r2], r1
5404+        vst1.32  {d21[0]}, [r0]
5405+        vst1.32  {d21[1]}, [r2]
5406+3:
5407+        it       mi
5408+        popmi    {pc}
5409+        @ P0a
5410+        vst1.32  {d18[0]}, [r3], r1
5411+        vst1.32  {d18[1]}, [ip], r1
5412+        vst1.32  {d19[0]}, [r3]
5413+        vst1.32  {d19[1]}, [ip]
5414+        pop      {pc}
5415+.endm
5416+
5417+
5418+@ The NEON version is faster under ideal circumstances (i.e. everything in L1)
5419+@ But in real world testing it is ~20% slower, presumably due to code size
5420+
5421+#if 0 // NEON version
5422+
5423+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
5424+ *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
5425+ *                                            int in_inc0, int in_inc1)
5426+ */
5427+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
5428+        mov         ip, sp
5429+        push        {a1-a3,v1-v8,lr}
5430+        ldm         ip, {v1-v6}
5431+        cmp         a1, #2
5432+        bls         2f
5433+        vpush       {d8-d13}
5434+        sub         v5, v5, #10
5435+        sub         v6, v6, #10
5436+1:
5437+        vld2.32     {d0[0], d2[0]}, [a3]!
5438+        vld2.32     {d4[0], d6[0]}, [a4]!
5439+          vmov.u8     q12, #0
5440+        ldrb        a2, [a3], #1
5441+        ldrb        ip, [a4], #1
5442+        ldrb        v8, [a3], #1
5443+        ldrb        lr, [a4], #1
5444+        add         a2, v1, a2, lsl #2
5445+        vld1.8      {d24[0]}, [a3], v5
5446+        add         ip, v3, ip, lsl #2
5447+        vld1.8      {d25[0]}, [a4], v6
5448+        add         v8, v2, v8, lsl #2
5449+        vld1.32     {d16[0]}, [a2]
5450+        add         lr, v4, lr, lsl #2
5451+        vld1.32     {d20[0]}, [ip]
5452+        vld1.32     {d18[0]}, [v8]
5453+        vld1.32     {d22[0]}, [lr]
5454+
5455+        vld2.32     {d0[1], d2[1]}, [a3]!
5456+        vld2.32     {d4[1], d6[1]}, [a4]!
5457+        ldrb        a2, [a3], #1
5458+          vmov.u16    d12, #1
5459+        ldrb        ip, [a4], #1
5460+          vmov.u16    d13, #2
5461+        ldrb        v8, [a3], #1
5462+          vmov.u16    d27, #4
5463+        ldrb        lr, [a4], #1
5464+        add         a2, v1, a2, lsl #2
5465+        vld1.8      {d24[2]}, [a3], v5
5466+        add         ip, v3, ip, lsl #2
5467+        vld1.8      {d25[2]}, [a4], v6
5468+        add         v8, v2, v8, lsl #2
5469+        vld1.32     {d16[1]}, [a2]
5470+        add         lr, v4, lr, lsl #2
5471+        vld1.32     {d20[1]}, [ip]
5472+        vld1.32     {d18[1]}, [v8]
5473+        vld1.32     {d22[1]}, [lr]
5474+
5475+        vld2.32     {d1[0], d3[0]}, [a3]!
5476+        vld2.32     {d5[0], d7[0]}, [a4]!
5477+        ldrb        a2, [a3], #1
5478+        ldrb        ip, [a4], #1
5479+        ldrb        lr, [a4], #1
5480+        ldrb        v8, [a3], #1
5481+        add         a2, v1, a2, lsl #2
5482+        vld1.8      {d24[4]}, [a3], v5
5483+        add         ip, v3, ip, lsl #2
5484+        vld1.8      {d25[4]}, [a4], v6
5485+        add         v8, v2, v8, lsl #2
5486+        vld1.32     {d17[0]}, [a2]
5487+        add         lr, v4, lr, lsl #2
5488+        vld1.32     {d21[0]}, [ip]
5489+        vld1.32     {d19[0]}, [v8]
5490+        vld1.32     {d23[0]}, [lr]
5491+
5492+        vld2.32     {d1[1], d3[1]}, [a3]!
5493+        vld2.32     {d5[1], d7[1]}, [a4]!
5494+        ldrb        a2, [a3], #1
5495+        ldrb        ip, [a4], #1
5496+        ldrb        v8, [a3], #1
5497+        ldrb        lr, [a4], #1
5498+        add         a2, v1, a2, lsl #2
5499+        vld1.8      {d24[6]}, [a3], v5
5500+        add         ip, v3, ip, lsl #2
5501+        vld1.8      {d25[6]}, [a4], v6
5502+        add         v8, v2, v8, lsl #2
5503+        vld1.32     {d17[1]}, [a2]
5504+        add         lr, v4, lr, lsl #2
5505+        vld1.32     {d21[1]}, [ip]
5506+        vld1.32     {d19[1]}, [v8]
5507+        vld1.32     {d23[1]}, [lr]
5508+
5509+        @ So now we have:
5510+        @ q0.32[i]  = curr[i].mv[0]
5511+        @ q1.32[i]  = curr[i].mv[1]
5512+        @ q2.32[i]  = neigh[i].mv[0]
5513+        @ q3.32[i]  = neigh[i].mv[1]
5514+        @ q8.32[i]  = curr_rpl0[curr[i].ref_idx[0]]
5515+        @ q9.32[i]  = curr_rpl1[curr[i].ref_idx[1]]
5516+        @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
5517+        @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
5518+        @ d24.16[i] = curr[i].pred_flag
5519+        @ d25.16[i] = neigh[i].pred_flag
5520+
5521+        vtst.16     d28, d24, d12
5522+        vtst.16     d29, d24, d13
5523+        vadd.i16    d8, d24, d12
5524+        vadd.i16    d9, d25, d12
5525+        vtst.16     d30, d25, d12
5526+        vtst.16     d31, d25, d13
5527+        veor        d26, d8, d9
5528+          ldr         lr, [sp, 6*8 + 1*4]
5529+        vmovl.s16   q4, d28
5530+        vmovl.s16   q5, d29
5531+          teq         lr, #1
5532+        vmovl.s16   q14, d30
5533+          it ne
5534+          lslne       v1, lr, #1
5535+        vmovl.s16   q15, d31
5536+          it ne
5537+          rsbne       v2, v1, #32
5538+        vbif        q0, q1, q4
5539+        vbif        q2, q3, q14
5540+        vbif        q1, q0, q5
5541+        vbif        q3, q2, q15
5542+        vabd.s16    q12, q0, q2
5543+        vabd.s16    q2, q1
5544+        vabd.s16    q0, q3
5545+        vabd.s16    q1, q3
5546+        vbif        q8, q9, q4
5547+        vbif        q10, q11, q14
5548+        vbif        q9, q8, q5
5549+        vbif        q11, q10, q15
5550+        vclt.u16    d6, d24, d27
5551+        vclt.u16    d8, d2, d27
5552+        vclt.u16    d7, d25, d27
5553+        vclt.u16    d9, d3, d27
5554+        vclt.u16    d2, d0, d27
5555+        vclt.u16    d0, d4, d27
5556+        vclt.u16    d3, d1, d27
5557+        vclt.u16    d1, d5, d27
5558+        vceq.i32    q12, q10, q8
5559+        vceq.i32    q10, q9
5560+        vceq.i32    q8, q11
5561+        vceq.i32    q9, q11
5562+        vshrn.i32   d6, q3, #8
5563+        vshrn.i32   d7, q4, #8
5564+        vshrn.i32   d8, q1, #8
5565+        vshrn.i32   d9, q0, #8
5566+        vmovn.i32   d4, q12
5567+        vmovn.i32   d2, q10
5568+        vmovn.i32   d3, q8
5569+        vmovn.i32   d5, q9
5570+        vand        q2, q3
5571+        vrev16.8    q3, q3
5572+        vand        q2, q3
5573+        vand        q1, q4
5574+        vrev16.8    q4, q4
5575+        vand        q1, q4
5576+        vand        d4, d5
5577+        vand        d2, d3
5578+        vbic        d0, d12, d4
5579+        vshr.u16    d26, #2
5580+        vbic        d0, d2
5581+        vmov.i16    d1, #0x5555
5582+        vorr        d0, d26
5583+          bne         10f
5584+
5585+        @ Merge results into result word, no duplicates
5586+        vmov        a2, s0
5587+        vmov        v8, s1
5588+        vmov.u16    ip, d0[1]
5589+        vmov.u16    lr, d0[3]
5590+        lsl         a2, #30
5591+        lsl         v8, #30
5592+        lsl         ip, #30
5593+        lsl         lr, #30
5594+        orr         a2, ip, a2, lsr #2
5595+        orr         v8, lr, v8, lsr #2
5596+        orr         a2, v8, a2, lsr #4
5597+        subs        a1, #4
5598+        orr         v7, a2, v7, lsr #8
5599+        bhi         1b
5600+
5601+        mov         a1, #32
5602+        ldr         a3, [sp, #6*8]
5603+        vpop        {d8-d13}
5604+        sub         a1, a1, a3, lsl #1
5605+        mov         a1, v7, lsr a1
5606+        pop         {a2-a4,v1-v8,pc}
5607+10:
5608+        @ Merge results into result word, with duplicates
5609+        vmul.i16    d0, d1
5610+        vmov        a2, s0
5611+        vmov        v8, s1
5612+        vmov.u16    ip, d0[1]
5613+        vmov.u16    lr, d0[3]
5614+        lsl         a2, v2
5615+        subs        a1, #4
5616+        lsl         v8, v2
5617+        lsl         ip, v2
5618+        lsl         lr, v2
5619+        ldr         v2, [sp, #6*8 + 12*4 + 1*4]
5620+T       lsr         a2, v1
5621+T       orr         a2, ip, a2
5622+A       orr         a2, ip, a2, lsr v1
5623+        lsl         ip, v1, #1
5624+T       lsr         v8, v1
5625+T       orr         v8, lr, v8
5626+A       orr         v8, lr, v8, lsr v1
5627+        lsl         lr, v1, #2
5628+T       lsr         a2, ip
5629+T       orr         a2, v8, a2
5630+A       orr         a2, v8, a2, lsr ip
5631+        ldr         v1, [sp, #6*8 + 12*4]
5632+T       lsr         v7, lr
5633+T       orr         v7, a2, v7
5634+A       orr         v7, a2, v7, lsr lr
5635+        bhi         1b
5636+
5637+        mov         a1, #32
5638+        ldrd        a3, a4, [sp, #6*8]
5639+        vpop        {d8-d13}
5640+        mls         a1, a3, a4, a1
5641+        mls         a1, a3, a4, a1
5642+        mov         a1, v7, lsr a1
5643+        pop         {a2-a4,v1-v8,pc}
5644+
5645+
5646+2:
5647+        sub         v5, v5, #10
5648+        sub         v6, v6, #10
5649+        vmov.u8     d16, #0
5650+        blo         3f
5651+        vld2.32     {d0[0], d1[0]}, [a3]!
5652+        vld2.32     {d2[0], d3[0]}, [a4]!
5653+        ldrb        a2, [a3], #1
5654+        ldrb        ip, [a4], #1
5655+        ldrb        lr, [a4], #1
5656+        ldrb        v8, [a3], #1
5657+        add         a2, v1, a2, lsl #2
5658+        vld1.8      {d16[0]}, [a3], v5
5659+        add         ip, v3, ip, lsl #2
5660+        vld1.8      {d16[4]}, [a4], v6
5661+        add         v8, v2, v8, lsl #2
5662+        vld1.32     {d4[0]}, [a2]
5663+        add         lr, v4, lr, lsl #2
5664+        vld1.32     {d5[0]}, [ip]
5665+        vld1.32     {d6[0]}, [v8]
5666+        vld1.32     {d7[0]}, [lr]
5667+
5668+3:
5669+        vld2.32     {d0[1], d1[1]}, [a3]!
5670+        vld2.32     {d2[1], d3[1]}, [a4]!
5671+        ldrb        a2, [a3], #1
5672+          vmov.u16    d17, #1
5673+        ldrb        ip, [a4], #1
5674+          vmov.u16    d18, #2
5675+        ldrb        v8, [a3], #1
5676+          vmov.u16    d19, #4
5677+        ldrb        lr, [a4], #1
5678+        add         a2, v1, a2, lsl #2
5679+        vld1.8      {d16[2]}, [a3], v5
5680+        add         ip, v3, ip, lsl #2
5681+        vld1.8      {d16[6]}, [a4], v6
5682+        add         v8, v2, v8, lsl #2
5683+        vld1.32     {d4[1]}, [a2]
5684+        add         lr, v4, lr, lsl #2
5685+        vld1.32     {d5[1]}, [ip]
5686+        vld1.32     {d6[1]}, [v8]
5687+        vld1.32     {d7[1]}, [lr]
5688+
5689+        @ So now we have:
5690+        @ d0.32[i]  = curr[i].mv[0]
5691+        @ d1.32[i]  = curr[i].mv[1]
5692+        @ d2.32[i]  = neigh[i].mv[0]
5693+        @ d3.32[i]  = neigh[i].mv[1]
5694+        @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
5695+        @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
5696+        @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
5697+        @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
5698+        @ d16.16[i] = curr[i].pred_flag
5699+        @ d16.16[2+i] = neigh[i].pred_flag
5700+
5701+        vtst.16     d20, d16, d17
5702+        vtst.16     d22, d16, d18
5703+        vadd.i16    d30, d16, d17
5704+        vswp        d2, d3
5705+        ldr         lr, [sp, #1*4]
5706+        vmovl.s16   q10, d20
5707+          teq         lr, #1
5708+        vmovl.s16   q11, d22
5709+          it ne
5710+          lslne       v1, lr, #1
5711+        vbif        d0, d1, d20
5712+        vbif        d4, d6, d20
5713+        vbif        d3, d2, d21
5714+        vbif        d5, d7, d21
5715+        vbif        d1, d0, d22
5716+        vbif        d6, d4, d22
5717+        vbif        d2, d3, d23
5718+        vbif        d7, d5, d23
5719+        vshr.u16    d30, #2
5720+        vabd.s16    d24, d0, d3
5721+        vabd.s16    d25, d1, d2
5722+        vabd.s16    q0, q0, q1
5723+        vceq.i32    d2, d4, d5
5724+        vceq.i32    d20, d5, d6
5725+        vceq.i32    d21, d4, d7
5726+        vceq.i32    d3, d6, d7
5727+        vclt.u16    d6, d24, d19
5728+        vclt.u16    d7, d25, d19
5729+        vclt.u16    d22, d1, d19
5730+        vclt.u16    d23, d0, d19
5731+        vshrn.i32   d6, q3, #8
5732+        vmovn.i32   d2, q1
5733+        vshrn.i32   d7, q11, #8
5734+        vmovn.i32   d3, q10
5735+        vand        q0, q3, q1
5736+          it ne
5737+          rsbne       v2, v1, #32
5738+        vrev16.8    q3, q3
5739+        vand        q0, q3
5740+        vsra.u64    d30, #32
5741+        vshr.u64    q1, q0, #32
5742+        vand        q0, q1
5743+        vbic        d0, d17, d0
5744+        vand        d30, d30, d17
5745+        vbic        d0, d1
5746+        vmov.i16    d1, #0x5555
5747+        vorr        d0, d30
5748+          bne         10f
5749+
5750+        @ Construct result word, no duplicates
5751+        cmp         a1, #2
5752+        vmov.u16    a1, d0[1]
5753+        vmov.u16    a2, d0[0]
5754+        it eq
5755+        orreq       a1, a2, a1, lsl #2
5756+        pop         {a2-a4,v1-v8,pc}
5757+10:
5758+        @ Construct result word, with duplicates
5759+        cmp         a1, #2
5760+        vmul.i16    d0, d1
5761+        vmov        a2, s0
5762+        vmov.u16    a1, d0[1]
5763+        lsl         a2, #16
5764+        pkhbt       a1, a1, a1, lsl #16
5765+        lsr         a2, v2
5766+        lsr         a1, v2
5767+T       itt eq
5768+T       lsleq       a1, v1
5769+T       orreq       a1, a2, a1
5770+A       orreq       a1, a2, a1, lsl v1
5771+        pop         {a2-a4,v1-v8,pc}
5772+endfunc
5773+
5774+
5775+
5776+#else // non-NEON version
5777+
5778+
5779+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
5780+ *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
5781+ *                                            int in_inc0, int in_inc1)
5782+ */
5783+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
5784+        add         ip, sp, #4*4
5785+        push        {a2-a4,v1-v8,lr}
5786+        mov         v6, #32
5787+1:      ldmdb       ip, {v1-v4}
5788+        ldrsb       v5, [a3, #8]    @ curr->ref_idx
5789+        ldrsb       v8, [a3, #9]
5790+        ldrsb       ip, [a4, #8]    @ neigh->ref_idx
5791+        ldrsb       lr, [a4, #9]
5792+        ldr         v1, [v1, v5, lsl #2]
5793+        ldrb        v5, [a3, #10]   @ curr->pred_flag
5794+        ldr         v2, [v2, v8, lsl #2]
5795+        ldrb        v8, [a4, #10]   @ neigh->pred_flag
5796+        ldr         v3, [v3, ip, lsl #2]
5797+        ldr         v4, [v4, lr, lsl #2]
5798+        teq         v5, #3
5799+        beq         20f
5800+        teq         v8, #3
5801+        beq         90f
5802+
5803+        tst         v5, #1
5804+        itee        ne
5805+        ldrne       v5, [a3, #0]    @ curr->mv[0]
5806+        moveq       v1, v2
5807+        ldreq       v5, [a3, #4]    @ curr->mv[1]
5808+        tst         v8, #1
5809+        itee        ne
5810+        ldrne       v8, [a4, #0]    @ neigh->mv[0]
5811+        moveq       v3, v4
5812+        ldreq       v8, [a4, #4]    @ neigh->mv[1]
5813+        teq         v1, v3
5814+        bne         10f
5815+        ldr         lr, =0xFFFCFFFC
5816+        ssub16      ip, v8, v5
5817+        ssub16      v5, v5, v8
5818+        sel         v5, v5, ip
5819+        ands        v5, v5, lr
5820+        @ drop through
5821+10:     it          ne
5822+        movne       v5, #1<<30
5823+11:
5824+        sub         v6, v6, #2
5825+T       mov         v7, v7, lsr #2
5826+        subs        a2, a2, #1
5827+A       orr         v7, v5, v7, lsr #2
5828+T       orr         v7, v5, v7
5829+        bhi         11b
5830+
5831+        ldrd        v3, v4, [sp, #16*4]
5832+        ldr         a2, [sp]
5833+        add         ip, sp, #16*4
5834+        subs        a1, a1, #1
5835+        add         a3, a3, v3
5836+        add         a4, a4, v4
5837+        bhi         1b
5838+        mov         a1, v7, lsr v6
5839+        pop         {a2-a4,v1-v8,pc}
5840+
5841+20:     teq         v8, #3
5842+        bne         10b
5843+
5844+        teq         v1, v3
5845+        it          eq
5846+        teqeq       v2, v4
5847+        bne         40f
5848+        teq         v1, v2
5849+        bne         30f
5850+
5851+        ldrd        v1, v2, [a3]    @ curr->mv
5852+        ldrd        v3, v4, [a4]    @ neigh->mv
5853+        ldr         lr, =0xFFFCFFFC
5854+        ssub16      ip, v3, v1
5855+        ssub16      v5, v1, v3
5856+        sel         v5, v5, ip
5857+        ands        v5, v5, lr
5858+        bne         25f
5859+        ssub16      ip, v4, v2
5860+        ssub16      v5, v2, v4
5861+        sel         v5, v5, ip
5862+        ands        v5, v5, lr
5863+        beq         11b
5864+        @ drop through
5865+25:     ssub16      ip, v4, v1
5866+        ssub16      v5, v1, v4
5867+        sel         v5, v5, ip
5868+        ands        v5, v5, lr
5869+        bne         10b
5870+        ssub16      ip, v3, v2
5871+        ssub16      v5, v2, v3
5872+        sel         v5, v5, ip
5873+        ands        v5, v5, lr
5874+        b           10b
5875+
5876+30:     ldrd        v1, v2, [a3]    @ curr->mv
5877+        ldrd        v3, v4, [a4]    @ neigh->mv
5878+        ldr         lr, =0xFFFCFFFC
5879+        ssub16      ip, v3, v1
5880+        ssub16      v5, v1, v3
5881+        sel         v5, v5, ip
5882+        ands        v5, v5, lr
5883+        bne         10b
5884+        ssub16      ip, v4, v2
5885+        ssub16      v5, v2, v4
5886+        sel         v5, v5, ip
5887+        ands        v5, v5, lr
5888+        b           10b
5889+
5890+40:     teq         v1, v4
5891+        ite         eq
5892+        teqeq       v2, v3
5893+        bne         10b
5894+
5895+        ldrd        v1, v2, [a3]    @ curr->mv
5896+        ldrd        v3, v4, [a4]    @ neigh->mv
5897+        ldr         lr, =0xFFFCFFFC
5898+        b           25b
5899+
5900+90:
5901+        mov         v5, #1<<30
5902+        b           11b
5903+endfunc
5904+
5905+
5906+#endif
5907+
5908+
5909+@ =============================================================================
5910+@
5911+@ 10 bit
5912+
5913+function hevc_loop_filter_luma_body_10
5914+        m_filter_luma 10, q11, q15
5915+endfunc
5916+
5917+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
5918+        hevc_loop_filter_luma_start
5919+        b        .Lh_loop_luma_common_10
5920+endfunc
5921+
5922+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
5923+        cmp      r3, #0
5924+        it       eq
5925+        bxeq     lr
5926+        push     {r4-r10,lr}            @ 32 bytes
5927+        ldr      r10, [sp, #32]
5928+.Lh_loop_luma_common_10:
5929+        m_filter_h_luma_16 10
5930+endfunc
5931+
5932+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
5933+        hevc_loop_filter_luma_start
5934+        sub      r4, r0, #8
5935+        b        .Lv_loop_luma_common_10
5936+endfunc
5937+
5938+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
5939+        cmp      r3, #0
5940+        it       eq
5941+        bxeq     lr
5942+        push     {r4-r10,lr}            @ 32 bytes
5943+        ldr      r4, [sp, #36]
5944+        ldr      r10, [sp, #32]
5945+
5946+.Lv_loop_luma_common_10:
5947+        m_filter_v_luma_16 10
5948+endfunc
5949+
5950+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
5951+        m_filter_h_uv_16 10
5952+endfunc
5953+
5954+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
5955+        m_filter_v_uv2_16 10
5956+endfunc
5957+
5958--- /dev/null
5959+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
5960@@ -0,0 +1,184 @@
5961+/*
5962+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
5963+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
5964+ *
5965+ * This file is part of FFmpeg.
5966+ *
5967+ * FFmpeg is free software; you can redistribute it and/or
5968+ * modify it under the terms of the GNU Lesser General Public
5969+ * License as published by the Free Software Foundation; either
5970+ * version 2.1 of the License, or (at your option) any later version.
5971+ *
5972+ * FFmpeg is distributed in the hope that it will be useful,
5973+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5974+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
5975+ * Lesser General Public License for more details.
5976+ *
5977+ * You should have received a copy of the GNU Lesser General Public
5978+ * License along with FFmpeg; if not, write to the Free Software
5979+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
5980+ */
5981+
5982+#include "libavutil/arm/asm.S"
5983+#include "neon.S"
5984+
5985+/* uses registers q8 - q13 for temp values */
5986+.macro tr4_luma_shift shift
5987+        vaddl.s16   q8, d28, d30    // c0 = src0 + src2
5988+        vaddl.s16   q9, d30, d31    // c1 = src2 + src3
5989+        vsubl.s16   q10, d28, d31   // c2 = src0 - src3
5990+        vaddl.s16   q11, d28, d31   // src0 + src3
5991+
5992+        vmul.i32    q12, q8, d1[0]  // 29 * c0
5993+        vmul.i32    q13, q10, d2[0] // 55 * c2
5994+        vmul.i32    q8, q8, d2[0]   // 55 * c0
5995+        vmull.s16   q14, d29, d0[0] // c3 = 74 * src1
5996+
5997+        vsubw.s16   q11, q11, d30   // src0 - src2 + src3
5998+        vmla.i32    q12, q9, d2[0]  // 29 * c0 + 55 * c1
5999+        vmls.i32    q13, q9, d1[0]  // 55 * c2 - 29 * c1
6000+        vmla.i32    q8, q10, d1[0]  // 55 * c0 + 29 * c2
6001+
6002+        vmul.i32    q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
6003+        vadd.i32    q12, q12, q14   // dst0 = 29 * c0 + 55 * c1 + c3
6004+        vadd.i32    q13, q13, q14   // dst1 = 55 * c2 - 29 * c1 + c3
6005+        vsub.i32    q8, q8, q14     // dst3 = 55 * c0 + 29 * c2 - c3
6006+
6007+        vqrshrn.s32 d28, q12, \shift
6008+        vqrshrn.s32 d29, q13, \shift
6009+        vqrshrn.s32 d30, q11, \shift
6010+        vqrshrn.s32 d31, q8, \shift
6011+.endm
6012+
6013+/* uses registers q8 - q11 for temp values */
6014+.macro tr4_shift shift
6015+        vmull.s16   q9, d29, d0[0]   // 83 * src1
6016+        vmull.s16   q8, d29, d0[1]   // 36 * src1
6017+        vshll.s16   q14, d28, #6     // 64 * src0
6018+        vshll.s16   q10, d30, #6     // 64 * src2
6019+        vmlal.s16   q9, d31, d0[1]   // 83 * src1 + 36 * src3  o0
6020+        vmlsl.s16   q8, d31, d0[0]   // 36 * src1 - 83 * src3  o1
6021+        vadd.s32    q11, q14, q10    // 64 * (src0 + src2)     e0
6022+        vsub.s32    q10, q14, q10    // 64 * (src0 - src2)     e1
6023+        vadd.s32    q14, q11, q9     // e0 + o0
6024+        vadd.s32    q15, q10, q8     // e1 + o1
6025+        vsub.s32    q8, q10, q8      // e1 - o1
6026+        vsub.s32    q9, q11, q9      // e0 - o0
6027+
6028+        vqrshrn.s32 d28, q14, \shift
6029+        vqrshrn.s32 d29, q15, \shift
6030+        vqrshrn.s32 d30, q8, \shift
6031+        vqrshrn.s32 d31, q9, \shift
6032+.endm
6033+
6034+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7,                         \
6035+                   tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
6036+                   tmp1, /* Q reg which doesn't alias with d7 or d0     */ \
6037+                   shift, I1, I2, I3
6038+
6039+        vmull.s16  q4, \d1, d1[1]        // 89 * src1
6040+        \I1
6041+        vmull.s16  q5, \d1, d1[0]        // 75 * src1
6042+        \I2
6043+        vmull.s16  q6, \d1, d1[3]        // 50 * src1
6044+        \I3
6045+        vmull.s16  q7, \d1, d1[2]        // 18 * src1
6046+        vmlal.s16  q4, \d3, d1[0]        // 75 * src3
6047+        vmlsl.s16  q5, \d3, d1[2]        //-18 * src3
6048+        vmlsl.s16  q6, \d3, d1[1]        //-89 * src3
6049+        vmlsl.s16  q7, \d3, d1[3]        //-50 * src3
6050+
6051+          // tr4
6052+          vmull.s16  q1, \d2, d0[0]      // 83 * src(1*2)
6053+          vmull.s16  q2, \d2, d0[1]      // 36 * src(1*2)
6054+
6055+        vmlal.s16  q4, \d5, d1[3]        // 50 * src5
6056+        vmlsl.s16  q5, \d5, d1[1]        //-89 * src5
6057+        vmlal.s16  q6, \d5, d1[2]        // 18 * src5
6058+        vmlal.s16  q7, \d5, d1[0]        // 75 * src5
6059+
6060+          vshll.s16  q3, \d0, #6         // 64 * src(0*2)
6061+          vshll.s16  \tmp0, \d4, #6      // 64 * src(2*2)
6062+          vmlal.s16  q1, \d6, d0[1]      // 83 * src(1*2) + 36 * src(3*2)  o0
6063+          vmlsl.s16  q2, \d6, d0[0]      // 36 * src(1*2) - 83 * src(3*2)  o1
6064+          vadd.i32   \tmp1, q3, \tmp0    // 64 * (src(0*2) + src(2*2))     e0
6065+          vsub.i32   \tmp0, q3, \tmp0    // 64 * (src(0*2) - src(2*2))     e1
6066+
6067+        vmlal.s16  q4, \d7, d1[2]        // 18 * src7
6068+        vmlsl.s16  q5, \d7, d1[3]        //-50 * src7
6069+        vmlal.s16  q6, \d7, d1[0]        // 75 * src7
6070+        vmlsl.s16  q7, \d7, d1[1]        //-89 * src7
6071+
6072+          vsub.i32   q3, \tmp1, q1       // e0 - o0
6073+          vadd.i32   \tmp1, \tmp1, q1    // e0 + o0
6074+          vadd.i32   q1, \tmp0, q2       // e1 + o1
6075+          vsub.i32   q2, \tmp0, q2       // e1 - o1
6076+
6077+        vadd.i32   \tmp0, \tmp1, q4      // e_8[0] + o_8[0], dst[0]
6078+        vsub.i32   q4, \tmp1, q4         // e_8[0] - o_8[0], dst[7]
6079+        vsub.i32   \tmp1, q3, q7         // e_8[3] - o_8[3], dst[4]
6080+        vadd.i32   q7, q3, q7            // e_8[3] + o_8[3], dst[3]
6081+        vadd.i32   q3, q1, q5            // e_8[1] + o_8[1], dst[1]
6082+        vsub.i32   q5, q1, q5            // e_8[1] - o_8[1], dst[6]
6083+        vsub.i32   q1, q2, q6            // e_8[2] - o_8[2], dst[5]
6084+        vadd.i32   q6, q2, q6            // e_8[2] + o_8[2], dst[2]
6085+        vqrshrn.s32   \d0, \tmp0, #\shift
6086+        vqrshrn.s32   \d4, \tmp1, #\shift
6087+        vqrshrn.s32   \d1, q3, #\shift
6088+        vqrshrn.s32   \d5, q1, #\shift
6089+        vqrshrn.s32   \d2, q6, #\shift
6090+        vqrshrn.s32   \d6, q5, #\shift
6091+        vqrshrn.s32   \d3, q7, #\shift
6092+        vqrshrn.s32   \d7, q4, #\shift
6093+.endm
6094+
6095+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
6096+        vld1.16     {\d0}, [r0 :64], r3
6097+        vld1.16     {\d1}, [r2 :64], r3
6098+        vld1.16     {\d2}, [r0 :64], r3
6099+        vld1.16     {\d3}, [r2 :64], r3
6100+        vld1.16     {\d4}, [r0 :64], r3
6101+        vld1.16     {\d5}, [r2 :64], r3
6102+        vld1.16     {\d6}, [r0 :64], r3
6103+        vld1.16     {\d7}, [r2 :64], r3
6104+
6105+        tr8_process \
6106+            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
6107+            \q01, \q23, 7, "\I1", "\I2", "\I3"
6108+.endm
6109+
6110+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
6111+        tr8_process \
6112+            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
6113+            \q01, \q23, \shift
6114+
6115+        vzip.16    \d0, \d4
6116+        vzip.16    \d1, \d5
6117+        vzip.16    \d2, \d6
6118+        vzip.16    \d3, \d7
6119+        vst4.16    {\d0-\d3}, [r0 :128], r3
6120+        vst4.16    {\d4-\d7}, [r2 :128], r3
6121+.endm
6122+
6123+#define BIT_DEPTH 8
6124+#include "rpi_hevc_idct_fn_neon.S"
6125+
6126+.text
6127+
6128+.align 4
6129+tr4f:
6130+.word 0x00240053  // 36 and d0[0] = 83
6131+.word 0x00000000
6132+tr8f:
6133+.word 0x0059004b  // 89, d1[0] = 75
6134+.word 0x00320012  // 50, d1[2] = 18
6135+tr16:
6136+.word 0x005a0057  // 90, d2[0] = 87
6137+.word 0x00500046  // 80, d2[2] = 70
6138+.word 0x0039002b  // 57, d3[0] = 43
6139+.word 0x00190009  // 25, d3[2] = 9
6140+
6141+#undef BIT_DEPTH
6142+#define BIT_DEPTH 10
6143+#include "rpi_hevc_idct_fn_neon.S"
6144+
6145--- /dev/null
6146+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
6147@@ -0,0 +1,32 @@
6148+/*
6149+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
6150+ *
6151+ * This file is part of FFmpeg.
6152+ *
6153+ * FFmpeg is free software; you can redistribute it and/or
6154+ * modify it under the terms of the GNU Lesser General Public
6155+ * License as published by the Free Software Foundation; either
6156+ * version 2.1 of the License, or (at your option) any later version.
6157+ *
6158+ * FFmpeg is distributed in the hope that it will be useful,
6159+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6160+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6161+ * Lesser General Public License for more details.
6162+ *
6163+ * You should have received a copy of the GNU Lesser General Public
6164+ * License along with FFmpeg; if not, write to the Free Software
6165+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6166+ */
6167+
6168+#include "libavutil/attributes.h"
6169+#include "libavutil/arm/cpu.h"
6170+#include "libavcodec/rpi_hevcdsp.h"
6171+#include "rpi_hevcdsp_arm.h"
6172+
6173+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
6174+{
6175+    int cpu_flags = av_get_cpu_flags();
6176+
6177+    if (have_neon(cpu_flags))
6178+        ff_hevcdsp_rpi_init_neon(c, bit_depth);
6179+}
6180--- /dev/null
6181+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
6182@@ -0,0 +1,467 @@
6183+/*
6184+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
6185+ *
6186+ * This file is part of FFmpeg.
6187+ *
6188+ * FFmpeg is free software; you can redistribute it and/or
6189+ * modify it under the terms of the GNU Lesser General Public
6190+ * License as published by the Free Software Foundation; either
6191+ * version 2.1 of the License, or (at your option) any later version.
6192+ *
6193+ * FFmpeg is distributed in the hope that it will be useful,
6194+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6195+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
6196+ * Lesser General Public License for more details.
6197+ *
6198+ * You should have received a copy of the GNU Lesser General Public
6199+ * License along with FFmpeg; if not, write to the Free Software
6200+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6201+ */
6202+
6203+#include "config.h"
6204+#include "libavutil/attributes.h"
6205+#include "libavutil/arm/cpu.h"
6206+#include "libavcodec/rpi_hevcdsp.h"
6207+#include "rpi_hevcdsp_arm.h"
6208+#include "libavcodec/avcodec.h"
6209+#include "libavcodec/bit_depth_template.c"
6210+
6211+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
6212+// have been removed from head as we never use them.
6213+
6214+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
6215+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
6216+
6217+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
6218+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
6219+
6220+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
6221+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
6222+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
6223+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
6224+                             uint8_t * _pix_l);
6225+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
6226+                             unsigned int no_f);
6227+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
6228+                             uint8_t * src_l,
6229+                             unsigned int no_f);
6230+
6231+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
6232+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
6233+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
6234+                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
6235+                             uint8_t * _pix_l);
6236+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
6237+                             unsigned int no_f);
6238+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
6239+                             uint8_t * src_l,
6240+                             unsigned int no_f);
6241+
6242+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
6243+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
6244+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
6245+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
6246+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
6247+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
6248+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
6249+
6250+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
6251+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
6252+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
6253+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
6254+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
6255+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
6256+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
6257+
6258+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
6259+                                     ptrdiff_t stride);
6260+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
6261+                                     ptrdiff_t stride);
6262+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
6263+                                       ptrdiff_t stride);
6264+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
6265+                                       ptrdiff_t stride);
6266+
6267+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
6268+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
6269+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
6270+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
6271+
6272+
6273+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
6274+                                     ptrdiff_t stride);
6275+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
6276+                                     ptrdiff_t stride);
6277+void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
6278+                                       ptrdiff_t stride);
6279+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
6280+                                       ptrdiff_t stride);
6281+
6282+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
6283+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
6284+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
6285+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
6286+
6287+
6288+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
6289+                                       ptrdiff_t stride, int dc_v);
6290+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
6291+                                       ptrdiff_t stride, int dc_v);
6292+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
6293+                                       ptrdiff_t stride, int dc_v);
6294+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
6295+                                       ptrdiff_t stride, int dc_u);
6296+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
6297+                                       ptrdiff_t stride, int dc_u);
6298+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
6299+                                       ptrdiff_t stride, int dc_u);
6300+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
6301+                                       ptrdiff_t stride);
6302+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
6303+                                       ptrdiff_t stride);
6304+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
6305+                                       ptrdiff_t stride);
6306+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
6307+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
6308+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
6309+
6310+
6311+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
6312+                                       ptrdiff_t stride, int dc_v);
6313+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
6314+                                       ptrdiff_t stride, int dc_v);
6315+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
6316+                                       ptrdiff_t stride, int dc_v);
6317+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
6318+                                       ptrdiff_t stride, int dc_u);
6319+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
6320+                                       ptrdiff_t stride, int dc_u);
6321+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
6322+                                       ptrdiff_t stride, int dc_u);
6323+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
6324+                                       ptrdiff_t stride);
6325+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
6326+                                       ptrdiff_t stride);
6327+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
6328+                                       ptrdiff_t stride);
6329+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
6330+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
6331+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
6332+
6333+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6334+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6335+void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6336+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6337+
6338+void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6339+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6340+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6341+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
6342+
6343+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6344+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6345+                                  int eo, int width, int height);
6346+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6347+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6348+                                  int eo, int width, int height);
6349+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6350+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6351+                                  int eo, int width, int height);
6352+
6353+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6354+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6355+                                  int eo, int width, int height);
6356+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6357+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6358+                                  int eo, int width, int height);
6359+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6360+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6361+                                  int eo, int width, int height);
6362+
6363+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
6364+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6365+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6366+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6367+                                  int width, int height);
6368+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
6369+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6370+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6371+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6372+                                  int width, int height);
6373+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
6374+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6375+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6376+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6377+                                  int width, int height);
6378+
6379+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
6380+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6381+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6382+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6383+                                  int width, int height);
6384+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
6385+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6386+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6387+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6388+                                  int width, int height);
6389+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
6390+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6391+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6392+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6393+                                  int width, int height);
6394+
6395+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6396+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6397+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6398+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6399+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6400+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6401+void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6402+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6403+
6404+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6405+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6406+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6407+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6408+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6409+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6410+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6411+                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
6412+
6413+
6414+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
6415+                                                const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
6416+                                                int in_inc0, int in_inc1);
6417+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
6418+
6419+
6420+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
6421+{
6422+    ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
6423+    ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
6424+}
6425+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
6426+{
6427+    ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
6428+    ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
6429+}
6430+
6431+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6432+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
6433+{
6434+    ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
6435+    ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
6436+}
6437+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6438+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
6439+{
6440+    ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
6441+    ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
6442+}
6443+
6444+#if SAO_FILTER_N == 6
6445+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
6446+{
6447+    ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
6448+    ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
6449+}
6450+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
6451+{
6452+    ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
6453+    ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
6454+}
6455+
6456+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6457+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
6458+{
6459+    ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
6460+    ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
6461+}
6462+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
6463+                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
6464+{
6465+    ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
6466+    ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
6467+}
6468+
6469+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6470+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6471+                                  int eo, int width, int height)
6472+{
6473+    ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
6474+    ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
6475+}
6476+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
6477+                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
6478+                                  int eo, int width, int height)
6479+{
6480+    ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
6481+    ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
6482+}
6483+
6484+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
6485+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6486+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6487+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6488+                                  int width, int height)
6489+{
6490+    ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
6491+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
6492+    ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
6493+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
6494+}
6495+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
6496+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
6497+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
6498+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
6499+                                  int width, int height)
6500+{
6501+    ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
6502+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
6503+    ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
6504+                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
6505+}
6506+#endif
6507+
6508+
6509+
6510+#if RPI_HEVC_SAO_BUF_STRIDE != 160
6511+#error SAO edge src stride not 160 - value used in .S
6512+#endif
6513+
6514+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
6515+{
6516+    if (bit_depth == 8) {
6517+        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_8;
6518+        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_8;
6519+        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_8;
6520+        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_8;
6521+        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
6522+        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
6523+        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_8;
6524+        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
6525+        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_8;
6526+        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_8;
6527+        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_8;
6528+        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_8;
6529+        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_8;
6530+        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_8;
6531+        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_8;
6532+        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_8;
6533+        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_8;
6534+        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_8;
6535+        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
6536+        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
6537+        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
6538+        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
6539+        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_8;
6540+        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_8;
6541+        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_8;
6542+        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_8;
6543+        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_8;
6544+        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_8;
6545+        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_8;
6546+        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_8;
6547+        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_8;
6548+        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
6549+        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
6550+        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
6551+        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_8;
6552+        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_8;
6553+        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_8;
6554+        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_8;
6555+        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_8;
6556+        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_8;
6557+        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_8;
6558+        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_8;
6559+        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_8;
6560+        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_8;
6561+        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_8;
6562+#if SAO_FILTER_N == 6
6563+        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_8;
6564+        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_8;
6565+#endif
6566+        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_8;
6567+        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_8;
6568+        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_8;
6569+
6570+        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_8;
6571+        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_8;
6572+        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_8;
6573+
6574+#if SAO_FILTER_N == 6
6575+        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_8;
6576+        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_8;
6577+#endif
6578+    }
6579+    else if (bit_depth == 10) {
6580+        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_10;
6581+        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_10;
6582+        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_10;
6583+        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_10;
6584+        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
6585+        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
6586+        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_10;
6587+        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
6588+        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_10;
6589+        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_10;
6590+        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_10;
6591+        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_10;
6592+        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_10;
6593+        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_10;
6594+        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_10;
6595+        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_10;
6596+        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_10;
6597+        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_10;
6598+        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
6599+        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
6600+        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
6601+        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
6602+        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_10;
6603+        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_10;
6604+        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_10;
6605+        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_10;
6606+        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_10;
6607+        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_10;
6608+        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_10;
6609+        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_10;
6610+        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_10;
6611+        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
6612+        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
6613+        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
6614+        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_10;
6615+        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_10;
6616+        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_10;
6617+        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_10;
6618+        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_10;
6619+        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_10;
6620+
6621+        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_10;
6622+        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_10;
6623+        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_10;
6624+        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_10;
6625+        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_10;
6626+#if SAO_FILTER_N == 6
6627+        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_10;
6628+        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_10;
6629+#endif
6630+        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_10;
6631+        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_10;
6632+        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_10;
6633+
6634+        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_10;
6635+        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_10;
6636+        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_10;
6637+
6638+#if SAO_FILTER_N == 6
6639+        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_10;
6640+        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_10;
6641+#endif
6642+    }
6643+
6644+    assert(offsetof(HEVCRpiMvField, mv) == 0);
6645+    assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
6646+    assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
6647+    c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
6648+    c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
6649+}
6650--- /dev/null
6651+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
6652@@ -0,0 +1,620 @@
6653+/*
6654+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
6655+All rights reserved.
6656+
6657+Redistribution and use in source and binary forms, with or without
6658+modification, are permitted provided that the following conditions are met:
6659+    * Redistributions of source code must retain the above copyright
6660+      notice, this list of conditions and the following disclaimer.
6661+    * Redistributions in binary form must reproduce the above copyright
6662+      notice, this list of conditions and the following disclaimer in the
6663+      documentation and/or other materials provided with the distribution.
6664+    * Neither the name of the copyright holder nor the
6665+      names of its contributors may be used to endorse or promote products
6666+      derived from this software without specific prior written permission.
6667+
6668+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
6669+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
6670+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
6671+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
6672+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
6673+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
6674+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
6675+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
6676+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
6677+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6678+
6679+Authors: John Cox, Ben Avison
6680+*/
6681+
6682+#include "libavutil/arm/asm.S"
6683+#include "neon.S"
6684+
6685+ .arch_extension mp @ enable PLDW
6686+
6687+#define BIT_DEPTH 10
6688+
6689+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
6690+        vmax.s16  \Q0, \Q_MIN
6691+        vmax.s16  \Q1, \Q_MIN
6692+        vmax.s16  \Q2, \Q_MIN
6693+        vmax.s16  \Q3, \Q_MIN
6694+        vmin.s16  \Q0, \Q_MAX
6695+        vmin.s16  \Q1, \Q_MAX
6696+        vmin.s16  \Q2, \Q_MAX
6697+        vmin.s16  \Q3, \Q_MAX
6698+.endm
6699+
6700+@ add_residual4x4(
6701+@  uint16_t *_dst,    [r0]
6702+@  int16_t *res,      [r1]
6703+@  ptrdiff_t stride)  [r2]
6704+
6705+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
6706+        add         ip, r0, r2
6707+        vld1.16     {q10, q11}, [r1]
6708+        lsl         r2, #1
6709+        vld1.16     {d0}, [r0 :64], r2
6710+        vld1.16     {d1}, [ip :64], r2
6711+        vld1.16     {d2}, [r0 :64]
6712+        vld1.16     {d3}, [ip :64]
6713+        sub         r0, r2
6714+        vqadd.s16   q0,  q10
6715+        sub         ip, r2
6716+        vqadd.s16   q1,  q11
6717+        vmov.i16    q8,  #0
6718+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6719+        vmax.s16    q0,  q0,  q8
6720+        vmax.s16    q1,  q1,  q8
6721+        vmin.s16    q0,  q0,  q9
6722+        vmin.s16    q1,  q1,  q9
6723+        vst1.16     {d0}, [r0 :64], r2
6724+        vst1.16     {d1}, [ip :64], r2
6725+        vst1.16     {d2}, [r0 :64]
6726+        vst1.16     {d3}, [ip :64]
6727+        bx          lr
6728+
6729+endfunc
6730+
6731+@ add_residual4x4_dc(
6732+@  uint16_t *_dst,    [r0]
6733+@  ptrdiff_t stride,  [r1]
6734+@  int dc)            [r2]
6735+
6736+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
6737+        add         ip, r0, r1
6738+        vdup.16     q15, r2
6739+        lsl         r1, #1
6740+        vld1.16     {d0}, [r0 :64], r1
6741+        vld1.16     {d1}, [ip :64], r1
6742+        vld1.16     {d2}, [r0 :64]
6743+        vld1.16     {d3}, [ip :64]
6744+        sub         r0, r1
6745+        vqadd.s16   q0,  q15
6746+        sub         ip, r1
6747+        vqadd.s16   q1,  q15
6748+        vmov.i16    q8,  #0
6749+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6750+        vmax.s16    q0,  q0,  q8
6751+        vmax.s16    q1,  q1,  q8
6752+        vmin.s16    q0,  q0,  q9
6753+        vmin.s16    q1,  q1,  q9
6754+        vst1.16     {d0}, [r0 :64], r1
6755+        vst1.16     {d1}, [ip :64], r1
6756+        vst1.16     {d2}, [r0 :64]
6757+        vst1.16     {d3}, [ip :64]
6758+        bx          lr
6759+
6760+endfunc
6761+
6762+
6763+@ add_residual8x8(
6764+@  uint16_t *_dst,    [r0]
6765+@  int16_t *res,      [r1]
6766+@  ptrdiff_t stride)  [r2]
6767+
6768+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
6769+        mov         r3, #8
6770+        vmov.i64    q8,  #0
6771+        add         ip, r0, r2
6772+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6773+        lsl         r2, #1
6774+1:
6775+        vldm        r1!, {q10-q13}
6776+        vld1.16     {q0}, [r0 :128], r2
6777+        vld1.16     {q1}, [ip :128], r2
6778+        vld1.16     {q2}, [r0 :128]
6779+        vld1.16     {q3}, [ip :128]
6780+        sub         r0, r2
6781+        vqadd.s16   q0,  q10
6782+        sub         ip, r2
6783+        vqadd.s16   q1,  q11
6784+        subs        r3, #4
6785+        vqadd.s16   q2,  q12
6786+        vqadd.s16   q3,  q13
6787+        clip16_4    q0, q1, q2, q3, q8, q9
6788+        vst1.16     {q0}, [r0 :128], r2
6789+        vst1.16     {q1}, [ip :128], r2
6790+        vst1.16     {q2}, [r0 :128], r2
6791+        vst1.16     {q3}, [ip :128], r2
6792+        bne         1b
6793+        bx          lr
6794+
6795+endfunc
6796+
6797+@ add_residual4x4_dc_c(
6798+@  uint16_t *_dst,    [r0]
6799+@  ptrdiff_t stride,  [r1]
6800+@  int dc_uv)         [r2]
6801+
6802+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
6803+        mov         r3, #4
6804+        vdup.32     q15, r2
6805+        b           9f
6806+endfunc
6807+
6808+@ add_residual8x8_dc(
6809+@  uint16_t *_dst,    [r0]
6810+@  ptrdiff_t stride,  [r1]
6811+@  int dc)            [r2]
6812+
6813+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
6814+        vdup.16     q15, r2
6815+        mov         r3, #8
6816+9:
6817+        vmov.i16    q8,  #0
6818+        add         ip, r0, r1
6819+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6820+        lsl         r1, #1
6821+1:
6822+        vld1.16     {q0}, [r0 :128], r1
6823+        vld1.16     {q1}, [ip :128], r1
6824+        vld1.16     {q2}, [r0 :128]
6825+        vld1.16     {q3}, [ip :128]
6826+        sub         r0, r1
6827+        vqadd.s16   q0,  q15
6828+        sub         ip, r1
6829+        vqadd.s16   q1,  q15
6830+        subs        r3, #4
6831+        vqadd.s16   q2,  q15
6832+        vqadd.s16   q3,  q15
6833+        clip16_4    q0, q1, q2, q3, q8, q9
6834+        vst1.16     {q0}, [r0 :128], r1
6835+        vst1.16     {q1}, [ip :128], r1
6836+        vst1.16     {q2}, [r0 :128], r1
6837+        vst1.16     {q3}, [ip :128], r1
6838+        bne         1b
6839+        bx          lr
6840+
6841+endfunc
6842+
6843+@ add_residual16x16(
6844+@  uint16_t *_dst,    [r0]
6845+@  int16_t *res,      [r1]
6846+@  ptrdiff_t stride)  [r2]
6847+
6848+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
6849+        add         ip, r0, r2
6850+        vmov.i16    q8,  #0
6851+        lsl         r2, #1
6852+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6853+        mov         r3, #16
6854+1:
6855+        vldm        r1!, {q10-q13}
6856+        @ For RPI Sand we could guarantee :256 but not for general
6857+        @ non-RPI allocation. :128 is as good as we can claim
6858+        vld1.16     {q0, q1}, [r0 :128]
6859+        subs        r3, #2
6860+        vld1.16     {q2, q3}, [ip :128]
6861+        vqadd.s16   q0,  q10
6862+        vqadd.s16   q1,  q11
6863+        vqadd.s16   q2,  q12
6864+        vqadd.s16   q3,  q13
6865+        clip16_4    q0, q1, q2, q3, q8, q9
6866+        vst1.16     {q0, q1}, [r0 :128], r2
6867+        vst1.16     {q2, q3}, [ip :128], r2
6868+        bne         1b
6869+        bx          lr
6870+endfunc
6871+
6872+@ add_residual8x8_dc_c(
6873+@  uint16_t *_dst,    [r0]
6874+@  ptrdiff_t stride,  [r1]
6875+@  int dc_uv)         [r2]
6876+
6877+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
6878+        mov         r3, #8
6879+        vdup.32     q15, r2
6880+        b           9f
6881+endfunc
6882+
6883+@ add_residual16x16_dc(
6884+@  uint16_t *_dst,    [r0]
6885+@  ptrdiff_t stride,  [r1]
6886+@  int dc)            [r2]
6887+
6888+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
6889+        vdup.i16    q15, r2
6890+        mov         r3, #16
6891+9:
6892+        vmov.i16    q8,  #0
6893+        add         ip, r0, r1
6894+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6895+        lsl         r1, #1
6896+1:
6897+        @ For RPI Sand we could guarantee :256 but not for general
6898+        @ non-RPI allocation. :128 is as good as we can claim
6899+        vld1.16     {q0, q1}, [r0 :128]
6900+        subs        r3, #2
6901+        vqadd.s16   q0,  q15
6902+        vqadd.s16   q1,  q15
6903+        vld1.16     {q2, q3}, [ip :128]
6904+        vqadd.s16   q2,  q15
6905+        vqadd.s16   q3,  q15
6906+        clip16_4    q0, q1, q2, q3, q8, q9
6907+        vst1.16     {q0, q1}, [r0 :128], r1
6908+        vst1.16     {q2, q3}, [ip :128], r1
6909+        bne         1b
6910+        bx          lr
6911+
6912+endfunc
6913+
6914+
6915+@ add_residual32x32(
6916+@  uint16_t *_dst,    [r0]
6917+@  int16_t *res,      [r1]
6918+@  ptrdiff_t stride)  [r2]
6919+
6920+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
6921+        push        {lr}
6922+        mov         r3, #32
6923+        vmov.i16    q8,  #0
6924+        add         lr, r0, r2
6925+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6926+        add         ip, r0, #32
6927+1:
6928+        vldm        r1!, {q10-q13}
6929+        vldm        r0,  {q0-q3}
6930+        vqadd.s16   q0,  q10
6931+          pldw        [lr]
6932+        vqadd.s16   q1,  q11
6933+          add         lr, r2
6934+        vqadd.s16   q2,  q12
6935+        subs        r3, #1
6936+        vqadd.s16   q3,  q13
6937+        clip16_4    q0, q1, q2, q3, q8, q9
6938+        vst1.16     {q0-q1}, [r0], r2
6939+        vst1.16     {q2-q3}, [ip], r2
6940+        bne         1b
6941+        pop         {pc}
6942+
6943+endfunc
6944+
6945+@ add_residual16x16_dc_c(
6946+@  uint16_t *_dst,    [r0]
6947+@  ptrdiff_t stride,  [r1]
6948+@  int dc_uv)         [r2]
6949+
6950+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
6951+        mov         r3, #16
6952+        vdup.32     q15, r2
6953+        b           9f
6954+endfunc
6955+
6956+@ add_residual32x32_dc(
6957+@  uint16_t *_dst,    [r0]
6958+@  ptrdiff_t stride,  [r1]
6959+@  int dc)            [r2]
6960+
6961+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
6962+        vdup.16     q15, r2
6963+        mov         r3, #32
6964+9:
6965+        vmov.i16    q8,  #0
6966+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
6967+        add         ip, r0, #32
6968+1:
6969+        vldm        r0,  {q0-q3}
6970+        vqadd.s16   q0,  q15
6971+        subs        r3, #1
6972+        vqadd.s16   q1,  q15
6973+        vqadd.s16   q2,  q15
6974+        vqadd.s16   q3,  q15
6975+        clip16_4    q0, q1, q2, q3, q8, q9
6976+        vst1.16     {q0-q1}, [r0], r1
6977+        vst1.16     {q2-q3}, [ip], r1
6978+        bne         1b
6979+        bx          lr
6980+
6981+endfunc
6982+
6983+@ ============================================================================
6984+@ U add
6985+
6986+@ add_residual4x4_u(
6987+@   uint16_t *_dst,       [r0]
6988+@   const int16_t *res,   [r1]
6989+@   ptrdiff_t stride,     [r2]
6990+@   int dc)               [r3]
6991+
6992+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
6993+        vdup.16     q15, r3
6994+        add         ip, r0, r2
6995+        vld1.16     {q10, q11}, [r1 :256]
6996+        lsl         r2, #1
6997+        vld2.16     {d0, d2}, [r0 :128], r2
6998+        vld2.16     {d1, d3}, [ip :128], r2
6999+        vld2.16     {d4, d6}, [r0 :128]
7000+        vld2.16     {d5, d7}, [ip :128]
7001+        sub         r0, r2
7002+        vmov.i16    q8,  #0
7003+        sub         ip, r2
7004+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
7005+
7006+        vqadd.s16   q0,  q10
7007+        vqadd.s16   q1,  q15
7008+        vqadd.s16   q2,  q11
7009+        vqadd.s16   q3,  q15
7010+        clip16_4    q0, q1, q2, q3, q8, q9
7011+
7012+        vst2.16     {d0, d2}, [r0 :128], r2
7013+        vst2.16     {d1, d3}, [ip :128], r2
7014+        vst2.16     {d4, d6}, [r0 :128]
7015+        vst2.16     {d5, d7}, [ip :128]
7016+        bx          lr
7017+endfunc
7018+
7019+@ add_residual8x8_u(
7020+@   uint16_t *_dst,       [r0]
7021+@   const int16_t *res,   [r1]
7022+@   ptrdiff_t stride,     [r2]
7023+@   int dc)               [r3]
7024+
7025+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
7026+        vdup.16     q15, r3
7027+        mov         r3, #8
7028+        vmov.i16    q8,  #0
7029+        add         ip, r0, r2
7030+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
7031+        lsl         r2, #1
7032+1:
7033+        vld2.16     {q0, q1}, [r0 :256]
7034+        subs        r3, #2
7035+        vld2.16     {q2, q3}, [ip :256]
7036+        vld1.16     {q10, q11}, [r1 :256]!
7037+        vqadd.s16   q0,  q10
7038+        vqadd.s16   q1,  q15
7039+        vqadd.s16   q2,  q11
7040+        vqadd.s16   q3,  q15
7041+        clip16_4    q0, q1, q2, q3, q8, q9
7042+        vst2.16     {q0, q1}, [r0 :256], r2
7043+        vst2.16     {q2, q3}, [ip :256], r2
7044+        bne         1b
7045+        bx          lr
7046+endfunc
7047+
+@ add_residual16x16_u() - add U residual, apply dc offset to V (interleaved CbCr)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  16x16 U residual
+@   ptrdiff_t stride,     [r2]  byte stride
+@   int dc)               [r3]  dc added to the V samples
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
+        push        {lr}
+        vdup.16     q15, r3                  @ broadcast V dc
+        mov         r3, #16                  @ row counter
+        vmov.i16    q8,  #0                  @ clip lo
+        add         lr, r0, r2               @ lr tracks next row for pldw
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+        add         ip, r0, #32              @ ip -> 2nd half of 64-byte row
+1:
+        vld2.16     {q0, q1}, [r0 :256]      @ q0/q2=U, q1/q3=V
+        vld2.16     {q2, q3}, [ip :256]
+        vld1.16     {q10, q11}, [r1 :256]!
+        vqadd.s16   q0,  q10
+          pldw        [lr]                   @ preload next row for write (MP ext)
+        vqadd.s16   q1,  q15
+          add         lr, r2
+        vqadd.s16   q2,  q11
+        subs        r3, #1
+        vqadd.s16   q3,  q15
+        clip16_4    q0, q1, q2, q3, q8, q9
+        vst2.16     {q0, q1}, [r0 :256], r2
+        vst2.16     {q2, q3}, [ip :256], r2
+        bne         1b
+        pop         {pc}
+endfunc
7079+
7080+@ ============================================================================
7081+@ V add
7082+
+@ add_residual4x4_v() - add V residual, apply dc offset to U (interleaved CbCr)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  4x4 V residual
+@   ptrdiff_t stride,     [r2]  byte stride
+@   int dc)               [r3]  dc added to the U samples
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
+        vdup.16     q15, r3                  @ broadcast U dc
+        add         ip, r0, r2               @ ip -> 2nd row
+        vld1.16     {q10, q11}, [r1 :256]    @ whole 4x4 residual
+        lsl         r2, #1
+        vld2.16     {d0, d2}, [r0 :128], r2  @ rows 0-3: even regs=U, odd=V
+        vld2.16     {d1, d3}, [ip :128], r2
+        vld2.16     {d4, d6}, [r0 :128]
+        vld2.16     {d5, d7}, [ip :128]
+        sub         r0, r2                   @ rewind pointers for the stores
+        vmov.i16    q8,  #0                  @ clip lo
+        sub         ip, r2
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+
+        vqadd.s16   q0,  q15                 @ U += dc
+        vqadd.s16   q1,  q10                 @ V += residual
+        vqadd.s16   q2,  q15
+        vqadd.s16   q3,  q11
+        clip16_4    q0, q1, q2, q3, q8, q9
+
+        vst2.16     {d0, d2}, [r0 :128], r2
+        vst2.16     {d1, d3}, [ip :128], r2
+        vst2.16     {d4, d6}, [r0 :128]
+        vst2.16     {d5, d7}, [ip :128]
+        bx          lr
+endfunc
7115+
+@ add_residual8x8_v() - add V residual, apply dc offset to U (interleaved CbCr)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  8x8 V residual
+@   ptrdiff_t stride,     [r2]  byte stride
+@   int dc)               [r3]  dc added to the U samples
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
+        vdup.16     q15, r3                  @ broadcast U dc
+        mov         r3, #8                   @ row counter
+        vmov.i16    q8,  #0                  @ clip lo
+        add         ip, r0, r2               @ ip -> 2nd row
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+        lsl         r2, #1                   @ step 2 rows per iteration
+1:
+        vld2.16     {q0, q1}, [r0 :256]      @ q0/q2=U, q1/q3=V
+        subs        r3, #2
+        vld2.16     {q2, q3}, [ip :256]
+        vld1.16     {q10, q11}, [r1 :256]!
+        vqadd.s16   q0,  q15                 @ U += dc
+        vqadd.s16   q1,  q10                 @ V += residual
+        vqadd.s16   q2,  q15
+        vqadd.s16   q3,  q11
+        clip16_4    q0, q1, q2, q3, q8, q9
+        vst2.16     {q0, q1}, [r0 :256], r2
+        vst2.16     {q2, q3}, [ip :256], r2
+        bne         1b
+        bx          lr
+endfunc
7144+
+@ add_residual16x16_v() - add V residual, apply dc offset to U (interleaved CbCr)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  16x16 V residual
+@   ptrdiff_t stride,     [r2]  byte stride
+@   int dc)               [r3]  dc added to the U samples
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
+        push        {lr}
+        vdup.16     q15, r3                  @ broadcast U dc
+        mov         r3, #16                  @ row counter
+        vmov.i16    q8,  #0                  @ clip lo
+        add         lr, r0, r2               @ lr tracks next row for pldw
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+        add         ip, r0, #32              @ ip -> 2nd half of 64-byte row
+1:
+        vld2.16     {q0, q1}, [r0 :256]
+        vld2.16     {q2, q3}, [ip :256]
+        vld1.16     {q10, q11}, [r1 :256]!
+        vqadd.s16   q0,  q15                 @ U += dc
+          pldw        [lr]                   @ preload next row for write
+        vqadd.s16   q1,  q10                 @ V += residual
+          add         lr, r2
+        vqadd.s16   q2,  q15
+        subs        r3, #1
+        vqadd.s16   q3,  q11
+        clip16_4    q0, q1, q2, q3, q8, q9
+        vst2.16     {q0, q1}, [r0 :256], r2
+        vst2.16     {q2, q3}, [ip :256], r2
+        bne         1b
+        pop         {pc}
+endfunc
7176+
7177+@ ============================================================================
7178+@ U & V add
7179+
+@ add_residual4x4_c() - add both U and V residuals (interleaved CbCr dst)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  4x4 U residual followed by 4x4 V residual
+@   ptrdiff_t stride)     [r2]  byte stride
+
+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
+        vmov.i16    q8,  #0                  @ clip lo
+        add         ip, r0, r2               @ ip -> 2nd row
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+        lsl         r2, #1
+        vldm        r1, {q10-q13}            @ q10-q11=U res, q12-q13=V res
+        vld2.16     {d0, d2}, [r0 :128], r2  @ even regs=U, odd regs=V
+        vld2.16     {d1, d3}, [ip :128], r2
+        vld2.16     {d4, d6}, [r0 :128]
+        vld2.16     {d5, d7}, [ip :128]
+
+        sub         r0, r2                   @ rewind for the stores
+        vqadd.s16   q0,  q10                 @ U rows 0-1 += U res
+        sub         ip, r2
+        vqadd.s16   q1,  q12                 @ V rows 0-1 += V res
+        vqadd.s16   q2,  q11
+        vqadd.s16   q3,  q13
+        clip16_4    q0, q1, q2, q3, q8, q9
+
+        vst2.16     {d0, d2}, [r0 :128], r2
+        vst2.16     {d1, d3}, [ip :128], r2
+        vst2.16     {d4, d6}, [r0 :128]
+        vst2.16     {d5, d7}, [ip :128]
+        bx          lr
+endfunc
7210+
+@ add_residual8x8_c() - add both U and V residuals (interleaved CbCr dst)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  8x8 U residual followed by 8x8 V residual
+@   ptrdiff_t stride)     [r2]  byte stride
+
+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
+        push        {lr}
+        add         ip, r0, r2               @ ip -> 2nd row
+        lsl         r2, #1                   @ step 2 rows per iteration
+        vmov.i16    q8,  #0                  @ clip lo
+        add         r3, r1, #(8*8*2)  @ Offset to V
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+        mov         lr, #8                   @ row counter
+1:
+        vld1.16     {q10, q11}, [r1 :256]!   @ U residual
+        subs        lr, #2
+        vld2.16     {q0, q1}, [r0 :256]      @ q0/q2=U, q1/q3=V
+        vld2.16     {q2, q3}, [ip :256]
+        vld1.16     {q12, q13}, [r3 :256]!   @ V residual
+        vqadd.s16   q0,  q10
+        vqadd.s16   q1,  q12
+        vqadd.s16   q2,  q11
+        vqadd.s16   q3,  q13
+        clip16_4    q0, q1, q2, q3, q8, q9
+        vst2.16     {q0, q1}, [r0 :256], r2
+        vst2.16     {q2, q3}, [ip :256], r2
+        bne         1b
+        pop         {pc}
+endfunc
7240+
+@ add_residual16x16_c() - add both U and V residuals (interleaved CbCr dst)
+@   uint16_t *_dst,       [r0]
+@   const int16_t *res,   [r1]  16x16 U residual followed by 16x16 V residual
+@   ptrdiff_t stride)     [r2]  byte stride
+
+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
+        push        {r4, lr}
+        vmov.i16    q8,  #0                  @ clip lo
+        add         r3,  r1, #(16*16*2)  @ Offset to V
+        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ clip hi
+        add         ip, r0, #32              @ ip -> 2nd half of 64-byte row
+        add         r4, r0, r2               @ r4 tracks next row for pldw
+        mov         lr, #16                  @ row counter
+1:
+        vld2.16     {q0, q1}, [r0 :256]      @ q0/q2=U, q1/q3=V
+        vld2.16     {q2, q3}, [ip :256]
+        vld1.16     {q10, q11}, [r1 :256]!   @ U residual
+        vld1.16     {q12, q13}, [r3 :256]!   @ V residual
+        vqadd.s16   q0,  q10
+          pldw        [r4]                   @ preload next row for write
+        vqadd.s16   q1,  q12
+          add         r4, r2
+        vqadd.s16   q2,  q11
+        subs        lr, #1
+        vqadd.s16   q3,  q13
+        clip16_4    q0, q1, q2, q3, q8, q9
+        vst2.16     {q0, q1}, [r0 :256], r2
+        vst2.16     {q2, q3}, [ip :256], r2
+        bne         1b
+        pop         {r4,pc}
+endfunc
7272+
7273--- /dev/null
7274+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
7275@@ -0,0 +1,741 @@
7276+/*
7277+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
7278+All rights reserved.
7279+
7280+Redistribution and use in source and binary forms, with or without
7281+modification, are permitted provided that the following conditions are met:
7282+    * Redistributions of source code must retain the above copyright
7283+      notice, this list of conditions and the following disclaimer.
7284+    * Redistributions in binary form must reproduce the above copyright
7285+      notice, this list of conditions and the following disclaimer in the
7286+      documentation and/or other materials provided with the distribution.
7287+    * Neither the name of the copyright holder nor the
7288+      names of its contributors may be used to endorse or promote products
7289+      derived from this software without specific prior written permission.
7290+
7291+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
7292+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
7293+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
7294+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
7295+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
7296+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
7297+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
7298+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
7299+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
7300+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7301+
7302+Authors: John Cox, Ben Avison
7303+*/
7304+
7305+#include "libavutil/arm/asm.S"
7306+#include "neon.S"
7307+
7308+ .arch_extension mp @ enable PLDW
7309+
7310+@ General notes:
7311+@
7312+@ Residual is generally only guaranteed to be clipped to 16 bits.
7313+@ This means that we do need to do vmovl, vqadd, vqmovun
7314+@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away
7315+@ with this).
7316+@
7317+@ There is an exception for the DC case because its transform is guaranteed
7318+@ to be small enough that overflow cannot occur during the first add.
7319+
7320+@ ============================================================================
7321+@ Y add
7322+
+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1  @ dst(u8)[4x4] = clip(dst + res(s16))
+        add         ip, r0, r2               @ ip -> 2nd row
+        vld1.16     {q0, q1}, [r1]           @ whole 4x4 residual
+        lsl         r2, #1                   @ r0/ip step 2 rows
+        vld1.32     d4[0], [r0], r2          @ rows 0,1,2,3 packed into d4/d5
+        rsb         r3, r2, #0               @ negative stride to rewind
+        vld1.32     d4[1], [ip], r2
+        vld1.32     d5[0], [r0], r3
+        vld1.32     d5[1], [ip], r3
+        vmovl.u8    q8, d4                   @ widen to 16 bit before add
+        vmovl.u8    q9, d5
+        vqadd.s16   q0, q8                   @ saturating add (res is full 16 bit)
+        vqadd.s16   q1, q9
+        vqmovun.s16 d0, q0                   @ narrow with unsigned saturation
+        vqmovun.s16 d1, q1
+        vst1.32     d0[0], [r0], r2
+        vst1.32     d0[1], [ip], r2
+        vst1.32     d1[0], [r0]
+        vst1.32     d1[1], [ip]
+        bx          lr
+endfunc
7344+
+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1  @ dst(u8)[8x8] = clip(dst + res(s16))
+        push        {r4, lr}
+        vld1.16     {q0, q1}, [r1]!          @ residual rows 0-1
+        add         ip, r0, r2               @ ip -> 2nd row
+        vld1.8      {d6}, [r0]
+        add         r4, r0, r2, lsl #1       @ r4/lr are the load pointers, 2 rows ahead
+        vld1.8      {d7}, [ip]
+        add         lr, ip, r2, lsl #1
+        lsl         r2, #1
+        mov         r3, #8-2                 @ 2 rows done outside the loop
+        vmovl.u8    q2, d6
+        vmovl.u8    q3, d7
+        vqadd.s16   q2, q0
+        vqadd.s16   q3, q1
+1:                                           @ extra-indented ops belong to the next iteration (software pipelining)
+          vld1.16     {q0, q1}, [r1]!
+        subs        r3, #2
+        vqmovun.s16 d4, q2
+        vqmovun.s16 d5, q3
+          vld1.8      {d6}, [r4], r2
+          vld1.8      {d7}, [lr], r2
+        vst1.8      {d4}, [r0], r2
+        vst1.8      {d5}, [ip], r2
+          vmovl.u8    q2, d6
+            pldw        [r4]                 @ preload next rows for write
+          vmovl.u8    q3, d7
+          vqadd.s16   q2, q0
+          vqadd.s16   q3, q1
+        bne         1b
+
+          vqmovun.s16 d4, q2                 @ drain the pipeline: final row pair
+          vqmovun.s16 d5, q3
+          vst1.8      {d4}, [r0]
+          vst1.8      {d5}, [ip]
+          pop         {r4, pc}
+endfunc
7381+
+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1  @ dst(u8)[16x16] = clip(dst + res(s16))
+        vld1.16     {q0, q1}, [r1]!          @ residual row 0
+        add         ip, r0, r2               @ ip loads one row ahead of r0's store
+        vld1.8      {q3}, [r0]
+        mov         r3, #16-1                @ 1 row done outside the loop
+        vmovl.u8    q2, d6
+        vmovl.u8    q3, d7
+        vqadd.s16   q2, q0
+        vqadd.s16   q3, q1
+1:                                           @ extra-indented ops belong to the next iteration
+          vld1.16     {q0, q1}, [r1]!
+        subs        r3, #1
+        vqmovun.s16 d4, q2
+        vqmovun.s16 d5, q3
+          vld1.8      {q3}, [ip], r2
+        vst1.8      {q2}, [r0], r2
+          vmovl.u8    q2, d6
+            pldw        [ip]                 @ preload next row for write
+          vmovl.u8    q3, d7
+          vqadd.s16   q2, q0
+          vqadd.s16   q3, q1
+        bne         1b
+
+          vqmovun.s16 d4, q2                 @ drain: final row
+          vqmovun.s16 d5, q3
+          vst1.8      {q2}, [r0]
+          bx          lr
+endfunc
7410+
+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1  @ dst(u8)[32x32] = clip(dst + res(s16))
+        vldm        r1!, {q0-q3}             @ residual row 0 (32 x s16)
+        vld1.8      {q8, q9}, [r0]
+        add         ip, r0, r2               @ ip loads one row ahead of r0's store
+        vmovl.u8    q10, d16                 @ widen 32 pixels to 16 bit
+        mov         r3, #32-1                @ 1 row done outside the loop
+        vmovl.u8    q11, d17
+        vmovl.u8    q12, d18
+        vmovl.u8    q13, d19
+        vqadd.s16   q10, q0
+        vqadd.s16   q11, q1
+        vqadd.s16   q12, q2
+        vqadd.s16   q13, q3
+1:                                           @ extra-indented ops belong to the next iteration
+          vldm        r1!, {q0-q3}
+        vqmovun.s16 d20, q10
+        vqmovun.s16 d21, q11
+        vqmovun.s16 d22, q12
+        vqmovun.s16 d23, q13
+          vld1.8      {q8, q9}, [ip], r2
+        subs        r3, #1
+        vst1.8      {q10, q11}, [r0], r2
+          vmovl.u8    q10, d16
+            pldw        [ip]                 @ preload next row for write
+          vmovl.u8    q11, d17
+          vmovl.u8    q12, d18
+          vmovl.u8    q13, d19
+          vqadd.s16   q10, q0
+          vqadd.s16   q11, q1
+          vqadd.s16   q12, q2
+          vqadd.s16   q13, q3
+        bne     1b
+
+          vqmovun.s16 d20, q10               @ drain: final row
+          vqmovun.s16 d21, q11
+          vqmovun.s16 d22, q12
+          vqmovun.s16 d23, q13
+          vst1.8      {q10, q11}, [r0]
+          bx          lr
+endfunc
7451+
7452+
+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8() - add a single dc value to a 4x4 block
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]  NB: stride is r1 in the dc variants
+@   int dc)                     // [r2]
+
+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1
+        add         ip, r0, r1               @ ip -> 2nd row
+        vdup.16     q15, r2                  @ broadcast dc
+        lsl         r1, #1                   @ step 2 rows
+        vld1.32     d4[0], [r0], r1
+        rsb         r3, r1, #0               @ negative stride to rewind
+        vld1.32     d4[1], [ip], r1
+        vld1.32     d5[0], [r0], r3
+        vld1.32     d5[1], [ip], r3
+        vaddw.u8    q0, q15, d4              @ widening add is safe: dc cannot overflow (see file notes)
+        vaddw.u8    q1, q15, d5
+        vqmovun.s16 d0, q0
+        vqmovun.s16 d1, q1
+        vst1.32     d0[0], [r0], r1
+        vst1.32     d0[1], [ip], r1
+        vst1.32     d1[0], [r0]
+        vst1.32     d1[1], [ip]
+        bx          lr
+endfunc
7477+
7478+@ ============================================================================
7479+@ DC Y or C add
7480+
+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8() - dc add, 4x4 chroma (CbCr pairs)
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]
+@   int dc)                     // [r2]  packed U|V dc pair (32 bit)
+
+function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1  @ trampoline: falls into 8x8_dc body below
+        mov         r3,  #4-2                @ 4x4 chroma = 4 rows of 8 bytes
+        vdup.32     q15, r2                  @ replicate U/V dc pair across lanes
+        b           1f                       @ enter shared body at first "1:" below
+endfunc
+
+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8() - dc add, 8x8 luma
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]
+@   int dc)                     // [r2]
+
+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1
+        vdup.16     q15, r2                  @ same dc in every lane
+        mov         r3, #8-2                 @ 2 rows done outside the loop
+1:      vld1.8      d16, [r0]                @ shared entry (also reached from 4x4_dc_c)
+        add         ip, r0, r1
+        push        {r4, lr}
+        vld1.8      d17, [ip]
+        add         r4, r0, r1, lsl #1       @ r4/lr load 2 rows ahead
+        vaddw.u8    q0, q15, d16
+        lsl         r1, #1
+        vaddw.u8    q1, q15, d17
+        add         lr, ip, r1
+1:                                           @ extra-indented ops belong to the next iteration
+          vld1.8      {d16}, [r4], r1
+          vld1.8      {d17}, [lr], r1
+        subs        r3, #2
+        vqmovun.s16 d4, q0
+        vqmovun.s16 d5, q1
+          vaddw.u8    q0, q15, d16
+          vaddw.u8    q1, q15, d17
+        vst1.8      {d4}, [r0], r1
+        vst1.8      {d5}, [ip], r1
+        bne         1b
+
+          vqmovun.s16 d4, q0                 @ drain: final row pair
+          vqmovun.s16 d5, q1
+          vst1.8      {d4}, [r0]
+          vst1.8      {d5}, [ip]
+          pop         {r4, pc}
+endfunc
7527+
7528+
+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8() - dc add, 8x8 chroma (CbCr pairs)
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]
+@   int dc)                     // [r2]  packed U|V dc pair (32 bit)
+
+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1  @ trampoline: falls into 16x16_dc body below
+        mov         r3,  #8-1                @ 8x8 chroma = 8 rows of 16 bytes
+        vdup.32     q15, r2                  @ replicate U/V dc pair across lanes
+        b           1f                       @ enter shared body at first "1:" below
+endfunc
+
+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8() - dc add, 16x16 luma
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]
+@   int dc)                     // [r2]
+
+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1
+        vdup.16     q15, r2                  @ same dc in every lane
+        mov         r3,  #16-1               @ 1 row done outside the loop
+1:      vld1.8      {q8}, [r0]               @ shared entry (also reached from 8x8_dc_c)
+        add         ip, r0, r1               @ ip loads one row ahead
+        vaddw.u8    q0, q15, d16
+        vaddw.u8    q1, q15, d17
+1:                                           @ extra-indented ops belong to the next iteration
+          vld1.8      {q8}, [ip], r1
+        subs        r3, #1
+        vqmovun.s16 d4, q0
+        vqmovun.s16 d5, q1
+          vaddw.u8    q0, q15, d16
+          vaddw.u8    q1, q15, d17
+        vst1.8      {q2}, [r0], r1
+        bne         1b
+
+          vqmovun.s16 d4, q0                 @ drain: final row
+          vqmovun.s16 d5, q1
+          vst1.8      {q2}, [r0]
+          bx          lr
+endfunc
7567+
7568+
+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8() - dc add, 16x16 chroma (CbCr pairs)
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]
+@   int dc)                     // [r2]  packed U|V dc pair (32 bit)
+
+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1  @ trampoline: falls into 32x32_dc body below
+        mov         r3,  #16-1               @ 16x16 chroma = 16 rows of 32 bytes
+        vdup.32     q15, r2                  @ replicate U/V dc pair across lanes
+        b           1f                       @ enter shared body at first "1:" below
+endfunc
+
+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8() - dc add, 32x32 luma
+@   uint8_t * dst,              // [r0]
+@   unsigned int stride,        // [r1]
+@   int dc)                     // [r2]
+
+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1
+        vdup.16     q15, r2                  @ same dc in every lane
+        mov         r3, #32-1                @ 1 row done outside the loop
+1:      vld1.8      {q8, q9}, [r0]           @ shared entry (also reached from 16x16_dc_c)
+        add         ip, r0, r1               @ ip loads one row ahead
+        vaddw.u8    q0, q15, d16
+        vaddw.u8    q1, q15, d17
+        vaddw.u8    q2, q15, d18
+        vaddw.u8    q3, q15, d19
+1:                                           @ extra-indented ops belong to the next iteration
+        vqmovun.s16 d20, q0
+        vqmovun.s16 d21, q1
+        vqmovun.s16 d22, q2
+        vqmovun.s16 d23, q3
+          vld1.8      {q8, q9}, [ip], r1
+        subs        r3, #1
+          vaddw.u8    q0, q15, d16
+          vaddw.u8    q1, q15, d17
+          vaddw.u8    q2, q15, d18
+          vaddw.u8    q3, q15, d19
+        vst1.8      {q10, q11}, [r0], r1
+        bne     1b
+
+          vqmovun.s16 d20, q0                @ drain: final row
+          vqmovun.s16 d21, q1
+          vqmovun.s16 d22, q2
+          vqmovun.s16 d23, q3
+          vst1.8      {q10, q11}, [r0]
+          bx          lr
+endfunc
7615+
7616+@ ============================================================================
7617+@ U add
7618+
+@ add_residual4x4_u() - add U residual, dc offset to V (interleaved CbCr)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  4x4 U residual
+@   ptrdiff_t stride,     [r2]
+@   int dc_v)             [r3]  dc added to the V samples
+
+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1
+        add         ip, r0, r2               @ ip -> 2nd row
+        vld1.16     {q0, q1}, [r1]           @ U residual
+        lsl         r2, #1
+        vld1.8      {d16}, [r0 :64], r2
+        vld1.8      {d17}, [ip :64], r2
+        vld1.8      {d18}, [r0 :64]
+        sub         r0, r2                   @ rewind for the stores
+        vld1.8      {d19}, [ip :64]
+        sub         ip, r2
+        vdup.16     q2, r3                   @ V dc
+        vdup.16     q3, r3
+        vmovl.u8    q10, d16                 @ widen interleaved UVUV rows
+        vmovl.u8    q11, d17
+        vmovl.u8    q12, d18
+        vmovl.u8    q13, d19
+        vzip.16     q0, q2                   @ interleave residual with dc -> UVUV order
+        vzip.16     q1, q3
+        vqadd.s16   q0,  q10
+        vqadd.s16   q2,  q11
+        vqadd.s16   q1,  q12
+        vqadd.s16   q3,  q13
+        vqmovun.s16 d0,  q0
+        vqmovun.s16 d1,  q2
+        vqmovun.s16 d2,  q1
+        vqmovun.s16 d3,  q3
+        vst1.8      {d0}, [r0 :64], r2
+        vst1.8      {d1}, [ip :64], r2
+        vst1.8      {d2}, [r0 :64]
+        vst1.8      {d3}, [ip :64]
+        bx          lr
+endfunc
7657+
+@ add_residual8x8_u() - add U residual, dc offset to V (interleaved CbCr)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  8x8 U residual
+@   ptrdiff_t stride,     [r2]
+@   int dc_v)             [r3]  dc added to the V samples
+
+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1
+        vdup.16     q15, r3                  @ broadcast V dc
+        add         ip, r0, r2               @ ip -> 2nd row
+        push        {r4, lr}
+        vld2.8      {d16, d17}, [r0 :128]    @ deinterleave: d16=U, d17=V
+        lsl         r2, #1
+        vld2.8      {d18, d19}, [ip :128]
+        mov         r3, #8-2                 @ 2 rows done outside the loop
+        vld1.16     {q0, q1}, [r1 :256]!
+        add         r4, r0, r2               @ r4/lr load 2 rows ahead
+        vmovl.u8    q10, d16
+        add         lr, ip, r2
+        vmovl.u8    q11, d18
+        vqadd.s16   q0,  q10                 @ U += residual
+        vaddw.u8    q2,  q15, d17           @ V += dc (widening add)
+        vqadd.s16   q1,  q11
+        vaddw.u8    q3,  q15, d19
+1:                                           @ extra-indented ops belong to the next iteration
+        vqmovun.s16 d20,  q0
+        vqmovun.s16 d21,  q2
+          vld2.8      {d16, d17}, [r4 :128], r2
+        subs        r3, #2
+        vqmovun.s16 d22,  q1
+        vqmovun.s16 d23,  q3
+        vst2.8      {d20, d21}, [r0 :128], r2
+          vld2.8      {d18, d19}, [lr :128], r2
+        vst2.8      {d22, d23}, [ip :128], r2
+          vld1.16     {q0, q1}, [r1 :256]!
+          vmovl.u8    q10, d16
+          vmovl.u8    q11, d18
+          vqadd.s16   q0,  q10
+          vaddw.u8    q2,  q15, d17
+          vqadd.s16   q1,  q11
+          vaddw.u8    q3,  q15, d19
+        bne         1b
+
+          vqmovun.s16 d20,  q0               @ drain: final row pair
+          vqmovun.s16 d21,  q2
+          vqmovun.s16 d22,  q1
+          vqmovun.s16 d23,  q3
+          vst2.8      {d20, d21}, [r0 :128]
+          vst2.8      {d22, d23}, [ip :128]
+          pop         {r4, pc}
+endfunc
7708+
+@ add_residual16x16_u() - add U residual, dc offset to V (interleaved CbCr)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  16x16 U residual
+@   ptrdiff_t stride,     [r2]
+@   int dc_v)             [r3]  dc added to the V samples
+
+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1
+        vdup.16     q15, r3                  @ broadcast V dc
+        add         ip, r0, r2               @ ip loads one row ahead
+        vld2.8      {q8, q9}, [r0 :256]      @ q8=U, q9=V
+        mov         r3, #16-1                @ 1 row done outside the loop
+        vld1.16     {q0, q1}, [r1 :256]!
+        vmovl.u8    q11, d16
+        vmovl.u8    q12, d17
+        vqadd.s16   q0,  q11                 @ U += residual
+        vaddw.u8    q11, q15, d18           @ V += dc
+        vqadd.s16   q1,  q12
+        vaddw.u8    q12, q15, d19
+1:                                           @ extra-indented ops belong to the next iteration
+          vld2.8      {q8, q9}, [ip :256], r2
+        subs        r3, #1
+        vqmovun.s16 d20, q0
+        vqmovun.s16 d22, q11
+        vqmovun.s16 d21, q1
+        vqmovun.s16 d23, q12
+          vld1.16     {q0, q1}, [r1 :256]!
+        vst2.8      {q10, q11}, [r0 :256], r2
+          vmovl.u8    q11, d16
+            pldw        [ip]                 @ preload next row for write
+          vmovl.u8    q12, d17
+          vqadd.s16   q0,  q11
+          vaddw.u8    q11, q15, d18
+          vqadd.s16   q1,  q12
+          vaddw.u8    q12, q15, d19
+        bne         1b
+
+          vqmovun.s16 d20, q0                @ drain: final row
+          vqmovun.s16 d22, q11
+          vqmovun.s16 d21, q1
+          vqmovun.s16 d23, q12
+          vst2.8      {q10, q11}, [r0 :256]
+          bx          lr
+endfunc
7752+
7753+@ ============================================================================
7754+@ V add
7755+
+@ add_residual4x4_v() - add V residual, dc offset to U (interleaved CbCr)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  4x4 V residual
+@   ptrdiff_t stride,     [r2]  (int dc_u is also passed in r3)
+
+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1
+        add         ip, r0, r2               @ ip -> 2nd row
+        vld1.16     {q2, q3}, [r1]           @ V residual
+        lsl         r2, #1
+        vld1.8      {d16}, [r0 :64], r2
+        vld1.8      {d17}, [ip :64], r2
+        vld1.8      {d18}, [r0 :64]
+        sub         r0, r2                   @ rewind for the stores
+        vld1.8      {d19}, [ip :64]
+        sub         ip, r2
+        vdup.16     q0, r3                   @ U dc
+        vdup.16     q1, r3
+        vmovl.u8    q10, d16
+        vmovl.u8    q11, d17
+        vmovl.u8    q12, d18
+        vmovl.u8    q13, d19
+        vzip.16     q0, q2                   @ interleave dc with residual -> UVUV order
+        vzip.16     q1, q3
+        vqadd.s16   q0,  q10
+        vqadd.s16   q2,  q11
+        vqadd.s16   q1,  q12
+        vqadd.s16   q3,  q13
+        vqmovun.s16 d0,  q0
+        vqmovun.s16 d1,  q2
+        vqmovun.s16 d2,  q1
+        vqmovun.s16 d3,  q3
+        vst1.8      {d0}, [r0 :64], r2
+        vst1.8      {d1}, [ip :64], r2
+        vst1.8      {d2}, [r0 :64]
+        vst1.8      {d3}, [ip :64]
+        bx          lr
+endfunc
7793+
+@ add_residual8x8_v() - add V residual, dc offset to U (interleaved CbCr)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  8x8 V residual
+@   ptrdiff_t stride,     [r2]  (int dc_u is also passed in r3)
+
+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1
+        vdup.16     q15, r3                  @ broadcast U dc
+        add         ip, r0, r2               @ ip -> 2nd row
+        push        {r4, lr}
+        vld2.8      {d16, d17}, [r0 :128]    @ d16=U, d17=V
+        lsl         r2, #1
+        vld2.8      {d18, d19}, [ip :128]
+        mov         r3, #8-2                 @ 2 rows done outside the loop
+        vld1.16     {q0, q1}, [r1 :256]!
+        add         r4, r0, r2               @ r4/lr load 2 rows ahead
+        vmovl.u8    q10, d17
+        add         lr, ip, r2
+        vmovl.u8    q11, d19
+        vqadd.s16   q0,  q10                 @ V += residual
+        vaddw.u8    q2,  q15, d16           @ U += dc
+        vqadd.s16   q1,  q11
+        vaddw.u8    q3,  q15, d18
+1:                                           @ extra-indented ops belong to the next iteration
+        vqmovun.s16 d20,  q2
+        vqmovun.s16 d21,  q0
+          vld2.8      {d16, d17}, [r4 :128], r2
+        subs        r3, #2
+        vqmovun.s16 d22,  q3
+        vqmovun.s16 d23,  q1
+        vst2.8      {d20, d21}, [r0 :128], r2
+          vld2.8      {d18, d19}, [lr :128], r2
+        vst2.8      {d22, d23}, [ip :128], r2
+          vld1.16     {q0, q1}, [r1 :256]!
+          vmovl.u8    q10, d17
+          vmovl.u8    q11, d19
+          vqadd.s16   q0,  q10
+          vaddw.u8    q2,  q15, d16
+          vqadd.s16   q1,  q11
+          vaddw.u8    q3,  q15, d18
+        bne         1b
+
+          vqmovun.s16 d20,  q2               @ drain: final row pair
+          vqmovun.s16 d21,  q0
+          vqmovun.s16 d22,  q3
+          vqmovun.s16 d23,  q1
+          vst2.8      {d20, d21}, [r0 :128]
+          vst2.8      {d22, d23}, [ip :128]
+          pop         {r4, pc}
+endfunc
7843+
+@ add_residual16x16_v() - add V residual, dc offset to U (interleaved CbCr)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  16x16 V residual
+@   ptrdiff_t stride,     [r2]  (int dc_u is also passed in r3)
+
+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1
+        vdup.16     q15, r3                  @ broadcast U dc
+        add         ip, r0, r2               @ ip loads one row ahead
+        vld2.8      {q8, q9}, [r0 :256]      @ q8=U, q9=V
+        mov         r3, #16-1                @ 1 row done outside the loop
+        vld1.16     {q0, q1}, [r1 :256]!
+        vmovl.u8    q11, d18
+        vmovl.u8    q12, d19
+        vqadd.s16   q0,  q11                 @ V += residual
+        vaddw.u8    q11, q15, d16           @ U += dc
+        vqadd.s16   q1,  q12
+        vaddw.u8    q12, q15, d17
+1:                                           @ extra-indented ops belong to the next iteration
+          vld2.8      {q8, q9}, [ip :256], r2
+        subs        r3, #1
+        vqmovun.s16 d20, q11
+        vqmovun.s16 d22, q0
+        vqmovun.s16 d21, q12
+        vqmovun.s16 d23, q1
+          vld1.16     {q0, q1}, [r1 :256]!
+        vst2.8      {q10, q11}, [r0 :256], r2
+          vmovl.u8    q11, d18
+            pldw        [ip]                 @ preload next row for write
+          vmovl.u8    q12, d19
+          vqadd.s16   q0,  q11
+          vaddw.u8    q11, q15, d16
+          vqadd.s16   q1,  q12
+          vaddw.u8    q12, q15, d17
+        bne         1b
+
+          vqmovun.s16 d20, q11               @ drain: final row
+          vqmovun.s16 d22, q0
+          vqmovun.s16 d21, q12
+          vqmovun.s16 d23, q1
+          vst2.8      {q10, q11}, [r0 :256]
+          bx          lr
+endfunc
7886+
7887+@ ============================================================================
7888+@ U & V add
7889+
+@ add_residual4x4_c() - add both U and V residuals (interleaved CbCr dst)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  4x4 U residual followed by 4x4 V residual
+@   ptrdiff_t stride)     [r2]
+
+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1
+        add         ip, r0, r2               @ ip -> 2nd row
+        vld1.16     {q0, q1}, [r1]!       @ all of U
+        lsl         r2, #1
+        vld1.8      {d16}, [r0 :64], r2
+        rsb         r3, r2, #0               @ negative stride to rewind
+        vld1.8      {d17}, [ip :64], r2
+        vld1.16     {q2, q3}, [r1]        @ all of V
+        vld1.8      {d18}, [r0 :64], r3
+        vld1.8      {d19}, [ip :64], r3
+        vmovl.u8    q10, d16
+        vmovl.u8    q11, d17
+        vmovl.u8    q12, d18
+        vmovl.u8    q13, d19
+        vzip.16     q0, q2                   @ interleave U/V residuals -> UVUV order
+        vzip.16     q1, q3
+        vqadd.s16   q0,  q10
+        vqadd.s16   q2,  q11
+        vqadd.s16   q1,  q12
+        vqadd.s16   q3,  q13
+        vqmovun.s16 d0,  q0
+        vqmovun.s16 d1,  q2
+        vqmovun.s16 d2,  q1
+        vqmovun.s16 d3,  q3
+        vst1.8      {d0}, [r0 :64], r2
+        vst1.8      {d1}, [ip :64], r2
+        vst1.8      {d2}, [r0 :64]
+        vst1.8      {d3}, [ip :64]
+        bx          lr
+endfunc
7925+
+@ add_residual8x8_c() - add both U and V residuals (interleaved CbCr dst)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  8x8 U residual followed by 8x8 V residual
+@   ptrdiff_t stride)     [r2]
+
+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1
+        vld2.8      {d16, d17}, [r0 :128]    @ d16=U, d17=V
+        add         r3, r1, #(8*8*2)  @ Offset to V
+        vld1.16     {q0}, [r1 :128]!         @ U residual row
+        add         ip, r0, r2               @ ip loads one row ahead
+        vld1.16     {q1}, [r3 :128]!         @ V residual row
+        vmovl.u8    q10, d16
+        push        {lr}
+        vmovl.u8    q8,  d17
+        mov         lr, #8-1                 @ 1 row done outside the loop
+        vqadd.s16   q10, q0
+        vqadd.s16   q1,  q8
+1:                                           @ extra-indented ops belong to the next iteration
+          vld2.8      {d16, d17}, [ip :128], r2
+        subs        lr, #1
+          vld1.16     {q0}, [r1 :128]!
+        vqmovun.s16 d20, q10
+        vqmovun.s16 d21, q1
+          vld1.16     {q1}, [r3 :128]!
+        vst2.8      {d20, d21}, [r0 :128], r2
+          vmovl.u8    q10, d16
+            pldw        [ip]                 @ preload next row for write
+          vmovl.u8    q8,  d17
+          vqadd.s16   q10, q0
+          vqadd.s16   q1,  q8
+        bne         1b
+
+          vqmovun.s16 d20, q10               @ drain: final row
+          vqmovun.s16 d21, q1
+          vst2.8      {d20, d21}, [r0 :128]
+          pop         {pc}
+endfunc
7963+
+@ add_residual16x16_c() - add both U and V residuals (interleaved CbCr dst)
+@   uint8_t *_dst,        [r0]
+@   const int16_t *res,   [r1]  16x16 U residual followed by 16x16 V residual
+@   ptrdiff_t stride)     [r2]
+
+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1
+        vld2.8      {q8, q9}, [r0 :256]      @ q8=U, q9=V
+        add         r3, r1, #(16*16*2)  @ Offset to V
+        vld1.16     {q0, q1}, [r1 :256]!     @ U residual row
+        add         ip, r0, r2               @ ip loads one row ahead
+        vld1.16     {q2, q3}, [r3 :256]!     @ V residual row
+        vmovl.u8    q10, d16
+        push        {lr}
+        vmovl.u8    q8,  d17
+        mov         lr, #16-1                @ 1 row done outside the loop
+        vmovl.u8    q11, d18
+        vmovl.u8    q9,  d19
+        vqadd.s16   q0,  q10
+        vqadd.s16   q1,  q8
+        vqadd.s16   q2,  q11
+        vqadd.s16   q3,  q9
+1:                                           @ extra-indented ops belong to the next iteration
+          vld2.8      {q8, q9}, [ip :256], r2
+        subs        lr, #1
+        vqmovun.s16 d20, q0
+        vqmovun.s16 d22, q2
+        vqmovun.s16 d21, q1
+        vqmovun.s16 d23, q3
+          vld1.16     {q0, q1}, [r1 :256]!
+        vst2.8      {d20-d23}, [r0 :256], r2
+          vld1.16     {q2, q3}, [r3 :256]!
+          vmovl.u8    q10, d16
+            pldw        [ip]                 @ preload next row for write
+          vmovl.u8    q8,  d17
+          vmovl.u8    q11, d18
+          vmovl.u8    q9,  d19
+          vqadd.s16   q0,  q10
+          vqadd.s16   q1,  q8
+          vqadd.s16   q2,  q11
+          vqadd.s16   q3,  q9
+        bne         1b
+
+          vqmovun.s16 d20, q0                @ drain: final row
+          vqmovun.s16 d22, q2
+          vqmovun.s16 d21, q1
+          vqmovun.s16 d23, q3
+          vst2.8      {d20-d23}, [r0 :256]
+          pop         {pc}
+endfunc
8013+
8014+@ 32x32 chroma never occurs so NIF
8015+
8016+@ ============================================================================
8017--- /dev/null
8018+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
8019@@ -0,0 +1,2245 @@
8020+/*
8021+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
8022+ *               2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
8023+ *
8024+ * This file is part of FFmpeg.
8025+ *
8026+ * FFmpeg is free software; you can redistribute it and/or
8027+ * modify it under the terms of the GNU Lesser General Public
8028+ * License as published by the Free Software Foundation; either
8029+ * version 2.1 of the License, or (at your option) any later version.
8030+ *
8031+ * FFmpeg is distributed in the hope that it will be useful,
8032+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
8033+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8034+ * Lesser General Public License for more details.
8035+ *
8036+ * You should have received a copy of the GNU Lesser General Public
8037+ * License along with FFmpeg; if not, write to the Free Software
8038+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
8039+ */
8040+
8041+#include "libavutil/arm/asm.S"
8042+#include "neon.S"
8043+
8044+.set EDGE_SRC_STRIDE, 160
8045+
8046+@ PIC jump tables are fractionally more expensive than absolute in our code
8047+.set jent_pic, CONFIG_PIC
8048+
8049+
8050+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4
8051+        vshr.u8   q12, q8, #3
8052+        \I1
8053+        vadd.i8   q8, \Q_K128
8054+        \I2
8055+        vshr.u8   q13, q9, #3
8056+        \I3
8057+        vadd.i8   q9, \Q_K128
8058+        \I4
8059+        vtbl.8    d24, \XLAT0, d24
8060+        vtbl.8    d25, \XLAT0, d25
8061+        vtbl.8    d26, \XLAT1, d26
8062+        vtbl.8    d27, \XLAT1, d27
8063+
8064+        vqadd.s8  q8, q12
8065+        vshr.u8   q12, q10, #3
8066+        vadd.i8   q10, \Q_K128
8067+        vqadd.s8  q9, q13
8068+        vshr.u8   q13, q11, #3
8069+        vadd.i8   q11, \Q_K128
8070+
8071+        vtbl.8    d24, \XLAT0, d24
8072+        vtbl.8    d25, \XLAT0, d25
8073+        vtbl.8    d26, \XLAT1, d26
8074+        vtbl.8    d27, \XLAT1, d27
8075+        vqadd.s8  q10, q12
8076+        vsub.i8   q8, \Q_K128
8077+        vqadd.s8  q11, q13
8078+        vsub.i8   q9, \Q_K128
8079+        vsub.i8   q10, \Q_K128
8080+        vsub.i8   q11, \Q_K128
8081+.endm
8082+
8083+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4
8084+        \L1
8085+        \L2
8086+        \L3
8087+        \L4
8088+        \L5
8089+        vadd.i8   q12, q8, \Q_K128
8090+        vshr.u8   q8, #3
8091+        vtbl.8    d16, \XLAT0, d16
8092+        vtbl.8    d17, \XLAT1, d17
8093+        vqadd.s8  q12, q8
8094+        bmi       2f
8095+1:        \L1
8096+          \L2
8097+          \L3
8098+          \L4
8099+          \L5
8100+        vsub.i8   q13, q12, \Q_K128
8101+          vadd.i8   q12, q8, \Q_K128
8102+          vshr.u8   q8, #3
8103+        \S1
8104+        \S2
8105+        \S3
8106+        \S4
8107+          vtbl.8    d16, \XLAT0, d16
8108+          vtbl.8    d17, \XLAT1, d17
8109+          vqadd.s8  q12, q8
8110+          bpl       1b
8111+2:        vsub.i8   q13, q12, \Q_K128
8112+          \S1
8113+          \S2
8114+          \S3
8115+          \S4
8116+.endm
8117+
8118+
8119+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
8120+        vmax.s16  \Q0, \Q_MIN
8121+        vmax.s16  \Q1, \Q_MIN
8122+        vmax.s16  \Q2, \Q_MIN
8123+        vmax.s16  \Q3, \Q_MIN
8124+        vmin.s16  \Q0, \Q_MAX
8125+        vmin.s16  \Q1, \Q_MAX
8126+        vmin.s16  \Q2, \Q_MAX
8127+        vmin.s16  \Q3, \Q_MAX
8128+.endm
8129+
8130+@ Clobbers q12, q13
8131+.macro sao_band_64b_16  Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2
8132+        vshrn.i16 d24, \Q0, #(\bit_depth - 5)
8133+        vshrn.i16 d25, \Q1, #(\bit_depth - 5)
8134+        vshrn.i16 d26, \Q2, #(\bit_depth - 5)
8135+        \I1
8136+        vtbl.8    d24, \XLAT0, d24
8137+        vshrn.i16 d27, \Q3, #(\bit_depth - 5)
8138+        vtbl.8    d25, \XLAT1, d25
8139+        \I2
8140+        vtbl.8    d26, \XLAT0, d26
8141+        vtbl.8    d27, \XLAT1, d27
8142+        vaddw.s8  \Q0, d24
8143+        vaddw.s8  \Q1, d25
8144+        vaddw.s8  \Q2, d26
8145+        vaddw.s8  \Q3, d27
8146+        clip16_4   \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
8147+.endm
8148+
8149+@ Clobbers q10, q11, q12
8150+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4
8151+        \L1
8152+        \L2
8153+        \L3
8154+        \L4
8155+        \L5
8156+        vshrn.i16 d24, \Q0, #\bit_depth - 5
8157+        vshrn.i16 d25, \Q1, #\bit_depth - 5
8158+        vtbl.8    d24, \XLAT0, d24
8159+        vtbl.8    d25, \XLAT1, d25
8160+        vaddw.s8  q10, \Q0, d24
8161+        vaddw.s8  q11, \Q1, d25
8162+        bmi       2f
8163+1:        \L1
8164+          \L2
8165+          \L3
8166+          \L4
8167+          \L5
8168+        vmax.s16  q10, \Q_MIN
8169+        vmax.s16  q11, \Q_MIN
8170+          vshrn.i16 d24, \Q0, #\bit_depth - 5
8171+          vshrn.i16 d25, \Q1, #\bit_depth - 5
8172+        vmin.s16  q10, \Q_MAX
8173+        vmin.s16  q11, \Q_MAX
8174+        \S1
8175+        \S2
8176+        \S3
8177+        \S4
8178+          vtbl.8    d24, \XLAT0, d24
8179+          vtbl.8    d25, \XLAT1, d25
8180+          vaddw.s8  q10, \Q0, d24
8181+          vaddw.s8  q11, \Q1, d25
8182+          bpl       1b
8183+2:        vmax.s16  q10, \Q_MIN
8184+          vmax.s16  q11, \Q_MIN
8185+          vmin.s16  q10, \Q_MAX
8186+          vmin.s16  q11, \Q_MAX
8187+          \S1
8188+          \S2
8189+          \S3
8190+          \S4
8191+.endm
8192+
8193+
8194+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
8195+@ so we are quite safe stuffing it into a byte array
8196+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
8197+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
8198+@ precision
8199+
8200+@ This, somewhat nasty, bit of code builds the {d0-d3} translation
8201+@ array via the stack
8202+@ Given that sao_left_class > 28 can cause wrap we can't just poke
8203+@ all 4 bytes in at once
8204+@
8205+@ It also loads other common regs
8206+
8207+@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately
8208+function band_load_y
8209+        ldr       ip, [sp, #16]         @ &sao_offset_val[0]
8210+        ldr       r4, [sp, #20]         @ sao_left_class
8211+        vmov.i64  d4, #0
8212+        vmov.i64  q0, #0
8213+        pld       [r1]
8214+        vld2.8    {q8}, [ip]
8215+        sub       ip, sp, #8*5
8216+        vmov.i64  q1, #0
8217+        add       r4, ip, r4
8218+        vpush     {d0-d4}               @ Put zero array on stack
8219+        vshr.u64  d16, d16, #8          @ 1st interesting val is [1]
8220+        ldr       ip, [ip, #8*5 + 28]   @ height
8221+        vst1.32   {d16[0]}, [r4]
8222+        add       r4, r1, r3
8223+        vpop      {d0-d4}               @ Pop modified array
8224+        sub       ip, ip, #1
8225+        vorr      d0, d0, d4
8226+        bx        lr
8227+endfunc
8228+
8229+@ Beware that offset reads here overread by 6 bytes so source must be sized appropriately
8230+function band_load_c
8231+        ldr       ip, [sp, #16]         @ &sao_offset_val1[0]
8232+        ldr       r4, [sp, #20]         @ sao_left_class1
8233+        vmov.i64  d24, #0
8234+        vmov.i64  q10, #0
8235+        pld       [r1]
8236+        vld2.8    {q8}, [ip]
8237+        sub       ip, sp, #8*5
8238+        vmov.i64  q11, #0
8239+        add       r4, ip, r4
8240+        ldr       ip, [sp, #24]         @ &sao_offset_val2[0]
8241+        vpush     {d20-d24}             @ Put zero array on stack
8242+        vld2.8    {q9}, [ip]
8243+        vshr.u64  d16, d16, #8          @ 1st interesting val is [1]
8244+        ldr       ip, [sp, #8*5 + 28]   @ sao_left_class2
8245+        vst1.32   {d16[0]}, [r4]
8246+        add       ip, sp, ip
8247+        vshr.u64  d18, d18, #8          @ 1st interesting val is [1]
8248+        vldmia    sp, {d0-d3}           @ Load modified array
8249+        vldr      d16, [sp, #8*4]
8250+        add       r4, r1, r3
8251+        vstmia    sp, {d20-d24}         @ Put zero array on stack (again)
8252+        vst1.32   {d18[0]}, [ip]
8253+        vorr      d0, d0, d16
8254+        vldmia    sp, {d4-d7}           @ Load modified array
8255+        vldr      d18, [sp, #8*4]
8256+        ldr       ip, [sp, #8*5 + 36]   @ height
8257+        add       sp, sp, #8*5
8258+        vorr      d4, d4, d18
8259+        sub       ip, ip, #1
8260+        bx        lr
8261+endfunc
8262+
8263+
8264+@ ff_hevc_rpi_sao_band_64_neon_8 (
8265+@   uint8_t *_dst,              [r0]
8266+@   uint8_t *_src,              [r1]
8267+@   ptrdiff_t stride_dst,       [r2]
8268+@   ptrdiff_t stride_src,       [r3]
8269+@   int16_t *sao_offset_val,    [sp, #0]
8270+@   int sao_left_class,         [sp, #4]
8271+@   int width,                  [sp, #8]
8272+@   int height)                 [sp, #12]
8273+
8274+function ff_hevc_rpi_sao_band_64_neon_8, export=1
8275+        push      {r4-r6, lr}
8276+        vmov.u8   q15, #128
8277+        bl        band_load_y
8278+
8279+1:      vldmia    r1, {q8-q11}
8280+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
8281+            "pld       [r4]",                 \
8282+            "subs      ip, #1",               \
8283+            "it ne; addne r4, r3",            \
8284+            "add       r1, r3"
8285+        vstmia    r0, {q8-q11}
8286+        add       r0, r2
8287+        bpl       1b
8288+
8289+        pop       {r4-r6, pc}
8290+endfunc
8291+
8292+@ ff_hevc_rpi_sao_band_32_neon_8 (
8293+@   uint8_t *_dst,              [r0]
8294+@   uint8_t *_src,              [r1]
8295+@   ptrdiff_t stride_dst,       [r2]
8296+@   ptrdiff_t stride_src,       [r3]
8297+@   int16_t *sao_offset_val,    [sp, #0]
8298+@   int sao_left_class,         [sp, #4]
8299+@   int width,                  [sp, #8]
8300+@   int height)                 [sp, #12]
8301+
8302+function ff_hevc_rpi_sao_band_32_neon_8, export=1
8303+        push      {r4-r6, lr}
8304+        add       r5, r0, r2
8305+        add       r6, r1, r3
8306+        lsl       r2, #1
8307+        lsl       r3, #1
8308+        vmov.u8   q15, #128
8309+        bl        band_load_y
8310+
8311+1:      vld1.8    { q8, q9 }, [r1, :128], r3
8312+        subs      ip, #2
8313+        vld1.8    {q10, q11}, [r6, :128], r3
8314+
8315+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
8316+
8317+        vst1.8    { q8, q9 }, [r0, :128], r2
8318+        vst1.8    {q10, q11}, [r5, :128], r2
8319+        bpl       1b
8320+
8321+        pop       {r4-r6, pc}
8322+endfunc
8323+
8324+@ ff_hevc_rpi_sao_band_16_neon_8 (
8325+@   uint8_t *_dst,              [r0]
8326+@   uint8_t *_src,              [r1]
8327+@   ptrdiff_t stride_dst,       [r2]
8328+@   ptrdiff_t stride_src,       [r3]
8329+@   int16_t *sao_offset_val,    [sp, #0]
8330+@   int sao_left_class,         [sp, #4]
8331+@   int width,                  [sp, #8]
8332+@   int height)                 [sp, #12]
8333+
8334+function ff_hevc_rpi_sao_band_16_neon_8, export=1
8335+        push      {r4-r6, lr}
8336+        add       r5, r0, r2
8337+        add       r6, r1, r3
8338+        lsl       r2, #1
8339+        lsl       r3, #1
8340+        vmov.u8   q15, #128
8341+        bl        band_load_y
8342+
8343+1:      vld1.8    { q8}, [r1, :128], r3
8344+        subs      ip, #4
8345+        vld1.8    { q9}, [r6, :128], r3
8346+        vld1.8    {q10}, [r1, :128], r3
8347+        vld1.8    {q11}, [r6, :128], r3
8348+
8349+        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
8350+
8351+        vst1.8    { q8}, [r0, :128], r2
8352+        vst1.8    { q9}, [r5, :128], r2
8353+        vst1.8    {q10}, [r0, :128], r2
8354+        vst1.8    {q11}, [r5, :128], r2
8355+        bpl       1b
8356+
8357+        pop       {r4-r6, pc}
8358+endfunc
8359+
8360+@ ff_hevc_rpi_sao_band_8_neon_8 (
8361+@   uint8_t *_dst,              [r0]
8362+@   uint8_t *_src,              [r1]
8363+@   ptrdiff_t stride_dst,       [r2]
8364+@   ptrdiff_t stride_src,       [r3]
8365+@   int16_t *sao_offset_val,    [sp, #0]
8366+@   int sao_left_class,         [sp, #4]
8367+@   int width,                  [sp, #8]
8368+@   int height)                 [sp, #12]
8369+
8370+function ff_hevc_rpi_sao_band_8_neon_8, export=1
8371+        ldr       ip, [sp, #8]          @ width
8372+        push      {r4-r6, lr}
8373+        vmov.u8   q15, #128
8374+        cmp       ip, #8
8375+        bl        band_load_y
8376+        add       r5, r0, r2
8377+        add       r6, r1, r3
8378+        lsl       r2, #1
8379+        lsl       r3, #1
8380+        blt       4f
8381+
8382+        sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
8383+            "vld1.8    {d16}, [r1, :64], r3", \
8384+            "subs      ip, #2",               \
8385+            "vld1.8    {d17}, [r6, :64], r3", \
8386+            "",                               \
8387+            "",                               \
8388+            "vst1.8 {d26}, [r0, :64], r2",    \
8389+            "vst1.8 {d27}, [r5, :64], r2"
8390+        pop       {r4-r6, pc}
8391+4:
8392+        sao_band_16b_8 {d0-d3}, {d0-d3}, q15,    \
8393+            "vld1.32   {d16[0]}, [r1, :32], r3", \
8394+            "subs      ip, #4",                  \
8395+            "vld1.32   {d16[1]}, [r6, :32], r3", \
8396+            "vld1.32   {d17[0]}, [r1, :32], r3", \
8397+            "vld1.32   {d17[1]}, [r6, :32], r3", \
8398+            "vst1.32   {d26[0]}, [r0, :32], r2", \
8399+            "vst1.32   {d26[1]}, [r5, :32], r2", \
8400+            "vst1.32   {d27[0]}, [r0, :32], r2", \
8401+            "vst1.32   {d27[1]}, [r5, :32], r2"
8402+        pop       {r4-r6, pc}
8403+endfunc
8404+
8405+@ ff_hevc_rpi_sao_band_c_32_neon_8(
8406+@   uint8_t * dst          [r0]
8407+@   uint8_t * src          [r1]
8408+@   uint32_t dst_stride    [r2]
8409+@   uint32_t src_stride    [r3]
8410+@   const int16_t * table1 sp[0]
8411+@   uint32_t offset1       sp[4]
8412+@   const int16_t * table2 sp[8]
8413+@   uint32_t offset2       sp[12]
8414+@   int width              sp[16]
8415+@   int height             sp[20]
8416+
8417+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
8418+        push      {r4-r6, lr}
8419+        add       r5, r0, #32
8420+        add       r6, r1, #32
8421+        vmov.u8   q15, #128
8422+        bl        band_load_c
8423+
8424+1:      vld2.8    { q8, q9 }, [r1, :128], r3
8425+        subs      ip, #1
8426+        vld2.8    {q10, q11}, [r6, :128], r3
8427+
8428+        sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
8429+            "pld       [r4]",                 \
8430+            "it ne; addne r4, r3"
8431+
8432+        vst2.8    { q8, q9 }, [r0, :128], r2
8433+        vst2.8    {q10, q11}, [r5, :128], r2
8434+        bpl       1b
8435+
8436+        pop     {r4-r6, pc}
8437+endfunc
8438+
8439+@ ff_hevc_rpi_sao_band_c_16_neon_8(
8440+@   uint8_t * dst          [r0]
8441+@   uint8_t * src          [r1]
8442+@   uint32_t dst_stride    [r2]
8443+@   uint32_t src_stride    [r3]
8444+@   const int16_t * table1 sp[0]
8445+@   uint32_t offset1       sp[4]
8446+@   const int16_t * table2 sp[8]
8447+@   uint32_t offset2       sp[12]
8448+@   int width              sp[16]
8449+@   int height             sp[20]
8450+
8451+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
8452+        push      {r4-r6, lr}
8453+        add       r5, r0, r2
8454+        add       r6, r1, r3
8455+        lsl       r2, #1
8456+        lsl       r3, #1
8457+        vmov.u8   q15, #128
8458+        bl        band_load_c
8459+
8460+1:      vld2.8    { q8, q9 }, [r1, :128], r3
8461+        subs      ip, #2
8462+        vld2.8    {q10, q11}, [r6, :128], r3
8463+
8464+        sao_band_64b_8 {d0-d3}, {d4-d7}, q15
8465+
8466+        vst2.8    { q8, q9 }, [r0, :128], r2
8467+        vst2.8    {q10, q11}, [r5, :128], r2
8468+        bpl       1b
8469+
8470+        pop     {r4-r6, pc}
8471+endfunc
8472+
8473+@ ff_hevc_rpi_sao_band_c_8_neon_8(
8474+@   uint8_t * dst          [r0]
8475+@   uint8_t * src          [r1]
8476+@   uint32_t dst_stride    [r2]
8477+@   uint32_t src_stride    [r3]
8478+@   const int16_t * table1 sp[0]
8479+@   uint32_t offset1       sp[4]
8480+@   const int16_t * table2 sp[8]
8481+@   uint32_t offset2       sp[12]
8482+@   int width              sp[16]
8483+@   int height             sp[20]
8484+
8485+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
8486+        ldr       ip, [sp, #16]         @ width
8487+        push      {r4-r6, lr}
8488+        vmov.u8   q15, #128
8489+        cmp       ip, #8
8490+        bl        band_load_c
8491+        blt       4f
8492+
8493+        sao_band_16b_8 {d0-d3}, {d4-d7}, q15,      \
8494+            "vld2.8    {d16-d17}, [r1, :128], r3", \
8495+            "subs      ip, #1",                    \
8496+            "",                                    \
8497+            "",                                    \
8498+            "",                                    \
8499+            "vst2.8    {d26-d27}, [r0, :128], r2"
8500+        pop       {r4-r6, pc}
8501+4:
8502+        add       r5, r0, r2
8503+        add       r6, r1, r3
8504+        lsl       r2, #1
8505+        lsl       r3, #1
8506+        sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
8507+            "vld1.8    {d16}, [r1, :64], r3", \
8508+            "subs      ip, #2",               \
8509+            "vld1.8    {d17}, [r6, :64], r3", \
8510+            "vuzp.8    d16, d17",             \
8511+            "",                               \
8512+            "vzip.8    d26, d27",             \
8513+            "vst1.8    {d26}, [r0, :64], r2", \
8514+            "vst1.8    {d27}, [r5, :64], r2"
8515+        pop       {r4-r6, pc}
8516+endfunc
8517+
8518+
8519+@ ff_hevc_rpi_sao_band_64_neon_10 (
8520+@   uint8_t *_dst,              [r0]
8521+@   uint8_t *_src,              [r1]
8522+@   ptrdiff_t stride_dst,       [r2]
8523+@   ptrdiff_t stride_src,       [r3]
8524+@   int16_t *sao_offset_val,    [sp, #0]
8525+@   int sao_left_class,         [sp, #4]
8526+@   int width,                  [sp, #8]
8527+@   int height)                 [sp, #12]
8528+
8529+.macro band_64_16 bit_depth
8530+        push      {r4-r6, lr}
8531+        vmov.i64  q2, #0
8532+        vmov.i16  q3, #(1 << \bit_depth) - 1
8533+        bl        band_load_y
8534+        vpush     {q4-q7}
8535+
8536+1:      vldm      r1, {q4-q11}
8537+        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
8538+            "subs      ip, #1",                                                  \
8539+            "add       r1, r3"
8540+        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
8541+        vstm      r0, {q4-q11}
8542+        add       r0, r2
8543+        bpl       1b
8544+
8545+        vpop      {q4-q7}
8546+        pop       {r4-r6, pc}
8547+.endm
8548+
8549+function ff_hevc_rpi_sao_band_64_neon_10, export=1
8550+        band_64_16 10
8551+endfunc
8552+
8553+@ ff_hevc_rpi_sao_band_32_neon_10 (
8554+@   uint8_t *_dst,              [r0]
8555+@   uint8_t *_src,              [r1]
8556+@   ptrdiff_t stride_dst,       [r2]
8557+@   ptrdiff_t stride_src,       [r3]
8558+@   int16_t *sao_offset_val,    [sp, #0]
8559+@   int sao_left_class,         [sp, #4]
8560+@   int width,                  [sp, #8]
8561+@   int height)                 [sp, #12]
8562+
8563+.macro band_32_16 bit_depth
8564+        push      {r4-r6, lr}
8565+        vmov.i64  q2, #0
8566+        vmov.i16  q3, #(1 << \bit_depth) - 1
8567+        bl        band_load_y
8568+
8569+1:      vldm      r1, {q8-q11}
8570+        sao_band_64b_16 q8,  q9,  q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
8571+            "subs      ip, #1",                                                   \
8572+            "add       r1, r3"
8573+        vstm      r0, {q8-q11}
8574+        add       r0, r2
8575+        bpl       1b
8576+
8577+        pop       {r4-r6, pc}
8578+.endm
8579+
8580+function ff_hevc_rpi_sao_band_32_neon_10, export=1
8581+        band_32_16 10
8582+endfunc
8583+
8584+@ ff_hevc_rpi_sao_band_16_neon_10 (
8585+@   uint8_t *_dst,              [r0]
8586+@   uint8_t *_src,              [r1]
8587+@   ptrdiff_t stride_dst,       [r2]
8588+@   ptrdiff_t stride_src,       [r3]
8589+@   int16_t *sao_offset_val,    [sp, #0]
8590+@   int sao_left_class,         [sp, #4]
8591+@   int width,                  [sp, #8]
8592+@   int height)                 [sp, #12]
8593+
8594+.macro band_16_16 bit_depth
8595+        push      {r4-r6, lr}
8596+        add       r5, r0, r2
8597+        add       r6, r1, r3
8598+        lsl       r2, #1
8599+        lsl       r3, #1
8600+        vmov.i64  q14, #0
8601+        vmov.i16  q15, #(1 << \bit_depth) - 1
8602+        bl        band_load_y
8603+
8604+1:      vld1.16   { q8, q9 }, [r1, :128], r3
8605+        subs      r12, #2
8606+        vld1.16   {q10, q11}, [r6, :128], r3
8607+        sao_band_64b_16 q8,  q9,  q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
8608+        vst1.16   { q8, q9 }, [r0, :128], r2
8609+        vst1.16   {q10, q11}, [r5, :128], r2
8610+        bpl       1b
8611+
8612+        pop       {r4-r6, pc}
8613+.endm
8614+
8615+function ff_hevc_rpi_sao_band_16_neon_10, export=1
8616+        band_16_16 10
8617+endfunc
8618+
8619+@ ff_hevc_rpi_sao_band_8_neon_10 (
8620+@   uint8_t *_dst,              [r0]
8621+@   uint8_t *_src,              [r1]
8622+@   ptrdiff_t stride_dst,       [r2]
8623+@   ptrdiff_t stride_src,       [r3]
8624+@   int16_t *sao_offset_val,    [sp, #0]
8625+@   int sao_left_class,         [sp, #4]
8626+@   int width,                  [sp, #8]
8627+@   int height)                 [sp, #12]
8628+
8629+.macro band_8_16 bit_depth
8630+        ldr       ip, [sp, #8]          @ width
8631+        push      {r4-r6, lr}
8632+        vmov.i64  q14, #0
8633+        cmp       ip, #8
8634+        vmov.i16  q15, #(1 << \bit_depth) - 1
8635+        bl        band_load_y
8636+        add       r5, r0, r2
8637+        add       r6, r1, r3
8638+        lsl       r2, #1
8639+        lsl       r3, #1
8640+        blt       4f
8641+
8642+        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
8643+            "vld1.16   {q8}, [r1, :128], r3",                           \
8644+            "subs      ip, #2",                                         \
8645+            "vld1.16   {q9}, [r6, :128], r3",                           \
8646+            "",                                                         \
8647+            "",                                                         \
8648+            "vst1.16   {q10}, [r0, :128], r2",                          \
8649+            "vst1.16   {q11}, [r5, :128], r2"
8650+        pop       {r4-r6, pc}
8651+4:
8652+        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
8653+            "vld1.16   {d16}, [r1, :64], r3",                           \
8654+            "subs      ip, #4",                                         \
8655+            "vld1.16   {d17}, [r6, :64], r3",                           \
8656+            "vld1.16   {d18}, [r1, :64], r3",                           \
8657+            "vld1.16   {d19}, [r6, :64], r3",                           \
8658+            "vst1.16   {d20}, [r0, :64], r2",                           \
8659+            "vst1.16   {d21}, [r5, :64], r2",                           \
8660+            "vst1.16   {d22}, [r0, :64], r2",                           \
8661+            "vst1.16   {d23}, [r5, :64], r2"
8662+        pop       {r4-r6, pc}
8663+.endm
8664+
8665+function ff_hevc_rpi_sao_band_8_neon_10, export=1
8666+        band_8_16 10
8667+endfunc
8668+
8669+
8670+@ ff_hevc_rpi_sao_band_c_32_neon_10(
8671+@   uint8_t * dst          [r0]
8672+@   uint8_t * src          [r1]
8673+@   uint32_t dst_stride    [r2]
8674+@   uint32_t src_stride    [r3]
8675+@   const int16_t * table1 sp[0]
8676+@   uint32_t offset1       sp[4]
8677+@   const int16_t * table2 sp[8]
8678+@   uint32_t offset2       sp[12]
8679+@   int width              sp[16]
8680+@   int height             sp[20]
8681+
8682+.macro band_c_32_16 bit_depth
8683+        push      {r4-r6, lr}
8684+        add       r5, r0, #32
8685+        add       r6, r1, #32
8686+        sub       r2, #64
8687+        sub       r3, #64
8688+        vmov.i64  q14, #0
8689+        vmov.i16  q15, #(1 << \bit_depth) - 1
8690+        bl        band_load_c
8691+        mov       lr, #64
8692+        vpush     {q4-q7}
8693+
8694+1:      vld2.16   { q4, q5 }, [r1, :128], lr
8695+        subs      ip, #1
8696+        vld2.16   { q6, q7 }, [r6, :128], lr
8697+        vld2.16   { q8, q9 }, [r1, :128], r3
8698+        vld2.16   {q10, q11}, [r6, :128], r3
8699+
8700+        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
8701+            "pld       [r4]",                                                      \
8702+            "it ne; addne r4, r3"
8703+        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
8704+
8705+        vst2.16   { q4, q5 }, [r0, :128], lr
8706+        vst2.16   { q6, q7 }, [r5, :128], lr
8707+        vst2.16   { q8, q9 }, [r0, :128], r2
8708+        vst2.16   {q10, q11}, [r5, :128], r2
8709+
8710+        bpl       1b
8711+
8712+        vpop      {q4-q7}
8713+        pop       {r4-r6, pc}
8714+.endm
8715+
8716+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
8717+        band_c_32_16 10
8718+endfunc
8719+
8720+
8721+@ ff_hevc_rpi_sao_band_c_16_neon_10(
8722+@   uint8_t * dst          [r0]
8723+@   uint8_t * src          [r1]
8724+@   uint32_t dst_stride    [r2]
8725+@   uint32_t src_stride    [r3]
8726+@   const int16_t * table1 sp[0]
8727+@   uint32_t offset1       sp[4]
8728+@   const int16_t * table2 sp[8]
8729+@   uint32_t offset2       sp[12]
8730+@   int width              sp[16]
8731+@   int height             sp[20]
8732+
8733+.macro band_c_16_16 bit_depth
8734+        push      {r4-r6, lr}
8735+        add       r5, r0, #32
8736+        add       r6, r1, #32
8737+        vmov.i64  q14, #0
8738+        vmov.i16  q15, #(1 << \bit_depth) - 1
8739+        bl        band_load_c
8740+
8741+1:      vld2.16   { q8, q9 }, [r1, :128], r3
8742+        subs      ip, #1
8743+        vld2.16   {q10, q11}, [r6, :128], r3
8744+
8745+        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
8746+        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
8747+
8748+        vst2.16   { q8, q9 }, [r0, :128], r2
8749+        vst2.16   {q10, q11}, [r5, :128], r2
8750+
8751+        bpl       1b
8752+        pop       {r4-r6, pc}
8753+.endm
8754+
8755+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
8756+        band_c_16_16 10
8757+endfunc
8758+
8759+
8760+@ ff_hevc_rpi_sao_band_c_8_neon_10(
8761+@   uint8_t * dst          [r0]
8762+@   uint8_t * src          [r1]
8763+@   uint32_t dst_stride    [r2]
8764+@   uint32_t src_stride    [r3]
8765+@   const int16_t * table1 sp[0]
8766+@   uint32_t offset1       sp[4]
8767+@   const int16_t * table2 sp[8]
8768+@   uint32_t offset2       sp[12]
8769+@   int width              sp[16]
8770+@   int height             sp[20]
8771+
8772+.macro band_c_8_16 bit_depth
8773+        ldr       ip, [sp, #16]         @ width
8774+        push      {r4-r6, lr}
8775+        vmov.i64  q14, #0
8776+        cmp       ip, #8
8777+        vmov.i16  q15, #(1 << \bit_depth) - 1
8778+        bl        band_load_c
8779+        blt       4f
8780+
8781+        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
8782+            "vld2.16   {q8,q9}, [r1, :128], r3",                        \
8783+            "subs      ip, #1",                                         \
8784+            "",                                                         \
8785+            "",                                                         \
8786+            "",                                                         \
8787+            "vst2.16   {q10,q11}, [r0, :128], r2"
8788+        pop       {r4-r6, pc}
8789+4:
8790+        add       r5, r0, r2
8791+        add       r6, r1, r3
8792+        lsl       r2, #1
8793+        lsl       r3, #1
8794+        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
8795+            "vld2.16   {d16,d18}, [r1, :128], r3",                      \
8796+            "subs      ip, #2",                                         \
8797+            "vld2.16   {d17,d19}, [r6, :128], r3",                      \
8798+            "",                                                         \
8799+            "",                                                         \
8800+            "vst2.16   {d20,d22}, [r0, :128], r2",                      \
8801+            "vst2.16   {d21,d23}, [r5, :128], r2"
8802+        pop       {r4-r6, pc}
8803+.endm
8804+
8805+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
8806+        band_c_8_16 10
8807+endfunc
8808+
8809+
8810+@ =============================================================================
8811+@ SAO EDGE
8812+
8813+@ r0    destination address
8814+@ r2    stride to post-increment r0 with
8815+@ [r5]  translate values
8816+@
8817+@ a <- c <- b
8818+@ a in q0 - q3
8819+@ c in q4 - q7
8820+@ b in q8 - q11
8821+@
8822+@ q12-15 used as temp
8823+@
8824+@ Can be used for both Y & C as we unzip/zip the deltas and
8825+@ transform "u/v" separately via d26/d27.  For Y d26=d27
8826+
8827+function edge_64b_body_8
8828+
8829+        vcgt.u8 q12,  q4,  q0   @ c > a -> -1 , otherwise 0
8830+        vcgt.u8 q13,  q5,  q1
8831+        vcgt.u8 q14,  q6,  q2
8832+        vcgt.u8 q15,  q7,  q3
8833+
8834+        vcgt.u8  q0,  q4        @ a > c -> -1 , otherwise 0
8835+        vcgt.u8  q1,  q5
8836+        vcgt.u8  q2,  q6
8837+        vcgt.u8  q3,  q7
8838+
8839+        vsub.s8  q0,  q12       @ a = sign(c-a)
8840+        vsub.s8  q1,  q13
8841+        vsub.s8  q2,  q14
8842+        vsub.s8  q3,  q15
8843+
8844+        vcgt.u8  q12, q4,  q8   @ c > b -> -1 , otherwise 0
8845+        vcgt.u8  q13, q5,  q9
8846+        vcgt.u8  q14, q6,  q10
8847+        vcgt.u8  q15, q7,  q11
8848+
8849+        vsub.s8  q0,  q12
8850+        vsub.s8  q1,  q13
8851+        vsub.s8  q2,  q14
8852+        vsub.s8  q3,  q15
8853+
8854+        vcgt.u8  q12, q8,  q4   @ c < b -> -1 , otherwise 0
8855+        vcgt.u8  q13, q9,  q5
8856+        vcgt.u8  q14, q10, q6
8857+        vcgt.u8  q15, q11, q7
8858+
8859+        vadd.s8  q0,  q12       @ a = sign(c-a) + sign(c-b)
8860+        vadd.s8  q1,  q13
8861+        vmov.u8  q12, #2
8862+        vadd.s8  q2,  q14
8863+        vadd.s8  q3,  q15
8864+
8865+        vadd.s8  q0,  q12
8866+        vadd.s8  q1,  q12
8867+
8868+        vld1.8   {d26, d27}, [r5]
8869+
8870+        vadd.s8  q2,  q12
8871+        vuzp.8   q0,  q1
8872+        vmov.u8  q15, #128
8873+        vadd.s8  q3,  q12       @ a = 2 + sign(c-a) + sign(c-b)
8874+
8875+        vtbl.8   d0,  {d26}, d0
8876+        vadd.s8  q12, q4, q15   @ Add -128 so we can use saturating signed add
8877+
8878+        vtbl.8   d1,  {d26}, d1
8879+        vadd.s8  q14, q5, q15
8880+
8881+        vtbl.8   d2,  {d27}, d2
8882+        vuzp.8   q2,  q3
8883+
8884+        vtbl.8   d3,  {d27}, d3
8885+
8886+        vtbl.8   d4,  {d26}, d4
8887+        vzip.8   q0,  q1
8888+
8889+        vtbl.8   d5,  {d26}, d5
8890+        vqadd.s8 q0,  q12
8891+        vqadd.s8 q1,  q14
8892+        vadd.s8  q12, q6, q15   @ Add -128 so we can use saturating signed add
8893+
8894+        vtbl.8   d6,  {d27}, d6
8895+        vtbl.8   d7,  {d27}, d7
8896+        vadd.s8  q14, q7, q15   @ Add -128 so we can use saturating signed add
8897+        vzip.8   q2,  q3
8898+
8899+        vsub.s8  q0,  q15
8900+        vqadd.s8 q2,  q12
8901+        vqadd.s8 q3,  q14
8902+        vsub.s8  q1,  q15
8903+        vsub.s8  q2,  q15
8904+        vsub.s8  q3,  q15
8905+
8906+        bx      lr
8907+endfunc
8908+
8909+@ r0    destination address
8910+@ r2    stride to post-increment r0 with
8911+@ r4    upper clip value
8912+@ [r5]  translate values
8913+@
8914+@ a <- c <- b
8915+@ a in q0 - q3
8916+@ c in q4 - q7
8917+@ b in q8 - q11
8918+@
8919+@ q12-15 used as temp
8920+@
8921+@ Can be used for both Y & C as we unzip/zip the deltas and
8922+@ transform "u/v" separately via d26/d27.  For Y d26=d27
8923+
8924+function edge_64b_body_16
8925+
8926+        vcgt.u16 q12, q4, q0  // c > a -> -1 , otherwise 0
8927+        vcgt.u16 q13, q5, q1
8928+        vcgt.u16 q14, q6, q2
8929+        vcgt.u16 q15, q7, q3
8930+
8931+        vcgt.u16 q0, q0, q4  // a > c -> -1 , otherwise 0
8932+        vcgt.u16 q1, q1, q5
8933+        vcgt.u16 q2, q2, q6
8934+        vcgt.u16 q3, q3, q7
8935+
8936+        vsub.s16 q0, q0, q12 // a = sign(c-a)
8937+        vsub.s16 q1, q1, q13
8938+        vsub.s16 q2, q2, q14
8939+        vsub.s16 q3, q3, q15
8940+
8941+        vcgt.u16 q12, q4, q8  // c > b -> -1 , otherwise 0
8942+        vcgt.u16 q13, q5, q9
8943+        vcgt.u16 q14, q6, q10
8944+        vcgt.u16 q15, q7, q11
8945+
8946+        vsub.s16 q0, q0, q12
8947+        vsub.s16 q1, q1, q13
8948+        vsub.s16 q2, q2, q14
8949+        vsub.s16 q3, q3, q15
8950+
8951+        vcgt.u16 q12, q8, q4  // c < b -> -1 , otherwise 0
8952+        vcgt.u16 q13, q9, q5
8953+        vcgt.u16 q14, q10, q6
8954+        vcgt.u16 q15, q11, q7
8955+
8956+        vadd.s16 q0, q0, q12  // a = sign(c-a) + sign(c-b)
8957+        vadd.s16 q1, q1, q13
8958+        vadd.s16 q2, q2, q14
8959+        vadd.s16 q3, q3, q15
8960+
8961+        vmov.u8  q12, #2
8962+
8963+        vmovn.s16 d0, q0
8964+        vmovn.s16 d1, q1
8965+        vmovn.s16 d2, q2
8966+        vmovn.s16 d3, q3
8967+
8968+        vldr     d26, [r5]
8969+
8970+        vuzp.8   q0, q1
8971+
8972+        vldr     d27, [r5, #8]
8973+
8974+        vadd.s8  q0, q0, q12
8975+        vadd.s8  q1, q1, q12
8976+
8977+        vmov.i64 q12, #0
8978+
8979+        vtbl.8   d0, {d26}, d0
8980+        vtbl.8   d1, {d26}, d1
8981+        vtbl.8   d2, {d27}, d2
8982+        vtbl.8   d3, {d27}, d3
8983+
8984+        vdup.i16 q13, r4
8985+
8986+        vzip.8   q0, q1
8987+
8988+        @ Avoid overwrite whilst widening
8989+        vaddw.s8 q2, q6, d2
8990+        vaddw.s8 q3, q7, d3
8991+        vaddw.s8 q1, q5, d1
8992+        vaddw.s8 q0, q4, d0
8993+
8994+        @ now clip
8995+        clip16_4 q2, q3, q1, q0, q12, q13
8996+
8997+        bx       lr
8998+endfunc
8999+
9000+
9001+@ a <- c <- b
9002+@ a in q0
9003+@ c in q1
9004+@ b in q2
9005+@ Temp q3, q9, q10
9006+@
9007+@ d16, d17 (q8) xlat U, V
9008+@ q14.u8 #2
9009+@ q15.u8 #128
9010+
9011+function edge_16b_body_8
9012+        vcgt.u8  q9,  q0,  q1   @ a > c -> -1 , otherwise 0
9013+        vadd.u8  q9,  q14, q9
9014+        vcgt.u8  q0,  q1,  q0   @ c > a -> -1 , otherwise 0
9015+        vsub.u8  q9,  q9,  q0
9016+        vcgt.u8  q0,  q2,  q1   @ c < b -> -1 , otherwise 0
9017+        vadd.u8  q9,  q9,  q0
9018+        vcgt.u8  q0,  q1,  q2   @ c > b -> -1 , otherwise 0
9019+        vsub.u8  q0,  q9,  q0
9020+
9021+        vadd.s8  q3,  q1, q15   @ Add -128 so we can use saturating signed add
9022+
9023+        vuzp.8   d0,  d1
9024+
9025+        vtbl.8   d0,  {d16}, d0
9026+        vtbl.8   d1,  {d17}, d1
9027+
9028+        vzip.8   d0,  d1
9029+        vqadd.s8 q0,  q3
9030+        vsub.s8  q0,  q15
9031+
9032+        bx      lr
9033+endfunc
9034+
9035+@ a <- c <- b
9036+@ a in q0
9037+@ c in q1
9038+@ b in q2
9039+@ Temp q3
9040+@
9041+@ q12, #0
9042+@ d16, d17 xlat U, V
9043+@ q14.u8 #2
9044+@ q15.u16 max
9045+function edge_16b_body_16
9046+        vcgt.u16 q9, q0, q1     @ a > c -> -1 , otherwise 0
9047+        vadd.u16 q9, q14, q9
9048+        vcgt.u16 q0, q1, q0     @ c > a -> -1 , otherwise 0
9049+        vsub.u16 q9, q9, q0
9050+        vcgt.u16 q0, q2, q1     @ c < b -> -1 , otherwise 0
9051+        vadd.u16 q9, q9, q0
9052+        vcgt.u16 q0, q1, q2     @ c > b -> -1 , otherwise 0
9053+        vsub.u16 q0, q9, q0
9054+
9055+        vmovn.s16 d0, q0
9056+        @ d1 will have random contents that we transform but
9057+        @ that doesn't matter as we then discard them
9058+        vuzp.8   d0, d1
9059+
9060+        vtbl.8   d0, {d16}, d0
9061+        vtbl.8   d1, {d17}, d1
9062+
9063+        vzip.8   d0, d1
9064+
9065+        vaddw.s8 q0, q1, d0
9066+
9067+        @ now clip
9068+        vmax.s16 q0, q12
9069+        vmin.s16 q0, q15
9070+        bx       lr
9071+endfunc
9072+
9073+
9074+@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
9075+@   uint8_t *_dst,                    [r0]
9076+@   const uint8_t *_src,              [r1]
9077+@   ptrdiff_t stride_dst,             [r2]
9078+@   const int16_t *_sao_offset_val_u, [r3]
9079+@   const int16_t *_sao_offset_val_v, [sp, #0]   // Chroma only
9080+@   int eo,                           [sp, #sp_base + 0]
9081+@   int width,                        [sp, #sp_base + 4]
9082+@   int height)                       [sp, #sp_base + 8]
9083+
9084+@ Jumps via jump_tab with
9085+@   uint8_t *_dst,                    [r0]
9086+@   const uint8_t *_src,              [r1]
9087+@   ptrdiff_t stride_dst,             [r2]
9088+@   EDGE_SRC_STRIDE                   [r3]
9089+@   (1 << \bit_depth) - 1             [r4]
9090+@   * xlat_table                      [r5]  // setup_64b only
9091+@   int height                        [r12]
9092+@
9093+@   0                                 [q12] // > 8 bit
9094+@   2                                 [q14]
9095+@   128                               [q15] // = 8 bit
9096+@   r4                                [q15] // > 8 bit
9097+
9098+.macro  edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
9099+
9100+@ Build translate registers
9101+@ As translate values can only be 0-4 we don't care about junk in the rest
9102+@ of the register
9103+.if \is_chroma
9104+        ldr      ip, [sp, #0]
9105+        push     {r4-r6, lr}    @ 16 bytes
9106+        vld1.8   {d16[2]}, [r3]
9107+        add      r3, r3, #2
9108+        vld1.8   {d17[2]}, [ip]
9109+        add      ip, ip, #2
9110+        vld1.8   {d16[0]}, [r3]
9111+        add      r3, r3, #2
9112+        vld1.8   {d17[0]}, [ip]
9113+        add      ip, ip, #2
9114+        vld1.8   {d16[1]}, [r3]
9115+        add      r3, r3, #2
9116+        vld1.8   {d17[1]}, [ip]
9117+        add      ip, ip, #2
9118+        vld1.8   {d16[3]}, [r3]
9119+        add      r3, r3, #2
9120+        vld1.8   {d17[3]}, [ip]
9121+        add      ip, ip, #2
9122+        vld1.8   {d16[4]}, [r3]
9123+        vld1.8   {d17[4]}, [ip]
9124+        movw     r3, EDGE_SRC_STRIDE
9125+.set sp_base, 20
9126+.else
9127+        add      ip, r3, #4
9128+        vld1.8   {d16[1]}, [r3]
9129+        add      r3, r3, #2
9130+        vld1.8   {d17[0]}, [ip]
9131+        add      ip, ip, #2
9132+        vld1.8   {d16[0]}, [r3]
9133+        add      r3, r3, #6
9134+        vld1.8   {d17[1]}, [ip]
9135+        vld1.8   {d16[2]}, [r3]
9136+        movw     r3, EDGE_SRC_STRIDE
9137+        push     {r4-r6, lr}    @ 16 bytes
9138+        vzip.8   d16, d17
9139+        vmov     d17, d16
9140+.set sp_base, 16
9141+.endif
9142+
9143+@ If setup_64b we need the xlat table on the stack
9144+.if \setup_64b
9145+        sub      r5, sp, #16
9146+.endif
9147+
9148+@ Get jump address
9149+@ We have a special case for width 4 as the calling code doesn't detect it
9150+@ If we may have w4 then we add a 2nd jump table after the 1st
9151+.if \check_w4
9152+        ldr      r12, [sp, #sp_base + 4]        @ width
9153+        adr      r6, \jump_tab
9154+        ldr      lr, [sp, #sp_base + 0]        @ eo
9155+        cmp      r12, #8
9156+        it lt
9157+        addlt    r6, #16
9158+.else
9159+        ldr      lr, [sp, #sp_base + 0]        @ eo
9160+        adr      r6, \jump_tab
9161+.endif
9162+
9163+        ldr      r12, [sp, #sp_base + 8]        @ height
9164+
9165+.if \bit_depth > 8
9166+        movw     r4, (1 << \bit_depth) - 1
9167+.endif
9168+.if \setup_16b
9169+.if \bit_depth > 8
9170+        vmov.i64 q12, #0
9171+        vdup.16  q15, r4
9172+        vmov.u16 q14, #2
9173+.else
9174+        vmov.u8  q15, #128
9175+        vmov.u8  q14, #2
9176+.endif
9177+.endif
9178+
9179+@ If setup_64b we need q4-q7 saved.
9180+.if \setup_64b
9181+        vpush    {q4-q8}        @ 80 bytes, q8 pushed first
9182+.set sp_base, sp_base + 80
9183+.endif
9184+
9185+        ldr      r6, [r6, lr, lsl #2]
9186+
9187+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
9188+.if \do2
9189+        push     {r0, r1, r6, r12}
9190+.if jent_pic
9191+        bl       98f
9192+.else
9193+        blx      r6
9194+.endif
9195+        pop      {r0, r1, r6, r12}
9196+
9197+        add      r0, #64
9198+        add      r1, #64
9199+.endif
9200+
9201+.if jent_pic
9202+        bl       98f
9203+.else
9204+        blx      r6
9205+.endif
9206+
9207+@ Tidy up & return
9208+.if \setup_64b
9209+        vpop     {q4-q8}        @ spurious but harmless load of q8
9210+.endif
9211+        pop      {r4-r6, pc}
9212+
9213+.if jent_pic && !\xjump
9214+@ Magic label - used as 98b in jent macro
9215+98:
9216+        add      pc, r6
9217+.endif
9218+.endm
9219+
9220+
9221+.macro  edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
9222+        edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
9223+.endm
9224+
9225+.macro  edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0
9226+        edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
9227+.endm
9228+
9229+
9230+.macro  edge_64b_e0, body_fn, pb
9231+        sub      r1, #8
9232+        mov      r6, lr
9233+1:      vldm     r1, {d7-d16}
9234+        // load a
9235+        vext.8   q0,  q3,  q4, #(16 - \pb)
9236+        add      r1, r3
9237+        vext.8   q1,  q4,  q5, #(16 - \pb)
9238+        subs     r12, #1
9239+        vext.8   q2,  q5,  q6, #(16 - \pb)
9240+        vext.8   q3,  q6,  q7, #(16 - \pb)
9241+        pld      [r1]
9242+        // load b
9243+        vext.8   q11, q7,  q8, #\pb     @ Avoid overwrite
9244+        pld      [r1, #64]
9245+        vext.8   q8,  q4,  q5, #\pb
9246+        vext.8   q9,  q5,  q6, #\pb
9247+        vext.8   q10, q6,  q7, #\pb
9248+        bl       \body_fn
9249+        vstm     r0, {q0-q3}
9250+        add      r0, r0, r2
9251+        bgt      1b
9252+        bx       r6
9253+.endm
9254+
9255+.macro  edge_32bx2_e0, body_fn, pb
9256+        add      r6, r1, r3
9257+        push     {r7,lr}
9258+        sub      r1, #8
9259+        add      r7, r0, r2
9260+        lsl      r2, #1
9261+1:      vldmia   r1, {d7-d12}
9262+        // load a
9263+        vext.8   q0, q3, q4, #16 - \pb
9264+        add      r1, r1, r3, lsl #1
9265+        vext.8   q1, q4, q5, #16 - \pb
9266+        subs     r12, #2
9267+        // load b
9268+        vext.8   q8, q4, q5, #\pb
9269+        vext.8   q9, q5, q6, #\pb
9270+        vldr     d25, [r6, #-8]
9271+        vldmia   r6, {d12-d15}
9272+        vldr     d26, [r6, #32]
9273+        // load a
9274+        vext.8   q2, q12, q6, #16 - \pb
9275+        add      r6, r6, r3, lsl #1
9276+        vext.8   q3, q6, q7, #16 - \pb
9277+        // load b
9278+        vext.8   q10, q6, q7, #\pb
9279+        vext.8   q11, q7, q13, #\pb
9280+        bl       \body_fn
9281+        vst1.8   {q0-q1}, [r0, :256], r2
9282+        vst1.8   {q2-q3}, [r7, :256], r2
9283+        bgt      1b
9284+        pop      {r7,pc}
9285+.endm
9286+
9287+.macro  edge_16b_e0, body_fn, pb
9288+        sub      r1, #8
9289+        mov      r6, lr
9290+1:      vldmia   r1, {d1-d4}
9291+        add      r1, r3
9292+        subs     r12, #1
9293+        vext.8   q0, q0, q1, #16 - \pb
9294+        vext.8   q2, q1, q2, #\pb
9295+
9296+        bl       \body_fn
9297+        vst1.8   {q0}, [r0, :128], r2
9298+        bgt      1b
9299+        bx       r6
9300+.endm
9301+
9302+.macro  edge_8bx2_e0, body_fn, pb
9303+        add      r6, r1, r3
9304+        push     {r7,lr}
9305+        sub      r1, #8
9306+        add      r7, r0, r2
9307+        lsl      r2, #1
9308+1:      vldmia   r1, {d1-d2}
9309+        vldmia   r6, {d3-d4}
9310+        vldr     d6, [r1, #16]
9311+        subs     r12, #2
9312+        vldr     d7, [r6, #-8]
9313+        add      r1, r1, r3, lsl #1
9314+        vext.8   d0, d1, d2, #8 - \pb
9315+        add      r6, r6, r3, lsl #1
9316+        vext.8   d5, d3, d4, #\pb
9317+        vext.8   d4, d2, d6, #\pb
9318+        vext.8   d1, d7, d3, #8 - \pb
9319+
9320+        bl       \body_fn
9321+        vst1.8   {d0}, [r0, :64], r2
9322+        vst1.8   {d1}, [r7, :64], r2
9323+        bgt      1b
9324+        pop      {r7,pc}
9325+.endm
9326+
9327+.macro  edge_4bx4_e0, body_fn, pb
9328+        add      r6, r1, r3
9329+        push     {r7,lr}
9330+        add      r7, r0, r2
9331+        lsl      r2, #1
9332+
9333+        tst      r1, #4
9334+        bne      2f
9335+1:      // r1 (and assumed r6) are 64-bit aligned
9336+        vldr     d2, [r1]
9337+        vldr     d0, [r1, #-8]
9338+        add      r1, r1, r3, lsl #1
9339+        vldr     d20, [r6]
9340+        subs     r12, #4
9341+        vldr     d18, [r6, #-8]
9342+        add      r6, r6, r3, lsl #1
9343+        vldr     d3, [r1]
9344+        vshr.u64 d4, d2, #\pb * 8
9345+        vldr     d1, [r1, #-8]
9346+        add      r1, r1, r3, lsl #1
9347+        vldr     d21, [r6]
9348+        vext.8   d0, d0, d2, #8 - \pb
9349+        vldr     d19, [r6,#-8]
9350+        add      r6, r6, r3, lsl #1
9351+        vshr.u64 d22, d20, #\pb * 8
9352+        vext.8   d18, d18, d20, #8 - \pb
9353+        vshr.u64 d5, d3, #\pb * 8
9354+        vext.8   d1, d1, d3, #8 - \pb
9355+        vshr.u64 d23, d21, #\pb * 8
9356+        vext.8   d19, d19, d21, #8 - \pb
9357+        vsli.64  q1, q10, #32
9358+        vsli.64  q2, q11, #32
9359+        vsli.64  q0, q9, #32
9360+
9361+        bl       \body_fn
9362+        vst1.32  {d0[0]}, [r0, :32], r2
9363+        vst1.32  {d0[1]}, [r7, :32], r2
9364+        vst1.32  {d1[0]}, [r0, :32], r2
9365+        vst1.32  {d1[1]}, [r7, :32], r2
9366+        bgt      1b
9367+        pop      {r7,pc}
9368+
9369+2:      // r1 (and assumed r6) are 32-bit but not 64-bit aligned
9370+        vldr     d20, [r1, #-4]
9371+        vldr     d22, [r1, #4]
9372+        add      r1, r1, r3, lsl #1
9373+        vldr     d2, [r6, #-4]
9374+        subs     r12, #4
9375+        vldr     d4, [r6, #4]
9376+        add      r6, r6, r3, lsl #1
9377+        vldr     d21, [r1, #-4]
9378+        vshl.i64 d18, d20, #\pb * 8
9379+        vldr     d23, [r1, #4]
9380+        add      r1, r1, r3, lsl #1
9381+        vldr     d3, [r6, #-4]
9382+        vext.8   d22, d20, d22, #\pb
9383+        vldr     d5, [r6, #4]
9384+        add      r6, r6, r3, lsl #1
9385+        vshl.i64 d0, d2, #\pb * 8
9386+        vext.8   d4, d2, d4, #\pb
9387+        vshl.i64 d19, d21, #\pb * 8
9388+        vext.8   d23, d21, d23, #\pb
9389+        vshl.i64 d1, d3, #\pb * 8
9390+        vext.8   d5, d3, d5, #\pb
9391+        vsri.64  q1, q10, #32
9392+        vsri.64  q0, q9, #32
9393+        vsri.64  q2, q11, #32
9394+
9395+        bl       \body_fn
9396+        vst1.32  {d0[0]}, [r0, :32], r2
9397+        vst1.32  {d0[1]}, [r7, :32], r2
9398+        vst1.32  {d1[0]}, [r0, :32], r2
9399+        vst1.32  {d1[1]}, [r7, :32], r2
9400+        bgt      2b
9401+        pop      {r7,pc}
9402+.endm
9403+
9404+
9405+.macro  edge_64b_e1, body_fn
9406+        sub      r1, r3
9407+        push     {lr}
9408+        add      r6, r1, #32
9409+        // load a
9410+        vld1.8   {q0-q1}, [r1, :256], r3
9411+        vld1.8   {q2-q3}, [r6, :256], r3
9412+        // load c
9413+        vld1.8   {q4-q5}, [r1, :256], r3
9414+        vld1.8   {q6-q7}, [r6, :256], r3
9415+1:      // load b
9416+        vld1.8   {q8-q9}, [r1, :256], r3
9417+        subs     r12, #1
9418+        vld1.8   {q10-q11}, [r6, :256], r3
9419+        bl       \body_fn
9420+        vstm     r0, {q0-q3}
9421+        // copy c to a
9422+        vmov.64  q0, q4
9423+        pld      [r1, r3]
9424+        vmov.64  q1, q5
9425+        it       le
9426+        pople    {lr}
9427+        vmov.64  q2, q6
9428+        it       le
9429+        bxle     lr
9430+        vmov.64  q3, q7
9431+        add      r0, r0, r2
9432+        // copy b to c
9433+        vmov.64  q4, q8
9434+        vmov.64  q5, q9
9435+        vmov.64  q6, q10
9436+        vmov.64  q7, q11
9437+        b        1b
9438+.endm
9439+
9440+.macro  edge_32bx2_e1, body_fn
9441+        sub      r6, r1, r3
9442+        vld1.8   {q2-q3}, [r1, :256], r3
9443+        vld1.8   {q0-q1}, [r6, :256]
9444+        mov      r6, lr
9445+
9446+1:      @ Given the data duplication here we could obviously do better than
9447+        @ using the generic body_fn but it almost certainly isn't worth it
9448+        vld1.8   {q8-q9}, [r1, :256], r3
9449+        subs     r12, #2
9450+        vmov     q4, q2
9451+        vmov     q5, q3
9452+        vld1.8   {q10-q11}, [r1, :256], r3
9453+        vmov     q6, q8
9454+        vmov     q7, q9
9455+
9456+        bl       \body_fn
9457+
9458+        vst1.8   {q0-q1}, [r0, :256], r2
9459+        // copy b to a
9460+        vmov     q0, q8
9461+        vmov     q1, q9
9462+        vst1.8   {q2-q3}, [r0, :256], r2
9463+        vmov     q2, q10
9464+        it       le
9465+        bxle     r6
9466+        vmov     q3, q11
9467+        b        1b
9468+.endm
9469+
9470+.macro  edge_16b_e1, body_fn
9471+        sub      r6, r1, r3
9472+        // load c
9473+        vld1.8   {q1}, [r1, :128], r3
9474+        // load a
9475+        vld1.8   {q0}, [r6, :128]
9476+        mov      r6, lr
9477+1:      // load b
9478+        vld1.8   {q2}, [r1, :128], r3
9479+        bl       \body_fn
9480+        vst1.8   {q0}, [r0, :128], r2
9481+        subs     r12, #1
9482+        // copy c to a
9483+        vmov.64  q0, q1
9484+        it       le
9485+        bxle     r6
9486+        // copy b to c
9487+        vmov.64  q1, q2
9488+        b        1b
9489+.endm
9490+
9491+.macro  edge_8bx2_e1, body_fn
9492+        sub      r6, r1, r3
9493+        lsl      r3, #1
9494+        push     {r7, lr}
9495+        vld1.8   {d1}, [r1, :64], r3
9496+        vld1.8   {d0}, [r6, :64], r3
9497+        add      r7, r0, r2
9498+        lsl      r2, #1
9499+1:      @ Given the data duplication here we could obviously do better than
9500+        @ using the generic body_fn but it almost certainly isn't worth it
9501+        vld1.8   {d4}, [r6, :64], r3
9502+        vmov     d2, d1
9503+        vld1.8   {d5}, [r1, :64], r3
9504+        subs     r12, #2
9505+        vmov     d3, d4
9506+
9507+        bl       \body_fn
9508+
9509+        vst1.8   {d0}, [r0, :64], r2
9510+        vst1.8   {d1}, [r7, :64], r2
9511+
9512+        // copy b to a
9513+        vmov     q0, q2
9514+        bgt      1b
9515+        pop      {r7, pc}
9516+.endm
9517+
9518+.macro  edge_4bx4_e1, body_fn
9519+        sub      r6, r1, r3
9520+        lsl      r3, #1
9521+        push     {r7, lr}
9522+        vld1.32  {d0[1]}, [r1, :32], r3
9523+        add      r7, r0, r2
9524+        vld1.32  {d0[0]}, [r6, :32], r3
9525+        lsl      r2, #1
9526+        vld1.32  {d4[1]}, [r1, :32], r3
9527+        vld1.32  {d4[0]}, [r6, :32], r3
9528+        vld1.32  {d5[1]}, [r1, :32], r3
9529+        vld1.32  {d5[0]}, [r6, :32], r3
9530+        vmov     d1, d4
9531+        vext.32  d2, d0, d4, #1
9532+        subs     r12, #4
9533+        vmov     d22, d5
9534+        vext.32  d3, d4, d5, #1
9535+        b        2f
9536+
9537+1:      vst1.32  {d0[0]}, [r0, :32], r2
9538+        vext.32  d2, d22, d4, #1
9539+        vst1.32  {d0[1]}, [r7, :32], r2
9540+        vmov     d0, d22
9541+        vst1.32  {d1[0]}, [r0, :32], r2
9542+        vext.32  d3, d4, d5, #1
9543+        vst1.32  {d1[1]}, [r7, :32], r2
9544+        vmov     d1, d4
9545+        vmov     d22, d5
9546+2:      @ Given the data duplication here we could probably do better than
9547+        @ using the generic body_fn but it almost certainly isn't worth it
9548+        bl       \body_fn
9549+        ble      3f
9550+        vld1.32  {d4[0]}, [r6, :32], r3
9551+        subs     r12, #4
9552+        vld1.32  {d4[1]}, [r1, :32], r3
9553+        vld1.32  {d5[0]}, [r6, :32], r3
9554+        vld1.32  {d5[1]}, [r1, :32], r3
9555+        b        1b
9556+
9557+3:      vst1.32  {d0[0]}, [r0, :32], r2
9558+        vst1.32  {d0[1]}, [r7, :32], r2
9559+        vst1.32  {d1[0]}, [r0, :32]
9560+        vst1.32  {d1[1]}, [r7, :32]
9561+        pop      {r7, pc}
9562+.endm
9563+
9564+.macro  edge_64b_e2, body_fn, pb
9565+        push     {lr}
9566+        sub      r6, r1, r3
9567+        // load c and a
9568+        vld1.8   {q4-q5}, [r1, :128]
9569+        vldr     d25, [r6, #-8]
9570+        vldmia   r6, {d16-d23}
9571+        vext.8   q0, q12, q8, #16 - \pb
9572+        add      r6, r1, #32
9573+        vext.8   q1, q8, q9, #16 - \pb
9574+        add      r1, r1, r3
9575+        vext.8   q2, q9, q10, #16 - \pb
9576+        vld1.8   {q6-q7}, [r6, :128]
9577+        sub      r6, r1, r3
9578+        vext.8   q3, q10, q11, #16 - \pb
9579+
9580+1:      // load b
9581+        vldmia   r1, {d16-d24}
9582+        vext.8   q8, q8, q9, #\pb
9583+        pld      [r1, r3]
9584+        vext.8   q9, q9, q10, #\pb
9585+        subs     r12, #1
9586+        vext.8   q10, q10, q11, #\pb
9587+        vext.8   q11, q11, q12, #\pb
9588+        bl       \body_fn
9589+        // next a is mostly available in c
9590+        vldr     d25, [r6, #-8]
9591+        vstmia   r0, {q0-q3}
9592+        vext.8   q3, q6, q7, #16 - \pb
9593+        it       le
9594+        pople    {lr}
9595+        vext.8   q2, q5, q6, #16 - \pb
9596+        it       le
9597+        bxle     lr
9598+        vext.8   q1, q4, q5, #16 - \pb
9599+        add      r6, r6, r3
9600+        vext.8   q0, q12, q4, #16 - \pb
9601+        add      r0, r0, r2
9602+        // next c is mostly available in b
9603+        vldr     d8, [r1]
9604+        vext.8   d9, d16, d17, #8 - \pb
9605+        vext.8   q5, q8, q9, #16 - \pb
9606+        add      r1, r1, r3
9607+        vext.8   q6, q9, q10, #16 - \pb
9608+        pld      [r6, #-8]
9609+        vext.8   q7, q10, q11, #16 - \pb
9610+        b        1b
9611+.endm
9612+
9613+.macro  edge_32bx2_e2, body_fn, pb
9614+        sub      r6, r1, r3
9615+        push     {r7, lr}
9616+        add      r7, r0, r2
9617+        lsl      r2, #1
9618+        // load a and first 32b of c
9619+        vld1.8   {q4-q5}, [r1, :256]
9620+        vldr     d25, [r6, #-8]
9621+        vld1.8   {q13-q14}, [r6, :256]
9622+        vldr     d31, [r1, #-8]
9623+        add      r6, r6, r3, lsl #1
9624+        vext.8   q0, q12, q13, #16 - \pb
9625+        add      r1, r1, r3, lsl #1
9626+        vext.8   q1, q13, q14, #16 - \pb
9627+        vext.8   q2, q15, q4, #16 - \pb
9628+        vext.8   q3, q4, q5, #16 - \pb
9629+1:
9630+        // load second 32b of c and second 32b of b
9631+        vldmia   r6, {d12-d16}
9632+        vldmia   r1, {d20-d24}
9633+        // first 32b of b is mostly available in second 32b of c
9634+        vext.8   q9, q7, q8, #\pb
9635+        subs     r12, #2
9636+        vext.8   q8, q6, q7, #\pb
9637+        vext.8   q10, q10, q11, #\pb
9638+        vext.8   q11, q11, q12, #\pb
9639+
9640+        bl       \body_fn
9641+
9642+        vst1.8   {q0-q1}, [r0, :256], r2
9643+        vst1.8   {q2-q3}, [r7, :256], r2
9644+        ble      2f
9645+
9646+        vldr     d25, [r6, #-8]
9647+        add      r6, r6, r3, lsl #1
9648+        vldr     d8, [r1]
9649+        vext.8   d9, d20, d21, #8 - \pb
9650+        vldr     d31, [r1, #-8]
9651+        add      r1, r1, r3, lsl #1
9652+        // first 32b of a is mostly available in second 32b of c
9653+        vext.8   q1, q6, q7, #16 - \pb
9654+        vext.8   q0, q12, q6, #16 - \pb
9655+        // first 32b of c is mostly available in second 32b of b
9656+        vext.8   q5, q10, q11, #16 - \pb
9657+        // second 32b of a is mostly available in first 32b of c
9658+        vext.8   q2, q15, q4, #16 - \pb
9659+        vext.8   q3, q4, q5, #16 - \pb
9660+        b        1b
9661+
9662+2:      pop      {r7, pc}
9663+.endm
9664+
9665+.macro  edge_16b_e2, body_fn, pb
9666+        push     {lr}
9667+        sub      r6, r1, r3
9668+        vld1.8   {q1}, [r1, :128], r3
9669+        vldr     d19, [r6, #-8]
9670+        vld1.8   {q10}, [r6, :128], r3
9671+
9672+1:      vldmia   r1, {d4-d6}
9673+        vext.8   q0, q9, q10, #16 - \pb
9674+        subs     r12, #1
9675+        vext.8   q2, q2, q3, #\pb
9676+        bl       \body_fn
9677+        vst1.8   {q0}, [r0, :128], r2
9678+        ble      2f
9679+        vmov     q10, q1
9680+        vldr     d2, [r1]
9681+        add      r1, r1, r3
9682+        vldr     d19, [r6, #-8]
9683+        add      r6, r6, r3
9684+        vext.8   d3, d4, d5, #8 - \pb
9685+        b        1b
9686+
9687+2:      pop      {pc}
9688+.endm
9689+
9690+.macro  edge_8bx2_e2, body_fn, pb
9691+        sub      r6, r1, r3
9692+        push     {r7, lr}
9693+        add      r7, r0, r2
9694+        lsl      r2, #1
9695+        vldr     d18, [r6, #-8]
9696+        vldr     d19, [r6]
9697+        add      r6, r6, r3, lsl #1
9698+        vldr     d20, [r1, #-8]
9699+        vldr     d2, [r1]
9700+        add      r1, r1, r3, lsl #1
9701+        vldmia   r6, {d3-d4}
9702+        vld1.8   {d21-d22}, [r1, :128]
9703+
9704+1:      vext.8   d0, d18, d19, #8 - \pb
9705+        vext.8   d4, d3, d4, #\pb
9706+        vext.8   d1, d20, d2, #8 - \pb
9707+        subs     r12, #2
9708+        vext.8   d5, d21, d22, #\pb
9709+
9710+        bl       \body_fn
9711+
9712+        vst1.8   {d0}, [r0, :64], r2
9713+        vst1.8   {d1}, [r7, :64], r2
9714+        ble      2f
9715+
9716+        vldr     d18, [r6, #-8]
9717+        add      r6, r6, r3, lsl #1
9718+        vldr     d20, [r1, #-8]
9719+        vmov     d19, d3
9720+        vldr     d2, [r1]
9721+        add      r1, r1, r3, lsl #1
9722+        vldmia   r6, {d3-d4}
9723+        vld1.8   {d21-d22}, [r1, :128]
9724+        b        1b
9725+
9726+2:      pop      {r7, pc}
9727+.endm
9728+
9729+.macro  edge_4bx4_e2, body_fn, pb
9730+        sub      r6, r1, r3
9731+        push     {r7-r9, lr}
9732+        add      r8, r1, r3
9733+        sub      r6, r6, #\pb
9734+        add      r8, r8, #\pb
9735+        add      r7, r0, r2
9736+        lsl      r2, #1
9737+
9738+1:      vld1.32  {d0[0]}, [r6], r3
9739+        subs     r12, #4
9740+        vld1.32  {d2[0]}, [r1], r3
9741+        vld1.32  {d4[0]}, [r8], r3
9742+        vld1.32  {d0[1]}, [r6], r3
9743+        vld1.32  {d2[1]}, [r1], r3
9744+        vld1.32  {d4[1]}, [r8], r3
9745+        vld1.32  {d1[0]}, [r6], r3
9746+        vld1.32  {d3[0]}, [r1], r3
9747+        vld1.32  {d5[0]}, [r8], r3
9748+        vld1.32  {d1[1]}, [r6], r3
9749+        vld1.32  {d3[1]}, [r1], r3
9750+        vld1.32  {d5[1]}, [r8], r3
9751+
9752+        bl       \body_fn
9753+
9754+        vst1.32  {d0[0]}, [r0, :32], r2
9755+        vst1.32  {d0[1]}, [r7, :32], r2
9756+        vst1.32  {d1[0]}, [r0, :32], r2
9757+        vst1.32  {d1[1]}, [r7, :32], r2
9758+        bgt      1b
9759+
9760+        pop      {r7-r9,pc}
9761+.endm
9762+
9763+.macro  edge_64b_e3, body_fn, pb
9764+        push     {lr}
9765+        sub      r6, r1, r3
9766+        // load c and a
9767+        vld1.8   {q4-q5}, [r1, :128]
9768+        vldmia   r6, {d16-d24}
9769+        vext.8   q0, q8, q9, #\pb
9770+        add      r6, r1, #32
9771+        vext.8   q1, q9, q10, #\pb
9772+        add      r1, r1, r3
9773+        vext.8   q2, q10, q11, #\pb
9774+        vld1.8   {q6-q7}, [r6, :128]
9775+        sub      r6, r1, r3
9776+        vext.8   q3, q11, q12, #\pb
9777+
9778+1:      // load b
9779+        vldr     d17, [r1, #-8]
9780+        vldmia   r1, {d18-d25}
9781+        vext.8   q8, q8, q9, #16 - \pb
9782+        pld      [r1, r3]
9783+        vext.8   q9, q9, q10, #16 - \pb
9784+        subs     r12, #1
9785+        vext.8   q10, q10, q11, #16 - \pb
9786+        vext.8   q11, q11, q12, #16 - \pb
9787+        bl       \body_fn
9788+        // next a is mostly available in c
9789+        vldr     d24, [r6, #64]
9790+        vstmia   r0, {q0-q3}
9791+        vext.8   q0, q4, q5, #\pb
9792+        it       le
9793+        pople    {lr}
9794+        vext.8   q1, q5, q6, #\pb
9795+        it       le
9796+        bxle     lr
9797+        vext.8   q2, q6, q7, #\pb
9798+        add      r6, r6, r3
9799+        vext.8   q3, q7, q12, #\pb
9800+        add      r0, r0, r2
9801+        // next c is mostly available in b
9802+        vext.8   d14, d22, d23, #\pb
9803+        vldr     d15, [r1, #56]
9804+        vext.8   q4, q8, q9, #\pb
9805+        add      r1, r1, r3
9806+        vext.8   q5, q9, q10, #\pb
9807+        vext.8   q6, q10, q11, #\pb
9808+        b        1b
9809+.endm
9810+
9811+.macro  edge_32bx2_e3, body_fn, pb
9812+        sub      r6, r1, r3
9813+        push     {r7, lr}
9814+        add      r7, r0, r2
9815+        lsl      r2, #1
9816+        // load a and first 32b of c
9817+        vldmia   r1, {d8-d12}
9818+        vldmia   r6, {d24-d28}
9819+        vext.8   q2, q4, q5, #\pb
9820+        add      r6, r6, r3, lsl #1
9821+        vext.8   q3, q5, q6, #\pb
9822+        add      r1, r1, r3, lsl #1
9823+        vext.8   q0, q12, q13, #\pb
9824+        vext.8   q1, q13, q14, #\pb
9825+1:
9826+        // load second 32b of c and second 32b of b
9827+        vldr     d25, [r6, #-8]
9828+        subs     r12, #2
9829+        vldmia   r6, {d12-d15}
9830+        vldr     d27, [r1, #-8]
9831+        vldmia   r1, {d20-d23}
9832+        // first 32b of b is mostly available in second 32b of c
9833+        vext.8   q8, q12, q6, #16 - \pb
9834+        vext.8   q9, q6, q7, #16 - \pb
9835+        vext.8   q11, q10, q11, #16 - \pb
9836+        vext.8   q10, q13, q10, #16 - \pb
9837+
9838+        bl       \body_fn
9839+
9840+        vst1.8   {q0-q1}, [r0, :256], r2
9841+        vst1.8   {q2-q3}, [r7, :256], r2
9842+        ble      2f
9843+
9844+        vldr     d24, [r6, #32]
9845+        add      r6, r6, r3, lsl #1
9846+        vldr     d11, [r1, #24]
9847+        vext.8   d10, d22, d23, #\pb
9848+        vldr     d30, [r1, #32]
9849+        add      r1, r1, r3, lsl #1
9850+        // first 32b of a is mostly available in second 32b of c
9851+        vext.8   q0, q6, q7, #\pb
9852+        vext.8   q1, q7, q12, #\pb
9853+        // first 32b of c is mostly available in second 32b of b
9854+        vext.8   q4, q10, q11, #\pb
9855+        // second 32b of a is mostly available in first 32b of c
9856+        vext.8   q3, q5, q15, #\pb
9857+        vext.8   q2, q4, q5, #\pb
9858+        b        1b
9859+
9860+2:      pop      {r7, pc}
9861+.endm
9862+
9863+.macro  edge_16b_e3, body_fn, pb
9864+        push     {lr}
9865+        sub      r6, r1, r3
9866+        vld1.8   {q1}, [r1, :128], r3
9867+        vldmia   r6, {d18-d20}
9868+        add      r6, r6, r3
9869+
9870+1:      vldr     d5, [r1, #-8]
9871+        vld1.8   {q3}, [r1, :128]
9872+        subs     r12, #1
9873+        vext.8   q0, q9, q10, #\pb
9874+        vext.8   q2, q2, q3, #16 - \pb
9875+        bl       \body_fn
9876+        vst1.8   {q0}, [r0, :128], r2
9877+        ble      2f
9878+        vmov     q9, q1
9879+        vldr     d3, [r1, #8]
9880+        add      r1, r1, r3
9881+        vldr     d20, [r6, #16]
9882+        add      r6, r6, r3
9883+        vext.8   d2, d4, d5, #\pb
9884+        b        1b
9885+
9886+2:      pop      {pc}
9887+.endm
9888+
9889+.macro  edge_8bx2_e3, body_fn, pb
9890+        sub      r6, r1, r3
9891+        push     {r7, lr}
9892+        add      r7, r0, r2
9893+        lsl      r2, #1
9894+        vld1.8   {d18-d19}, [r6]
9895+        add      r6, r6, r3, lsl #1
9896+        vldr     d20, [r1, #8]
9897+        vldr     d2, [r1]
9898+        add      r1, r1, r3, lsl #1
9899+        vldr     d4, [r6, #-8]
9900+        vldr     d3, [r6]
9901+        vldr     d21, [r1, #-8]
9902+        vldr     d22, [r1]
9903+
9904+1:      vext.8   d0, d18, d19, #\pb
9905+        vext.8   d4, d4, d3, #8 - \pb
9906+        vext.8   d1, d2, d20, #\pb
9907+        subs     r12, #2
9908+        vext.8   d5, d21, d22, #8 - \pb
9909+
9910+        bl       \body_fn
9911+
9912+        vst1.8   {d0}, [r0, :64], r2
9913+        vst1.8   {d1}, [r7, :64], r2
9914+        ble      2f
9915+
9916+        vldr     d19, [r6, #8]
9917+        add      r6, r6, r3, lsl #1
9918+        vldr     d20, [r1, #8]
9919+        vmov     d18, d3
9920+        vldr     d2, [r1]
9921+        add      r1, r1, r3, lsl #1
9922+        vldr     d4, [r6, #-8]
9923+        vldr     d3, [r6]
9924+        vldr     d21, [r1, #-8]
9925+        vldr     d22, [r1]
9926+        b        1b
9927+
9928+2:      pop      {r7, pc}
9929+.endm
9930+
9931+.macro  edge_4bx4_e3, body_fn, pb
9932+        @ e3 is the same as e2 but with the X offset reversed
9933+        edge_4bx4_e2 \body_fn, (-\pb)
9934+.endm
9935+
9936+@ Jump table entry - if in thumb mode the bottom bit must be set
9937+@ ? There is probably a real asm instruction to do this but I haven't found it
9938+.macro jent lab
9939+.if jent_pic
9940+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is
9941+@ simpler and clearer in the code to stick with .word
9942+T       .word  (0 + \lab) - (4 + 98b)
9943+A       .word  (0 + \lab) - (8 + 98b)
9944+.else
9945+T       .word   1 + \lab
9946+A       .word   \lab
9947+.endif
9948+.endm
9949+
9950+.macro edge_64b_bodies, body_fn, pb
9951+        jent    0f
9952+        jent    10f
9953+        jent    20f
9954+        jent    30f
9955+
9956+0:      edge_64b_e0     \body_fn, \pb
9957+10:     edge_64b_e1     \body_fn
9958+20:     edge_64b_e2     \body_fn, \pb
9959+30:     edge_64b_e3     \body_fn, \pb
9960+.endm
9961+
9962+.macro edge_32bx2_bodies, body_fn, pb
9963+        jent    0f
9964+        jent    10f
9965+        jent    20f
9966+        jent    30f
9967+
9968+0:      edge_32bx2_e0   \body_fn, \pb
9969+10:     edge_32bx2_e1   \body_fn
9970+20:     edge_32bx2_e2   \body_fn, \pb
9971+30:     edge_32bx2_e3   \body_fn, \pb
9972+.endm
9973+
9974+.macro edge_16b_bodies, body_fn, pb
9975+        jent    0f
9976+        jent    10f
9977+        jent    20f
9978+        jent    30f
9979+
9980+0:      edge_16b_e0     \body_fn, \pb
9981+10:     edge_16b_e1     \body_fn
9982+20:     edge_16b_e2     \body_fn, \pb
9983+30:     edge_16b_e3     \body_fn, \pb
9984+.endm
9985+
9986+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
9987+        jent    0f
9988+        jent    10f
9989+        jent    20f
9990+        jent    30f
9991+        jent    5f
9992+        jent    15f
9993+        jent    25f
9994+        jent    35f
9995+
9996+0:      edge_32bx2_e0   \body_fn_64b, \pb
9997+10:     edge_32bx2_e1   \body_fn_64b
9998+20:     edge_32bx2_e2   \body_fn_64b, \pb
9999+30:     edge_32bx2_e3   \body_fn_64b, \pb
10000+5:      edge_16b_e0     \body_fn_16b, \pb
10001+15:     edge_16b_e1     \body_fn_16b
10002+25:     edge_16b_e2     \body_fn_16b, \pb
10003+35:     edge_16b_e3     \body_fn_16b, \pb
10004+.endm
10005+
10006+.macro edge_16b_8bx2_bodies, body_fn, pb
10007+        jent    0f
10008+        jent    10f
10009+        jent    20f
10010+        jent    30f
10011+        jent    5f
10012+        jent    15f
10013+        jent    25f
10014+        jent    35f
10015+
10016+0:      edge_16b_e0     \body_fn, \pb
10017+10:     edge_16b_e1     \body_fn
10018+20:     edge_16b_e2     \body_fn, \pb
10019+30:     edge_16b_e3     \body_fn, \pb
10020+5:      edge_8bx2_e0    \body_fn, \pb
10021+15:     edge_8bx2_e1    \body_fn
10022+25:     edge_8bx2_e2    \body_fn, \pb
10023+35:     edge_8bx2_e3    \body_fn, \pb
10024+.endm
10025+
10026+.macro edge_8bx2_4bx4_bodies, body_fn, pb
10027+        jent    0f
10028+        jent    10f
10029+        jent    20f
10030+        jent    30f
10031+        jent    5f
10032+        jent    15f
10033+        jent    25f
10034+        jent    35f
10035+
10036+0:      edge_8bx2_e0    \body_fn, \pb
10037+10:     edge_8bx2_e1    \body_fn
10038+20:     edge_8bx2_e2    \body_fn, \pb
10039+30:     edge_8bx2_e3    \body_fn, \pb
10040+5:      edge_4bx4_e0    \body_fn, \pb
10041+15:     edge_4bx4_e1    \body_fn
10042+25:     edge_4bx4_e2    \body_fn, \pb
10043+35:     edge_4bx4_e3    \body_fn, \pb
10044+.endm
10045+
10046+@ void ff_hevc_rpi_sao_edge_8_neon_8(
10047+@   uint8_t *_dst,            [r0]
10048+@   uint8_t *_src,            [r1]
10049+@   int  stride_dst,          [r2]
10050+@   int16_t *_sao_offset_val, [r3]
10051+@   int eo,                   [sp, #0]
10052+@   int width,                [sp, #4]
10053+@   int height)               [sp, #8]
10054+
10055+function ff_hevc_rpi_sao_edge_8_neon_8, export=1
10056+        edge_16b_init   8, 0, 1, 99f
10057+99:
10058+        edge_8bx2_4bx4_bodies edge_16b_body_8, 1
10059+endfunc
10060+
10061+@ void ff_hevc_rpi_sao_edge_16_neon_8(
10062+@   uint8_t *_dst,            [r0]
10063+@   uint8_t *_src,            [r1]
10064+@   int  stride_dst,          [r2]
10065+@   int16_t *_sao_offset_val, [r3]
10066+@   int eo,                   [sp, #0]
10067+@   int width,                [sp, #4]
10068+@   int height)               [sp, #8]
10069+
10070+function ff_hevc_rpi_sao_edge_16_neon_8, export=1
10071+        edge_16b_init   8, 0, 0, 99f
10072+99:
10073+        edge_16b_bodies edge_16b_body_8, 1
10074+endfunc
10075+
10076+@ void ff_hevc_rpi_sao_edge_32_neon_8(
10077+@   uint8_t *_dst,            [r0]
10078+@   uint8_t *_src,            [r1]
10079+@   int  stride_dst,          [r2]
10080+@   int16_t *_sao_offset_val, [r3]
10081+@   int eo,                   [sp, #0]
10082+@   int width,                [sp, #4]
10083+@   int height)               [sp, #8]
10084+
10085+function ff_hevc_rpi_sao_edge_32_neon_8, export=1
10086+        edge_64b_init   8, 0, 0, 99f
10087+99:
10088+        edge_32bx2_bodies edge_64b_body_8, 1
10089+endfunc
10090+
10091+@ void ff_hevc_rpi_sao_edge_64_neon_8(
10092+@   uint8_t *_dst,            [r0]
10093+@   uint8_t *_src,            [r1]
10094+@   int  stride_dst,          [r2]
10095+@   int16_t *_sao_offset_val, [r3]
10096+@   int eo,                   [sp, #0]
10097+@   int width,                [sp, #4]
10098+@   int height)               [sp, #8]
10099+
10100+function ff_hevc_rpi_sao_edge_64_neon_8, export=1
10101+        edge_64b_init   8, 0, 0, 99f
10102+99:
10103+        edge_64b_bodies edge_64b_body_8, 1
10104+endfunc
10105+
10106+@ ff_hevc_rpi_sao_edge_c_8_neon_8(
10107+@   uint8_t *_dst,                    [r0]
10108+@   const uint8_t *_src,              [r1]
10109+@   ptrdiff_t stride_dst,             [r2]
10110+@   const int16_t *_sao_offset_val_u, [r3]
10111+@   const int16_t *_sao_offset_val_v, [sp, #0]
10112+@   int eo,                           [sp, #4]
10113+@   int width,                        [sp, #8]
10114+@   int height)                       [sp, #12]
10115+
10116+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
10117+        edge_16b_init   8, 1, 1, 99f
10118+99:
10119+        edge_16b_8bx2_bodies edge_16b_body_8, 2
10120+endfunc
10121+
10122+@ ff_hevc_rpi_sao_edge_c_16_neon_8(
10123+@   uint8_t *_dst,                    [r0]
10124+@   const uint8_t *_src,              [r1]
10125+@   ptrdiff_t stride_dst,             [r2]
10126+@   const int16_t *_sao_offset_val_u, [r3]
10127+@   const int16_t *_sao_offset_val_v, [sp, #0]
10128+@   int eo,                           [sp, #4]
10129+@   int width,                        [sp, #8]
10130+@   int height)                       [sp, #12]
10131+
10132+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
10133+        edge_64b_init   8, 1, 0, 99f
10134+99:
10135+        edge_32bx2_bodies edge_64b_body_8, 2
10136+endfunc
10137+
10138+@ ff_hevc_rpi_sao_edge_c_32_neon_8(
10139+@   uint8_t *_dst,                    [r0]
10140+@   const uint8_t *_src,              [r1]
10141+@   ptrdiff_t stride_dst,             [r2]
10142+@   const int16_t *_sao_offset_val_u, [r3]
10143+@   const int16_t *_sao_offset_val_v, [sp, #0]
10144+@   int eo,                           [sp, #4]
10145+@   int width,                        [sp, #8]
10146+@   int height)                       [sp, #12]
10147+
10148+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
10149+        edge_64b_init   8, 1, 0, 99f
10150+99:
10151+        edge_64b_bodies edge_64b_body_8, 2
10152+endfunc
10153+
10154+@ void ff_hevc_rpi_sao_edge_8_neon_10(
10155+@   uint8_t *_dst,            [r0]
10156+@   uint8_t *_src,            [r1]
10157+@   int  stride_dst,          [r2]
10158+@   int16_t *_sao_offset_val, [r3]
10159+@   int eo,                   [sp, #0]
10160+@   int width,                [sp, #4]
10161+@   int height)               [sp, #8]
10162+
10163+function ff_hevc_rpi_sao_edge_8_neon_10, export=1
10164+        edge_16b_init   10, 0, 1, 99f
10165+99:
10166+        edge_16b_8bx2_bodies edge_16b_body_16, 2
10167+endfunc
10168+
10169+@ void ff_hevc_rpi_sao_edge_16_neon_10(
10170+@   uint8_t *_dst,            [r0]
10171+@   uint8_t *_src,            [r1]
10172+@   int  stride_dst,          [r2]
10173+@   int16_t *_sao_offset_val, [r3]
10174+@   int eo,                   [sp, #0]
10175+@   int width,                [sp, #4]
10176+@   int height)               [sp, #8]
10177+
10178+function ff_hevc_rpi_sao_edge_16_neon_10, export=1
10179+        edge_64b_init   10, 0, 0, 99f
10180+99:
10181+        edge_32bx2_bodies edge_64b_body_16, 2
10182+endfunc
10183+
10184+@ void ff_hevc_rpi_sao_edge_64_neon_10(
10185+@   uint8_t *_dst,            [r0]
10186+@   uint8_t *_src,            [r1]
10187+@   int  stride_dst,          [r2]
10188+@   int16_t *_sao_offset_val, [r3]
10189+@   int eo,                   [sp, #0]
10190+@   int width,                [sp, #4]
10191+@   int height)               [sp, #8]
10192+
10193+@ We simply split the 64 case into 2 vertical stripes
10194+@ and call the fns for w32
10195+@
10196+@ Calling code will always have src != dst so we don't have to worry
10197+@ about edge effects
10198+
10199+function ff_hevc_rpi_sao_edge_64_neon_10, export=1
10200+        edge_64b_init   10, 0, 1, 99f, xjump=1
10201+endfunc
10202+
10203+@ void ff_hevc_rpi_sao_edge_32_neon_10(
10204+@   uint8_t *_dst,            [r0]
10205+@   uint8_t *_src,            [r1]
10206+@   int  stride_dst,          [r2]
10207+@   int16_t *_sao_offset_val, [r3]
10208+@   int eo,                   [sp, #0]
10209+@   int width,                [sp, #4]
10210+@   int height)               [sp, #8]
10211+
10212+function ff_hevc_rpi_sao_edge_32_neon_10, export=1
10213+        edge_64b_init   10, 0, 0, 99f
10214+99:
10215+        edge_64b_bodies edge_64b_body_16, 2
10216+endfunc
10217+
10218+@ ff_hevc_rpi_sao_edge_c_8_neon_10(
10219+@   uint8_t *_dst,                    [r0]
10220+@   const uint8_t *_src,              [r1]
10221+@   ptrdiff_t stride_dst,             [r2]
10222+@   const int16_t *_sao_offset_val_u, [r3]
10223+@   const int16_t *_sao_offset_val_v, [sp, #0]
10224+@   int eo,                           [sp, #4]
10225+@   int width,                        [sp, #8]
10226+@   int height)                       [sp, #12]
10227+
10228+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
10229+        edge_xxb_init   10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
10230+99:
10231+        edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
10232+endfunc
10233+
10234+@ ff_hevc_rpi_sao_edge_c_32_neon_10(
10235+@   uint8_t *_dst,                    [r0]
10236+@   const uint8_t *_src,              [r1]
10237+@   ptrdiff_t stride_dst,             [r2]
10238+@   const int16_t *_sao_offset_val_u, [r3]
10239+@   const int16_t *_sao_offset_val_v, [sp, #0]
10240+@   int eo,                           [sp, #4]
10241+@   int width,                        [sp, #8]
10242+@   int height)                       [sp, #12]
10243+
10244+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
10245+        edge_64b_init   10, 1, 1, 99f, xjump=1
10246+endfunc
10247+
10248+
10249+@ ff_hevc_rpi_sao_edge_c_16_neon_10(
10250+@   uint8_t *_dst,                    [r0]
10251+@   const uint8_t *_src,              [r1]
10252+@   ptrdiff_t stride_dst,             [r2]
10253+@   const int16_t *_sao_offset_val_u, [r3]
10254+@   const int16_t *_sao_offset_val_v, [sp, #0]
10255+@   int eo,                           [sp, #4]
10256+@   int width,                        [sp, #8]
10257+@   int height)                       [sp, #12]
10258+
10259+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
10260+        edge_64b_init   10, 1, 0, 99f
10261+99:
10262+        edge_64b_bodies edge_64b_body_16, 4
10263+endfunc
10264+
10265--- /dev/null
10266+++ b/libavcodec/arm/rpi_hevcpred_arm.h
10267@@ -0,0 +1,28 @@
10268+/*
10269+ * This file is part of FFmpeg.
10270+ *
10271+ * FFmpeg is free software; you can redistribute it and/or
10272+ * modify it under the terms of the GNU Lesser General Public
10273+ * License as published by the Free Software Foundation; either
10274+ * version 2.1 of the License, or (at your option) any later version.
10275+ *
10276+ * FFmpeg is distributed in the hope that it will be useful,
10277+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10278+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10279+ * Lesser General Public License for more details.
10280+ *
10281+ * You should have received a copy of the GNU Lesser General Public
10282+ * License along with FFmpeg; if not, write to the Free Software
10283+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
10284+ */
10285+
10286+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
10287+#define AVCODEC_ARM_HEVCPRED_ARM_H
10288+
10289+#include "libavcodec/rpi_hevcpred.h"
10290+
10291+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
10292+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
10293+
10294+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
10295+
10296--- /dev/null
10297+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
10298@@ -0,0 +1,35 @@
10299+/*
10300+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
10301+ *
10302+ * This file is part of FFmpeg.
10303+ *
10304+ * FFmpeg is free software; you can redistribute it and/or
10305+ * modify it under the terms of the GNU Lesser General Public
10306+ * License as published by the Free Software Foundation; either
10307+ * version 2.1 of the License, or (at your option) any later version.
10308+ *
10309+ * FFmpeg is distributed in the hope that it will be useful,
10310+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10311+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10312+ * Lesser General Public License for more details.
10313+ *
10314+ * You should have received a copy of the GNU Lesser General Public
10315+ * License along with FFmpeg; if not, write to the Free Software
10316+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
10317+ */
10318+
10319+#include "libavutil/attributes.h"
10320+#include "libavutil/cpu.h"
10321+#include "libavutil/arm/cpu.h"
10322+
10323+#include "libavcodec/rpi_hevcpred.h"
10324+#include "rpi_hevcpred_arm.h"
10325+
10326+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
10327+{
10328+    int cpu_flags = av_get_cpu_flags();
10329+
10330+    if (have_neon(cpu_flags))
10331+        ff_hevc_rpi_pred_init_neon(c, bit_depth);
10332+}
10333+
10334--- /dev/null
10335+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
10336@@ -0,0 +1,210 @@
10337+/*
10338+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
10339+ *
10340+ * This file is part of FFmpeg.
10341+ *
10342+ * FFmpeg is free software; you can redistribute it and/or
10343+ * modify it under the terms of the GNU Lesser General Public
10344+ * License as published by the Free Software Foundation; either
10345+ * version 2.1 of the License, or (at your option) any later version.
10346+ *
10347+ * FFmpeg is distributed in the hope that it will be useful,
10348+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10349+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10350+ * Lesser General Public License for more details.
10351+ *
10352+ * You should have received a copy of the GNU Lesser General Public
10353+ * License along with FFmpeg; if not, write to the Free Software
10354+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
10355+ */
10356+
10357+#include "rpi_hevcpred_arm.h"
10358+
10359+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
10360+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
10361+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
10362+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
10363+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
10364+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
10365+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
10366+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
10367+
10368+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10369+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10370+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10371+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10372+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10373+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10374+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10375+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10376+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10377+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10378+void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10379+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10380+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10381+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10382+
10383+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10384+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10385+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10386+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10387+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10388+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10389+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10390+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10391+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10392+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10393+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10394+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10395+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10396+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10397+
10398+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10399+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10400+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10401+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10402+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10403+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10404+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10405+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10406+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10407+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10408+void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10409+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10410+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10411+void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
10412+
10413+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10414+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10415+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10416+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10417+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10418+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10419+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10420+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10421+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10422+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10423+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10424+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10425+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10426+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10427+
10428+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10429+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10430+void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10431+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10432+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10433+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10434+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10435+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10436+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10437+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10438+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10439+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10440+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10441+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
10442+
10443+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
10444+{
10445+    switch (bit_depth)
10446+    {
10447+    case 8:
10448+        c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
10449+        c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8;
10450+        c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16;  // Equivalent to c_4_neon_8
10451+        c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16;
10452+        c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
10453+
10454+        c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
10455+        c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
10456+        c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
10457+        c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
10458+        c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
10459+        c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
10460+        c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
10461+
10462+        c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
10463+        c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
10464+        c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
10465+        c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
10466+        c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
10467+        c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
10468+        c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
10469+
10470+        c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
10471+        c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
10472+        c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
10473+        c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
10474+        c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
10475+        c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
10476+        c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
10477+
10478+        c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
10479+        c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
10480+        c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
10481+        c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
10482+        c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
10483+        c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
10484+        c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
10485+
10486+        c->pred_dc[0]   = ff_hevc_rpi_pred_dc_4_neon_8;
10487+        c->pred_dc[1]   = ff_hevc_rpi_pred_dc_8_neon_8;
10488+        c->pred_dc[2]   = ff_hevc_rpi_pred_dc_16_neon_8;
10489+        c->pred_dc[3]   = ff_hevc_rpi_pred_dc_32_neon_8;
10490+        c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
10491+        c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
10492+        c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
10493+        break;
10494+    case 10:
10495+        c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
10496+        c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
10497+        c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
10498+        c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
10499+        c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
10500+        c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
10501+
10502+        c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
10503+        c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
10504+        c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
10505+        c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
10506+        c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
10507+        c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
10508+        c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
10509+
10510+        c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
10511+        c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
10512+        c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
10513+        c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
10514+        c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
10515+        c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
10516+        c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
10517+
10518+        c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
10519+        c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
10520+        c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
10521+        c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
10522+        c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
10523+        c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
10524+        c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
10525+
10526+        c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
10527+        c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
10528+        c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
10529+        c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
10530+        c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
10531+        c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
10532+        c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
10533+
10534+        c->pred_dc[0]   = ff_hevc_rpi_pred_dc_4_neon_10;
10535+        c->pred_dc[1]   = ff_hevc_rpi_pred_dc_8_neon_10;
10536+        c->pred_dc[2]   = ff_hevc_rpi_pred_dc_16_neon_10;
10537+        c->pred_dc[3]   = ff_hevc_rpi_pred_dc_32_neon_10;
10538+        c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
10539+        c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
10540+        c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
10541+        break;
10542+    default:
10543+        break;
10544+    }
10545+}
10546+
10547--- /dev/null
10548+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
10549@@ -0,0 +1,2984 @@
10550+/*
10551+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
10552+All rights reserved.
10553+
10554+Redistribution and use in source and binary forms, with or without
10555+modification, are permitted provided that the following conditions are met:
10556+    * Redistributions of source code must retain the above copyright
10557+      notice, this list of conditions and the following disclaimer.
10558+    * Redistributions in binary form must reproduce the above copyright
10559+      notice, this list of conditions and the following disclaimer in the
10560+      documentation and/or other materials provided with the distribution.
10561+    * Neither the name of the copyright holder nor the
10562+      names of its contributors may be used to endorse or promote products
10563+      derived from this software without specific prior written permission.
10564+
10565+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
10566+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10567+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
10568+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
10569+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
10570+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
10571+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
10572+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
10573+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
10574+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
10575+
10576+Authors: John Cox, Ben Avison
10577+*/
10578+
10579+/*
10580+ * General angular pred
10581+ *
10582+ * Horizontal (10) & Vertical (26) cases have their own file
10583+ * and are not dealt with properly here (luma filtering is missing)
10584+ *
10585+ * The inv_angle calculations are annoying - if it wasn't for the +128
10586+ * rounding step then the result would simply be the loop counter :-(
10587+ */
10588+
10589+
10590+#include "libavutil/arm/asm.S"
10591+#include "neon.S"
10592+
10593+.text
10594+
10595+@ Horizontal Patch functions
10596+@ These need a transpose before store so exist as smaller patches
10597+@ Patches can be called repeatedly without any intermediate setup
10598+@ to generate a horizontal block
10599+@
10600+@ It is almost certainly the case that larger patch fns can be built
10601+@ and they would be a little faster, but we would still need the small
10602+@ fns and code size (or at least instruction cache size) is an issue
10603+@ given how much code we already have here
10604+
10605+@ Generate 8x8 luma 8 patch
10606+@
10607+@ r3   Out stride
10608+@ r4   Angle add
10609+@ r7   Inv angle (_up only)
10610+@
10611+@ In/Out (updated)
10612+@ r0   Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
10613+@ r2   Left ptr - updated
10614+@ r10  Inv angle accumulator (_up only)
10615+@ r12  32 - angle frac (_down) or angle frac (_up)
10616+@ d0   Older reference samples
10617+@ d1=r8+r9  Newer reference samples
10618+@ d2   32 - angle frac
10619+@ d3   Angle frac
10620+@ q2   Partially computed next result (_up only)
10621+@
10622+@ Temps
10623+@ r5   Loop counter
10624+@ r6
10625+@ r7   (_down only)
10626+@ r11  (_up only)
10627+@ q2, q8-q11
10628+
10629+patch_h_down_8x8_8:
10630+        ldrd        r8, r9, [r2]        @ Left
10631+        rsb         r12, r6, #32
10632+        vmov        d0, r8, r9
10633+        vdup.8      d3, r6
10634+        lsr         r8, #8
10635+        vdup.8      d2, r12
10636+        orr         r8, r8, r9, lsl #24
10637+        ldr         r9, [r2, #5]!
10638+        vmov        d1, r8, r9
10639+        // drop through...
10640+patch_h_down_8x8_8_continue:
10641+        mov         r5, #8
10642+1:
10643+          subs        r12, r4
10644+        vmull.u8    q2, d0, d2
10645+          it          mi
10646+          addmi       r12, #32
10647+        vmlal.u8    q2, d1, d3
10648+          rsb         r6, r12, #32
10649+        vext.8      q8, q8, q9, #8
10650+          itt         mi
10651+          lsrmi       r7, r8, #8
10652+          vmovmi      d0, r8, r9
10653+          vdup.8      d2, r12
10654+        vext.8      q9, q9, q10, #8
10655+          it          mi
10656+          orrmi       r8, r7, r9, lsl #24
10657+        vext.8      q10, q10, q11, #8
10658+          it          mi
10659+          ldrmi       r9, [r2, #1]!
10660+        vmov        d22, d23
10661+        vrshrn.u16  d23, q2, #5
10662+          it          mi
10663+          vmovmi      d1, r8, r9
10664+        subs        r5, #1
10665+          vdup.8      d3, r6
10666+        bne         1b
10667+        // drop through...
10668+store_tran_8x8_8:
10669+        vzip.8      d16, d17
10670+        add         r6, r0, r3
10671+        vzip.8      d18, d19
10672+        lsl         r3, #1
10673+        vzip.8      d20, d21
10674+        add         r5, r0, r3
10675+        vzip.8      d22, d23
10676+        vzip.16     q8, q9
10677+        vzip.16     q10, q11
10678+        vzip.32     q8, q10
10679+        vzip.32     q9, q11
10680+        vst1.8      {d16}, [r0]!
10681+        vst1.8      {d17}, [r6], r3
10682+        vst1.8      {d20}, [r5], r3
10683+        vst1.8      {d21}, [r6], r3
10684+        vst1.8      {d18}, [r5], r3
10685+        vst1.8      {d19}, [r6], r3
10686+        vst1.8      {d22}, [r5]
10687+        asr         r3, #1
10688+        vst1.8      {d23}, [r6]
10689+
10690+        bx          lr
10691+
10692+patch_h_up_8x8_8:
10693+        ldrd        r8, r9, [r2]
10694+        rsb         r6, r4, #32
10695+        vmov        d0, r8, r9
10696+        vdup.8      d3, r4
10697+        lsr         r11, r8, #24
10698+        vdup.8      d2, r6
10699+        ldr         r8, [r2, #-1]!
10700+        orr         r9, r11, r9, lsl #8
10701+        vmov        d1, r8, r9
10702+        mov         r12, r4
10703+        vmull.u8    q2, d0, d2
10704+        vmlal.u8    q2, d1, d3
10705+patch_h_up_8x8_8_continue:
10706+        mov         r5, #8
10707+1:
10708+          add         r12, r4
10709+          mov         r11, #0
10710+          cmp         r12, #33
10711+          it          cs
10712+          addcs       r10, r7
10713+        vext.8      q8, q8, q9, #8
10714+          itt         cs
10715+          subcs       r12, #32
10716+          tstcs       r10, #1<<31
10717+          rsb         r6, r12, #32
10718+          it          eq
10719+          asreq       r11, r10, #8
10720+          it          cs
10721+          vmovcs      d0, r8, r9
10722+          vdup.8      d2, r6
10723+          it          cs
10724+          lsrcs       r6, r8, #24
10725+        vext.8      q9, q9, q10, #8
10726+          itt         cs
10727+          orrcs       r9, r6, r9, lsl #8
10728+          ldrbcs      r11, [r1, r11]
10729+          vdup.8      d3, r12
10730+        vext.8      q10, q10, q11, #8
10731+          it          hi
10732+          ldrbhi      r11, [r2, #-1]!
10733+        vmov        d22, d23
10734+        vrshrn.u16  d23, q2, #5
10735+          itt         cs
10736+          orrcs       r8, r11, r8, lsl #8
10737+          vmovcs      d1, r8, r9
10738+          vmull.u8    q2, d0, d2
10739+        subs        r5, #1
10740+          vmlal.u8    q2, d1, d3
10741+        bne         1b
10742+
10743+        b           store_tran_8x8_8
10744+
10745+
10746+.macro ADRT reg, val
10747+@ adr in T32 has enough range but not in A32
10748+A       adrl        \reg, \val
10749+T       adr         \reg, \val
10750+.endm
10751+
10752+@ ff_hevc_rpi_pred_angular_4_neon_8
10753+@       uint8_t *_src,          [r0]
10754+@       const uint8_t *_top,    [r1]
10755+@       const uint8_t *_left,   [r2]
10756+@       ptrdiff_t stride        [r3]
10757+@       unsigned int mode       [sp, #0]  2..34
10758+
10759+function ff_hevc_rpi_pred_angular_4_neon_8, export=1
10760+        ldr         r12, [sp]
10761+        push        {r4-r8, lr}
10762+        ADRT        r4, angle_2 - 2
10763+        ADRT        r7, inv_angle - 11*2
10764+        add         r7, r7, r12, lsl #1
10765+        ldrsb       r6, [r4, r12]
10766+        cmp         r12, #26
10767+        ldrsb       r4, [r4, r12]
10768+        bge         26f
10769+        cmp         r12, #18
10770+        bge         18f
10771+        cmp         r12, #10
10772+        bge         10f
10773+
10774+@ Down of Horizontal - works down left
10775+        ldr         lr, [r2], #1        @ Top
10776+        rsb         r12, r6, #32
10777+        vmov        s0, lr
10778+        vdup.8      d3, r6
10779+        ldr         lr, [r2], #1
10780+        vdup.8      d2, r12
10781+        vmov        s2, lr
10782+          subs        r12, r4
10783+        vmull.u8    q2, d0, d2
10784+          it          mi
10785+          addmi       r12, #32
10786+        vmlal.u8    q2, d1, d3
10787+          rsb         r6, r12, #32
10788+          itt         mi
10789+          vmovmi      s0, lr
10790+          ldrmi       lr, [r2], #1
10791+          vdup.8      d2, r12
10792+          it          mi
10793+          vmovmi      s2, lr
10794+          vdup.8      d3, r6
10795+        mov         r5, #2
10796+1:
10797+        vrshrn.u16  d20, q2, #5
10798+            subs        r12, r4
10799+          vmull.u8    q2, d0, d2
10800+            it          mi
10801+            addmi       r12, #32
10802+          vmlal.u8    q2, d1, d3
10803+            rsb         r6, r12, #32
10804+        vext.64     q8, q8, q9, #1
10805+            it          mi
10806+            vmovmi      s0, lr
10807+        vext.64     q9, q9, q10, #1
10808+            it          mi
10809+            ldrmi       lr, [r2], #1
10810+            vdup.8      d2, r12
10811+            it          mi
10812+            vmovmi      s2, lr
10813+        subs        r5, #1
10814+            vdup.8      d3, r6
10815+        bne         1b
10816+
10817+          vrshrn.u16  d20, q2, #5
10818+            vmull.u8    q2, d0, d2
10819+        add         r12, r0,  r3
10820+            vmlal.u8    q2, d1, d3
10821+        lsl         r3,  #1
10822+          vext.64     q8, q8, q9, #1
10823+          vext.64     q9, q9, q10, #1
10824+            vrshrn.u16  d20, q2, #5
10825+
10826+98:
10827+        vst4.8      {d17[0], d18[0], d19[0], d20[0]}, [r0], r3
10828+        vst4.8      {d17[1], d18[1], d19[1], d20[1]}, [r12], r3
10829+        vst4.8      {d17[2], d18[2], d19[2], d20[2]}, [r0]
10830+        vst4.8      {d17[3], d18[3], d19[3], d20[3]}, [r12]
10831+        pop        {r4-r8, pc}
10832+
10833+@ Up of Horizontal - works upwards (bottom to top)
10834+10:
10835+        ldrh        r7, [r7]
10836+        rsb         r12, r6, #32
10837+        ldr         lr, [r2]            @ Left
10838+        ldrb        r2, [r2, #-1]       @ Top-left
10839+        vmov        s0, lr
10840+        vdup.8      d2, r12
10841+        vdup.8      d3, r6
10842+        orr         lr, r2, lr, lsl #8
10843+        vmov        s2, lr
10844+        sub         r8, r7, #128
10845+        mov         r5, #3
10846+2:
10847+        vmull.u8    q2, d0, d2
10848+          subs        r12, r4
10849+        vmlal.u8    q2, d1, d3
10850+T         it          mi
10851+          addmi       r12, #32
10852+T         asr         r6, r8, #8
10853+T         it          mi
10854+T         ldrbmi      r2, [r1, r6]
10855+A         ldrbmi      r2, [r1, r8, asr #8]
10856+          rsb         r6, r12, #32
10857+          vdup.8      d2, r12
10858+          ittt        mi
10859+          vmovmi      s0, lr
10860+          orrmi       lr, r2, lr, lsl #8
10861+          vmovmi      s2, lr
10862+        vrshrn.u16  d20, q2, #5
10863+          vdup.8      d3, r6
10864+          it          mi
10865+          addmi       r8, r7
10866+        subs        r5, #1
10867+        vext.64     q8, q8, q9, #1
10868+        vext.64     q9, q9, q10, #1
10869+        bne         2b
10870+
10871+          vmull.u8    q2, d0, d2
10872+        add         r12, r0,  r3
10873+          vmlal.u8    q2, d1, d3
10874+        lsl         r3,  #1
10875+          vrshrn.u16  d20, q2, #5
10876+        b           98b
10877+
10878+@ Left of vertical - works down left
10879+18:
10880+        ldrh        r7, [r7]
10881+        rsb         r12, r6, #32
10882+        ldr         lr, [r1]            @ Top
10883+        ldrb        r1, [r2, #-1]       @ Top-left
10884+        vmov        s0, lr
10885+        vdup.8      d2, r12
10886+        vdup.8      d3, r6
10887+        orr         lr, r1, lr, lsl #8
10888+        vmov        s2, lr
10889+        sub         r8, r7, #128
10890+        mov         r5, #3
10891+2:
10892+        vmull.u8    q2, d0, d2
10893+          subs        r12, r4
10894+        vmlal.u8    q2, d1, d3
10895+T         it          mi
10896+          addmi       r12, #32
10897+T         asr         r6, r8, #8
10898+T         it          mi
10899+T         ldrbmi      r1, [r2, r6]
10900+A         ldrbmi      r1, [r2, r8, asr #8]
10901+          rsb         r6, r12, #32
10902+          vdup.8      d2, r12
10903+          ittt        mi
10904+          vmovmi      s0, lr
10905+          orrmi       lr, r1, lr, lsl #8
10906+          vmovmi      s2, lr
10907+        vrshrn.u16  d4, q2, #5
10908+          vdup.8      d3, r6
10909+          it          mi
10910+          addmi       r8, r7
10911+        subs        r5, #1
10912+        vst1.32     {d4[0]}, [r0], r3
10913+        bne         2b
10914+
10915+          vmull.u8    q2, d0, d2
10916+          vmlal.u8    q2, d1, d3
10917+          vrshrn.u16  d4, q2, #5
10918+          vst1.32     {d4[0]}, [r0]
10919+
10920+        pop         {r4-r8, pc}
10921+
10922+@ Right of vertical - works along top - left unused
10923+26:
10924+        ldr         lr, [r1], #1        @ Top
10925+        rsb         r12, r6, #32
10926+        vmov        s0, lr
10927+        vdup.8      d3, r6
10928+        ldr         lr, [r1], #1
10929+        vdup.8      d2, r12
10930+        vmov        s2, lr
10931+          subs        r12, r4
10932+        vmull.u8    q2, d0, d2
10933+          it          mi
10934+          addmi       r12, #32
10935+        vmlal.u8    q2, d1, d3
10936+          rsb         r6, r12, #32
10937+          itt         mi
10938+          vmovmi      s0, lr
10939+          ldrmi       lr, [r1], #1
10940+          vdup.8      d2, r12
10941+          it          mi
10942+          vmovmi      s2, lr
10943+          vdup.8      d3, r6
10944+        mov         r5, #2
10945+1:
10946+        vrshrn.u16  d6, q2, #5
10947+            subs        r12, r4
10948+          vmull.u8    q2, d0, d2
10949+            it          mi
10950+            addmi       r12, #32
10951+          vmlal.u8    q2, d1, d3
10952+            rsb         r6, r12, #32
10953+        vst1.32     {d6[0]}, [r0], r3
10954+            itt         mi
10955+            vmovmi      s0, lr
10956+            ldrmi       lr, [r1], #1
10957+            vdup.8      d2, r12
10958+            it          mi
10959+            vmovmi      s2, lr
10960+        subs        r5, #1
10961+            vdup.8      d3, r6
10962+        bne         1b
10963+
10964+          vrshrn.u16  d6, q2, #5
10965+            vmull.u8    q2, d0, d2
10966+            vmlal.u8    q2, d1, d3
10967+          vst1.32     {d6[0]}, [r0], r3
10968+            vrshrn.u16  d6, q2, #5
10969+            vst1.32     {d6[0]}, [r0]
10970+
10971+        pop         {r4-r8, pc}
10972+
10973+endfunc
10974+
10975+
10976+
10977+@ ff_hevc_rpi_pred_angular_8_neon_8
10978+@       uint8_t *_src,          [r0]
10979+@       const uint8_t *_top,    [r1]
10980+@       const uint8_t *_left,   [r2]
10981+@       ptrdiff_t stride        [r3]
10982+@       unsigned int mode       [sp, #0]  2..34
10983+
10984+function ff_hevc_rpi_pred_angular_8_neon_8, export=1
10985+        ldr         r12, [sp]
10986+        push        {r4-r11, lr}
10987+        ADRT        r4, angle_2 - 2
10988+        ADRT        r7, inv_angle - 11*2
10989+        add         r7, r7, r12, lsl #1
10990+        ldrsb       r6, [r4, r12]
10991+        cmp         r12, #26
10992+        ldrsb       r4, [r4, r12]
10993+        bge         26f
10994+        cmp         r12, #18
10995+        bge         18f
10996+        cmp         r12, #10
10997+        bge         10f
10998+
10999+@ Down of Horizontal - works down left
11000+        bl          patch_h_down_8x8_8
11001+        pop         {r4-r11, pc}
11002+
11003+@ Up of Horizontal - works upwards (bottom to top)
11004+10:
11005+        ldrh        r7, [r7]
11006+        mov         r10, #-128
11007+        bl          patch_h_up_8x8_8
11008+        pop         {r4-r11, pc}
11009+
11010+@ Left of vertical - works down left
11011+18:
11012+        ldrd        r8, r9, [r1]        @ Top
11013+        rsb         r12, r6, #32
11014+        ldrb        lr, [r2, #-1]       @ Top-left
11015+        ldrh        r7, [r7]
11016+        vmov        d0, r8, r9
11017+        lsl         r9, r9, #8
11018+        vdup.8      d2, r12
11019+        orr         r9, r9, r8, lsr #24
11020+        orr         r8, lr, r8, lsl #8
11021+        vmov        d1, r8, r9
11022+        sub         r1, r7, #128
11023+        mov         r5, #7
11024+1:
11025+        vdup.8      d3, r6
11026+        vmull.u8    q2, d0, d2
11027+          subs        r12, r12, r4
11028+        vmlal.u8    q2, d1, d3
11029+          ittt        mi
11030+          addmi       lr, r2, r1, asr #8
11031+          addmi       r12, r12, #32
11032+          vmovmi      d0, r8, r9
11033+          rsb         r6, r12, #32
11034+          itt         mi
11035+          lslmi       r9, r9, #8
11036+          ldrbmi      lr, [lr]
11037+          vdup.8      d2, r12
11038+        vrshrn.u16  d4, q2, #5
11039+          itttt       mi
11040+          orrmi       r9, r9, r8, lsr #24
11041+          orrmi       r8, lr, r8, lsl #8
11042+          vmovmi      d1, r8, r9
11043+          addmi       r1, r1, r7
11044+        subs        r5, r5, #1
11045+        vst1.8      {d4}, [r0], r3
11046+        bne         1b
11047+
11048+          vdup.8      d3, r6
11049+          vmull.u8    q2, d0, d2
11050+          vmlal.u8    q2, d1, d3
11051+          vrshrn.u16  d4, q2, #5
11052+          vst1.8      {d4}, [r0]
11053+
11054+        pop         {r4-r11, pc}
11055+
11056+@ Right of vertical - works along top - left unused
11057+26:
11058+        ldrd        r8, r9, [r1]        @ Top
11059+        rsb         r12, r6, #32
11060+        vmov        d0, r8, r9
11061+        vdup.8      d3, r6
11062+        mov         r5, #7
11063+        lsr         r8, #8
11064+        vdup.8      d2, r12
11065+        orr         r8, r8, r9, lsl #24
11066+        ldr         r9, [r1, #5]!
11067+        vmov        d1, r8, r9
11068+1:
11069+        vmull.u8    q2, d0, d2
11070+          subs        r12, r4
11071+        vmlal.u8    q2, d1, d3
11072+          it          mi
11073+          addmi       r12, #32
11074+          rsb         r6, r12, #32
11075+          itt         mi
11076+          vmovmi      d0, r8, r9
11077+          lsrmi       r8, #8
11078+          vdup.8      d2, r12
11079+          itt         mi
11080+          orrmi       r8, r8, r9, lsl #24
11081+          ldrmi       r9, [r1, #1]!
11082+        vrshrn.u16  d6, q2, #5
11083+          it          mi
11084+          vmovmi      d1, r8, r9
11085+          vdup.8      d3, r6
11086+        subs        r5, #1
11087+        vst1.8      {d6}, [r0], r3
11088+        bne         1b
11089+
11090+          vmull.u8    q2, d0, d2
11091+          vmlal.u8    q2, d1, d3
11092+          vrshrn.u16  d6, q2, #5
11093+          vst1.8      {d6}, [r0]
11094+
11095+        pop         {r4-r11, pc}
11096+
11097+endfunc
11098+
11099+
11100+@ ff_hevc_rpi_pred_angular_16_neon_8
11101+@       uint8_t *_src,          [r0]
11102+@       const uint8_t *_top,    [r1]
11103+@       const uint8_t *_left,   [r2]
11104+@       ptrdiff_t stride        [r3]
11105+@       unsigned int mode       [sp, #0]  2..34
11106+
11107+function ff_hevc_rpi_pred_angular_16_neon_8, export=1
11108+        ldr         r12, [sp]
11109+        push        {r4-r11, lr}
11110+        ADRT        r4, angle_2 - 2
11111+        ADRT        r7, inv_angle - 11*2
11112+        add         r7, r7, r12, lsl #1
11113+        ldrsb       r6, [r4, r12]
11114+        cmp         r12, #26
11115+        ldrsb       r4, [r4, r12]
11116+        bge         26f
11117+        cmp         r12, #18
11118+        bge         18f
11119+        cmp         r12, #10
11120+        bge         10f
11121+
11122+@ Down of Horizontal - works down left
11123+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
11124+
11125+        bl          patch_h_down_8x8_8
11126+        bl          patch_h_down_8x8_8_continue
11127+
11128+        add         r2, r1, #8          @ restore r2, but 8 rows further down left
11129+        sub         r0, #16
11130+        mov         r6, r4
11131+        add         r0, r0, r3, lsl #3
11132+
11133+        bl          patch_h_down_8x8_8
11134+        bl          patch_h_down_8x8_8_continue
11135+
11136+        pop         {r4-r11, pc}
11137+
11138+@ Up of Horizontal - works upwards (bottom to top)
11139+10:
11140+        ldrh        r7, [r7]
11141+        mov         r10, #-128
11142+
11143+        push        {r2}
11144+        bl          patch_h_up_8x8_8
11145+        bl          patch_h_up_8x8_8_continue
11146+        pop         {r2}
11147+
11148+        sub         r0, #16
11149+        mov         r10, #-128
11150+        add         r2, #8
11151+        add         r0, r0, r3, lsl #3
11152+        sub         r10, r10, r7, lsl #3
11153+
11154+        bl          patch_h_up_8x8_8
11155+        bl          patch_h_up_8x8_8_continue
11156+
11157+        pop         {r4-r11, pc}
11158+
11159+@ Left of vertical - works down left
11160+18:
11161+        vld1.8      {q9}, [r1]
11162+        sub         r1, r2, #1
11163+        rsb         r12, r6, #32
11164+        ldrh        r7, [r7]
11165+        vdup.8      d6, r6
11166+        vext.8      q8, q9, q9, #15
11167+        sub         r8, r7, #128
11168+        vld1.8      {d16[0]}, [r1]
11169+        vdup.8      d7, r12
11170+        mov         r5, #15
11171+1:
11172+        vmull.u8    q0, d18, d7
11173+        subs        r12, r4
11174+        vmlal.u8    q0, d16, d6
11175+        it          cc
11176+        addcc       r12, #32
11177+        vmull.u8    q1, d19, d7
11178+        it          cc
11179+        addcc       r1, r2, r8, asr #8
11180+        vmlal.u8    q1, d17, d6
11181+        rsb         r6, r12, #32
11182+        vext.8      q10, q8, q8, #15
11183+        sub         r5, #1
11184+        vld1.8      {d20[0]}, [r1]
11185+        it          cc
11186+        addcc       r8, r7
11187+        vmov        q11, q8
11188+        teq         r5, #0
11189+        vrshrn.u16  d0, q0, #5
11190+        vrshrn.u16  d1, q1, #5
11191+        vdup.8      d6, r6
11192+        vdup.8      d7, r12
11193+        vst1.8      {q0}, [r0], r3
11194+        bhi         1b
11195+        beq         4f
11196+2:
11197+        vmull.u8    q0, d22, d7
11198+        subs        r12, r4
11199+        vmlal.u8    q0, d20, d6
11200+        it          cc
11201+        addcc       r12, #32
11202+        vmull.u8    q1, d23, d7
11203+        it          cc
11204+        addcc       r1, r2, r8, asr #8
11205+        vmlal.u8    q1, d21, d6
11206+        rsb         r6, r12, #32
11207+        vext.8      q8, q10, q10, #15
11208+        sub         r5, #1
11209+        vld1.8      {d16[0]}, [r1]
11210+        it          cc
11211+        addcc       r8, r7
11212+        vmov        q9, q10
11213+        teq         r5, #0
11214+        vrshrn.u16  d0, q0, #5
11215+        vrshrn.u16  d1, q1, #5
11216+        vdup.8      d6, r6
11217+        vdup.8      d7, r12
11218+        vst1.8      {q0}, [r0], r3
11219+        bhi         2b
11220+        bne         1b
11221+        bcc         5f
11222+3:
11223+        vmull.u8    q0, d22, d7
11224+        vmlal.u8    q0, d20, d6
11225+        vmull.u8    q1, d23, d7
11226+        vmlal.u8    q1, d21, d6
11227+        vrshrn.u16  d0, q0, #5
11228+        vrshrn.u16  d1, q1, #5
11229+        vst1.8      {q0}, [r0]
11230+
11231+        pop         {r4-r11, pc}
11232+4:
11233+        bcc         3b
11234+5:
11235+        vmull.u8    q0, d18, d7
11236+        vmlal.u8    q0, d16, d6
11237+        vmull.u8    q1, d19, d7
11238+        vmlal.u8    q1, d17, d6
11239+        vrshrn.u16  d0, q0, #5
11240+        vrshrn.u16  d1, q1, #5
11241+        vst1.8      {q0}, [r0]
11242+
11243+        pop         {r4-r11, pc}
11244+
11245+@ Right of vertical - works along top - left unused
11246+26:
11247+        vld1.8      {q9}, [r1]!
11248+        rsb         r12, r6, #32
11249+        vdup.8      d6, r6
11250+        vdup.8      d7, r12
11251+        vext.8      q8, q9, q9, #1
11252+        vld1.8      {d17[7]}, [r1]!
11253+        mov         r5, #15
11254+1:
11255+        vmull.u8    q0, d16, d6
11256+        subs        r12, r4
11257+        vmlal.u8    q0, d18, d7
11258+        it          cc
11259+        addcc       r12, #32
11260+        vmull.u8    q1, d17, d6
11261+        rsb         r6, r12, #32
11262+        vmlal.u8    q1, d19, d7
11263+        sub         r5, #1
11264+        vext.8      q10, q8, q8, #1
11265+        teq         r5, #0
11266+        vld1.8      {d21[7]}, [r1]
11267+        it          cc
11268+        addcc       r1, #1
11269+        vmov        q11, q8
11270+        vrshrn.u16  d0, q0, #5
11271+        vrshrn.u16  d1, q1, #5
11272+        vdup.8      d6, r6
11273+        vdup.8      d7, r12
11274+        vst1.8      {q0}, [r0], r3
11275+        bhi         1b
11276+        beq         4f
11277+2:
11278+        vmull.u8    q0, d20, d6
11279+        subs        r12, r4
11280+        vmlal.u8    q0, d22, d7
11281+        it          cc
11282+        addcc       r12, #32
11283+        vmull.u8    q1, d21, d6
11284+        rsb         r6, r12, #32
11285+        vmlal.u8    q1, d23, d7
11286+        sub         r5, #1
11287+        vext.8      q8, q10, q10, #1
11288+        teq         r5, #0
11289+        vld1.8      {d17[7]}, [r1]
11290+        it          cc
11291+        addcc       r1, #1
11292+        vmov        q9, q10
11293+        vrshrn.u16  d0, q0, #5
11294+        vrshrn.u16  d1, q1, #5
11295+        vdup.8      d6, r6
11296+        vdup.8      d7, r12
11297+        vst1.8      {q0}, [r0], r3
11298+        bhi         2b
11299+        bne         1b
11300+        bcc         5f
11301+3:
11302+        vmull.u8    q0, d20, d6
11303+        vmlal.u8    q0, d22, d7
11304+        vmull.u8    q1, d21, d6
11305+        vmlal.u8    q1, d23, d7
11306+        vrshrn.u16  d0, q0, #5
11307+        vrshrn.u16  d1, q1, #5
11308+        vst1.8      {q0}, [r0]
11309+
11310+        pop         {r4-r11, pc}
11311+4:
11312+        bcc         3b
11313+5:
11314+        vmull.u8    q0, d16, d6
11315+        vmlal.u8    q0, d18, d7
11316+        vmull.u8    q1, d17, d6
11317+        vmlal.u8    q1, d19, d7
11318+        vrshrn.u16  d0, q0, #5
11319+        vrshrn.u16  d1, q1, #5
11320+        vst1.8      {q0}, [r0]
11321+
11322+        pop         {r4-r11, pc}
11323+
11324+endfunc
11325+
11326+
11327+@ ff_hevc_rpi_pred_angular_32_neon_8
11328+@       uint8_t *_src,          [r0]
11329+@       const uint8_t *_top,    [r1]
11330+@       const uint8_t *_left,   [r2]
11331+@       ptrdiff_t stride        [r3]
11332+@       unsigned int mode       [sp, #0]  2..34
11333+
11334+function ff_hevc_rpi_pred_angular_32_neon_8, export=1
11335+        ldr         r12, [sp]
11336+        push        {r4-r11, lr}
11337+        ADRT        r4, angle_2 - 2
11338+        ADRT        r7, inv_angle - 11*2
11339+        add         r7, r7, r12, lsl #1
11340+        ldrsb       r6, [r4, r12]
11341+        cmp         r12, #26
11342+        ldrsb       r4, [r4, r12]
11343+        bge         26f
11344+        cmp         r12, #18
11345+        bge         18f
11346+        cmp         r12, #10
11347+        bge         10f
11348+
11349+@ Down of Horizontal - works down left
11350+        mov         r10, #4
11351+        mov         r1, r2
11352+1:
11353+        bl          patch_h_down_8x8_8
11354+        bl          patch_h_down_8x8_8_continue
11355+        bl          patch_h_down_8x8_8_continue
11356+        bl          patch_h_down_8x8_8_continue
11357+
11358+        add         r2, r1, #8          @ restore r2, but 8 rows further down left
11359+        add         r1, r1, #8
11360+        mov         r6, r4
11361+        sub         r0, #32
11362+        subs        r10, #1
11363+        add         r0, r0, r3, lsl #3
11364+        bne         1b
11365+
11366+        pop        {r4-r11, pc}
11367+
11368+@ Up of Horizontal - works upwards (bottom to top)
11369+10:
11370+        ldrh        r7, [r7]
11371+        mov         r10, #-128
11372+        vmov.i8     d6, #1<<2
11373+1:
11374+        push        {r2,r10}
11375+        bl          patch_h_up_8x8_8
11376+        bl          patch_h_up_8x8_8_continue
11377+        bl          patch_h_up_8x8_8_continue
11378+        bl          patch_h_up_8x8_8_continue
11379+        pop         {r2,r10}
11380+
11381+        vmov        r8, s12
11382+        sub         r0, #32
11383+        add         r2, #8
11384+        add         r0, r0, r3, lsl #3
11385+        sub         r10, r10, r7, lsl #3
11386+        vshr.u8     d6, #1
11387+        teq         r8, #0
11388+        bne         1b
11389+
11390+        pop        {r4-r11, pc}
11391+
11392+@ Left of vertical - works down left
11393+18:
11394+        vld1.8      {q0-q1}, [r1]
11395+        sub         r9, r2, #1
11396+        rsb         r12, r6, #32
11397+        ldrh        r7, [r7]
11398+        mov         r8, #-128
11399+        vdup.8      d18, r6
11400+        vdup.8      d19, r12
11401+        mov         r5, #32
11402+1:
11403+        vld1.8      {d17[7]}, [r9]
11404+        add         r8, r7
11405+        vmov        q2, q0
11406+        vmov        q3, q1
11407+        add         r9, r2, r8, asr #8
11408+        vext.8      q1, q0, q1, #15
11409+        vext.8      q0, q8, q0, #15
11410+2:
11411+        vmull.u8    q10, d4, d19
11412+        subs        r12, r4
11413+        vmlal.u8    q10, d0, d18
11414+        it          cc
11415+        addcc       r12, #32
11416+        vmull.u8    q11, d5, d19
11417+        rsb         r6, r12, #32
11418+        vmlal.u8    q11, d1, d18
11419+        sub         r5, #1
11420+        vmull.u8    q12, d6, d19
11421+        teq         r5, #0
11422+        vmlal.u8    q12, d2, d18
11423+        vmull.u8    q13, d7, d19
11424+        vmlal.u8    q13, d3, d18
11425+        vdup.8      d18, r6
11426+        vdup.8      d19, r12
11427+        vrshrn.u16  d20, q10, #5
11428+        vrshrn.u16  d21, q11, #5
11429+        vrshrn.u16  d22, q12, #5
11430+        vrshrn.u16  d23, q13, #5
11431+        vst1.8      {q10-q11}, [r0], r3
11432+        bhi         2b
11433+        bne         1b
11434+
11435+        pop         {r4-r11, pc}
11436+
11437+@ Right of vertical - works along top - left unused
11438+26:
11439+        add         r5, r1, #32
11440+        vld1.8      {q0-q1}, [r1]!
11441+        rsb         r12, r6, #32
11442+        vld1.8      {d16[0]}, [r5]
11443+        mov         r5, #32
11444+        vdup.8      d18, r6
11445+        vdup.8      d19, r12
11446+1:
11447+        vmov        q2, q0
11448+        add         r1, #1
11449+        vmov        q3, q1
11450+        vext.8      q0, q0, q1, #1
11451+        vext.8      q1, q1, q8, #1
11452+2:
11453+        vmull.u8    q10, d0, d18
11454+        subs        r12, r4
11455+        vmlal.u8    q10, d4, d19
11456+        it          cc
11457+        addcc       r12, #32
11458+        vmull.u8    q11, d1, d18
11459+        rsb         r6, r12, #32
11460+        vmlal.u8    q11, d5, d19
11461+        sub         r5, #1
11462+        vmull.u8    q12, d2, d18
11463+        teq         r5, #0
11464+        vmlal.u8    q12, d6, d19
11465+        vmull.u8    q13, d3, d18
11466+        vmlal.u8    q13, d7, d19
11467+        vld1.8      {d16[0]}, [r1]
11468+        vdup.8      d18, r6
11469+        vdup.8      d19, r12
11470+        vrshrn.u16  d20, q10, #5
11471+        vrshrn.u16  d21, q11, #5
11472+        vrshrn.u16  d22, q12, #5
11473+        vrshrn.u16  d23, q13, #5
11474+        vst1.8      {q10-q11}, [r0], r3
11475+        bhi         2b
11476+        bne         1b
11477+
11478+        pop         {r4-r11, pc}
11479+
11480+endfunc
11481+
11482+
11483+@ Chroma 8 bit 4x4 patch fns
11484+        .text
11485+
11486+patch_h_down_c_4x4_8:                   @ 4x4 chroma (8-bit) patch, "down of horizontal" modes; falls through to transpose-store
11487+        ldrd        r8, r9, [r2]        @ Left edge: four 16-bit chroma (U,V) pairs
11488+        rsb         r12, r6, #32        @ r12 = 32 - fraction
11489+        vmov        d0, r8, r9          @ d0 = ref pairs 0..3
11490+        vdup.8      d3, r6              @ d3 = fraction weight
11491+        lsr         r8, #16
11492+        vdup.8      d2, r12             @ d2 = (32 - fraction) weight
11493+        orr         r8, r8, r9, lsl #16 @ shift refs along by one chroma pair
11494+        ldr         r9, [r2, #6]!       @ advance left ptr; fetch pairs 3..4
11495+        vmov        d1, r8, r9          @ d1 = ref pairs 1..4
11496+        // drop through...
11497+patch_h_down_c_4x4_8_continue:          @ entry for subsequent 4x4 strips (weights/refs already live)
11498+        mov         r5, #4              @ 4 columns per patch
11499+1:                                      @ extra-indented ops are scalar, interleaved with NEON (presumably for dual-issue - TODO confirm)
11500+          subs        r12, r4           @ advance fraction by angle step; mi = wrapped, need next ref pair
11501+        vmull.u8    q2, d0, d2          @ (32-frac)*ref[i]
11502+          it          mi
11503+          addmi       r12, #32
11504+        vmlal.u8    q2, d1, d3          @ + frac*ref[i+1]
11505+          rsb         r6, r12, #32
11506+        vext.8      q8, q8, q9, #8      @ shift previous columns along the output queue q8:q9
11507+          it          mi
11508+          lsrmi       r7, r8, #16
11509+        vmov        d18, d19
11510+          it          mi
11511+          vmovmi      d0, r8, r9        @ mi: step refs down-left by one pair
11512+          vdup.8      d2, r12
11513+          it          mi
11514+          orrmi       r8, r7, r9, lsl #16
11515+        vrshrn.u16  d19, q2, #5         @ newest column -> d19 (round, >>5)
11516+          itt         mi
11517+          ldrmi       r9, [r2, #2]!     @ mi: fetch next pair from left edge
11518+          vmovmi      d1, r8, r9
11519+        subs        r5, #1
11520+          vdup.8      d3, r6
11521+        bne         1b
11522+        // drop through...
11523+store_tran_c_4x4_8:                     @ transpose the 4 columns in q8:q9 (16-bit units) and store 4 rows
11524+        vzip.16     d16, d17
11525+        add         r6, r0, r3
11526+        vzip.16     d18, d19
11527+        lsl         r3, #1              @ double stride for the interleaved row stores below
11528+        vzip.32     q8, q9
11529+        add         r5, r0, r3
11530+        vst1.16     {d16}, [r0]!        @ r0 advances to the next 4-column strip
11531+        vst1.16     {d17}, [r6], r3
11532+        vst1.16     {d18}, [r5]
11533+        asr         r3, #1              @ restore stride
11534+        vst1.16     {d19}, [r6]
11535+
11536+        bx          lr
11537+
11538+patch_h_up_c_4x4_8:                     @ 4x4 chroma patch, "up of horizontal" modes; expects r7=inv_angle, r10=ref pos accum (callers init to -128)
11539+        ldrd        r8, r9, [r2]        @ Left edge: four chroma pairs
11540+        rsb         r6, r4, #32
11541+        vmov        d0, r8, r9          @ d0 = ref pairs 0..3
11542+        vdup.8      d3, r4              @ weights from angle step r4
11543+        lsr         r11, r8, #16
11544+        vdup.8      d2, r6
11545+        ldr         r8, [r2, #-2]!      @ step UP the left edge by one pair
11546+        orr         r9, r11, r9, lsl #16
11547+        vmov        d1, r8, r9          @ d1 = refs shifted up by one pair
11548+        mov         r12, r4             @ r12 = running fraction
11549+        vmull.u8    q2, d0, d2
11550+        vmlal.u8    q2, d1, d3
11551+patch_h_up_c_4x4_8_continue:            @ entry for subsequent strips (first column already multiplied)
11552+        mov         r5, #4              @ 4 columns per patch
11553+1:
11554+          add         r12, r4           @ accumulate fraction
11555+          cmp         r12, #33          @ cs: fraction passed 32 -> advance reference one pair
11556+          it          cs
11557+          addcs       r10, r7           @ step ref position by inv_angle
11558+          mov         r11, #0
11559+          itt         cs
11560+          subcs       r12, #32
11561+          tstcs       r10, #1<<31       @ eq (with cs): position non-negative -> ref comes from top row
11562+          rsb         r6, r12, #32
11563+          it          eq
11564+          asreq       r11, r10, #7      @ byte offset into top row
11565+          it          cs
11566+          vmovcs      d0, r8, r9
11567+          it          eq
11568+          biceq       r11, #1           @ align offset to a chroma pair
11569+          vdup.8      d2, r6
11570+          it          cs
11571+          lsrcs       r6, r8, #16
11572+          vdup.8      d3, r12
11573+        vext.8      q8, q8, q9, #8      @ shift previous columns along output queue
11574+          itt         cs
11575+          orrcs       r9, r6, r9, lsl #16
11576+          ldrhcs      r11, [r1, r11]    @ fetch ref pair from top row (r1)
11577+        vmov        d18, d19
11578+          it          hi
11579+          ldrhhi      r11, [r2, #-2]!   @ hi: position still negative -> step up the left edge instead
11580+        vrshrn.u16  d19, q2, #5         @ newest column
11581+          itt         cs
11582+          orrcs       r8, r11, r8, lsl #16
11583+          vmovcs      d1, r8, r9
11584+          vmull.u8    q2, d0, d2
11585+        subs        r5, #1
11586+          vmlal.u8    q2, d1, d3
11587+        bne         1b
11588+
11589+        b           store_tran_c_4x4_8  @ transpose & store as in the down case
11590+
11591+
11592+@ ff_hevc_rpi_pred_angular_c_4_neon_8
11593+@       uint8_t *_src,          [r0]
11594+@       const uint8_t *_top,    [r1]
11595+@       const uint8_t *_left,   [r2]
11596+@       ptrdiff_t stride        [r3]
11597+@       unsigned int mode       [sp, #0]  2..34
11598+
11599+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1  @ 4x4 chroma angular intra pred, 8-bit
11600+        ldr         r12, [sp]           @ r12 = mode (2..34)
11601+        push        {r4-r11, lr}
11602+        ADRT        r4, angle_2 - 2     @ bias so mode 2 indexes first table entry
11603+        ADRT        r7, inv_angle - 11*2 @ inv_angle covers modes 11..25
11604+        add         r7, r7, r12, lsl #1
11605+        lsl         r3, #1              @ double stride (2 bytes per chroma pair - presumed; TODO confirm units)
11606+        ldrsb       r6, [r4, r12]       @ r6 = angle: initial fraction
11607+        cmp         r12, #26
11608+        ldrsb       r4, [r4, r12]       @ r4 = same angle: per-row/col step
11609+        bge         26f                 @ modes 26..34: right of vertical
11610+        cmp         r12, #18
11611+        bge         18f                 @ modes 18..25: left of vertical
11612+        cmp         r12, #10
11613+        bge         10f                 @ modes 10..17: up of horizontal; 2..9 fall through
11614+
11615+@ Down of Horizontal - works down left
11616+        bl          patch_h_down_c_4x4_8
11617+        pop         {r4-r11, pc}
11618+
11619+@ Up of Horizontal - works down up
11620+10:
11621+        ldrh        r7, [r7]            @ r7 = inv_angle value
11622+        mov         r10, #-128          @ initial ref position accumulator
11623+        bl          patch_h_up_c_4x4_8
11624+        pop         {r4-r11, pc}
11625+
11626+@ Left of vertical - works down left
11627+18:
11628+        ldrd        r8, r9, [r1]        @ Top row: four chroma pairs
11629+        rsb         r12, r6, #32
11630+        ldrh        lr, [r2, #-2]       @ Top-left pair
11631+        ldrh        r7, [r7]            @ inv_angle
11632+        vmov        d0, r8, r9
11633+        lsl         r9, r9, #16
11634+        vdup.8      d2, r12
11635+        orr         r9, r9, r8, lsr #16
11636+        orr         r8, lr, r8, lsl #16 @ d1 = refs shifted toward top-left
11637+        vmov        d1, r8, r9
11638+        sub         r1, r7, #128        @ r1 = inv_angle position accumulator
11639+        mov         r5, #3              @ 3 looped rows + 1 final row below
11640+1:
11641+        vdup.8      d3, r6
11642+        vmull.u8    q2, d0, d2
11643+          subs        r12, r12, r4
11644+        vmlal.u8    q2, d1, d3
11645+          itttt       mi                @ mi: fraction wrapped -> pull next ref pair from the left edge
11646+          addmi       lr, r2, r1, asr #7 @ left-edge byte offset from inv_angle accumulator
11647+          bicmi       lr, #1            @ align to chroma pair
11648+          addmi       r12, r12, #32
11649+          vmovmi      d0, r8, r9
11650+          rsb         r6, r12, #32
11651+          itt         mi
11652+          lslmi       r9, r9, #16
11653+          ldrhmi      lr, [lr]
11654+          vdup.8      d2, r12
11655+        vrshrn.u16  d4, q2, #5
11656+          itttt       mi
11657+          orrmi       r9, r9, r8, lsr #16
11658+          orrmi       r8, lr, r8, lsl #16
11659+          vmovmi      d1, r8, r9
11660+          addmi       r1, r1, r7
11661+        subs        r5, r5, #1
11662+        vst1.16     {d4}, [r0], r3
11663+        bne         1b
11664+
11665+          vdup.8      d3, r6            @ final row: no further ref stepping needed
11666+          vmull.u8    q2, d0, d2
11667+          vmlal.u8    q2, d1, d3
11668+          vrshrn.u16  d4, q2, #5
11669+          vst1.16     {d4}, [r0]
11670+
11671+        pop         {r4-r11, pc}
11672+
11673+@ Right of vertical - works along top - left unused
11674+26:
11675+        ldrd        r8, r9, [r1]        @ Top row: four chroma pairs
11676+        rsb         r12, r6, #32
11677+        vmov        d0, r8, r9
11678+        vdup.8      d3, r6
11679+        mov         r5, #3              @ 3 looped rows + 1 final row below
11680+        lsr         r8, #16
11681+        vdup.8      d2, r12
11682+        orr         r8, r8, r9, lsl #16 @ d1 = refs shifted along top by one pair
11683+        ldr         r9, [r1, #6]!
11684+        vmov        d1, r8, r9
11685+1:
11686+        vmull.u8    q2, d0, d2
11687+          subs        r12, r4           @ mi: fraction wrapped -> advance refs along top
11688+        vmlal.u8    q2, d1, d3
11689+          it          mi
11690+          addmi       r12, #32
11691+          rsb         r6, r12, #32
11692+          itt         mi
11693+          vmovmi      d0, r8, r9
11694+          lsrmi       r8, #16
11695+          vdup.8      d2, r12
11696+          itt         mi
11697+          orrmi       r8, r8, r9, lsl #16
11698+          ldrmi       r9, [r1, #2]!
11699+        vrshrn.u16  d6, q2, #5
11700+          it          mi
11701+          vmovmi      d1, r8, r9
11702+          vdup.8      d3, r6
11703+        subs        r5, #1
11704+        vst1.16     {d6}, [r0], r3
11705+        bne         1b
11706+
11707+          vmull.u8    q2, d0, d2        @ final row
11708+          vmlal.u8    q2, d1, d3
11709+          vrshrn.u16  d6, q2, #5
11710+          vst1.16     {d6}, [r0]
11711+
11712+        pop         {r4-r11, pc}
11713+
11714+endfunc
11715+
11716+
11717+@ ff_hevc_rpi_pred_angular_c_8_neon_8
11718+@       uint8_t *_src,          [r0]
11719+@       const uint8_t *_top,    [r1]
11720+@       const uint8_t *_left,   [r2]
11721+@       ptrdiff_t stride        [r3]
11722+@       unsigned int mode       [sp, #0]  2..34
11723+
11724+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1  @ 8x8 chroma angular intra pred, 8-bit
11725+        ldr         r12, [sp]           @ r12 = mode (2..34)
11726+        push        {r4-r11, lr}
11727+        ADRT        r4, angle_2 - 2     @ bias so mode 2 indexes first table entry
11728+        ADRT        r7, inv_angle - 11*2 @ inv_angle covers modes 11..25
11729+        add         r7, r7, r12, lsl #1
11730+        lsl         r3, #1              @ double stride (2 bytes per chroma pair - presumed; TODO confirm units)
11731+        ldrsb       r6, [r4, r12]       @ r6 = angle: initial fraction
11732+        cmp         r12, #26
11733+        ldrsb       r4, [r4, r12]       @ r4 = same angle: per-row/col step
11734+        bge         26f
11735+        cmp         r12, #18
11736+        bge         18f
11737+        cmp         r12, #10
11738+        bge         10f
11739+
11740+@ Down of Horizontal - works down left
11741+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
11742+
11743+        bl          patch_h_down_c_4x4_8        @ two stacked 4-row strips of 8 columns
11744+        bl          patch_h_down_c_4x4_8_continue
11745+
11746+        add         r2, r1, #4*2        @ restore r2, but 4 rows further down left
11747+        sub         r0, #16             @ rewind the two 8-byte strip advances...
11748+        mov         r6, r4              @ ...reset fraction for the new strip...
11749+        add         r0, r0, r3, lsl #2  @ ...and move dst down 4 rows
11750+
11751+        bl          patch_h_down_c_4x4_8
11752+        bl          patch_h_down_c_4x4_8_continue
11753+
11754+        pop         {r4-r11, pc}
11755+
11756+@ Up of Horizontal - works down up
11757+10:
11758+        ldrh        r7, [r7]            @ r7 = inv_angle value
11759+        mov         r10, #-128          @ ref position accumulator
11760+
11761+        push        {r2}                @ patch_h_up moves r2; keep original for 2nd strip
11762+        bl          patch_h_up_c_4x4_8
11763+        bl          patch_h_up_c_4x4_8_continue
11764+        pop         {r2}
11765+
11766+        sub         r0, #16             @ rewind strip advances, move down 4 rows
11767+        mov         r10, #-128
11768+        add         r2, #8              @ left edge 4 pairs further on
11769+        add         r0, r0, r3, lsl #2
11770+        sub         r10, r10, r7, lsl #2 @ bias accumulator for the 4-row offset
11771+
11772+        bl          patch_h_up_c_4x4_8
11773+        bl          patch_h_up_c_4x4_8_continue
11774+
11775+        pop         {r4-r11, pc}
11776+
11777+@ Left of vertical - works down left
11778+18:
11779+        vld1.8      {q9}, [r1]          @ q9 = top row (8 chroma pairs)
11780+        sub         r1, r2, #2
11781+        rsb         r12, r6, #32
11782+        ldrh        r7, [r7]            @ inv_angle
11783+        vdup.8      d6, r6
11784+        vext.8      q8, q9, q9, #14     @ q8 = refs shifted down one pair...
11785+        sub         r8, r7, #128        @ r8 = inv_angle position accumulator
11786+        vld1.16     {d16[0]}, [r1]      @ ...with top-left pair inserted at [0]
11787+        vdup.8      d7, r12
11788+        mov         r5, #7              @ 7 looped rows + final row in tail
11789+1:                                      @ current refs in q8/q9; next (shifted) set built in q10/q11
11790+        subs        r12, r4
11791+        vmull.u8    q0, d18, d7
11792+        it          cc
11793+        asrcc       r1, r8, #8          @ cc: fraction wrapped -> next left-edge ref offset
11794+        vmlal.u8    q0, d16, d6
11795+        it          cc
11796+        addcc       r12, #32
11797+        vmull.u8    q1, d19, d7
11798+        it          cc
11799+        addcc       r1, r2, r1, lsl #1
11800+        vmlal.u8    q1, d17, d6
11801+        rsb         r6, r12, #32
11802+        vext.8      q10, q8, q8, #14
11803+        sub         r5, #1
11804+        vld1.16     {d20[0]}, [r1]
11805+        it          cc
11806+        addcc       r8, r7
11807+        vmov        q11, q8
11808+        teq         r5, #0
11809+        vrshrn.u16  d0, q0, #5
11810+        vrshrn.u16  d1, q1, #5
11811+        vdup.8      d6, r6
11812+        vdup.8      d7, r12
11813+        vst1.8      {q0}, [r0], r3
11814+        bhi         1b                  @ hi: rows left and fraction not wrapped - keep q8/q9
11815+        beq         4f                  @ rows done
11816+2:                                      @ as label 1 with register banks swapped (q10/q11 current)
11817+        subs        r12, r4
11818+        vmull.u8    q0, d22, d7
11819+        it          cc
11820+        asrcc       r1, r8, #8
11821+        vmlal.u8    q0, d20, d6
11822+        it          cc
11823+        addcc       r12, #32
11824+        vmull.u8    q1, d23, d7
11825+        it          cc
11826+        addcc       r1, r2, r1, lsl #1
11827+        vmlal.u8    q1, d21, d6
11828+        rsb         r6, r12, #32
11829+        vext.8      q8, q10, q10, #14
11830+        sub         r5, #1
11831+        vld1.8      {d16[0]}, [r1]
11832+        it          cc
11833+        addcc       r8, r7
11834+        vmov        q9, q10
11835+        teq         r5, #0
11836+        vrshrn.u16  d0, q0, #5
11837+        vrshrn.u16  d1, q1, #5
11838+        vdup.8      d6, r6
11839+        vdup.8      d7, r12
11840+        vst1.8      {q0}, [r0], r3
11841+        bhi         2b
11842+        bne         1b
11843+        bcc         5f                  @ cc: refs advanced on last step -> final row from q8/q9
11844+3:                                      @ final row, refs in q10/q11
11845+        vmull.u8    q0, d22, d7
11846+        vmlal.u8    q0, d20, d6
11847+        vmull.u8    q1, d23, d7
11848+        vmlal.u8    q1, d21, d6
11849+        vrshrn.u16  d0, q0, #5
11850+        vrshrn.u16  d1, q1, #5
11851+        vst1.8      {q0}, [r0]
11852+
11853+        pop         {r4-r11, pc}
11854+4:
11855+        bcc         3b
11856+5:                                      @ final row, refs in q8/q9
11857+        vmull.u8    q0, d18, d7
11858+        vmlal.u8    q0, d16, d6
11859+        vmull.u8    q1, d19, d7
11860+        vmlal.u8    q1, d17, d6
11861+        vrshrn.u16  d0, q0, #5
11862+        vrshrn.u16  d1, q1, #5
11863+        vst1.8      {q0}, [r0]
11864+
11865+        pop         {r4-r11, pc}
11866+
11867+@ Right of vertical - works along top - left unused
11868+26:
11869+        vld1.8      {q9}, [r1]!         @ q9 = top row (8 chroma pairs)
11870+        rsb         r12, r6, #32
11871+        vdup.8      d6, r6
11872+        vdup.8      d7, r12
11873+        vext.8      q8, q9, q9, #2      @ q8 = refs shifted up one pair...
11874+        vld1.16     {d17[3]}, [r1]!     @ ...with next top pair at the end
11875+        mov         r5, #7              @ 7 looped rows + final row in tail
11876+1:                                      @ current refs in q8/q9; next set built in q10/q11
11877+        vmull.u8    q0, d16, d6
11878+        subs        r12, r4
11879+        vmlal.u8    q0, d18, d7
11880+        it          cc
11881+        addcc       r12, #32
11882+        vmull.u8    q1, d17, d6
11883+        rsb         r6, r12, #32
11884+        vmlal.u8    q1, d19, d7
11885+        sub         r5, #1
11886+        vext.8      q10, q8, q8, #2
11887+        teq         r5, #0
11888+        vld1.16     {d21[3]}, [r1]
11889+        it          cc
11890+        addcc       r1, #2              @ cc: fraction wrapped -> consume next top pair
11891+        vmov        q11, q8
11892+        vrshrn.u16  d0, q0, #5
11893+        vrshrn.u16  d1, q1, #5
11894+        vdup.8      d6, r6
11895+        vdup.8      d7, r12
11896+        vst1.8      {q0}, [r0], r3
11897+        bhi         1b                  @ hi: rows left and refs not advanced
11898+        beq         4f
11899+2:                                      @ register banks swapped (q10/q11 current)
11900+        vmull.u8    q0, d20, d6
11901+        subs        r12, r4
11902+        vmlal.u8    q0, d22, d7
11903+        it          cc
11904+        addcc       r12, #32
11905+        vmull.u8    q1, d21, d6
11906+        rsb         r6, r12, #32
11907+        vmlal.u8    q1, d23, d7
11908+        sub         r5, #1
11909+        vext.8      q8, q10, q10, #2
11910+        teq         r5, #0
11911+        vld1.16     {d17[3]}, [r1]
11912+        it          cc
11913+        addcc       r1, #2
11914+        vmov        q9, q10
11915+        vrshrn.u16  d0, q0, #5
11916+        vrshrn.u16  d1, q1, #5
11917+        vdup.8      d6, r6
11918+        vdup.8      d7, r12
11919+        vst1.8      {q0}, [r0], r3
11920+        bhi         2b
11921+        bne         1b
11922+        bcc         5f
11923+3:                                      @ final row, refs in q10/q11
11924+        vmull.u8    q0, d20, d6
11925+        vmlal.u8    q0, d22, d7
11926+        vmull.u8    q1, d21, d6
11927+        vmlal.u8    q1, d23, d7
11928+        vrshrn.u16  d0, q0, #5
11929+        vrshrn.u16  d1, q1, #5
11930+        vst1.8      {q0}, [r0]
11931+
11932+        pop         {r4-r11, pc}
11933+4:
11934+        bcc         3b
11935+5:                                      @ final row, refs in q8/q9
11936+        vmull.u8    q0, d16, d6
11937+        vmlal.u8    q0, d18, d7
11938+        vmull.u8    q1, d17, d6
11939+        vmlal.u8    q1, d19, d7
11940+        vrshrn.u16  d0, q0, #5
11941+        vrshrn.u16  d1, q1, #5
11942+        vst1.8      {q0}, [r0]
11943+
11944+        pop         {r4-r11, pc}
11945+
11946+endfunc
11947+
11948+
11949+@ ff_hevc_rpi_pred_angular_c_16_neon_8
11950+@       uint8_t *_src,          [r0]
11951+@       const uint8_t *_top,    [r1]
11952+@       const uint8_t *_left,   [r2]
11953+@       ptrdiff_t stride        [r3]
11954+@       unsigned int mode       [sp, #0]  2..34
11955+
11956+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1  @ 16x16 chroma angular intra pred, 8-bit
11957+        ldr         r12, [sp]           @ r12 = mode (2..34)
11958+        push        {r4-r11, lr}
11959+        ADRT        r4, angle_2 - 2     @ bias so mode 2 indexes first table entry
11960+        ADRT        r7, inv_angle - 11*2 @ inv_angle covers modes 11..25
11961+        add         r7, r7, r12, lsl #1
11962+        lsl         r3, #1              @ double stride (2 bytes per chroma pair - presumed; TODO confirm units)
11963+        ldrsb       r6, [r4, r12]       @ r6 = angle: initial fraction
11964+        cmp         r12, #26
11965+        ldrsb       r4, [r4, r12]       @ r4 = same angle: per-row/col step
11966+        bge         26f
11967+        cmp         r12, #18
11968+        bge         18f
11969+        cmp         r12, #10
11970+        bge         10f
11971+
11972+@ Down of Horizontal - works down left
11973+        mov         r10, #4             @ 4 strips of 4 rows
11974+        mov         r1, r2
11975+1:
11976+        bl          patch_h_down_c_4x4_8        @ 4 calls = 16 columns per strip
11977+        bl          patch_h_down_c_4x4_8_continue
11978+        bl          patch_h_down_c_4x4_8_continue
11979+        bl          patch_h_down_c_4x4_8_continue
11980+
11981+        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
11982+        add         r1, r1, #4*2
11983+        mov         r6, r4               @ reset fraction for next strip
11984+        sub         r0, #32              @ rewind the four 8-byte strip advances
11985+        subs        r10, #1
11986+        add         r0, r0, r3, lsl #2   @ dst down 4 rows
11987+        bne         1b
11988+
11989+        pop         {r4-r11, pc}
11990+
11991+@ Up of Horizontal - works down up
11992+10:
11993+        ldrh        r7, [r7]            @ r7 = inv_angle value
11994+        mov         r10, #-128          @ ref position accumulator
11995+        vmov.i8     d6, #1<<2           @ strip counter (4) kept in a NEON reg (r4-r12 all live)
11996+1:
11997+        push        {r2, r10}           @ patch_h_up clobbers these; restore per strip
11998+        bl          patch_h_up_c_4x4_8
11999+        bl          patch_h_up_c_4x4_8_continue
12000+        bl          patch_h_up_c_4x4_8_continue
12001+        bl          patch_h_up_c_4x4_8_continue
12002+        pop         {r2, r10}
12003+
12004+        vmov        r8, s12             @ read counter before the shift below
12005+        sub         r0, #32
12006+        add         r2, #8              @ left edge 4 pairs further on
12007+        add         r0, r0, r3, lsl #2  @ dst down 4 rows
12008+        sub         r10, r10, r7, lsl #2
12009+        vshr.u8     d6, #1              @ counter: 4 -> 2 -> 1 -> 0
12010+        teq         r8, #0
12011+        bne         1b                  @ pre-shift values 4,2,1,0 -> 4 strips total
12012+
12013+        pop         {r4-r11, pc}
12014+
12015+@ Left of vertical - works down left
12016+18:
12017+        vld1.8      {q0-q1}, [r1]       @ q0-q1 = top row (16 chroma pairs)
12018+        sub         r9, r2, #2
12019+        rsb         r12, r6, #32
12020+        ldrh        r7, [r7]            @ inv_angle
12021+        mov         r8, #-128           @ position accumulator
12022+        vdup.8      d18, r6
12023+        vdup.8      d19, r12
12024+        mov         r5, #16             @ 16 rows
12025+1:                                      @ advance refs: shift down one pair, insert from left edge
12026+        vld1.16     {d17[3]}, [r9]
12027+        add         r8, r7
12028+        vmov        q2, q3
12029+        vmov        q3, q1
12030+        asr         r9, r8, #8
12031+        vext.8      q1, q0, q1, #14
12032+        add         r9, r2, r9, lsl #1  @ next left-edge ref address
12033+        vext.8      q0, q8, q0, #14
12034+2:                                      @ emit rows until the fraction wraps
12035+        vmull.u8    q10, d4, d19
12036+        subs        r12, r4
12037+        vmlal.u8    q10, d0, d18
12038+        it          cc
12039+        addcc       r12, #32
12040+        vmull.u8    q11, d5, d19
12041+        rsb         r6, r12, #32
12042+        vmlal.u8    q11, d1, d18
12043+        sub         r5, #1
12044+        vmull.u8    q12, d6, d19
12045+        teq         r5, #0
12046+        vmlal.u8    q12, d2, d18
12047+        vmull.u8    q13, d7, d19
12048+        vmlal.u8    q13, d3, d18
12049+        vdup.8      d18, r6
12050+        vdup.8      d19, r12
12051+        vrshrn.u16  d20, q10, #5
12052+        vrshrn.u16  d21, q11, #5
12053+        vrshrn.u16  d22, q12, #5
12054+        vrshrn.u16  d23, q13, #5
12055+        vst1.8      {q10-q11}, [r0], r3
12056+        bhi         2b                  @ hi: rows left, fraction not wrapped
12057+        bne         1b                  @ ne: rows left, refs must advance
12058+
12059+        pop         {r4-r11, pc}
12060+
12061+@ Right of vertical - works along top - left unused
12062+26:
12063+        add         r5, r1, #32
12064+        vld1.8      {q0-q1}, [r1]!      @ q0-q1 = top row (16 chroma pairs)
12065+        rsb         r12, r6, #32
12066+        vld1.16     {d16[0]}, [r5]      @ next pair beyond the 16 loaded
12067+        mov         r5, #16             @ 16 rows
12068+        vdup.8      d18, r6
12069+        vdup.8      d19, r12
12070+1:                                      @ advance refs: shift up one pair along the top
12071+        vmov        q2, q0
12072+        add         r1, #2
12073+        vmov        q3, q1
12074+        vext.8      q0, q0, q1, #2
12075+        vext.8      q1, q1, q8, #2
12076+2:                                      @ emit rows until the fraction wraps
12077+        vmull.u8    q10, d0, d18
12078+        subs        r12, r4
12079+        vmlal.u8    q10, d4, d19
12080+        it          cc
12081+        addcc       r12, #32
12082+        vmull.u8    q11, d1, d18
12083+        rsb         r6, r12, #32
12084+        vmlal.u8    q11, d5, d19
12085+        sub         r5, #1
12086+        vmull.u8    q12, d2, d18
12087+        teq         r5, #0
12088+        vmlal.u8    q12, d6, d19
12089+        vmull.u8    q13, d3, d18
12090+        vmlal.u8    q13, d7, d19
12091+        vld1.16     {d16[0]}, [r1]
12092+        vdup.8      d18, r6
12093+        vdup.8      d19, r12
12094+        vrshrn.u16  d20, q10, #5
12095+        vrshrn.u16  d21, q11, #5
12096+        vrshrn.u16  d22, q12, #5
12097+        vrshrn.u16  d23, q13, #5
12098+        vst1.8      {q10-q11}, [r0], r3
12099+        bhi         2b                  @ hi: rows left, fraction not wrapped
12100+        bne         1b                  @ ne: rows left, refs must advance
12101+
12102+        pop         {r4-r11, pc}
12103+
12104+endfunc
12105+
12106+@------------------------------------------------------------------------------
12107+@ Data
12108+
12109+        .text                           @ data kept in .text (presumably for ADR addressing range - TODO confirm)
12110+        .balign  64
12111+angle_2:                                @ |intraPredAngle| per mode; indexed as angle_2 + mode - 2 (modes 2..34)
12112+        .byte    32
12113+        .byte    26,  21,  17,  13,   9,   5,   2,   0
12114+        @ Sign inverted from standards table
12115+        .byte     2,   5,   9,  13,  17,  21,  26,  32
12116+        .byte    26,  21,  17,  13,   9,   5,   2,   0
12117+        @ Standard sign
12118+        .byte     2,   5,   9,  13,  17,  21,  26,  32
12119+
12120+        .balign   2
12121+
12122+        @ Sign inverted from standards table
12123+inv_angle:                              @ round(8192/angle) for modes 11..25; indexed as inv_angle + (mode-11)*2
12124+        .short   4096, 1638,  910,  630,  482,  390,  315
12125+        .short    256
12126+        .short    315,  390,  482,  630,  910, 1638, 4096
12127+
12128+@------------------------------------------------------------------------------
12129+@
12130+@ 10 bit fns
12131+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
12132+@ but runs out of register width for 12+ bit
12133+
12134+        .text
12135+        .balign 64
12136+
12137+patch_h_down_4x4_10:                    @ 4x4 luma patch, 10-bit (u16 samples), "down of horizontal"; falls through to transpose-store
12138+        ldrd        r8, r9, [r2]        @ Left edge: four 16-bit samples
12139+        rsb         r12, r6, #32        @ r12 = 32 - fraction
12140+        vmov        d0, r8, r9          @ d0 = ref[0..3]
12141+        vdup.16     d3, r6              @ d3 = fraction weight
12142+        lsr         r8, #16
12143+        vdup.16     d2, r12             @ d2 = (32 - fraction) weight
12144+        orr         r8, r8, r9, lsl #16 @ shift refs along by one sample
12145+        ldr         r9, [r2, #6]!       @ advance left ptr; fetch samples 3..4
12146+        vmov        d1, r8, r9          @ d1 = ref[1..4]
12147+        // drop through...
12148+patch_h_down_4x4_10_continue:           @ entry for subsequent strips (weights/refs already live)
12149+        mov         r5, #4              @ 4 columns per patch
12150+1:                                      @ extra-indented ops are scalar, interleaved with NEON (presumably for dual-issue - TODO confirm)
12151+          subs        r12, r4           @ advance fraction; mi = wrapped, need next ref sample
12152+        vmul.u16    d4, d0, d2
12153+          it          mi
12154+          addmi       r12, #32
12155+        vmla.u16    d4, d1, d3
12156+          rsb         r6, r12, #32
12157+        vext.16     q8, q8, q9, #4      @ shift previous columns along output queue q8:q9
12158+          it          mi
12159+          lsrmi       r7, r8, #16
12160+        vmov        d18, d19
12161+          it          mi
12162+          vmovmi      d0, r8, r9        @ mi: step refs down-left by one sample
12163+          vdup.16     d2, r12
12164+          it          mi
12165+          orrmi       r8, r7, r9, lsl #16
12166+        vrshr.u16   d19, d4, #5         @ newest column (round, >>5; stays 16-bit)
12167+          itt         mi
12168+          ldrmi       r9, [r2, #2]!
12169+          vmovmi      d1, r8, r9
12170+        subs        r5, #1
12171+          vdup.16     d3, r6
12172+        bne         1b
12173+        // drop through...
12174+store_tran_4x4_10:                      @ transpose the 4 columns in q8:q9 (16-bit units) and store 4 rows
12175+        vzip.16     d16, d17
12176+        add         r6, r0, r3
12177+        vzip.16     d18, d19
12178+        lsl         r3, #1              @ double stride for the interleaved row stores below
12179+        vzip.32     q8, q9
12180+        add         r5, r0, r3
12181+        vst1.16     {d16}, [r0]!        @ r0 advances to the next 4-column strip
12182+        vst1.16     {d17}, [r6], r3
12183+        vst1.16     {d18}, [r5]
12184+        asr         r3, #1              @ restore stride
12185+        vst1.16     {d19}, [r6]
12186+
12187+        bx          lr
12188+
12189+patch_h_up_4x4_10:                      @ 4x4 luma patch, 10-bit, "up of horizontal"; expects r7=inv_angle, r10=ref pos accum (callers init to -128)
12190+        ldrd        r8, r9, [r2]        @ Left edge: four 16-bit samples
12191+        rsb         r6, r4, #32
12192+        vmov        d0, r8, r9          @ d0 = ref[0..3]
12193+        vdup.16     d3, r4              @ weights from angle step r4
12194+        lsr         r11, r8, #16
12195+        vdup.16     d2, r6
12196+        ldr         r8, [r2, #-2]!      @ step UP the left edge by one sample
12197+        orr         r9, r11, r9, lsl #16
12198+        vmov        d1, r8, r9          @ d1 = refs shifted up by one sample
12199+        mov         r12, r4             @ r12 = running fraction
12200+        vmul.u16    d4, d0, d2
12201+        vmla.u16    d4, d1, d3
12202+patch_h_up_4x4_10_continue:             @ entry for subsequent strips (first column already multiplied)
12203+        mov         r5, #4              @ 4 columns per patch
12204+1:
12205+          add         r12, r4           @ accumulate fraction
12206+          cmp         r12, #33          @ cs: fraction passed 32 -> advance reference one sample
12207+          it          cs
12208+          addcs       r10, r7           @ step ref position by inv_angle
12209+          mov         r11, #0
12210+          itt         cs
12211+          subcs       r12, #32
12212+          tstcs       r10, #1<<31       @ eq (with cs): position non-negative -> ref comes from top row
12213+          rsb         r6, r12, #32
12214+          it          eq
12215+          asreq       r11, r10, #7      @ byte offset into top row
12216+          it          cs
12217+          vmovcs      d0, r8, r9
12218+          it          eq
12219+          biceq       r11, #1           @ align to a 16-bit sample
12220+          vdup.16     d2, r6
12221+          it          cs
12222+          lsrcs       r6, r8, #16
12223+          vdup.16     d3, r12
12224+        vext.16     q8, q8, q9, #4      @ shift previous columns along output queue
12225+          itt         cs
12226+          orrcs       r9, r6, r9, lsl #16
12227+          ldrhcs      r11, [r1, r11]    @ fetch ref sample from top row (r1)
12228+        vmov        d18, d19
12229+          it          hi
12230+          ldrhhi      r11, [r2, #-2]!   @ hi: position still negative -> step up the left edge instead
12231+        vrshr.u16   d19, d4, #5         @ newest column
12232+          itt         cs
12233+          orrcs       r8, r11, r8, lsl #16
12234+          vmovcs      d1, r8, r9
12235+          vmul.u16    d4, d0, d2
12236+        subs        r5, #1
12237+          vmla.u16    d4, d1, d3
12238+        bne         1b
12239+
12240+        b           store_tran_4x4_10   @ transpose & store as in the down case
12241+
12242+
12243+@ ff_hevc_rpi_pred_angular_4_neon_10
12244+@       uint8_t *_src,          [r0]
12245+@       const uint8_t *_top,    [r1]
12246+@       const uint8_t *_left,   [r2]
12247+@       ptrdiff_t stride        [r3]
12248+@       unsigned int mode       [sp, #0]  2..34
12249+
12250+function ff_hevc_rpi_pred_angular_4_neon_10, export=1  @ 4x4 luma angular intra pred, 10-bit (also 9/11-bit per file note)
12251+        ldr         r12, [sp]           @ r12 = mode (2..34)
12252+        push        {r4-r11, lr}
12253+        ADRT        r4, angle_2 - 2     @ bias so mode 2 indexes first table entry
12254+        ADRT        r7, inv_angle - 11*2 @ inv_angle covers modes 11..25
12255+        add         r7, r7, r12, lsl #1
12256+        lsl         r3, #1              @ double stride (2 bytes per u16 sample - presumed; TODO confirm units)
12257+        ldrsb       r6, [r4, r12]       @ r6 = angle: initial fraction
12258+        cmp         r12, #26
12259+        ldrsb       r4, [r4, r12]       @ r4 = same angle: per-row/col step
12260+        bge         26f
12261+        cmp         r12, #18
12262+        bge         18f
12263+        cmp         r12, #10
12264+        bge         10f
12265+
12266+@ Down of Horizontal - works down left
12267+        bl          patch_h_down_4x4_10
12268+        pop         {r4-r11, pc}
12269+
12270+@ Up of Horizontal - works down up
12271+10:
12272+        ldrh        r7, [r7]            @ r7 = inv_angle value
12273+        mov         r10, #-128          @ initial ref position accumulator
12274+        bl          patch_h_up_4x4_10
12275+        pop         {r4-r11, pc}
12276+
12277+@ Left of vertical - works down left
12278+18:
12279+        ldrd        r8, r9, [r1]        @ Top row: four u16 samples
12280+        rsb         r12, r6, #32
12281+        ldrh        lr, [r2, #-2]       @ Top-left sample
12282+        ldrh        r7, [r7]            @ inv_angle
12283+        vmov        d0, r8, r9
12284+        lsl         r9, r9, #16
12285+        vdup.16     d2, r12
12286+        orr         r9, r9, r8, lsr #16
12287+        orr         r8, lr, r8, lsl #16 @ d1 = refs shifted toward top-left
12288+        vmov        d1, r8, r9
12289+        sub         r1, r7, #128        @ r1 = inv_angle position accumulator
12290+        mov         r5, #3              @ 3 looped rows + 1 final row below
12291+1:
12292+        sel         lr, lr, lr          @ force pipeline 0 on Cortex-A53
12293+        vdup.16     d3, r6
12294+        vmul.u16    d4, d0, d2
12295+          subs        r12, r12, r4
12296+        vmla.u16    d4, d1, d3
12297+          itttt       mi                @ mi: fraction wrapped -> pull next ref from the left edge
12298+          addmi       lr, r2, r1, asr #7 @ left-edge byte offset from inv_angle accumulator
12299+          bicmi       lr, #1            @ align to u16 sample
12300+          addmi       r12, r12, #32
12301+          vmovmi      d0, r8, r9
12302+          rsb         r6, r12, #32
12303+          itt         mi
12304+          lslmi       r9, r9, #16
12305+          ldrhmi      lr, [lr]
12306+          vdup.16     d2, r12
12307+        vrshr.u16   d4, d4, #5
12308+          itttt       mi
12309+          orrmi       r9, r9, r8, lsr #16
12310+          orrmi       r8, lr, r8, lsl #16
12311+          vmovmi      d1, r8, r9
12312+          addmi       r1, r1, r7
12313+        subs        r5, r5, #1
12314+        vst1.16     {d4}, [r0], r3
12315+        bne         1b
12316+
12317+          vdup.16     d3, r6            @ final row: no further ref stepping needed
12318+          nop                           @ force next insn into pipeline 0 to enable
12319+          vmul.u16    d4, d0, d2        @ vmla to execute back-to-back on Cortex-A53
12320+          vmla.u16    d4, d1, d3
12321+          vrshr.u16   d4, d4, #5
12322+          vst1.16     {d4}, [r0]
12323+
12324+        pop         {r4-r11, pc}
12325+
12326+@ Right of vertical - works along top - left unused
12327+26:
12328+        ldrd        r8, r9, [r1]        @ Top row: four u16 samples
12329+        rsb         r12, r6, #32
12330+        vmov        d0, r8, r9
12331+        vdup.16     d3, r6
12332+        lsr         r8, #16
12333+        vdup.16     d2, r12
12334+        orr         r8, r8, r9, lsl #16 @ d1 = refs shifted along top by one sample
12335+        ldr         r9, [r1, #6]!
12336+        vmov        d1, r8, r9
12337+        mov         r5, #3              @ 3 looped rows + 1 final row below
12338+1:
12339+        vmul.u16    d4, d0, d2
12340+          subs        r12, r4           @ mi: fraction wrapped -> advance refs along top
12341+        vmla.u16    d4, d1, d3
12342+          it          mi
12343+          addmi       r12, #32
12344+          rsb         r6, r12, #32
12345+          itt         mi
12346+          vmovmi      d0, r8, r9
12347+          lsrmi       r8, #16
12348+          vdup.16     d2, r12
12349+          itt         mi
12350+          orrmi       r8, r8, r9, lsl #16
12351+          ldrmi       r9, [r1, #2]!
12352+        vrshr.u16   d4, d4, #5
12353+          it          mi
12354+          vmovmi      d1, r8, r9
12355+          vdup.16     d3, r6
12356+        subs        r5, #1
12357+        vst1.16     {d4}, [r0], r3
12358+        bne         1b
12359+
12360+          vmul.u16    d4, d0, d2        @ final row
12361+          vmla.u16    d4, d1, d3
12362+          vrshr.u16   d4, d4, #5
12363+          vst1.16     {d4}, [r0]
12364+
12365+        pop         {r4-r11, pc}
12366+
12367+endfunc
12368+
12369+
12370+@ ff_hevc_rpi_pred_angular_8_neon_10 - 8x8 angular intra prediction, 10-bit samples
12371+@       uint8_t *_src,          [r0]
12372+@       const uint8_t *_top,    [r1]
12373+@       const uint8_t *_left,   [r2]
12374+@       ptrdiff_t stride        [r3]
12375+@       unsigned int mode       [sp, #0]  2..34
12376+
12377+function ff_hevc_rpi_pred_angular_8_neon_10, export=1
12378+        ldr         r12, [sp]           @ r12 = mode (2..34)
12379+        push        {r4-r11, lr}
12380+        ADRT        r4, angle_2 - 2     @ r4 -> angle table, biased so mode indexes directly
12381+        ADRT        r7, inv_angle - 11*2 @ r7 -> inverse-angle table, biased for modes 11..25
12382+        add         r7, r7, r12, lsl #1
12383+        lsl         r3, #1              @ stride: samples -> bytes (16-bit pixels)
12384+        ldrsb       r6, [r4, r12]       @ r6 = initial angle fraction
12385+        cmp         r12, #26
12386+        ldrsb       r4, [r4, r12]       @ r4 = signed angle step (same table entry)
12387+        bge         26f                 @ modes 26..34: right of vertical
12388+        cmp         r12, #18
12389+        bge         18f                 @ modes 18..25: left of vertical
12390+        cmp         r12, #10
12391+        bge         10f                 @ modes 10..17: up of horizontal; 2..9 fall through
12392+
12393+@ Down of Horizontal - works down left
12394+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
12395+
12396+        bl          patch_h_down_4x4_10
12397+        bl          patch_h_down_4x4_10_continue
12398+
12399+        add         r2, r1, #4*2        @ restore r2, but 4 rows further down left
12400+        sub         r0, #16
12401+        mov         r6, r4              @ reset angle fraction for second 4-row band
12402+        add         r0, r0, r3, lsl #2
12403+
12404+        bl          patch_h_down_4x4_10
12405+        bl          patch_h_down_4x4_10_continue
12406+
12407+        pop         {r4-r11, pc}
12408+
12409+@ Up of Horizontal - works down up
12410+10:
12411+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
12412+        mov         r10, #-128          @ inv-angle accumulator init (8.8 fixed point - confirm in patch_h_up_4x4_10)
12413+
12414+        push        {r2}
12415+        bl          patch_h_up_4x4_10
12416+        bl          patch_h_up_4x4_10_continue
12417+        pop         {r2}
12418+
12419+        sub         r0, #16
12420+        mov         r10, #-128
12421+        add         r2, #8
12422+        add         r0, r0, r3, lsl #2
12423+        sub         r10, r10, r7, lsl #2 @ rewind accumulator by the 4 rows already done
12424+
12425+        bl          patch_h_up_4x4_10
12426+        bl          patch_h_up_4x4_10_continue
12427+
12428+        pop         {r4-r11, pc}
12429+
12430+@ Left of vertical - works down left
12431+18:
12432+        vld1.16     {q9}, [r1]          @ q9 = 8 top samples
12433+        sub         r1, r2, #2
12434+        rsb         r12, r6, #32        @ r12 = 32 - frac
12435+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
12436+        vdup.16     q2, r6
12437+        vext.16     q8, q9, q9, #7      @ q8 = top shifted one sample left
12438+        sub         r8, r7, #128        @ r8 = inv-angle accumulator (8.8)
12439+        vld1.16     {d16[0]}, [r1]      @ inject left[-1] as the new leading sample
12440+        vdup.16     q3, r12
12441+        mov         r5, #7              @ 7 more rows after the first
12442+1:
12443+        vmul.u16    q0, q9, q3
12444+        subs        r12, r4             @ carry clears when a fresh source sample is needed
12445+        vmla.u16    q0, q8, q2
12446+        ittt        cc
12447+        asrcc       r1, r8, #8          @ integer part of accumulator selects left sample
12448+        addcc       r12, #32
12449+        addcc       r1, r2, r1, lsl #1
12450+        vext.16     q10, q8, q8, #7
12451+        rsb         r6, r12, #32
12452+        vmov        q11, q8
12453+        sub         r5, #1
12454+        vrshr.u16   q0, q0, #5          @ (a*(32-f) + b*f + 16) >> 5
12455+        it          cc
12456+        addcc       r8, r7
12457+        vld1.16     {d20[0]}, [r1]
12458+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
12459+        vdup.16     q2, r6
12460+        vdup.16     q3, r12
12461+        vst1.16     {q0}, [r0], r3
12462+        bhi         1b
12463+        beq         4f                  @ r5 == 0: emit final row from the other register set
12464+2:
12465+        vmul.u16    q0, q11, q3
12466+        subs        r12, r4
12467+        vmla.u16    q0, q10, q2
12468+        ittt        cc
12469+        asrcc       r1, r8, #8
12470+        addcc       r12, #32
12471+        addcc       r1, r2, r1, lsl #1
12472+        vext.16     q8, q10, q10, #7
12473+        rsb         r6, r12, #32
12474+        vmov        q9, q10
12475+        sub         r5, #1
12476+        vrshr.u16   q0, q0, #5
12477+        it          cc
12478+        addcc       r8, r7
12479+        vld1.16     {d16[0]}, [r1]
12480+        teq         r5, #0
12481+        vdup.16     q2, r6
12482+        vdup.16     q3, r12
12483+        vst1.16     {q0}, [r0], r3
12484+        bhi         2b
12485+        bne         1b
12486+        bcc         5f
12487+3:
12488+        vmul.u16    q0, q11, q3         @ final row, q10/q11 operand set
12489+        vmla.u16    q0, q10, q2
12490+        vrshr.u16   q0, q0, #5
12491+        vst1.16     {q0}, [r0]
12492+
12493+        pop         {r4-r11, pc}
12494+4:
12495+        bcc         3b
12496+5:
12497+        vmul.u16    q0, q9, q3          @ final row, q8/q9 operand set
12498+        vmla.u16    q0, q8, q2
12499+        vrshr.u16   q0, q0, #5
12500+        vst1.16     {q0}, [r0]
12501+
12502+        pop         {r4-r11, pc}
12503+
12504+@ Right of vertical - works along top - left unused
12505+26:
12506+        vld1.16     {q9}, [r1]!         @ q9 = 8 top samples, r1 advances
12507+        rsb         r12, r6, #32
12508+        vdup.16     q2, r6
12509+        vdup.16     q3, r12
12510+        vext.16     q8, q9, q9, #1      @ q8 = top shifted one sample right
12511+        vld1.16     {d17[3]}, [r1]!     @ pull in the next top sample
12512+        mov         r5, #7
12513+1:
12514+        vmul.u16    q0, q8, q2
12515+        subs        r12, r4             @ carry clears when the window must advance
12516+        vmla.u16    q0, q9, q3
12517+        it          cc
12518+        addcc       r12, #32
12519+        vext.16     q10, q8, q8, #1
12520+        rsb         r6, r12, #32
12521+        vld1.16     {d21[3]}, [r1]
12522+        sub         r5, #1
12523+        vmov        q11, q8
12524+        teq         r5, #0
12525+        vrshr.u16   q0, q0, #5
12526+        it          cc
12527+        addcc       r1, #2              @ consume one top sample only on window advance
12528+        vdup.16     q2, r6
12529+        vdup.16     q3, r12
12530+        vst1.16     {q0}, [r0], r3
12531+        bhi         1b
12532+        beq         4f
12533+2:
12534+        vmul.u16    q0, q10, q2
12535+        subs        r12, r4
12536+        vmla.u16    q0, q11, q3
12537+        it          cc
12538+        addcc       r12, #32
12539+        vext.16     q8, q10, q10, #1
12540+        rsb         r6, r12, #32
12541+        vld1.16     {d17[3]}, [r1]
12542+        sub         r5, #1
12543+        vmov        q9, q10
12544+        teq         r5, #0
12545+        vrshr.u16   q0, q0, #5
12546+        it          cc
12547+        addcc       r1, #2
12548+        vdup.16     q2, r6
12549+        vdup.16     q3, r12
12550+        vst1.16     {q0}, [r0], r3
12551+        bhi         2b
12552+        bne         1b
12553+        bcc         5f
12554+3:
12555+        vmul.u16    q0, q10, q2
12556+        vmla.u16    q0, q11, q3
12557+        vrshr.u16   q0, q0, #5
12558+        vst1.16     {q0}, [r0]
12559+
12560+        pop         {r4-r11, pc}
12561+4:
12562+        bcc         3b
12563+5:
12564+        vmul.u16    q0, q8, q2
12565+        vmla.u16    q0, q9, q3
12566+        vrshr.u16   q0, q0, #5
12567+        vst1.16     {q0}, [r0]
12568+
12569+        pop         {r4-r11, pc}
12570+
12571+endfunc
12572+
12573+
12574+@ ff_hevc_rpi_pred_angular_16_neon_10 - 16x16 angular intra prediction, 10-bit samples
12575+@       uint8_t *_src,          [r0]
12576+@       const uint8_t *_top,    [r1]
12577+@       const uint8_t *_left,   [r2]
12578+@       ptrdiff_t stride        [r3]
12579+@       unsigned int mode       [sp, #0]  2..34
12580+
12581+function ff_hevc_rpi_pred_angular_16_neon_10, export=1
12582+        ldr         r12, [sp]           @ r12 = mode (2..34)
12583+        push        {r4-r11, lr}
12584+        ADRT        r4, angle_2 - 2     @ r4 -> angle table, biased so mode indexes directly
12585+        ADRT        r7, inv_angle - 11*2 @ r7 -> inverse-angle table, biased for modes 11..25
12586+        add         r7, r7, r12, lsl #1
12587+        lsl         r3, #1              @ stride: samples -> bytes (16-bit pixels)
12588+        ldrsb       r6, [r4, r12]       @ r6 = initial angle fraction
12589+        cmp         r12, #26
12590+        ldrsb       r4, [r4, r12]       @ r4 = signed angle step
12591+        bge         26f
12592+        cmp         r12, #18
12593+        bge         18f
12594+        cmp         r12, #10
12595+        bge         10f
12596+
12597+@ Down of Horizontal - works down left
12598+        mov         r10, #4             @ 4 bands of 4 rows = 16 rows
12599+        mov         r1, r2
12600+1:
12601+        bl          patch_h_down_4x4_10
12602+        bl          patch_h_down_4x4_10_continue
12603+        bl          patch_h_down_4x4_10_continue
12604+        bl          patch_h_down_4x4_10_continue
12605+
12606+        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
12607+        add         r1, r1, #4*2
12608+        mov         r6, r4               @ reset angle fraction for next band
12609+        sub         r0, #32
12610+        subs        r10, #1
12611+        add         r0, r0, r3, lsl #2
12612+        bne         1b
12613+
12614+        pop         {r4-r11, pc}
12615+
12616+@ Up of Horizontal - works down up
12617+10:
12618+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
12619+        mov         r10, #-128          @ inv-angle accumulator init (8.8 fixed point - confirm in patch_h_up_4x4_10)
12620+        vmov.i8     d6, #1<<2           @ band counter in d6: done when the set bit shifts out (4 bands)
12621+1:
12622+        push        {r2, r10}
12623+        bl          patch_h_up_4x4_10
12624+        bl          patch_h_up_4x4_10_continue
12625+        bl          patch_h_up_4x4_10_continue
12626+        bl          patch_h_up_4x4_10_continue
12627+        pop         {r2, r10}
12628+
12629+        vmov        r8, s12             @ s12 lives in d6: fetch band counter
12630+        sub         r0, #32
12631+        add         r2, #8
12632+        add         r0, r0, r3, lsl #2
12633+        sub         r10, r10, r7, lsl #2 @ rewind accumulator by the 4 rows just done
12634+        vshr.u8     d6, #1
12635+        teq         r8, #0
12636+        bne         1b
12637+
12638+        pop         {r4-r11, pc}
12639+
12640+@ Left of vertical - works down left
12641+18:
12642+        vld1.16     {q0-q1}, [r1]       @ 16 top samples
12643+        sub         r9, r2, #2
12644+        rsb         r12, r6, #32
12645+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
12646+        mov         r8, #-128           @ inv-angle accumulator (8.8 fixed point)
12647+        vdup.16     q9, r6
12648+        vdup.16     q10, r12
12649+        mov         r5, #16
12650+1:
12651+        vld1.16     {d17[3]}, [r9]      @ new leading sample from the left column
12652+        add         r8, r7
12653+        vmov        q2, q0
12654+        vmov        q3, q1
12655+        asr         r9, r8, #8          @ integer part selects the left sample
12656+        vext.16     q1, q0, q1, #7
12657+        add         r9, r2, r9, lsl #1
12658+        vext.16     q0, q8, q0, #7
12659+2:
12660+        vmul.u16    q11, q2, q10
12661+        subs        r12, r4             @ carry clears when a fresh source sample is needed
12662+        vmla.u16    q11, q0, q9
12663+        it          cc
12664+        addcc       r12, #32
12665+        vmul.u16    q12, q3, q10
12666+        rsb         r6, r12, #32
12667+        vmla.u16    q12, q1, q9
12668+        sub         r5, #1
12669+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
12670+        vdup.16     q9, r6
12671+        vdup.16     q10, r12
12672+        vrshr.u16   q11, q11, #5        @ (a*(32-f) + b*f + 16) >> 5
12673+        vrshr.u16   q12, q12, #5
12674+        vst1.16     {q11-q12}, [r0], r3
12675+        bhi         2b
12676+        bne         1b
12677+
12678+        pop         {r4-r11, pc}
12679+
12680+@ Right of vertical - works along top - left unused
12681+26:
12682+        add         r5, r1, #32
12683+        vld1.16     {q0-q1}, [r1]!      @ 16 top samples
12684+        rsb         r12, r6, #32
12685+        vld1.16     {d16[0]}, [r5]      @ 17th sample for the shifted window
12686+        mov         r5, #16
12687+        vdup.16     q9, r6
12688+        vdup.16     q10, r12
12689+1:
12690+        vmov        q2, q0
12691+        add         r1, #2
12692+        vmov        q3, q1
12693+        vext.16     q0, q0, q1, #1
12694+        vext.16     q1, q1, q8, #1
12695+2:
12696+        vmul.u16    q11, q0, q9
12697+        subs        r12, r4             @ carry clears when the window must advance
12698+        vmla.u16    q11, q2, q10
12699+        it          cc
12700+        addcc       r12, #32
12701+        vmul.u16    q12, q1, q9
12702+        rsb         r6, r12, #32
12703+        vmla.u16    q12, q3, q10
12704+        sub         r5, #1
12705+        vld1.16     {d16[0]}, [r1]
12706+        teq         r5, #0
12707+        vdup.16     q9, r6
12708+        vdup.16     q10, r12
12709+        vrshr.u16   q11, q11, #5
12710+        vrshr.u16   q12, q12, #5
12711+        vst1.16     {q11-q12}, [r0], r3
12712+        bhi         2b
12713+        bne         1b
12714+
12715+        pop         {r4-r11, pc}
12716+
12717+endfunc
12718+
12719+
12720+@ ff_hevc_rpi_pred_angular_32_neon_10 - 32x32 angular intra prediction, 10-bit samples
12721+@       uint8_t *_src,          [r0]
12722+@       const uint8_t *_top,    [r1]
12723+@       const uint8_t *_left,   [r2]
12724+@       ptrdiff_t stride        [r3]
12725+@       unsigned int mode       [sp, #0]  2..34
12726+
12727+function ff_hevc_rpi_pred_angular_32_neon_10, export=1
12728+        ldr         r12, [sp]           @ r12 = mode (2..34)
12729+        push        {r4-r11, lr}
12730+        ADRT        r4, angle_2 - 2     @ r4 -> angle table, biased so mode indexes directly
12731+        ADRT        r7, inv_angle - 11*2 @ r7 -> inverse-angle table, biased for modes 11..25
12732+        add         r7, r7, r12, lsl #1
12733+        lsl         r3, #1              @ stride: samples -> bytes (16-bit pixels)
12734+        vpush       {d8}                @ d8/d9 (q4) are callee-saved; d9 is stashed in d0/d1 on the paths that use q4
12735+        ldrsb       r6, [r4, r12]       @ r6 = initial angle fraction
12736+        cmp         r12, #26
12737+        ldrsb       r4, [r4, r12]       @ r4 = signed angle step
12738+        bge         26f
12739+        cmp         r12, #18
12740+        bge         18f
12741+        cmp         r12, #10
12742+        bge         10f
12743+
12744+@ Down of Horizontal - works down left
12745+        add         sp, #8              @ drop the d8 spill - q4 unused on this path
12746+        mov         r10, #8             @ 8 bands of 4 rows = 32 rows
12747+        mov         r1, r2
12748+1:
12749+        bl          patch_h_down_4x4_10
12750+        bl          patch_h_down_4x4_10_continue
12751+        bl          patch_h_down_4x4_10_continue
12752+        bl          patch_h_down_4x4_10_continue
12753+        bl          patch_h_down_4x4_10_continue
12754+        bl          patch_h_down_4x4_10_continue
12755+        bl          patch_h_down_4x4_10_continue
12756+        bl          patch_h_down_4x4_10_continue
12757+
12758+        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
12759+        add         r1, r1, #4*2
12760+        mov         r6, r4               @ reset angle fraction for next band
12761+        sub         r0, #64
12762+        subs        r10, #1
12763+        add         r0, r0, r3, lsl #2
12764+        bne         1b
12765+
12766+        pop         {r4-r11, pc}
12767+
12768+@ Up of Horizontal - works down up
12769+10:
12770+        add         sp, #8              @ drop the d8 spill - q4 unused on this path
12771+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
12772+        mov         r10, #-128          @ inv-angle accumulator init (8.8 fixed point - confirm in patch_h_up_4x4_10)
12773+        vmov.i8     d6, #1<<6           @ band counter in d6: done when the set bit shifts out (8 bands)
12774+1:
12775+        push        {r2, r10}
12776+        bl          patch_h_up_4x4_10
12777+        bl          patch_h_up_4x4_10_continue
12778+        bl          patch_h_up_4x4_10_continue
12779+        bl          patch_h_up_4x4_10_continue
12780+        bl          patch_h_up_4x4_10_continue
12781+        bl          patch_h_up_4x4_10_continue
12782+        bl          patch_h_up_4x4_10_continue
12783+        bl          patch_h_up_4x4_10_continue
12784+        pop         {r2, r10}
12785+
12786+        vmov        r8, s12             @ s12 lives in d6: fetch band counter
12787+        sub         r0, #64
12788+        add         r2, #8
12789+        add         r0, r0, r3, lsl #2
12790+        sub         r10, r10, r7, lsl #2 @ rewind accumulator by the 4 rows just done
12791+        vshr.u8     d6, #1
12792+        teq         r8, #0
12793+        bne         1b
12794+
12795+        pop         {r4-r11, pc}
12796+
12797+@ Left of vertical - works down left
12798+18:
12799+        add         r5, r1, #32
12800+        vld1.16     {q1-q2}, [r1]       @ 32 top samples in q1-q4
12801+        rsb         r12, r6, r6, lsl #16 @ pack: r12 = (frac<<16) - frac
12802+        vld1.16     {q3-q4}, [r5]
12803+        sub         r9, r2, #2
12804+        rsb         r4, r12, #0         @ packed step: low half +frac, high half -frac
12805+        rsb         r12, r12, #32 << 16 @ r12 = ((32-frac)<<16) | frac
12806+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
12807+        mov         r8, #-128           @ inv-angle accumulator (8.8 fixed point)
12808+        vmov        d0, d9              @ save callee-saved d9 (vpush only spilled d8)
12809+        vmov        s2, r12             @ d1 lanes 0/1 = frac / 32-frac weights
12810+        add         r10, r0, #32        @ r10 writes the right half of each row
12811+        mov         r5, #32
12812+1:
12813+        vld1.16     {d1[3]}, [r9]       @ new leading sample from the left column
12814+        add         r8, r7
12815+        vmov        q11, q4
12816+        vmov        q10, q3
12817+        asr         r9, r8, #8          @ integer part selects the left sample
12818+        vmov        q9, q2
12819+        add         r9, r2, r9, lsl #1
12820+        vmov        q8, q1
12821+        vext.16     q4, q3, q4, #7
12822+        vext.16     q3, q2, q3, #7
12823+        vext.16     q2, q1, q2, #7
12824+        vext.16     q1, q0, q1, #7
12825+2:
12826+        vmul.u16    q12, q8, d1[1]
12827+        adds        r12, r4             @ update both packed halves at once; cc => frac wrapped
12828+        vmla.u16    q12, q1, d1[0]
12829+        it          cc
12830+        addcc       r12, #32 << 16      @ rewrap high half (32-frac)
12831+        vmul.u16    q13, q9, d1[1]
12832+        it          cc
12833+        subcc       r12, #32            @ rewrap low half (frac)
12834+        vmla.u16    q13, q2, d1[0]
12835+        sub         r5, #1
12836+        vmul.u16    q14, q10, d1[1]
12837+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
12838+        vmla.u16    q14, q3, d1[0]
12839+        vmul.u16    q15, q11, d1[1]
12840+        vmla.u16    q15, q4, d1[0]
12841+        vmov        s2, r12             @ refresh weight lanes
12842+        vrshr.u16   q12, q12, #5        @ (a*(32-f) + b*f + 16) >> 5
12843+        vrshr.u16   q13, q13, #5
12844+        vrshr.u16   q14, q14, #5
12845+        vrshr.u16   q15, q15, #5
12846+        vst1.16     {q12-q13}, [r0], r3
12847+        vst1.16     {q14-q15}, [r10], r3
12848+        bhi         2b
12849+        bne         1b
12850+
12851+        vpop        {d8}
12852+        vmov        d9, d0              @ restore d9 from its stash
12853+        pop         {r4-r11, pc}
12854+
12855+@ Right of vertical - works along top - left unused
12856+26:
12857+        add         r5, r1, #32
12858+        vld1.16     {q1-q2}, [r1]       @ 32 top samples in q1-q4
12859+        rsb         r12, r6, r6, lsl #16 @ pack: r12 = (frac<<16) - frac
12860+        vld1.16     {q3-q4}, [r5]
12861+        add         r1, r1, #64
12862+        rsb         r4, r12, #0         @ packed step: low half +frac, high half -frac
12863+        rsb         r12, r12, #32 << 16 @ r12 = ((32-frac)<<16) | frac
12864+        vmov        d1, d9              @ save callee-saved d9 (vpush only spilled d8)
12865+        vmov        s1, r12             @ d0 lanes 2/3 = frac / 32-frac weights
12866+        add         r10, r0, #32        @ r10 writes the right half of each row
12867+        mov         r5, #32
12868+1:
12869+        vld1.16     {d0[0]}, [r1]!      @ pull in the next top sample
12870+        vmov        q8, q1
12871+        vmov        q9, q2
12872+        vmov        q10, q3
12873+        vmov        q11, q4
12874+        vext.16     q1, q1, q2, #1
12875+        vext.16     q2, q2, q3, #1
12876+        vext.16     q3, q3, q4, #1
12877+        vext.16     q4, q4, q0, #1
12878+2:
12879+        vmul.u16    q12, q1, d0[2]
12880+        adds        r12, r4             @ update both packed halves at once; cc => frac wrapped
12881+        vmla.u16    q12, q8, d0[3]
12882+        it          cc
12883+        addcc       r12, #32 << 16
12884+        vmul.u16    q13, q2, d0[2]
12885+        it          cc
12886+        subcc       r12, #32
12887+        vmla.u16    q13, q9, d0[3]
12888+        sub         r5, #1
12889+        vmul.u16    q14, q3, d0[2]
12890+        teq         r5, #0
12891+        vmla.u16    q14, q10, d0[3]
12892+        vmul.u16    q15, q4, d0[2]
12893+        vmla.u16    q15, q11, d0[3]
12894+        vmov        s1, r12             @ refresh weight lanes
12895+        vrshr.u16   q12, q12, #5
12896+        vrshr.u16   q13, q13, #5
12897+        vrshr.u16   q14, q14, #5
12898+        vrshr.u16   q15, q15, #5
12899+        vst1.16     {q12-q13}, [r0], r3
12900+        vst1.16     {q14-q15}, [r10], r3
12901+        bhi         2b
12902+        bne         1b
12903+
12904+        vpop        {d8}
12905+        vmov        d9, d1              @ restore d9 from its stash
12906+        pop         {r4-r11, pc}
12907+
12908+endfunc
12909+
12910+
12911+
12912+@ Generate 4x4 chroma patch
12913+@
12914+@ In (const)
12915+@ r1   Up ptr (_up only)
12916+@ r3   Out stride
12917+@ r4   Angle add
12918+@ r7   Inv angle (_up only)
12919+@
12920+@ In/Out (updated)
12921+@ r0   Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
12922+@ r2   Left ptr - updated
12923+@ r6   Angle frac (init to r4 + 32)
12924+@ r8   Inv angle accumulator
12925+@ q2   Cur Line - load before 1st call for down - set by _up
12926+@ q8   Cur Line - load before 1st call for up   - set by _down
12927+@
12928+@ Temps
12929+@ r5   Loop counter
12930+@ r12
12931+@ d0, q1, q12-q15
12932+
12933+patch_h_down_c_4x4_10:
12934+        vld1.16     {q12}, [r2]!        @ 4 interleaved chroma pairs from the left column
12935+        rsb         r12, r6, #32        @ r12 = 32 - frac
12936+        vdup.16     q2, r6
12937+        vdup.16     q3, r12
12938+        mov         r5, #4
12939+1:
12940+        vmov        q13, q12
12941+        vext.16     q12, q12, q12, #2   @ advance by one chroma pair (2 x u16)
12942+        vld1.32     {d25[1]}, [r2]!     @ pull in the next pair
12943+patch_h_down_c_4x4_10_continue:
12944+2:
12945+        vmov        q8, q9              @ age the 4-deep row pipeline q8<-q9<-q10<-q11
12946+        subs        r12, r4             @ carry clears when a fresh source pair is needed
12947+        vmul.u16    q0, q13, q3
12948+        it          cc
12949+        addcc       r12, #32
12950+        vmla.u16    q0, q12, q2
12951+        rsb         r6, r12, #32
12952+        vmov        q9, q10
12953+        sub         r5, #1
12954+        vmov        q10, q11
12955+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
12956+        vdup.16     q2, r6
12957+        vdup.16     q3, r12
12958+        vrshr.u16   q11, q0, #5         @ (a*(32-f) + b*f + 16) >> 5
12959+        bhi         2b
12960+        bne         1b
12961+
12962+        bcs         3f                  @ carry set: no reload needed before the store
12963+        vmov        q13, q12
12964+        vext.16     q12, q12, q12, #2
12965+        vld1.32     {d25[1]}, [r2]!
12966+3:
12967+
12968+store_tran_c_4x4_10:                    @ store q8-q11 as a 4x4 patch of 32-bit chroma pairs, transposed via vzip/vst2
12969+T       add         r6, r0, r3
12970+        vzip.32     q8, q10
12971+A       add         r6, r0, r3          @ A/T: ARM vs Thumb encodings of the same sequence (asm.S macros)
12972+T       lsl         r3, #1
12973+        vzip.32     q9, q11
12974+A       add         r5, r0, r3, lsl #1
12975+T       add         r5, r0, r3
12976+        vst2.32     {d16,d18}, [r0]!    @ r0 advances to the next patch column on exit
12977+A       lsl         r3, #1
12978+        vst2.32     {d17,d19}, [r6], r3
12979+        asr         r3, #1              @ undo the temporary stride doubling
12980+        vst2.32     {d20,d22}, [r5]
12981+        mov         r5, #4              @ re-arm loop counter for a _continue call
12982+        vst2.32     {d21,d23}, [r6]
12983+        bx          lr
12984+
12985+patch_h_up_c_4x4_10:                    @ 4x4 chroma patch for up-of-horizontal modes; falls into store_tran_c_4x4_10
12986+        vld1.16     {q1}, [r2]          @ 4 interleaved chroma pairs from the left column
12987+        rsb         r12, r6, #32        @ r12 = 32 - frac
12988+        vdup.16     q2, r6
12989+        vdup.16     q3, r12
12990+        mov         r5, #4
12991+1:
12992+        adds        r8, r7              @ step inv-angle acc; mi => sample from left, pl => from up row
12993+        vmov        q12, q1
12994+        it          mi
12995+        ldrmi       r6, [r2, #-4]!      @ step left pointer back one chroma pair
12996+        vext.16     q1, q1, q1, #6
12997+        itt         pl
12998+        asrpl       r6, r8, #8          @ integer part indexes the up row
12999+        ldrpl       r6, [r1, r6, lsl #2] @ 4 bytes per chroma pair
13000+        vmov        s4, r6              @ inject the fetched pair as the new leading element
13001+patch_h_up_c_4x4_10_continue:
13002+2:
13003+        vmov        q8, q9              @ age the 4-deep row pipeline q8<-q9<-q10<-q11
13004+        subs        r12, r4             @ carry clears when a fresh source pair is needed
13005+        vmul.u16    q0, q12, q3
13006+        it          cc
13007+        addcc       r12, #32
13008+        vmla.u16    q0, q1, q2
13009+        rsb         r6, r12, #32
13010+        vmov        q9, q10
13011+        sub         r5, #1
13012+        vmov        q10, q11
13013+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
13014+        vdup.16     q2, r6
13015+        vdup.16     q3, r12
13016+        vrshr.u16   q11, q0, #5         @ (a*(32-f) + b*f + 16) >> 5
13017+        bhi         2b
13018+        bne         1b
13019+
13020+        bcs         store_tran_c_4x4_10 @ carry set: no fresh pair needed before the final store
13021+        adds        r8, r7
13022+        vmov        q12, q1
13023+        it          mi
13024+        ldrmi       r6, [r2, #-4]!
13025+        vext.16     q1, q1, q1, #6
13026+        itt         pl
13027+        asrpl       r6, r8, #8
13028+        ldrpl       r6, [r1, r6, lsl #2]
13029+        vmov        s4, r6
13030+        b           store_tran_c_4x4_10
13031+
13032+
13033+@ ff_hevc_rpi_pred_angular_c_4_neon_10 - 4x4 chroma angular intra prediction, 10-bit interleaved pairs
13034+@       uint8_t *_src,          [r0]
13035+@       const uint8_t *_top,    [r1]
13036+@       const uint8_t *_left,   [r2]
13037+@       ptrdiff_t stride        [r3]
13038+@       unsigned int mode       [sp, #0]  2..34
13039+
13040+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
13041+        ldr         r12, [sp]           @ r12 = mode (2..34)
13042+        push        {r4-r8, lr}
13043+        ADRT        r4, angle_2 - 2     @ r4 -> angle table, biased so mode indexes directly
13044+        ADRT        r7, inv_angle - 11*2 @ r7 -> inverse-angle table, biased for modes 11..25
13045+        add         r7, r7, r12, lsl #1
13046+        lsl         r3, #2              @ stride: chroma pairs -> bytes (2 x u16)
13047+        ldrsb       r6, [r4, r12]       @ r6 = initial angle fraction
13048+        cmp         r12, #26
13049+        ldrsb       r4, [r4, r12]       @ r4 = signed angle step
13050+        bge         26f
13051+        cmp         r12, #18
13052+        bge         18f
13053+        cmp         r12, #10
13054+        bge         10f
13055+
13056+@ Down of Horizontal - works down left
13057+        bl          patch_h_down_c_4x4_10
13058+        pop         {r4-r8, pc}
13059+
13060+@ Up of Horizontal - works down up
13061+10:
13062+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
13063+        mov         r8, #-128           @ inv-angle accumulator (8.8 fixed point)
13064+        sub         r8, r7              @ start one step back
13065+        bl          patch_h_up_c_4x4_10
13066+        pop         {r4-r8, pc}
13067+
13068+@ Left of vertical - works down left
13069+18:
13070+        vld1.16     {q9}, [r1]          @ 4 top chroma pairs
13071+        sub         r1, r2, #4
13072+        rsb         r12, r6, #32        @ r12 = 32 - frac
13073+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
13074+        vdup.16     q2, r6
13075+        vext.16     q8, q9, q9, #6      @ shift by one chroma pair
13076+        sub         r8, r7, #128        @ r8 = inv-angle accumulator (8.8)
13077+        vld1.32     {d16[0]}, [r1]      @ inject left[-1] pair as the new leading element
13078+        vdup.16     q3, r12
13079+        mov         r5, #3
13080+1:
13081+        vmul.u16    q0, q9, q3
13082+        subs        r12, r4             @ carry clears when a fresh source pair is needed
13083+        vmla.u16    q0, q8, q2
13084+        ittt        cc
13085+        asrcc       r1, r8, #8          @ integer part selects the left pair
13086+        addcc       r12, #32
13087+        addcc       r1, r2, r1, lsl #2
13088+        vext.16     q10, q8, q8, #6
13089+        rsb         r6, r12, #32
13090+        vmov        q11, q8
13091+        sub         r5, #1
13092+        vrshr.u16   q0, q0, #5          @ (a*(32-f) + b*f + 16) >> 5
13093+        it          cc
13094+        addcc       r8, r7
13095+        vld1.32     {d20[0]}, [r1]
13096+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
13097+        vdup.16     q2, r6
13098+        vdup.16     q3, r12
13099+        vst1.16     {q0}, [r0], r3
13100+        bhi         1b
13101+        beq         4f
13102+2:
13103+        vmul.u16    q0, q11, q3
13104+        subs        r12, r4
13105+        vmla.u16    q0, q10, q2
13106+        ittt        cc
13107+        asrcc       r1, r8, #8
13108+        addcc       r12, #32
13109+        addcc       r1, r2, r1, lsl #2
13110+        vext.16     q8, q10, q10, #6
13111+        rsb         r6, r12, #32
13112+        vmov        q9, q10
13113+        sub         r5, #1
13114+        vrshr.u16   q0, q0, #5
13115+        it          cc
13116+        addcc       r8, r7
13117+        vld1.32     {d16[0]}, [r1]
13118+        teq         r5, #0
13119+        vdup.16     q2, r6
13120+        vdup.16     q3, r12
13121+        vst1.16     {q0}, [r0], r3
13122+        bhi         2b
13123+        bne         1b
13124+        bcc         5f
13125+3:
13126+        vmul.u16    q0, q11, q3         @ final row, q10/q11 operand set
13127+        vmla.u16    q0, q10, q2
13128+        vrshr.u16   q0, q0, #5
13129+        vst1.16     {q0}, [r0]
13130+
13131+        pop         {r4-r8, pc}
13132+4:
13133+        bcc         3b
13134+5:
13135+        vmul.u16    q0, q9, q3          @ final row, q8/q9 operand set
13136+        vmla.u16    q0, q8, q2
13137+        vrshr.u16   q0, q0, #5
13138+        vst1.16     {q0}, [r0]
13139+
13140+        pop         {r4-r8, pc}
13141+
13142+@ Right of vertical - works along top - left unused
13143+26:
13144+        vld1.16     {q9}, [r1]!         @ 4 top chroma pairs, r1 advances
13145+        rsb         r12, r6, #32
13146+        vdup.16     q2, r6
13147+        vdup.16     q3, r12
13148+        vext.16     q8, q9, q9, #2      @ shift by one chroma pair
13149+        vld1.32     {d17[1]}, [r1]!     @ pull in the next pair
13150+        mov         r5, #3
13151+1:
13152+        vmul.u16    q0, q8, q2
13153+        subs        r12, r4             @ carry clears when the window must advance
13154+        vmla.u16    q0, q9, q3
13155+        it          cc
13156+        addcc       r12, #32
13157+        vext.16     q10, q8, q8, #2
13158+        rsb         r6, r12, #32
13159+        vld1.32     {d21[1]}, [r1]
13160+        sub         r5, #1
13161+        vmov        q11, q8
13162+        teq         r5, #0
13163+        vrshr.u16   q0, q0, #5
13164+        it          cc
13165+        addcc       r1, #4              @ consume one pair only on window advance
13166+        vdup.16     q2, r6
13167+        vdup.16     q3, r12
13168+        vst1.16     {q0}, [r0], r3
13169+        bhi         1b
13170+        beq         4f
13171+2:
13172+        vmul.u16    q0, q10, q2
13173+        subs        r12, r4
13174+        vmla.u16    q0, q11, q3
13175+        it          cc
13176+        addcc       r12, #32
13177+        vext.16     q8, q10, q10, #2
13178+        rsb         r6, r12, #32
13179+        vld1.32     {d17[1]}, [r1]
13180+        sub         r5, #1
13181+        vmov        q9, q10
13182+        teq         r5, #0
13183+        vrshr.u16   q0, q0, #5
13184+        it          cc
13185+        addcc       r1, #4
13186+        vdup.16     q2, r6
13187+        vdup.16     q3, r12
13188+        vst1.16     {q0}, [r0], r3
13189+        bhi         2b
13190+        bne         1b
13191+        bcc         5f
13192+3:
13193+        vmul.u16    q0, q10, q2
13194+        vmla.u16    q0, q11, q3
13195+        vrshr.u16   q0, q0, #5
13196+        vst1.16     {q0}, [r0]
13197+
13198+        pop         {r4-r8, pc}
13199+4:
13200+        bcc         3b
13201+5:
13202+        vmul.u16    q0, q8, q2
13203+        vmla.u16    q0, q9, q3
13204+        vrshr.u16   q0, q0, #5
13205+        vst1.16     {q0}, [r0]
13206+
13207+        pop         {r4-r8, pc}
13208+
13209+endfunc
13210+
13211+
13212+@ ff_hevc_rpi_pred_angular_c_8_neon_10 - 8x8 chroma angular intra prediction, 10-bit interleaved pairs
13213+@       uint8_t *_src,          [r0]
13214+@       const uint8_t *_top,    [r1]
13215+@       const uint8_t *_left,   [r2]
13216+@       ptrdiff_t stride        [r3]
13217+@       unsigned int mode       [sp, #0]  2..34
13218+
13219+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
13220+        ldr         r12, [sp]           @ r12 = mode (2..34)
13221+        push        {r4-r8, lr}
13222+        ADRT        r4, angle_2 - 2     @ r4 -> angle table, biased so mode indexes directly
13223+        ADRT        r7, inv_angle - 11*2 @ r7 -> inverse-angle table, biased for modes 11..25
13224+        add         r7, r7, r12, lsl #1
13225+        lsl         r3, #2              @ stride: chroma pairs -> bytes (2 x u16)
13226+        ldrsb       r6, [r4, r12]       @ r6 = initial angle fraction
13227+        cmp         r12, #26
13228+        ldrsb       r4, [r4, r12]       @ r4 = signed angle step
13229+        bge         26f
13230+        cmp         r12, #18
13231+        bge         18f
13232+        cmp         r12, #10
13233+        bge         10f
13234+
13235+@ Down of Horizontal - works down left
13236+        mov         r1,  r2             @ save r2 - r1 unused by patch_down
13237+
13238+        bl          patch_h_down_c_4x4_10
13239+        bl          patch_h_down_c_4x4_10_continue
13240+
13241+        add         r2, r1, #4*4        @ restore r2, but 4 rows further down left
13242+        sub         r0, #32
13243+        mov         r6, r4              @ reset angle fraction for second 4-row band
13244+        add         r0, r0, r3, lsl #2
13245+
13246+        bl          patch_h_down_c_4x4_10
13247+        bl          patch_h_down_c_4x4_10_continue
13248+
13249+        pop         {r4-r8, pc}
13250+
13251+@ Up of Horizontal - works down up
13252+10:
13253+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
13254+        mov         r8, #-128           @ inv-angle accumulator (8.8 fixed point)
13255+        sub         r8, r7              @ start one step back
13256+
13257+        push        {r2, r8}
13258+        bl          patch_h_up_c_4x4_10
13259+        bl          patch_h_up_c_4x4_10_continue
13260+        pop         {r2, r8}
13261+
13262+        sub         r0, #32
13263+        mov         r6, r4              @ reset angle fraction for second band
13264+        add         r2, #16
13265+        sub         r8, r8, r7, lsl #2  @ rewind accumulator by the 4 rows just done
13266+        add         r0, r0, r3, lsl #2
13267+
13268+        bl          patch_h_up_c_4x4_10
13269+        bl          patch_h_up_c_4x4_10_continue
13270+
13271+        pop         {r4-r8, pc}
13272+
13273+@ Left of vertical - works down left
13274+18:
13275+        vld1.16     {q0-q1}, [r1]       @ 8 top chroma pairs
13276+        sub         r9, r2, #4
13277+        rsb         r12, r6, #32        @ r12 = 32 - frac
13278+        ldrh        r7, [r7]            @ r7 = inv_angle[mode]
13279+        mov         r8, #-128           @ inv-angle accumulator (8.8 fixed point)
13280+        vdup.16     q9, r6
13281+        vdup.16     q10, r12
13282+        mov         r5, #8
13283+1:
13284+        vld1.32     {d17[1]}, [r9]      @ new leading pair from the left column
13285+        add         r8, r7
13286+        vmov        q2, q0
13287+        vmov        q3, q1
13288+        asr         r9, r8, #8          @ integer part selects the left pair
13289+        vext.16     q1, q0, q1, #6
13290+        add         r9, r2, r9, lsl #2
13291+        vext.16     q0, q8, q0, #6
13292+2:
13293+        vmul.u16    q11, q2, q10
13294+        subs        r12, r4             @ carry clears when a fresh source pair is needed
13295+        vmla.u16    q11, q0, q9
13296+        it          cc
13297+        addcc       r12, #32
13298+        vmul.u16    q12, q3, q10
13299+        rsb         r6, r12, #32
13300+        vmla.u16    q12, q1, q9
13301+        sub         r5, #1
13302+        teq         r5, #0              @ TEQ leaves C intact: bhi == rows left && no reload
13303+        vdup.16     q9, r6
13304+        vdup.16     q10, r12
13305+        vrshr.u16   q11, q11, #5        @ (a*(32-f) + b*f + 16) >> 5
13306+        vrshr.u16   q12, q12, #5
13307+        vst1.16     {q11-q12}, [r0], r3
13308+        bhi         2b
13309+        bne         1b
13310+
13311+        pop         {r4-r8, pc}
13312+
13313+@ Right of vertical - works along top - left unused
13314+26:
13315+        add         r5, r1, #32
13316+        vld1.16     {q0-q1}, [r1]!      @ 8 top chroma pairs
13317+        rsb         r12, r6, #32
13318+        vld1.32     {d16[0]}, [r5]      @ 9th pair for the shifted window
13319+        mov         r5, #8
13320+        vdup.16     q9, r6
13321+        vdup.16     q10, r12
13322+1:
13323+        vmov        q2, q0
13324+        add         r1, #4
13325+        vmov        q3, q1
13326+        vext.16     q0, q0, q1, #2
13327+        vext.16     q1, q1, q8, #2
13328+2:
13329+        vmul.u16    q11, q0, q9
13330+        subs        r12, r4             @ carry clears when the window must advance
13331+        vmla.u16    q11, q2, q10
13332+        it          cc
13333+        addcc       r12, #32
13334+        vmul.u16    q12, q1, q9
13335+        rsb         r6, r12, #32
13336+        vmla.u16    q12, q3, q10
13337+        sub         r5, #1
13338+        vld1.32     {d16[0]}, [r1]
13339+        teq         r5, #0
13340+        vdup.16     q9, r6
13341+        vdup.16     q10, r12
13342+        vrshr.u16   q11, q11, #5
13343+        vrshr.u16   q12, q12, #5
13344+        vst1.16     {q11-q12}, [r0], r3
13345+        bhi         2b
13346+        bne         1b
13347+
13348+        pop         {r4-r8, pc}
13349+
13350+endfunc
13351+
13352+
13353+@ ff_hevc_rpi_pred_angular_c_16_neon_10
13354+@       uint8_t *_src,          [r0]
13355+@       const uint8_t *_top,    [r1]
13356+@       const uint8_t *_left,   [r2]
13357+@       ptrdiff_t stride        [r3]
13358+@       unsigned int mode       [sp, #0]  2..34
13359+
13360+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
13361+        ldr         r12, [sp]
13362+        push        {r4-r10, lr}
13363+        ADRT        r4, angle_2 - 2
13364+        ADRT        r7, inv_angle - 11*2
13365+        add         r7, r7, r12, lsl #1
13366+        lsl         r3, #2
13367+        vpush       {d8}
13368+        ldrsb       r6, [r4, r12]
13369+        cmp         r12, #26
13370+        ldrsb       r4, [r4, r12]
13371+        bge         26f
13372+        cmp         r12, #18
13373+        bge         18f
13374+        cmp         r12, #10
13375+        bge         10f
13376+
13377+@ Down of Horizontal - works down left
13378+        add         sp, #8
13379+        mov         r10, #4
13380+        mov         r1, r2
13381+1:
13382+        bl          patch_h_down_c_4x4_10
13383+        bl          patch_h_down_c_4x4_10_continue
13384+        bl          patch_h_down_c_4x4_10_continue
13385+        bl          patch_h_down_c_4x4_10_continue
13386+
13387+        add         r2, r1, #4*4         @ restore r2, but 4 rows further down left
13388+        add         r1, r1, #4*4
13389+        mov         r6, r4
13390+        sub         r0, #64
13391+        subs        r10, #1
13392+        add         r0, r0, r3, lsl #2
13393+        bne         1b
13394+
13395+        pop         {r4-r10, pc}
13396+
13397+@ Up of Horizontal - works down up
13398+10:
13399+        add         sp, #8
13400+        mov         r10, #4
13401+        ldrh        r7, [r7]
13402+        mov         r8, #-128
13403+        sub         r8, r7
13404+2:
13405+        push        {r2, r8}
13406+        bl          patch_h_up_c_4x4_10
13407+        bl          patch_h_up_c_4x4_10_continue
13408+        bl          patch_h_up_c_4x4_10_continue
13409+        bl          patch_h_up_c_4x4_10_continue
13410+        pop         {r2, r8}
13411+
13412+        sub         r0, #64
13413+        mov         r6, r4
13414+        add         r2, #16
13415+        sub         r8, r8, r7, lsl #2
13416+        add         r0, r0, r3, lsl #2
13417+        subs        r10, #1
13418+        bne         2b
13419+
13420+        pop         {r4-r10, pc}
13421+
13422+@ Left of vertical - works down left
13423+18:
13424+        add         r5, r1, #32
13425+        vld1.16     {q1-q2}, [r1]
13426+        rsb         r12, r6, r6, lsl #16
13427+        vld1.16     {q3-q4}, [r5]
13428+        sub         r9, r2, #4
13429+        rsb         r4, r12, #0
13430+        rsb         r12, r12, #32 << 16
13431+        ldrh        r7, [r7]
13432+        mov         r8, #-128
13433+        vmov        d0, d9
13434+        vmov        s2, r12
13435+        add         r10, r0, #32
13436+        mov         r5, #16
13437+1:
13438+        vld1.32     {d1[1]}, [r9]
13439+        add         r8, r7
13440+        vmov        q11, q4
13441+        vmov        q10, q3
13442+        asr         r9, r8, #8
13443+        vmov        q9, q2
13444+        add         r9, r2, r9, lsl #2
13445+        vmov        q8, q1
13446+        vext.16     q4, q3, q4, #6
13447+        vext.16     q3, q2, q3, #6
13448+        vext.16     q2, q1, q2, #6
13449+        vext.16     q1, q0, q1, #6
13450+2:
13451+        vmul.u16    q12, q8, d1[1]
13452+        adds        r12, r4
13453+        vmla.u16    q12, q1, d1[0]
13454+        it          cc
13455+        addcc       r12, #32 << 16
13456+        vmul.u16    q13, q9, d1[1]
13457+        it          cc
13458+        subcc       r12, #32
13459+        vmla.u16    q13, q2, d1[0]
13460+        sub         r5, #1
13461+        vmul.u16    q14, q10, d1[1]
13462+        teq         r5, #0
13463+        vmla.u16    q14, q3, d1[0]
13464+        vmul.u16    q15, q11, d1[1]
13465+        vmla.u16    q15, q4, d1[0]
13466+        vmov        s2, r12
13467+        vrshr.u16   q12, q12, #5
13468+        vrshr.u16   q13, q13, #5
13469+        vrshr.u16   q14, q14, #5
13470+        vrshr.u16   q15, q15, #5
13471+        vst1.16     {q12-q13}, [r0], r3
13472+        vst1.16     {q14-q15}, [r10], r3
13473+        bhi         2b
13474+        bne         1b
13475+
13476+        vpop        {d8}
13477+        vmov        d9, d0
13478+        pop         {r4-r10, pc}
13479+
13480+@ Right of vertical - works along top - left unused
13481+26:
13482+        add         r5, r1, #32
13483+        vld1.16     {q1-q2}, [r1]
13484+        rsb         r12, r6, r6, lsl #16
13485+        vld1.16     {q3-q4}, [r5]
13486+        add         r1, r1, #64
13487+        rsb         r4, r12, #0
13488+        rsb         r12, r12, #32 << 16
13489+        vmov        d1, d9
13490+        vmov        s1, r12
13491+        add         r10, r0, #32
13492+        mov         r5, #16
13493+1:
13494+        vld1.32     {d0[0]}, [r1]!
13495+        vmov        q8, q1
13496+        vmov        q9, q2
13497+        vmov        q10, q3
13498+        vmov        q11, q4
13499+        vext.16     q1, q1, q2, #2
13500+        vext.16     q2, q2, q3, #2
13501+        vext.16     q3, q3, q4, #2
13502+        vext.16     q4, q4, q0, #2
13503+2:
13504+        vmul.u16    q12, q1, d0[2]
13505+        adds        r12, r4
13506+        vmla.u16    q12, q8, d0[3]
13507+        it          cc
13508+        addcc       r12, #32 << 16
13509+        vmul.u16    q13, q2, d0[2]
13510+        it          cc
13511+        subcc       r12, #32
13512+        vmla.u16    q13, q9, d0[3]
13513+        sub         r5, #1
13514+        vmul.u16    q14, q3, d0[2]
13515+        teq         r5, #0
13516+        vmla.u16    q14, q10, d0[3]
13517+        vmul.u16    q15, q4, d0[2]
13518+        vmla.u16    q15, q11, d0[3]
13519+        vmov        s1, r12
13520+        vrshr.u16   q12, q12, #5
13521+        vrshr.u16   q13, q13, #5
13522+        vrshr.u16   q14, q14, #5
13523+        vrshr.u16   q15, q15, #5
13524+        vst1.16     {q12-q13}, [r0], r3
13525+        vst1.16     {q14-q15}, [r10], r3
13526+        bhi         2b
13527+        bne         1b
13528+
13529+        vpop        {d8}
13530+        vmov        d9, d1
13531+        pop         {r4-r10, pc}
13532+
13533+endfunc
13534--- /dev/null
13535+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
13536@@ -0,0 +1,705 @@
13537+/*
13538+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
13539+All rights reserved.
13540+
13541+Redistribution and use in source and binary forms, with or without
13542+modification, are permitted provided that the following conditions are met:
13543+    * Redistributions of source code must retain the above copyright
13544+      notice, this list of conditions and the following disclaimer.
13545+    * Redistributions in binary form must reproduce the above copyright
13546+      notice, this list of conditions and the following disclaimer in the
13547+      documentation and/or other materials provided with the distribution.
13548+    * Neither the name of the copyright holder nor the
13549+      names of its contributors may be used to endorse or promote products
13550+      derived from this software without specific prior written permission.
13551+
13552+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
13553+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
13554+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
13555+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
13556+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
13557+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
13558+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
13559+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
13560+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
13561+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13562+
13563+Authors: John Cox, Ben Avison
13564+*/
13565+
13566+
13567+#include "libavutil/arm/asm.S"
13568+#include "neon.S"
13569+
13570+
13571+@ ff_hevc_rpi_pred_dc_4_neon_8
13572+@       uint8_t *_src,          [r0]
13573+@       const uint8_t *_top,    [r1]
13574+@       const uint8_t *_left,   [r2]
13575+@       ptrdiff_t stride)       [r3]
13576+
13577+function ff_hevc_rpi_pred_dc_4_neon_8, export=1
13578+
13579+        @ Average the els of top & left
13580+        ldr         r2, [r2]
13581+        vld1.32     {d0[0]}, [r1]
13582+        mov         r1, #2
13583+        vmov        s1, r2
13584+        vmov        s2, r2
13585+        vmov.i16    q2, #3
13586+        add         r2, r0, r3
13587+        vaddl.u8    q1, d0, d1    @ d2[0] = top[0] + left[0]
13588+        lsl         r3, #1
13589+        vmovl.u8    q0, d0
13590+        vmov.i64    d7, #0xffff
13591+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
13592+        vpadd.i16   d6, d2, d2    @ 2 (top & bottom of vector the same)
13593+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..3], left[0..3]
13594+
13595+        @ top line gets some smoothing
13596+        @ (top[i] + 3*dc + 2) >> 2
13597+        @ as does left
13598+        @ top_line[0] is extra special
13599+        @ (top[0] + left[0] + 2*dc + 2) >> 2
13600+
13601+        vmov.i64    d7, #0xff
13602+        vpadd.i16   d6, d6        @ 1 (all the same)
13603+        vrshr.u16   d6, #3
13604+        vmla.i16    q0, q2, d6[0]
13605+        vdup.8      d6, d6[0]
13606+        vrshrn.i16  d0, q0, #2
13607+
13608+        @ Store top line
13609+        vst1.32     {d0[0]}, [r0], r3
13610+
13611+        @ Store the rest
13612+        vshr.u64    d1, d0, #5*8
13613+        vshr.u64    d2, d0, #6*8
13614+        vshr.u64    d3, d0, #7*8
13615+        vbif        d1, d6, d7
13616+        vbif        d2, d6, d7
13617+        vst1.32     {d1[0]}, [r2], r3
13618+        vbif        d3, d6, d7
13619+        vst1.32     {d2[0]}, [r0]
13620+        vst1.32     {d3[0]}, [r2]
13621+
13622+        bx          lr
13623+endfunc
13624+
13625+
13626+@ ff_hevc_rpi_pred_dc_c_4_neon_8
13627+@       uint8_t *_src,          [r0]
13628+@       const uint8_t *_top,    [r1]
13629+@       const uint8_t *_left,   [r2]
13630+@       ptrdiff_t stride)       [r3]
13631+
13632+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
13633+
13634+        @ Average the els of top & left
13635+        vld1.8      {d0}, [r1]
13636+        vld1.8      {d1}, [r2]
13637+A       add         r2, r0, r3, lsl #1
13638+A       lsl         r3, #2
13639+T       lsl         r3, #1
13640+T       add         r2, r0, r3
13641+T       lsl         r3, #1
13642+        vaddl.u8    q0, d0, d1
13643+        vadd.i16    d0, d1       @ d0 has 2 val pairs
13644+        vpadd.i32   d2, d0, d0   @ This adds U & V separately
13645+        vpadd.i32   d3, d0, d0
13646+        vrshrn.u16  d0, q1, #3
13647+
13648+        @ Store
13649+        vst1.8      {d0}, [r0], r3
13650+        vst1.8      {d0}, [r2], r3
13651+        vst1.8      {d0}, [r0]
13652+        vst1.8      {d0}, [r2]
13653+
13654+        bx          lr
13655+endfunc
13656+
13657+
13658+@ ff_hevc_rpi_pred_dc_8_neon_8
13659+@       uint8_t *_src,          [r0]
13660+@       const uint8_t *_top,    [r1]
13661+@       const uint8_t *_left,   [r2]
13662+@       ptrdiff_t stride)       [r3]
13663+
13664+function ff_hevc_rpi_pred_dc_8_neon_8, export=1
13665+
13666+        @ Average the els of top & left
13667+        vld1.8      {d0}, [r1]
13668+        mov         r1, #2
13669+        vld1.8      {d16}, [r2]
13670+        vmov.i16    q2, #3
13671+        vmov.i64    d7, #0xffff
13672+        vaddl.u8    q1, d0, d16   @ d2[0] = top[0] + left[0]
13673+        vmovl.u8    q0, d0
13674+        vadd.i16    d6, d2, d3    @ d6 has 4 vals
13675+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
13676+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..7]
13677+
13678+        @ top line gets some smoothing
13679+        @ (top[i] + 3*dc + 2) >> 2
13680+        @ as does left
13681+        @ top_line[0] is extra special
13682+        @ (top[0] + left[0] + 2*dc + 2) >> 2
13683+
13684+        vmov.i64    d7, #0xff
13685+        vmovl.u8    q1, d16
13686+        vpadd.i16   d6, d6        @ 2 (top & bottom of vector the same)
13687+        vpadd.i16   d6, d6        @ 1 (all the same)
13688+        vrshr.u16   d6, #4
13689+        vmla.i16    q1, q2, d6[0]
13690+        vmla.i16    q0, q2, d6[0]
13691+        vdup.8      d6, d6[0]
13692+        vrshrn.i16  d2, q1, #2
13693+        vrshrn.i16  d0, q0, #2
13694+
13695+        @ Store top line
13696+        vst1.8      {d0}, [r0], r3
13697+
13698+        @ Store the rest
13699+        vshr.u64    d2, #8
13700+        vbit        d6, d2, d7
13701+        vshr.u64    d2, #8
13702+        vst1.8      {d6}, [r0], r3
13703+        mov         r1, #6
13704+1:
13705+        vbit        d6, d2, d7
13706+        vshr.u64    d2, #8
13707+        vst1.8      {d6}, [r0], r3
13708+        subs        r1, #2
13709+        vbit        d6, d2, d7
13710+        vshr.u64    d2, #8
13711+        vst1.8      {d6}, [r0], r3
13712+        bne         1b
13713+
13714+        bx          lr
13715+endfunc
13716+
13717+
13718+@ ff_hevc_rpi_pred_dc_c_8_neon_8
13719+@       uint8_t *_src,          [r0]
13720+@       const uint8_t *_top,    [r1]
13721+@       const uint8_t *_left,   [r2]
13722+@       ptrdiff_t stride)       [r3]
13723+
13724+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
13725+
13726+        @ Average the els of top & left
13727+        vld1.8      {q0}, [r1]
13728+        mov         r1, #8
13729+        vld1.8      {q1}, [r2]
13730+T       lsl         r3, #1
13731+        vaddl.u8    q0, d0, d1
13732+A       add         r2, r0, r3, lsl #1
13733+A       lsl         r3, #2
13734+T       add         r2, r0, r3
13735+T       lsl         r3, #1
13736+        vaddl.u8    q1, d2, d3
13737+        vadd.i16    q1, q0
13738+        vadd.i16    d3, d2        @ d3 has 2 val pairs
13739+        vpadd.i32   d2, d3, d3    @ This adds U & V separately
13740+        vpadd.i32   d3, d3, d3
13741+        vrshrn.u16  d0, q1, #4
13742+        vrshrn.u16  d1, q1, #4
13743+
13744+        @ Store
13745+1:
13746+        vst1.8      {q0}, [r0], r3
13747+        subs        r1, #4
13748+        vst1.8      {q0}, [r2], r3
13749+        vst1.8      {q0}, [r0], r3
13750+        vst1.8      {q0}, [r2], r3
13751+        bne         1b
13752+
13753+        bx          lr
13754+endfunc
13755+
13756+
13757+@ ff_hevc_rpi_pred_dc_16_neon_8
13758+@       uint8_t *_src,          [r0]
13759+@       const uint8_t *_top,    [r1]
13760+@       const uint8_t *_left,   [r2]
13761+@       ptrdiff_t stride)       [r3]
13762+
13763+function ff_hevc_rpi_pred_dc_16_neon_8, export=1
13764+
13765+        @ Average the els of top & left
13766+        vld1.8      {q8}, [r1]
13767+        mov         r1, #2
13768+        vld1.8      {q9}, [r2]
13769+        vaddl.u8    q10, d16, d17
13770+        vaddl.u8    q11, d16, d18
13771+        vaddl.u8    q0, d18, d19
13772+        vmov.i16    q1, #3
13773+        vadd.i16    q10, q0
13774+        vmovl.u8    q0, d18
13775+        vadd.i16    d20, d21
13776+        vmov.i16    d2[0], r1     @ 2, 3, 3, 3...
13777+
13778+        @ top line gets some smoothing
13779+        @ (top[i] + 3*dc + 2) >> 2
13780+        @ as does left
13781+        @ top_line[0] is extra special
13782+        @ (top[0] + left[0] + 2*dc + 2) >> 2
13783+
13784+        vmovl.u8    q2, d16
13785+        vmovl.u8    q9, d19
13786+        vpadd.i16   d20, d20      @ 2 (top & bottom of vector the same)
13787+        vmov.i64    d7, #0xffff
13788+        vmovl.u8    q8, d17
13789+        vbit        d4, d22, d7   @ q2 = top[0]+left[0], top[1..7]
13790+        vmov.i64    d7, #0xff
13791+        vpadd.i16   d20, d20      @ 1 (all the same)
13792+        vrshr.u16   d21, d20, #5
13793+        vrshr.u16   d20, d20, #5
13794+        vmla.i16    q0, q10, d2[1]
13795+        vmla.i16    q9, q10, d2[1]
13796+        vmla.i16    q2, q10, q1
13797+        vmla.i16    q8, q10, d2[1]
13798+        vdup.8      q1, d20[0]
13799+        vrshrn.i16  d0, q0, #2
13800+        vrshrn.i16  d1, q9, #2
13801+        vrshrn.i16  d4, q2, #2
13802+        vrshrn.i16  d5, q8, #2
13803+        vext.8      q0, q0, q0, #1
13804+
13805+        @ Store top line
13806+        vst1.8      {q2}, [r0], r3
13807+
13808+        @ Store the rest
13809+        mov         r1, #15
13810+1:
13811+        vbit        d2, d0, d7
13812+        vext.8      q0, q0, q0, #1
13813+        subs        r1, #1
13814+        vst1.8      {q1}, [r0], r3
13815+        bne         1b
13816+
13817+        bx          lr
13818+endfunc
13819+
13820+
13821+@ ff_hevc_rpi_pred_dc_c_16_neon_8
13822+@       uint8_t *_src,          [r0]
13823+@       const uint8_t *_top,    [r1]
13824+@       const uint8_t *_left,   [r2]
13825+@       ptrdiff_t stride)       [r3]
13826+
13827+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
13828+
13829+        @ Average the els of top & left
13830+        vld1.8      {q0-q1}, [r1]
13831+        mov         r1, #16
13832+        vld1.8      {q2-q3}, [r2]
13833+T       lsl         r3, #1
13834+        vaddl.u8    q0, d0, d1
13835+A       add         r2, r0, r3, lsl #1
13836+T       add         r2, r0, r3
13837+        vaddl.u8    q1, d2, d3
13838+A       lsl         r3, #2
13839+T       lsl         r3, #1
13840+        vaddl.u8    q2, d4, d5
13841+        vaddl.u8    q3, d6, d7
13842+        vadd.i16    q0, q1
13843+        vadd.i16    q2, q3
13844+        vadd.i16    q0, q2
13845+        vadd.i16    d0, d1        @ d0 has 2 val pairs
13846+        vpadd.i32   d4, d0, d0    @ This adds U & V separately
13847+        vpadd.i32   d5, d0, d0
13848+        vrshrn.u16  d0, q2, #5
13849+        vrshrn.u16  d1, q2, #5
13850+        vrshrn.u16  d2, q2, #5
13851+        vrshrn.u16  d3, q2, #5
13852+
13853+        @ Store
13854+1:
13855+        vst1.8      {q0-q1}, [r0], r3
13856+        subs        r1, #2
13857+        vst1.8      {q0-q1}, [r2], r3
13858+        bne         1b
13859+
13860+        bx          lr
13861+endfunc
13862+
13863+
13864+@ ff_hevc_rpi_pred_dc_32_neon_8
13865+@       uint8_t *_src,          [r0]
13866+@       const uint8_t *_top,    [r1]
13867+@       const uint8_t *_left,   [r2]
13868+@       ptrdiff_t stride)       [r3]
13869+
13870+function ff_hevc_rpi_pred_dc_32_neon_8, export=1
13871+
13872+        @ Average the els of top & left
13873+        vld1.8      {q0-q1}, [r1]
13874+        mov         r1, #32
13875+        vld1.8      {q2-q3}, [r2]
13876+        add         r2, r0, r3
13877+        vaddl.u8    q0, d0, d1
13878+        lsl         r3, #1
13879+        vaddl.u8    q1, d2, d3
13880+        vaddl.u8    q2, d4, d5
13881+        vaddl.u8    q3, d6, d7
13882+        vadd.i16    q0, q1
13883+        vadd.i16    q2, q3
13884+        vadd.i16    q0, q2
13885+        vadd.i16    d0, d1        @ d0 has 4 vals
13886+        vpadd.i16   d0, d0        @ 2 (top & bottom the same)
13887+        vpadd.i16   d4, d0, d0    @ 1 (all the same)
13888+        vpadd.i16   d5, d0, d0
13889+        vrshrn.u16  d0, q2, #6
13890+        vrshrn.u16  d1, q2, #6
13891+        vrshrn.u16  d2, q2, #6
13892+        vrshrn.u16  d3, q2, #6
13893+
13894+        @ Store
13895+1:
13896+        vst1.8      {q0-q1}, [r0], r3
13897+        subs        r1, #2
13898+        vst1.8      {q0-q1}, [r2], r3
13899+        bne         1b
13900+
13901+        bx          lr
13902+endfunc
13903+
13904+
13905+@ -----------------------------------------------------------------------------
13906+@
13907+@ 10 Bit versions
13908+@
13909+@ There is no actual bit depth dependency in this code except that our
13910+@ intermediate results will overflow the 16 bits they are stored in
13911+@ All these functions are good to 10 bits - with the worst case being
13912+@ in dc_32 where we use all 16 bits.
13913+
13914+
13915+@ ff_hevc_rpi_pred_dc_4_neon_10
13916+@       uint8_t *_src,          [r0]
13917+@       const uint8_t *_top,    [r1]
13918+@       const uint8_t *_left,   [r2]
13919+@       ptrdiff_t stride)       [r3]
13920+
13921+function ff_hevc_rpi_pred_dc_4_neon_10, export=1
13922+
13923+        @ Average the els of top & left
13924+        vld1.16     {d0}, [r1]
13925+        mov         r1, #2
13926+        vld1.16     {d1}, [r2]
13927+T       lsl         r3, #1
13928+        vmov.i16    q2, #3
13929+A       add         r2, r0, r3, lsl #1
13930+T       add         r2, r0, r3
13931+        vadd.u16    d2, d0, d1    @ d2[0] = top[0] + left[0]
13932+A       lsl         r3, #2
13933+T       lsl         r3, #1
13934+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
13935+        vmov.i64    d7, #0xffff
13936+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..3], left[0..3]
13937+
13938+        @ top line gets some smoothing
13939+        @ (top[i] + 3*dc + 2) >> 2
13940+        @ as does left
13941+        @ top_line[0] is extra special
13942+        @ (top[0] + left[0] + 2*dc + 2) >> 2
13943+
13944+        vpadd.i16   d6, d2, d2    @ 2 (top & bottom of vector the same)
13945+        vpadd.i16   d6, d6        @ 1 (all the same)
13946+        vrshr.u16   d6, #3
13947+        vmla.i16    q0, q2, d6[0]
13948+        vrshr.u16   q0, #2
13949+
13950+        @ Store top line
13951+        vst1.16     {d0}, [r0], r3
13952+
13953+        @ Store the rest
13954+        vshr.u64    d3, d1, #1*16
13955+        vshr.u64    d4, d1, #2*16
13956+        vshr.u64    d5, d1, #3*16
13957+        vbif        d3, d6, d7
13958+        vbif        d4, d6, d7
13959+        vst1.16     {d3}, [r2], r3
13960+        vbif        d5, d6, d7
13961+        vst1.16     {d4}, [r0]
13962+        vst1.16     {d5}, [r2]
13963+
13964+        bx          lr
13965+endfunc
13966+
13967+
13968+@ ff_hevc_rpi_pred_dc_c_4_neon_10
13969+@       uint8_t *_src,          [r0]
13970+@       const uint8_t *_top,    [r1]
13971+@       const uint8_t *_left,   [r2]
13972+@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
13973+
13974+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
13975+
13976+        @ Average the els of top & left
13977+        vld1.8      {q0}, [r1]
13978+        vld1.8      {q1}, [r2]
13979+A       add         r2, r0, r3, lsl #2
13980+A       lsl         r3, #3
13981+T       lsl         r3, #2
13982+T       add         r2, r0, r3
13983+T       lsl         r3, #1
13984+        vadd.i16    q0, q1
13985+        vadd.i16    d0, d1       @ d0 has 2 val pairs
13986+        vpadd.i32   d2, d0, d0   @ This adds U & V separately
13987+        vpadd.i32   d3, d0, d0
13988+        vrshr.u16   q0, q1, #3
13989+
13990+        vst1.16     {q0}, [r0], r3
13991+        vst1.16     {q0}, [r2], r3
13992+        vst1.16     {q0}, [r0]
13993+        vst1.16     {q0}, [r2]
13994+
13995+        bx          lr
13996+endfunc
13997+
13998+
13999+@ ff_hevc_rpi_pred_dc_8_neon_10
14000+@       uint8_t *_src,          [r0]
14001+@       const uint8_t *_top,    [r1]
14002+@       const uint8_t *_left,   [r2]
14003+@       ptrdiff_t stride)       [r3]
14004+
14005+function ff_hevc_rpi_pred_dc_8_neon_10, export=1
14006+
14007+        @ Average the els of top & left
14008+        vld1.16     {q0}, [r1]
14009+        mov         r1, #2
14010+        vld1.16     {q8}, [r2]
14011+T       lsl         r3, #1
14012+        vmov.i16    q2, #3
14013+A       add         r2, r0, r3, lsl #1
14014+T       add         r2, r0, r3
14015+        vadd.i16    q1, q0, q8    @ q1[0] = top[0] + left[0]
14016+A       lsl         r3, #2
14017+T       lsl         r3, #1
14018+        vmov.i64    d7, #0xffff
14019+        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
14020+        vadd.i16    d6, d2, d3    @ d6 has 4 vals
14021+        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..7]
14022+
14023+        @ top line gets some smoothing
14024+        @ (top[i] + 3*dc + 2) >> 2
14025+        @ as does left
14026+        @ top_line[0] is extra special
14027+        @ (top[0] + left[0] + 2*dc + 2) >> 2
14028+
14029+        vpadd.i16   d6, d6        @ 2 (top & bottom of vector the same)
14030+        vpadd.i16   d6, d6        @ 1 (all the same)
14031+        vrshr.u16   d6, #4
14032+        vmla.i16    q8, q2, d6[0]
14033+        vmla.i16    q0, q2, d6[0]
14034+        vdup.16     q2, d6[0]
14035+        vdup.16     q9, d6[0]
14036+        vrshr.u16   q8, q8, #2
14037+        vrshr.u16   q0, q0, #2
14038+        vext.16     q1, q8, q8, #1
14039+
14040+        @ Store top line
14041+        vst1.16     {q0}, [r0], r3
14042+
14043+        @ Store the rest
14044+        vbit        d18, d2, d7
14045+        vst1.16     {q9}, [r2], r3
14046+        mov         r1, #6
14047+1:
14048+        vext.16     q8, q8, q8, #2
14049+        subs        r1, #2
14050+        vext.16     q1, q1, q1, #2
14051+        vbit        d4, d16, d7
14052+        vst1.16     {q2}, [r0], r3
14053+        vbit        d18, d2, d7
14054+        vst1.16     {q9}, [r2], r3
14055+        bne         1b
14056+
14057+        bx          lr
14058+endfunc
14059+
14060+
14061+@ ff_hevc_rpi_pred_dc_c_8_neon_10
14062+@       uint8_t *_src,          [r0]
14063+@       const uint8_t *_top,    [r1]
14064+@       const uint8_t *_left,   [r2]
14065+@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
14066+
14067+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
14068+
14069+        @ Average the els of top & left
14070+        vld1.16     {q0-q1}, [r1]
14071+        mov         r1, #8
14072+        vld1.16     {q2-q3}, [r2]
14073+T       lsl         r3, #2
14074+        vadd.i16    q1, q0
14075+A       add         r2, r0, r3, lsl #2
14076+A       lsl         r3, #3
14077+T       add         r2, r0, r3
14078+T       lsl         r3, #1
14079+        vadd.i16    q2, q3
14080+        vadd.i16    q1, q2
14081+        vadd.i16    d3, d2        @ d3 has 2 val pairs
14082+        vpadd.i32   d2, d3, d3    @ This adds U & V separately
14083+        vpadd.i32   d3, d3, d3
14084+        vrshr.u16   q0, q1, #4
14085+        vrshr.u16   q1, q1, #4
14086+
14087+        @ Store
14088+1:
14089+        vst1.8      {q0-q1}, [r0], r3
14090+        subs        r1, #2
14091+        vst1.8      {q0-q1}, [r2], r3
14092+        bne         1b
14093+
14094+        bx          lr
14095+endfunc
14096+
14097+
14098+@ ff_hevc_rpi_pred_dc_16_neon_10
14099+@       uint8_t *_src,          [r0]
14100+@       const uint8_t *_top,    [r1]
14101+@       const uint8_t *_left,   [r2]
14102+@       ptrdiff_t stride)       [r3]
14103+
14104+function ff_hevc_rpi_pred_dc_16_neon_10, export=1
14105+
14106+        @ Average the els of top & left
14107+        vld1.16     {q8-q9}, [r1]
14108+        mov         r1, #2
14109+        vld1.16     {q10-q11}, [r2]
14110+        lsl         r3, #1        @ stride given in pels
14111+        vadd.i16    q0, q8, q9
14112+        vadd.i16    q1, q10, q11
14113+        vmov.i16    q3, #3
14114+        vadd.i16    q1, q0
14115+        vadd.i16    d0, d16, d20
14116+        vmov.i64    d31, #0xffff
14117+        vadd.i16    d3, d2
14118+        vmov.16     d6[0], r1     @ 2, 3, 3, 3...
14119+
14120+        @ top line gets some smoothing
14121+        @ (top[i] + 3*dc + 2) >> 2
14122+        @ as does left
14123+        @ top_line[0] is extra special
14124+        @ (top[0] + left[0] + 2*dc + 2) >> 2
14125+
14126+        vbit        d16, d0, d31  @ q8 = top[0]+left[0], top[1..7]
14127+        vpadd.i16   d3, d3        @ 2 (top & bottom of vector the same)
14128+        vpadd.i16   d3, d3        @ 1 (all the same)
14129+        vrshr.u16   d2, d3, #5
14130+        vrshr.u16   d3, d3, #5
14131+        vmov        q0, q1
14132+        vmla.i16    q10, q1, d6[1]
14133+        vmla.i16    q11, q1, d6[1]
14134+        vmla.i16    q8, q1, q3
14135+        vmla.i16    q9, q1, d6[1]
14136+        vrshr.u16   q2, q10, #2
14137+        vrshr.u16   q3, q11, #2
14138+        vrshr.u16   q8, #2
14139+        vrshr.u16   q9, #2
14140+        vext.16     q2, q2, q2, #1
14141+        mov         r1, #7<<29
14142+
14143+        @ Store top line
14144+        vst1.16     {q8-q9}, [r0], r3
14145+
14146+        @ Store the rest
14147+1:
14148+        vbit        d0, d4, d31
14149+        vext.16     q2, q2, q2, #1
14150+        subs        r1, #1<<29
14151+        vst1.16     {q0-q1}, [r0], r3
14152+        bne         1b
14153+1:
14154+        vbit        d0, d6, d31
14155+        vext.16     q3, q3, q3, #1
14156+        subs        r1, #1<<29
14157+        vst1.16     {q0-q1}, [r0], r3
14158+        bne         1b
14159+
14160+        bx          lr
14161+endfunc
14162+
14163+
14164+@ ff_hevc_rpi_pred_dc_c_16_neon_10
14165+@       uint8_t *_src,          [r0]
14166+@       const uint8_t *_top,    [r1]
14167+@       const uint8_t *_left,   [r2]
14168+@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
14169+
14170+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
14171+
14172+        @ Average the els of top & left
14173+        vldm        r1, {q0-q3}
14174+        vldm        r2, {q8-q11}
14175+        vadd.i16    q0, q1
14176+        mov         r1, #16
14177+        vadd.i16    q2, q3
14178+        add         r2, r0, #32
14179+        vadd.i16    q8, q9
14180+        lsl         r3, #2
14181+        vadd.i16    q10, q11
14182+        vadd.u16    q0, q2
14183+        vadd.u16    q8, q10
14184+        vadd.i16    q0, q8
14185+        vadd.i16    d0, d1        @ d0 has 2 val pairs
14186+        vpadd.i32   d4, d0, d0    @ This adds U & V separately
14187+        vpadd.i32   d5, d0, d0
14188+        vrshr.u16   q0, q2, #5
14189+        vrshr.u16   q1, q2, #5
14190+
14191+        @ Store
14192+1:
14193+        vst1.16     {q0-q1}, [r0], r3
14194+        subs        r1, #1
14195+        vst1.16     {q0-q1}, [r2], r3
14196+        bne         1b
14197+
14198+        bx           lr
14199+endfunc
14200+
14201+
14202+@ ff_hevc_rpi_pred_dc_32_neon_10
14203+@       uint8_t *_src,          [r0]
14204+@       const uint8_t *_top,    [r1]
14205+@       const uint8_t *_left,   [r2]
14206+@       ptrdiff_t stride)       [r3]  (In pels)
14207+
14208+function ff_hevc_rpi_pred_dc_32_neon_10, export=1
14209+
14210+        @ Average the els of top & left
14211+        @ With 10 bits we are (just) safe from overflow in i16
14212+        vldm        r1, {q0-q3}
14213+        vldm        r2, {q8-q11}
14214+        vadd.i16    q0, q1
14215+        mov         r1, #32
14216+        vadd.i16    q2, q3
14217+        add         r2, r0, #32
14218+        vadd.i16    q8, q9
14219+        lsl         r3, #1
14220+        vadd.i16    q10, q11
14221+        vadd.u16    q0, q2
14222+        vadd.u16    q8, q10
14223+        vadd.i16    q0, q8
14224+        vadd.i16    d0, d1        @ d0 has 4 vals
14225+        vpadd.i16   d0, d0        @ 2 (top & bottom the same)
14226+        vpadd.i16   d4, d0, d0    @ 1 (all the same)
14227+        vpadd.i16   d5, d0, d0
14228+        vrshr.u16   q0, q2, #6
14229+        vrshr.u16   q1, q2, #6
14230+
14231+        @ Store
14232+1:
14233+        vst1.16     {q0-q1}, [r0], r3
14234+        subs        r1, #1
14235+        vst1.16     {q0-q1}, [r2], r3
14236+        bne         1b
14237+
14238+        bx           lr
14239+endfunc
14240+
14241+
14242--- /dev/null
14243+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
14244@@ -0,0 +1,881 @@
14245+/*
14246+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
14247+All rights reserved.
14248+
14249+Redistribution and use in source and binary forms, with or without
14250+modification, are permitted provided that the following conditions are met:
14251+    * Redistributions of source code must retain the above copyright
14252+      notice, this list of conditions and the following disclaimer.
14253+    * Redistributions in binary form must reproduce the above copyright
14254+      notice, this list of conditions and the following disclaimer in the
14255+      documentation and/or other materials provided with the distribution.
14256+    * Neither the name of the copyright holder nor the
14257+      names of its contributors may be used to endorse or promote products
14258+      derived from this software without specific prior written permission.
14259+
14260+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14261+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
14262+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
14263+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
14264+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
14265+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
14266+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
14267+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
14268+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
14269+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14270+
14271+Authors: John Cox, Ben Avison
14272+*/
14273+
14274+#include "libavutil/arm/asm.S"
14275+#include "neon.S"
14276+
14277+@ All functions have the call
14278+@
14279+@ int ff_hevc_rpi_intra_filter_N_neon_PW(
14280+@    pixel * const left,                   [r0]
14281+@    pixel * const top,                    [r1]
14282+@    const unsigned int req,               [r2]
14283+@    const unsigned int avail,             [r3]
14284+@    const pixel * const src_l,            [sp, #0]
14285+@    const pixel * const src_u,            [sp, #4]
14286+@    const pixel * const src_ur,           [sp, #8]
14287+@    const unsigned int stride,            [sp, #12] (pels)
14288+@    const unsigned int top_right_size,    [sp, #16]
14289+@    const unsigned int down_left_size)    [sp, #20]
14290+@
14291+@ Assumptions:
14292+@ (that wouldn't apply to all frame layouts but do apply to sand, so beware
14293+@  if reusing this code)
14294+@
14295+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
14296+@ N==4, but do for chroma N>=8.  As we share Y/C fns that means we can ignore
14297+@ N==8,PW=8 (chroma always PW>8) but have to cope for larger
14298+@
14299+@ We always have at least 64 pixel H frame width rounding - this lets us
14300+@ load UR without having to worry about exactly how many pixels are actually
14301+@ within the frame.  As partial loads will only occur very occasionally this
14302+@ should be a win in nearly all cases.
14303+@
14304+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
14305+@ so we do no maths on the contents
14306+@
14307+@ No filtering in 32bit fns as they are chroma only
14308+
14309+
14310+.equ    AVAIL_UR, 1
14311+.equ    AVAIL_U,  2
14312+.equ    AVAIL_UL, 4
14313+.equ    AVAIL_L,  8
14314+.equ    AVAIL_DL, 16
14315+
14316+.equ    FILTER_LIGHT, 0x40
14317+.equ    FILTER_STRONG, 0x80
14318+
14319+.equ    AVAIL_S_UR_N_U_C, 32 - 1
14320+.equ    AVAIL_S_U_N_UL_C, 32 - 2
14321+.equ    AVAIL_S_UL_N_L_C, 32 - 3
14322+.equ    AVAIL_S_L_N_DL_C, 32 - 4
14323+
14324+.equ    AVAIL_S_U_DL_CPSR, 31 - 4  @ Shift for u..dl to go into flags via cpsr
14325+
14326+@ On entry
14327+@  r2   req
14328+@  r3   avail
14329+@ [sp, #sp_offset...]  args
14330+@
14331+@ On Exit:
14332+@
14333+@ Extend values:
14334+@  d_l  scalar contains value for L & DL
14335+@       if DL avail then this is DL[0] so we don't need to load that
14336+@  d_ul scalar containing value for UL
14337+@  d_u  scalar containing value for U
14338+@  d_ur scalar containing value for UR
14339+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
14340+@ This means that L-light-filter works even if nreq DL (we never filter
14341+@ req-DL without req-L, but we do filter req-L without req-DL)
14342+@ If UR avail then d_ur == a_ur so U-filter good too
14343+@
14344+@ Data load pointers (only load if req & avail):
14345+@  r4   DL + stride
14346+@  r10  L
14347+@  r6   U
14348+@  r5   UR
14349+@
14350+@ Others:
14351+@  r2   req
14352+@  r7   req & avail
14353+@  r3   L + stride
14354+@  r8   DL + stride * 2
14355+@  r9   stride * 2
14356+@  cs   Load U
14357+@  mi   Load UR
14358+@
14359+@ Clobbered:
14360+@  r12
14361+
14362+.macro  load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
14363+
14364+.equ    src_l\@,   \sp_offset + 0
14365+.equ    src_u\@,   \sp_offset + 4
14366+.equ    src_ur\@,  \sp_offset + 8
14367+.equ    stride\@,  \sp_offset + 12
14368+.equ    pw\@,      (1 << \pw_s)                 @ pel width in bytes
14369+.equ    b_size\@,  (1 << (\pw_s + \log2_s))     @ size in bytes
14370+
14371+@ r9    stride
14372+@                       r7 = ab_ul, r6 = a_u, r5 = a_ur
14373+@ r4 = b_dl, r10 = b_l,             r8 = b_u
14374+
14375+        ldr        r5,  [sp, #src_ur\@]
14376+        lsl        r12, r3,  #AVAIL_S_U_DL_CPSR
14377+        ldr        r10, [sp, #src_l\@]
14378+        ldr        r9,  [sp, #stride\@]
14379+        ldr        r6,  [sp, #src_u\@]
14380+
14381+        @ This is quite a slow instruction but it replaces
14382+        @ a decent number of tests that yield a max of 2 flags/op
14383+        @ It is annoying we can't branch on Q!
14384+        @ If L navail (ne) then DL must be navail (pl)
14385+        msr        APSR_nzcvq, r12      @ n=dl, z=l, c=ul, v=u, q=ur
14386+
14387+        mov        r4,  r5
14388+        sub        r7,  r10, r9
14389+        it vs
14390+        movvs      r4,  r6
14391+        add        r8,  r6,  #b_size\@ - pw\@
14392+        it cs
14393+        movcs      r4,  r7
14394+        ite ne
14395+        movne      r10, r4
14396+        addeq      r4,  r7,  r9,  lsl #\log2_s
14397+        it cc
14398+        movcc      r7,  r10
14399+        it mi
14400+        addmi      r4,  r10, r9,  lsl #\log2_s
14401+        vld1.\d_type {\d_ul}, [r7]
14402+        itt vc
14403+        movvc      r8,  r7
14404+        movvc      r6,  r7
14405+        vld1.\d_type {\d_l }, [r4], r9
14406+        tst        r3,  #AVAIL_UR
14407+        vld1.\d_type {\d_u }, [r6]
14408+        it eq
14409+        moveq      r5,  r8
14410+        and        r7,  r2,  r3
14411+        add        r8,  r4,  r9
14412+        vld1.\d_type {\d_ur}, [r5]
14413+        lsls       r12, r7,  #AVAIL_S_UR_N_U_C
14414+        add        r3,  r10, r9
14415+        lsl        r9,  #1
14416+.endm
14417+
14418+
14419+
14420+@ int ff_hevc_rpi_intra_filter_4_neon_8(
14421+@    pixel * const left,                   [r0]
14422+@    pixel * const top,                    [r1]
14423+@    const unsigned int req,               [r2]
14424+@    const unsigned int avail,             [r3]
14425+@    const pixel * const src_l,            [sp, #0]
14426+@    const pixel * const src_u,            [sp, #4]
14427+@    const pixel * const src_ur,           [sp, #8]
14428+@    const unsigned int stride,            [sp, #12] (pels)
14429+@    const unsigned int top_right_size,    [sp, #16]
14430+@    const unsigned int down_left_size)    [sp, #20]
14431+
14432+.set    sp_base, 8*4
14433+.set    pw_s,    0
14434+.set    pw,      (1 << pw_s)
14435+.set    log2_s,  2
14436+
14437+function ff_hevc_rpi_intra_filter_4_neon_8, export=1
14438+        push       {r4-r10, lr}
14439+        load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
14440+
14441+        it cs
14442+        vldrcs     s2,  [r6]
14443+        ite pl
14444+        vmovpl     s3,  s4
14445+        vldrmi     s3,  [r5]
14446+
14447+        lsls       r7,  #AVAIL_S_L_N_DL_C
14448+        add        r12, r0,  #-pw
14449+        bpl        1f
14450+
14451+        vld1.8    {d0[0]}, [r10], r9
14452+        vld1.8    {d0[1]}, [r3],  r9
14453+        vld1.8    {d0[2]}, [r10]
14454+        vld1.8    {d0[3]}, [r3]
14455+1:
14456+        bcc        1f
14457+        vld1.8    {d0[5]}, [r4],  r9
14458+        vld1.8    {d0[6]}, [r8]
14459+        vld1.8    {d0[7]}, [r4]
14460+1:
14461+        vstr       d1,  [r1]            @ Up
14462+        vst1.8    {d31[7]}, [r12]
14463+        vstr       d0,  [r0]            @ Left
14464+        pop       {r4-r10, pc}
14465+endfunc
14466+
14467+
14468+@ int ff_hevc_rpi_intra_filter_4_neon_16(
14469+@    pixel * const left,                   [r0]
14470+@    pixel * const top,                    [r1]
14471+@    const unsigned int req,               [r2]
14472+@    const unsigned int avail,             [r3]
14473+@    const pixel * const src_l,            [sp, #0]
14474+@    const pixel * const src_u,            [sp, #4]
14475+@    const pixel * const src_ur,           [sp, #8]
14476+@    const unsigned int stride,            [sp, #12] (pels)
14477+@    const unsigned int top_right_size,    [sp, #16]
14478+@    const unsigned int down_left_size)    [sp, #20]
14479+
14480+.set    sp_base, 8*4
14481+.set    pw_s,    1
14482+.set    pw,      (1 << pw_s)
14483+.set    log2_s,  2
14484+
14485+function ff_hevc_rpi_intra_filter_4_neon_16, export=1
14486+        push       {r4-r10, lr}
14487+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
14488+
14489+        it cs
14490+        vldrcs     d2,  [r6]
14491+        it mi
14492+        vldrmi     d3,  [r5]
14493+        lsls       r7,  #AVAIL_S_L_N_DL_C
14494+        add        r12, r0, #-pw
14495+        bpl        1f
14496+        vld1.16   {d0[0]}, [r10], r9
14497+        vld1.16   {d0[1]}, [r3],  r9
14498+        vld1.16   {d0[2]}, [r10]
14499+        vld1.16   {d0[3]}, [r3]
14500+1:
14501+        bcc        1f
14502+        vld1.16   {d1[1]}, [r4],  r9
14503+        vld1.16   {d1[2]}, [r8]
14504+        vld1.16   {d1[3]}, [r4]
14505+1:
14506+        vst1.16   {q1}, [r1]           @ Up
14507+        vst1.16   {d31[3]}, [r12]
14508+        vst1.16   {q0}, [r0]           @ Left
14509+        pop       {r4-r10, pc}
14510+endfunc
14511+
14512+
14513+@ int ff_hevc_rpi_intra_filter_8_neon_8(
14514+@    pixel * const left,                   [r0]
14515+@    pixel * const top,                    [r1]
14516+@    const unsigned int req,               [r2]
14517+@    const unsigned int avail,             [r3]
14518+@    const pixel * const src_l,            [sp, #0]
14519+@    const pixel * const src_u,            [sp, #4]
14520+@    const pixel * const src_ur,           [sp, #8]
14521+@    const unsigned int stride,            [sp, #12] (pels)
14522+@    const unsigned int top_right_size,    [sp, #16]
14523+@    const unsigned int down_left_size)    [sp, #20]
14524+
14525+.set    sp_base, 8*4
14526+.set    pw_s,    0
14527+.set    pw,      (1 << pw_s)
14528+.set    log2_s,  3
14529+
14530+function ff_hevc_rpi_intra_filter_8_neon_8, export=1
14531+        push      {r4-r10, lr}
14532+        load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
14533+
14534+        it cs
14535+        vldrcs     d4,  [r6]
14536+        it mi
14537+        vldrmi     d5,  [r5]
14538+
14539+        lsls       r7,  #AVAIL_S_L_N_DL_C
14540+        bpl        1f
14541+        vld1.8    {d0[0]}, [r10], r9
14542+        vld1.8    {d0[1]}, [r3],  r9
14543+        vld1.8    {d0[2]}, [r10], r9
14544+        vld1.8    {d0[3]}, [r3],  r9
14545+        vld1.8    {d0[4]}, [r10], r9
14546+        vld1.8    {d0[5]}, [r3],  r9
14547+        vld1.8    {d0[6]}, [r10]
14548+        vld1.8    {d0[7]}, [r3]
14549+1:
14550+        bcc        1f
14551+        vld1.8    {d1[1]}, [r4],  r9
14552+        vld1.8    {d1[2]}, [r8],  r9
14553+        vld1.8    {d1[3]}, [r4],  r9
14554+        vld1.8    {d1[4]}, [r8],  r9
14555+        vld1.8    {d1[5]}, [r4],  r9
14556+        vld1.8    {d1[6]}, [r8]
14557+        vld1.8    {d1[7]}, [r4]
14558+1:
14559+        tst        r2,  #FILTER_LIGHT
14560+        add        r12, r0,  #-pw
14561+        beq        10f
14562+
14563+        @ Luma light filter
14564+        vext.8     q8,  q15, q2,  #15
14565+        vext.8     q12, q15, q0,  #15
14566+        vaddl.u8   q9,  d17, d5
14567+        vaddl.u8   q8,  d16, d4
14568+        vaddl.u8   q13, d25, d1
14569+        vaddl.u8   q12, d24, d0
14570+        vmov.u8    r3,  d5[7]           @ Save final pel
14571+        vmov.u8    r2,  d1[7]           @ Save final pel
14572+
14573+        vext.16    q2,  q8,  q9,  #1
14574+        vext.16    q3,  q9,  q9,  #1
14575+        vext.16    q0,  q12, q13, #1
14576+        vext.16    q1,  q13, q13, #1
14577+        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
14578+        vadd.u16   q2,  q8
14579+        vadd.u16   q3,  q9
14580+        vadd.u16   q0,  q12
14581+        vadd.u16   q1,  q13
14582+
14583+        vrshrn.u16 d4,  q2,  #2
14584+        vrshrn.u16 d5,  q3,  #2
14585+        vrshrn.u16 d0,  q0,  #2
14586+        vrshrn.u16 d1,  q1,  #2
14587+        vrshr.u16  d30, #2
14588+        vmov.u8    d5[7], r3            @ Restore final pel
14589+        vmov.u8    d1[7], r2            @ Restore final pel
14590+        vdup.u8    d31, d30[0]          @ d31[3] = d30[0]
14591+
14592+10:
14593+        vst1.8    {q2 }, [r1]           @ Up
14594+        vst1.8    {d31[7]}, [r12]       @ Up-left
14595+        vst1.8    {q0 }, [r0]           @ Left
14596+        pop       {r4-r10, pc}
14597+endfunc
14598+
14599+
14600+@ int ff_hevc_rpi_intra_filter_8_neon_16(
14601+@    pixel * const left,                   [r0]
14602+@    pixel * const top,                    [r1]
14603+@    const unsigned int req,               [r2]
14604+@    const unsigned int avail,             [r3]
14605+@    const pixel * const src_l,            [sp, #0]
14606+@    const pixel * const src_u,            [sp, #4]
14607+@    const pixel * const src_ur,           [sp, #8]
14608+@    const unsigned int stride,            [sp, #12] (pels)
14609+@    const unsigned int top_right_size,    [sp, #16]
14610+@    const unsigned int down_left_size)    [sp, #20]
14611+
14612+.set    sp_base, 8*4
14613+.set    ur_size, sp_base + 16
14614+.set    dl_size, sp_base + 20
14615+.set    pw_s,    1
14616+.set    pw,      (1 << pw_s)
14617+.set    log2_s,  3
14618+.set    p_size,  (1 << log2_s)          @ size in pels
14619+
14620+function ff_hevc_rpi_intra_filter_8_neon_16, export=1
14621+        push      {r4-r10, lr}
14622+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
14623+
14624+        it cs
14625+        vldmcs     r6,  {d4, d5}
14626+        ldr        r12, [sp, #ur_size]
14627+        bpl        1f
14628+        cmp        r12, #4
14629+        vldm       r5,  {d6, d7}
14630+        bgt        1f
14631+        vdup.16    d7,  d6[3]
14632+1:
14633+        lsls       r12, r7,  #AVAIL_S_L_N_DL_C
14634+        vdup.16    q1,  d0[0]
14635+        bpl        1f
14636+        vld1.16   {d0[0]}, [r10], r9
14637+        vld1.16   {d0[1]}, [r3],  r9
14638+        vld1.16   {d0[2]}, [r10], r9
14639+        vld1.16   {d0[3]}, [r3],  r9
14640+        vld1.16   {d1[0]}, [r10], r9
14641+        vld1.16   {d1[1]}, [r3],  r9
14642+        vld1.16   {d1[2]}, [r10]
14643+        vld1.16   {d1[3]}, [r3]
14644+1:
14645+        bcc        1f
14646+        ldr        r12, [sp, #dl_size]
14647+        vld1.16   {d2[1]}, [r4],  r9
14648+        cmp        r12, #p_size
14649+        vld1.16   {d2[2]}, [r8],  r9
14650+        vld1.16   {d2[3]}, [r4],  r9
14651+        blt        2f
14652+        vld1.16   {d3[0]}, [r8],  r9
14653+        vld1.16   {d3[1]}, [r4],  r9
14654+        vld1.16   {d3[2]}, [r8]
14655+        vld1.16   {d3[3]}, [r4]
14656+        b          1f
14657+2:
14658+        vdup.16    d3,  d2[3]
14659+1:
14660+        tst        r2,  #FILTER_LIGHT
14661+        add        r12, r0,  #-pw
14662+        beq        10f
14663+
14664+        @ Luma light filter
14665+        vext.16    q9,  q2,  q3,  #7
14666+        vext.16    q8,  q15, q2,  #7
14667+        vext.16    q13, q0,  q1,  #7
14668+        vext.16    q12, q15, q0,  #7
14669+        vadd.u16   q9,  q3
14670+        vadd.u16   q8,  q2
14671+        vadd.u16   q13, q1
14672+        vadd.u16   q12, q0
14673+        vmov.u16   r3,  d7[3]           @ Save final pel
14674+        vmov.u16   r2,  d3[3]           @ Save final pel
14675+
14676+        vext.16    q2,  q8,  q9,  #1
14677+        vext.16    q3,  q9,  q9,  #1
14678+        vext.16    q0,  q12, q13, #1
14679+        vext.16    q1,  q13, q13, #1
14680+        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
14681+        vadd.u16   q2,  q8
14682+        vadd.u16   q3,  q9
14683+        vadd.u16   q0,  q12
14684+        vadd.u16   q1,  q13
14685+
14686+        vrshr.u16  q2,  #2
14687+        vrshr.u16  q3,  #2
14688+        vrshr.u16  q0,  #2
14689+        vrshr.u16  q1,  #2
14690+        vrshr.u16  d30, #2
14691+        vmov.u16   d7[3], r3            @ Restore final pel
14692+        vmov.u16   d3[3], r2            @ Restore final pel
14693+        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
14694+
14695+10:
14696+        vst1.16   {q2,  q3}, [r1]       @ Up
14697+        vst1.16   {d31[3]}, [r12]       @ Up-left
14698+        vst1.16   {q0,  q1}, [r0]       @ Left
14699+        pop       {r4-r10, pc}
14700+endfunc
14701+
14702+@ int ff_hevc_rpi_intra_filter_16_neon_16(
14703+@    pixel * const left,                   [r0]
14704+@    pixel * const top,                    [r1]
14705+@    const unsigned int req,               [r2]
14706+@    const unsigned int avail,             [r3]
14707+@    const pixel * const src_l,            [sp, #0]
14708+@    const pixel * const src_u,            [sp, #4]
14709+@    const pixel * const src_ur,           [sp, #8]
14710+@    const unsigned int stride,            [sp, #12] (pels)
14711+@    const unsigned int top_right_size,    [sp, #16]
14712+@    const unsigned int down_left_size)    [sp, #20]
14713+
14714+.set    sp_base, 8*4
14715+.set    ur_size, sp_base + 16
14716+.set    dl_size, sp_base + 20
14717+.set    pw_s,    1
14718+.set    pw,      (1 << pw_s)
14719+.set    log2_s,  4
14720+.set    p_size,  (1 << log2_s)          @ size in pels
14721+
14722+function ff_hevc_rpi_intra_filter_16_neon_16, export=1
14723+        push      {r4-r10, lr}
14724+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
14725+
14726+        vdup.16    q9,  d16[0]
14727+        vdup.16    q11, d20[0]
14728+
14729+        it cs
14730+        vldmcs     r6,  {d16-d19}
14731+        ldr        r12, [sp, #ur_size]
14732+        bpl        1f
14733+        cmp        r12, #12
14734+        @ Given chroma frame layout, if UR exists then it is always legit to
14735+        @ load all of it even if most of it is outside the frame.
14736+        vldm       r5,  {d20-d23}
14737+        bgt        1f
14738+        bge        4f
14739+        cmp        r12,  #8
14740+        bge        3f
14741+        vdup.16    d21, d20[3]
14742+3:      vdup.16    d22, d21[3]
14743+4:      vdup.16    d23, d22[3]
14744+
14745+1:
14746+        lsls       r7,  #AVAIL_S_L_N_DL_C
14747+        ldr        r12, [sp, #dl_size]
14748+        vdup.16    q1,  d0[0]
14749+        vdup.16    q2,  d0[0]
14750+        vdup.16    q3,  d0[0]
14751+        bpl        1f
14752+        vld1.16   {d0[0]}, [r10], r9
14753+        vld1.16   {d0[1]}, [r3],  r9
14754+        vld1.16   {d0[2]}, [r10], r9
14755+        vld1.16   {d0[3]}, [r3],  r9
14756+        vld1.16   {d1[0]}, [r10], r9
14757+        vld1.16   {d1[1]}, [r3],  r9
14758+        vld1.16   {d1[2]}, [r10], r9
14759+        vld1.16   {d1[3]}, [r3],  r9
14760+        vld1.16   {d2[0]}, [r10], r9
14761+        vld1.16   {d2[1]}, [r3],  r9
14762+        vld1.16   {d2[2]}, [r10], r9
14763+        vld1.16   {d2[3]}, [r3],  r9
14764+        vld1.16   {d3[0]}, [r10], r9
14765+        vld1.16   {d3[1]}, [r3],  r9
14766+        vld1.16   {d3[2]}, [r10]
14767+        vld1.16   {d3[3]}, [r3]
14768+1:
14769+        bcc        1f
14770+        vld1.16   {d4[1]}, [r4],  r9
14771+        cmp        r12, #4
14772+        vld1.16   {d4[2]}, [r8],  r9
14773+        vld1.16   {d4[3]}, [r4],  r9
14774+        ble        2f
14775+        vld1.16   {d5[0]}, [r8],  r9
14776+        vld1.16   {d5[1]}, [r4],  r9
14777+        cmp        r12, #12
14778+        vld1.16   {d5[2]}, [r8],  r9
14779+        vld1.16   {d5[3]}, [r4],  r9
14780+        blt        3f
14781+        vld1.16   {d6[0]}, [r8],  r9
14782+        vld1.16   {d6[1]}, [r4],  r9
14783+        vld1.16   {d6[2]}, [r8],  r9
14784+        vld1.16   {d6[3]}, [r4],  r9
14785+        ble        4f
14786+        vld1.16   {d7[0]}, [r8],  r9
14787+        vld1.16   {d7[1]}, [r4],  r9
14788+        vld1.16   {d7[2]}, [r8]
14789+        vld1.16   {d7[3]}, [r4]
14790+        b          1f
14791+2:      vdup.16    d5,  d4[3]
14792+3:      vdup.16    d6,  d5[3]
14793+4:      vdup.16    d7,  d6[3]
14794+1:
14795+        tst        r2,  #FILTER_LIGHT
14796+        add        r12, r0,  #-pw
14797+        beq        10f
14798+
14799+        vpush     {q5}
14800+        @ Luma light filter
14801+        @ Left
14802+        vext.16    q5,  q2,  q3,  #7
14803+        vext.16    q14, q1,  q2,  #7
14804+        vext.16    q13, q0,  q1,  #7
14805+        vext.16    q12, q15, q0,  #7
14806+
14807+        vadd.u16   q5,  q3
14808+        vadd.u16   q14, q2
14809+        vadd.u16   q13, q1
14810+        vadd.u16   q12, q0
14811+        vmov.u16   r2,  d7[3]           @ Save final pel
14812+
14813+        vext.16    q0,  q12, q13, #1
14814+        vext.16    q1,  q13, q14, #1
14815+        vext.16    q2,  q14, q5,  #1
14816+        vext.16    q3,  q5,  q5,  #1
14817+
14818+        vmov       d30, d24             @ d30[0] = l[0] + ul
14819+        vadd.u16   q0,  q12
14820+        vadd.u16   q1,  q13
14821+        vadd.u16   q2,  q14
14822+        vadd.u16   q3,  q5
14823+
14824+        vrshr.u16  q0,  #2
14825+        vrshr.u16  q1,  #2
14826+        vrshr.u16  q2,  #2
14827+        vrshr.u16  q3,  #2
14828+
14829+        @ Up
14830+        vext.16    q5,  q10, q11, #7
14831+        vext.16    q14, q9,  q10, #7
14832+        vext.16    q13, q8,  q9,  #7
14833+        vext.16    q12, q15, q8,  #7
14834+
14835+        vadd.u16   q5,  q11
14836+        vadd.u16   q14, q10
14837+        vadd.u16   q13, q9
14838+        vadd.u16   q12, q8
14839+        vmov.u16   r3,  d23[3]          @ Save final pel
14840+
14841+        vext.16    q8,  q12, q13, #1
14842+        vext.16    q9,  q13, q14, #1
14843+        vext.16    q10, q14, q5,  #1
14844+        vext.16    q11, q5,  q5,  #1
14845+
14846+        vadd.u16   d30, d24             @ d30[0] = l[0] + 2ul + u[0]
14847+        vadd.u16   q8,  q12
14848+        vadd.u16   q9,  q13
14849+        vadd.u16   q10, q14
14850+        vadd.u16   q11, q5
14851+
14852+        vrshr.u16  q8,  #2
14853+        vrshr.u16  q9,  #2
14854+        vrshr.u16  q10, #2
14855+        vrshr.u16  q11, #2
14856+
14857+        @ Misc
14858+        vrshr.u16  d30, #2
14859+        vmov.u16   d7[3], r2            @ Restore final pel
14860+        vmov.u16   d23[3], r3           @ Restore final pel
14861+        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
14862+        vpop      {q5}
14863+
14864+10:
14865+        vstm       r1, {d16-d23}        @ Up
14866+        vst1.16   {d31[3]}, [r12]       @ Up-left
14867+        vstm       r0, { d0-d7 }        @ Left
14868+        pop       {r4-r10, pc}
14869+endfunc
14870+
14871+@ int ff_hevc_rpi_intra_filter_4_neon_32(
14872+@    pixel * const left,                   [r0]
14873+@    pixel * const top,                    [r1]
14874+@    const unsigned int req,               [r2]
14875+@    const unsigned int avail,             [r3]
14876+@    const pixel * const src_l,            [sp, #0]
14877+@    const pixel * const src_u,            [sp, #4]
14878+@    const pixel * const src_ur,           [sp, #8]
14879+@    const unsigned int stride,            [sp, #12] (pels)
14880+@    const unsigned int top_right_size,    [sp, #16]
14881+@    const unsigned int down_left_size)    [sp, #20]
14882+
14883+.set    sp_base, 8*4
14884+.set    pw_s,    2
14885+.set    pw,      (1 << pw_s)
14886+.set    log2_s,  2
14887+
14888+function ff_hevc_rpi_intra_filter_4_neon_32, export=1
14889+        push       {r4-r10, lr}
14890+        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
14891+
14892+        it cs
14893+        vldmcs     r6,  {d4, d5}
14894+        it mi
14895+        vldmmi     r5,  {d6, d7}
14896+        lsls       r7,  #AVAIL_S_L_N_DL_C
14897+        vdup.32    q1,  d0[0]
14898+        add        r12, r0,  #-pw
14899+        bpl        1f
14900+        vld1.32   {d0[0]}, [r10], r9
14901+        vld1.32   {d0[1]}, [r3],  r9
14902+        vld1.32   {d1[0]}, [r10]
14903+        vld1.32   {d1[1]}, [r3]
14904+1:
14905+        bcc        1f
14906+        vld1.32   {d2[1]}, [r4],  r9
14907+        vld1.32   {d3[0]}, [r8]
14908+        vld1.32   {d3[1]}, [r4]
14909+1:
14910+        vst1.32    {q2,  q3 }, [r1]     @ Up
14911+        vst1.32    {d31[1]}, [r12]
14912+        vst1.32    {q0,  q1 }, [r0]     @ Left
14913+        pop        {r4-r10, pc}
14914+endfunc
14915+
14916+
14917+@ int ff_hevc_rpi_intra_filter_8_neon_32(
14918+@    pixel * const left,                   [r0]
14919+@    pixel * const top,                    [r1]
14920+@    const unsigned int req,               [r2]
14921+@    const unsigned int avail,             [r3]
14922+@    const pixel * const src_l,            [sp, #0]
14923+@    const pixel * const src_u,            [sp, #4]
14924+@    const pixel * const src_ur,           [sp, #8]
14925+@    const unsigned int stride,            [sp, #12] (pels)
14926+@    const unsigned int top_right_size,    [sp, #16]
14927+@    const unsigned int down_left_size)    [sp, #20]
14928+
14929+.set    sp_base, 8*4
14930+.set    ur_size, sp_base + 16
14931+.set    dl_size, sp_base + 20
14932+.set    pw_s,    2
14933+.set    pw,      (1 << pw_s)
14934+.set    log2_s,  3
14935+.set    p_size,  (1 << log2_s)          @ size in pels
14936+
14937+function ff_hevc_rpi_intra_filter_8_neon_32, export=1
14938+        push       {r4-r10, lr}
14939+        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
14940+
14941+        vdup.32    q9,  d16[0]
14942+        vdup.32    q11, d20[0]
14943+
14944+        it cs
14945+        vldmcs     r6,  {q8,  q9 }
14946+        ldr        r12, [sp, #ur_size]
14947+        bpl        1f
14948+        cmp        r12, #p_size
14949+        vldm       r5,  {q10, q11}
14950+        bge        1f
14951+        vdup.32    q11, d21[1]
14952+1:
14953+        lsls       r7,  #AVAIL_S_L_N_DL_C
14954+        vdup.32    q1,  d0[0]
14955+        vdup.32    q2,  d0[0]
14956+        vdup.32    q3,  d0[0]
14957+        bpl        1f
14958+        vld1.32   {d0[0]}, [r10], r9
14959+        vld1.32   {d0[1]}, [r3],  r9
14960+        vld1.32   {d1[0]}, [r10], r9
14961+        vld1.32   {d1[1]}, [r3],  r9
14962+        vld1.32   {d2[0]}, [r10], r9
14963+        vld1.32   {d2[1]}, [r3],  r9
14964+        vld1.32   {d3[0]}, [r10]
14965+        vld1.32   {d3[1]}, [r3]
14966+1:
14967+        bcc        1f
14968+        ldr        r12, [sp, #dl_size]
14969+        vld1.32   {d4[1]}, [r4],  r9
14970+        cmp        r12, #p_size
14971+        vld1.32   {d5[0]}, [r8],  r9
14972+        vld1.32   {d5[1]}, [r4],  r9
14973+        blt        2f
14974+        vld1.32   {d6[0]}, [r8],  r9
14975+        vld1.32   {d6[1]}, [r4],  r9
14976+        vld1.32   {d7[0]}, [r8]
14977+        vld1.32   {d7[1]}, [r4]
14978+        b          1f
14979+2:
14980+        vdup.32    q3,  d5[1]
14981+1:
14982+        add        r12, r0,  #-pw
14983+        vstm       r1,  { q8-q11}       @ Up
14984+        vst1.32   {d31[1]}, [r12]
14985+        vstm       r0,  { q0-q3 }       @ Left
14986+        pop       {r4-r10, pc}
14987+endfunc
14988+
14989+
14990+@ int ff_hevc_rpi_intra_filter_16_neon_32(
14991+@    pixel * const left,                   [r0]
14992+@    pixel * const top,                    [r1]
14993+@    const unsigned int req,               [r2]
14994+@    const unsigned int avail,             [r3]
14995+@    const pixel * const src_l,            [sp, #0]
14996+@    const pixel * const src_u,            [sp, #4]
14997+@    const pixel * const src_ur,           [sp, #8]
14998+@    const unsigned int stride,            [sp, #12] (pels)
14999+@    const unsigned int top_right_size,    [sp, #16]
15000+@    const unsigned int down_left_size)    [sp, #20]
15001+
15002+.set    sp_base, 8*4
15003+.set    ur_size, sp_base + 16
15004+.set    dl_size, sp_base + 20
15005+.set    pw_s,    2
15006+.set    pw,      (1 << pw_s)
15007+.set    log2_s,  4
15008+.set    p_size,  (1 << log2_s)          @ size in pels
15009+
15010+function ff_hevc_rpi_intra_filter_16_neon_32, export=1
15011+        push       {r4-r10, lr}
15012+        load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
15013+
15014+        @ Once we get this big we have run out of neon regs to store
15015+        @ everything at once so do in pieces
15016+
15017+        @ Up (have)
15018+        it cs
15019+        vldmcs     r6,  { q0-q3 }
15020+        ldr        r12, [sp, #ur_size]
15021+        it mi
15022+        vldmmi     r5,  { q8-q11}
15023+        it cs
15024+        vstmcs     r1,  { q0-q3 }
15025+        bpl        1f
15026+        cmp        r12, #12
15027+        add        lr,  r1,  #(pw << log2_s)
15028+        bgt        2f
15029+        cmp        r12, #8
15030+        bge        3f
15031+        vdup.16    q9,  d17[1]
15032+4:      vdup.16    d10, d19[1]
15033+3:      vdup.16    q11, d21[1]
15034+2:      vstm       lr, { q8-q11}
15035+1:
15036+
15037+        @ Left (have)
15038+        add        lr,  r0,  #-pw
15039+        lsls       r12, r7,  #AVAIL_S_L_N_DL_C
15040+        vst1.32   {d30[1]}, [lr]        @ UL
15041+        bpl        1f
15042+        vld1.32   { d0[0]}, [r10], r9
15043+        vld1.32   { d0[1]}, [r3],  r9
15044+        vld1.32   { d1[0]}, [r10], r9
15045+        vld1.32   { d1[1]}, [r3],  r9
15046+        vld1.32   { d2[0]}, [r10], r9
15047+        vld1.32   { d2[1]}, [r3],  r9
15048+        vld1.32   { d3[0]}, [r10], r9
15049+        vld1.32   { d3[1]}, [r3],  r9
15050+        vld1.32   { d4[0]}, [r10], r9
15051+        vld1.32   { d4[1]}, [r3],  r9
15052+        vld1.32   { d5[0]}, [r10], r9
15053+        vld1.32   { d5[1]}, [r3],  r9
15054+        vld1.32   { d6[0]}, [r10], r9
15055+        vld1.32   { d6[1]}, [r3],  r9
15056+        vld1.32   { d7[0]}, [r10]
15057+        vld1.32   { d7[1]}, [r3]
15058+        vstm       r0,  { q0-q3 }
15059+1:
15060+        bcc        1f
15061+        ldr        r12, [sp, #dl_size]
15062+        vdup.32    d16, d30[0]          @ d16[0] = d30[0]
15063+        add        lr,  r0,  #(pw << log2_s)
15064+        vld1.32   {d16[1]}, [r4],  r9
15065+        cmp        r12, #4
15066+        vld1.32   {d17[0]}, [r8],  r9
15067+        vld1.32   {d17[1]}, [r4],  r9
15068+        ble        2f
15069+        vld1.32   {d18[0]}, [r8],  r9
15070+        vld1.32   {d18[1]}, [r4],  r9
15071+        cmp        r12, #12
15072+        vld1.32   {d19[0]}, [r8],  r9
15073+        vld1.32   {d19[1]}, [r4],  r9
15074+        blt        3f
15075+        vld1.32   {d20[0]}, [r8],  r9
15076+        vld1.32   {d20[1]}, [r4],  r9
15077+        vld1.32   {d21[0]}, [r8],  r9
15078+        vld1.32   {d21[1]}, [r4],  r9
15079+        ble        4f
15080+        vld1.32   {d22[0]}, [r8],  r9
15081+        vld1.32   {d22[1]}, [r4],  r9
15082+        vld1.32   {d23[0]}, [r8]
15083+        vld1.32   {d23[1]}, [r4]
15084+        b          5f
15085+2:      vdup.32    q9,  d17[1]
15086+3:      vdup.32    q10, d19[1]
15087+4:      vdup.32    q11, d21[1]
15088+5:      vstm       lr,  { q8-q11}
15089+1:
15090+        eors       r7,  r2
15091+        beq        99f
15092+
15093+        lsls       r12, r7,  #AVAIL_S_UR_N_U_C
15094+        vdup.32    q0,  d31[0]
15095+        vdup.32    q1,  d31[0]
15096+        vdup.32    q2,  d31[0]
15097+        vdup.32    q3,  d31[0]
15098+        add        lr,  r1,  #(pw << log2_s)
15099+        vdup.32    q8,  d31[1]
15100+        vdup.32    q9,  d31[1]
15101+        vdup.32    q10, d31[1]
15102+        vdup.32    q11, d31[1]
15103+        it cs
15104+        vstmcs     r1,  { q0-q3 }
15105+        it mi
15106+        vstmmi     lr,  { q8-q11}
15107+
15108+        lsls       r7,  #AVAIL_S_L_N_DL_C
15109+        vdup.32    q0,  d30[0]
15110+        vdup.32    q1,  d30[0]
15111+        vdup.32    q2,  d30[0]
15112+        vdup.32    q3,  d30[0]
15113+        add        lr,  r0,  #(pw << log2_s)
15114+        it mi
15115+        vstmmi     r0, { q0-q3 }
15116+        it cs
15117+        vstmcs     lr, { q0-q3 }
15118+
15119+99:
15120+        pop       {r4-r10, pc}
15121+endfunc
15122+
15123+
15124+
15125+
15126--- /dev/null
15127+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
15128@@ -0,0 +1,920 @@
15129+/*
15130+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
15131+All rights reserved.
15132+
15133+Redistribution and use in source and binary forms, with or without
15134+modification, are permitted provided that the following conditions are met:
15135+    * Redistributions of source code must retain the above copyright
15136+      notice, this list of conditions and the following disclaimer.
15137+    * Redistributions in binary form must reproduce the above copyright
15138+      notice, this list of conditions and the following disclaimer in the
15139+      documentation and/or other materials provided with the distribution.
15140+    * Neither the name of the copyright holder nor the
15141+      names of its contributors may be used to endorse or promote products
15142+      derived from this software without specific prior written permission.
15143+
15144+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15145+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15146+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
15147+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
15148+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
15149+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
15150+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
15151+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
15152+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
15153+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15154+
15155+Authors: John Cox, Ben Avison
15156+*/
15157+
15158+/*
15159+ * Horizontal & Vertical special cases of angular intra pred
15160+ *
15161+ * Split out because:
15162+ *  Vertical, at least, is relatively common
15163+ *  Much simpler code than the general angular case
15164+ *  Luma with size < 32 has extra filtering that doesn't happen anywhere else
15165+ *
15166+ * *** Currently luma filtering is mandatory where it occurs, but there are
15167+ *     cases where it should be turned off (rdpcm & an extension sps flag).
15168+ *     These don't occur in the standard conformance suite for Main Profile
15169+ */
15170+
15171+#include "libavutil/arm/asm.S"
15172+#include "neon.S"
15173+
15174+@ ff_hevc_rpi_pred_vertical_4_neon_8
15175+@       uint8_t *_src,          [r0]
15176+@       const uint8_t *_top,    [r1]
15177+@       const uint8_t *_left,   [r2]
15178+@       ptrdiff_t stride)       [r3]
15179+
15180+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
15181+        ldrb        ip, [r2, #-1]       @ Top-left
15182+        vld1.32     {d0[0]}, [r2 :32]   @ Left
15183+        add         r2, r0, r3
15184+        vld1.8      {d1[]}, [r1]
15185+        lsl         r3, #1
15186+        vdup.8      d4, ip
15187+        vmov.i8     d2, #128
15188+        vhsub.u8    d4, d0, d4
15189+        veor        d1, d2
15190+        vld1.32     {d0[0]}, [r1 :32]   @ Top
15191+        vqadd.s8    d1, d4
15192+        vmov.i64    d3, #0xff
15193+        vmov        d4, d0
15194+        veor        d5, d1, d2
15195+        veor        d1, d1, d2
15196+        vbit        d0, d1, d3
15197+        vshr.u64    d5, #8
15198+        vst1.32     {d0[0]}, [r0], r3
15199+        vshr.u64    d1, #16
15200+        vbit        d4, d5, d3
15201+        vshr.u64    d5, #16
15202+        vst1.32     {d4[0]}, [r2], r3
15203+        vbit        d0, d1, d3
15204+        vst1.32     {d0[0]}, [r0]
15205+        vbit        d4, d5, d3
15206+        vst1.32     {d4[0]}, [r2]
15207+
15208+        bx          lr
15209+endfunc
15210+
15211+
15212+@ ff_hevc_rpi_pred_vertical_8_neon_8
15213+@       uint8_t *_src,          [r0]
15214+@       const uint8_t *_top,    [r1]
15215+@       const uint8_t *_left,   [r2]
15216+@       ptrdiff_t stride)       [r3]
15217+
15218+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
15219+        ldrb        ip, [r2, #-1]       @ Top-left
15220+        vld1.8      {d0}, [r2 :64]      @ Left
15221+        vmov.i8     d1, #128
15222+        vld1.8      {d2[]}, [r1]
15223+        vld1.8      {d3}, [r1 :64]      @ Top
15224+        vdup.8      d4, ip
15225+        vhsub.u8    d4, d0, d4
15226+        veor        d2, d1
15227+        vmov.i64    d0, #0xff
15228+        mov         r1, #8
15229+        vqadd.s8    d2, d4, d2
15230+        veor        d1, d2, d1
15231+1:
15232+        vbit        d3, d1, d0
15233+        vshr.u64    d1, #8
15234+        vst1.8      {d3}, [r0 :64], r3
15235+        subs        r1, #2
15236+        vbit        d3, d1, d0
15237+        vshr.u64    d1, #8
15238+        vst1.8      {d3}, [r0 :64], r3
15239+        bne         1b
15240+
15241+        bx          lr
15242+endfunc
15243+
15244+
15245+@ ff_hevc_rpi_pred_vertical_16_neon_8
15246+@       uint8_t *_src,          [r0]
15247+@       const uint8_t *_top,    [r1]
15248+@       const uint8_t *_left,   [r2]
15249+@       ptrdiff_t stride)       [r3]
15250+
15251+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
15252+        ldrb        ip, [r2, #-1]       @ Top-left
15253+        vld1.8      {q0}, [r2 :128]     @ Left
15254+        vdup.8      q1, ip
15255+        vld1.8      {d4[],d5[]}, [r1]
15256+        vhsub.u8    q0, q1
15257+        vmov.i8     q1, #128
15258+        veor        q2, q1
15259+        vmov.i64    d16, #0xff
15260+        vqadd.s8    q0, q2
15261+        vld1.8      {q3}, [r1 :128]     @ Top
15262+        mov         r1, #16
15263+        veor        q0, q1
15264+        vmov        q1, q3
15265+        vext.8      q2, q0, q0, #1
15266+1:
15267+        vbit        d2, d0, d16
15268+        vbit        d6, d4, d16
15269+        vext.8      q0, q0, q0, #2
15270+        subs        r1, #2
15271+        vst1.8      {q1}, [r0 :128], r3
15272+        vext.8      q2, q2, q2, #2
15273+        vst1.8      {q3}, [r0 :128], r3
15274+        bne         1b
15275+
15276+        bx          lr
15277+endfunc
15278+
15279+
15280+@ ff_hevc_rpi_pred_vertical_32_neon_8
15281+@       uint8_t *_src,          [r0]
15282+@       const uint8_t *_top,    [r1]
15283+@       const uint8_t *_left,   [r2]
15284+@       ptrdiff_t stride)       [r3]
15285+
15286+function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
15287+        vld1.8     {q0,  q1 }, [r1  :128]    @ Up
15288+        add         r2,  r0,  r3
15289+        lsl         r3,  #1
15290+        mov         r1,  #16
15291+1:
15292+        vst1.8     {q0,  q1 }, [r0  :128], r3
15293+        subs        r1,  #1
15294+        vst1.8     {q0,  q1 }, [r2  :128], r3
15295+        bne         1b
15296+
15297+        bx          lr
15298+endfunc
15299+
15300+
15301+@ ff_hevc_rpi_pred_vertical_c_4_neon_8
15302+@       uint8_t *_src,          [r0]
15303+@       const uint8_t *_top,    [r1]
15304+@       const uint8_t *_left,   [r2]
15305+@       ptrdiff_t stride)       [r3]
15306+
15307+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
15308+        vld1.16    {d0 }, [r1  :64]    @ Up
15309+        add         r2,  r0,  r3,  lsl #1
15310+        lsl         r3,  #2
15311+
15312+        vst1.16    {d0 }, [r0  :64], r3
15313+        vst1.16    {d0 }, [r2  :64], r3
15314+        vst1.16    {d0 }, [r0  :64]
15315+        vst1.16    {d0 }, [r2  :64]
15316+
15317+        bx          lr
15318+endfunc
15319+
15320+
15321+@ ff_hevc_rpi_pred_vertical_c_8_neon_8
15322+@       uint8_t *_src,          [r0]
15323+@       const uint8_t *_top,    [r1]
15324+@       const uint8_t *_left,   [r2]
15325+@       ptrdiff_t stride)       [r3]
15326+
15327+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
15328+        vld1.16    {q0 }, [r1  :128]    @ Up
15329+        add         r2,  r0,  r3,  lsl #1
15330+        lsl         r3,  #2
15331+        mov         r1,  #4
15332+1:
15333+        vst1.16    {q0 }, [r0  :128], r3
15334+        subs        r1,  #2
15335+        vst1.16    {q0 }, [r2  :128], r3
15336+        vst1.16    {q0 }, [r0  :128], r3
15337+        vst1.16    {q0 }, [r2  :128], r3
15338+        bne         1b
15339+
15340+        bx          lr
15341+endfunc
15342+
15343+
15344+@ ff_hevc_rpi_pred_vertical_c_16_neon_8
15345+@       uint8_t *_src,          [r0]
15346+@       const uint8_t *_top,    [r1]
15347+@       const uint8_t *_left,   [r2]
15348+@       ptrdiff_t stride)       [r3]
15349+
15350+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
15351+        vld1.16    {q0,  q1 }, [r1  :128]    @ Up
15352+        add         r2,  r0,  r3,  lsl #1
15353+        lsl         r3,  #2
15354+        mov         r1,  #8
15355+1:
15356+        vst1.16    {q0,  q1 }, [r0  :128], r3
15357+        subs        r1,  #1
15358+        vst1.16    {q0,  q1 }, [r2  :128], r3
15359+        bne         1b
15360+
15361+        bx          lr
15362+endfunc
15363+
15364+
15365+@ ff_hevc_rpi_pred_horizontal_4_neon_8
15366+@       uint8_t *_src,          [r0]
15367+@       const uint8_t *_top,    [r1]
15368+@       const uint8_t *_left,   [r2]
15369+@       ptrdiff_t stride)       [r3]
15370+
15371+@ ? Might be faster as simple arm
15372+
15373+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
15374+        ldrb        ip, [r2, #-1]       @ Top-left
15375+        vld1.32     {d0[0]}, [r1 :32]   @ Top
15376+        add         r1, r2, #3
15377+        vld1.8      {d1[]}, [r2]!
15378+        vdup.8      d2, ip
15379+        vmov.i8     d3, #128
15380+        vhsub.u8    d0, d2
15381+        veor        d1, d3
15382+        vld1.8      {d2[]}, [r2]!
15383+        add         ip, r0, r3
15384+        vqadd.s8    d0, d0, d1
15385+        lsl         r3, #1
15386+        vld1.8      {d1[]}, [r2]
15387+        vld1.8      {d4[]}, [r1]
15388+        veor        d0, d3
15389+        vst1.32     {d0[0]}, [r0 :32], r3
15390+        vst1.32     {d2[0]}, [ip :32], r3
15391+        vst1.32     {d1[0]}, [r0 :32]
15392+        vst1.32     {d4[0]}, [ip :32]
15393+
15394+        bx          lr
15395+endfunc
15396+
15397+
15398+@ ff_hevc_rpi_pred_horizontal_8_neon_8
15399+@       uint8_t *_src,          [r0]
15400+@       const uint8_t *_top,    [r1]
15401+@       const uint8_t *_left,   [r2]
15402+@       ptrdiff_t stride)       [r3]
15403+
15404+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
15405+        ldrb        ip, [r2, #-1]       @ Top-left
15406+        vld1.8      {d0}, [r1 :64]      @ Top
15407+        vmov.i8     d1, #128
15408+        vld1.8      {d2[]}, [r2]!
15409+        mov         r1, #8-2
15410+        vdup.8      d3, ip
15411+        vhsub.u8    d0, d3
15412+        veor        d2, d1
15413+        vqadd.s8    d0, d2
15414+          vld1.8      {d2[]}, [r2]!
15415+        veor        d0, d1
15416+        vst1.8      {d0}, [r0], r3
15417+1:
15418+            vld1.8      {d0[]}, [r2]!
15419+        subs        r1, #2
15420+          vst1.8      {d2}, [r0 :64], r3
15421+              vld1.8      {d2[]}, [r2]!
15422+            vst1.8      {d0}, [r0 :64], r3
15423+        bne         1b
15424+
15425+              vst1.8      {d2}, [r0 :64]
15426+        bx          lr
15427+endfunc
15428+
15429+
15430+@ ff_hevc_rpi_pred_horizontal_16_neon_8
15431+@       uint8_t *_src,          [r0]
15432+@       const uint8_t *_top,    [r1]
15433+@       const uint8_t *_left,   [r2]
15434+@       ptrdiff_t stride)       [r3]
15435+
15436+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
15437+        ldrb        ip, [r2, #-1]       @ Top-left
15438+        vld1.8      {q0}, [r1 :64]      @ Top
15439+        mov         r1, #16-2
15440+        vld1.8      {d4[],d5[]}, [r2]!
15441+        vdup.8      q3, ip
15442+        vhsub.u8    q0, q3
15443+        vmov.i8     q1, #128
15444+        veor        q2, q1
15445+        vqadd.s8    q0, q2
15446+          vld1.8      {d4[],d5[]}, [r2]!
15447+        veor        q0, q1
15448+        vst1.8      {q0}, [r0], r3
15449+1:
15450+            vld1.8      {d0[],d1[]}, [r2]!
15451+        subs        r1, #2
15452+          vst1.8      {q2}, [r0 :64], r3
15453+              vld1.8      {d4[],d5[]}, [r2]!
15454+            vst1.8      {q0}, [r0 :64], r3
15455+        bne         1b
15456+
15457+              vst1.8      {q2}, [r0 :64]
15458+        bx          lr
15459+endfunc
15460+
15461+
15462+@ ff_hevc_rpi_pred_horizontal_32_neon_8
15463+@       uint8_t *_src,          [r0]
15464+@       const uint8_t *_top,    [r1]
15465+@       const uint8_t *_left,   [r2]
15466+@       ptrdiff_t stride)       [r3]
15467+
15468+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
15469+        vld1.8      {d0[],d1[]}, [r2]!
15470+        add         ip, r0, #16
15471+        mov         r1, #32-2
15472+          vld1.8      {d2[],d3[]}, [r2]!
15473+        vst1.8      {q0}, [r0 :128], r3
15474+        vst1.8      {q0}, [ip :128], r3
15475+1:
15476+            vld1.8      {d0[],d1[]}, [r2]!
15477+        subs        r1, #2
15478+          vst1.8      {q1}, [r0 :128], r3
15479+          vst1.8      {q1}, [ip :128], r3
15480+              vld1.8      {d2[],d3[]}, [r2]!
15481+            vst1.8      {q0}, [r0 :128], r3
15482+            vst1.8      {q0}, [ip :128], r3
15483+        bne         1b
15484+
15485+              vst1.8      {q1}, [r0 :128]
15486+              vst1.8      {q1}, [ip :128]
15487+        bx          lr
15488+endfunc
15489+
15490+
15491+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
15492+@       uint8_t *_src,          [r0]
15493+@       const uint8_t *_top,    [r1]
15494+@       const uint8_t *_left,   [r2]
15495+@       ptrdiff_t stride)       [r3]
15496+
15497+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
15498+        add         r1, r2, #2
15499+        vld1.16     {d0[]}, [r2]
15500+        add         r2, #4
15501+        vld1.16     {d1[]}, [r1]
15502+        add         r1, #4
15503+        vld1.16     {d2[]}, [r2]
15504+A       add         r2, r0, r3, lsl #1
15505+T       lsl         r3, #1
15506+T       add         r2, r0, r3
15507+        vld1.16     {d3[]}, [r1]
15508+A       lsl         r3, #2
15509+T       lsl         r3, #1
15510+        vst1.16     {d0}, [r0 :64], r3
15511+        vst1.16     {d1}, [r2 :64], r3
15512+        vst1.16     {d2}, [r0 :64]
15513+        vst1.16     {d3}, [r2 :64]
15514+
15515+        bx          lr
15516+endfunc
15517+
15518+
15519+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
15520+@       uint8_t *_src,          [r0]
15521+@       const uint8_t *_top,    [r1]
15522+@       const uint8_t *_left,   [r2]
15523+@       ptrdiff_t stride)       [r3]
15524+
15525+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
15526+        vld1.16     {d0[],d1[]}, [r2]!
15527+        lsl         r3, #1
15528+          vld1.16     {d2[],d3[]}, [r2]!
15529+        mov         r1, #8-2
15530+        vst1.16     {q0}, [r0 :64], r3
15531+1:
15532+            vld1.16     {d0[],d1[]}, [r2]!
15533+        subs        r1, #2
15534+          vst1.16     {q1}, [r0 :64], r3
15535+              vld1.16     {d2[],d3[]}, [r2]!
15536+            vst1.16     {q0}, [r0 :64], r3
15537+        bne         1b
15538+
15539+              vst1.16     {q1}, [r0 :64]
15540+        bx          lr
15541+endfunc
15542+
15543+
15544+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
15545+@       uint8_t *_src,          [r0]
15546+@       const uint8_t *_top,    [r1]
15547+@       const uint8_t *_left,   [r2]
15548+@       ptrdiff_t stride)       [r3]
15549+
15550+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
15551+        vld1.16     {d0[],d1[]}, [r2]!
15552+        lsl         r3, #1
15553+        add         ip, r0, #16
15554+        mov         r1, #16-2
15555+          vld1.16     {d2[],d3[]}, [r2]!
15556+        vst1.16     {q0}, [r0 :128], r3
15557+        vst1.16     {q0}, [ip :128], r3
15558+1:
15559+            vld1.16     {d0[],d1[]}, [r2]!
15560+        subs        r1, #2
15561+          vst1.16     {q1}, [r0 :128], r3
15562+          vst1.16     {q1}, [ip :128], r3
15563+              vld1.16     {d2[],d3[]}, [r2]!
15564+            vst1.16     {q0}, [r0 :128], r3
15565+            vst1.16     {q0}, [ip :128], r3
15566+        bne         1b
15567+
15568+              vst1.16     {q1}, [r0 :128]
15569+              vst1.16     {q1}, [ip :128]
15570+        bx          lr
15571+endfunc
15572+
15573+
15574+@------------------------------------------------------------------------------
15575+@
15576+@ 10 Bit
15577+@ Has clipping constants so 10-bit only but could easily be macroed up to
15578+@ 14-bit before we run out of bits
15579+
15580+
15581+@ ff_hevc_rpi_pred_vertical_4_neon_10
15582+@       uint8_t *_src,          [r0]
15583+@       const uint8_t *_top,    [r1]
15584+@       const uint8_t *_left,   [r2]
15585+@       ptrdiff_t stride)       [r3]
15586+
15587+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
15588+        ldrh        ip, [r2, #-2]       @ Top-left
15589+        vld1.16     {d0}, [r2 :64]      @ Left
15590+        vmov.i16    d2, #0
15591+        vld1.16     {d1[]}, [r1]
15592+T       lsl         r3, #1
15593+        vdup.16     d4, ip
15594+        vmov.i16    d3, #0x3ff
15595+        vld1.16     {d5}, [r1 :64]      @ Top
15596+        vhsub.u16   d4, d0, d4
15597+        vmov.i64    d0, #0xffff
15598+A       add         r2, r0, r3, lsl #1
15599+T       add         r2, r0, r3
15600+        vadd.i16    d1, d1, d4
15601+        vmov        d6, d5
15602+        vmax.s16    d1, d1, d2
15603+        vmin.s16    d2, d1, d3
15604+        vmin.s16    d1, d1, d3
15605+        vbit        d5, d1, d0
15606+A       lsl         r3, #2
15607+T       lsl         r3, #1
15608+        vshr.u64    d2, #16
15609+        vshr.u64    d1, #32
15610+        vbit        d6, d2, d0
15611+        vst1.16     {d5}, [r0], r3
15612+        vshr.u64    d2, #32
15613+        vst1.16     {d6}, [r2], r3
15614+        vbit        d5, d1, d0
15615+        vst1.16     {d5}, [r0]
15616+        vbit        d6, d2, d0
15617+        vst1.16     {d6}, [r2]
15618+        bx          lr
15619+endfunc
15620+
15621+
15622+@ ff_hevc_rpi_pred_vertical_8_neon_10
15623+@       uint8_t *_src,          [r0]
15624+@       const uint8_t *_top,    [r1]
15625+@       const uint8_t *_left,   [r2]
15626+@       ptrdiff_t stride)       [r3]
15627+
15628+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
15629+        ldrh        ip, [r2, #-2]       @ Top-left
15630+        vld1.16     {q0}, [r2 :128]     @ Left
15631+        lsl         r3, #1
15632+        vdup.16     q1, ip
15633+        vld1.16     {d4[],d5[]}, [r1]
15634+        vhsub.u16   q0, q0, q1
15635+        vmov.i16    q1, #0
15636+        vadd.i16    q0, q2
15637+        vmov.i16    q2, #0x3ff
15638+        vld1.16     {q3}, [r1 :128]     @ Top
15639+        mov         r1, #8
15640+        vmax.s16    q0, q1
15641+        vmov        q1, q3
15642+        vmin.s16    q0, q2
15643+        vmov.i64    d16, #0xffff
15644+        vext.16     q2, q0, q0, #1
15645+1:
15646+        vbit        d2, d0, d16
15647+        vbit        d6, d4, d16
15648+        vext.16     q0, q0, q0, #2
15649+        subs        r1, #2
15650+        vst1.16     {q1}, [r0 :128], r3
15651+        vext.16     q2, q2, q2, #2
15652+        vst1.16     {q3}, [r0 :128], r3
15653+        bne         1b
15654+
15655+        bx          lr
15656+endfunc
15657+
15658+
15659+@ ff_hevc_rpi_pred_vertical_16_neon_10
15660+@       uint8_t *_src,          [r0]
15661+@       const uint8_t *_top,    [r1]
15662+@       const uint8_t *_left,   [r2]
15663+@       ptrdiff_t stride)       [r3]
15664+
15665+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
15666+        ldrh        ip, [r2, #-2]       @ Top-left
15667+        vld1.16     {q0-q1}, [r2 :128]  @ Left
15668+T       lsl         r3, #1
15669+        vdup.16     q2, ip
15670+A       add         r2, r0, r3, lsl #1
15671+T       add         r2, r0, r3
15672+        vld1.16     {d6[],d7[]}, [r1]
15673+A       lsl         r3, #2
15674+T       lsl         r3, #1
15675+        vhsub.u16   q0, q2
15676+        vhsub.u16   q1, q2
15677+        vadd.i16    q0, q3
15678+        vadd.i16    q1, q3
15679+        vmov.i16    q2, #0
15680+        vld1.16     {q8-q9}, [r1 :128]  @ Top
15681+        mov         r1, #0
15682+        vmov.i16    q3, #0x3ff
15683+        vmax.s16    q0, q2
15684+        vmax.s16    q1, q2
15685+        vmin.s16    q0, q3
15686+        vmin.s16    q1, q3
15687+        vmov        q10, q8
15688+        vmov        q11, q9
15689+        vext.16     q2, q0, q1, #1
15690+        vext.16     q3, q1, q1, #1
15691+        vmov.i64    d24, #0xffff
15692+1:
15693+        vbit        d16, d0, d24
15694+        vbit        d20, d4, d24
15695+        vext.16     q0, q0, q0, #2
15696+        subs        r1, #1<<30
15697+        vst1.16     {q8-q9}, [r0 :128], r3
15698+        vext.16     q2, q2, q2, #2
15699+        vst1.16     {q10-q11}, [r2 :128], r3
15700+        bne         1b
15701+1:
15702+        vbit        d16, d2, d24
15703+        vbit        d20, d6, d24
15704+        vext.16     q1, q1, q1, #2
15705+        subs        r1, #1<<30
15706+        vst1.16     {q8-q9}, [r0 :128], r3
15707+        vext.16     q3, q3, q3, #2
15708+        vst1.16     {q10-q11}, [r2 :128], r3
15709+        bne         1b
15710+
15711+        bx          lr
15712+endfunc
15713+
15714+
15715+@ ff_hevc_rpi_pred_vertical_32_neon_10
15716+@       uint8_t *_src,          [r0]
15717+@       const uint8_t *_top,    [r1]
15718+@       const uint8_t *_left,   [r2]
15719+@       ptrdiff_t stride)       [r3]
15720+
15721+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
15722+        vldm        r1, { q0-q3 }    @ Up
15723+        lsl         r3, #1
15724+        mov         r1, #32
15725+        add         r2, r0, #32
15726+1:
15727+        vst1.16     {q0-q1}, [r0 :128], r3
15728+        subs        r1, #1
15729+        vst1.16     {q2-q3}, [r2 :128], r3
15730+        bne         1b
15731+
15732+        bx          lr
15733+endfunc
15734+
15735+
15736+@ ff_hevc_rpi_pred_vertical_c_4_neon_10
15737+@       uint8_t *_src,          [r0]
15738+@       const uint8_t *_top,    [r1]
15739+@       const uint8_t *_left,   [r2]
15740+@       ptrdiff_t stride)       [r3]
15741+
15742+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
15743+        vld1.16    {q0 }, [r1  :128]    @ Up
15744+        add         r2,  r0,  r3,  lsl #2
15745+        lsl         r3,  #3
15746+
15747+        vst1.16    {q0 }, [r0  :128], r3
15748+        vst1.16    {q0 }, [r2  :128], r3
15749+        vst1.16    {q0 }, [r0  :128]
15750+        vst1.16    {q0 }, [r2  :128]
15751+
15752+        bx          lr
15753+endfunc
15754+
15755+
15756+@ ff_hevc_rpi_pred_vertical_c_8_neon_10
15757+@       uint8_t *_src,          [r0]
15758+@       const uint8_t *_top,    [r1]
15759+@       const uint8_t *_left,   [r2]
15760+@       ptrdiff_t stride)       [r3]
15761+
15762+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
15763+        vld1.16    {q0,  q1 }, [r1  :128]    @ Up
15764+        add         r2,  r0,  r3,  lsl #2
15765+        lsl         r3,  #3
15766+        mov         r1,  #4
15767+1:
15768+        vst1.16    {q0,  q1 }, [r0  :128], r3
15769+        subs        r1,  #1
15770+        vst1.16    {q0,  q1 }, [r2  :128], r3
15771+        bne         1b
15772+
15773+        bx          lr
15774+endfunc
15775+
15776+
15777+@ ff_hevc_rpi_pred_vertical_c_16_neon_10
15778+@       uint8_t *_src,          [r0]
15779+@       const uint8_t *_top,    [r1]
15780+@       const uint8_t *_left,   [r2]
15781+@       ptrdiff_t stride)       [r3]
15782+
15783+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
15784+        vldm        r1, { q0-q3 }    @ Up
15785+        lsl         r3, #2
15786+        mov         r1, #16
15787+        add         r2, r0, #32
15788+1:
15789+        vst1.16     {q0-q1}, [r0 :128], r3
15790+        subs        r1, #1
15791+        vst1.16     {q2-q3}, [r2 :128], r3
15792+        bne         1b
15793+
15794+        bx          lr
15795+endfunc
15796+
15797+@ ff_hevc_rpi_pred_horizontal_4_neon_10
15798+@       uint8_t *_src,          [r0]
15799+@       const uint8_t *_top,    [r1]
15800+@       const uint8_t *_left,   [r2]
15801+@       ptrdiff_t stride)       [r3]
15802+
15803+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
15804+        ldrh        ip, [r2, #-2]       @ Top-left
15805+        vld1.16     {d0}, [r1 :64]      @ Top
15806+        vmov.i16    d1, #0
15807+        vld1.16     {d2[]}, [r2]!
15808+T       lsl         r3, #1
15809+        vdup.16     d3, ip
15810+        vmov.i16    d4, #0x3ff
15811+        vhsub.u16   d0, d3
15812+A       add         ip, r0, r3, lsl #1
15813+T       add         ip, r0, r3
15814+        vld1.16     {d3[]}, [r2]!
15815+A       lsl         r3, #2
15816+T       lsl         r3, #1
15817+        vadd.i16    d0, d2
15818+        vld1.16     {d2[]}, [r2]!
15819+        vmax.s16    d0, d1
15820+        vld1.16     {d1[]}, [r2]
15821+        vmin.s16    d0, d4
15822+        vst1.16     {d0}, [r0 :64], r3
15823+        vst1.16     {d3}, [ip :64], r3
15824+        vst1.16     {d2}, [r0 :64]
15825+        vst1.16     {d1}, [ip :64]
15826+
15827+        bx          lr
15828+endfunc
15829+
15830+
15831+@ ff_hevc_rpi_pred_horizontal_8_neon_10
15832+@       uint8_t *_src,          [r0]
15833+@       const uint8_t *_top,    [r1]
15834+@       const uint8_t *_left,   [r2]
15835+@       ptrdiff_t stride)       [r3]
15836+
15837+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
15838+        ldrh        ip, [r2, #-2]       @ Top-left
15839+        vld1.16     {q0}, [r1 :128]     @ Top
15840+        lsl         r3, #1
15841+        vdup.16     q1, ip
15842+        mov         r1, #8-2
15843+        vhsub.u16   q0, q1
15844+        vld1.16     {d2[],d3[]}, [r2]!
15845+        vmov.i16    q2, #0
15846+        vadd.i16    q0, q1
15847+        vmov.i16    q1, #0x3ff
15848+        vmax.s16    q0, q2
15849+          vld1.16     {d4[],d5[]}, [r2]!
15850+        vmin.s16    q0, q1
15851+        vst1.16     {q0}, [r0 :128], r3
15852+1:
15853+            vld1.16     {d0[],d1[]}, [r2]!
15854+        subs        r1, #2
15855+          vst1.16     {q2}, [r0 :128], r3
15856+              vld1.16     {d4[],d5[]}, [r2]!
15857+            vst1.16     {q0}, [r0 :128], r3
15858+        bne         1b
15859+
15860+              vst1.16     {q2}, [r0 :128]
15861+        bx          lr
15862+endfunc
15863+
15864+
15865+@ ff_hevc_rpi_pred_horizontal_16_neon_10
15866+@       uint8_t *_src,          [r0]
15867+@       const uint8_t *_top,    [r1]
15868+@       const uint8_t *_left,   [r2]
15869+@       ptrdiff_t stride)       [r3]
15870+
15871+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
15872+        ldrh        ip, [r2, #-2]       @ Top-left
15873+        vld1.16     {q0-q1}, [r1 :128]  @ Top
15874+        lsl         r3, #1
15875+        vdup.16     q2, ip
15876+        add         ip, r0, r3
15877+        vhsub.u16   q0, q2
15878+        add         ip, #16
15879+        vhsub.u16   q1, q2
15880+        mov         r1, #16-2
15881+        vld1.16     {d4[],d5[]}, [r2]!
15882+        vmov.i16    q3, #0
15883+        vadd.u16    q0, q2
15884+        vadd.i16    q1, q2
15885+        vmov.i16    q2, #0x3ff
15886+        vmax.s16    q0, q3
15887+        vmax.s16    q1, q3
15888+          vld1.16     {d6[],d7[]}, [r2]!
15889+        vmin.s16    q0, q2
15890+        vmin.s16    q1, q2
15891+        vst1.16     {q0-q1}, [r0 :128], r3
15892+1:
15893+            vld1.16     {d0[],d1[]}, [r2]!
15894+        subs        r1, #2
15895+          vst1.16     {q3}, [r0 :128], r3
15896+          vst1.16     {q3}, [ip :128], r3
15897+              vld1.16     {d6[],d7[]}, [r2]!
15898+            vst1.16     {q0}, [r0 :128], r3
15899+            vst1.16     {q0}, [ip :128], r3
15900+        bne         1b
15901+
15902+              vst1.16     {q3}, [r0 :128]
15903+              vst1.16     {q3}, [ip :128]
15904+        bx          lr
15905+endfunc
15906+
15907+
15908+@ ff_hevc_rpi_pred_horizontal_32_neon_10
15909+@       uint8_t *_src,          [r0]
15910+@       const uint8_t *_top,    [r1]
15911+@       const uint8_t *_left,   [r2]
15912+@       ptrdiff_t stride)       [r3]
15913+@ Horizontal intra pred, 32x32, 10-bit: every row y is filled with left[y].
15914+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
15915+        vld1.16     {d0[],d1[]}, [r2]!  @ q0 = left[0] replicated
15916+        add         ip, r0, #16         @ ip writes the odd 16-byte chunks of a row
15917+        push        {lr}
15918+        mov         lr, #32             @ step from 1st to 3rd chunk of a 64-byte row
15919+          vld1.16     {d2[],d3[]}, [r2]!  @ q1 = left[1] replicated
15920+        lsl         r3, #1              @ stride in bytes (16-bit samples)
15921+        vst1.16     {q0}, [r0 :128], lr
15922+        sub         r3, #32             @ compensate for the +32 done by the lr step
15923+        vst1.16     {q0}, [ip :128], lr
15924+        mov         r1, #32-2           @ 2 rows handled outside the loop
15925+        vst1.16     {q0}, [r0 :128], r3
15926+        vst1.16     {q0}, [ip :128], r3
15927+1:
15928+            vld1.16     {d0[],d1[]}, [r2]!  @ left sample for a later row
15929+        subs        r1, #2              @ two rows per iteration
15930+          vst1.16     {q1}, [r0 :128], lr
15931+          vst1.16     {q1}, [ip :128], lr
15932+          vst1.16     {q1}, [r0 :128], r3
15933+          vst1.16     {q1}, [ip :128], r3
15934+              vld1.16     {d2[],d3[]}, [r2]!
15935+            vst1.16     {q0}, [r0 :128], lr
15936+            vst1.16     {q0}, [ip :128], lr
15937+            vst1.16     {q0}, [r0 :128], r3
15938+            vst1.16     {q0}, [ip :128], r3
15939+        bne         1b
15940+
15941+              vst1.16     {q1}, [r0 :128], lr  @ final row: no stride advance after
15942+              vst1.16     {q1}, [ip :128], lr
15943+              vst1.16     {q1}, [r0 :128]
15944+              vst1.16     {q1}, [ip :128]
15945+        pop         {pc}
15946+endfunc
15947+
15948+
15949+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
15950+@       uint8_t *_src,          [r0]
15951+@       const uint8_t *_top,    [r1]
15952+@       const uint8_t *_left,   [r2]
15953+@       ptrdiff_t stride)       [r3]
15954+@ Horizontal intra pred, chroma 4x4, 10-bit: row y = left[y] (CbCr pair).
15955+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
15956+        add         r1, r2, #4          @ r1 -> left[1] (4 bytes per CbCr pair)
15957+        vld1.32     {d0[],d1[]}, [r2]   @ q0 = left[0] pair replicated
15958+        add         r2, #8
15959+        vld1.32     {d2[],d3[]}, [r1]   @ q1 = left[1]
15960+        add         r1, #8
15961+        vld1.32     {d4[],d5[]}, [r2]   @ q2 = left[2]
15962+A       add         r2, r0, r3, lsl #2  @ r2 -> row 1
15963+T       lsl         r3, #2
15964+T       add         r2, r0, r3
15965+        vld1.32     {d6[],d7[]}, [r1]   @ q3 = left[3]
15966+A       lsl         r3, #3              @ r3 = 2 rows in bytes
15967+T       lsl         r3, #1
15968+        vst1.32     {q0}, [r0 :128], r3 @ rows 0+2 via r0, rows 1+3 via r2
15969+        vst1.32     {q1}, [r2 :128], r3
15970+        vst1.32     {q2}, [r0 :128]
15971+        vst1.32     {q3}, [r2 :128]
15972+
15973+        bx          lr
15974+endfunc
15975+
15976+
15977+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
15978+@       uint8_t *_src,          [r0]
15979+@       const uint8_t *_top,    [r1]
15980+@       const uint8_t *_left,   [r2]
15981+@       ptrdiff_t stride)       [r3]
15982+@ Horizontal intra pred, chroma 8x8, 10-bit: row y = left[y] (32 bytes per row).
15983+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
15984+        vld1.32     {d0[],d1[]}, [r2]!  @ q0 = left[0]
15985+        lsl         r3, #2              @ stride in bytes (4 bytes per CbCr pair)
15986+        add         ip, r0, #16         @ ip writes the 2nd half of each row
15987+        mov         r1, #8-2            @ 2 rows handled outside the loop
15988+          vld1.32     {d2[],d3[]}, [r2]!  @ q1 = left[1]
15989+        vst1.32     {q0}, [r0 :128], r3
15990+        vst1.32     {q0}, [ip :128], r3
15991+1:
15992+            vld1.32     {d0[],d1[]}, [r2]!
15993+        subs        r1, #2              @ two rows per iteration
15994+          vst1.32     {q1}, [r0 :128], r3
15995+          vst1.32     {q1}, [ip :128], r3
15996+              vld1.32     {d2[],d3[]}, [r2]!
15997+            vst1.32     {q0}, [r0 :128], r3
15998+            vst1.32     {q0}, [ip :128], r3
15999+        bne         1b
16000+
16001+              vst1.32     {q1}, [r0 :128]  @ last row, no post-increment
16002+              vst1.32     {q1}, [ip :128]
16003+        bx          lr
16004+endfunc
16005+
16006+
16007+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
16008+@       uint8_t *_src,          [r0]
16009+@       const uint8_t *_top,    [r1]
16010+@       const uint8_t *_left,   [r2]
16011+@       ptrdiff_t stride)       [r3]
16012+@ Horizontal intra pred, chroma 16x16, 10-bit: row y = left[y] (64 bytes per row).
16013+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
16014+        vld1.32     {d0[],d1[]}, [r2]!  @ q0 = left[0]
16015+        add         ip, r0, #16         @ ip writes the odd 16-byte chunks of a row
16016+        push        {lr}
16017+        mov         lr, #32             @ step from 1st to 3rd chunk of a row
16018+          vld1.32     {d2[],d3[]}, [r2]!  @ q1 = left[1]
16019+        lsl         r3, #2              @ stride in bytes
16020+        vst1.32     {q0}, [r0 :128], lr
16021+        sub         r3, #32             @ compensate for the +32 done by the lr step
16022+        vst1.32     {q0}, [ip :128], lr
16023+        mov         r1, #16-2           @ 2 rows handled outside the loop
16024+        vst1.32     {q0}, [r0 :128], r3
16025+        vst1.32     {q0}, [ip :128], r3
16026+1:
16027+            vld1.32     {d0[],d1[]}, [r2]!
16028+        subs        r1, #2              @ two rows per iteration
16029+          vst1.32     {q1}, [r0 :128], lr
16030+          vst1.32     {q1}, [ip :128], lr
16031+          vst1.32     {q1}, [r0 :128], r3
16032+          vst1.32     {q1}, [ip :128], r3
16033+              vld1.32     {d2[],d3[]}, [r2]!
16034+            vst1.32     {q0}, [r0 :128], lr
16035+            vst1.32     {q0}, [ip :128], lr
16036+            vst1.32     {q0}, [r0 :128], r3
16037+            vst1.32     {q0}, [ip :128], r3
16038+        bne         1b
16039+
16040+              vst1.32     {q1}, [r0 :128], lr  @ final row
16041+              vst1.32     {q1}, [ip :128], lr
16042+              vst1.32     {q1}, [r0 :128]
16043+              vst1.32     {q1}, [ip :128]
16044+        pop         {pc}
16045+endfunc
16046+
16047+
16048+
16049--- /dev/null
16050+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
16051@@ -0,0 +1,1043 @@
16052+/*
16053+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
16054+All rights reserved.
16055+
16056+Redistribution and use in source and binary forms, with or without
16057+modification, are permitted provided that the following conditions are met:
16058+    * Redistributions of source code must retain the above copyright
16059+      notice, this list of conditions and the following disclaimer.
16060+    * Redistributions in binary form must reproduce the above copyright
16061+      notice, this list of conditions and the following disclaimer in the
16062+      documentation and/or other materials provided with the distribution.
16063+    * Neither the name of the copyright holder nor the
16064+      names of its contributors may be used to endorse or promote products
16065+      derived from this software without specific prior written permission.
16066+
16067+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16068+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16069+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16070+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
16071+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
16072+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
16073+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
16074+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
16075+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
16076+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
16077+
16078+Authors: John Cox, Ben Avison
16079+*/
16080+
16081+#include "libavutil/arm/asm.S"
16082+#include "neon.S"
16083+
16084+@ Planar intra pred (8.4.4.2.4)
16085+@
16086+@ predSamples[ x ][ y ] =
16087+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
16088+@   ( x + 1 ) * p[ nTbS ][ -1 ] +
16089+@   ( nTbS - 1 - y ) * p[ x ][ -1 ] +
16090+@   ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
16091+
16092+@ All 10-bit functions would work with 9
16093+
16094+
16095+@ ff_hevc_rpi_pred_planar_4_neon_8
16096+@       uint8_t *_src,          [r0]
16097+@       const uint8_t *_top,    [r1]
16098+@       const uint8_t *_left,   [r2]
16099+@       ptrdiff_t stride)       [r3]
16100+
16101+function ff_hevc_rpi_pred_planar_4_neon_8, export=1
16102+
16103+        vld1.8      {d0}, [r1]          @ Top
16104+        adr         ip, nb_3_0_1_4      @ constant tables (see data section)
16105+        vld1.8      {d1}, [r2]          @ Left
16106+        vmov.i64    d2, #0xffffffff     @ select mask: low 4 bytes set
16107+        vldr        d3, [ip, #8]        @ {1,2,3,4,1,2,3,4}
16108+        add         r1, r0, r3
16109+        vdup.32     d4, d0[0]           @ {t0,t1,t2,t3,t0,t1,t2,t3}
16110+        vdup.8      d0, d0[4]           @ {t4,t4,t4,t4,t4,t4,t4,t4}
16111+        vdup.8      d5, d1[4]           @ {l4,l4,l4,l4,l4,l4,l4,l4}
16112+        vdup.8      d6, d1[0]           @ {l0,l0,l0,l0,l0,l0,l0,l0}
16113+        vshll.u8    q8, d4, #2          @ acc = 4*top[x]
16114+        lsl         r3, #1              @ r0/r1 each step 2 rows
16115+        vsubl.u8    q2, d5, d4          @ add = l4 - top[x]
16116+        vmlal.u8    q8, d0, d3          @ acc += (x+1)*t4
16117+        vld1.8      {d0}, [ip]          @ {3,2,1,0,3,2,1,0}
16118+        vdup.8      d7, d1[1]           @ {l1,l1,l1,l1,l1,l1,l1,l1}
16119+        vshl.s16    q9, q2, #1          @ 2*add
16120+        vbif        d6, d7, d2          @ {l0,l0,l0,l0,l1,l1,l1,l1}
16121+        vadd.i16    d16, d4             @ row 0: acc + add
16122+        vdup.8      d7, d1[2]           @ {l2,l2,l2,l2,l2,l2,l2,l2}
16123+        vadd.i16    d17, d18            @ row 1: acc + 2*add
16124+        vdup.8      d1, d1[3]           @ {l3,l3,l3,l3,l3,l3,l3,l3}
16125+        vadd.i16    q2, q8, q9          @ rows 2/3: rows 0/1 + 2*add
16126+        vmlal.u8    q8, d0, d6          @ rows 0/1 += (3-x)*left[y]
16127+        vbif        d7, d1, d2          @ {l2,l2,l2,l2,l3,l3,l3,l3}
16128+        vmlal.u8    q2, d0, d7          @ rows 2/3 += (3-x)*left[y]
16129+        vrshrn.i16  d0, q8, #3          @ (sum + 4) >> 3
16130+        vst1.32     d0[0], [r0 :32], r3
16131+        vst1.32     d0[1], [r1 :32], r3
16132+        vrshrn.i16  d0, q2, #3
16133+        vst1.32     d0[0], [r0 :32]
16134+        vst1.32     d0[1], [r1 :32]
16135+
16136+        bx          lr
16137+endfunc
16138+
16139+
16140+@ ff_hevc_rpi_pred_planar_4_neon_10
16141+@       uint8_t *_src,          [r0]
16142+@       const uint8_t *_top,    [r1]
16143+@       const uint8_t *_left,   [r2]
16144+@       ptrdiff_t stride)       [r3]
16145+
16146+function ff_hevc_rpi_pred_planar_4_neon_10, export=1
16147+        @ Load from bytes & expand later - at the very least this uses less
16148+        @ memory than having a short table
16149+        vld1.16     {q0}, [r1 :64]      @ Top
16150+        adr         ip, nbh_3_0_1_4
16151+        vldr        d2, [r2, #8]        @ Left (lower)
16152+        vldr        d3, [ip, #8]        @ {1,2,3,4}
16153+T       lsl         r3, #1
16154+        vshl.s16    d4, d0, #2          @ 4*top[x]
16155+        vdup.16     d1, d1[0]           @ {t4,t4,t4,t4}
16156+        vldr        d5, [r2]            @ Left (upper)
16157+        vdup.16     d2, d2[0]           @ {l4,l4,l4,l4}
16158+        vldr        d6, [ip]            @ {3,2,1,0}
16159+        vmla.i16    d4, d3, d1          @ Acc set up: 4*top[x] + (x+1)*t4
16160+        vsub.i16    d0, d2, d0          @ Add set up: l4 - top[x]
16161+        vmov        d7, d6              @ q3 = {3,2,1,0} in both halves
16162+        vdup.16     d2, d5[0]           @ l0
16163+        vdup.16     d3, d5[1]           @ l1
16164+        vdup.16     d16, d5[2]          @ l2
16165+        vadd.i16    d18, d0, d4         @ row 0: acc + add
16166+        vshl.s16    d0, #1              @ x2
16167+        vadd.i16    d19, d0, d4         @ row 1: acc + 2*add
16168+        vdup.16     d17, d5[3]          @ l3
16169+        vadd.i16    d4, d0, d18         @ row 2: row 0 + 2*add
16170+A       add         r1, r0, r3, lsl #1
16171+T       add         r1, r0, r3
16172+        vadd.i16    d5, d0, d19         @ row 3: row 1 + 2*add
16173+A       lsl         r3, #2
16174+T       lsl         r3, #1
16175+        vmla.i16    q9, q1, q3          @ rows 0/1 += (3-x)*left[y]
16176+        vmla.i16    q2, q8, q3          @ rows 2/3 += (3-x)*left[y]
16177+        vrshr.u16   q0, q9, #3          @ (sum + 4) >> 3
16178+        vst1.16     {d0}, [r0], r3
16179+        vrshr.u16   d2, d4, #3
16180+        vst1.16     {d1}, [r1], r3
16181+        vrshr.u16   d3, d5, #3
16182+        vst1.16     {d2}, [r0]
16183+        vst1.16     {d3}, [r1]
16184+
16185+        bx         lr
16186+endfunc
16187+
16188+
16189+@ ff_hevc_rpi_pred_planar_8_neon_8
16190+@       uint8_t *_src,          [r0]
16191+@       const uint8_t *_top,    [r1]
16192+@       const uint8_t *_left,   [r2]
16193+@       ptrdiff_t stride)       [r3]
16194+
16195+function ff_hevc_rpi_pred_planar_8_neon_8, export=1
16196+
16197+        vld1.8      {q0}, [r1]          @ Top
16198+        adr         ip, nb_7_0_1_8
16199+        vldr        d2, [r2, #8]        @ Left (lower)
16200+        mov         r1, #8              @ row count
16201+        vldr        d3, [ip, #8]        @ {1,2,3,4,5,6,7,8}
16202+        vshll.u8    q2, d0, #3          @ acc = 8*top[x]
16203+        vdup.8      d1, d1[0]           @ {t8,t8,t8,t8,t8,t8,t8,t8}
16204+        vdup.8      d2, d2[0]           @ {l8,l8,l8,l8,l8,l8,l8,l8}
16205+        vldr        d6, [r2]            @ Left (upper)
16206+        vmlal.u8    q2, d3, d1          @ acc += (x+1)*t8
16207+        vsubl.u8    q0, d2, d0          @ add = l8 - top[x]
16208+        vldr        d7, [ip]            @ {7,6,5,4,3,2,1,0}
16209+
16210+@ u8   7..0    [1]  d7
16211+@ u8  left[y]  [1]  d6
16212+@ u16 acc      [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
16213+@ u16 add      [2]  q0 = p[-1][nTbs] - p[x][-1]
16214+
16215+        vdup.8      d2, d6[0]           @ left[0]
16216+        vadd.i16    q2, q0              @ row 0: acc + add
16217+        vdup.8      d3, d6[1]           @ left[1]
16218+        vadd.i16    q8, q2, q0          @ row 1: acc + 2*add
16219+1:
16220+        vmlal.u8    q2, d7, d2          @ even row += (7-x)*left[y]
16221+        subs        r1, #2              @ two rows per iteration
16222+        vadd.i16    q9, q8, q0
16223+        vmlal.u8    q8, d7, d3          @ odd row += (7-x)*left[y]
16224+        vdup.8      d2, d6[2]
16225+        vdup.8      d3, d6[3]
16226+        vrshrn.i16  d20, q2, #4         @ (sum + 8) >> 4
16227+        vshr.u64    d6, #16             @ shift the 2 consumed left samples out
16228+        vmov        q2, q9
16229+        vst1.8      {d20}, [r0], r3
16230+        vrshrn.i16  d20, q8, #4
16231+        vadd.i16    q8, q2, q0
16232+        vst1.8      {d20}, [r0], r3
16233+        bne         1b
16234+
16235+        bx          lr
16236+
16237+endfunc
16238+
16239+
16240+@ ff_hevc_rpi_pred_planar_8_neon_10
16241+@       uint8_t *_src,          [r0]
16242+@       const uint8_t *_top,    [r1]
16243+@       const uint8_t *_left,   [r2]
16244+@       ptrdiff_t stride)       [r3]
16245+
16246+function ff_hevc_rpi_pred_planar_8_neon_10, export=1
16247+
16248+        adr         ip, nb_7_0_1_8
16249+        vld1.16     {q0}, [r1 :128]!    @ Top (left)
16250+        lsl         r3, #1              @ stride in bytes
16251+        vld1.16     {q1}, [ip :128]     @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
16252+        add         ip, r2, #16
16253+        vld1.16     {d4[],d5[]}, [r1]   @ Top (right)
16254+        mov         r1, #8-2            @ 2 rows handled outside the loop
16255+        vshl.s16    q3, q0, #3          @ 8*top[x]
16256+        vmovl.u8    q8, d3              @ {1,2,3,4,5,6,7,8}
16257+        vld1.16     {d18[],d19[]}, [ip] @ Left (lower)
16258+        vmla.i16    q3, q8, q2          @ Acc set up
16259+        vsub.i16    q0, q9, q0          @ Add set up
16260+        vmovl.u8    q1, d2              @ {7,6,5,4,3,2,1,0}
16261+        vadd.i16    q2, q3, q0          @ row 0: acc + add
16262+
16263+@ u16  7..0        [1]  q1
16264+@ u32 left[y]      [1]  [r2]
16265+@ u16 acc          [1]  q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
16266+@ u16 add          [1]  q0 = p[-1][nTbs] - p[x][-1]
16267+
16268+        vld1.16     {d6[],d7[]}, [r2]!  @ left[0]
16269+        vadd.i16    q8, q2, q0          @ row 1 acc
16270+        vld1.16     {d18[],d19[]}, [r2]!  @ left[1]
16271+        vmla.i16    q2, q1, q3          @ row 0 += (7-x)*left[0]
16272+        vadd.i16    q3, q8, q0          @ next row's acc
16273+        vmla.i16    q8, q1, q9          @ row 1 += (7-x)*left[1]
16274+1:
16275+        vrshr.u16   q9, q2, #4          @ (sum + 8) >> 4
16276+        subs        r1, #2              @ two rows per iteration
16277+        vmov        q2, q3
16278+        vrshr.u16   q10, q8, #4
16279+          vld1.16     {d6[],d7[]}, [r2]!
16280+        vst1.16     {q9}, [r0 :128], r3
16281+          vadd.i16    q8, q2, q0
16282+          vld1.16     {d18[],d19[]}, [r2]!
16283+          vmla.i16    q2, q1, q3
16284+          vadd.i16    q3, q8, q0
16285+          vmla.i16    q8, q1, q9
16286+        vst1.16     {q10}, [r0 :128], r3
16287+        bne         1b
16288+
16289+        vrshr.u16   q9, q2, #4          @ final two rows
16290+        add         r3, r0
16291+        vrshr.u16   q10, q8, #4
16292+        vst1.16     {q9}, [r0 :128]
16293+        vst1.16     {q10}, [r3 :128]
16294+
16295+        bx         lr
16296+endfunc
16297+
16298+
16299+@------------------------------------------------------------------------------
16300+@
16301+@ Data - has to be in two lumps to ensure we can always reach using adr
16302+
16303+        .balign 64
16304+
16305+nb_31_0_1_32:   @ {31,30,...,0} then {1,2,...,32} (runs on into the table below)
16306+        .byte   31, 30, 29, 28, 27, 26, 25, 24
16307+        .byte   23, 22, 21, 20, 19, 18, 17, 16
16308+nb_15_0_1_16:   @ {15,14,...,0} then {1,2,...,16}
16309+        .byte   15, 14, 13, 12, 11, 10,  9,  8
16310+        .byte    7,  6,  5,  4,  3,  2,  1,  0
16311+        .byte    1,  2,  3,  4,  5,  6,  7,  8
16312+        .byte    9, 10, 11, 12, 13, 14, 15, 16
16313+        .byte   17, 18, 19, 20, 21, 22, 23, 24
16314+        .byte   25, 26, 27, 28, 29, 30, 31, 32
16315+
16316+        @ should be back on a 64-byte boundary here
16317+
16318+        @ These could be extracted from the above array, but separated
16319+        @ out for better (16 byte) alignment
16320+nb_3_0_1_4:     @ {3,2,1,0} twice then {1,2,3,4} twice
16321+        .byte    3,  2,  1,  0,  3,  2,  1,  0
16322+        .byte    1,  2,  3,  4,  1,  2,  3,  4
16323+nb_7_0_1_8:     @ {7,...,0} then {1,...,8}
16324+        .byte    7,  6,  5,  4,  3,  2,  1,  0
16325+        .byte    1,  2,  3,  4,  5,  6,  7,  8
16326+nbh_3_0_1_4:    @ halfword variant: {3,2,1,0} then {1,2,3,4}
16327+        .short   3,  2,  1,  0,  1,  2,  3,  4
16328+
16329+@------------------------------------------------------------------------------
16330+
16331+
16332+@ ff_hevc_rpi_pred_planar_16_neon_8
16333+@       uint8_t *_src,          [r0]
16334+@       const uint8_t *_top,    [r1]
16335+@       const uint8_t *_left,   [r2]
16336+@       ptrdiff_t stride)       [r3]
16337+
16338+function ff_hevc_rpi_pred_planar_16_neon_8, export=1
16339+
16340+        adr         ip, nb_15_0_1_16 + 16
16341+        vld1.8      {q0}, [r1 :128]!    @ Top (left)
16342+        add         r2, #16
16343+        vld1.8      {q1}, [ip: 128]     @ {1,2,3...16}
16344+        vld1.8      {d4[]}, [r1]        @ Top (right)
16345+        sub         ip, #16
16346+        vshll.u8    q3, d0, #4          @ 16*top[x] (left half)
16347+        mov         r1, #16             @ row count
16348+        vshll.u8    q8, d1, #4          @ 16*top[x] (right half)
16349+        vld1.8      {d5[]}, [r2]        @ Left (lower)
16350+        sub         r2, #16
16351+        vmlal.u8    q3, d2, d4          @ acc += (x+1)*t16
16352+        vmlal.u8    q8, d3, d4          @ Acc set up
16353+        vsubl.u8    q1, d5, d0
16354+        vsubl.u8    q0, d5, d1          @ Add set up
16355+        vld1.8      {q2}, [ip :128]     @ {15,14,13...0}
16356+
16357+@ u8  15..0    [1]  q2
16358+@ u8  left[y]  [1]  [r2]
16359+@ u16 acc      [2]  q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
16360+@ u16 add      [2]  q1,q0 = p[-1][nTbs] - p[x][-1]
16361+
16362+        vadd.i16    q3, q1              @ row 0: acc + add
16363+        vadd.i16    q8, q0
16364+1:
16365+        vadd.i16    q10, q3, q1         @ next row's acc
16366+        subs        r1, #2              @ two rows per iteration
16367+        vld1.8      {d18[]}, [r2]!      @ left[y]
16368+        vadd.i16    q11, q8, q0
16369+        vld1.8      {d19[]}, [r2]!      @ left[y+1]
16370+        vmlal.u8    q3, d4, d18         @ row y += (15-x)*left[y]
16371+        vmlal.u8    q8, d5, d18
16372+        vadd.i16    q12, q10, q1
16373+        vmlal.u8    q10, d4, d19        @ row y+1 += (15-x)*left[y+1]
16374+        vadd.i16    q13, q11, q0
16375+        vmlal.u8    q11, d5, d19
16376+        vrshrn.u16  d18, q3, #5         @ (sum + 16) >> 5
16377+        vrshrn.u16  d19, q8, #5
16378+        vmov        q3, q12
16379+        vst1.8      {q9}, [r0 :128], r3
16380+        vrshrn.u16  d18, q10, #5
16381+        vrshrn.u16  d19, q11, #5
16382+        vmov        q8, q13
16383+        vst1.8      {q9}, [r0 :128], r3
16384+        bne         1b
16385+
16386+        bx          lr
16387+
16388+endfunc
16389+
16390+
16391+@ ff_hevc_rpi_pred_planar_16_neon_10
16392+@       uint8_t *_src,          [r0]
16393+@       const uint8_t *_top,    [r1]
16394+@       const uint8_t *_left,   [r2]
16395+@       ptrdiff_t stride)       [r3]
16396+
16397+function ff_hevc_rpi_pred_planar_16_neon_10, export=1
16398+
16399+        @ Load from bytes & expand later - at the very least this uses less
16400+        @ memory than having a short table
16401+        adr         ip, nb_15_0_1_16 + 16
16402+        vld1.16     {q0-q1}, [r1 :128]! @ Top (left)
16403+        add         r2, #32
16404+        vld1.8      {q2}, [ip :128]     @ {1,2,3...16}
16405+        lsl         r3, #1              @ stride in bytes
16406+        vld1.16     {d6[],d7[]}, [r1]   @ Top (right)
16407+        sub         ip, #16
16408+        vmovl.u8    q8, d4              @ {1,...,8}
16409+        mov         r1, #16             @ row count
16410+        vshl.i16    q9, q0, #4          @ 16*top[x] (left half)
16411+        vmovl.u8    q2, d5              @ {9,...,16}
16412+        vshl.i16    q10, q1, #4         @ 16*top[x] (right half)
16413+        vld1.16     {d22[],d23[]}, [r2] @ Left (lower)
16414+        sub         r2, #32
16415+        vld1.8      {q12}, [ip]         @ {15,14,13...0}
16416+        vmla.i16    q9, q8, q3          @ acc += (x+1)*t16
16417+        vmla.i16    q10, q2, q3         @ Acc set up
16418+        vsub.i16    q0, q11, q0
16419+        vsub.i16    q1, q11, q1         @ Add set up
16420+        vadd.i16    q2, q9, q0          @ row 0: acc + add
16421+        vadd.i16    q3, q10, q1
16422+        vmovl.u8    q8, d24             @ {15,...,8}
16423+        vmovl.u8    q9, d25             @ {7,...,0}
16424+
16425+@ u16  15..0       [2]  q8,q9
16426+@ u32 left[y]      [2]  [r2]
16427+@ u16 acc          [2]  q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
16428+@ u16 add          [2]  q0,q1 = p[-1][nTbs] - p[x][-1]
16429+
16430+1:
16431+        vadd.i16    q10, q2, q0         @ next row's acc
16432+        subs        r1, #2              @ two rows per iteration
16433+        vld1.16     {d24[],d25[]}, [r2]!  @ left[y]
16434+        vadd.i16    q11, q3, q1
16435+        vld1.16     {d28[],d29[]}, [r2]!  @ left[y+1]
16436+        vmla.i16    q2, q8, q12         @ row y += (15-x)*left[y]
16437+        vmla.i16    q3, q9, q12
16438+        vadd.i16    q12, q10, q0
16439+        vmla.i16    q10, q8, q14        @ row y+1 += (15-x)*left[y+1]
16440+        vadd.i16    q13, q11, q1
16441+        vmla.i16    q11, q9, q14
16442+        vrshr.u16   q14, q2, #5         @ (sum + 16) >> 5
16443+        vrshr.u16   q15, q3, #5
16444+        vmov        q2, q12
16445+        vst1.16     {q14-q15}, [r0 :128], r3
16446+        vrshr.u16   q14, q10, #5
16447+        vrshr.u16   q15, q11, #5
16448+        vmov        q3, q13
16449+        vst1.16     {q14-q15}, [r0 :128], r3
16450+        bne         1b
16451+
16452+        bx         lr
16453+endfunc
16454+
16455+
16456+@ ff_hevc_rpi_pred_planar_32_neon_8
16457+@       uint8_t *_src,          [r0]
16458+@       const uint8_t *_top,    [r1]
16459+@       const uint8_t *_left,   [r2]
16460+@       ptrdiff_t stride)       [r3]
16461+
16462+function ff_hevc_rpi_pred_planar_32_neon_8, export=1
16463+
16464+        vld1.8      {q0-q1}, [r1 :128]! @ Top (left)
16465+        adr         ip, nb_31_0_1_32 + 32
16466+        vpush       {d8-d12}            @ d8-d12 are callee-saved
16467+        vld1.8      {q2-q3}, [ip :128]  @ {1,2,3...32}
16468+        add         r2, #32
16469+        vld1.8      {d8[]}, [r1]        @ Top (right)
16470+        sub         ip, #32
16471+        vshll.u8    q8, d0, #5          @ 32*top[x]
16472+        mov         r1, #32             @ row count
16473+        vld1.8      {d9[]}, [r2]        @ Left (lower)
16474+        sub         r2, #32
16475+        vshll.u8    q9, d1, #5
16476+        vshll.u8    q10, d2, #5
16477+        vshll.u8    q11, d3, #5
16478+        vmlal.u8    q8, d4, d8          @ acc += (x+1)*t32
16479+        vsubl.u8    q12, d9, d0
16480+        vmlal.u8    q9, d5, d8
16481+        vsubl.u8    q13, d9, d1
16482+        vmlal.u8    q10, d6, d8
16483+        vsubl.u8    q14, d9, d2
16484+        vmlal.u8    q11, d7, d8         @ Acc set up
16485+        vsubl.u8    q15, d9, d3         @ Add set up
16486+        vadd.i16    q8, q12             @ row 0: acc + add
16487+        vadd.i16    q9, q13
16488+        vadd.i16    q10, q14
16489+        vadd.i16    q11, q15
16490+        vld1.8      {q4-q5}, [ip :128]  @ {31,30,29...0}
16491+
16492+@ u8  31..0    [2]  q4,q5
16493+@ u8  left[y]  [2]  [r2]
16494+@ u16 acc      [4]  q8-q11  = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
16495+@ u16 add      [4]  q12-q15 = p[-1][nTbs] - p[x][-1]
16496+
16497+        vld1.8      {d12[]}, [r2]!      @ left[0]
16498+        vadd.i16    q0, q8, q12         @ next row's acc (q0-q3 shadow q8-q11)
16499+        b           2f                  @ enter the loop at the second stage
16500+1:
16501+          vld1.8      {d12[]}, [r2]!
16502+        vrshrn.u16  d3, q1, #6          @ (sum + 32) >> 6
16503+        vrshrn.u16  d2, q0, #6
16504+          vadd.i16    q0, q8, q12
16505+        vrshrn.u16  d4, q2, #6
16506+        vrshrn.u16  d5, q3, #6
16507+        vst1.8      {q1-q2}, [r0 :128], r3
16508+2:        vadd.i16    q1, q9, q13
16509+          subs        r1, #2            @ two rows per loop trip
16510+          vadd.i16    q2, q10, q14
16511+          vadd.i16    q3, q11, q15
16512+          vmlal.u8    q8, d8, d12       @ row += (31-x)*left[y]
16513+          vmlal.u8    q9, d9, d12
16514+          vmlal.u8    q10, d10, d12
16515+          vmlal.u8    q11, d11, d12
16516+            vld1.8      {d12[]}, [r2]!
16517+          vrshrn.u16  d19, q9, #6
16518+          vrshrn.u16  d18, q8, #6
16519+            vadd.i16    q8, q0, q12
16520+          vrshrn.u16  d20, q10, #6
16521+          vrshrn.u16  d21, q11, #6
16522+          vst1.8      {q9-q10}, [r0 :128], r3
16523+            vadd.i16    q9, q1, q13
16524+            vadd.i16    q10, q2, q14
16525+            vadd.i16    q11, q3, q15
16526+            vmlal.u8    q0, d8, d12
16527+            vmlal.u8    q1, d9, d12
16528+            vmlal.u8    q2, d10, d12
16529+            vmlal.u8    q3, d11, d12
16530+
16531+        bne         1b
16532+
16533+        vpop        {d8-d12}            @ restore before the final stores
16534+
16535+        vrshrn.u16  d3, q1, #6          @ final row
16536+        vrshrn.u16  d2, q0, #6
16537+        vrshrn.u16  d4, q2, #6
16538+        vrshrn.u16  d5, q3, #6
16539+        vst1.8      {q1-q2}, [r0 :128]
16540+
16541+        bx          lr
16542+
16543+endfunc
16544+
16545+
16546+@ ff_hevc_rpi_pred_planar_32_neon_10
16547+@       uint8_t *_src,          [r0]
16548+@       const uint8_t *_top,    [r1]
16549+@       const uint8_t *_left,   [r2]
16550+@       ptrdiff_t stride)       [r3]
16551+
16552+function ff_hevc_rpi_pred_planar_32_neon_10, export=1
16553+
16554+        @ Load from bytes & expand later - at the very least this uses less
16555+        @ memory than having a short table
16556+        vld1.16     {q0-q1}, [r1 :128]!  @ Top (left)
16557+        adr         ip, nb_31_0_1_32 + 32
16558+        vpush       {q4-q7}              @ q4-q7 are callee-saved
16559+        vld1.16     {q2-q3}, [r1 :128]!  @ Top (centre)
16560+        add         r2, #64
16561+        vld1.8      {q14-q15}, [ip :128] @ {1,2,3...32}
16562+T       lsl         r3, #1
16563+        vld1.16     {d8[],d9[]}, [r1]    @ Top (right)
16564+        sub         ip, #32
16565+        vmovl.u8    q12, d28             @ {1,...,8}
16566+        mov         r1, #32              @ row count
16567+        vmovl.u8    q13, d29             @ {9,...,16}
16568+        vld1.8      {q6-q7}, [ip :128]   @ {31,30,29...0}
16569+        vmovl.u8    q14, d30             @ {17,...,24}
16570+        vmovl.u8    q15, d31             @ {25,...,32}
16571+        vld1.16     {d10[],d11[]}, [r2]  @ Left (lower)
16572+        sub         r2, #64
16573+        vshl.i16    q8, q0, #5           @ 32*top[x]
16574+        vshl.i16    q9, q1, #5
16575+        vshl.i16    q10, q2, #5
16576+        vshl.i16    q11, q3, #5
16577+        vmla.i16    q8, q12, q4          @ acc += (x+1)*t32
16578+        vsub.i16    q0, q5, q0
16579+        vmla.i16    q9, q13, q4
16580+        vsub.i16    q1, q5, q1
16581+        vmla.i16    q10, q14, q4
16582+        vmov.u16    ip, d0[0]            @ save add[0]: d0[0] is clobbered in the loop
16583+        vsub.i16    q2, q5, q2
16584+        vmla.i16    q11, q15, q4         @ Acc set up
16585+        vsub.i16    q3, q5, q3           @ Add set up
16586+        vadd.i16    q8, q0               @ row 0: acc + add
16587+        vadd.i16    q9, q1
16588+        vadd.i16    q10, q2
16589+        vadd.i16    q11, q3
16590+        vmovl.u8    q4, d12              @ {31,...,24}
16591+        vmovl.u8    q5, d13              @ {23,...,16}
16592+        vmovl.u8    q6, d14              @ {15,...,8}
16593+        vmovl.u8    q7, d15              @ {7,...,0}
16594+
16595+@ u16 31..0    [4]  q4-q7
16596+@ u16 left[y]  [4]  [r2]
16597+@ u16 acc      [4]  q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
16598+@ u16 add      [4]  q0-q3  = p[-1][nTbs] - p[x][-1]
16599+
16600+        vadd.i16    q12, q8, q0          @ row 1 acc
16601+A       sub         r0, r0, r3, lsl #1   @ pre-bias r0: the loop adds stride first
16602+T       sub         r0, r3
16603+1:
16604+        vld1.16     {d0[0]}, [r2]!       @ left[y] (overwrites the add[0] lane)
16605+A       add         r0, r0, r3, lsl #1
16606+T       add         r0, r3
16607+        vadd.i16    q13, q9, q1
16608+        subs        r1, #2               @ two rows per iteration
16609+        vadd.i16    q14, q10, q2
16610+        vadd.i16    q15, q11, q3
16611+        vmla.i16    q8, q4, d0[0]        @ row y += (31-x)*left[y]
16612+        vmla.i16    q9, q5, d0[0]
16613+        vmla.i16    q10, q6, d0[0]
16614+        vmla.i16    q11, q7, d0[0]
16615+        vmov.16     d0[0], ip            @ restore add[0]
16616+        vrshr.u16   q8, #6               @ (sum + 32) >> 6
16617+        vrshr.u16   q9, #6
16618+        vrshr.u16   q10, #6
16619+        vrshr.u16   q11, #6
16620+        vstm        r0, {q8-q11}         @ whole 64-byte row in one store
16621+        vadd.i16    q8, q12, q0
16622+A       add         r0, r0, r3, lsl #1
16623+T       add         r0, r3
16624+        vld1.16     {d0[0]}, [r2]!       @ left[y+1]
16625+        vadd.i16    q9, q13, q1
16626+        vadd.i16    q10, q14, q2
16627+        vadd.i16    q11, q15, q3
16628+        vmla.i16    q12, q4, d0[0]       @ row y+1 += (31-x)*left[y+1]
16629+        vmla.i16    q13, q5, d0[0]
16630+        vmla.i16    q14, q6, d0[0]
16631+        vmla.i16    q15, q7, d0[0]
16632+        vmov.16     d0[0], ip            @ restore add[0]
16633+        vrshr.u16   q12, #6
16634+        vrshr.u16   q13, #6
16635+        vrshr.u16   q14, #6
16636+        vrshr.u16   q15, #6
16637+        vstm        r0, {q12-q15}
16638+        vadd.i16    q12, q8, q0
16639+        bne         1b
16640+
16641+        vpop        {q4-q7}
16642+        bx          lr
16643+
16644+endfunc
16645+
16646+
16647+@ ff_hevc_rpi_pred_planar_c_4_neon_8
16648+@       uint8_t *_src,          [r0]
16649+@       const uint8_t *_top,    [r1]
16650+@       const uint8_t *_left,   [r2]
16651+@       ptrdiff_t stride)       [r3]
16652+
16653+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
16654+
16655+        vld1.8      {q0}, [r1]          @ Top
16656+        adr         ip, nbx2_3_0_1_4
16657+        vldr        d2, [r2, #8]        @ Left (lower)
16658+        mov         r1, #4              @ row count
16659+        vldr        d3, [ip, #8]        @ {1,1,2,2,3,3,4,4}
16660+        lsl         r3, #1              @ stride in bytes (CbCr interleaved)
16661+        vshll.u8    q2, d0, #2          @ acc = 4*top[x]
16662+        vdup.16     d1, d1[0]           @ {t4,t4,t4,t4,t4,t4,t4,t4}
16663+        vdup.16     d2, d2[0]           @ {l4,l4,l4,l4,l4,l4,l4,l4}
16664+        vldr        d6, [r2]            @ Left (upper)
16665+        vmlal.u8    q2, d3, d1          @ acc += (x+1)*t4
16666+        vsubl.u8    q0, d2, d0          @ add = l4 - top[x]
16667+        vldr        d7, [ip]            @ {3,3,2,2,1,1,0,0}
16668+
16669+@ u8   3..0    [1]  d7
16670+@ u8  left[y]  [1]  d6
16671+@ u16 acc      [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
16672+@ u16 add      [2]  q0 = p[-1][nTbs] - p[x][-1]
16673+
16674+        vdup.16     d2, d6[0]           @ left[0] CbCr pair
16675+        vadd.i16    q2, q0              @ row 0: acc + add
16676+        vdup.16     d3, d6[1]           @ left[1]
16677+        vadd.i16    q8, q2, q0          @ row 1: acc + 2*add
16678+1:
16679+        vmlal.u8    q2, d7, d2          @ even row += (3-x)*left[y]
16680+        subs        r1, #2              @ two rows per iteration
16681+        vadd.i16    q9, q8, q0
16682+        vmlal.u8    q8, d7, d3          @ odd row += (3-x)*left[y]
16683+        vdup.16     d2, d6[2]
16684+        vdup.16     d3, d6[3]
16685+        vrshrn.i16  d20, q2, #3         @ (sum + 4) >> 3
16686+        vmov        q2, q9
16687+        vst1.8      {d20}, [r0], r3
16688+        vrshrn.i16  d20, q8, #3
16689+        vadd.i16    q8, q2, q0
16690+        vst1.8      {d20}, [r0], r3
16691+        bne         1b
16692+
16693+        bx          lr
16694+
16695+endfunc
16696+
16697+
16698+@ ff_hevc_rpi_pred_planar_c_4_neon_10
16699+@       uint8_t *_src,          [r0]
16700+@       const uint8_t *_top,    [r1]
16701+@       const uint8_t *_left,   [r2]
16702+@       ptrdiff_t stride)       [r3]
16703+
16704+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
16705+
16706+        adr         ip, nbx2_3_0_1_4
16707+        vld1.16     {q0}, [r1 :128]!    @ Top (left)
16708+        lsl         r3, #2              @ stride in bytes (4 bytes per CbCr pair)
16709+        vld1.16     {q1}, [ip :128]     @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
16710+        add         ip, r2, #16
16711+        vld1.32     {d4[],d5[]}, [r1]   @ Top (right)
16712+        vshl.s16    q3, q0, #2          @ 4*top[x]
16713+        vmovl.u8    q8, d3              @ {1,1,2,2,3,3,4,4}
16714+        vld1.32     {d18[],d19[]}, [ip] @ Left (lower)
16715+        vmla.i16    q3, q8, q2          @ Acc set up
16716+        vsub.i16    q0, q9, q0          @ Add set up
16717+        vmovl.u8    q1, d2              @ {3,3,2,2,1,1,0,0}
16718+        vadd.i16    q2, q3, q0          @ row 0: acc + add
16719+
16720+@ u16  3..0        [1]  q1
16721+@ u32 left[y]      [1]  [r2]
16722+@ u16 acc          [1]  q3 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
16723+@ u16 add          [1]  q0 = p[-1][nTbs] - p[x][-1]
16724+
16725+        vld1.32     {d6[],d7[]}, [r2]!  @ left[0]
16726+        vadd.i16    q8, q2, q0          @ row 1 acc
16727+        vld1.32     {d18[],d19[]}, [r2]!  @ left[1]
16728+        vmla.i16    q2, q1, q3          @ row 0 += (3-x)*left[0]
16729+        vadd.i16    q3, q8, q0          @ row 2 acc
16730+        vmla.i16    q8, q1, q9          @ row 1 += (3-x)*left[1]
16731+
16732+        vrshr.u16   q9, q2, #3          @ (sum + 4) >> 3
16733+        vmov        q2, q3
16734+        vrshr.u16   q10, q8, #3
16735+          vld1.32     {d6[],d7[]}, [r2]!  @ left[2]
16736+        vst1.16     {q9}, [r0 :128], r3
16737+          vadd.i16    q8, q2, q0        @ row 3 acc
16738+          vld1.32     {d18[],d19[]}, [r2]!  @ left[3]
16739+          vmla.i16    q2, q1, q3        @ row 2 += (3-x)*left[2]
16740+          vadd.i16    q3, q8, q0
16741+          vmla.i16    q8, q1, q9        @ row 3 += (3-x)*left[3]
16742+        vst1.16     {q10}, [r0 :128], r3
16743+
16744+          vrshr.u16   q9, q2, #3
16745+          add         r3, r0
16746+          vrshr.u16   q10, q8, #3
16747+          vst1.16     {q9}, [r0 :128]
16748+          vst1.16     {q10}, [r3 :128]
16749+
16750+          bx         lr
16751+endfunc
16752+
16753+
16754+@ ff_hevc_rpi_pred_planar_c_8_neon_8
16755+@       uint8_t *_src,          [r0]
16756+@       const uint8_t *_top,    [r1]
16757+@       const uint8_t *_left,   [r2]
16758+@       ptrdiff_t stride)       [r3]
16759+
16760+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
16761+
16762+        adr         ip, nbx2_7_0_1_8 + 16
16763+        vld1.8      {q0}, [r1 :128]!    @ Top (left)
16764+        add         r2, #16
16765+        vld1.8      {q1}, [ip: 128]     @ {1,1,2,2,3,3...8,8}
16766+        lsl         r3, #1              @ stride *= 2
16767+        vld1.16     {d4[]}, [r1]        @ Top (right)
16768+        sub         ip, #16
16769+        vshll.u8    q3, d0, #3          @ 8*Top (left half)
16770+        mov         r1, #8              @ row count
16771+        vshll.u8    q8, d1, #3          @ 8*Top (right half)
16772+        vld1.16     {d5[]}, [r2]        @ Left (lower)
16773+        sub         r2, #16
16774+        vmlal.u8    q3, d2, d4
16775+        vmlal.u8    q8, d3, d4          @ Acc set up
16776+        vsubl.u8    q1, d5, d0
16777+        vsubl.u8    q0, d5, d1          @ Add set up
16778+        vld1.8      {q2}, [ip :128]     @ {7,7,6,6,5,5...0,0}
16779+
16780+@ u8  7..0     [1]  q2
16781+@ u8  left[y]  [1]  [r2]
16782+@ u16 acc      [2]  q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
16783+@ u16 add      [2]  q1,q0 = p[-1][nTbs] - p[x][-1]
16784+
16785+        vadd.i16    q3, q1
16786+        vadd.i16    q8, q0
16787+1:                                      @ 2 rows per iteration
16788+        vadd.i16    q10, q3, q1
16789+        subs        r1, #2
16790+        vld1.16     {d18[]}, [r2]!      @ left[y]
16791+        vadd.i16    q11, q8, q0
16792+        vld1.16     {d19[]}, [r2]!      @ left[y+1]
16793+        vmlal.u8    q3, d4, d18
16794+        vmlal.u8    q8, d5, d18
16795+        vadd.i16    q12, q10, q1
16796+        vmlal.u8    q10, d4, d19
16797+        vadd.i16    q13, q11, q0
16798+        vmlal.u8    q11, d5, d19
16799+        vrshrn.u16  d18, q3, #4         @ (acc+8)>>4, narrow to u8
16800+        vrshrn.u16  d19, q8, #4
16801+        vmov        q3, q12             @ acc for next row pair
16802+        vst1.8      {q9}, [r0 :128], r3
16803+        vrshrn.u16  d18, q10, #4
16804+        vrshrn.u16  d19, q11, #4
16805+        vmov        q8, q13
16806+        vst1.8      {q9}, [r0 :128], r3
16807+        bne         1b
16808+
16809+        bx          lr
16810+
16811+endfunc
16812+
16813+
16814+@------------------------------------------------------------------------------
16815+@
16816+@ Data - has to be in two lumps to ensure we can always reach using adr
16817+
16818+        .balign 64
16819+
16820+nbx2_15_0_1_16:                         @ 15..0 then 1..16, each byte doubled (chroma pairs)
16821+        .byte   15, 15, 14, 14, 13, 13, 12, 12
16822+        .byte   11, 11, 10, 10,  9,  9,  8,  8
16823+nbx2_7_0_1_8:                           @ overlaps previous table: 7..0 then 1..8, doubled
16824+        .byte    7,  7,  6,  6,  5,  5,  4,  4
16825+        .byte    3,  3,  2,  2,  1,  1,  0,  0
16826+        .byte    1,  1,  2,  2,  3,  3,  4,  4
16827+        .byte    5,  5,  6,  6,  7,  7,  8,  8
16828+        .byte    9,  9, 10, 10, 11, 11, 12, 12
16829+        .byte   13, 13, 14, 14, 15, 15, 16, 16
16830+
16831+        @ should be back on a 64-byte boundary here
16832+
16833+nbx2_3_0_1_4:
16834+        .byte    3,  3,  2,  2,  1,  1,  0,  0
16835+        .byte    1,  1,  2,  2,  3,  3,  4,  4
16836+
16837+@------------------------------------------------------------------------------
16838+
16839+
16840+@ ff_hevc_rpi_pred_planar_c_8_neon_10
16841+@       uint8_t *_src,          [r0]
16842+@       const uint8_t *_top,    [r1]
16843+@       const uint8_t *_left,   [r2]
16844+@       ptrdiff_t stride)       [r3]
16845+
16846+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
16847+
16848+        @ Load from bytes & expand later - at the very least this uses less
16849+        @ memory than having a short table
16850+        adr         ip, nbx2_7_0_1_8 + 16
16851+        vld1.16     {q0-q1}, [r1 :128]! @ Top (left)
16852+        add         r2, #32
16853+        vld1.8      {q2}, [ip :128]     @ {1,1,2,2,3,3...8,8}
16854+        lsl         r3, #2              @ stride *= 4
16855+        vld1.32     {d6[],d7[]}, [r1]   @ Top (right)
16856+        sub         ip, #16
16857+        vmovl.u8    q8, d4
16858+        mov         r1, #8              @ row count
16859+        vshl.i16    q9, q0, #3          @ 8*Top (left half)
16860+        vmovl.u8    q2, d5
16861+        vshl.i16    q10, q1, #3         @ 8*Top (right half)
16862+        vld1.32     {d22[],d23[]}, [r2] @ Left (lower)
16863+        sub         r2, #32
16864+        vld1.8      {q12}, [ip]         @ {7,7,6,6,5,5...0,0}
16865+        vmla.i16    q9, q8, q3
16866+        vmla.i16    q10, q2, q3         @ Acc set up
16867+        vsub.i16    q0, q11, q0
16868+        vsub.i16    q1, q11, q1         @ Add set up
16869+        vadd.i16    q2, q9, q0
16870+        vadd.i16    q3, q10, q1
16871+        vmovl.u8    q8, d24
16872+        vmovl.u8    q9, d25
16873+
16874+@ u16  7..0        [2]  q8,q9
16875+@ u32 left[y]      [2]  [r2]
16876+@ u16 acc          [2]  q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
16877+@ u16 add          [2]  q0,q1 = p[-1][nTbs] - p[x][-1]
16878+
16879+1:                                      @ 2 rows per iteration
16880+        vadd.i16    q10, q2, q0
16881+        subs        r1, #2
16882+        vld1.32     {d24[],d25[]}, [r2]! @ left[y]
16883+        vadd.i16    q11, q3, q1
16884+        vld1.32     {d28[],d29[]}, [r2]! @ left[y+1]
16885+        vmla.i16    q2, q8, q12
16886+        vmla.i16    q3, q9, q12
16887+        vadd.i16    q12, q10, q0
16888+        vmla.i16    q10, q8, q14
16889+        vadd.i16    q13, q11, q1
16890+        vmla.i16    q11, q9, q14
16891+        vrshr.u16   q14, q2, #4         @ (acc+8)>>4
16892+        vrshr.u16   q15, q3, #4
16893+        vmov        q2, q12             @ acc for next row pair
16894+        vst1.16     {q14-q15}, [r0 :128], r3
16895+        vrshr.u16   q14, q10, #4
16896+        vrshr.u16   q15, q11, #4
16897+        vmov        q3, q13
16898+        vst1.16     {q14-q15}, [r0 :128], r3
16899+        bne         1b
16900+
16901+        bx         lr
16902+endfunc
16903+
16904+
16905+@ ff_hevc_rpi_pred_planar_c_16_neon_8
16906+@       uint8_t *_src,          [r0]
16907+@       const uint8_t *_top,    [r1]
16908+@       const uint8_t *_left,   [r2]
16909+@       ptrdiff_t stride)       [r3]
16910+
16911+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
16912+
16913+        vld1.8      {q0-q1}, [r1 :128]! @ Top (left)
16914+        adr         ip, nbx2_15_0_1_16 + 32
16915+        vpush       {d8-d12}            @ d8-d15 are callee-saved per AAPCS
16916+        vld1.8      {q2-q3}, [ip :128]  @ {1,1,2,2,3,3...16,16}
16917+        add         r2, #32
16918+        vld1.16     {d8[]}, [r1]        @ Top (right)
16919+        sub         ip, #32
16920+        vshll.u8    q8, d0, #4          @ 16*Top
16921+        mov         r1, #16             @ row count
16922+        vld1.16     {d9[]}, [r2]        @ Left (lower)
16923+        sub         r2, #32
16924+        vshll.u8    q9, d1, #4
16925+        lsl         r3, #1              @ stride *= 2
16926+        vshll.u8    q10, d2, #4
16927+        vshll.u8    q11, d3, #4
16928+        vmlal.u8    q8, d4, d8
16929+        vsubl.u8    q12, d9, d0
16930+        vmlal.u8    q9, d5, d8
16931+        vsubl.u8    q13, d9, d1
16932+        vmlal.u8    q10, d6, d8
16933+        vsubl.u8    q14, d9, d2
16934+        vmlal.u8    q11, d7, d8         @ Acc set up
16935+        vsubl.u8    q15, d9, d3         @ Add set up
16936+        vadd.i16    q8, q12
16937+        vadd.i16    q9, q13
16938+        vadd.i16    q10, q14
16939+        vadd.i16    q11, q15
16940+        vld1.8      {q4-q5}, [ip :128]  @ {15,15,14,14,13,13...0,0}
16941+
16942+@ u8  15..0    [2]  q4,q5
16943+@ u8  left[y]  [2]  [r2]
16944+@ u16 acc      [4]  q8-q11  = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
16945+@ u16 add      [4]  q12-q15 = p[-1][nTbs] - p[x][-1]
16946+
16947+        vld1.16     {d12[]}, [r2]!      @ left[0]
16948+        vadd.i16    q0, q8, q12
16949+        b           2f                  @ enter loop with first store skipped
16950+1:
16951+          vld1.16     {d12[]}, [r2]!
16952+        vrshrn.u16  d3, q1, #5          @ (acc+16)>>5, narrow to u8
16953+        vrshrn.u16  d2, q0, #5
16954+          vadd.i16    q0, q8, q12
16955+        vrshrn.u16  d4, q2, #5
16956+        vrshrn.u16  d5, q3, #5
16957+        vst1.8      {q1-q2}, [r0 :128], r3
16958+2:        vadd.i16    q1, q9, q13
16959+          subs        r1, #2
16960+          vadd.i16    q2, q10, q14
16961+          vadd.i16    q3, q11, q15
16962+          vmlal.u8    q8, d8, d12
16963+          vmlal.u8    q9, d9, d12
16964+          vmlal.u8    q10, d10, d12
16965+          vmlal.u8    q11, d11, d12
16966+            vld1.16     {d12[]}, [r2]!
16967+          vrshrn.u16  d19, q9, #5
16968+          vrshrn.u16  d18, q8, #5
16969+            vadd.i16    q8, q0, q12
16970+          vrshrn.u16  d20, q10, #5
16971+          vrshrn.u16  d21, q11, #5
16972+          vst1.8      {q9-q10}, [r0 :128], r3
16973+            vadd.i16    q9, q1, q13
16974+            vadd.i16    q10, q2, q14
16975+            vadd.i16    q11, q3, q15
16976+            vmlal.u8    q0, d8, d12
16977+            vmlal.u8    q1, d9, d12
16978+            vmlal.u8    q2, d10, d12
16979+            vmlal.u8    q3, d11, d12
16980+
16981+        bne         1b
16982+
16983+        vpop        {d8-d12}
16984+
16985+        vrshrn.u16  d3, q1, #5          @ final row
16986+        vrshrn.u16  d2, q0, #5
16987+        vrshrn.u16  d4, q2, #5
16988+        vrshrn.u16  d5, q3, #5
16989+        vst1.8      {q1-q2}, [r0 :128]
16990+
16991+        bx          lr
16992+
16993+endfunc
16994+
16995+
16996+@ ff_hevc_rpi_pred_planar_c_16_neon_10
16997+@       uint8_t *_src,          [r0]
16998+@       const uint8_t *_top,    [r1]
16999+@       const uint8_t *_left,   [r2]
17000+@       ptrdiff_t stride)       [r3]
17001+
17002+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
17003+
17004+        @ Load from bytes & expand later - at the very least this uses less
17005+        @ memory than having a short table
17006+        vld1.16     {q0-q1}, [r1 :128]!  @ Top (left)
17007+        adr         ip, nbx2_15_0_1_16 + 32
17008+        vpush       {q4-q7}              @ d8-d15 are callee-saved per AAPCS
17009+        vld1.16     {q2-q3}, [r1 :128]!  @ Top (centre)
17010+        add         r2, #64
17011+        vld1.8      {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
17012+T       lsl         r3, #2               @ Thumb only: pre-scale stride (ARM folds shift below)
17013+        vld1.32     {d8[],d9[]}, [r1]    @ Top (right)
17014+        sub         ip, #32
17015+        vmovl.u8    q12, d28
17016+        mov         r1, #16              @ row count
17017+        vmovl.u8    q13, d29
17018+        vld1.8      {q6-q7}, [ip :128]   @ {15,15,14,14,13,13...0,0}
17019+        vmovl.u8    q14, d30
17020+        vmovl.u8    q15, d31
17021+        vld1.32     {d10[],d11[]}, [r2]  @ Left (lower)
17022+        sub         r2, #64
17023+        vshl.i16    q8, q0, #4           @ 16*Top
17024+        vshl.i16    q9, q1, #4
17025+        vshl.i16    q10, q2, #4
17026+        vshl.i16    q11, q3, #4
17027+        vmla.i16    q8, q12, q4
17028+        vsub.i16    q0, q5, q0
17029+        vmla.i16    q9, q13, q4
17030+        vpush       {q0}                 @ keep a copy of 'add' q0 for reloads via [sp]
17031+        vsub.i16    q1, q5, q1
17032+        vmla.i16    q10, q14, q4
17033+        vsub.i16    q2, q5, q2
17034+        vmla.i16    q11, q15, q4         @ Acc set up
17035+        vsub.i16    q3, q5, q3           @ Add set up
17036+        vadd.i16    q8, q0
17037+        vadd.i16    q9, q1
17038+        vadd.i16    q10, q2
17039+        vadd.i16    q11, q3
17040+        vmovl.u8    q4, d12
17041+        vmovl.u8    q5, d13
17042+        vmovl.u8    q6, d14
17043+        vmovl.u8    q7, d15
17044+
17045+@ u16 31..0    [4]  q4-q7
17046+@ u16 left[y]  [4]  [r2]
17047+@ u16 acc      [4]  q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
17048+@ u16 add      [4]  q0-q3  = p[-1][nTbs] - p[x][-1]
17049+
17050+        vadd.i16    q12, q8, q0
17051+A       sub         r0, r0, r3, lsl #2   @ bias r0 so the add at loop head cancels on pass 1
17052+T       sub         r0, r3
17053+1:                                       @ 2 rows per iteration
17054+        vld1.32     {d0[],d1[]}, [r2]!   @ left[y]
17055+A       add         r0, r0, r3, lsl #2
17056+T       add         r0, r3
17057+        vadd.i16    q13, q9, q1
17058+        subs        r1, #2
17059+        vadd.i16    q14, q10, q2
17060+        vadd.i16    q15, q11, q3
17061+        vmla.i16    q8, q4, q0
17062+        vmla.i16    q9, q5, q0
17063+        vmla.i16    q10, q6, q0
17064+        vmla.i16    q11, q7, q0
17065+        vld1.16     {q0}, [sp]           @ reload saved 'add' quad
17066+        vrshr.u16   q8, #5               @ (acc+16)>>5
17067+        vrshr.u16   q9, #5
17068+        vrshr.u16   q10, #5
17069+        vrshr.u16   q11, #5
17070+        vstm        r0, {q8-q11}
17071+        vadd.i16    q8, q12, q0
17072+A       add         r0, r0, r3, lsl #2
17073+T       add         r0, r3
17074+        vld1.32     {d0[],d1[]}, [r2]!   @ left[y+1]
17075+        vadd.i16    q9, q13, q1
17076+        vadd.i16    q10, q14, q2
17077+        vadd.i16    q11, q15, q3
17078+        vmla.i16    q12, q4, q0
17079+        vmla.i16    q13, q5, q0
17080+        vmla.i16    q14, q6, q0
17081+        vmla.i16    q15, q7, q0
17082+        vld1.16     {q0}, [sp]
17083+        vrshr.u16   q12, #5
17084+        vrshr.u16   q13, #5
17085+        vrshr.u16   q14, #5
17086+        vrshr.u16   q15, #5
17087+        vstm        r0, {q12-q15}
17088+        vadd.i16    q12, q8, q0
17089+        bne         1b
17090+
17091+        vpop        {q3-q7}              @ q3 absorbs the pushed q0 slot; q4-q7 restored
17092+        bx          lr
17093+
17094+endfunc
17095--- a/libavcodec/arm/vc1dsp_init_neon.c
17096+++ b/libavcodec/arm/vc1dsp_init_neon.c
17097@@ -19,6 +19,7 @@
17098 #include <stdint.h>
17099
17100 #include "libavutil/attributes.h"
17101+#include "libavutil/intreadwrite.h"
17102 #include "libavcodec/vc1dsp.h"
17103 #include "vc1dsp.h"
17104
17105@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_
17106 void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
17107 void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
17108
17109+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
17110+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
17111+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
17112+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
17113+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
17114+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
17115+
17116 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
17117                            ptrdiff_t line_size, int rnd);
17118
17119@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
17120 void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
17121                                 int h, int x, int y);
17122
17123+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
17124+
17125+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
17126+{
17127+    /* Dealing with starting and stopping, and removing escape bytes, are
17128+     * comparatively less time-sensitive, so are more clearly expressed using
17129+     * a C wrapper around the assembly inner loop. Note that we assume a
17130+     * little-endian machine that supports unaligned loads. */
17131+    int dsize = 0; /* bytes written to dst */
17132+    while (size >= 4)
17133+    {
17134+        int found = 0;
17135+        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-copy until dst is 8-byte aligned for the helper */
17136+        {
17137+            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 0x, x <= 3: escape sequence */
17138+            if (!found)
17139+            {
17140+                *dst++ = *src++;
17141+                --size;
17142+                ++dsize;
17143+            }
17144+        }
17145+        if (!found)
17146+        {
17147+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* helper returns bytes left; skip = bytes it consumed */
17148+            dst += skip;
17149+            src += skip;
17150+            size -= skip;
17151+            dsize += skip;
17152+            while (!found && size >= 4) /* scan whatever the helper left behind */
17153+            {
17154+                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000;
17155+                if (!found)
17156+                {
17157+                    *dst++ = *src++;
17158+                    --size;
17159+                    ++dsize;
17160+                }
17161+            }
17162+        }
17163+        if (found)
17164+        {
17165+            *dst++ = *src++; /* keep 00 */
17166+            *dst++ = *src++; /* keep 00 */
17167+            ++src;           /* drop the 03 escape byte */
17168+            size -= 3;
17169+            dsize += 2;
17170+        }
17171+    }
17172+    while (size > 0) /* fewer than 4 trailing bytes cannot form an escape; copy verbatim */
17173+    {
17174+        *dst++ = *src++;
17175+        --size;
17176+        ++dsize;
17177+    }
17178+    return dsize;
17179+}
17180+
17181 #define FN_ASSIGN(X, Y) \
17182     dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
17183     dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
17184@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
17185     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
17186     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
17187
17188+    dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
17189+    dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
17190+    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
17191+    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
17192+    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
17193+    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
17194+
17195     dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
17196     FN_ASSIGN(1, 0);
17197     FN_ASSIGN(2, 0);
17198@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
17199     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
17200     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
17201     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
17202+
17203+    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
17204 }
17205--- a/libavcodec/arm/vc1dsp_neon.S
17206+++ b/libavcodec/arm/vc1dsp_neon.S
17207@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e
17208         vst1.32         {d1[1]},  [r0,:32]
17209         bx              lr
17210 endfunc
17211+
17212+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
17213+@ On entry:
17214+@   r0 -> top-left pel of lower block
17215+@   r1 = row stride, bytes
17216+@   r2 = PQUANT bitstream parameter
17217+function ff_vc1_v_loop_filter4_neon, export=1
17218+        sub             r3, r0, r1, lsl #2      @ r3 -> 4 rows above (P1 row)
17219+        vldr            d0, .Lcoeffs            @ d0[0]=2, d0[1]=5
17220+        vld1.32         {d1[0]}, [r0], r1       @ P5
17221+        vld1.32         {d2[0]}, [r3], r1       @ P1
17222+        vld1.32         {d3[0]}, [r3], r1       @ P2
17223+        vld1.32         {d4[0]}, [r0], r1       @ P6
17224+        vld1.32         {d5[0]}, [r3], r1       @ P3
17225+        vld1.32         {d6[0]}, [r0], r1       @ P7
17226+        vld1.32         {d7[0]}, [r3]           @ P4
17227+        vld1.32         {d16[0]}, [r0]          @ P8
17228+        vshll.u8        q9, d1, #1              @ 2*P5
17229+        vdup.16         d17, r2                 @ pq
17230+        vshll.u8        q10, d2, #1             @ 2*P1
17231+        vmovl.u8        q11, d3                 @ P2
17232+        vmovl.u8        q1, d4                  @ P6
17233+        vmovl.u8        q12, d5                 @ P3
17234+        vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
17235+        vmovl.u8        q11, d6                 @ P7
17236+        vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
17237+        vshll.u8        q2, d5, #1              @ 2*P3
17238+        vmovl.u8        q3, d7                  @ P4
17239+        vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
17240+        vmovl.u8        q11, d16                @ P8
17241+        vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
17242+        vmovl.u8        q12, d1                 @ P5
17243+        vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
17244+        vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
17245+        vsub.i16        d1, d6, d24             @ P4-P5
17246+        vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
17247+        vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
17248+        vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
17249+        vabs.s16        d2, d1
17250+        vrshr.s16       d3, d18, #3             @ rounding >>3
17251+        vrshr.s16       d5, d20, #3
17252+        vshr.s16        d2, d2, #1              @ clip
17253+        vrshr.s16       d4, d4, #3
17254+        vabs.s16        d3, d3                  @ a2
17255+        vshr.s16        d1, d1, #8              @ clip_sign
17256+        vabs.s16        d5, d5                  @ a1
17257+        vceq.i16        d7, d2, #0              @ test clip == 0
17258+        vabs.s16        d16, d4                 @ a0
17259+        vshr.s16        d4, d4, #8              @ a0_sign
17260+        vcge.s16        d18, d5, d3             @ test a1 >= a2
17261+        vcge.s16        d17, d16, d17           @ test a0 >= pq
17262+        vbsl            d18, d3, d5             @ a3
17263+        vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
17264+        vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
17265+        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
17266+        vcge.s16        d5, d18, d16            @ test a3 >= a0
17267+        vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
17268+        vorr            d4, d3, d5              @ test clip == 0 || a0 >= pq || a3 >= a0
17269+        vmov.32         r0, d4[1]               @ move to gp reg
17270+        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
17271+        vcge.s16        d4, d0, d2
17272+        tst             r0, #1
17273+        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
17274+        vbsl            d4, d2, d0              @ FFMIN(d, clip)
17275+        vbic            d0, d4, d3              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
17276+        vmls.i16        d6, d0, d1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
17277+        vmla.i16        d24, d0, d1             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
17278+        vqmovun.s16     d0, q3                  @ saturate P4 back to u8
17279+        vqmovun.s16     d1, q12                 @ saturate P5 back to u8
17280+        vst1.32         {d0[0]}, [r3], r1       @ write P4 row
17281+        vst1.32         {d1[0]}, [r3]           @ write P5 row
17282+1:      bx              lr
17283+endfunc
17284+
17285+@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
17286+@ On entry:
17287+@   r0 -> top-left pel of right block
17288+@   r1 = row stride, bytes
17289+@   r2 = PQUANT bitstream parameter
17290+function ff_vc1_h_loop_filter4_neon, export=1
17291+        sub             r3, r0, #4              @ where to start reading
17292+        vldr            d0, .Lcoeffs            @ d0[0]=2, d0[1]=5
17293+        vld1.32         {d2}, [r3], r1
17294+        sub             r0, r0, #1              @ where to start writing
17295+        vld1.32         {d4}, [r3], r1
17296+        vld1.32         {d3}, [r3], r1
17297+        vld1.32         {d5}, [r3]
17298+        vdup.16         d1, r2                  @ pq
17299+        vtrn.8          q1, q2
17300+        vtrn.16         d2, d3                  @ P1, P5, P3, P7
17301+        vtrn.16         d4, d5                  @ P2, P6, P4, P8
17302+        vshll.u8        q3, d2, #1              @ 2*P1, 2*P5
17303+        vmovl.u8        q8, d4                  @ P2, P6
17304+        vmovl.u8        q9, d3                  @ P3, P7
17305+        vmovl.u8        q2, d5                  @ P4, P8
17306+        vmls.i16        q3, q8, d0[1]           @ 2*P1-5*P2, 2*P5-5*P6
17307+        vshll.u8        q10, d3, #1             @ 2*P3, 2*P7
17308+        vmovl.u8        q1, d2                  @ P1, P5
17309+        vmla.i16        q3, q9, d0[1]           @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
17310+        vmls.i16        q3, q2, d0[0]           @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
17311+        vmov            d2, d3                  @ needs to be in an even-numbered vector for when we come to narrow it later
17312+        vmls.i16        d20, d4, d0[1]          @ 2*P3-5*P4
17313+        vmla.i16        d20, d3, d0[1]          @ 2*P3-5*P4+5*P5
17314+        vsub.i16        d3, d4, d2              @ P4-P5
17315+        vmls.i16        d20, d17, d0[0]         @ 2*P3-5*P4+5*P5-2*P6
17316+        vrshr.s16       q3, q3, #3              @ rounding >>3
17317+        vabs.s16        d5, d3
17318+        vshr.s16        d3, d3, #8              @ clip_sign
17319+        vrshr.s16       d16, d20, #3
17320+        vabs.s16        q3, q3                  @ a1, a2
17321+        vshr.s16        d5, d5, #1              @ clip
17322+        vabs.s16        d17, d16                @ a0
17323+        vceq.i16        d18, d5, #0             @ test clip == 0
17324+        vshr.s16        d16, d16, #8            @ a0_sign
17325+        vcge.s16        d19, d6, d7             @ test a1 >= a2
17326+        vcge.s16        d1, d17, d1             @ test a0 >= pq
17327+        vsub.i16        d16, d3, d16            @ clip_sign - a0_sign
17328+        vbsl            d19, d7, d6             @ a3
17329+        vorr            d1, d18, d1             @ test clip == 0 || a0 >= pq
17330+        vqsub.u16       d3, d17, d19            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
17331+        vcge.s16        d6, d19, d17            @ test a3 >= a0
17332+        vmul.i16        d0, d3, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
17333+        vorr            d3, d1, d6              @ test clip == 0 || a0 >= pq || a3 >= a0
17334+        vmov.32         r2, d3[1]               @ move to gp reg
17335+        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
17336+        vcge.s16        d3, d0, d5
17337+        tst             r2, #1
17338+        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
17339+        vbsl            d3, d5, d0              @ FFMIN(d, clip)
17340+        vbic            d0, d3, d1              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
17341+        vmla.i16        d2, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
17342+        vmls.i16        d4, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
17343+        vqmovun.s16     d1, q1                  @ saturate P5 back to u8
17344+        vqmovun.s16     d0, q2                  @ saturate P4 back to u8
17345+        vst2.8          {d0[0], d1[0]}, [r0], r1 @ interleaved P4,P5 back down the column
17346+        vst2.8          {d0[1], d1[1]}, [r0], r1
17347+        vst2.8          {d0[2], d1[2]}, [r0], r1
17348+        vst2.8          {d0[3], d1[3]}, [r0]
17349+1:      bx              lr
17350+endfunc
17351+
17352+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
17353+@ On entry:
17354+@   r0 -> top-left pel of lower block
17355+@   r1 = row stride, bytes
17356+@   r2 = PQUANT bitstream parameter
17357+function ff_vc1_v_loop_filter8_neon, export=1
17358+        sub             r3, r0, r1, lsl #2      @ r3 -> 4 rows above (P1 row)
17359+        vldr            d0, .Lcoeffs            @ d0[0]=2, d0[1]=5
17360+        vld1.32         {d1}, [r0 :64], r1      @ P5
17361+        vld1.32         {d2}, [r3 :64], r1      @ P1
17362+        vld1.32         {d3}, [r3 :64], r1      @ P2
17363+        vld1.32         {d4}, [r0 :64], r1      @ P6
17364+        vld1.32         {d5}, [r3 :64], r1      @ P3
17365+        vld1.32         {d6}, [r0 :64], r1      @ P7
17366+        vshll.u8        q8, d1, #1              @ 2*P5
17367+        vshll.u8        q9, d2, #1              @ 2*P1
17368+        vld1.32         {d7}, [r3 :64]          @ P4
17369+        vmovl.u8        q1, d3                  @ P2
17370+        vld1.32         {d20}, [r0 :64]         @ P8
17371+        vmovl.u8        q11, d4                 @ P6
17372+        vdup.16         q12, r2                 @ pq
17373+        vmovl.u8        q13, d5                 @ P3
17374+        vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
17375+        vmovl.u8        q1, d6                  @ P7
17376+        vshll.u8        q2, d5, #1              @ 2*P3
17377+        vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
17378+        vmovl.u8        q3, d7                  @ P4
17379+        vmovl.u8        q10, d20                @ P8
17380+        vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
17381+        vmovl.u8        q1, d1                  @ P5
17382+        vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
17383+        vsub.i16        q13, q3, q1             @ P4-P5
17384+        vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
17385+        vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
17386+        vabs.s16        q10, q13
17387+        vshr.s16        q13, q13, #8            @ clip_sign
17388+        vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
17389+        vshr.s16        q10, q10, #1            @ clip
17390+        vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
17391+        vrshr.s16       q8, q8, #3              @ rounding >>3
17392+        vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
17393+        vceq.i16        q11, q10, #0            @ test clip == 0
17394+        vrshr.s16       q9, q9, #3
17395+        vabs.s16        q8, q8                  @ a2
17396+        vabs.s16        q9, q9                  @ a1
17397+        vrshr.s16       q2, q2, #3
17398+        vcge.s16        q14, q9, q8             @ test a1 >= a2
17399+        vabs.s16        q15, q2                 @ a0
17400+        vshr.s16        q2, q2, #8              @ a0_sign
17401+        vbsl            q14, q8, q9             @ a3
17402+        vcge.s16        q8, q15, q12            @ test a0 >= pq
17403+        vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
17404+        vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
17405+        vcge.s16        q12, q14, q15           @ test a3 >= a0
17406+        vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
17407+        vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
17408+        vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
17409+        vshl.i64        q11, q9, #16
17410+        vmov.32         r0, d18[1]              @ move to gp reg
17411+        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
17412+        vmov.32         r2, d19[1]
17413+        vshr.s64        q9, q11, #48            @ fill each 64-bit half with its lane-2 test result
17414+        vcge.s16        q11, q0, q10
17415+        vorr            q8, q8, q9
17416+        and             r0, r0, r2
17417+        vbsl            q11, q10, q0            @ FFMIN(d, clip)
17418+        tst             r0, #1
17419+        bne             1f                      @ none of the 8 pixel pairs should be updated in this case
17420+        vbic            q0, q11, q8             @ set each d to zero if it should not be filtered
17421+        vmls.i16        q3, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
17422+        vmla.i16        q1, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
17423+        vqmovun.s16     d0, q3                  @ saturate P4 back to u8
17424+        vqmovun.s16     d1, q1                  @ saturate P5 back to u8
17425+        vst1.32         {d0}, [r3 :64], r1      @ write P4 row
17426+        vst1.32         {d1}, [r3 :64]          @ write P5 row
17427+1:      bx              lr
17428+endfunc
17429+
17430+.align  5
17431+.Lcoeffs:                               @ loaded into d0: d0[0] = 2, d0[1] = 5
17432+.quad   0x00050002
17433+
+@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
+@ On entry:
+@   r0 -> top-left pel of right block
+@   r1 = row stride, bytes
+@   r2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter8_neon, export=1
+        push            {lr}
+        sub             r3, r0, #4              @ where to start reading
+        vldr            d0, .Lcoeffs
+        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
+        sub             r0, r0, #1              @ where to start writing
+        vld1.32         {d4}, [r3], r1
+        add             r12, r0, r1, lsl #2
+        vld1.32         {d3}, [r3], r1
+        vld1.32         {d5}, [r3], r1
+        vld1.32         {d6}, [r3], r1
+        vld1.32         {d16}, [r3], r1
+        vld1.32         {d7}, [r3], r1
+        vld1.32         {d17}, [r3]
+        vtrn.8          q1, q2                  @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
+        vdup.16         q9, r2                  @ pq
+        vtrn.16         d2, d3                  @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
+        vtrn.16         d4, d5                  @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
+        vtrn.8          q3, q8                  @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
+        vtrn.16         d6, d7                  @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
+        vtrn.16         d16, d17                @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
+        vtrn.32         d2, d6                  @ P1, P5
+        vtrn.32         d4, d16                 @ P2, P6
+        vtrn.32         d3, d7                  @ P3, P7
+        vtrn.32         d5, d17                 @ P4, P8
+        vshll.u8        q10, d2, #1             @ 2*P1
+        vshll.u8        q11, d6, #1             @ 2*P5
+        vmovl.u8        q12, d4                 @ P2
+        vmovl.u8        q13, d16                @ P6
+        vmovl.u8        q14, d3                 @ P3
+        vmls.i16        q10, q12, d0[1]         @ 2*P1-5*P2
+        vmovl.u8        q12, d7                 @ P7
+        vshll.u8        q1, d3, #1              @ 2*P3
+        vmls.i16        q11, q13, d0[1]         @ 2*P5-5*P6
+        vmovl.u8        q2, d5                  @ P4
+        vmovl.u8        q8, d17                 @ P8
+        vmla.i16        q11, q12, d0[1]         @ 2*P5-5*P6+5*P7
+        vmovl.u8        q3, d6                  @ P5
+        vmla.i16        q10, q14, d0[1]         @ 2*P1-5*P2+5*P3
+        vsub.i16        q12, q2, q3             @ P4-P5
+        vmls.i16        q1, q2, d0[1]           @ 2*P3-5*P4
+        vmls.i16        q11, q8, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
+        vabs.s16        q8, q12
+        vshr.s16        q12, q12, #8            @ clip_sign
+        vmls.i16        q10, q2, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
+        vshr.s16        q8, q8, #1              @ clip
+        vmla.i16        q1, q3, d0[1]           @ 2*P3-5*P4+5*P5
+        vrshr.s16       q11, q11, #3            @ (2*P5-5*P6+5*P7-2*P8+4)>>3
+        vmls.i16        q1, q13, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
+        vceq.i16        q13, q8, #0             @ test clip == 0
+        vrshr.s16       q10, q10, #3            @ (2*P1-5*P2+5*P3-2*P4+4)>>3
+        vabs.s16        q11, q11                @ a2
+        vabs.s16        q10, q10                @ a1
+        vrshr.s16       q1, q1, #3              @ (2*P3-5*P4+5*P5-2*P6+4)>>3
+        vcge.s16        q14, q10, q11           @ test a1 >= a2
+        vabs.s16        q15, q1                 @ a0
+        vshr.s16        q1, q1, #8              @ a0_sign
+        vbsl            q14, q11, q10           @ a3
+        vcge.s16        q9, q15, q9             @ test a0 >= pq
+        vsub.i16        q1, q12, q1             @ clip_sign - a0_sign
+        vqsub.u16       q10, q15, q14           @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        vcge.s16        q11, q14, q15           @ test a3 >= a0
+        vorr            q9, q13, q9             @ test clip == 0 || a0 >= pq
+        vmul.i16        q0, q10, d0[1]          @ a0 >= a3 ? 5*(a0-a3) : 0
+        vorr            q10, q9, q11            @ test clip == 0 || a0 >= pq || a3 >= a0
+        vmov.32         r2, d20[1]              @ move to gp reg
+        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
+        vmov.32         r3, d21[1]
+        vcge.s16        q10, q0, q8             @ test d >= clip
+        and             r14, r2, r3
+        vbsl            q10, q8, q0             @ FFMIN(d, clip)
+        tst             r14, #1
+        bne             2f                      @ none of the 8 pixel pairs should be updated in this case
+        vbic            q0, q10, q9             @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+        vmla.i16        q3, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+        vmls.i16        q2, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+        vqmovun.s16     d1, q3
+        vqmovun.s16     d0, q2
+        tst             r2, #1
+        bne             1f                      @ none of the first 4 pixel pairs should be updated if so
+        vst2.8          {d0[0], d1[0]}, [r0], r1
+        vst2.8          {d0[1], d1[1]}, [r0], r1
+        vst2.8          {d0[2], d1[2]}, [r0], r1
+        vst2.8          {d0[3], d1[3]}, [r0]
+1:      tst             r3, #1
+        bne             2f                      @ none of the second 4 pixel pairs should be updated if so
+        vst2.8          {d0[4], d1[4]}, [r12], r1
+        vst2.8          {d0[5], d1[5]}, [r12], r1
+        vst2.8          {d0[6], d1[6]}, [r12], r1
+        vst2.8          {d0[7], d1[7]}, [r12]
+2:      pop             {pc}
+endfunc
17531+
+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
+@ On entry:
+@   r0 -> top-left pel of lower block
+@   r1 = row stride, bytes
+@   r2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter16_neon, export=1
+        vpush           {d8-d15}
+        sub             r3, r0, r1, lsl #2
+        vldr            d0, .Lcoeffs
+        vld1.64         {q1}, [r0 :128], r1     @ P5
+        vld1.64         {q2}, [r3 :128], r1     @ P1
+        vld1.64         {q3}, [r3 :128], r1     @ P2
+        vld1.64         {q4}, [r0 :128], r1     @ P6
+        vld1.64         {q5}, [r3 :128], r1     @ P3
+        vld1.64         {q6}, [r0 :128], r1     @ P7
+        vshll.u8        q7, d2, #1              @ 2*P5[0..7]
+        vshll.u8        q8, d4, #1              @ 2*P1[0..7]
+        vld1.64         {q9}, [r3 :128]         @ P4
+        vmovl.u8        q10, d6                 @ P2[0..7]
+        vld1.64         {q11}, [r0 :128]        @ P8
+        vmovl.u8        q12, d8                 @ P6[0..7]
+        vdup.16         q13, r2                 @ pq
+        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
+        vmls.i16        q8, q10, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
+        vshll.u8        q10, d3, #1             @ 2*P5[8..15]
+        vmovl.u8        q3, d7                  @ P2[8..15]
+        vmls.i16        q7, q12, d0[1]          @ 2*P5[0..7]-5*P6[0..7]
+        vmovl.u8        q4, d9                  @ P6[8..15]
+        vmovl.u8        q14, d10                @ P3[0..7]
+        vmovl.u8        q15, d12                @ P7[0..7]
+        vmls.i16        q2, q3, d0[1]           @ 2*P1[8..15]-5*P2[8..15]
+        vshll.u8        q3, d10, #1             @ 2*P3[0..7]
+        vmls.i16        q10, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
+        vmovl.u8        q6, d13                 @ P7[8..15]
+        vmla.i16        q8, q14, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+        vmovl.u8        q14, d18                @ P4[0..7]
+        vmovl.u8        q9, d19                 @ P4[8..15]
+        vmla.i16        q7, q15, d0[1]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+        vmovl.u8        q15, d11                @ P3[8..15]
+        vshll.u8        q5, d11, #1             @ 2*P3[8..15]
+        vmls.i16        q3, q14, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
+        vmla.i16        q2, q15, d0[1]          @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+        vmovl.u8        q15, d22                @ P8[0..7]
+        vmovl.u8        q11, d23                @ P8[8..15]
+        vmla.i16        q10, q6, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+        vmovl.u8        q6, d2                  @ P5[0..7]
+        vmovl.u8        q1, d3                  @ P5[8..15]
+        vmls.i16        q5, q9, d0[1]           @ 2*P3[8..15]-5*P4[8..15]
+        vmls.i16        q8, q14, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+        vmls.i16        q7, q15, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+        vsub.i16        q15, q14, q6            @ P4[0..7]-P5[0..7]
+        vmla.i16        q3, q6, d0[1]           @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+        vrshr.s16       q8, q8, #3
+        vmls.i16        q2, q9, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+        vrshr.s16       q7, q7, #3
+        vmls.i16        q10, q11, d0[0]         @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+        vabs.s16        q11, q15
+        vabs.s16        q8, q8                  @ a1[0..7]
+        vmla.i16        q5, q1, d0[1]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+        vshr.s16        q15, q15, #8            @ clip_sign[0..7]
+        vrshr.s16       q2, q2, #3
+        vmls.i16        q3, q12, d0[0]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+        vabs.s16        q7, q7                  @ a2[0..7]
+        vrshr.s16       q10, q10, #3
+        vsub.i16        q12, q9, q1             @ P4[8..15]-P5[8..15]
+        vshr.s16        q11, q11, #1            @ clip[0..7]
+        vmls.i16        q5, q4, d0[0]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+        vcge.s16        q4, q8, q7              @ test a1[0..7] >= a2[0..7]
+        vabs.s16        q2, q2                  @ a1[8..15]
+        vrshr.s16       q3, q3, #3
+        vabs.s16        q10, q10                @ a2[8..15]
+        vbsl            q4, q7, q8              @ a3[0..7]
+        vabs.s16        q7, q12
+        vshr.s16        q8, q12, #8             @ clip_sign[8..15]
+        vrshr.s16       q5, q5, #3
+        vcge.s16        q12, q2, q10            @ test a1[8..15] >= a2[8..15]
+        vshr.s16        q7, q7, #1              @ clip[8..15]
+        vbsl            q12, q10, q2            @ a3[8..15]
+        vabs.s16        q2, q3                  @ a0[0..7]
+        vceq.i16        q10, q11, #0            @ test clip[0..7] == 0
+        vshr.s16        q3, q3, #8              @ a0_sign[0..7]
+        vsub.i16        q3, q15, q3             @ clip_sign[0..7] - a0_sign[0..7]
+        vcge.s16        q15, q2, q13            @ test a0[0..7] >= pq
+        vorr            q10, q10, q15           @ test clip[0..7] == 0 || a0[0..7] >= pq
+        vqsub.u16       q15, q2, q4             @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        vcge.s16        q2, q4, q2              @ test a3[0..7] >= a0[0..7]
+        vabs.s16        q4, q5                  @ a0[8..15]
+        vshr.s16        q5, q5, #8              @ a0_sign[8..15]
+        vmul.i16        q15, q15, d0[1]         @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+        vcge.s16        q13, q4, q13            @ test a0[8..15] >= pq
+        vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+        vsub.i16        q5, q8, q5              @ clip_sign[8..15] - a0_sign[8..15]
+        vceq.i16        q8, q7, #0              @ test clip[8..15] == 0
+        vshr.u16        q15, q15, #3            @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+        vmov.32         r0, d4[1]               @ move to gp reg
+        vorr            q8, q8, q13             @ test clip[8..15] == 0 || a0[8..15] >= pq
+        vqsub.u16       q13, q4, q12            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        vmov.32         r2, d5[1]
+        vcge.s16        q4, q12, q4             @ test a3[8..15] >= a0[8..15]
+        vshl.i64        q2, q2, #16
+        vcge.s16        q12, q15, q11           @ test d[0..7] >= clip[0..7]
+        vmul.i16        q0, q13, d0[1]          @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+        vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+        vshr.s64        q2, q2, #48
+        and             r0, r0, r2
+        vbsl            q12, q11, q15           @ FFMIN(d[0..7], clip[0..7])
+        vshl.i64        q11, q4, #16
+        vmov.32         r2, d8[1]
+        vshr.u16        q0, q0, #3              @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+        vorr            q2, q10, q2
+        vmov.32         r12, d9[1]
+        vshr.s64        q4, q11, #48
+        vcge.s16        q10, q0, q7             @ test d[8..15] >= clip[8..15]
+        vbic            q2, q12, q2             @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+        vorr            q4, q8, q4
+        and             r2, r2, r12
+        vbsl            q10, q7, q0             @ FFMIN(d[8..15], clip[8..15])
+        vmls.i16        q14, q2, q3             @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
+        and             r0, r0, r2
+        vbic            q0, q10, q4             @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+        tst             r0, #1
+        bne             1f                      @ none of the 16 pixel pairs should be updated in this case
+        vmla.i16        q6, q2, q3              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
+        vmls.i16        q9, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
+        vqmovun.s16     d4, q14
+        vmla.i16        q1, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
+        vqmovun.s16     d0, q6
+        vqmovun.s16     d5, q9
+        vqmovun.s16     d1, q1
+        vst1.64         {q2}, [r3 :128], r1
+        vst1.64         {q0}, [r3 :128]
+1:      vpop            {d8-d15}
+        bx              lr
+endfunc
17666+
+@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
+@ On entry:
+@   r0 -> top-left pel of right block
+@   r1 = row stride, bytes
+@   r2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter16_neon, export=1
+        push            {r4-r6,lr}
+        vpush           {d8-d15}
+        sub             r3, r0, #4              @ where to start reading
+        vldr            d0, .Lcoeffs
+        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
+        sub             r0, r0, #1              @ where to start writing
+        vld1.32         {d3}, [r3], r1
+        add             r4, r0, r1, lsl #2
+        vld1.32         {d10}, [r3], r1
+        vld1.32         {d11}, [r3], r1
+        vld1.32         {d16}, [r3], r1
+        vld1.32         {d4}, [r3], r1
+        vld1.32         {d8}, [r3], r1
+        vtrn.8          d2, d3                  @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
+        vld1.32         {d14}, [r3], r1
+        vld1.32         {d5}, [r3], r1
+        vtrn.8          d10, d11                @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
+        vld1.32         {d6}, [r3], r1
+        vld1.32         {d12}, [r3], r1
+        vtrn.8          d16, d4                 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
+        vld1.32         {d13}, [r3], r1
+        vtrn.16         d2, d10                 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
+        vld1.32         {d1}, [r3], r1
+        vtrn.8          d8, d14                 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
+        vld1.32         {d7}, [r3], r1
+        vtrn.16         d3, d11                 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
+        vld1.32         {d9}, [r3], r1
+        vtrn.8          d5, d6                  @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
+        vld1.32         {d15}, [r3]
+        vtrn.16         d16, d8                 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
+        vtrn.16         d4, d14                 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
+        vtrn.8          d12, d13                @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
+        vdup.16         q9, r2                  @ pq
+        vtrn.8          d1, d7                  @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
+        vtrn.32         d2, d16                 @ P1[0..7], P5[0..7]
+        vtrn.16         d5, d12                 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
+        vtrn.16         d6, d13                 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
+        vtrn.8          d9, d15                 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
+        vtrn.32         d3, d4                  @ P2[0..7], P6[0..7]
+        vshll.u8        q10, d2, #1             @ 2*P1[0..7]
+        vtrn.32         d10, d8                 @ P3[0..7], P7[0..7]
+        vshll.u8        q11, d16, #1            @ 2*P5[0..7]
+        vtrn.32         d11, d14                @ P4[0..7], P8[0..7]
+        vtrn.16         d1, d9                  @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
+        vtrn.16         d7, d15                 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
+        vmovl.u8        q1, d3                  @ P2[0..7]
+        vmovl.u8        q12, d4                 @ P6[0..7]
+        vtrn.32         d5, d1                  @ P1[8..15], P5[8..15]
+        vtrn.32         d6, d7                  @ P2[8..15], P6[8..15]
+        vtrn.32         d12, d9                 @ P3[8..15], P7[8..15]
+        vtrn.32         d13, d15                @ P4[8..15], P8[8..15]
+        vmls.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
+        vmovl.u8        q1, d10                 @ P3[0..7]
+        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
+        vshll.u8        q13, d1, #1             @ 2*P5[8..15]
+        vmls.i16        q11, q12, d0[1]         @ 2*P5[0..7]-5*P6[0..7]
+        vmovl.u8        q14, d6                 @ P2[8..15]
+        vmovl.u8        q3, d7                  @ P6[8..15]
+        vmovl.u8        q15, d8                 @ P7[0..7]
+        vmla.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+        vmovl.u8        q1, d12                 @ P3[8..15]
+        vmls.i16        q2, q14, d0[1]          @ 2*P1[8..15]-5*P2[8..15]
+        vmovl.u8        q4, d9                  @ P7[8..15]
+        vshll.u8        q14, d10, #1            @ 2*P3[0..7]
+        vmls.i16        q13, q3, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
+        vmovl.u8        q5, d11                 @ P4[0..7]
+        vmla.i16        q11, q15, d0[1]         @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+        vshll.u8        q15, d12, #1            @ 2*P3[8..15]
+        vmovl.u8        q6, d13                 @ P4[8..15]
+        vmla.i16        q2, q1, d0[1]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+        vmovl.u8        q1, d14                 @ P8[0..7]
+        vmovl.u8        q7, d15                 @ P8[8..15]
+        vmla.i16        q13, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+        vmovl.u8        q4, d16                 @ P5[0..7]
+        vmovl.u8        q8, d1                  @ P5[8..15]
+        vmls.i16        q14, q5, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
+        vmls.i16        q15, q6, d0[1]          @ 2*P3[8..15]-5*P4[8..15]
+        vmls.i16        q10, q5, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+        vmls.i16        q11, q1, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+        vsub.i16        q1, q5, q4              @ P4[0..7]-P5[0..7]
+        vmls.i16        q2, q6, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+        vrshr.s16       q10, q10, #3
+        vmls.i16        q13, q7, d0[0]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+        vsub.i16        q7, q6, q8              @ P4[8..15]-P5[8..15]
+        vrshr.s16       q11, q11, #3
+        vmla.s16        q14, q4, d0[1]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] (NOTE(review): .s16 here vs .i16 elsewhere - equivalent spellings of the same vmla encoding)
+        vrshr.s16       q2, q2, #3
+        vmla.i16        q15, q8, d0[1]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+        vabs.s16        q10, q10                @ a1[0..7]
+        vrshr.s16       q13, q13, #3
+        vmls.i16        q15, q3, d0[0]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+        vabs.s16        q3, q11                 @ a2[0..7]
+        vabs.s16        q2, q2                  @ a1[8..15]
+        vmls.i16        q14, q12, d0[0]         @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+        vabs.s16        q11, q1
+        vabs.s16        q12, q13                @ a2[8..15]
+        vcge.s16        q13, q10, q3            @ test a1[0..7] >= a2[0..7]
+        vshr.s16        q1, q1, #8              @ clip_sign[0..7]
+        vrshr.s16       q15, q15, #3
+        vshr.s16        q11, q11, #1            @ clip[0..7]
+        vrshr.s16       q14, q14, #3
+        vbsl            q13, q3, q10            @ a3[0..7]
+        vcge.s16        q3, q2, q12             @ test a1[8..15] >= a2[8..15]
+        vabs.s16        q10, q15                @ a0[8..15]
+        vshr.s16        q15, q15, #8            @ a0_sign[8..15]
+        vbsl            q3, q12, q2             @ a3[8..15]
+        vabs.s16        q2, q14                 @ a0[0..7]
+        vabs.s16        q12, q7
+        vshr.s16        q7, q7, #8              @ clip_sign[8..15]
+        vshr.s16        q14, q14, #8            @ a0_sign[0..7]
+        vshr.s16        q12, q12, #1            @ clip[8..15]
+        vsub.i16        q7, q7, q15             @ clip_sign[8..15] - a0_sign[8..15]
+        vqsub.u16       q15, q10, q3            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        vcge.s16        q3, q3, q10             @ test a3[8..15] >= a0[8..15]
+        vcge.s16        q10, q10, q9            @ test a0[8..15] >= pq
+        vcge.s16        q9, q2, q9              @ test a0[0..7] >= pq
+        vsub.i16        q1, q1, q14             @ clip_sign[0..7] - a0_sign[0..7]
+        vqsub.u16       q14, q2, q13            @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        vcge.s16        q2, q13, q2             @ test a3[0..7] >= a0[0..7]
+        vmul.i16        q13, q15, d0[1]         @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+        vceq.i16        q15, q11, #0            @ test clip[0..7] == 0
+        vmul.i16        q0, q14, d0[1]          @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+        vorr            q9, q15, q9             @ test clip[0..7] == 0 || a0[0..7] >= pq
+        vceq.i16        q14, q12, #0            @ test clip[8..15] == 0
+        vshr.u16        q13, q13, #3            @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+        vorr            q2, q9, q2              @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+        vshr.u16        q0, q0, #3              @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+        vorr            q10, q14, q10           @ test clip[8..15] == 0 || a0[8..15] >= pq
+        vcge.s16        q14, q13, q12           @ test d[8..15] >= clip[8..15]
+        vmov.32         r2, d4[1]               @ move to gp reg
+        vorr            q3, q10, q3             @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+        vmov.32         r3, d5[1]
+        vcge.s16        q2, q0, q11             @ test d[0..7] >= clip[0..7]
+        vbsl            q14, q12, q13           @ FFMIN(d[8..15], clip[8..15])
+        vbsl            q2, q11, q0             @ FFMIN(d[0..7], clip[0..7])
+        vmov.32         r5, d6[1]
+        vbic            q0, q14, q10            @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+        vmov.32         r6, d7[1]
+        and             r12, r2, r3
+        vbic            q2, q2, q9              @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+        vmls.i16        q6, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
+        vmls.i16        q5, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
+        and             r14, r5, r6
+        vmla.i16        q4, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
+        and             r12, r12, r14
+        vqmovun.s16     d4, q6
+        vmla.i16        q8, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
+        tst             r12, #1
+        bne             4f                      @ none of the 16 pixel pairs should be updated in this case
+        vqmovun.s16     d2, q5
+        vqmovun.s16     d3, q4
+        vqmovun.s16     d5, q8
+        tst             r2, #1
+        bne             1f
+        vst2.8          {d2[0], d3[0]}, [r0], r1
+        vst2.8          {d2[1], d3[1]}, [r0], r1
+        vst2.8          {d2[2], d3[2]}, [r0], r1
+        vst2.8          {d2[3], d3[3]}, [r0]
+1:      add             r0, r4, r1, lsl #2
+        tst             r3, #1
+        bne             2f
+        vst2.8          {d2[4], d3[4]}, [r4], r1
+        vst2.8          {d2[5], d3[5]}, [r4], r1
+        vst2.8          {d2[6], d3[6]}, [r4], r1
+        vst2.8          {d2[7], d3[7]}, [r4]
+2:      add             r4, r0, r1, lsl #2
+        tst             r5, #1
+        bne             3f
+        vst2.8          {d4[0], d5[0]}, [r0], r1
+        vst2.8          {d4[1], d5[1]}, [r0], r1
+        vst2.8          {d4[2], d5[2]}, [r0], r1
+        vst2.8          {d4[3], d5[3]}, [r0]
+3:      tst             r6, #1
+        bne             4f
+        vst2.8          {d4[4], d5[4]}, [r4], r1
+        vst2.8          {d4[5], d5[5]}, [r4], r1
+        vst2.8          {d4[6], d5[6]}, [r4], r1
+        vst2.8          {d4[7], d5[7]}, [r4]
+4:      vpop            {d8-d15}
+        pop             {r4-r6,pc}
+endfunc
17854+
17855+@ Copy at most the specified number of bytes from source to destination buffer,
17856+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
17857+@ On entry:
17858+@   r0 -> source buffer
17859+@   r1 = max number of bytes to copy
17860+@   r2 -> destination buffer, optimally 8-byte aligned
17861+@ On exit:
17862+@   r0 = number of bytes not copied
17863+function ff_vc1_unescape_buffer_helper_neon, export=1
17864+        @ Offset by 48 to screen out cases that are too short for us to handle,
17865+        @ and also make it easy to test for loop termination, or to determine
17866+        @ whether we need an odd number of half-iterations of the loop.
17867+        subs    r1, r1, #48
17868+        bmi     90f
17869+
17870+        @ Set up useful constants
17871+        vmov.i32        q0, #0x3000000
17872+        vmov.i32        q1, #0x30000
17873+
17874+        tst             r1, #16
17875+        bne             1f
17876+
17877+          vld1.8          {q8, q9}, [r0]!
17878+          vbic            q12, q8, q0
17879+          vext.8          q13, q8, q9, #1
17880+          vext.8          q14, q8, q9, #2
17881+          vext.8          q15, q8, q9, #3
17882+          veor            q12, q12, q1
17883+          vbic            q13, q13, q0
17884+          vbic            q14, q14, q0
17885+          vbic            q15, q15, q0
17886+          vceq.i32        q12, q12, #0
17887+          veor            q13, q13, q1
17888+          veor            q14, q14, q1
17889+          veor            q15, q15, q1
17890+          vceq.i32        q13, q13, #0
17891+          vceq.i32        q14, q14, #0
17892+          vceq.i32        q15, q15, #0
17893+          add             r1, r1, #16
17894+          b               3f
17895+
17896+1:      vld1.8          {q10, q11}, [r0]!
17897+        vbic            q12, q10, q0
17898+        vext.8          q13, q10, q11, #1
17899+        vext.8          q14, q10, q11, #2
17900+        vext.8          q15, q10, q11, #3
17901+        veor            q12, q12, q1
17902+        vbic            q13, q13, q0
17903+        vbic            q14, q14, q0
17904+        vbic            q15, q15, q0
17905+        vceq.i32        q12, q12, #0
17906+        veor            q13, q13, q1
17907+        veor            q14, q14, q1
17908+        veor            q15, q15, q1
17909+        vceq.i32        q13, q13, #0
17910+        vceq.i32        q14, q14, #0
17911+        vceq.i32        q15, q15, #0
17912+        @ Drop through...
17913+2:        vmov            q8, q11
17914+          vld1.8          {q9}, [r0]!
17915+        vorr            q13, q12, q13
17916+        vorr            q15, q14, q15
17917+          vbic            q12, q8, q0
17918+        vorr            q3, q13, q15
17919+          vext.8          q13, q8, q9, #1
17920+          vext.8          q14, q8, q9, #2
17921+          vext.8          q15, q8, q9, #3
17922+          veor            q12, q12, q1
17923+        vorr            d6, d6, d7
17924+          vbic            q13, q13, q0
17925+          vbic            q14, q14, q0
17926+          vbic            q15, q15, q0
17927+          vceq.i32        q12, q12, #0
17928+        vmov            r3, r12, d6
17929+          veor            q13, q13, q1
17930+          veor            q14, q14, q1
17931+          veor            q15, q15, q1
17932+          vceq.i32        q13, q13, #0
17933+          vceq.i32        q14, q14, #0
17934+          vceq.i32        q15, q15, #0
17935+        orrs            r3, r3, r12
17936+        bne             90f
17937+        vst1.64         {q10}, [r2]!
17938+3:          vmov            q10, q9
17939+            vld1.8          {q11}, [r0]!
17940+          vorr            q13, q12, q13
17941+          vorr            q15, q14, q15
17942+            vbic            q12, q10, q0
17943+          vorr            q3, q13, q15
17944+            vext.8          q13, q10, q11, #1
17945+            vext.8          q14, q10, q11, #2
17946+            vext.8          q15, q10, q11, #3
17947+            veor            q12, q12, q1
17948+          vorr            d6, d6, d7
17949+            vbic            q13, q13, q0
17950+            vbic            q14, q14, q0
17951+            vbic            q15, q15, q0
17952+            vceq.i32        q12, q12, #0
17953+          vmov            r3, r12, d6
17954+            veor            q13, q13, q1
17955+            veor            q14, q14, q1
17956+            veor            q15, q15, q1
17957+            vceq.i32        q13, q13, #0
17958+            vceq.i32        q14, q14, #0
17959+            vceq.i32        q15, q15, #0
17960+          orrs            r3, r3, r12
17961+          bne             91f
17962+          vst1.64         {q8}, [r2]!
17963+        subs            r1, r1, #32
17964+        bpl             2b
17965+
17966+90:     add             r0, r1, #48
17967+        bx              lr
17968+
17969+91:     sub             r1, r1, #16
17970+        b               90b
17971+endfunc
17972--- a/libavcodec/avcodec.h
17973+++ b/libavcodec/avcodec.h
17974@@ -2567,6 +2567,17 @@ typedef struct AVHWAccel {
17975      * that avctx->hwaccel_priv_data is invalid.
17976      */
17977     int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
17978+
17979+    /**
17980+     * Called if parsing fails
17981+     *
17982+     * An error has occurred; end_frame will not be called
17983+     * start_frame & decode_slice may or may not have been called
17984+     * Optional
17985+     *
17986+     * @param avctx the codec context
17987+     */
17988+    void (*abort_frame)(AVCodecContext *avctx);
17989 } AVHWAccel;
17990
17991 /**
17992--- a/libavcodec/cabac.h
17993+++ b/libavcodec/cabac.h
17994@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_table
17995 typedef struct CABACContext{
17996     int low;
17997     int range;
17998-    int outstanding_count;
17999+    union
18000+    {
18001+        int outstanding_count;
18002+        struct {
18003+            uint16_t bits;
18004+            uint16_t range;
18005+        } by22;
18006+    };
18007     const uint8_t *bytestream_start;
18008     const uint8_t *bytestream;
18009     const uint8_t *bytestream_end;
18010--- a/libavcodec/codec.h
18011+++ b/libavcodec/codec.h
18012@@ -350,6 +350,17 @@ const AVCodec *av_codec_iterate(void **o
18013 AVCodec *avcodec_find_decoder(enum AVCodecID id);
18014
18015 /**
18016+ * Find a registered decoder with a matching codec ID and pix_fmt.
18017+ * A decoder with pix_fmt set to NULL will match any fmt.
18018+ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
18019+ *
18020+ * @param id AVCodecID of the requested decoder
18021+ * @param fmt AVPixelFormat that must be supported by the decoder
18022+ * @return A decoder if one was found, NULL otherwise.
18023+ */
18024+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
18025+
18026+/**
18027  * Find a registered decoder with the specified name.
18028  *
18029  * @param name name of the requested decoder
18030--- /dev/null
18031+++ b/libavcodec/hevc-ctrls-v1.h
18032@@ -0,0 +1,229 @@
18033+/* SPDX-License-Identifier: GPL-2.0 */
18034+/*
18035+ * These are the HEVC state controls for use with stateless HEVC
18036+ * codec drivers.
18037+ *
18038+ * It turns out that these structs are not stable yet and will undergo
18039+ * more changes. So keep them private until they are stable and ready to
18040+ * become part of the official public API.
18041+ */
18042+
18043+#ifndef _HEVC_CTRLS_H_
18044+#define _HEVC_CTRLS_H_
18045+
18046+#include <linux/videodev2.h>
18047+
18048+/* The pixel format isn't stable at the moment and will likely be renamed. */
18049+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
18050+
18051+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_MPEG_BASE + 1008)
18052+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_MPEG_BASE + 1009)
18053+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_MPEG_BASE + 1010)
18054+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_MPEG_BASE + 1011)
18055+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_MPEG_BASE + 1015)
18056+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_MPEG_BASE + 1016)
18057+
18058+/* enum v4l2_ctrl_type type values */
18059+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
18060+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
18061+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
18062+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
18063+
18064+enum v4l2_mpeg_video_hevc_decode_mode {
18065+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
18066+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
18067+};
18068+
18069+enum v4l2_mpeg_video_hevc_start_code {
18070+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
18071+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
18072+};
18073+
18074+#define V4L2_HEVC_SLICE_TYPE_B	0
18075+#define V4L2_HEVC_SLICE_TYPE_P	1
18076+#define V4L2_HEVC_SLICE_TYPE_I	2
18077+
18078+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
18079+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
18080+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
18081+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
18082+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
18083+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
18084+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
18085+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
18086+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
18087+
18088+/* The controls are not stable at the moment and will likely be reworked. */
18089+struct v4l2_ctrl_hevc_sps {
18090+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
18091+	__u16	pic_width_in_luma_samples;
18092+	__u16	pic_height_in_luma_samples;
18093+	__u8	bit_depth_luma_minus8;
18094+	__u8	bit_depth_chroma_minus8;
18095+	__u8	log2_max_pic_order_cnt_lsb_minus4;
18096+	__u8	sps_max_dec_pic_buffering_minus1;
18097+	__u8	sps_max_num_reorder_pics;
18098+	__u8	sps_max_latency_increase_plus1;
18099+	__u8	log2_min_luma_coding_block_size_minus3;
18100+	__u8	log2_diff_max_min_luma_coding_block_size;
18101+	__u8	log2_min_luma_transform_block_size_minus2;
18102+	__u8	log2_diff_max_min_luma_transform_block_size;
18103+	__u8	max_transform_hierarchy_depth_inter;
18104+	__u8	max_transform_hierarchy_depth_intra;
18105+	__u8	pcm_sample_bit_depth_luma_minus1;
18106+	__u8	pcm_sample_bit_depth_chroma_minus1;
18107+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
18108+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
18109+	__u8	num_short_term_ref_pic_sets;
18110+	__u8	num_long_term_ref_pics_sps;
18111+	__u8	chroma_format_idc;
18112+	__u8	sps_max_sub_layers_minus1;
18113+
18114+	__u64	flags;
18115+};
18116+
18117+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 0)
18118+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
18119+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
18120+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
18121+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
18122+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
18123+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
18124+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
18125+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
18126+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
18127+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
18128+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
18129+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
18130+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
18131+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
18132+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
18133+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
18134+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
18135+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
18136+
18137+struct v4l2_ctrl_hevc_pps {
18138+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
18139+	__u8	num_extra_slice_header_bits;
18140+	__s8	init_qp_minus26;
18141+	__u8	diff_cu_qp_delta_depth;
18142+	__s8	pps_cb_qp_offset;
18143+	__s8	pps_cr_qp_offset;
18144+	__u8	num_tile_columns_minus1;
18145+	__u8	num_tile_rows_minus1;
18146+	__u8	column_width_minus1[20];
18147+	__u8	row_height_minus1[22];
18148+	__s8	pps_beta_offset_div2;
18149+	__s8	pps_tc_offset_div2;
18150+	__u8	log2_parallel_merge_level_minus2;
18151+
18152+	__u8	padding[4];
18153+	__u64	flags;
18154+};
18155+
18156+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
18157+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
18158+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
18159+
18160+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
18161+
18162+struct v4l2_hevc_dpb_entry {
18163+	__u64	timestamp;
18164+	__u8	rps;
18165+	__u8	field_pic;
18166+	__u16	pic_order_cnt[2];
18167+	__u8	padding[2];
18168+};
18169+
18170+struct v4l2_hevc_pred_weight_table {
18171+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18172+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18173+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18174+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18175+
18176+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18177+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18178+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18179+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18180+
18181+	__u8	padding[6];
18182+
18183+	__u8	luma_log2_weight_denom;
18184+	__s8	delta_chroma_log2_weight_denom;
18185+};
18186+
18187+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
18188+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
18189+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
18190+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
18191+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
18192+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
18193+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
18194+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
18195+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
18196+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 9)
18197+
18198+struct v4l2_ctrl_hevc_slice_params {
18199+	__u32	bit_size;
18200+	__u32	data_bit_offset;
18201+
18202+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18203+	__u32	slice_segment_addr;
18204+	__u32	num_entry_point_offsets;
18205+
18206+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
18207+	__u8	nal_unit_type;
18208+	__u8	nuh_temporal_id_plus1;
18209+
18210+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18211+	__u8	slice_type;
18212+	__u8	colour_plane_id;
18213+	__u16	slice_pic_order_cnt;
18214+	__u8	num_ref_idx_l0_active_minus1;
18215+	__u8	num_ref_idx_l1_active_minus1;
18216+	__u8	collocated_ref_idx;
18217+	__u8	five_minus_max_num_merge_cand;
18218+	__s8	slice_qp_delta;
18219+	__s8	slice_cb_qp_offset;
18220+	__s8	slice_cr_qp_offset;
18221+	__s8	slice_act_y_qp_offset;
18222+	__s8	slice_act_cb_qp_offset;
18223+	__s8	slice_act_cr_qp_offset;
18224+	__s8	slice_beta_offset_div2;
18225+	__s8	slice_tc_offset_div2;
18226+
18227+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
18228+	__u8	pic_struct;
18229+
18230+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18231+	__u8	num_active_dpb_entries;
18232+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18233+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18234+
18235+	__u8	num_rps_poc_st_curr_before;
18236+	__u8	num_rps_poc_st_curr_after;
18237+	__u8	num_rps_poc_lt_curr;
18238+
18239+	__u8	padding;
18240+
18241+	__u32	entry_point_offset_minus1[256];
18242+
18243+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18244+	struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18245+
18246+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
18247+	struct v4l2_hevc_pred_weight_table pred_weight_table;
18248+
18249+	__u64	flags;
18250+};
18251+
18252+struct v4l2_ctrl_hevc_scaling_matrix {
18253+	__u8	scaling_list_4x4[6][16];
18254+	__u8	scaling_list_8x8[6][64];
18255+	__u8	scaling_list_16x16[6][64];
18256+	__u8	scaling_list_32x32[2][64];
18257+	__u8	scaling_list_dc_coef_16x16[6];
18258+	__u8	scaling_list_dc_coef_32x32[2];
18259+};
18260+
18261+#endif
18262--- /dev/null
18263+++ b/libavcodec/hevc-ctrls-v2.h
18264@@ -0,0 +1,257 @@
18265+/* SPDX-License-Identifier: GPL-2.0 */
18266+/*
18267+ * These are the HEVC state controls for use with stateless HEVC
18268+ * codec drivers.
18269+ *
18270+ * It turns out that these structs are not stable yet and will undergo
18271+ * more changes. So keep them private until they are stable and ready to
18272+ * become part of the official public API.
18273+ */
18274+
18275+#ifndef _HEVC_CTRLS_H_
18276+#define _HEVC_CTRLS_H_
18277+
18278+#include <linux/videodev2.h>
18279+
18280+/* The pixel format isn't stable at the moment and will likely be renamed. */
18281+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
18282+
18283+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
18284+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
18285+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
18286+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
18287+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
18288+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
18289+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
18290+
18291+/* enum v4l2_ctrl_type type values */
18292+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
18293+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
18294+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
18295+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
18296+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
18297+
18298+enum v4l2_mpeg_video_hevc_decode_mode {
18299+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
18300+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
18301+};
18302+
18303+enum v4l2_mpeg_video_hevc_start_code {
18304+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
18305+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
18306+};
18307+
18308+#define V4L2_HEVC_SLICE_TYPE_B	0
18309+#define V4L2_HEVC_SLICE_TYPE_P	1
18310+#define V4L2_HEVC_SLICE_TYPE_I	2
18311+
18312+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
18313+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
18314+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
18315+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
18316+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
18317+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
18318+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
18319+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
18320+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
18321+
18322+/* The controls are not stable at the moment and will likely be reworked. */
18323+struct v4l2_ctrl_hevc_sps {
18324+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
18325+	__u16	pic_width_in_luma_samples;
18326+	__u16	pic_height_in_luma_samples;
18327+	__u8	bit_depth_luma_minus8;
18328+	__u8	bit_depth_chroma_minus8;
18329+	__u8	log2_max_pic_order_cnt_lsb_minus4;
18330+	__u8	sps_max_dec_pic_buffering_minus1;
18331+	__u8	sps_max_num_reorder_pics;
18332+	__u8	sps_max_latency_increase_plus1;
18333+	__u8	log2_min_luma_coding_block_size_minus3;
18334+	__u8	log2_diff_max_min_luma_coding_block_size;
18335+	__u8	log2_min_luma_transform_block_size_minus2;
18336+	__u8	log2_diff_max_min_luma_transform_block_size;
18337+	__u8	max_transform_hierarchy_depth_inter;
18338+	__u8	max_transform_hierarchy_depth_intra;
18339+	__u8	pcm_sample_bit_depth_luma_minus1;
18340+	__u8	pcm_sample_bit_depth_chroma_minus1;
18341+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
18342+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
18343+	__u8	num_short_term_ref_pic_sets;
18344+	__u8	num_long_term_ref_pics_sps;
18345+	__u8	chroma_format_idc;
18346+	__u8	sps_max_sub_layers_minus1;
18347+
18348+	__u64	flags;
18349+};
18350+
18351+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
18352+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
18353+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
18354+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
18355+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
18356+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
18357+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
18358+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
18359+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
18360+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
18361+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
18362+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
18363+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
18364+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
18365+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
18366+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
18367+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
18368+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
18369+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
18370+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
18371+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
18372+
18373+struct v4l2_ctrl_hevc_pps {
18374+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
18375+	__u8	num_extra_slice_header_bits;
18376+	__u8	num_ref_idx_l0_default_active_minus1;
18377+	__u8	num_ref_idx_l1_default_active_minus1;
18378+	__s8	init_qp_minus26;
18379+	__u8	diff_cu_qp_delta_depth;
18380+	__s8	pps_cb_qp_offset;
18381+	__s8	pps_cr_qp_offset;
18382+	__u8	num_tile_columns_minus1;
18383+	__u8	num_tile_rows_minus1;
18384+	__u8	column_width_minus1[20];
18385+	__u8	row_height_minus1[22];
18386+	__s8	pps_beta_offset_div2;
18387+	__s8	pps_tc_offset_div2;
18388+	__u8	log2_parallel_merge_level_minus2;
18389+
18390+	__u8	padding[4];
18391+	__u64	flags;
18392+};
18393+
18394+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
18395+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
18396+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
18397+
18398+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
18399+
18400+struct v4l2_hevc_dpb_entry {
18401+	__u64	timestamp;
18402+	__u8	rps;
18403+	__u8	field_pic;
18404+	__u16	pic_order_cnt[2];
18405+	__u8	padding[2];
18406+};
18407+
18408+struct v4l2_hevc_pred_weight_table {
18409+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18410+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18411+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18412+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18413+
18414+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18415+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18416+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18417+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18418+
18419+	__u8	padding[6];
18420+
18421+	__u8	luma_log2_weight_denom;
18422+	__s8	delta_chroma_log2_weight_denom;
18423+};
18424+
18425+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
18426+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
18427+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
18428+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
18429+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
18430+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
18431+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
18432+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
18433+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
18434+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
18435+
18436+struct v4l2_ctrl_hevc_slice_params {
18437+	__u32	bit_size;
18438+	__u32	data_bit_offset;
18439+
18440+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18441+	__u32	slice_segment_addr;
18442+	__u32	num_entry_point_offsets;
18443+
18444+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
18445+	__u8	nal_unit_type;
18446+	__u8	nuh_temporal_id_plus1;
18447+
18448+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18449+	__u8	slice_type;
18450+	__u8	colour_plane_id;
18451+	__u16	slice_pic_order_cnt;
18452+	__u8	num_ref_idx_l0_active_minus1;
18453+	__u8	num_ref_idx_l1_active_minus1;
18454+	__u8	collocated_ref_idx;
18455+	__u8	five_minus_max_num_merge_cand;
18456+	__s8	slice_qp_delta;
18457+	__s8	slice_cb_qp_offset;
18458+	__s8	slice_cr_qp_offset;
18459+	__s8	slice_act_y_qp_offset;
18460+	__s8	slice_act_cb_qp_offset;
18461+	__s8	slice_act_cr_qp_offset;
18462+	__s8	slice_beta_offset_div2;
18463+	__s8	slice_tc_offset_div2;
18464+
18465+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
18466+	__u8	pic_struct;
18467+
18468+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18469+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18470+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18471+
18472+	__u8	padding[5];
18473+
18474+	__u32	entry_point_offset_minus1[256];
18475+
18476+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
18477+	struct v4l2_hevc_pred_weight_table pred_weight_table;
18478+
18479+	__u64	flags;
18480+};
18481+
18482+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
18483+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
18484+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
18485+
18486+struct v4l2_ctrl_hevc_decode_params {
18487+	__s32	pic_order_cnt_val;
18488+	__u8	num_active_dpb_entries;
18489+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18490+	__u8	num_poc_st_curr_before;
18491+	__u8	num_poc_st_curr_after;
18492+	__u8	num_poc_lt_curr;
18493+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18494+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18495+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18496+	__u64	flags;
18497+};
18498+
18499+/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
18500+#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
18501+/*
18502+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
18503+ * the number of data (in bits) to skip in the
18504+ * slice segment header.
18505+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
18506+ * to before syntax element "slice_temporal_mvp_enabled_flag".
18507+ * If IDR, the skipped bits are just "pic_output_flag"
18508+ * (separate_colour_plane_flag is not supported).
18509+ */
18510+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
18511+
18512+struct v4l2_ctrl_hevc_scaling_matrix {
18513+	__u8	scaling_list_4x4[6][16];
18514+	__u8	scaling_list_8x8[6][64];
18515+	__u8	scaling_list_16x16[6][64];
18516+	__u8	scaling_list_32x32[2][64];
18517+	__u8	scaling_list_dc_coef_16x16[6];
18518+	__u8	scaling_list_dc_coef_32x32[2];
18519+};
18520+
18521+#endif
18522--- /dev/null
18523+++ b/libavcodec/hevc-ctrls-v3.h
18524@@ -0,0 +1,255 @@
18525+/* SPDX-License-Identifier: GPL-2.0 */
18526+/*
18527+ * These are the HEVC state controls for use with stateless HEVC
18528+ * codec drivers.
18529+ *
18530+ * It turns out that these structs are not stable yet and will undergo
18531+ * more changes. So keep them private until they are stable and ready to
18532+ * become part of the official public API.
18533+ */
18534+
18535+#ifndef _HEVC_CTRLS_H_
18536+#define _HEVC_CTRLS_H_
18537+
18538+#include <linux/videodev2.h>
18539+
18540+/* The pixel format isn't stable at the moment and will likely be renamed. */
18541+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
18542+
18543+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
18544+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
18545+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
18546+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
18547+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
18548+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
18549+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
18550+
18551+/* enum v4l2_ctrl_type type values */
18552+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
18553+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
18554+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
18555+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
18556+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
18557+
18558+enum v4l2_mpeg_video_hevc_decode_mode {
18559+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
18560+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
18561+};
18562+
18563+enum v4l2_mpeg_video_hevc_start_code {
18564+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
18565+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
18566+};
18567+
18568+#define V4L2_HEVC_SLICE_TYPE_B	0
18569+#define V4L2_HEVC_SLICE_TYPE_P	1
18570+#define V4L2_HEVC_SLICE_TYPE_I	2
18571+
18572+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
18573+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
18574+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
18575+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
18576+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
18577+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
18578+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
18579+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
18580+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
18581+
18582+/* The controls are not stable at the moment and will likely be reworked. */
18583+struct v4l2_ctrl_hevc_sps {
18584+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
18585+	__u16	pic_width_in_luma_samples;
18586+	__u16	pic_height_in_luma_samples;
18587+	__u8	bit_depth_luma_minus8;
18588+	__u8	bit_depth_chroma_minus8;
18589+	__u8	log2_max_pic_order_cnt_lsb_minus4;
18590+	__u8	sps_max_dec_pic_buffering_minus1;
18591+	__u8	sps_max_num_reorder_pics;
18592+	__u8	sps_max_latency_increase_plus1;
18593+	__u8	log2_min_luma_coding_block_size_minus3;
18594+	__u8	log2_diff_max_min_luma_coding_block_size;
18595+	__u8	log2_min_luma_transform_block_size_minus2;
18596+	__u8	log2_diff_max_min_luma_transform_block_size;
18597+	__u8	max_transform_hierarchy_depth_inter;
18598+	__u8	max_transform_hierarchy_depth_intra;
18599+	__u8	pcm_sample_bit_depth_luma_minus1;
18600+	__u8	pcm_sample_bit_depth_chroma_minus1;
18601+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
18602+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
18603+	__u8	num_short_term_ref_pic_sets;
18604+	__u8	num_long_term_ref_pics_sps;
18605+	__u8	chroma_format_idc;
18606+	__u8	sps_max_sub_layers_minus1;
18607+
18608+	__u64	flags;
18609+};
18610+
18611+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
18612+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
18613+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
18614+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
18615+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
18616+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
18617+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
18618+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
18619+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
18620+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
18621+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
18622+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
18623+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
18624+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
18625+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
18626+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
18627+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
18628+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
18629+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
18630+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
18631+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
18632+
18633+struct v4l2_ctrl_hevc_pps {
18634+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
18635+	__u8	num_extra_slice_header_bits;
18636+	__u8	num_ref_idx_l0_default_active_minus1;
18637+	__u8	num_ref_idx_l1_default_active_minus1;
18638+	__s8	init_qp_minus26;
18639+	__u8	diff_cu_qp_delta_depth;
18640+	__s8	pps_cb_qp_offset;
18641+	__s8	pps_cr_qp_offset;
18642+	__u8	num_tile_columns_minus1;
18643+	__u8	num_tile_rows_minus1;
18644+	__u8	column_width_minus1[20];
18645+	__u8	row_height_minus1[22];
18646+	__s8	pps_beta_offset_div2;
18647+	__s8	pps_tc_offset_div2;
18648+	__u8	log2_parallel_merge_level_minus2;
18649+
18650+	__u8	padding[4];
18651+	__u64	flags;
18652+};
18653+
18654+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
18655+
18656+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
18657+
18658+struct v4l2_hevc_dpb_entry {
18659+	__u64	timestamp;
18660+	__u8	flags;
18661+	__u8	field_pic;
18662+	__u16	pic_order_cnt[2];
18663+	__u8	padding[2];
18664+};
18665+
18666+struct v4l2_hevc_pred_weight_table {
18667+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18668+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18669+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18670+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18671+
18672+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18673+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18674+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18675+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
18676+
18677+	__u8	padding[6];
18678+
18679+	__u8	luma_log2_weight_denom;
18680+	__s8	delta_chroma_log2_weight_denom;
18681+};
18682+
18683+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
18684+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
18685+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
18686+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
18687+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
18688+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
18689+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
18690+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
18691+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
18692+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
18693+
18694+struct v4l2_ctrl_hevc_slice_params {
18695+	__u32	bit_size;
18696+	__u32	data_bit_offset;
18697+
18698+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18699+	__u32	slice_segment_addr;
18700+	__u32	num_entry_point_offsets;
18701+
18702+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
18703+	__u8	nal_unit_type;
18704+	__u8	nuh_temporal_id_plus1;
18705+
18706+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18707+	__u8	slice_type;
18708+	__u8	colour_plane_id;
18709+	__u16	slice_pic_order_cnt;
18710+	__u8	num_ref_idx_l0_active_minus1;
18711+	__u8	num_ref_idx_l1_active_minus1;
18712+	__u8	collocated_ref_idx;
18713+	__u8	five_minus_max_num_merge_cand;
18714+	__s8	slice_qp_delta;
18715+	__s8	slice_cb_qp_offset;
18716+	__s8	slice_cr_qp_offset;
18717+	__s8	slice_act_y_qp_offset;
18718+	__s8	slice_act_cb_qp_offset;
18719+	__s8	slice_act_cr_qp_offset;
18720+	__s8	slice_beta_offset_div2;
18721+	__s8	slice_tc_offset_div2;
18722+
18723+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
18724+	__u8	pic_struct;
18725+
18726+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
18727+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18728+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18729+
18730+	__u8	padding[5];
18731+
18732+	__u32	entry_point_offset_minus1[256];
18733+
18734+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
18735+	struct v4l2_hevc_pred_weight_table pred_weight_table;
18736+
18737+	__u64	flags;
18738+};
18739+
18740+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
18741+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
18742+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
18743+
18744+struct v4l2_ctrl_hevc_decode_params {
18745+	__s32	pic_order_cnt_val;
18746+	__u8	num_active_dpb_entries;
18747+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18748+	__u8	num_poc_st_curr_before;
18749+	__u8	num_poc_st_curr_after;
18750+	__u8	num_poc_lt_curr;
18751+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18752+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18753+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
18754+	__u64	flags;
18755+};
18756+
18757+struct v4l2_ctrl_hevc_scaling_matrix {
18758+	__u8	scaling_list_4x4[6][16];
18759+	__u8	scaling_list_8x8[6][64];
18760+	__u8	scaling_list_16x16[6][64];
18761+	__u8	scaling_list_32x32[2][64];
18762+	__u8	scaling_list_dc_coef_16x16[6];
18763+	__u8	scaling_list_dc_coef_32x32[2];
18764+};
18765+
18766+/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
18767+#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
18768+/*
18769+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
18770+ * the number of data (in bits) to skip in the
18771+ * slice segment header.
18772+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
18773+ * to before syntax element "slice_temporal_mvp_enabled_flag".
18774+ * If IDR, the skipped bits are just "pic_output_flag"
18775+ * (separate_colour_plane_flag is not supported).
18776+ */
18777+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
18778+
18779+#endif
18780--- /dev/null
18781+++ b/libavcodec/hevc-ctrls-v4.h
18782@@ -0,0 +1,515 @@
18783+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
18784+/*
18785+ *  Video for Linux Two controls header file
18786+ *
18787+ *  Copyright (C) 1999-2012 the contributors
18788+ *
18789+ *  This program is free software; you can redistribute it and/or modify
18790+ *  it under the terms of the GNU General Public License as published by
18791+ *  the Free Software Foundation; either version 2 of the License, or
18792+ *  (at your option) any later version.
18793+ *
18794+ *  This program is distributed in the hope that it will be useful,
18795+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
18796+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18797+ *  GNU General Public License for more details.
18798+ *
18799+ *  Alternatively you can redistribute this file under the terms of the
18800+ *  BSD license as stated below:
18801+ *
18802+ *  Redistribution and use in source and binary forms, with or without
18803+ *  modification, are permitted provided that the following conditions
18804+ *  are met:
18805+ *  1. Redistributions of source code must retain the above copyright
18806+ *     notice, this list of conditions and the following disclaimer.
18807+ *  2. Redistributions in binary form must reproduce the above copyright
18808+ *     notice, this list of conditions and the following disclaimer in
18809+ *     the documentation and/or other materials provided with the
18810+ *     distribution.
18811+ *  3. The names of its contributors may not be used to endorse or promote
18812+ *     products derived from this software without specific prior written
18813+ *     permission.
18814+ *
18815+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18816+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18817+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18818+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
18819+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
18820+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
18821+ *  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
18822+ *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
18823+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
18824+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
18825+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
18826+ *
18827+ *  The contents of this header was split off from videodev2.h. All control
18828+ *  definitions should be added to this header, which is included by
18829+ *  videodev2.h.
18830+ */
18831+
18832+#ifndef AVCODEC_HEVC_CTRLS_V4_H
18833+#define AVCODEC_HEVC_CTRLS_V4_H
18834+
18835+#include <linux/const.h>
18836+#include <linux/types.h>
18837+
18838+#define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_STATELESS_BASE + 400)
18839+#define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_STATELESS_BASE + 401)
18840+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 402)
18841+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_STATELESS_BASE + 403)
18842+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 404)
18843+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_STATELESS_BASE + 405)
18844+#define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_STATELESS_BASE + 406)
18845+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
18846+
18847+enum v4l2_stateless_hevc_decode_mode {
18848+	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
18849+	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
18850+};
18851+
18852+enum v4l2_stateless_hevc_start_code {
18853+	V4L2_STATELESS_HEVC_START_CODE_NONE,
18854+	V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
18855+};
18856+
18857+#define V4L2_HEVC_SLICE_TYPE_B	0
18858+#define V4L2_HEVC_SLICE_TYPE_P	1
18859+#define V4L2_HEVC_SLICE_TYPE_I	2
18860+
18861+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
18862+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
18863+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
18864+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
18865+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
18866+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
18867+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
18868+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
18869+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
18870+
18871+/**
18872+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
18873+ *
18874+ * @video_parameter_set_id: specifies the value of the
18875+ *			vps_video_parameter_set_id of the active VPS
18876+ * @seq_parameter_set_id: provides an identifier for the SPS for
18877+ *			  reference by other syntax elements
18878+ * @pic_width_in_luma_samples:	specifies the width of each decoded picture
18879+ *				in units of luma samples
18880+ * @pic_height_in_luma_samples: specifies the height of each decoded picture
18881+ *				in units of luma samples
18882+ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
18883+ *                         samples of the luma array
18884+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
18885+ *                           samples of the chroma arrays
18886+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
18887+ *                                     the variable MaxPicOrderCntLsb
18888+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
18889+ *                                    required size of the decoded picture
18890+ *                                    buffer for the codec video sequence
18891+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
18892+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
18893+ *				    value of SpsMaxLatencyPictures array
18894+ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
18895+ *					    luma coding block size
18896+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
18897+ *					      the maximum and minimum luma
18898+ *					      coding block size
18899+ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
18900+ *					       transform block size
18901+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
18902+ *						 the maximum and minimum luma
18903+ *						 transform block size
18904+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
18905+ *					 depth for transform units of
18906+ *					 coding units coded in inter
18907+ *					 prediction mode
18908+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
18909+ *					 depth for transform units of
18910+ *					 coding units coded in intra
18911+ *					 prediction mode
18912+ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
18913+ *                                    bits used to represent each of PCM sample
18914+ *                                    values of the luma component
18915+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
18916+ *                                      of bits used to represent each of PCM
18917+ *                                      sample values of the chroma components
18918+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
18919+ *                                              minimum size of coding blocks
18920+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
18921+ *						  the maximum and minimum size of
18922+ *						  coding blocks
18923+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
18924+ *				 syntax structures included in the SPS
18925+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
18926+ *				reference pictures that are specified in the SPS
18927+ * @chroma_format_idc: specifies the chroma sampling
18928+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
18929+ *                             of temporal sub-layers
18930+ * @reserved: padding field. Should be zeroed by applications.
18931+ * @flags: see V4L2_HEVC_SPS_FLAG_{}
18932+ */
18933+struct v4l2_ctrl_hevc_sps {
18934+	__u8	video_parameter_set_id;
18935+	__u8	seq_parameter_set_id;
18936+	__u16	pic_width_in_luma_samples;
18937+	__u16	pic_height_in_luma_samples;
18938+	__u8	bit_depth_luma_minus8;
18939+	__u8	bit_depth_chroma_minus8;
18940+	__u8	log2_max_pic_order_cnt_lsb_minus4;
18941+	__u8	sps_max_dec_pic_buffering_minus1;
18942+	__u8	sps_max_num_reorder_pics;
18943+	__u8	sps_max_latency_increase_plus1;
18944+	__u8	log2_min_luma_coding_block_size_minus3;
18945+	__u8	log2_diff_max_min_luma_coding_block_size;
18946+	__u8	log2_min_luma_transform_block_size_minus2;
18947+	__u8	log2_diff_max_min_luma_transform_block_size;
18948+	__u8	max_transform_hierarchy_depth_inter;
18949+	__u8	max_transform_hierarchy_depth_intra;
18950+	__u8	pcm_sample_bit_depth_luma_minus1;
18951+	__u8	pcm_sample_bit_depth_chroma_minus1;
18952+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
18953+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
18954+	__u8	num_short_term_ref_pic_sets;
18955+	__u8	num_long_term_ref_pics_sps;
18956+	__u8	chroma_format_idc;
18957+	__u8	sps_max_sub_layers_minus1;
18958+
18959+	__u8	reserved[6];
18960+	__u64	flags;
18961+};
18962+
18963+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
18964+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
18965+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
18966+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
18967+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
18968+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
18969+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
18970+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
18971+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
18972+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
18973+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
18974+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
18975+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
18976+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
18977+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
18978+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
18979+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
18980+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
18981+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
18982+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
18983+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
18984+
18985+/**
18986+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
18987+ *
18988+ * @pic_parameter_set_id: identifies the PPS for reference by other
18989+ *			  syntax elements
18990+ * @num_extra_slice_header_bits: specifies the number of extra slice header
18991+ *				 bits that are present in the slice header RBSP
18992+ *				 for coded pictures referring to the PPS.
18993+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
18994+ *                                        inferred value of num_ref_idx_l0_active_minus1
18995+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
18996+ *                                        inferred value of num_ref_idx_l1_active_minus1
18997+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
18998+ *		     each slice referring to the PPS
18999+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
19000+ *			    tree block size and the minimum luma coding block
19001+ *			    size of coding units that convey cu_qp_delta_abs
19002+ *			    and cu_qp_delta_sign_flag
19003+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
19004+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
19005+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
19006+ *			     partitioning the picture
19007+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
19008+ *			  the picture
19009+ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in
19010+ *			 units of coding tree blocks
19011+ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in
19012+ *		       units of coding tree blocks
19013+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
19014+ *			  beta divided by 2
19015+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
19016+ *			divided by 2
19017+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
19018+ *                                    the variable Log2ParMrgLevel
19019+ * @reserved: padding field. Should be zeroed by applications.
19020+ * @flags: see V4L2_HEVC_PPS_FLAG_{}
19021+ */
19022+struct v4l2_ctrl_hevc_pps {
19023+	__u8	pic_parameter_set_id;
19024+	__u8	num_extra_slice_header_bits;
19025+	__u8	num_ref_idx_l0_default_active_minus1;
19026+	__u8	num_ref_idx_l1_default_active_minus1;
19027+	__s8	init_qp_minus26;
19028+	__u8	diff_cu_qp_delta_depth;
19029+	__s8	pps_cb_qp_offset;
19030+	__s8	pps_cr_qp_offset;
19031+	__u8	num_tile_columns_minus1;
19032+	__u8	num_tile_rows_minus1;
19033+	__u8	column_width_minus1[20];
19034+	__u8	row_height_minus1[22];
19035+	__s8	pps_beta_offset_div2;
19036+	__s8	pps_tc_offset_div2;
19037+	__u8	log2_parallel_merge_level_minus2;
19038+	__u8	reserved;
19039+	__u64	flags;
19040+};
19041+
19042+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
19043+
19044+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME				0
19045+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD			1
19046+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD			2
19047+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM			3
19048+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP			4
19049+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP			5
19050+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM		6
19051+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING			7
19052+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING			8
19053+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM	9
19054+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP	10
19055+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM		11
19056+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP		12
19057+
19058+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
19059+
19060+/**
19061+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
19062+ *
19063+ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
19064+ * @flags: long term flag for the reference frame
19065+ * @field_pic: whether the reference is a field picture or a frame.
19066+ * @reserved: padding field. Should be zeroed by applications.
19067+ * @pic_order_cnt_val: the picture order count of the current picture.
19068+ */
19069+struct v4l2_hevc_dpb_entry {
19070+	__u64	timestamp;
19071+	__u8	flags;
19072+	__u8	field_pic;
19073+	__u16	reserved;
19074+	__s32	pic_order_cnt_val;
19075+};
19076+
19077+/**
19078+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
19079+ *
19080+ * @delta_luma_weight_l0: the difference of the weighting factor applied
19081+ *			  to the luma prediction value for list 0
19082+ * @luma_offset_l0: the additive offset applied to the luma prediction value
19083+ *		    for list 0
19084+ * @delta_chroma_weight_l0: the difference of the weighting factor applied
19085+ *			    to the chroma prediction values for list 0
19086+ * @chroma_offset_l0: the difference of the additive offset applied to
19087+ *		      the chroma prediction values for list 0
19088+ * @delta_luma_weight_l1: the difference of the weighting factor applied
19089+ *			  to the luma prediction value for list 1
19090+ * @luma_offset_l1: the additive offset applied to the luma prediction value
19091+ *		    for list 1
19092+ * @delta_chroma_weight_l1: the difference of the weighting factor applied
19093+ *			    to the chroma prediction values for list 1
19094+ * @chroma_offset_l1: the difference of the additive offset applied to
19095+ *		      the chroma prediction values for list 1
19096+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
19097+ *			    all luma weighting factors
19098+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
19099+ *				    of the denominator for all chroma
19100+ *				    weighting factors
19101+ */
19102+struct v4l2_hevc_pred_weight_table {
19103+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19104+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19105+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
19106+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
19107+
19108+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19109+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19110+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
19111+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
19112+
19113+	__u8	luma_log2_weight_denom;
19114+	__s8	delta_chroma_log2_weight_denom;
19115+};
19116+
19117+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
19118+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
19119+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
19120+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
19121+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
19122+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
19123+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
19124+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
19125+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
19126+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
19127+
19128+/**
19129+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
19130+ *
19131+ * This control is a dynamically sized 1-dimensional array,
19132+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
19133+ *
19134+ * @bit_size: size (in bits) of the current slice data
19135+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
19136+ * @num_entry_point_offsets: specifies the number of entry point offset syntax
19137+ *			     elements in the slice header.
19138+ * @nal_unit_type: specifies the coding type of the slice (B, P or I)
19139+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
19140+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
19141+ * @colour_plane_id: specifies the colour plane associated with the current slice
19142+ * @slice_pic_order_cnt: specifies the picture order count
19143+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
19144+ *                                reference index for reference picture list 0
19145+ *                                that may be used to decode the slice
19146+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
19147+ *                                reference index for reference picture list 1
19148+ *                                that may be used to decode the slice
19149+ * @collocated_ref_idx: specifies the reference index of the collocated picture used
19150+ *			for temporal motion vector prediction
19151+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
19152+ *				   motion vector prediction candidates supported in
19153+ *				   the slice subtracted from 5
19154+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
19155+ *		    blocks in the slice
19156+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
19157+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
19158+ * @slice_act_y_qp_offset: screen content extension parameters
19159+ * @slice_act_cb_qp_offset: screen content extension parameters
19160+ * @slice_act_cr_qp_offset: screen content extension parameters
19161+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
19162+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
19163+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
19164+ *		more fields
19165+ * @reserved0: padding field. Should be zeroed by applications.
19166+ * @slice_segment_addr: specifies the address of the first coding tree block in
19167+ *			the slice segment
19168+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
19169+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
19170+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
19171+ *				 pictures set included in the SPS
19172+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
19173+ *				pictures set included in the SPS
19174+ * @pred_weight_table: the prediction weight coefficients for inter-picture
19175+ *		       prediction
19176+ * @reserved1: padding field. Should be zeroed by applications.
19177+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
19178+ */
19179+struct v4l2_ctrl_hevc_slice_params {
19180+	__u32	bit_size;
19181+	__u32	data_byte_offset;
19182+	__u32	num_entry_point_offsets;
19183+
19184+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
19185+	__u8	nal_unit_type;
19186+	__u8	nuh_temporal_id_plus1;
19187+
19188+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
19189+	__u8	slice_type;
19190+	__u8	colour_plane_id;
19191+	__s32	slice_pic_order_cnt;
19192+	__u8	num_ref_idx_l0_active_minus1;
19193+	__u8	num_ref_idx_l1_active_minus1;
19194+	__u8	collocated_ref_idx;
19195+	__u8	five_minus_max_num_merge_cand;
19196+	__s8	slice_qp_delta;
19197+	__s8	slice_cb_qp_offset;
19198+	__s8	slice_cr_qp_offset;
19199+	__s8	slice_act_y_qp_offset;
19200+	__s8	slice_act_cb_qp_offset;
19201+	__s8	slice_act_cr_qp_offset;
19202+	__s8	slice_beta_offset_div2;
19203+	__s8	slice_tc_offset_div2;
19204+
19205+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
19206+	__u8	pic_struct;
19207+
19208+	__u8	reserved0[3];
19209+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
19210+	__u32	slice_segment_addr;
19211+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19212+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19213+	__u16	short_term_ref_pic_set_size;
19214+	__u16	long_term_ref_pic_set_size;
19215+
19216+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
19217+	struct v4l2_hevc_pred_weight_table pred_weight_table;
19218+
19219+	__u8	reserved1[2];
19220+	__u64	flags;
19221+};
19222+
19223+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
19224+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
19225+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
19226+
19227+/**
19228+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
19229+ *
19230+ * @pic_order_cnt_val: picture order count
19231+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
19232+ *				 pictures set included in the SPS of the first slice
19233+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
19234+ *				pictures set included in the SPS of the first slice
19235+ * @num_active_dpb_entries: the number of entries in dpb
19236+ * @num_poc_st_curr_before: the number of reference pictures in the short-term
19237+ *			    set that come before the current frame
19238+ * @num_poc_st_curr_after: the number of reference pictures in the short-term
19239+ *			   set that come after the current frame
19240+ * @num_poc_lt_curr: the number of reference pictures in the long-term set
19241+ * @poc_st_curr_before: provides the index of the short term before references
19242+ *			in DPB array
19243+ * @poc_st_curr_after: provides the index of the short term after references
19244+ *		       in DPB array
19245+ * @poc_lt_curr: provides the index of the long term references in DPB array
19246+ * @reserved: padding field. Should be zeroed by applications.
19247+ * @dpb: the decoded picture buffer, for meta-data about reference frames
19248+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
19249+ */
19250+struct v4l2_ctrl_hevc_decode_params {
19251+	__s32	pic_order_cnt_val;
19252+	__u16	short_term_ref_pic_set_size;
19253+	__u16	long_term_ref_pic_set_size;
19254+	__u8	num_active_dpb_entries;
19255+	__u8	num_poc_st_curr_before;
19256+	__u8	num_poc_st_curr_after;
19257+	__u8	num_poc_lt_curr;
19258+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19259+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19260+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19261+	__u8	reserved[4];
19262+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
19263+	__u64	flags;
19264+};
19265+
19266+/**
19267+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
19268+ *
19269+ * @scaling_list_4x4: scaling list is used for the scaling process for
19270+ *		      transform coefficients. The values on each scaling
19271+ *		      list are expected in raster scan order
19272+ * @scaling_list_8x8: scaling list is used for the scaling process for
19273+ *		      transform coefficients. The values on each scaling
19274+ *		      list are expected in raster scan order
19275+ * @scaling_list_16x16:	scaling list is used for the scaling process for
19276+ *			transform coefficients. The values on each scaling
19277+ *			list are expected in raster scan order
19278+ * @scaling_list_32x32:	scaling list is used for the scaling process for
19279+ *			transform coefficients. The values on each scaling
19280+ *			list are expected in raster scan order
19281+ * @scaling_list_dc_coef_16x16:	scaling list is used for the scaling process
19282+ *				for transform coefficients. The values on each
19283+ *				scaling list are expected in raster scan order.
19284+ * @scaling_list_dc_coef_32x32:	scaling list is used for the scaling process
19285+ *				for transform coefficients. The values on each
19286+ *				scaling list are expected in raster scan order.
19287+ */
19288+struct v4l2_ctrl_hevc_scaling_matrix {
19289+	__u8	scaling_list_4x4[6][16];
19290+	__u8	scaling_list_8x8[6][64];
19291+	__u8	scaling_list_16x16[6][64];
19292+	__u8	scaling_list_32x32[2][64];
19293+	__u8	scaling_list_dc_coef_16x16[6];
19294+	__u8	scaling_list_dc_coef_32x32[2];
19295+};
19296+
19297+#endif
19298--- a/libavcodec/hevc_parser.c
19299+++ b/libavcodec/hevc_parser.c
19300@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod
19301     avctx->profile  = ps->sps->ptl.general_ptl.profile_idc;
19302     avctx->level    = ps->sps->ptl.general_ptl.level_idc;
19303
19304+    if (ps->sps->chroma_format_idc == 1) {
19305+        avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ?
19306+            ps->sps->vui.chroma_sample_loc_type_top_field + 1 :
19307+            AVCHROMA_LOC_LEFT;
19308+    }
19309+    else if (ps->sps->chroma_format_idc == 2 ||
19310+             ps->sps->chroma_format_idc == 3) {
19311+        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
19312+    }
19313+    else {
19314+        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
19315+    }
19316+
19317     if (ps->vps->vps_timing_info_present_flag) {
19318         num = ps->vps->vps_num_units_in_tick;
19319         den = ps->vps->vps_time_scale;
19320--- a/libavcodec/hevc_refs.c
19321+++ b/libavcodec/hevc_refs.c
19322@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex
19323         if (!frame->rpl_buf)
19324             goto fail;
19325
19326-        frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
19327-        if (!frame->tab_mvf_buf)
19328-            goto fail;
19329-        frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
19330+        if (s->tab_mvf_pool) {
19331+            frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
19332+            if (!frame->tab_mvf_buf)
19333+                goto fail;
19334+            frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
19335+        }
19336
19337-        frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
19338-        if (!frame->rpl_tab_buf)
19339-            goto fail;
19340-        frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
19341-        frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
19342-        for (j = 0; j < frame->ctb_count; j++)
19343-            frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
19344+        if (s->rpl_tab_pool) {
19345+            frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
19346+            if (!frame->rpl_tab_buf)
19347+                goto fail;
19348+            frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
19349+            frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
19350+            for (j = 0; j < frame->ctb_count; j++)
19351+                frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
19352+        }
19353
19354         frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
19355         frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
19356@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s
19357     int ctb_count    = frame->ctb_count;
19358     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
19359     int i;
19360+    RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
19361
19362     if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
19363         return AVERROR_INVALIDDATA;
19364
19365-    for (i = ctb_addr_ts; i < ctb_count; i++)
19366-        frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
19367+    if (frame->rpl_tab) {
19368+        for (i = ctb_addr_ts; i < ctb_count; i++)
19369+            frame->rpl_tab[i] = tab;
19370+    }
19371
19372-    frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
19373+    frame->refPicList = tab->refPicList;
19374
19375     return 0;
19376 }
19377--- a/libavcodec/hevcdec.c
19378+++ b/libavcodec/hevcdec.c
19379@@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon
19380
19381     ff_set_sar(avctx, sps->vui.sar);
19382
19383+    if (sps->chroma_format_idc == 1) {
19384+        avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ?
19385+            sps->vui.chroma_sample_loc_type_top_field + 1 :
19386+            AVCHROMA_LOC_LEFT;
19387+    }
19388+    else if (sps->chroma_format_idc == 2 ||
19389+             sps->chroma_format_idc == 3) {
19390+        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
19391+    }
19392+    else {
19393+        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
19394+    }
19395+
19396     if (sps->vui.video_signal_type_present_flag)
19397         avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
19398                                                             : AVCOL_RANGE_MPEG;
19399@@ -372,14 +385,20 @@ static enum AVPixelFormat get_format(HEV
19400 #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
19401                      CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
19402                      CONFIG_HEVC_NVDEC_HWACCEL + \
19403+                     CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
19404                      CONFIG_HEVC_VAAPI_HWACCEL + \
19405                      CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
19406+                     CONFIG_HEVC_RPI4_8_HWACCEL + \
19407+                     CONFIG_HEVC_RPI4_10_HWACCEL + \
19408                      CONFIG_HEVC_VDPAU_HWACCEL)
19409     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
19410
19411     switch (sps->pix_fmt) {
19412     case AV_PIX_FMT_YUV420P:
19413     case AV_PIX_FMT_YUVJ420P:
19414+#if CONFIG_HEVC_RPI4_8_HWACCEL
19415+        *fmt++ = AV_PIX_FMT_RPI4_8;
19416+#endif
19417 #if CONFIG_HEVC_DXVA2_HWACCEL
19418         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
19419 #endif
19420@@ -399,8 +418,14 @@ static enum AVPixelFormat get_format(HEV
19421 #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
19422         *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
19423 #endif
19424+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
19425+        *fmt++ = AV_PIX_FMT_DRM_PRIME;
19426+#endif
19427         break;
19428     case AV_PIX_FMT_YUV420P10:
19429+#if CONFIG_HEVC_RPI4_10_HWACCEL
19430+        *fmt++ = AV_PIX_FMT_RPI4_10;
19431+#endif
19432 #if CONFIG_HEVC_DXVA2_HWACCEL
19433         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
19434 #endif
19435@@ -417,6 +442,9 @@ static enum AVPixelFormat get_format(HEV
19436 #if CONFIG_HEVC_NVDEC_HWACCEL
19437         *fmt++ = AV_PIX_FMT_CUDA;
19438 #endif
19439+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
19440+        *fmt++ = AV_PIX_FMT_DRM_PRIME;
19441+#endif
19442         break;
19443     case AV_PIX_FMT_YUV444P:
19444 #if CONFIG_HEVC_VDPAU_HWACCEL
19445@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const
19446     if (!sps)
19447         return 0;
19448
19449+    // If hwaccel then we don't need all the s/w decode helper arrays
19450+    if (s->avctx->hwaccel) {
19451+        export_stream_params(s, sps);
19452+
19453+        s->avctx->pix_fmt = pix_fmt;
19454+        s->ps.sps = sps;
19455+        s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
19456+        return 0;
19457+    }
19458+
19459     ret = pic_arrays_init(s, sps);
19460     if (ret < 0)
19461         goto fail;
19462@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext
19463                            ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
19464     int ret;
19465
19466-    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
19467-    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
19468-    memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
19469-    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
19470-    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
19471+    if (s->horizontal_bs) {
19472+        memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
19473+        memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
19474+        memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
19475+        memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
19476+        memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
19477+    }
19478
19479     s->is_decoded        = 0;
19480     s->first_nal_type    = s->nal_unit_type;
19481@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont
19482     s->ref = NULL;
19483     ret    = decode_nal_units(s, avpkt->data, avpkt->size);
19484     if (ret < 0)
19485+    {
19486+        // Ensure that hwaccel knows this frame is over
19487+        if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) {
19488+            s->avctx->hwaccel->abort_frame(s->avctx);
19489+        }
19490+
19491         return ret;
19492+    }
19493
19494     if (avctx->hwaccel) {
19495         if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
19496@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s
19497     if (ret < 0)
19498         return ret;
19499
19500-    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
19501-    if (!dst->tab_mvf_buf)
19502-        goto fail;
19503-    dst->tab_mvf = src->tab_mvf;
19504+    if (src->tab_mvf_buf) {
19505+        dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
19506+        if (!dst->tab_mvf_buf)
19507+            goto fail;
19508+        dst->tab_mvf = src->tab_mvf;
19509+    }
19510
19511-    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
19512-    if (!dst->rpl_tab_buf)
19513-        goto fail;
19514-    dst->rpl_tab = src->rpl_tab;
19515+    if (src->rpl_tab_buf) {
19516+        dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
19517+        if (!dst->rpl_tab_buf)
19518+            goto fail;
19519+        dst->rpl_tab = src->rpl_tab;
19520+    }
19521
19522     dst->rpl_buf = av_buffer_ref(src->rpl_buf);
19523     if (!dst->rpl_buf)
19524@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = {
19525 #if CONFIG_HEVC_NVDEC_HWACCEL
19526                                HWACCEL_NVDEC(hevc),
19527 #endif
19528+#if CONFIG_HEVC_RPI4_8_HWACCEL
19529+                               HWACCEL_RPI4_8(hevc),
19530+#endif
19531+#if CONFIG_HEVC_RPI4_10_HWACCEL
19532+                               HWACCEL_RPI4_10(hevc),
19533+#endif
19534+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
19535+                               HWACCEL_V4L2REQUEST(hevc),
19536+#endif
19537 #if CONFIG_HEVC_VAAPI_HWACCEL
19538                                HWACCEL_VAAPI(hevc),
19539 #endif
19540--- a/libavcodec/hwaccels.h
19541+++ b/libavcodec/hwaccels.h
19542@@ -34,6 +34,9 @@ extern const AVHWAccel ff_hevc_d3d11va_h
19543 extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
19544 extern const AVHWAccel ff_hevc_dxva2_hwaccel;
19545 extern const AVHWAccel ff_hevc_nvdec_hwaccel;
19546+extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
19547+extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
19548+extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
19549 extern const AVHWAccel ff_hevc_vaapi_hwaccel;
19550 extern const AVHWAccel ff_hevc_vdpau_hwaccel;
19551 extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
19552--- a/libavcodec/hwconfig.h
19553+++ b/libavcodec/hwconfig.h
19554@@ -24,6 +24,7 @@
19555
19556
19557 #define HWACCEL_CAP_ASYNC_SAFE      (1 << 0)
19558+#define HWACCEL_CAP_MT_SAFE         (1 << 1)
19559
19560
19561 typedef struct AVCodecHWConfigInternal {
19562@@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal {
19563     HW_CONFIG_HWACCEL(1, 1, 0, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
19564 #define HWACCEL_NVDEC(codec) \
19565     HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _nvdec_hwaccel)
19566+#define HWACCEL_RPI4_8(codec) \
19567+    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8,       NONE,         ff_ ## codec ## _rpi4_8_hwaccel)
19568+#define HWACCEL_RPI4_10(codec) \
19569+    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10,      NONE,         ff_ ## codec ## _rpi4_10_hwaccel)
19570+#define HWACCEL_V4L2REQUEST(codec) \
19571+    HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME,    DRM,          ff_ ## codec ## _v4l2request_hwaccel)
19572 #define HWACCEL_VAAPI(codec) \
19573     HW_CONFIG_HWACCEL(1, 1, 1, VAAPI,        VAAPI,        ff_ ## codec ## _vaapi_hwaccel)
19574 #define HWACCEL_VDPAU(codec) \
19575--- a/libavcodec/mmaldec.c
19576+++ b/libavcodec/mmaldec.c
19577@@ -24,6 +24,9 @@
19578  * MMAL Video Decoder
19579  */
19580
19581+#pragma GCC diagnostic push
19582+// Many many redundant decls in the header files
19583+#pragma GCC diagnostic ignored "-Wredundant-decls"
19584 #include <bcm_host.h>
19585 #include <interface/mmal/mmal.h>
19586 #include <interface/mmal/mmal_parameters_video.h>
19587@@ -31,6 +34,7 @@
19588 #include <interface/mmal/util/mmal_util_params.h>
19589 #include <interface/mmal/util/mmal_default_components.h>
19590 #include <interface/mmal/vc/mmal_vc_api.h>
19591+#pragma GCC diagnostic pop
19592 #include <stdatomic.h>
19593
19594 #include "avcodec.h"
19595--- a/libavcodec/pthread_frame.c
19596+++ b/libavcodec/pthread_frame.c
19597@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_w
19598
19599         /* if the previous thread uses hwaccel then we take the lock to ensure
19600          * the threads don't run concurrently */
19601-        if (avctx->hwaccel) {
19602+        if (avctx->hwaccel &&
19603+            !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
19604             pthread_mutex_lock(&p->parent->hwaccel_mutex);
19605             p->hwaccel_serializing = 1;
19606         }
19607@@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecConte
19608
19609     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
19610
19611-    if (avctx->hwaccel && !p->hwaccel_serializing) {
19612+    if (avctx->hwaccel &&
19613+        !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
19614+        !p->hwaccel_serializing) {
19615         pthread_mutex_lock(&p->parent->hwaccel_mutex);
19616         p->hwaccel_serializing = 1;
19617     }
19618--- a/libavcodec/raw.c
19619+++ b/libavcodec/raw.c
19620@@ -293,6 +293,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags
19621     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
19622     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
19623
19624+    /* RPI (Might as well define for everything) */
19625+    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
19626+    { AV_PIX_FMT_RPI4_8,      MKTAG('S', 'A', 'N', 'D') },
19627+    { AV_PIX_FMT_SAND64_10,   MKTAG('S', 'N', 'D', 'A') },
19628+    { AV_PIX_FMT_RPI4_10,     MKTAG('S', 'N', 'D', 'B') },
19629+
19630     { AV_PIX_FMT_NONE, 0 },
19631 };
19632
19633--- a/libavcodec/rawenc.c
19634+++ b/libavcodec/rawenc.c
19635@@ -24,6 +24,7 @@
19636  * Raw Video Encoder
19637  */
19638
19639+#include "config.h"
19640 #include "avcodec.h"
19641 #include "raw.h"
19642 #include "internal.h"
19643@@ -31,6 +32,10 @@
19644 #include "libavutil/intreadwrite.h"
19645 #include "libavutil/imgutils.h"
19646 #include "libavutil/internal.h"
19647+#include "libavutil/avassert.h"
19648+#if CONFIG_SAND
19649+#include "libavutil/rpi_sand_fns.h"
19650+#endif
19651
19652 static av_cold int raw_encode_init(AVCodecContext *avctx)
19653 {
19654@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS
19655     return 0;
19656 }
19657
19658+#if CONFIG_SAND
19659+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
19660+                      const AVFrame *frame)
19661+{
19662+    const int width = av_frame_cropped_width(frame);
19663+    const int height = av_frame_cropped_height(frame);
19664+    const int x0 = frame->crop_left;
19665+    const int y0 = frame->crop_top;
19666+    const int size = width * height * 3 / 2;
19667+    uint8_t * dst;
19668+    int ret;
19669+
19670+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
19671+        return ret;
19672+
19673+    dst = pkt->data;
19674+
19675+    av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
19676+    dst += width * height;
19677+    av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
19678+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
19679+    return 0;
19680+}
19681+
19682+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
19683+                      const AVFrame *frame)
19684+{
19685+    const int width = av_frame_cropped_width(frame);
19686+    const int height = av_frame_cropped_height(frame);
19687+    const int x0 = frame->crop_left;
19688+    const int y0 = frame->crop_top;
19689+    const int size = width * height * 3;
19690+    uint8_t * dst;
19691+    int ret;
19692+
19693+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
19694+        return ret;
19695+
19696+    dst = pkt->data;
19697+
19698+    av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
19699+    dst += width * height * 2;
19700+    av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
19701+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
19702+    return 0;
19703+}
19704+
19705+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
19706+                      const AVFrame *frame)
19707+{
19708+    const int width = av_frame_cropped_width(frame);
19709+    const int height = av_frame_cropped_height(frame);
19710+    const int x0 = frame->crop_left;
19711+    const int y0 = frame->crop_top;
19712+    const int size = width * height * 3;
19713+    uint8_t * dst;
19714+    int ret;
19715+
19716+    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
19717+        return ret;
19718+
19719+    dst = pkt->data;
19720+
19721+    av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
19722+    dst += width * height * 2;
19723+    av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
19724+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
19725+    return 0;
19726+}
19727+#endif
19728+
19729+
19730 static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
19731-                      const AVFrame *frame, int *got_packet)
19732+                      const AVFrame *src_frame, int *got_packet)
19733 {
19734-    int ret = av_image_get_buffer_size(frame->format,
19735-                                       frame->width, frame->height, 1);
19736+    int ret;
19737+    AVFrame * frame = NULL;
19738
19739-    if (ret < 0)
19740+#if CONFIG_SAND
19741+    if (av_rpi_is_sand_frame(src_frame)) {
19742+        ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
19743+            av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
19744+            av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
19745+        *got_packet = (ret == 0);
19746         return ret;
19747+    }
19748+#endif
19749+
19750+    if ((frame = av_frame_clone(src_frame)) == NULL) {
19751+        ret = AVERROR(ENOMEM);
19752+        goto fail;
19753+    }
19754+
19755+    if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
19756+        goto fail;
19757+
19758+    ret = av_image_get_buffer_size(frame->format,
19759+                                       frame->width, frame->height, 1);
19760+    if (ret < 0)
19761+        goto fail;
19762
19763     if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
19764-        return ret;
19765+        goto fail;
19766     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
19767                                        (const uint8_t **)frame->data, frame->linesize,
19768                                        frame->format,
19769                                        frame->width, frame->height, 1)) < 0)
19770-        return ret;
19771+        goto fail;
19772
19773     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
19774        frame->format   == AV_PIX_FMT_YUYV422) {
19775@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *av
19776         }
19777     }
19778     pkt->flags |= AV_PKT_FLAG_KEY;
19779+    av_frame_free(&frame);
19780     *got_packet = 1;
19781     return 0;
19782+
19783+fail:
19784+    av_frame_free(&frame);
19785+    *got_packet = 0;
19786+    return ret;
19787 }
19788
19789 AVCodec ff_rawvideo_encoder = {
19790--- /dev/null
19791+++ b/libavcodec/rpi_hevc_cabac.c
19792@@ -0,0 +1,2257 @@
19793+/*
19794+ * HEVC CABAC decoding
19795+ *
19796+ * Copyright (C) 2012 - 2013 Guillaume Martres
19797+ * Copyright (C) 2012 - 2013 Gildas Cocherel
19798+ * Copyright (C) 2012 - 2013 Gildas Cocherel
19799+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
19800+ *
19801+ * This file is part of FFmpeg.
19802+ *
19803+ * FFmpeg is free software; you can redistribute it and/or
19804+ * modify it under the terms of the GNU Lesser General Public
19805+ * License as published by the Free Software Foundation; either
19806+ * version 2.1 of the License, or (at your option) any later version.
19807+ *
19808+ * FFmpeg is distributed in the hope that it will be useful,
19809+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19810+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19811+ * Lesser General Public License for more details.
19812+ *
19813+ * You should have received a copy of the GNU Lesser General Public
19814+ * License along with FFmpeg; if not, write to the Free Software
19815+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19816+ */
19817+
19818+#define UNCHECKED_BITSTREAM_READER 1
19819+
19820+#include "libavutil/attributes.h"
19821+#include "libavutil/common.h"
19822+
19823+#include "cabac_functions.h"
19824+#include "rpi_hevc_data.h"
19825+#include "hevc.h"
19826+#include "rpi_hevcdec.h"
19827+#include "rpi_hevc_cabac_fns.h"
19828+
19829+#include "libavutil/rpi_sand_fns.h"
19830+
19831+// BY22 is probably faster than simple bypass if the processor has
19832+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
19833+// x86 has fast int divide
19834+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
19835+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
19836+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
19837+// Use native divide if we have a fast one - otherwise use mpy 1/x
19838+// x86 has a fast integer divide - arm doesn't - unsure about other
19839+// architectures
19840+#define USE_BY22_DIV  ARCH_X86
19841+
19842+// Special case blocks with a single significant ceoff
19843+// Decreases the complexity of the code for a common case but increases the
19844+// code size.
19845+#define USE_N_END_1 1
19846+
19847+#if !USE_BY22_DIV
19848+// * 1/x @ 32 bits gets us 22 bits of accuracy
19849+#define CABAC_BY22_PEEK_BITS  22
19850+#else
19851+// A real 32-bit divide gets us another bit
19852+// If we have a 64 bit int & a unit time divider then we should get a lot
19853+// of bits (55)  but that is untested and it is unclear if it would give
19854+// us a large advantage
19855+#define CABAC_BY22_PEEK_BITS  23
19856+#endif
19857+
19858+#define CABAC_MAX_BIN 31
19859+
19860+
19861+#if USE_BY22 && !USE_BY22_DIV
19862+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
19863+
19864+static const uint32_t cabac_by22_inv_range[256] = {
19865+                                                    0,      I(257), I(258), I(259),
19866+    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
19867+    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
19868+    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
19869+    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
19870+    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
19871+    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
19872+    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
19873+    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
19874+    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
19875+    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
19876+    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
19877+    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
19878+    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
19879+    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
19880+    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
19881+    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
19882+    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
19883+    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
19884+    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
19885+    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
19886+    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
19887+    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
19888+    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
19889+    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
19890+    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
19891+    I(510), I(511)
19892+};
19893+#undef I
19894+#endif  // USE_BY22
19895+
19896+#if ARCH_ARM
19897+#include "arm/rpi_hevc_cabac.h"
19898+#endif
19899+
19900+/**
19901+ * number of bin by SyntaxElement.
19902+ */
19903+static const int8_t num_bins_in_se[] = {
19904+     1, // sao_merge_flag
19905+     1, // sao_type_idx
19906+     0, // sao_eo_class
19907+     0, // sao_band_position
19908+     0, // sao_offset_abs
19909+     0, // sao_offset_sign
19910+     0, // end_of_slice_flag
19911+     3, // split_coding_unit_flag
19912+     1, // cu_transquant_bypass_flag
19913+     3, // skip_flag
19914+     3, // cu_qp_delta
19915+     1, // pred_mode
19916+     4, // part_mode
19917+     0, // pcm_flag
19918+     1, // prev_intra_luma_pred_mode
19919+     0, // mpm_idx
19920+     0, // rem_intra_luma_pred_mode
19921+     2, // intra_chroma_pred_mode
19922+     1, // merge_flag
19923+     1, // merge_idx
19924+     5, // inter_pred_idc
19925+     2, // ref_idx_l0
19926+     2, // ref_idx_l1
19927+     2, // abs_mvd_greater0_flag
19928+     2, // abs_mvd_greater1_flag
19929+     0, // abs_mvd_minus2
19930+     0, // mvd_sign_flag
19931+     1, // mvp_lx_flag
19932+     1, // no_residual_data_flag
19933+     3, // split_transform_flag
19934+     2, // cbf_luma
19935+     4, // cbf_cb, cbf_cr
19936+     2, // transform_skip_flag[][]
19937+     2, // explicit_rdpcm_flag[][]
19938+     2, // explicit_rdpcm_dir_flag[][]
19939+    18, // last_significant_coeff_x_prefix
19940+    18, // last_significant_coeff_y_prefix
19941+     0, // last_significant_coeff_x_suffix
19942+     0, // last_significant_coeff_y_suffix
19943+     4, // significant_coeff_group_flag
19944+    44, // significant_coeff_flag
19945+    24, // coeff_abs_level_greater1_flag
19946+     6, // coeff_abs_level_greater2_flag
19947+     0, // coeff_abs_level_remaining
19948+     0, // coeff_sign_flag
19949+     8, // log2_res_scale_abs
19950+     2, // res_scale_sign_flag
19951+     1, // cu_chroma_qp_offset_flag
19952+     1, // cu_chroma_qp_offset_idx
19953+};
19954+
19955+/**
19956+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
19957+ */
19958+static const int elem_offset[sizeof(num_bins_in_se)] = {
19959+    0, // sao_merge_flag
19960+    1, // sao_type_idx
19961+    2, // sao_eo_class
19962+    2, // sao_band_position
19963+    2, // sao_offset_abs
19964+    2, // sao_offset_sign
19965+    2, // end_of_slice_flag
19966+    2, // split_coding_unit_flag
19967+    5, // cu_transquant_bypass_flag
19968+    6, // skip_flag
19969+    9, // cu_qp_delta
19970+    12, // pred_mode
19971+    13, // part_mode
19972+    17, // pcm_flag
19973+    17, // prev_intra_luma_pred_mode
19974+    18, // mpm_idx
19975+    18, // rem_intra_luma_pred_mode
19976+    18, // intra_chroma_pred_mode
19977+    20, // merge_flag
19978+    21, // merge_idx
19979+    22, // inter_pred_idc
19980+    27, // ref_idx_l0
19981+    29, // ref_idx_l1
19982+    31, // abs_mvd_greater0_flag
19983+    33, // abs_mvd_greater1_flag
19984+    35, // abs_mvd_minus2
19985+    35, // mvd_sign_flag
19986+    35, // mvp_lx_flag
19987+    36, // no_residual_data_flag
19988+    37, // split_transform_flag
19989+    40, // cbf_luma
19990+    42, // cbf_cb, cbf_cr
19991+    46, // transform_skip_flag[][]
19992+    48, // explicit_rdpcm_flag[][]
19993+    50, // explicit_rdpcm_dir_flag[][]
19994+    52, // last_significant_coeff_x_prefix
19995+    70, // last_significant_coeff_y_prefix
19996+    88, // last_significant_coeff_x_suffix
19997+    88, // last_significant_coeff_y_suffix
19998+    88, // significant_coeff_group_flag
19999+    92, // significant_coeff_flag
20000+    136, // coeff_abs_level_greater1_flag
20001+    160, // coeff_abs_level_greater2_flag
20002+    166, // coeff_abs_level_remaining
20003+    166, // coeff_sign_flag
20004+    166, // log2_res_scale_abs
20005+    174, // res_scale_sign_flag
20006+    176, // cu_chroma_qp_offset_flag
20007+    177, // cu_chroma_qp_offset_idx
20008+};
20009+
20010+#define CNU 154
20011+/**
20012+ * Indexed by init_type
20013+ */
20014+static const uint8_t init_values[3][HEVC_CONTEXTS] = {
20015+    { // sao_merge_flag
20016+      153,
20017+      // sao_type_idx
20018+      200,
20019+      // split_coding_unit_flag
20020+      139, 141, 157,
20021+      // cu_transquant_bypass_flag
20022+      154,
20023+      // skip_flag
20024+      CNU, CNU, CNU,
20025+      // cu_qp_delta
20026+      154, 154, 154,
20027+      // pred_mode
20028+      CNU,
20029+      // part_mode
20030+      184, CNU, CNU, CNU,
20031+      // prev_intra_luma_pred_mode
20032+      184,
20033+      // intra_chroma_pred_mode
20034+      63, 139,
20035+      // merge_flag
20036+      CNU,
20037+      // merge_idx
20038+      CNU,
20039+      // inter_pred_idc
20040+      CNU, CNU, CNU, CNU, CNU,
20041+      // ref_idx_l0
20042+      CNU, CNU,
20043+      // ref_idx_l1
20044+      CNU, CNU,
20045+      // abs_mvd_greater0_flag
20046+      CNU, CNU,
20047+      // abs_mvd_greater1_flag
20048+      CNU, CNU,
20049+      // mvp_lx_flag
20050+      CNU,
20051+      // no_residual_data_flag
20052+      CNU,
20053+      // split_transform_flag
20054+      153, 138, 138,
20055+      // cbf_luma
20056+      111, 141,
20057+      // cbf_cb, cbf_cr
20058+      94, 138, 182, 154,
20059+      // transform_skip_flag
20060+      139, 139,
20061+      // explicit_rdpcm_flag
20062+      139, 139,
20063+      // explicit_rdpcm_dir_flag
20064+      139, 139,
20065+      // last_significant_coeff_x_prefix
20066+      110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
20067+       79, 108, 123,  63,
20068+      // last_significant_coeff_y_prefix
20069+      110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
20070+       79, 108, 123,  63,
20071+      // significant_coeff_group_flag
20072+      91, 171, 134, 141,
20073+      // significant_coeff_flag
20074+      111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
20075+      125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
20076+      139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
20077+      141, 111,
20078+      // coeff_abs_level_greater1_flag
20079+      140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
20080+      122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
20081+      // coeff_abs_level_greater2_flag
20082+      138, 153, 136, 167, 152, 152,
20083+      // log2_res_scale_abs
20084+      154, 154, 154, 154, 154, 154, 154, 154,
20085+      // res_scale_sign_flag
20086+      154, 154,
20087+      // cu_chroma_qp_offset_flag
20088+      154,
20089+      // cu_chroma_qp_offset_idx
20090+      154,
20091+    },
20092+    { // sao_merge_flag
20093+      153,
20094+      // sao_type_idx
20095+      185,
20096+      // split_coding_unit_flag
20097+      107, 139, 126,
20098+      // cu_transquant_bypass_flag
20099+      154,
20100+      // skip_flag
20101+      197, 185, 201,
20102+      // cu_qp_delta
20103+      154, 154, 154,
20104+      // pred_mode
20105+      149,
20106+      // part_mode
20107+      154, 139, 154, 154,
20108+      // prev_intra_luma_pred_mode
20109+      154,
20110+      // intra_chroma_pred_mode
20111+      152, 139,
20112+      // merge_flag
20113+      110,
20114+      // merge_idx
20115+      122,
20116+      // inter_pred_idc
20117+      95, 79, 63, 31, 31,
20118+      // ref_idx_l0
20119+      153, 153,
20120+      // ref_idx_l1
20121+      153, 153,
20122+      // abs_mvd_greater0_flag
20123+      140, 198,
20124+      // abs_mvd_greater1_flag
20125+      140, 198,
20126+      // mvp_lx_flag
20127+      168,
20128+      // no_residual_data_flag
20129+      79,
20130+      // split_transform_flag
20131+      124, 138, 94,
20132+      // cbf_luma
20133+      153, 111,
20134+      // cbf_cb, cbf_cr
20135+      149, 107, 167, 154,
20136+      // transform_skip_flag
20137+      139, 139,
20138+      // explicit_rdpcm_flag
20139+      139, 139,
20140+      // explicit_rdpcm_dir_flag
20141+      139, 139,
20142+      // last_significant_coeff_x_prefix
20143+      125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
20144+       94, 108, 123, 108,
20145+      // last_significant_coeff_y_prefix
20146+      125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
20147+       94, 108, 123, 108,
20148+      // significant_coeff_group_flag
20149+      121, 140, 61, 154,
20150+      // significant_coeff_flag
20151+      155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
20152+      154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
20153+      153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
20154+      140, 140,
20155+      // coeff_abs_level_greater1_flag
20156+      154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
20157+      136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
20158+      // coeff_abs_level_greater2_flag
20159+      107, 167, 91, 122, 107, 167,
20160+      // log2_res_scale_abs
20161+      154, 154, 154, 154, 154, 154, 154, 154,
20162+      // res_scale_sign_flag
20163+      154, 154,
20164+      // cu_chroma_qp_offset_flag
20165+      154,
20166+      // cu_chroma_qp_offset_idx
20167+      154,
20168+    },
20169+    { // sao_merge_flag
20170+      153,
20171+      // sao_type_idx
20172+      160,
20173+      // split_coding_unit_flag
20174+      107, 139, 126,
20175+      // cu_transquant_bypass_flag
20176+      154,
20177+      // skip_flag
20178+      197, 185, 201,
20179+      // cu_qp_delta
20180+      154, 154, 154,
20181+      // pred_mode
20182+      134,
20183+      // part_mode
20184+      154, 139, 154, 154,
20185+      // prev_intra_luma_pred_mode
20186+      183,
20187+      // intra_chroma_pred_mode
20188+      152, 139,
20189+      // merge_flag
20190+      154,
20191+      // merge_idx
20192+      137,
20193+      // inter_pred_idc
20194+      95, 79, 63, 31, 31,
20195+      // ref_idx_l0
20196+      153, 153,
20197+      // ref_idx_l1
20198+      153, 153,
20199+      // abs_mvd_greater0_flag
20200+      169, 198,
20201+      // abs_mvd_greater1_flag
20202+      169, 198,
20203+      // mvp_lx_flag
20204+      168,
20205+      // no_residual_data_flag
20206+      79,
20207+      // split_transform_flag
20208+      224, 167, 122,
20209+      // cbf_luma
20210+      153, 111,
20211+      // cbf_cb, cbf_cr
20212+      149, 92, 167, 154,
20213+      // transform_skip_flag
20214+      139, 139,
20215+      // explicit_rdpcm_flag
20216+      139, 139,
20217+      // explicit_rdpcm_dir_flag
20218+      139, 139,
20219+      // last_significant_coeff_x_prefix
20220+      125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
20221+       79, 108, 123,  93,
20222+      // last_significant_coeff_y_prefix
20223+      125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
20224+       79, 108, 123,  93,
20225+      // significant_coeff_group_flag
20226+      121, 140, 61, 154,
20227+      // significant_coeff_flag
20228+      170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
20229+      154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
20230+      153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
20231+      140, 140,
20232+      // coeff_abs_level_greater1_flag
20233+      154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
20234+      136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
20235+      // coeff_abs_level_greater2_flag
20236+      107, 167, 91, 107, 107, 167,
20237+      // log2_res_scale_abs
20238+      154, 154, 154, 154, 154, 154, 154, 154,
20239+      // res_scale_sign_flag
20240+      154, 154,
20241+      // cu_chroma_qp_offset_flag
20242+      154,
20243+      // cu_chroma_qp_offset_idx
20244+      154,
20245+    },
20246+};
20247+
20248+static const uint8_t scan_1x1[1] = {
20249+    0,
20250+};
20251+
20252+static const uint8_t horiz_scan2x2_x[4] = {
20253+    0, 1, 0, 1,
20254+};
20255+
20256+static const uint8_t horiz_scan2x2_y[4] = {
20257+    0, 0, 1, 1
20258+};
20259+
20260+static const uint8_t horiz_scan4x4_x[16] = {
20261+    0, 1, 2, 3,
20262+    0, 1, 2, 3,
20263+    0, 1, 2, 3,
20264+    0, 1, 2, 3,
20265+};
20266+
20267+static const uint8_t horiz_scan4x4_y[16] = {
20268+    0, 0, 0, 0,
20269+    1, 1, 1, 1,
20270+    2, 2, 2, 2,
20271+    3, 3, 3, 3,
20272+};
20273+
20274+static const uint8_t horiz_scan8x8_inv[8][8] = {
20275+    {  0,  1,  2,  3, 16, 17, 18, 19, },
20276+    {  4,  5,  6,  7, 20, 21, 22, 23, },
20277+    {  8,  9, 10, 11, 24, 25, 26, 27, },
20278+    { 12, 13, 14, 15, 28, 29, 30, 31, },
20279+    { 32, 33, 34, 35, 48, 49, 50, 51, },
20280+    { 36, 37, 38, 39, 52, 53, 54, 55, },
20281+    { 40, 41, 42, 43, 56, 57, 58, 59, },
20282+    { 44, 45, 46, 47, 60, 61, 62, 63, },
20283+};
20284+
20285+static const uint8_t diag_scan2x2_x[4] = {
20286+    0, 0, 1, 1,
20287+};
20288+
20289+static const uint8_t diag_scan2x2_y[4] = {
20290+    0, 1, 0, 1,
20291+};
20292+
20293+static const uint8_t diag_scan2x2_inv[2][2] = {
20294+    { 0, 2, },
20295+    { 1, 3, },
20296+};
20297+
20298+static const uint8_t diag_scan4x4_inv[4][4] = {
20299+    { 0,  2,  5,  9, },
20300+    { 1,  4,  8, 12, },
20301+    { 3,  7, 11, 14, },
20302+    { 6, 10, 13, 15, },
20303+};
20304+
20305+static const uint8_t diag_scan8x8_inv[8][8] = {
20306+    {  0,  2,  5,  9, 14, 20, 27, 35, },
20307+    {  1,  4,  8, 13, 19, 26, 34, 42, },
20308+    {  3,  7, 12, 18, 25, 33, 41, 48, },
20309+    {  6, 11, 17, 24, 32, 40, 47, 53, },
20310+    { 10, 16, 23, 31, 39, 46, 52, 57, },
20311+    { 15, 22, 30, 38, 45, 51, 56, 60, },
20312+    { 21, 29, 37, 44, 50, 55, 59, 62, },
20313+    { 28, 36, 43, 49, 54, 58, 61, 63, },
20314+};
20315+
20316+
20317+typedef struct
20318+{
20319+    uint16_t coeff;
20320+    uint16_t scale;
20321+} xy_off_t;
20322+
20323+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
20324+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
20325+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
20326+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
20327+
20328+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
20329+
20330+#define OFF_DIAG(t) {\
20331+    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
20332+    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
20333+    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
20334+    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
20335+}
20336+
20337+#define OFF_HORIZ(t) {\
20338+    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
20339+    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
20340+    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
20341+    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
20342+}
20343+
20344+#define OFF_VERT(t) {\
20345+    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
20346+    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
20347+    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
20348+    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
20349+}
20350+
20351+static const xy_off_t off_xys[3][4][16] =
20352+{
20353+    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
20354+    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
20355+    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
20356+};
20357+
20358+
20359+// Helper fns
20360+#ifndef hevc_mem_bits32
20361+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
20362+{
20363+    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
20364+}
20365+#endif
20366+
20367+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
20368+#define hevc_clz32 hevc_clz32_builtin
20369+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
20370+{
20371+    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
20372+    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
20373+}
20374+#endif
20375+
20376+// It is unlikely that we will ever need this but include for completeness
20377+#ifndef hevc_clz32
20378+static inline unsigned int hevc_clz32(unsigned int x)
20379+{
20380+    unsigned int n = 1;
20381+    if ((x & 0xffff0000) == 0) {
20382+        n += 16;
20383+        x <<= 16;
20384+    }
20385+    if ((x & 0xff000000) == 0) {
20386+        n += 8;
20387+        x <<= 8;
20388+    }
20389+    if ((x & 0xf0000000) == 0) {
20390+        n += 4;
20391+        x <<= 4;
20392+    }
20393+    if ((x & 0xc0000000) == 0) {
20394+        n += 2;
20395+        x <<= 2;
20396+    }
20397+    return n - ((x >> 31) & 1);
20398+}
20399+#endif
20400+
20401+static inline int cabac_overflow(const CABACContext * const cc)
20402+{
20403+    av_assert0(cc->bytestream >= cc->bytestream_start);
20404+    return cc->bytestream >= cc->bytestream_end + 4;
20405+}
20406+
20407+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
20408+{
20409+    return cabac_overflow(&lc->cc);
20410+}
20411+
20412+#if !USE_BY22
20413+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
20414+// will no longer be called but the setup calls will still exist and we want
20415+// to null them out
20416+#define bypass_start(s)
20417+#define bypass_finish(s)
20418+#else
20419+// Use BY22 for residual bypass block
20420+
20421+#define bypass_start(cc) get_cabac_by22_start(cc)
20422+#define bypass_finish(cc) get_cabac_by22_finish(cc)
20423+
20424+// BY22 notes that bypass is simply a divide into the bitstream and so we
20425+// can peek out large quantities of bits at once and treat the result as if
20426+// it was VLC.  In many cases this will lead to O(1) processing rather than
20427+// O(n) though the setup and teardown is sufficiently expensive that it is
20428+// only worth using if we expect to be dealing with more than a few bits
20429+// The definition of "a few bits" will vary from platform to platform but
20430+// tests on ARM show that it probably isn't worth it for a single coded
20431+// residual, but is for >1 - it also seems likely that if there are
20432+// more residuals then they are likely to be bigger and this will make the
20433+// O(1) nature of the code more worthwhile.
20434+
20435+
20436+// Bypass block start
20437+// Must be called before _by22_peek is used as it sets the CABAC environment
20438+// into the correct state.  _by22_finish must be called to return to 'normal'
20439+// (i.e. non-bypass) cabac decoding
20440+#ifndef get_cabac_by22_start
20441+static inline void get_cabac_by22_start(CABACContext * const c)
20442+{
20443+    const unsigned int bits = __builtin_ctz(c->low);
20444+    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
20445+    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
20446+#if !USE_BY22_DIV
20447+    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
20448+#endif
20449+
20450+    c->bytestream -= (CABAC_BITS / 8);
20451+    c->by22.bits = bits;
20452+#if !USE_BY22_DIV
20453+    c->by22.range = c->range;
20454+    c->range = inv;
20455+#endif
20456+    c->low = x;
20457+}
20458+#endif
20459+
20460+// Bypass block finish
20461+// Must be called at the end of the bypass block to return to normal operation
20462+static inline void get_cabac_by22_finish(CABACContext * const c)
20463+{
20464+    unsigned int used = c->by22.bits;
20465+    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
20466+    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
20467+
20468+    c->bytestream += bytes_used + (CABAC_BITS / 8);
20469+    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
20470+#if !USE_BY22_DIV
20471+    c->range = c->by22.range;
20472+#endif
20473+}
20474+
20475+// Peek bypass bits
20476+// _by22_start must be called before _by22_peek is called and _by22_flush
20477+// must be called afterwards to flush any used bits
20478+// The actual number of valid bits returned is
20479+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
20480+// will be at least 22 which should be long enough for any prefix or suffix
20481+// though probably not long enough for the worst case combination
20482+#ifndef get_cabac_by22_peek
20483+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
20484+{
20485+#if USE_BY22_DIV
20486+    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
20487+#else
20488+    uint32_t x = c->low & ~1U;
20489+    const uint32_t inv = c->range;
20490+
20491+    if (inv != 0)
20492+        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
20493+
20494+    return x << 1;
20495+#endif
20496+}
20497+#endif
20498+
20499+// Flush bypass bits peeked by _by22_peek
20500+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
20501+// val is an unmodified copy of whatever _by22_peek returned
20502+#ifndef get_cabac_by22_flush
20503+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
20504+{
20505+    // Subtract the bits used & reshift up to the top of the word
20506+#if USE_BY22_DIV
20507+    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
20508+#else
20509+    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
20510+#endif
20511+
20512+    // and refill lower bits
20513+    // We will probably OR over some existing bits but that doesn't matter
20514+    c->by22.bits += n;
20515+    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
20516+}
20517+#endif
20518+
20519+#endif  // USE_BY22
20520+
20521+
20522+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
20523+{
20524+    memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
20525+    memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
20526+}
20527+
20528+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
20529+{
20530+    memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
20531+    memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
20532+}
20533+
20534+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
20535+{
20536+    GetBitContext * const gb = &lc->gb;
20537+    skip_bits(gb, 1);
20538+    align_get_bits(gb);
20539+    return ff_init_cabac_decoder(&lc->cc,
20540+                          gb->buffer + get_bits_count(gb) / 8,
20541+                          (get_bits_left(gb) + 7) / 8);
20542+}
20543+
20544+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
20545+{
20546+    int init_type = 2 - s->sh.slice_type;
20547+    int i;
20548+
20549+    if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I)
20550+        init_type ^= 3;
20551+
20552+    for (i = 0; i < HEVC_CONTEXTS; i++) {
20553+        int init_value = init_values[init_type][i];
20554+        int m = (init_value >> 4) * 5 - 45;
20555+        int n = ((init_value & 15) << 3) - 16;
20556+        int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127;
20557+
20558+        pre ^= pre >> 31;
20559+        if (pre > 124)
20560+            pre = 124 + (pre & 1);
20561+        lc->cabac_state[i] = pre;
20562+    }
20563+
20564+    for (i = 0; i < 4; i++)
20565+        lc->stat_coeff[i] = 0;
20566+}
20567+
20568+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
20569+{
20570+    if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0)
20571+    {
20572+        lc->qPy_pred = s->sh.slice_qp;
20573+        cabac_init_state(s, lc);
20574+    }
20575+    else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0)
20576+    {
20577+        lc->qPy_pred = s->sh.slice_qp;
20578+        load_states(s, lc);
20579+    }
20580+    lc->cabac_init_req = 0;
20581+}
20582+
20583+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
20584+
20585+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)
20586+{
20587+    return get_cabac_inline(c, state);
20588+}
20589+
20590+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)
20591+{
20592+    return get_cabac_terminate(c);
20593+}
20594+
20595+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
20596+{
20597+    if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]))
20598+        return 0;
20599+
20600+    if (!get_cabac_bypass(&lc->cc))
20601+        return SAO_BAND;
20602+    return SAO_EDGE;
20603+}
20604+
20605+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
20606+{
20607+    int i;
20608+    int value = get_cabac_bypass(&lc->cc);
20609+
20610+    for (i = 0; i < 4; i++)
20611+        value = (value << 1) | get_cabac_bypass(&lc->cc);
20612+    return value;
20613+}
20614+
20615+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
20616+{
20617+    int i = 0;
20618+    int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
20619+
20620+    while (i < length && get_cabac_bypass(&lc->cc))
20621+        i++;
20622+    return i;
20623+}
20624+
20625+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)
20626+{
20627+    return get_cabac_bypass(&lc->cc);
20628+}
20629+
20630+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
20631+{
20632+    int ret = get_cabac_bypass(&lc->cc) << 1;
20633+    ret    |= get_cabac_bypass(&lc->cc);
20634+    return ret;
20635+}
20636+
20637+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)
20638+{
20639+    int val = 1;
20640+
20641+    if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0)
20642+        return 0;
20643+
20644+    while (val < 5 &&
20645+           get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0)
20646+        val++;
20647+
20648+    if (val >= 5) {
20649+        unsigned int k = 0;
20650+        while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
20651+            val += 1 << k;
20652+            k++;
20653+        }
20654+//        if (k == CABAC_MAX_BIN)
20655+//            av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
20656+
20657+        while (k--)
20658+            val += get_cabac_bypass(&lc->cc) << k;
20659+    }
20660+    return get_cabac_bypass(&lc->cc) ? -val : val;
20661+}
20662+
20663+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
20664+{
20665+    int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
20666+    int i = 0;
20667+
20668+    while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
20669+        i++;
20670+
20671+    return i;
20672+}
20673+
20674+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)
20675+{
20676+    if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1
20677+        return PART_2Nx2N;
20678+    if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
20679+        if (lc->cu.pred_mode == MODE_INTRA) // 0
20680+            return PART_NxN;
20681+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
20682+            return PART_2NxN;
20683+        if (log2_cb_size == 3) // 00
20684+            return PART_Nx2N;
20685+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001
20686+            return PART_Nx2N;
20687+        return PART_NxN; // 000
20688+    }
20689+
20690+    if (!s->ps.sps->amp_enabled_flag) {
20691+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
20692+            return PART_2NxN;
20693+        return PART_Nx2N;
20694+    }
20695+
20696+    if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
20697+        if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011
20698+            return PART_2NxN;
20699+        if (get_cabac_bypass(&lc->cc)) // 0101
20700+            return PART_2NxnD;
20701+        return PART_2NxnU; // 0100
20702+    }
20703+
20704+    if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001
20705+        return PART_Nx2N;
20706+    if (get_cabac_bypass(&lc->cc)) // 0001
20707+        return PART_nRx2N;
20708+    return PART_nLx2N;  // 0000
20709+}
20710+
20711+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
20712+{
20713+    int i = 0;
20714+    while (i < 2 && get_cabac_bypass(&lc->cc))
20715+        i++;
20716+    return i;
20717+}
20718+
20719+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
20720+{
20721+    int i;
20722+    int value = get_cabac_bypass(&lc->cc);
20723+
20724+    for (i = 0; i < 4; i++)
20725+        value = (value << 1) | get_cabac_bypass(&lc->cc);
20726+    return value;
20727+}
20728+
20729+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
20730+{
20731+    int ret;
20732+    if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE]))
20733+        return 4;
20734+
20735+    ret  = get_cabac_bypass(&lc->cc) << 1;
20736+    ret |= get_cabac_bypass(&lc->cc);
20737+    return ret;
20738+}
20739+
20740+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
20741+{
20742+    int i = GET_CABAC_LC(elem_offset[MERGE_IDX]);
20743+
20744+    if (i != 0) {
20745+        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc))
20746+            i++;
20747+    }
20748+    return i;
20749+}
20750+
20751+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)
20752+{
20753+    if (nPbW + nPbH == 12)
20754+        return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
20755+    if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth))
20756+        return PRED_BI;
20757+
20758+    return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
20759+}
20760+
20761+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)
20762+{
20763+    int i = 0;
20764+    int max = num_ref_idx_lx - 1;
20765+    int max_ctx = FFMIN(max, 2);
20766+
20767+    while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i))
20768+        i++;
20769+    if (i == 2) {
20770+        while (i < max && get_cabac_bypass(&lc->cc))
20771+            i++;
20772+    }
20773+
20774+    return i;
20775+}
20776+
20777+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)
20778+{
20779+    return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]);
20780+}
20781+
20782+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)
20783+{
20784+    return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
20785+}
20786+
20787+#if !USE_BY22
20788+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)
20789+{
20790+    int ret = 2;
20791+    int k = 1;
20792+
20793+    while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
20794+        ret += 1U << k;
20795+        k++;
20796+    }
20797+    if (k == CABAC_MAX_BIN) {
20798+        av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
20799+        return 0;
20800+    }
20801+
20802+    while (k--)
20803+        ret += get_cabac_bypass(&lc->cc) << k;
20804+    return get_cabac_bypass_sign(&lc->cc, -ret);
20805+}
20806+#endif
20807+
20808+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)
20809+{
20810+    return get_cabac_bypass_sign(&lc->cc, -1);
20811+}
20812+
20813+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
20814+{
20815+    return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
20816+}
20817+
20818+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
20819+{
20820+    return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
20821+}
20822+
20823+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
20824+{
20825+    return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
20826+}
20827+
20828+
20829+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) {
20830+    int i =0;
20831+
20832+    while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i))
20833+        i++;
20834+
20835+    return i;
20836+}
20837+
20838+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
20839+                                                   int log2_size, int *last_scx_prefix, int *last_scy_prefix)
20840+{
20841+    int i = 0;
20842+    int max = (log2_size << 1) - 1;
20843+    int ctx_offset, ctx_shift;
20844+
20845+    if (!c_idx_nz) {
20846+        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
20847+        ctx_shift = (log2_size + 1) >> 2;
20848+    } else {
20849+        ctx_offset = 15;
20850+        ctx_shift = log2_size - 2;
20851+    }
20852+    while (i < max &&
20853+           GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
20854+        i++;
20855+    *last_scx_prefix = i;
20856+
20857+    i = 0;
20858+    while (i < max &&
20859+           GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
20860+        i++;
20861+    *last_scy_prefix = i;
20862+}
20863+
20864+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
20865+                                                 int last_significant_coeff_prefix)
20866+{
20867+    int i;
20868+    int length = (last_significant_coeff_prefix >> 1) - 1;
20869+    int value = get_cabac_bypass(&lc->cc);
20870+
20871+    for (i = 1; i < length; i++)
20872+        value = (value << 1) | get_cabac_bypass(&lc->cc);
20873+    return value;
20874+}
20875+
20876+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
20877+{
20878+    int inc;
20879+
20880+    inc = (ctx_cg != 0) + (c_idx_nz << 1);
20881+
20882+    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
20883+}
20884+
20885+static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)
20886+{
20887+    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
20888+}
20889+
20890+#if !USE_BY22
20891+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
20892+#endif
20893+
20894+
20895+#ifndef coeff_abs_level_remaining_decode_bypass
20896+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)
20897+{
20898+    uint32_t y;
20899+    unsigned int prefix;
20900+    unsigned int last_coeff_abs_level_remaining;
20901+    unsigned int n;
20902+
20903+    y = get_cabac_by22_peek(c);
20904+    prefix = hevc_clz32(~y);
20905+    // y << prefix will always have top bit 0
20906+
20907+    if (prefix < 3) {
20908+        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
20909+        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
20910+        n = prefix + 1 + rice_param;
20911+    }
20912+    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
20913+    {
20914+        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
20915+
20916+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
20917+        n = prefix * 2 + rice_param - 2;
20918+    }
20919+    else {
20920+        unsigned int suffix;
20921+
20922+        get_cabac_by22_flush(c, prefix, y);
20923+        y = get_cabac_by22_peek(c);
20924+
20925+        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
20926+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
20927+        n = prefix + rice_param - 2;
20928+    }
20929+
20930+    get_cabac_by22_flush(c, n, y);
20931+
20932+    return last_coeff_abs_level_remaining;
20933+}
20934+#endif
20935+
20936+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)
20937+{
20938+    int prefix = 0;
20939+    int suffix = 0;
20940+    int last_coeff_abs_level_remaining;
20941+    int i;
20942+
20943+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
20944+        prefix++;
20945+    if (prefix == CABAC_MAX_BIN) {
20946+//        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
20947+        return 0;
20948+    }
20949+
20950+    if (prefix < 3) {
20951+        for (i = 0; i < rc_rice_param; i++)
20952+            suffix = (suffix << 1) | get_cabac_bypass(c);
20953+        last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
20954+    } else {
20955+        int prefix_minus3 = prefix - 3;
20956+        for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
20957+            suffix = (suffix << 1) | get_cabac_bypass(c);
20958+        last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
20959+                                              << rc_rice_param) + suffix;
20960+    }
20961+
20962+    return last_coeff_abs_level_remaining;
20963+}
20964+
20965+#if !USE_BY22
20966+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
20967+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)
20968+{
20969+    unsigned int i;
20970+    uint32_t ret = 0;
20971+
20972+    for (i = 0; i < nb; i++)
20973+        ret = (ret << 1) | get_cabac_bypass(c);
20974+
20975+    return ret << (32 - nb);
20976+}
20977+#endif
20978+
20979+#ifndef coeff_sign_flag_decode_bypass
20980+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)
20981+{
20982+    uint32_t y;
20983+    y = get_cabac_by22_peek(c);
20984+    get_cabac_by22_flush(c, nb, y);
20985+    return y & ~(0xffffffffU >> nb);
20986+}
20987+#endif
20988+
20989+
20990+#ifndef get_cabac_greater1_bits
20991+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
20992+    uint8_t * const state0)
20993+{
20994+    unsigned int i;
20995+    unsigned int rv = 0;
20996+    for (i = 0; i != n; ++i) {
20997+        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
20998+        const unsigned int b = get_cabac(c, state0 + idx);
20999+        rv = (rv << 1) | b;
21000+    }
21001+    return rv;
21002+}
21003+#endif
21004+
21005+
21006+// N.B. levels returned are the values assuming coeff_abs_level_remaining
21007+// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
21008+// this version of events.
21009+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
21010+    int * const pprev_subset_coded, int * const psum,
21011+    const unsigned int idx0_gt1, const unsigned int idx_gt2)
21012+{
21013+    CABACContext * const c = &lc->cc;
21014+    uint8_t * const state0 = lc->cabac_state + idx0_gt1;
21015+    uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
21016+    unsigned int rv;
21017+    unsigned int i;
21018+    const unsigned int n = FFMIN(n_end, 8);
21019+
21020+    // Really this is i != n but the simple unconditional loop is cheaper
21021+    // and faster
21022+    for (i = 0; i != 8; ++i)
21023+        levels[i] = 1;
21024+
21025+    rv = get_cabac_greater1_bits(c, n, state0);
21026+
21027+    *pprev_subset_coded = 0;
21028+    *psum = n;
21029+
21030+    rv <<= (32 - n);
21031+    if (rv != 0)
21032+    {
21033+        *pprev_subset_coded = 1;
21034+        *psum = n + 1;
21035+        i = hevc_clz32(rv);
21036+        levels[i] = 2;
21037+        if (get_cabac(c, state_gt2) == 0)
21038+        {
21039+            // Unset first coded bit
21040+            rv &= ~(0x80000000U >> i);
21041+        }
21042+    }
21043+
21044+    if (n_end > 8) {
21045+        const unsigned int g8 = n_end - 8;
21046+        rv |= ((1 << g8) - 1) << (24 - g8);
21047+        for (i = 0; i != g8; ++i) {
21048+            levels[i + 8] = 0;
21049+        }
21050+    }
21051+
21052+    return rv;
21053+}
21054+
21055+// extended_precision_processing_flag must be false given we are
21056+// putting the result into a 16-bit array
21057+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
21058+// scale_m is uint8_t
21059+//
21060+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
21061+//   or it can be 2 (if we have transquant_bypass)
21062+// shift is set to one less than we really want but would normally be
21063+//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
21064+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
21065+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
21066+// to achieve it
21067+
21068+#ifndef trans_scale_sat
21069+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
21070+{
21071+    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
21072+}
21073+#endif
21074+
21075+
21076+#ifndef update_rice
21077+static inline void update_rice(uint8_t * const stat_coeff,
21078+    const unsigned int last_coeff_abs_level_remaining,
21079+    const unsigned int c_rice_param)
21080+{
21081+    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
21082+    if (x >= 6)
21083+        (*stat_coeff)++;
21084+    else if (x == 0 && *stat_coeff > 0)
21085+        (*stat_coeff)--;
21086+}
21087+#endif
21088+
21089+
21090+// n must be > 0 on entry
21091+#ifndef get_cabac_sig_coeff_flag_idxs
21092+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
21093+    unsigned int n,
21094+    const uint8_t const * ctx_map,
21095+    uint8_t * p)
21096+{
21097+    do {
21098+        if (get_cabac(c, state0 + ctx_map[n]))
21099+            *p++ = n;
21100+    } while (--n != 0);
21101+    return p;
21102+}
21103+#endif
21104+
21105+
21106+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
21107+    unsigned int n,
21108+    const uint8_t * ctx_map,  // const ptr here but not in asm
21109+    uint8_t * const flag_idx)
21110+{
21111+    int rv;
21112+
21113+    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
21114+
21115+    return rv;
21116+}
21117+
21118+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
21119+     x0,  x1,  x2,  x3,\
21120+     x4,  x5,  x6,  x7,\
21121+     x8,  x9, x10, x11,\
21122+    x12, x13, x14, x15}
21123+
21124+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
21125+     x0,  x4,  x8, x12,\
21126+     x1,  x5,  x9, x13,\
21127+     x2,  x6, x10, x14,\
21128+     x3,  x7, x11, x15}
21129+
21130+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
21131+     x0,  x4,  x1,  x8,\
21132+     x5,  x2, x12,  x9,\
21133+     x6,  x3, x13, x10,\
21134+     x7, x14, x11, x15}
21135+
21136+
21137+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
21138+    uint8_t * const significant_coeff_group_flag,
21139+    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
21140+    int * const pPrev_sig)
21141+{
21142+    while (--i >= 0) {
21143+        uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
21144+        const unsigned int x_cg = scan_x_cg[i];
21145+
21146+        // For the flag decode we only care about Z/NZ but
21147+        // we use the full Right * 2 + Down when calculating
21148+        // significant coeff flags so we obtain it here.
21149+        //
21150+        // The group flag array is one longer than it needs to
21151+        // be so we don't need to check for y_cg limits
21152+        const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
21153+
21154+        if (i == 0 ||
21155+            significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig))
21156+        {
21157+            gf_y[0] |= (1 << x_cg);
21158+            *pPrev_sig = prev_sig;
21159+            break;
21160+        }
21161+    }
21162+
21163+    return i;
21164+}
21165+
21166+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
21167+    const unsigned int log2_trafo_size, const unsigned int c_idx,
21168+    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
21169+{
21170+    const AVFrame * const frame = s->frame;
21171+    const unsigned int stride = frame_stride1(s->frame, c_idx);
21172+    const unsigned int x = x0 >> ctx_hshift(s, c_idx);
21173+    const unsigned int y = y0 >> ctx_vshift(s, c_idx);
21174+    const int is_sliced = 1;  // av_rpi_is_sand_frame(frame);
21175+    uint8_t * const dst = !is_sliced ?
21176+            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
21177+        c_idx == 0 ?
21178+            av_rpi_sand_frame_pos_y(frame, x, y) :
21179+            av_rpi_sand_frame_pos_c(frame, x, y);
21180+
21181+    const unsigned int i = jb->intra.n;
21182+    HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
21183+
21184+    if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
21185+        pc->ta.dst == dst)
21186+    {
21187+        av_assert1(pc->size == log2_trafo_size &&
21188+                   pc->c_idx == 1 &&
21189+                   pc->ta.stride == stride);
21190+
21191+        pc->type = RPI_PRED_ADD_RESIDUAL_C;
21192+    }
21193+    else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
21194+        pc->dc.dst == dst)
21195+    {
21196+        const int16_t dc = (int16_t)pc->dc.dc;  // Discard top bits
21197+        av_assert1(pc->size == log2_trafo_size &&
21198+                   pc->c_idx == 1 &&
21199+                   pc->dc.stride == stride);
21200+
21201+        // Rewrite as add residual - must rewrite all fields as different union member
21202+        pc->type = RPI_PRED_ADD_RESIDUAL_V;
21203+        pc->ta.buf = coeffs;
21204+        pc->ta.dst = dst;
21205+        pc->ta.stride = stride;
21206+        pc->ta.dc = dc;
21207+    }
21208+    else
21209+    {
21210+        HEVCPredCmd * const cmd = pc + 1;
21211+        jb->intra.n = i + 1;
21212+
21213+        cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
21214+        cmd->size = log2_trafo_size;
21215+        cmd->ta.buf = coeffs;
21216+        cmd->ta.dst = dst;
21217+        cmd->ta.stride = stride;
21218+        cmd->ta.dc = 0;
21219+    }
21220+}
21221+
21222+
21223+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
21224+    const unsigned int log2_trafo_size, const unsigned int c_idx,
21225+    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
21226+{
21227+    const AVFrame * const frame = s->frame;
21228+    const unsigned int stride = frame_stride1(s->frame, c_idx);
21229+    const unsigned int x = x0 >> ctx_hshift(s, c_idx);
21230+    const unsigned int y = y0 >> ctx_vshift(s, c_idx);
21231+    const int is_sliced = 1;
21232+    uint8_t * const dst = !is_sliced ?
21233+            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
21234+        c_idx == 0 ?
21235+            av_rpi_sand_frame_pos_y(frame, x, y) :
21236+            av_rpi_sand_frame_pos_c(frame, x, y);
21237+
21238+    const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
21239+    const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
21240+
21241+    const unsigned int i = jb->intra.n;
21242+    HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
21243+
21244+    if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
21245+        pc->ta.dst == dst)
21246+    {
21247+        av_assert1(pc->size == log2_trafo_size &&
21248+                   pc->c_idx == 1 &&
21249+                   pc->ta.stride == stride);
21250+
21251+        pc->ta.dc = (int16_t)coeff;
21252+    }
21253+    else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
21254+        pc->dc.dst == dst)
21255+    {
21256+        av_assert1(pc->size == log2_trafo_size &&
21257+                   pc->c_idx == 1 &&
21258+                   pc->dc.stride == stride &&
21259+                   (pc->dc.dc & ~0xffff) == 0);
21260+
21261+        pc->dc.dc |= (coeff << 16);
21262+    }
21263+    else
21264+    {
21265+        HEVCPredCmd * const cmd = pc + 1;
21266+        jb->intra.n = i + 1;
21267+
21268+        cmd->type = RPI_PRED_ADD_DC + c_idx;
21269+        cmd->size = log2_trafo_size;
21270+        cmd->dc.dst = dst;
21271+        cmd->dc.stride = stride;
21272+        cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
21273+    }
21274+}
21275+
21276+
21277+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
21278+                                const int x0, const int y0,
21279+                                const int log2_trafo_size, const enum ScanType scan_idx,
21280+                                const int c_idx)
21281+{
21282+    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
21283+
21284+    int last_significant_coeff_x, last_significant_coeff_y;
21285+    int num_coeff = 0;
21286+    int prev_subset_coded = 0;
21287+
21288+    int num_last_subset;
21289+    int x_cg_last_sig, y_cg_last_sig;
21290+
21291+    const uint8_t *scan_x_cg, *scan_y_cg;
21292+    const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
21293+
21294+    int use_vpu;
21295+#if RPI_COMPRESS_COEFFS
21296+    int num_nonzero = 0;
21297+    int use_compress = 0;
21298+    int *coeffs32;
21299+#endif
21300+    int use_dc = 0;
21301+    int16_t *coeffs;
21302+    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
21303+    int explicit_rdpcm_flag = 0;
21304+    int explicit_rdpcm_dir_flag;
21305+
21306+    int i;
21307+    int shift,scale;
21308+    const uint8_t *scale_matrix = NULL;
21309+    uint8_t dc_scale;
21310+    const int c_idx_nz = (c_idx != 0);
21311+    const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
21312+    int prev_sig = 0;
21313+    int may_hide_sign;
21314+
21315+    int16_t dummy_coeffs[16];
21316+
21317+    // Derive QP for dequant
21318+    if (!lc->cu.cu_transquant_bypass_flag) {
21319+        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
21320+
21321+        if (s->ps.pps->transform_skip_enabled_flag &&
21322+            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
21323+            int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
21324+            if (transform_skip_flag) {
21325+                trans_skip_or_bypass = 1;
21326+                if (lc->cu.pred_mode ==  MODE_INTRA  &&
21327+                    s->ps.sps->implicit_rdpcm_enabled_flag &&
21328+                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
21329+                    may_hide_sign = 0;
21330+                }
21331+            }
21332+        }
21333+
21334+        {
21335+            static const uint8_t level_scale[8] = {
21336+                40, 45, 51, 57, 64, 72, 0, 0  // Pad to 8
21337+            };
21338+            const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y];
21339+
21340+            // Shift is set to one less than will actually occur as the scale
21341+            // and saturate step adds 1 and then shifts right again
21342+            scale = level_scale[qp6 & 7];
21343+//            shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
21344+            shift = log2_trafo_size - (qp6 >> 3);
21345+
21346+            if (shift < 0) {
21347+                scale <<= -shift;
21348+                shift = 0;
21349+            }
21350+        }
21351+
21352+        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
21353+            const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
21354+                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
21355+            const unsigned int matrix_id =
21356+                lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
21357+
21358+            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
21359+            dc_scale = scale_matrix[0];
21360+            if (log2_trafo_size >= 4)
21361+                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
21362+        }
21363+        else
21364+        {
21365+            static const uint8_t sixteen_scale[64] = {
21366+                16, 16, 16, 16, 16, 16, 16, 16,
21367+                16, 16, 16, 16, 16, 16, 16, 16,
21368+                16, 16, 16, 16, 16, 16, 16, 16,
21369+                16, 16, 16, 16, 16, 16, 16, 16,
21370+                16, 16, 16, 16, 16, 16, 16, 16,
21371+                16, 16, 16, 16, 16, 16, 16, 16,
21372+                16, 16, 16, 16, 16, 16, 16, 16,
21373+                16, 16, 16, 16, 16, 16, 16, 16
21374+            };
21375+            scale_matrix = sixteen_scale;
21376+            dc_scale = 16;
21377+        }
21378+    } else {
21379+        static const uint8_t unit_scale[64] = {
21380+            1, 1, 1, 1, 1, 1, 1, 1,
21381+            1, 1, 1, 1, 1, 1, 1, 1,
21382+            1, 1, 1, 1, 1, 1, 1, 1,
21383+            1, 1, 1, 1, 1, 1, 1, 1,
21384+            1, 1, 1, 1, 1, 1, 1, 1,
21385+            1, 1, 1, 1, 1, 1, 1, 1,
21386+            1, 1, 1, 1, 1, 1, 1, 1,
21387+            1, 1, 1, 1, 1, 1, 1, 1,
21388+        };
21389+        scale_matrix = unit_scale;
21390+        shift        = 0;
21391+        scale        = 2;  // We will shift right to kill this
21392+        dc_scale     = 1;
21393+
21394+        may_hide_sign = 0;
21395+    }
21396+
21397+
21398+
21399+
21400+    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
21401+        trans_skip_or_bypass) {
21402+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
21403+        if (explicit_rdpcm_flag) {
21404+            may_hide_sign = 0;
21405+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
21406+        }
21407+    }
21408+
21409+    last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
21410+                                           &last_significant_coeff_x, &last_significant_coeff_y);
21411+
21412+    if (last_significant_coeff_x > 3) {
21413+        int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
21414+        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
21415+        (2 + (last_significant_coeff_x & 1)) +
21416+        suffix;
21417+    }
21418+
21419+    if (last_significant_coeff_y > 3) {
21420+        int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
21421+        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
21422+        (2 + (last_significant_coeff_y & 1)) +
21423+        suffix;
21424+    }
21425+
21426+    if (scan_idx == SCAN_VERT)
21427+        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
21428+
21429+    x_cg_last_sig = last_significant_coeff_x >> 2;
21430+    y_cg_last_sig = last_significant_coeff_y >> 2;
21431+
21432+    switch (scan_idx) {
21433+    case SCAN_DIAG: {
21434+        int last_x_c = last_significant_coeff_x & 3;
21435+        int last_y_c = last_significant_coeff_y & 3;
21436+
21437+        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
21438+
21439+        switch (log2_trafo_size) {
21440+        case 2:
21441+            scan_x_cg = scan_1x1;
21442+            scan_y_cg = scan_1x1;
21443+            break;
21444+        case 3:
21445+            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
21446+            scan_x_cg = diag_scan2x2_x;
21447+            scan_y_cg = diag_scan2x2_y;
21448+            break;
21449+        case 4:
21450+            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
21451+            scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
21452+            scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
21453+            break;
21454+        case 5:
21455+        default:
21456+            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
21457+            scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
21458+            scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
21459+            break;
21460+        }
21461+        break;
21462+    }
21463+    case SCAN_HORIZ:
21464+        scan_x_cg = horiz_scan2x2_x;
21465+        scan_y_cg = horiz_scan2x2_y;
21466+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
21467+        break;
21468+    default: //SCAN_VERT
21469+        scan_x_cg = horiz_scan2x2_y;
21470+        scan_y_cg = horiz_scan2x2_x;
21471+        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
21472+        break;
21473+    }
21474+    num_coeff++;
21475+    num_last_subset = (num_coeff - 1) >> 4;
21476+
21477+    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
21478+
21479+    {
21480+        const unsigned int ccount = 1 << (log2_trafo_size * 2);
21481+        const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */;  // These need special processing
21482+        use_vpu = 0;
21483+        use_dc = (num_coeff == 1) && !special &&
21484+            !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
21485+
21486+        if (use_dc) {
21487+            // Just need a little empty space
21488+            coeffs = dummy_coeffs;
21489+            // No need to clear
21490+        }
21491+        else
21492+        {
21493+            use_vpu = !special && log2_trafo_size >= 4;
21494+#if RPI_COMPRESS_COEFFS
21495+            use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
21496+#endif
21497+            coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
21498+#if RPI_COMPRESS_COEFFS
21499+            coeffs32 = (int*)coeffs;
21500+            if (!use_compress)
21501+#endif
21502+#if HAVE_NEON
21503+            rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
21504+#else
21505+            memset(coeffs, 0, ccount * sizeof(int16_t));
21506+#endif
21507+        }
21508+    }
21509+
21510+    i = num_last_subset;
21511+    do {
21512+        int implicit_non_zero_coeff = 0;
21513+        int n_end;
21514+
21515+        uint8_t significant_coeff_flag_idx[16];
21516+        unsigned int nb_significant_coeff_flag = 0;
21517+
21518+        if (i == num_last_subset) {
21519+            // First time through
21520+            int last_scan_pos = num_coeff - (i << 4) - 1;
21521+            n_end = last_scan_pos - 1;
21522+            significant_coeff_flag_idx[0] = last_scan_pos;
21523+            nb_significant_coeff_flag = 1;
21524+        } else {
21525+            n_end = 15;
21526+            implicit_non_zero_coeff = (i != 0);
21527+        }
21528+
21529+        if (n_end >= 0) {
21530+            static const uint8_t ctx_idx_maps_ts2[3][16] = {
21531+                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
21532+                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
21533+                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
21534+            };
21535+            // N.B. prev_sig = Right * 2 + Down
21536+            static const uint8_t ctx_idx_maps[3][4][16] = {
21537+                {
21538+                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
21539+                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
21540+                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
21541+                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
21542+                },
21543+                {
21544+                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
21545+                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
21546+                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
21547+                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
21548+                },
21549+                {
21550+                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
21551+                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
21552+                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
21553+                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
21554+                }
21555+            };
21556+            const uint8_t *ctx_idx_map_p;
21557+            int scf_offset = 0;
21558+
21559+            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
21560+                ctx_idx_map_p = ctx_idx_maps[0][3];
21561+                scf_offset = 40 + c_idx_nz;
21562+            } else {
21563+                if (c_idx_nz != 0)
21564+                    scf_offset = 27;
21565+
21566+                if (log2_trafo_size == 2) {
21567+                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
21568+                } else {
21569+                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
21570+                    if (!c_idx_nz) {
21571+                        if (i != 0)
21572+                            scf_offset += 3;
21573+
21574+                        if (log2_trafo_size == 3) {
21575+                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
21576+                        } else {
21577+                            scf_offset += 21;
21578+                        }
21579+                    } else {
21580+                        if (log2_trafo_size == 3)
21581+                            scf_offset += 9;
21582+                        else
21583+                            scf_offset += 12;
21584+                    }
21585+                }
21586+            }
21587+
21588+            if (n_end > 0) {
21589+                int cnt = get_sig_coeff_flag_idxs(&lc->cc,
21590+                    lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
21591+                    n_end, ctx_idx_map_p,
21592+                    significant_coeff_flag_idx + nb_significant_coeff_flag);
21593+
21594+                nb_significant_coeff_flag += cnt;
21595+                if (cnt != 0) {
21596+                    implicit_non_zero_coeff = 0;
21597+                }
21598+            }
21599+
21600+            if (implicit_non_zero_coeff == 0) {
21601+                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
21602+                    scf_offset = 42 + c_idx_nz;
21603+                } else {
21604+                    if (i == 0) {
21605+                        scf_offset = c_idx_nz ? 27 : 0;
21606+                    } else {
21607+                        scf_offset = 2 + scf_offset;
21608+                    }
21609+                }
21610+                if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
21611+                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
21612+                    nb_significant_coeff_flag++;
21613+                }
21614+            } else {
21615+                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
21616+                nb_significant_coeff_flag++;
21617+            }
21618+        }
21619+#if RPI_COMPRESS_COEFFS
21620+        if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
21621+          int16_t temp[32*32];
21622+          const unsigned int ccount = 1 << (log2_trafo_size * 2);
21623+          lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
21624+          lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
21625+          memcpy(temp, coeffs, sizeof(int)*num_nonzero);
21626+          coeffs32 = (int *)temp;
21627+          memset(coeffs, 0, ccount * sizeof(int16_t));
21628+          num_nonzero--;
21629+          while (num_nonzero >= 0) {
21630+            const unsigned int res = coeffs32[num_nonzero];
21631+            const unsigned int offset = res & 0xffff;
21632+            coeffs[ offset ] = res >> 16;
21633+            num_nonzero--;
21634+          }
21635+          use_compress = 0;
21636+        }
21637+#endif
21638+
21639+        if (nb_significant_coeff_flag != 0) {
21640+            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
21641+                ((i != 0 && !c_idx_nz) ? 2 : 0) |
21642+                prev_subset_coded;
21643+            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
21644+                (gt1_idx_delta << 2);
21645+            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
21646+                gt1_idx_delta;
21647+
21648+            const unsigned int x_cg = scan_x_cg[i];
21649+            const unsigned int y_cg = scan_y_cg[i];
21650+            int16_t * const blk_coeffs = coeffs +
21651+                ((x_cg + (y_cg << log2_trafo_size)) << 2);
21652+            // This calculation is 'wrong' for log2_trafo_size == 2
21653+            // but that doesn't matter as in this case x_cg & y_cg
21654+            // are always 0 so result is correct (0) anyway
21655+            const uint8_t * const blk_scale = scale_matrix +
21656+                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
21657+
21658+            // * The following code block doesn't deal with these flags:
21659+            //   (nor did the one it replaces)
21660+            //
21661+            // cabac_bypass_alignment_enabled_flag
21662+            //    This should be easy but I can't find a test case
21663+            // extended_precision_processing_flag
21664+            //    This can extend the required precision past 16bits
21665+            //    so is probably tricky - also no example found yet
21666+
21667+#if USE_N_END_1
21668+            if (nb_significant_coeff_flag == 1) {
21669+                // There is a small gain to be had from special casing the single
21670+                // transform coefficient case.  The reduction in complexity
21671+                // makes up for the code duplication.
21672+
21673+                int trans_coeff_level = 1;
21674+                int coeff_sign_flag;
21675+                int coded_val = 0;
21676+
21677+                // initialize first elem of coeff_abs_level_greater1_flag
21678+                prev_subset_coded = 0;
21679+
21680+                if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
21681+                    trans_coeff_level = 2;
21682+                    prev_subset_coded = 1;
21683+                    coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
21684+                }
21685+
21686+                // Probably not worth the overhead of starting by22 for just one value
21687+                coeff_sign_flag = get_cabac_bypass(&lc->cc);
21688+
21689+                if (coded_val)
21690+                {
21691+                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
21692+                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
21693+                    } else {
21694+                        uint8_t * const stat_coeff =
21695+                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
21696+                        const unsigned int c_rice_param = *stat_coeff >> 2;
21697+                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
21698+
21699+                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
21700+                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
21701+                    }
21702+                }
21703+
21704+                {
21705+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
21706+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
21707+                    const unsigned int scale_m = blk_scale[xy_off->scale];
21708+                    const int res = trans_scale_sat(
21709+                        (trans_coeff_level ^ k) - k,  // Apply sign
21710+                        scale,
21711+                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
21712+                        shift);
21713+#if RPI_COMPRESS_COEFFS
21714+                      if (use_compress)
21715+                        coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
21716+                      else
21717+#endif
21718+                      blk_coeffs[xy_off->coeff] = res;
21719+                }
21720+            }
21721+            else
21722+#endif
21723+            {
21724+                int sign_hidden = may_hide_sign;
21725+                int levels[16]; // Should be able to get away with int16_t but that fails some tests
21726+                uint32_t coeff_sign_flags;
21727+                uint32_t coded_vals = 0;
21728+                // Sum(abs(level[]))
21729+                // In fact we only need the bottom bit and in some future
21730+                // version that may be all we calculate
21731+                unsigned int sum_abs;
21732+
21733+                coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
21734+                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
21735+
21736+                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
21737+                    sign_hidden = 0;
21738+
21739+                // -- Start bypass block
21740+
21741+                bypass_start(&lc->cc);
21742+
21743+                coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
21744+
21745+                if (coded_vals != 0)
21746+                {
21747+                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
21748+                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
21749+                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
21750+                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
21751+                    int * level = levels - 1;
21752+
21753+                    do {
21754+                        {
21755+                            const unsigned int z = hevc_clz32(coded_vals) + 1;
21756+                            level += z;
21757+                            coded_vals <<= z;
21758+                        }
21759+
21760+                        {
21761+                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
21762+                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
21763+
21764+                            sum_abs += last_coeff_abs_level_remaining + 1;
21765+                            *level = trans_coeff_level;
21766+
21767+                            if (stat_coeff != NULL)
21768+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
21769+                            stat_coeff = NULL;
21770+
21771+                            if (trans_coeff_level > (3 << c_rice_param) &&
21772+                                (c_rice_param < 4 || rice_adaptation_enabled))
21773+                                ++c_rice_param;
21774+                        }
21775+                    } while (coded_vals != 0);
21776+                }
21777+
21778+                // sign_hidden = 0 or 1 so we can combine the tests
21779+                if ((sign_hidden & sum_abs) != 0) {
21780+                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
21781+                }
21782+
21783+                bypass_finish(&lc->cc);
21784+
21785+                // -- Finish bypass block
21786+
21787+                // Scale loop
21788+                {
21789+                    int m = nb_significant_coeff_flag - 1;
21790+
21791+                    // Deal with DC component (if any) first
21792+                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
21793+                    {
21794+                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
21795+                        const int res = trans_scale_sat(
21796+                            (levels[m] ^ k) - k, scale, dc_scale, shift);
21797+#if RPI_COMPRESS_COEFFS
21798+                        if (use_compress)
21799+                        {
21800+                            coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
21801+                        }
21802+                        else
21803+#endif
21804+                        {
21805+                            blk_coeffs[0] = res;
21806+                        }
21807+                        --m;
21808+                    }
21809+
21810+#if !USE_N_END_1
21811+                    // If N_END_1 set then m was at least 1 initially
21812+                    if (m >= 0)
21813+#endif
21814+                    {
21815+                        do {
21816+                            const xy_off_t * const xy_off = scan_xy_off +
21817+                                significant_coeff_flag_idx[m];
21818+                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
21819+                            const int res = trans_scale_sat(
21820+                                (levels[m] ^ k) - k,
21821+                                scale,
21822+                                blk_scale[xy_off->scale],
21823+                                shift);
21824+#if RPI_COMPRESS_COEFFS
21825+                            if (use_compress) {
21826+                              coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
21827+                            } else
21828+#endif
21829+                              blk_coeffs[xy_off->coeff] = res;
21830+                        } while (--m >= 0);
21831+                    }
21832+                }
21833+
21834+            }
21835+        }
21836+    } while ((i = next_subset(lc, i, c_idx_nz,
21837+                              significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
21838+             !cabac_overflow(&lc->cc));
21839+
21840+    if (lc->cu.cu_transquant_bypass_flag) {
21841+        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
21842+                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
21843+            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
21844+
21845+            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
21846+        }
21847+    } else {
21848+        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
21849+            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
21850+                      log2_trafo_size == 2 &&
21851+                      lc->cu.pred_mode == MODE_INTRA;
21852+            if (rot) {
21853+                for (i = 0; i < 8; i++)
21854+                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
21855+            }
21856+
21857+            s->hevcdsp.dequant(coeffs, log2_trafo_size);
21858+
21859+            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
21860+                                        lc->cu.pred_mode == MODE_INTRA &&
21861+                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
21862+                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
21863+
21864+                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
21865+            }
21866+        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
21867+            s->hevcdsp.transform_4x4_luma(coeffs);
21868+        }
21869+        else if (!use_vpu)
21870+        {
21871+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
21872+            if (max_xy == 0)
21873+            {
21874+                if (use_dc)
21875+                    rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
21876+                else
21877+                    s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
21878+            }
21879+            else {
21880+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
21881+                if (max_xy < 4)
21882+                    col_limit = FFMIN(4, col_limit);
21883+                else if (max_xy < 8)
21884+                    col_limit = FFMIN(8, col_limit);
21885+                else if (max_xy < 12)
21886+                    col_limit = FFMIN(24, col_limit);
21887+                s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
21888+            }
21889+        }
21890+    }
21891+
21892+#if 0
21893+    // Mildly rotted - we support no mode where cross is valid
21894+    if (lc->tu.cross_pf) {
21895+        int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
21896+        const int ccount = 1 << (log2_trafo_size * 2);
21897+
21898+        for (i = 0; i < ccount; i++) {
21899+            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
21900+        }
21901+    }
21902+#endif
21903+
21904+    if (!use_dc) {
21905+#if RPI_COMPRESS_COEFFS
21906+        if (use_compress) {
21907+          coeffs32[num_nonzero] = 0;
21908+        }
21909+#endif
21910+        rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
21911+    }
21912+}
21913+
21914+#if !USE_BY22
21915+// Stores results to lc
21916+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
21917+{
21918+    int x = abs_mvd_greater0_flag_decode(lc);
21919+    int y = abs_mvd_greater0_flag_decode(lc);
21920+
21921+    if (x)
21922+        x += abs_mvd_greater1_flag_decode(lc);
21923+    if (y)
21924+        y += abs_mvd_greater1_flag_decode(lc);
21925+
21926+    switch (x) {
21927+    case 2: x = mvd_decode(lc);           break;
21928+    case 1: x = mvd_sign_flag_decode(lc); break;
21929+    case 0: x = 0;                       break;
21930+    }
21931+
21932+    switch (y) {
21933+    case 2: y = mvd_decode(lc);           break;
21934+    case 1: y = mvd_sign_flag_decode(lc); break;
21935+    case 0: y = 0;                       break;
21936+    }
21937+    return MV_XY(x,y);
21938+}
21939+#else
21940+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
21941+{
21942+    int x = abs_mvd_greater0_flag_decode(lc);
21943+    int y = abs_mvd_greater0_flag_decode(lc);
21944+
21945+    if ((x | y) == 0)
21946+        return 0;
21947+
21948+    if (x != 0)
21949+        x += abs_mvd_greater1_flag_decode(lc);
21950+    if (y != 0)
21951+        y += abs_mvd_greater1_flag_decode(lc);
21952+
21953+    if ((x | y) == 1)
21954+    {
21955+        // Not worth starting BY22
21956+        if (x != 0)
21957+            x = mvd_sign_flag_decode(lc);
21958+        if (y != 0)
21959+            y = mvd_sign_flag_decode(lc);
21960+    }
21961+    else
21962+    {
21963+        CABACContext * const cc = &lc->cc;
21964+        uint32_t val;
21965+        uint32_t b;
21966+        unsigned int n = 0;
21967+
21968+        bypass_start(cc);
21969+        b = val = get_cabac_by22_peek(cc);
21970+
21971+        if (x == 1) {
21972+            x = ((int32_t)b >> 31) | 1;
21973+            n = 1;
21974+            b <<= 1;
21975+        }
21976+        else if (x == 2) {
21977+            // EG1 so we have (leading one bits + 1) of suffix
21978+            // This makes prefix & suffix lengths the same
21979+            const unsigned int k = hevc_clz32(~b) + 1;
21980+            int s;
21981+
21982+            av_assert2(k <= 15);
21983+
21984+            b <<= k;
21985+            n = 2 * k + 1; // Includes suffix & sign
21986+
21987+            // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
21988+            // if we are going to do this without a flush
21989+            if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
21990+            {
21991+                // Need too many bits - flush
21992+                // n = k
21993+                get_cabac_by22_flush(cc, k, val);
21994+                b = val = get_cabac_by22_peek(cc);
21995+                n = k + 1;
21996+            }
21997+
21998+            x = (b >> (32 - k)) + (1 << k);
21999+            b <<= k;
22000+            s = (int32_t)b >> 31;
22001+            x = (x ^ s) - s;
22002+            b <<= 1;
22003+
22004+            // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
22005+            if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
22006+            {
22007+                get_cabac_by22_flush(cc, n, val);
22008+                b = val = get_cabac_by22_peek(cc);
22009+                n = 0;
22010+            }
22011+        }
22012+
22013+        if (y == 1) {
22014+            y = ((int32_t)b >> 31) | 1;
22015+            ++n;
22016+            // don't care about b anymore
22017+        }
22018+        else if (y == 2) {
22019+            const unsigned int k = hevc_clz32(~b) + 1;
22020+            int s;
22021+
22022+            av_assert2(k <= 15);
22023+
22024+            // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
22025+            // if we are going to do this without a flush
22026+            b <<= k;
22027+            n += 2 * k + 1;
22028+
22029+            if (n > CABAC_BY22_PEEK_BITS)
22030+            {
22031+                // Need too many bits - flush
22032+                get_cabac_by22_flush(cc, n - (k + 1), val);
22033+                b = val = get_cabac_by22_peek(cc);
22034+                n = k + 1;
22035+            }
22036+
22037+            y = (b >> (32 - k)) + (1 << k);
22038+            s = (int32_t)(b << k) >> 31;
22039+            y = (y ^ s) - s;
22040+            // don't care about b anymore
22041+        }
22042+
22043+        get_cabac_by22_flush(cc, n, val);
22044+        bypass_finish(cc);
22045+    }
22046+
22047+    return MV_XY(x, y);
22048+}
22049+#endif
22050--- /dev/null
22051+++ b/libavcodec/rpi_hevc_cabac_fns.h
22052@@ -0,0 +1,217 @@
22053+/*
22054+ * HEVC CABAC decoding
22055+ *
22056+ * Copyright (C) 2012 - 2013 Guillaume Martres
22057+ * Copyright (C) 2012 - 2013 Gildas Cocherel
22058+ * Copyright (C) 2012 - 2013 Gildas Cocherel
22059+ * Copyright (C) 2018 John Cox
22060+ *
22061+ * This file is part of FFmpeg.
22062+ *
22063+ * FFmpeg is free software; you can redistribute it and/or
22064+ * modify it under the terms of the GNU Lesser General Public
22065+ * License as published by the Free Software Foundation; either
22066+ * version 2.1 of the License, or (at your option) any later version.
22067+ *
22068+ * FFmpeg is distributed in the hope that it will be useful,
22069+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22070+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22071+ * Lesser General Public License for more details.
22072+ *
22073+ * You should have received a copy of the GNU Lesser General Public
22074+ * License along with FFmpeg; if not, write to the Free Software
22075+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22076+ */
22077+
22078+
22079+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
22080+#define AVCODEC_RPI_HEVC_CABAC_FNS_H
22081+
22082+#include "config.h"
22083+#include "rpi_hevcdec.h"
22084+
22085+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
22086+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
22087+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
22088+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
22089+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
22090+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
22091+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
22092+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
22093+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
22094+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
22095+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
22096+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
22097+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
22098+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
22099+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
22100+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
22101+
22102+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
22103+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
22104+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
22105+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
22106+                                const int x0, const int y0,
22107+                                const int log2_trafo_size, const enum ScanType scan_idx,
22108+                                const int c_idx);
22109+
22110+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
22111+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
22112+
22113+#define HEVC_BIN_SAO_MERGE_FLAG                         0
22114+#define HEVC_BIN_SAO_TYPE_IDX                           1
22115+#define HEVC_BIN_SAO_EO_CLASS                           2
22116+#define HEVC_BIN_SAO_BAND_POSITION                      2
22117+#define HEVC_BIN_SAO_OFFSET_ABS                         2
22118+#define HEVC_BIN_SAO_OFFSET_SIGN                        2
22119+#define HEVC_BIN_END_OF_SLICE_FLAG                      2
22120+#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG                 2
22121+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG              5
22122+#define HEVC_BIN_SKIP_FLAG                              6
22123+#define HEVC_BIN_CU_QP_DELTA                            9
22124+#define HEVC_BIN_PRED_MODE                              12
22125+#define HEVC_BIN_PART_MODE                              13
22126+#define HEVC_BIN_PCM_FLAG                               17
22127+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE              17
22128+#define HEVC_BIN_MPM_IDX                                18
22129+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE               18
22130+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE                 18
22131+#define HEVC_BIN_MERGE_FLAG                             20
22132+#define HEVC_BIN_MERGE_IDX                              21
22133+#define HEVC_BIN_INTER_PRED_IDC                         22
22134+#define HEVC_BIN_REF_IDX_L0                             27
22135+#define HEVC_BIN_REF_IDX_L1                             29
22136+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG                  31
22137+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG                  33
22138+#define HEVC_BIN_ABS_MVD_MINUS2                         35
22139+#define HEVC_BIN_MVD_SIGN_FLAG                          35
22140+#define HEVC_BIN_MVP_LX_FLAG                            35
22141+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG                  36
22142+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG                   37
22143+#define HEVC_BIN_CBF_LUMA                               40
22144+#define HEVC_BIN_CBF_CB_CR                              42
22145+#define HEVC_BIN_TRANSFORM_SKIP_FLAG                    46
22146+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG                    48
22147+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG                50
22148+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX        52
22149+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX        70
22150+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX        88
22151+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX        88
22152+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG           88
22153+#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG                 92
22154+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG          136
22155+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG          160
22156+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING              166
22157+#define HEVC_BIN_COEFF_SIGN_FLAG                        166
22158+#define HEVC_BIN_LOG2_RES_SCALE_ABS                     166
22159+#define HEVC_BIN_RES_SCALE_SIGN_FLAG                    174
22160+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG               176
22161+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX                177
22162+
22163+
22164+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
22165+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
22166+
22167+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) {
22168+    const uint8_t *ptr = c->bytestream;
22169+
22170+    if (c->low & 0x1)
22171+        ptr--;
22172+#if CABAC_BITS == 16
22173+    if (c->low & 0x1FF)
22174+        ptr--;
22175+#endif
22176+    if ((int) (c->bytestream_end - ptr) < n)
22177+        return NULL;
22178+    if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
22179+        return NULL;
22180+
22181+    return ptr;
22182+}
22183+
22184+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
22185+{
22186+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG);
22187+}
22188+
22189+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
22190+{
22191+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG);
22192+}
22193+
22194+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
22195+{
22196+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG);
22197+}
22198+
22199+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
22200+                                                            const unsigned int ct_depth,
22201+                                                            const unsigned int x0, const unsigned int y0)
22202+{
22203+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG +
22204+                                 ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) +
22205+                                 ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth));
22206+}
22207+
22208+static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
22209+                             const int x0, const int y0, const int x_cb, const int y_cb)
22210+{
22211+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG +
22212+                                 (s->cabac_stash_left[y0 >> 3] & 1) +
22213+                                 (s->cabac_stash_up[x0 >> 3] & 1));
22214+}
22215+
22216+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
22217+{
22218+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE);
22219+}
22220+
22221+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
22222+{
22223+    return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
22224+}
22225+
22226+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
22227+{
22228+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE);
22229+}
22230+
22231+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
22232+{
22233+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG);
22234+}
22235+
22236+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
22237+{
22238+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
22239+}
22240+
22241+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
22242+{
22243+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
22244+}
22245+
22246+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
22247+{
22248+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
22249+}
22250+
22251+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
22252+{
22253+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
22254+}
22255+
22256+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
22257+{
22258+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
22259+}
22260+
22261+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
22262+{
22263+    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
22264+}
22265+
22266+
22267+
22268+#endif
22269+
22270--- /dev/null
22271+++ b/libavcodec/rpi_hevc_data.c
22272@@ -0,0 +1,75 @@
22273+/*
22274+ * HEVC shared tables
22275+ *
22276+ * This file is part of FFmpeg.
22277+ *
22278+ * FFmpeg is free software; you can redistribute it and/or
22279+ * modify it under the terms of the GNU Lesser General Public
22280+ * License as published by the Free Software Foundation; either
22281+ * version 2.1 of the License, or (at your option) any later version.
22282+ *
22283+ * FFmpeg is distributed in the hope that it will be useful,
22284+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22285+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22286+ * Lesser General Public License for more details.
22287+ *
22288+ * You should have received a copy of the GNU Lesser General Public
22289+ * License along with FFmpeg; if not, write to the Free Software
22290+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22291+ */
22292+
22293+#include <stdint.h>
22294+
22295+#include "rpi_hevc_data.h"
22296+
22297+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
22298+    0, 0, 1, 0,
22299+    1, 2, 0, 1,
22300+    2, 3, 1, 2,
22301+    3, 2, 3, 3,
22302+};
22303+
22304+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
22305+    0, 1, 0, 2,
22306+    1, 0, 3, 2,
22307+    1, 0, 3, 2,
22308+    1, 3, 2, 3,
22309+};
22310+
22311+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
22312+    0, 0, 1, 0,
22313+    1, 2, 0, 1,
22314+    2, 3, 0, 1,
22315+    2, 3, 4, 0,
22316+    1, 2, 3, 4,
22317+    5, 0, 1, 2,
22318+    3, 4, 5, 6,
22319+    0, 1, 2, 3,
22320+    4, 5, 6, 7,
22321+    1, 2, 3, 4,
22322+    5, 6, 7, 2,
22323+    3, 4, 5, 6,
22324+    7, 3, 4, 5,
22325+    6, 7, 4, 5,
22326+    6, 7, 5, 6,
22327+    7, 6, 7, 7,
22328+};
22329+
22330+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
22331+    0, 1, 0, 2,
22332+    1, 0, 3, 2,
22333+    1, 0, 4, 3,
22334+    2, 1, 0, 5,
22335+    4, 3, 2, 1,
22336+    0, 6, 5, 4,
22337+    3, 2, 1, 0,
22338+    7, 6, 5, 4,
22339+    3, 2, 1, 0,
22340+    7, 6, 5, 4,
22341+    3, 2, 1, 7,
22342+    6, 5, 4, 3,
22343+    2, 7, 6, 5,
22344+    4, 3, 7, 6,
22345+    5, 4, 7, 6,
22346+    5, 7, 6, 7,
22347+};
22348--- /dev/null
22349+++ b/libavcodec/rpi_hevc_data.h
22350@@ -0,0 +1,31 @@
22351+/*
22352+ * HEVC shared data tables
22353+ *
22354+ * This file is part of FFmpeg.
22355+ *
22356+ * FFmpeg is free software; you can redistribute it and/or
22357+ * modify it under the terms of the GNU Lesser General Public
22358+ * License as published by the Free Software Foundation; either
22359+ * version 2.1 of the License, or (at your option) any later version.
22360+ *
22361+ * FFmpeg is distributed in the hope that it will be useful,
22362+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22363+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22364+ * Lesser General Public License for more details.
22365+ *
22366+ * You should have received a copy of the GNU Lesser General Public
22367+ * License along with FFmpeg; if not, write to the Free Software
22368+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22369+ */
22370+
22371+#ifndef AVCODEC_RPI_HEVC_DATA_H
22372+#define AVCODEC_RPI_HEVC_DATA_H
22373+
22374+#include <stdint.h>
22375+
22376+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
22377+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
22378+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
22379+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
22380+
22381+#endif /* AVCODEC_RPI_HEVC_DATA_H */
22382--- /dev/null
22383+++ b/libavcodec/rpi_hevc_filter.c
22384@@ -0,0 +1,1210 @@
22385+/*
22386+ * HEVC video decoder
22387+ *
22388+ * Originally by:
22389+ * Copyright (C) 2012 - 2013 Guillaume Martres
22390+ * Copyright (C) 2013 Seppo Tomperi
22391+ * Copyright (C) 2013 Wassim Hamidouche
22392+ *
22393+ * Substantially rewritten:
22394+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
22395+ *
22396+ * This file is part of FFmpeg.
22397+ *
22398+ * FFmpeg is free software; you can redistribute it and/or
22399+ * modify it under the terms of the GNU Lesser General Public
22400+ * License as published by the Free Software Foundation; either
22401+ * version 2.1 of the License, or (at your option) any later version.
22402+ *
22403+ * FFmpeg is distributed in the hope that it will be useful,
22404+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22405+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22406+ * Lesser General Public License for more details.
22407+ *
22408+ * You should have received a copy of the GNU Lesser General Public
22409+ * License along with FFmpeg; if not, write to the Free Software
22410+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22411+ */
22412+
22413+//#define DISABLE_SAO
22414+//#define DISABLE_DEBLOCK
22415+//#define DISABLE_STRENGTHS
22416+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
22417+//#define DISABLE_DEBLOCK_NONREF
22418+
22419+#include "libavutil/common.h"
22420+#include "libavutil/internal.h"
22421+
22422+#include "rpi_hevcdec.h"
22423+
22424+#include "bit_depth_template.c"
22425+
22426+#include "rpi_qpu.h"
22427+#include "rpi_zc.h"
22428+#include "libavutil/rpi_sand_fns.h"
22429+
22430+#define LUMA 0
22431+#define CB 1
22432+#define CR 2
22433+
22434+// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2
22435+// so -12,75 overall
22436+static const uint8_t tctablex[] = {
22437+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  // -ve quant padding
22438+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
22439+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
22440+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
22441+
22442+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,                          // -12..-1
22443+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 1, // QP  0...18
22444+    1, 1, 1, 1, 1, 1, 1,  1,  2,  2,  2,  2,  3,  3,  3,  3, 4, 4, 4, // QP 19...37
22445+    5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24,          // QP 38...53
22446+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24                    // 54..75
22447+};
22448+#define tctable (tctablex + 12 + 6*8)
22449+
22450+static const uint8_t betatablex[] = {
22451+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  // -ve quant padding
22452+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
22453+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
22454+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
22455+
22456+    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,                          // -12..-1
22457+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  7,  8, // QP 0...18
22458+     9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
22459+    38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64,                      // QP 38...51
22460+    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64                    // 52..73
22461+};
22462+#define betatable (betatablex + 12 + 6*8)
22463+
22464+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y,
22465+                            const int c_idx, const int tc_offset)
22466+{
22467+    return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2];
22468+}
22469+
22470+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
22471+                               const unsigned int xBase, const unsigned int yBase)
22472+{
22473+    const unsigned int ctb_size_mask        = (1 << s->ps.sps->log2_ctb_size) - 1;
22474+    const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size;
22475+    const unsigned int xQgBase              = xBase & MinCuQpDeltaSizeMask;
22476+    const unsigned int yQgBase              = yBase & MinCuQpDeltaSizeMask;
22477+    const unsigned int min_cb_width         = s->ps.sps->min_cb_width;
22478+    const unsigned int x_cb                 = xQgBase >> s->ps.sps->log2_min_cb_size;
22479+    const unsigned int y_cb                 = yQgBase >> s->ps.sps->log2_min_cb_size;
22480+    const int qPy_pred = lc->qPy_pred;
22481+
22482+    return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred :
22483+             s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) +
22484+            ((yQgBase & ctb_size_mask) == 0 ? qPy_pred :
22485+             s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1;
22486+}
22487+
22488+// * Only called from bitstream decode in foreground
22489+//   so should be safe
22490+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
22491+{
22492+    const int qp_y = get_qPy_pred(s, lc, xBase, yBase);
22493+
22494+    if (lc->tu.cu_qp_delta != 0) {
22495+        // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
22496+        int off = s->ps.sps->qp_bd_offset;
22497+        lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off,
22498+                                 52 + off) - off;
22499+    } else
22500+        lc->qp_y = qp_y;
22501+}
22502+
22503+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
22504+{
22505+    return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
22506+}
22507+
22508+// "DSP" these?
22509+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
22510+{
22511+    switch (pixel_shift)
22512+    {
22513+        case 2:
22514+            *(uint32_t *)dst = *(uint32_t *)src;
22515+            break;
22516+        case 1:
22517+            *(uint16_t *)dst = *(uint16_t *)src;
22518+            break;
22519+        default:
22520+            *dst = *src;
22521+            break;
22522+    }
22523+}
22524+
22525+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
22526+                           ptrdiff_t stride_src, int x, int y, int width, int height,
22527+                           int c_idx, int x_ctb, int y_ctb)
22528+{
22529+    const unsigned int sh = pixel_shift(s, c_idx);
22530+    const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);
22531+    const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);
22532+
22533+    /* copy horizontal edges */
22534+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
22535+        src, width << sh);
22536+    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
22537+        src + stride_src * (height - 1), width << sh);
22538+
22539+    /* copy vertical edges */
22540+    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
22541+
22542+    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
22543+}
22544+
22545+// N.B. Src & dst are swapped as this is a restore!
22546+// x0 & y0 are in luma coords
22547+// Width & height are in Y/C pels as appropriate
22548+// * Clear scope for optimisation here but not used enough to be worth it
22549+static void restore_tqb_pixels(const HEVCRpiContext * const s,
22550+                               uint8_t *src1, const uint8_t *dst1,
22551+                               const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
22552+                               const unsigned int x0, const unsigned int y0,
22553+                               const unsigned int width, const int height,
22554+                               const int c_idx)
22555+{
22556+    if (s->ps.pps->transquant_bypass_enable_flag ||
22557+        s->ps.sps->pcm.loop_filter_disable_flag)
22558+    {
22559+        const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width;
22560+        int blks_y = height >> (c_idx == 0 ? 3 : 2);
22561+        const unsigned int bwidth = 8 << s->ps.sps->pixel_shift;  // Y & C have the same width in sand
22562+        const unsigned int bheight = (c_idx == 0) ? 8 : 4;
22563+        const unsigned int sh = ((x0 >> 3) & 7);
22564+        const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1;
22565+
22566+        do {
22567+            unsigned int m = (*pcm >> sh) & mask;
22568+            uint8_t * bd = src1;
22569+            const uint8_t * bs = dst1;
22570+            while (m != 0) {
22571+                if ((m & 1) != 0) {
22572+                    s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);
22573+                }
22574+                m >>= 1;
22575+                bs += bwidth;
22576+                bd += bwidth;
22577+            }
22578+            src1 += stride_src * bheight;
22579+            dst1 += stride_dst * bheight;
22580+            pcm += s->ps.sps->pcm_width;
22581+        } while (--blks_y > 0);
22582+    }
22583+}
22584+
22585+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])
22586+
22587+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
22588+{
22589+#if SAO_FILTER_N == 5
22590+    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
22591+#elif SAO_FILTER_N == 6
22592+    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
22593+#else
22594+#error Confused by size of sao fn array
22595+#endif
22596+    int c_idx;
22597+    int edges[4];  // 0 left 1 top 2 right 3 bottom
22598+    int x_ctb                = x >> s->ps.sps->log2_ctb_size;
22599+    int y_ctb                = y >> s->ps.sps->log2_ctb_size;
22600+    int ctb_addr_rs          = y_ctb * s->ps.sps->ctb_width + x_ctb;
22601+    int ctb_addr_ts          = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
22602+    RpiSAOParams *sao           = &CTB(s->sao, x_ctb, y_ctb);
22603+    // flags indicating unfilterable edges
22604+    uint8_t vert_edge[]      = { 0, 0 };
22605+    uint8_t horiz_edge[]     = { 0, 0 };
22606+    uint8_t diag_edge[]      = { 0, 0, 0, 0 };
22607+    uint8_t lfase            = CTB(s->filter_slice_edges, x_ctb, y_ctb);
22608+    uint8_t no_tile_filter   = s->ps.pps->tiles_enabled_flag &&
22609+                               !s->ps.pps->loop_filter_across_tiles_enabled_flag;
22610+    uint8_t restore          = no_tile_filter || !lfase;
22611+    uint8_t left_tile_edge   = 0;
22612+    uint8_t right_tile_edge  = 0;
22613+    uint8_t up_tile_edge     = 0;
22614+    uint8_t bottom_tile_edge = 0;
22615+    const int sliced = 1;
22616+    const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
22617+
22618+    edges[0]   = x_ctb == 0;
22619+    edges[1]   = y_ctb == 0;
22620+    edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
22621+    edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
22622+
22623+#ifdef DISABLE_SAO
22624+    return;
22625+#endif
22626+
22627+    if (restore) {
22628+        if (!edges[0]) {
22629+            left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
22630+            vert_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
22631+        }
22632+        if (!edges[2]) {
22633+            right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
22634+            vert_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
22635+        }
22636+        if (!edges[1]) {
22637+            up_tile_edge     = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
22638+            horiz_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
22639+        }
22640+        if (!edges[3]) {
22641+            bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
22642+            horiz_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
22643+        }
22644+        if (!edges[0] && !edges[1]) {
22645+            diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
22646+        }
22647+        if (!edges[1] && !edges[2]) {
22648+            diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
22649+        }
22650+        if (!edges[2] && !edges[3]) {
22651+            diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
22652+        }
22653+        if (!edges[0] && !edges[3]) {
22654+            diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
22655+        }
22656+    }
22657+
22658+    for (c_idx = 0; c_idx < plane_count; c_idx++) {
22659+        const unsigned int vshift = ctx_vshift(s, c_idx);
22660+        const unsigned int hshift = ctx_hshift(s, c_idx);
22661+        const int x0 = x >> hshift;
22662+        const int y0 = y >> vshift;
22663+        const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
22664+        const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
22665+        const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
22666+        const int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> hshift) - x0);
22667+        const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
22668+        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
22669+        ptrdiff_t stride_dst;
22670+        uint8_t *dst;
22671+
22672+        const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
22673+        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
22674+        uint8_t * const src = !sliced ?
22675+                &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
22676+            c_idx == 0 ?
22677+                av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
22678+                av_rpi_sand_frame_pos_c(s->frame, x0, y0);
22679+        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
22680+            !sliced ? src - (1 << sh) :
22681+            c_idx == 0 ?
22682+                av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
22683+                av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
22684+        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
22685+            !sliced ? src + (width << sh) :
22686+            c_idx == 0 ?
22687+                av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
22688+                av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
22689+
22690+        if (sliced && c_idx > 1) {
22691+            break;
22692+        }
22693+
22694+//        if (c_idx == 1)
22695+//            printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
22696+
22697+        switch (sao->type_idx[c_idx]) {
22698+        case SAO_BAND:
22699+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
22700+                           x_ctb, y_ctb);
22701+            if (s->ps.pps->transquant_bypass_enable_flag ||
22702+                s->ps.sps->pcm.loop_filter_disable_flag)
22703+            {
22704+                // Can't use the edge buffer here as it may be in use by the foreground
22705+                DECLARE_ALIGNED(64, uint8_t, dstbuf)
22706+                    [2*MAX_PB_SIZE*MAX_PB_SIZE];
22707+                dst = dstbuf;
22708+                stride_dst = 2*MAX_PB_SIZE;
22709+                s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
22710+                if (sliced && c_idx != 0)
22711+                {
22712+                    s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
22713+                                                    sao->offset_val[1], sao->band_position[1],
22714+                                                    sao->offset_val[2], sao->band_position[2],
22715+                                                    width, height);
22716+                }
22717+                else
22718+                {
22719+                    s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
22720+                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
22721+                                                    width, height);
22722+                }
22723+                restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
22724+                                   x, y, width, height, c_idx);
22725+            } else {
22726+                if (sliced && c_idx != 0)
22727+                {
22728+                    s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
22729+                                                    sao->offset_val[1], sao->band_position[1],
22730+                                                    sao->offset_val[2], sao->band_position[2],
22731+                                                    width, height);
22732+                }
22733+                else
22734+                {
22735+                    s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
22736+                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
22737+                                                    width, height);
22738+                }
22739+            }
22740+            sao->type_idx[c_idx] = SAO_APPLIED;
22741+            break;
22742+        case SAO_EDGE:
22743+        {
22744+            const int w = s->ps.sps->width >> hshift;
22745+            const int h = s->ps.sps->height >> vshift;
22746+            int top_edge = edges[1];
22747+            int bottom_edge = edges[3];
22748+            // Can't use the edge buffer here as it may be in use by the foreground
22749+            DECLARE_ALIGNED(64, uint8_t, dstbuf)
22750+                [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
22751+
22752+            stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
22753+            dst = dstbuf + stride_dst + 32;
22754+
22755+            if (!top_edge) {
22756+                uint8_t *dst1;
22757+                int src_idx;
22758+                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
22759+
22760+                dst1 = dst - stride_dst;
22761+
22762+                if (src_l != NULL) {
22763+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
22764+                               SAO_APPLIED);
22765+                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
22766+                }
22767+
22768+                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
22769+                           SAO_APPLIED);
22770+                memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
22771+
22772+                if (src_r != NULL) {
22773+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
22774+                               SAO_APPLIED);
22775+                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
22776+                }
22777+            }
22778+            if (!bottom_edge) {
22779+                uint8_t * const dst1 = dst + height * stride_dst;
22780+                int src_idx;
22781+                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
22782+                const unsigned int hoff = height * stride_src;
22783+
22784+                if (src_l != NULL) {
22785+                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
22786+                               SAO_APPLIED);
22787+                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
22788+                }
22789+
22790+                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
22791+                           SAO_APPLIED);
22792+                memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
22793+
22794+                if (src_r != NULL) {
22795+                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
22796+                               SAO_APPLIED);
22797+                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
22798+                }
22799+            }
22800+            if (src_l != NULL) {
22801+                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
22802+                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
22803+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
22804+                              sh, height, stride_dst, 1 << sh);
22805+                } else {
22806+                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
22807+                              src_l,
22808+                              sh, height, stride_dst, stride_src);
22809+                }
22810+            }
22811+            if (src_r != NULL) {
22812+                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
22813+                    ff_hevc_rpi_copy_vert(dst + (width << sh),
22814+                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
22815+                              sh, height, stride_dst, 1 << sh);
22816+                } else {
22817+                    ff_hevc_rpi_copy_vert(dst + (width << sh),
22818+                              src_r,
22819+                              sh, height, stride_dst, stride_src);
22820+                }
22821+            }
22822+
22823+            s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
22824+
22825+            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
22826+                           x_ctb, y_ctb);
22827+            if (sliced && c_idx != 0)
22828+            {
22829+                // Class always the same for both U & V (which is just as well :-))
22830+                s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
22831+                                                sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
22832+                                                width, height);
22833+                s->hevcdsp.sao_edge_restore_c[restore](src, dst,
22834+                                                    stride_src, stride_dst,
22835+                                                    sao,
22836+                                                    edges, width,
22837+                                                    height, c_idx,
22838+                                                    vert_edge,
22839+                                                    horiz_edge,
22840+                                                    diag_edge);
22841+            }
22842+            else
22843+            {
22844+                s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
22845+                                                sao->eo_class[c_idx], width, height);
22846+                s->hevcdsp.sao_edge_restore[restore](src, dst,
22847+                                                    stride_src, stride_dst,
22848+                                                    sao,
22849+                                                    edges, width,
22850+                                                    height, c_idx,
22851+                                                    vert_edge,
22852+                                                    horiz_edge,
22853+                                                    diag_edge);
22854+            }
22855+            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
22856+                               x, y, width, height, c_idx);
22857+            sao->type_idx[c_idx] = SAO_APPLIED;
22858+            break;
22859+        }
22860+        }
22861+    }
22862+
22863+#if RPI_ZC_SAND_8_IN_10_BUF
22864+    if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
22865+        (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
22866+    {
22867+        const unsigned int stride1 = frame_stride1(s->frame, 1);
22868+        const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
22869+        const unsigned int xoff = (x >> 8) * stride2 * stride1;
22870+        const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
22871+        const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
22872+        uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
22873+        const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
22874+        uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
22875+        const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
22876+        const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
22877+
22878+//        printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
22879+        av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
22880+        av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
22881+    }
22882+#endif
22883+}
22884+
22885+// When bits are delivered to deblock we want them
22886+//#define TL 1
22887+//#define TR 2
22888+//#define BL 4
22889+//#define BR 8
22890+
22891+// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
22892+// so we need to rearrange before passing on
22893+
22894+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
22895+{
22896+    const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
22897+    return (pcm[0] |
22898+        (pcm[1] << 8) |
22899+        (pcm[s->ps.sps->pcm_width] << 16) |
22900+        (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7);
22901+}
22902+
22903+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
22904+{
22905+    const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
22906+    return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
22907+}
22908+
22909+// We cast away const here as we want this to work for both get and set
22910+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
22911+{
22912+    return (uint32_t *)(bs +
22913+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
22914+#warning Unexpected masks
22915+        // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
22916+        ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
22917+            (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
22918+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
22919+#error Stride1 < return size
22920+#endif
22921+        ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
22922+        (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
22923+}
22924+
22925+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
22926+{
22927+    return (uint8_t *)(bs +
22928+        ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
22929+            (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
22930+        ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
22931+        (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
22932+}
22933+
22934+
22935+// Get block strength
22936+// Given how we call we will always get within the 32-bit boundaries
22937+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
22938+                                unsigned int xl, unsigned int xr, const unsigned int y)
22939+{
22940+    if (xr <= xl) {
22941+        return 0;
22942+    }
22943+    else
22944+    {
22945+#if HAVE_ARMV6T2_INLINE
22946+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
22947+#error This case not yet handled in bs_get32
22948+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
22949+#error Stride1 < return size
22950+#endif
22951+        uint32_t tmp;
22952+        __asm__ (
22953+            "lsr         %[tmp], %[xl], %[xl_shift]                  \n\t"
22954+            "rsb         %[xr], %[xl], %[xr]                         \n\t"
22955+            "mla         %[stride2], %[stride2], %[tmp], %[bs]       \n\t"
22956+            "add         %[xr], %[xr], #7                            \n\t"
22957+            "lsr         %[bs], %[y], %[y_shift1]                    \n\t"
22958+            "bic         %[xr], %[xr], #7                            \n\t"
22959+            "ubfx        %[xl], %[xl], #1, #5                        \n\t"
22960+            "lsr         %[xr], %[xr], #1                            \n\t"
22961+            "cmp         %[xr], #32                                  \n\t"
22962+            "mvn         %[tmp], #0                                  \n\t"
22963+            "ldr         %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
22964+            "lsl         %[tmp], %[tmp], %[xr]                       \n\t"
22965+            "lsr         %[xl], %[bs], %[xl]                         \n\t"
22966+            "it ne                                                   \n\t"
22967+            "bicne       %[bs], %[xl], %[tmp]                        \n\t"
22968+            :  // Outputs
22969+                      [bs]"+r"(bs),
22970+                 [stride2]"+r"(stride2),
22971+                      [xl]"+r"(xl),
22972+                      [xr]"+r"(xr),
22973+                     [tmp]"=&r"(tmp)
22974+            :  // Inputs
22975+                       [y]"r"(y),
22976+                [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
22977+                [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
22978+                [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
22979+            :  // Clobbers
22980+                "cc"
22981+        );
22982+        return (uint32_t) bs;
22983+#else
22984+        const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
22985+        const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
22986+
22987+        return n == 32 ? a :
22988+            (a >> ((xl >> 1) & 31)) & ~(~0U << n);
22989+#endif
22990+    }
22991+}
22992+
22993+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
22994+{
22995+    av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
22996+    return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
22997+}
22998+
22999+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
23000+{
23001+    av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
23002+    return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
23003+}
23004+
23005+
23006+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
23007+{
23008+    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
23009+    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
23010+    const unsigned int ctb_size = (1 << log2_ctb_size);
23011+    const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 :  1);
23012+    const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
23013+    const DBParams * cb_dbp = s->deblock + ctb_n;
23014+    const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
23015+
23016+    unsigned int cb_x;
23017+
23018+    // Do in CTB-shaped blocks
23019+    for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
23020+    {
23021+        const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
23022+        const unsigned int bv_l = FFMAX(cb_x, 8);
23023+        const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
23024+        const unsigned int bh_l = bv_l - 8;
23025+        unsigned int y;
23026+
23027+        // Main body
23028+        for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
23029+        {
23030+            uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
23031+
23032+            const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
23033+            const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
23034+            const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
23035+
23036+            if (vbs != 0)
23037+            {
23038+                const uint8_t * const tcv = tctable + dbp->tc_offset;
23039+                const uint8_t * const betav = betatable + dbp->beta_offset;
23040+                unsigned int pcmfa = pcm2(s, bv_l - 1, y);
23041+                unsigned int x;
23042+
23043+                for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
23044+                {
23045+                    if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
23046+                    {
23047+                        const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
23048+                        s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
23049+                                                         frame_stride1(s->frame, LUMA),
23050+                                                         betav[qp],
23051+                                                         ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
23052+                                                          (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
23053+                                                         pcmfa & 3,
23054+                                                         av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
23055+                    }
23056+                }
23057+            }
23058+
23059+            if (y != 0)
23060+            {
23061+                uint32_t hbs;
23062+
23063+                // H left - mostly separated out so we only need a uint32_t hbs
23064+                if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
23065+                {
23066+                    const unsigned int x = bh_l;
23067+                    const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
23068+                    const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
23069+                    const DBParams * const dbph = dbp - 1;
23070+                    const uint8_t * const tc = tctable + dbph->tc_offset + qp;
23071+
23072+                    av_assert2(cb_x - bh_l == 8);
23073+
23074+                    s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
23075+                                                         frame_stride1(s->frame, LUMA),
23076+                                                         betatable[qp + dbph->beta_offset],
23077+                                                         ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
23078+                                                            (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
23079+                                                         (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
23080+                }
23081+
23082+                // H
23083+                if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0)  // Will give (x <= bh_r) in for loop
23084+                {
23085+                    unsigned int x;
23086+                    unsigned int pcmfa = pcm4(s, cb_x, y - 1);
23087+
23088+                    for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
23089+                    {
23090+                        if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
23091+                        {
23092+                            const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
23093+                            const uint8_t * const tc = tctable + dbp->tc_offset + qp;
23094+                            s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
23095+                                                                frame_stride1(s->frame, LUMA),
23096+                                                                betatable[qp + dbp->beta_offset],
23097+                                                                ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
23098+                                                                   (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
23099+                                                                (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
23100+                        }
23101+                    }
23102+                }
23103+            }
23104+
23105+        }
23106+    }
23107+}
23108+
23109+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
23110+{
23111+    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
23112+    const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
23113+    return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1;
23114+}
23115+
23116+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
23117+{
23118+    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
23119+    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
23120+    const unsigned int ctb_size = (1 << log2_ctb_size);
23121+    const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 :  8);
23122+    const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
23123+    const DBParams * dbp = s->deblock + ctb_n;
23124+    const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);
23125+    const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1];
23126+    const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2];
23127+
23128+    unsigned int cb_x;
23129+
23130+    av_assert1((bounds.x & (ctb_size - 1)) == 0);
23131+    av_assert1((bounds.y & (ctb_size - 1)) == 0);
23132+    av_assert1(bounds.h <= ctb_size);
23133+
23134+    // Do in CTB-shaped blocks
23135+    for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
23136+        const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
23137+        const unsigned int bv_l = FFMAX(cb_x, 16);
23138+        unsigned int y;
23139+
23140+        // V above
23141+        if (bounds.y != 0) {
23142+            // Deblock V up 8
23143+            // CTB above current
23144+            // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
23145+            const unsigned int y = bounds.y - 8;
23146+            uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
23147+
23148+            if (vbs != 0)
23149+            {
23150+                unsigned int pcmfa = pcm2(s, bv_l - 1, y);
23151+                const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;
23152+                unsigned int x;
23153+
23154+                for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
23155+                {
23156+                    if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
23157+                    {
23158+                        const int qp0 = q2h(s, x, y);
23159+                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
23160+                                                       frame_stride1(s->frame, 1),
23161+                                                       tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
23162+                                                       av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
23163+                                                       pcmfa & 3);
23164+                    }
23165+                }
23166+            }
23167+        }
23168+
23169+        for (y = bounds.y; y < b_b; y += 16)
23170+        {
23171+            uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
23172+                (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);
23173+
23174+            // V
23175+            if (vbs != 0)
23176+            {
23177+                unsigned int x;
23178+                unsigned int pcmfa =
23179+                    (y + 16 > b_b ?
23180+                        pcm2(s, bv_l - 1, y) | 0xffff0000 :
23181+                        pcm4(s, bv_l - 1, y));
23182+                const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
23183+
23184+                for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
23185+                {
23186+                    if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
23187+                    {
23188+                        const int qp0 = q2h(s, x, y);
23189+                        const int qp1 = q2h(s, x, y + 8);
23190+                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
23191+                            frame_stride1(s->frame, 1),
23192+                            ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
23193+                                ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
23194+                            av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
23195+                            (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
23196+                    }
23197+                }
23198+            }
23199+
23200+            // H
23201+            if (y != 0)
23202+            {
23203+                uint32_t hbs;
23204+                const unsigned int bh_l = bv_l - 16;
23205+                const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
23206+                const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
23207+                const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
23208+
23209+                // H left - mostly separated out so we only need a uint32_t hbs
23210+                // Stub is width 8 to the left of bounds, but width 16 internally
23211+                if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
23212+                {
23213+                    unsigned int pcmfa = pcm4(s, bh_l, y - 1);
23214+
23215+                    // Chop off bits we don't want...
23216+                    if (bh_l < bounds.x) {
23217+                        pcmfa |= 0x10001; // TL|BL pre rearrangement
23218+                        hbs &= ~3;  // Make BS 0
23219+                    }
23220+
23221+                    // Double check we still want this
23222+                    if (hbs != 0 && (~pcmfa & 0x30003) != 0)
23223+                    {
23224+                        const unsigned int x = bh_l;
23225+                        const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
23226+                        const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
23227+                        const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
23228+
23229+                        s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
23230+                            frame_stride1(s->frame, 1),
23231+                            ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
23232+                                ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
23233+                            (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
23234+                    }
23235+                }
23236+
23237+                // H main
23238+                if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
23239+                {
23240+                    unsigned int x;
23241+                    unsigned int pcmfa = pcm4(s, cb_x, y - 1);  // Might like to mask out far right writes but probably not worth it
23242+
23243+                    for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
23244+                    {
23245+                        if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
23246+                        {
23247+                            const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
23248+                            const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
23249+                            const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
23250+
23251+                            s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
23252+                                frame_stride1(s->frame, 1),
23253+                                ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
23254+                                    ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
23255+                                (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
23256+                        }
23257+                    }
23258+                }
23259+            }
23260+        }
23261+    }
23262+}
23263+
23264+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
23265+{
23266+    return x & ~(~0U << log2_n);
23267+}
23268+
23269+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
23270+{
23271+    av_assert2((y & 7) == 0);
23272+
23273+    // This doesn't have the same simultaneous update issues that bsf_stash
23274+    // does (other threads will have a different y) so we can do it the easy way
23275+    if ((bsf &= mask) != 0)
23276+        *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
23277+}
23278+
23279+
23280+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
23281+{
23282+    // We arrange this in a slightly odd fashion but it lines up with
23283+    // how we are going to use it in the actual deblock code & it is easier
23284+    // to do the contortions here than there
23285+    //
23286+    // Arrange (LE) {x0y0, x0y4, x8y0, x8y4}, {x16y0, x16y4, x24y0, x24y4},...
23287+
23288+    av_assert2((x & 7) == 0);
23289+
23290+    if ((bsf &= mask) != 0)
23291+    {
23292+        uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
23293+        const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
23294+
23295+        if (mask <= 0xf)
23296+        {
23297+            *p |= (bsf << sh);
23298+        }
23299+        else
23300+        {
23301+            do {
23302+                *p |= (bsf & 0xf) << sh;
23303+                p += HEVC_RPI_BS_STRIDE1_BYTES;
23304+            } while ((bsf >>= 4) != 0);
23305+        }
23306+    }
23307+}
23308+
23309+static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
23310+                              const unsigned int rep, const unsigned int dup,
23311+                              const unsigned int mvf_stride0,
23312+                              const unsigned int mvf_stride1,
23313+                              const RefPicList * const rpl_p, const RefPicList * const rpl_q,
23314+                              const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
23315+{
23316+    return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
23317+            mvf_p, mvf_q,
23318+            rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
23319+            sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
23320+}
23321+
23322+
23323+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
23324+                                               const HEVCRpiLocalContext * const lc,
23325+                                               const unsigned int x0, const unsigned int y0,
23326+                                               const unsigned int log2_trafo_size,
23327+                                               const int is_coded_block)
23328+{
23329+    const HEVCRpiMvField * const mvf_curr      = mvf_stash_ptr(s, lc, x0, y0);
23330+    const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
23331+    const RefPicList * const rpl        = s->refPicList;
23332+    // Rep count for bsf_mv when running with min_pu chunks
23333+    const unsigned int log2_rep_min_pu  = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
23334+    const unsigned int boundary_flags   = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
23335+    const unsigned int trafo_size       = (1U << log2_trafo_size);
23336+    const uint32_t bsf_mask             = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
23337+    const uint32_t bsf_cbf              = (bsf_mask & 0x55555555);
23338+
23339+    // Do we cover a pred split line?
23340+    const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
23341+    const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
23342+
23343+    uint32_t bsf_h;
23344+    uint32_t bsf_v;
23345+
23346+#ifdef DISABLE_STRENGTHS
23347+    return;
23348+#endif
23349+
23350+    // We are always on a size boundary
23351+    av_assert2((x0 & (trafo_size - 1)) == 0);
23352+    av_assert2((y0 & (trafo_size - 1)) == 0);
23353+    // log2_trafo_size not really a transform size; we may have to deal
23354+    // with size 2^6 blocks
23355+    av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
23356+
23357+    // Retrieve and update coded (b0), intra (b1) bs flags
23358+    //
23359+    // Store on min width (rather than uint32_t) to avoid possible issues
23360+    // with another thread on another core running wpp using the same
23361+    // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
23362+    //
23363+    // In bsf BS=2 is represented by 3 as it is much easier to test & set
23364+    // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
23365+    // 3 will work the same
23366+    {
23367+        // Given where we are called from is_cbf_luma & is_intra will be constant over the block
23368+        const uint32_t bsf0 =  (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0;
23369+        uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
23370+        uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
23371+
23372+        switch (log2_trafo_size)
23373+        {
23374+            case 2:
23375+            case 3:
23376+            {
23377+                const unsigned int sh_h = (x0 >> 1) & 7;
23378+                const unsigned int sh_v = (y0 >> 1) & 7;
23379+                bsf_h = *p;
23380+                bsf_v = *q;
23381+                *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
23382+                *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
23383+                bsf_h >>= sh_h;
23384+                bsf_v >>= sh_v;
23385+                break;
23386+            }
23387+            case 4:
23388+                bsf_h = *p;
23389+                bsf_v = *q;
23390+                *p = bsf0;
23391+                *q = bsf0;
23392+                break;
23393+            case 5:
23394+                bsf_h = *(uint16_t *)p;
23395+                bsf_v = *(uint16_t *)q;
23396+                *(uint16_t *)p = bsf0;
23397+                *(uint16_t *)q = bsf0;
23398+                break;
23399+            case 6:
23400+            default:
23401+                bsf_h = *(uint32_t *)p;
23402+                bsf_v = *(uint32_t *)q;
23403+                *(uint32_t *)p = bsf0;
23404+                *(uint32_t *)q = bsf0;
23405+                break;
23406+        }
23407+
23408+        bsf_h |= bsf0;
23409+        bsf_v |= bsf0;
23410+    }
23411+
23412+    // Do Horizontal
23413+    if ((y0 & 7) == 0)
23414+    {
23415+        // Boundary upper
23416+        if (y0 != 0 &&
23417+            (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
23418+             (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
23419+        {
23420+            // Look at MVs (BS=1) if we don't already have a full set of bs bits
23421+            if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
23422+            {
23423+                // If we aren't on the top boundary we must be in the middle
23424+                // and in that case we know where mvf can change
23425+                const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
23426+                const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
23427+                      s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
23428+                      rpl;
23429+
23430+                bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
23431+                    trafo_size >> (log2_min_pu_size + log2_rep),
23432+                    trafo_size >> (log2_min_pu_size + log2_rep),
23433+                    rpl, rpl_top,
23434+                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
23435+            }
23436+
23437+            // Finally put the results into bs
23438+            hbs_set(s, x0, y0, bsf_mask, bsf_h);
23439+        }
23440+
23441+        // Max of 1 pu internal split - ignore if not on 8pel boundary
23442+        if (has_y_split && !off_boundary(lc->cu.y_split, 3))
23443+        {
23444+            const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
23445+            // If we have the x split as well then it must be in the middle
23446+            const unsigned int log2_rep = has_x_split ? 1 : 0;
23447+
23448+            hbs_set(s, x0, lc->cu.y_split, bsf_mask,
23449+                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
23450+                   trafo_size >> (log2_min_pu_size + log2_rep),
23451+                   trafo_size >> (log2_min_pu_size + log2_rep),
23452+                   rpl, rpl,
23453+                   mvf, mvf - MVF_STASH_WIDTH_PU));
23454+        }
23455+    }
23456+
23457+    // And again for vertical - same logic as horizontal just in the other direction
23458+    if ((x0 & 7) == 0)
23459+    {
23460+        // Boundary left
23461+        if (x0 != 0 &&
23462+            (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
23463+             (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
23464+        {
23465+            if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
23466+            {
23467+                const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
23468+                const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
23469+                    s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
23470+                    rpl;
23471+
23472+                bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
23473+                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
23474+                    (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
23475+                    rpl, rpl_left,
23476+                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
23477+            }
23478+
23479+            vbs_set(s, x0, y0, bsf_mask, bsf_v);
23480+        }
23481+
23482+        if (has_x_split && !off_boundary(lc->cu.x_split, 3))
23483+        {
23484+            const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
23485+            const unsigned int log2_rep = has_y_split ? 1 : 0;
23486+
23487+            vbs_set(s, lc->cu.x_split, y0, bsf_mask,
23488+                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
23489+                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
23490+                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
23491+                   rpl, rpl,
23492+                   mvf, mvf - 1));
23493+        }
23494+    }
23495+}
23496+
23497+#undef LUMA
23498+#undef CB
23499+#undef CR
23500+
23501+static inline unsigned int ussub(const unsigned int a, const unsigned int b)
23502+{
23503+    return a < b ? 0 : a - b;
23504+}
23505+
23506+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
23507+{
23508+    return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0;
23509+}
23510+
23511+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
23512+{
23513+    const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
23514+    int x, y;
23515+
23516+    const unsigned int br = bounds.x + bounds.w;
23517+    const unsigned int bb = bounds.y + bounds.h;
23518+
23519+    const int x_end = (br >= s->ps.sps->width);
23520+    const int y_end = (bb >= s->ps.sps->height);
23521+
23522+    // Deblock may not touch the edges of the bound as they are still needed
23523+    // for Intra pred
23524+    //
23525+    // Deblock is disabled with a per-slice flag
23526+    // Given that bounds may cover multiple slices & we dblock outside bounds
23527+    // anyway we can't avoid deblock using that flag - about the only thing we
23528+    // could do is have a "no deblock seen yet" flag but it doesn't really
23529+    // seem worth the effort
23530+
23531+    deblock_y_blk(s, bounds, x_end, y_end);
23532+    deblock_uv_blk(s, bounds, x_end, y_end);
23533+
23534+    // SAO needs
23535+    // (a) CTB alignment
23536+    // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
23537+    {
23538+        const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1));
23539+        const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
23540+        const unsigned int yt = ussub(bounds.y, yo);
23541+        const unsigned int yb = y_end ? bb : ussub(bb, yo);
23542+        const unsigned int xl = ussub(bounds.x, xo);
23543+        const unsigned int xr = x_end ? br : ussub(br, xo);
23544+
23545+        if (s->ps.sps->sao_enabled)
23546+        {
23547+            for (y = yt; y < yb; y += ctb_size) {
23548+                for (x = xl; x < xr; x += ctb_size) {
23549+                    sao_filter_CTB(s, x, y);
23550+                }
23551+            }
23552+        }
23553+
23554+        // Cache invalidate
23555+        y = 0;
23556+        if (xr != 0 && yb != 0)
23557+        {
23558+            const unsigned int llen =
23559+                (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
23560+            const unsigned int mask = ~(llen - 1);
23561+            const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
23562+            const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
23563+            const unsigned int it = ussub(yt, 1);
23564+            const unsigned int ib = y_end ? bb : yb - 1;
23565+
23566+            if (il < ir) {
23567+                rpi_cache_buf_t cbuf;
23568+                rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
23569+                rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
23570+                  il, it, ir - il, ib - it,
23571+                  ctx_vshift(s, 1), 1, 1);
23572+
23573+                // If we have to commit the right hand tile boundary due to
23574+                // cache boundary considerations then at EoTile we must commit
23575+                // that boundary to bottom of tile (bounds)
23576+                if (ib != bb && ir == br && eot) {
23577+                    rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
23578+                      br - 1, ib, 1, bb - ib,
23579+                      ctx_vshift(s, 1), 1, 1);
23580+                }
23581+
23582+                rpi_cache_flush_finish(rfe);
23583+
23584+                if (x_end)
23585+                    y = y_end ? INT_MAX : ib;
23586+
23587+//                printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
23588+            }
23589+        }
23590+    }
23591+
23592+    return y;
23593+}
23594+
23595--- /dev/null
23596+++ b/libavcodec/rpi_hevc_mv.h
23597@@ -0,0 +1,71 @@
23598+#ifndef AVCODEC_RPI_HEVC_MV_H
23599+#define AVCODEC_RPI_HEVC_MV_H
23600+
23601+#include "config.h"
23602+
23603+typedef int32_t MvXY;
23604+
23605+typedef struct HEVCRpiMvField {
23606+    MvXY xy[2];
23607+    int8_t ref_idx[2];
23608+    int8_t pred_flag;
23609+    int8_t dummy; // To 12 bytes
23610+} HEVCRpiMvField;
23611+
23612+
23613+#define MV_X(xy) (((xy) << 16) >> 16)
23614+#define MV_Y(xy) ((xy) >> 16)
23615+#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16))
23616+
23617+#if ARCH_ARM
23618+#include "arm/rpi_hevc_mv_arm.h"
23619+#endif
23620+
23621+#ifndef mvxy_add
23622+static inline MvXY mvxy_add(const MvXY a, const MvXY b)
23623+{
23624+    return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b));
23625+}
23626+#endif
23627+
23628+
23629+#ifndef mv_scale_xy
23630+static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb)
23631+{
23632+    int tx, scale_factor;
23633+
23634+    td = td == 0 ? 1 : av_clip_int8(td);
23635+    tb = av_clip_int8(tb);
23636+    tx = (0x4000 + (abs(td) >> 1)) / td;
23637+    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
23638+    return MV_XY(
23639+        av_clip_int16((scale_factor * MV_X(src) + 127 +
23640+                           (scale_factor * MV_X(src) < 0)) >> 8),
23641+        av_clip_int16((scale_factor * MV_Y(src) + 127 +
23642+                           (scale_factor * MV_Y(src) < 0)) >> 8));
23643+}
23644+#endif
23645+
23646+// 8.3.1 states that the bitstream may not contain poc diffs that do not
23647+// fit in 16 bits, so given that we don't care about the high bits we only
23648+// store the low 16 + LT & Inter flags
23649+
23650+#define COL_POC_INTRA   0
23651+#define COL_POC_INTER   (1 << 16)
23652+#define COL_POC_LT      (1 << 17)
23653+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
23654+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
23655+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
23656+
23657+typedef struct ColMv_s {
23658+    int32_t poc;
23659+    int32_t xy;
23660+} ColMv;
23661+
23662+typedef struct ColMvField_s {
23663+    ColMv L[2];
23664+} ColMvField;
23665+
23666+
23667+
23668+#endif // AVCODEC_RPI_HEVC_MV_H
23669--- /dev/null
23670+++ b/libavcodec/rpi_hevc_mvs.c
23671@@ -0,0 +1,487 @@
23672+/*
23673+ * HEVC video decoder
23674+ *
23675+ * Copyright (C) 2012 - 2013 Guillaume Martres
23676+ * Copyright (C) 2013 Anand Meher Kotra
23677+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
23678+ *
23679+ * This file is part of FFmpeg.
23680+ *
23681+ * FFmpeg is free software; you can redistribute it and/or
23682+ * modify it under the terms of the GNU Lesser General Public
23683+ * License as published by the Free Software Foundation; either
23684+ * version 2.1 of the License, or (at your option) any later version.
23685+ *
23686+ * FFmpeg is distributed in the hope that it will be useful,
23687+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23688+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23689+ * Lesser General Public License for more details.
23690+ *
23691+ * You should have received a copy of the GNU Lesser General Public
23692+ * License along with FFmpeg; if not, write to the Free Software
23693+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23694+ */
23695+
23696+#include "hevc.h"
23697+#include "rpi_hevcdec.h"
23698+
23699+static av_always_inline int
23700+is_eq_mer(const unsigned int plevel,
23701+    const unsigned int xN, const unsigned int yN,
23702+    const unsigned int xP, const unsigned int yP)
23703+{
23704+    return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0;
23705+}
23706+
23707+// check if the mv's and refidx are the same between A and B
23708+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
23709+{
23710+    return a->pred_flag == b->pred_flag &&
23711+        ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
23712+        ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
23713+    return 0;
23714+}
23715+
23716+/*
23717+ * 8.5.3.1.7  temporal luma motion vector prediction
23718+ */
23719+static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
23720+                                       const HEVCRpiLocalContext * const lc, const int x0, const int y0,
23721+                                       const int nPbW, const int nPbH, const int refIdxLx,
23722+                                       MvXY * const mvLXCol, const int X)
23723+{
23724+    int x, y;
23725+    const ColMv * cmv = NULL;
23726+
23727+    HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
23728+    const RefPicList * const refPicList = s->refPicList + X;
23729+    const int cur_lt = refPicList->isLongTerm[refIdxLx];
23730+
23731+    *mvLXCol = 0;
23732+    // Unlikely but we might have a col_ref IDR frame!
23733+    if (col_ref->col_mvf == NULL)
23734+        return 0;
23735+
23736+    ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
23737+
23738+    //bottom right collocated motion vector
23739+    x = x0 + nPbW;
23740+    y = y0 + nPbH;
23741+
23742+    if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
23743+        y < s->ps.sps->height &&
23744+        x < s->ps.sps->width)
23745+    {
23746+        const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
23747+            (y >> 4) * s->col_mvf_stride;
23748+
23749+        if (col->L[0].poc != COL_POC_INTRA &&
23750+            (col->L[1].poc == COL_POC_INTRA ||
23751+             (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
23752+        {
23753+            cmv = col->L + 0;
23754+        }
23755+        else if (col->L[1].poc != COL_POC_INTRA)
23756+        {
23757+            cmv = col->L + 1;
23758+        }
23759+    }
23760+
23761+    // derive center collocated motion vector
23762+    if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
23763+    {
23764+        cmv = NULL;
23765+        x                  = x0 + (nPbW >> 1);
23766+        y                  = y0 + (nPbH >> 1);
23767+
23768+        {
23769+            const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
23770+              (y >> 4) * s->col_mvf_stride;
23771+
23772+            if (col->L[0].poc != COL_POC_INTRA &&
23773+              (col->L[1].poc == COL_POC_INTRA ||
23774+               (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
23775+            {
23776+              cmv = col->L + 0;
23777+            }
23778+            else if (col->L[1].poc != COL_POC_INTRA)
23779+            {
23780+              cmv = col->L + 1;
23781+            }
23782+        }
23783+    }
23784+
23785+    if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
23786+        return 0;
23787+
23788+    {
23789+        const int col_poc  = col_ref->poc;
23790+        const int ref_poc  = refPicList->list[refIdxLx];
23791+
23792+        *mvLXCol = (cur_lt ||
23793+                        cmv->poc == col_poc ||
23794+                        COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
23795+                    cmv->xy :
23796+                    mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
23797+    }
23798+
23799+    return cmv != NULL;
23800+}
23801+
23802+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
23803+{
23804+    return b != NULL && compare_mv_ref_idx(a, b);
23805+}
23806+
23807+
23808+
23809+/*
23810+ * 8.5.3.1.2  Derivation process for spatial merging candidates
23811+ */
23812+static inline const HEVCRpiMvField *
23813+derive_spatial_merge_candidates(
23814+    const HEVCRpiContext * const s,
23815+    const HEVCRpiLocalContext * const lc,
23816+    const unsigned int x0, const unsigned int y0,
23817+    const unsigned int nPbW, const unsigned int nPbH,
23818+    const unsigned int avail,
23819+    const unsigned int part_idx,
23820+    const unsigned int merge_idx,
23821+    HEVCRpiMvField * const mvf_t)
23822+{
23823+    const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N);
23824+    const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD);
23825+
23826+    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
23827+    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
23828+    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
23829+    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
23830+    const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
23831+    const unsigned int part_mode = lc->cu.part_mode;
23832+
23833+    const HEVCRpiMvField * perm[4];
23834+    unsigned int nb_merge_cand = 0;
23835+
23836+    // singleMCLFlag => part_idx == 0 so no need to test for it
23837+    if ((avail & AVAIL_L) == 0 ||
23838+        (part_idx == 1 &&
23839+            ((parts_a1 >> part_mode) & 1) != 0 ||
23840+                is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) ||
23841+        mvf_a1->pred_flag == PF_INTRA)
23842+    {
23843+        mvf_a1 = NULL;
23844+    }
23845+    else
23846+    {
23847+        if (merge_idx == nb_merge_cand)
23848+            return mvf_a1;
23849+        perm[nb_merge_cand++] = mvf_a1;
23850+    }
23851+
23852+    if ((avail & AVAIL_U) == 0 ||
23853+            (part_idx == 1 &&
23854+               ((parts_b1 >> part_mode) & 1) != 0 ||
23855+                   is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) ||
23856+            mvf_b1->pred_flag == PF_INTRA)
23857+    {
23858+        mvf_b1 = NULL;
23859+    }
23860+    else if (!mvf_eq(mvf_b1, mvf_a1))
23861+    {
23862+        if (merge_idx == nb_merge_cand)
23863+            return mvf_b1;
23864+        perm[nb_merge_cand++] = mvf_b1;
23865+    }
23866+
23867+    // above right spatial merge candidate
23868+    // Never need mvf_b0 again so don't bother zeroing if navail
23869+    if ((avail & AVAIL_UR) != 0 &&
23870+        !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
23871+        mvf_b0->pred_flag != PF_INTRA &&
23872+        !mvf_eq(mvf_b0, mvf_b1))
23873+    {
23874+        if (merge_idx == nb_merge_cand)
23875+            return mvf_b0;
23876+        perm[nb_merge_cand++] = mvf_b0;
23877+    }
23878+
23879+    // left bottom spatial merge candidate
23880+    // Never need mvf_a0 again so don't bother zeroing if navail
23881+    if ((avail & AVAIL_DL) != 0 &&
23882+        !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
23883+        mvf_a0->pred_flag != PF_INTRA &&
23884+        !mvf_eq(mvf_a0, mvf_a1))
23885+    {
23886+        if (merge_idx == nb_merge_cand)
23887+            return mvf_a0;
23888+        perm[nb_merge_cand++] = mvf_a0;
23889+    }
23890+
23891+    // above left spatial merge candidate
23892+    if (nb_merge_cand != 4 &&
23893+        (avail & AVAIL_UL) != 0 &&
23894+        !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
23895+    {
23896+        const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
23897+
23898+        if (mvf_b2->pred_flag != PF_INTRA &&
23899+            !mvf_eq(mvf_b2, mvf_a1) &&
23900+            !mvf_eq(mvf_b2, mvf_b1))
23901+        {
23902+            if (merge_idx == nb_merge_cand)
23903+                return mvf_b2;
23904+            perm[nb_merge_cand++] = mvf_b2;
23905+        }
23906+    }
23907+
23908+    // temporal motion vector candidate
23909+    if (s->sh.slice_temporal_mvp_enabled_flag)
23910+    {
23911+        static const HEVCRpiMvField mvf_z = {{0}};
23912+
23913+        *mvf_t = mvf_z;
23914+
23915+        if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
23916+                                        0, mvf_t->xy + 0, 0))
23917+            mvf_t->pred_flag = PF_L0;
23918+
23919+        if (s->sh.slice_type == HEVC_SLICE_B &&
23920+                temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
23921+                                            0, mvf_t->xy + 1, 1))
23922+            mvf_t->pred_flag |= PF_L1;
23923+
23924+        if (mvf_t->pred_flag != 0)
23925+        {
23926+            if (merge_idx == nb_merge_cand)
23927+                return mvf_t;
23928+            perm[nb_merge_cand++] = mvf_t;
23929+        }
23930+    }
23931+
23932+    // combined bi-predictive merge candidates  (applies for B slices)
23933+    if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
23934+    {
23935+        unsigned int comb_idx = 0;
23936+        const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
23937+        const RefPicList * const refPicList = s->refPicList;
23938+
23939+        for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
23940+        {
23941+            static const uint8_t l0_l1_cand_idx[12][2] = {
23942+                { 0, 1, },
23943+                { 1, 0, },
23944+                { 0, 2, },
23945+                { 2, 0, },
23946+                { 1, 2, },
23947+                { 2, 1, },
23948+                { 0, 3, },
23949+                { 3, 0, },
23950+                { 1, 3, },
23951+                { 3, 1, },
23952+                { 2, 3, },
23953+                { 3, 2, },
23954+            };
23955+
23956+            const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
23957+            const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
23958+            const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
23959+            const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
23960+
23961+            if ((mvf_c0->pred_flag & PF_L0) != 0 &&
23962+                (mvf_c1->pred_flag & PF_L1) != 0 &&
23963+                (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
23964+                 mvf_c0->xy[0] != mvf_c1->xy[1]))
23965+            {
23966+                if (merge_idx == nb_merge_cand++)
23967+                {
23968+                    // Need to be a bit careful as we will construct mvf_t and we
23969+                    // may already be using that as one of our candidates
23970+                    // so build & copy rather than build in place
23971+                    const HEVCRpiMvField mvf_m = {
23972+                        .xy = {
23973+                            mvf_c0->xy[0],
23974+                            mvf_c1->xy[1]},
23975+                        .ref_idx = {
23976+                            mvf_c0->ref_idx[0],
23977+                            mvf_c1->ref_idx[1]},
23978+                        .pred_flag = PF_BI
23979+                    };
23980+                    *mvf_t = mvf_m;
23981+                    return mvf_t;
23982+                }
23983+            }
23984+        }
23985+    }
23986+
23987+    // "append" Zero motion vector candidates
23988+    {
23989+        const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
23990+                            FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
23991+        const unsigned int zero_idx = merge_idx - nb_merge_cand;
23992+
23993+        const HEVCRpiMvField mvf_m = {
23994+            .xy = {0, 0},
23995+            .ref_idx = {
23996+                zero_idx < nb_refs ? zero_idx : 0,
23997+                (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
23998+            .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
23999+        };
24000+
24001+        *mvf_t = mvf_m;
24002+        return mvf_t;
24003+    }
24004+}
24005+
24006+
24007+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
24008+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
24009+                                int nPbH, int log2_cb_size, int part_idx,
24010+                                int merge_idx, HEVCRpiMvField * const mv)
24011+{
24012+    const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ?
24013+        derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
24014+                                        ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
24015+                                        0, merge_idx, mv) :
24016+        derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
24017+                                        ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
24018+                                        part_idx, merge_idx, mv);
24019+
24020+    if (mvf_m != mv)
24021+        *mv = *mvf_m;
24022+
24023+    if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12)
24024+        mv->pred_flag = PF_L0;
24025+}
24026+
24027+
24028+static av_always_inline const MvXY *
24029+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
24030+{
24031+    if (mvf != NULL)
24032+    {
24033+        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0)
24034+            return mvf->xy + pfi0;
24035+        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
24036+            return mvf->xy + pfi1;
24037+    }
24038+    return NULL;
24039+}
24040+
24041+static av_always_inline const MvXY *
24042+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
24043+              const int islt0, const int poc0, const int poc_cur,
24044+              MvXY * const mv_t, const HEVCRpiMvField * const mvf)
24045+{
24046+    if (mvf != NULL)
24047+    {
24048+        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
24049+        {
24050+            const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
24051+            if (islt0 || poc1 == poc0) {
24052+                return mvf->xy + pfi0;
24053+            }
24054+            *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
24055+            return mv_t;
24056+        }
24057+        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
24058+        {
24059+            const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
24060+            if (islt0 || poc1 == poc0) {
24061+                return mvf->xy + pfi1;
24062+            }
24063+            *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
24064+            return mv_t;
24065+        }
24066+    }
24067+    return NULL;
24068+}
24069+
24070+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
24071+    const unsigned int x0, const unsigned int y0,
24072+    const unsigned int nPbW, const unsigned int nPbH,
24073+    const unsigned int avail,
24074+    HEVCRpiMvField * const mv,
24075+    const unsigned int mvp_lx_flag, const unsigned int LX)
24076+{
24077+    const unsigned int pfi0 = LX;
24078+    const unsigned int pfi1 = LX == 0 ? 1 : 0;
24079+    const RefPicList * const rpl = s->refPicList;
24080+    const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
24081+    const int poc_cur = s->poc;
24082+    const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
24083+
24084+    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
24085+    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
24086+    const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
24087+    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
24088+    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
24089+    const MvXY * mva = NULL;
24090+    const MvXY * mvb;
24091+    MvXY * const mv_rv = mv->xy + LX;
24092+    MvXY mvt_a, mvt_b;
24093+
24094+    *mv_rv = 0;
24095+
24096+    if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
24097+        mvf_a0 = NULL;
24098+    else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
24099+        goto use_mva;
24100+
24101+    if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
24102+        mvf_a1 = NULL;
24103+
24104+    if (mva == NULL &&
24105+        (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
24106+        (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
24107+        mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
24108+
24109+    if (mvp_lx_flag == 0 && mva != NULL)
24110+        goto use_mva;
24111+
24112+    if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
24113+        mvf_b0 = NULL;
24114+    if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
24115+        mvf_b1 = NULL;
24116+    if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
24117+        mvf_b2 = NULL;
24118+
24119+    if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
24120+        (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
24121+        mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
24122+
24123+    if (mvf_a0 == NULL && mvf_a1 == NULL) {
24124+        mva = mvb;
24125+        if (mvp_lx_flag == 0 && mva != NULL)
24126+            goto use_mva;
24127+
24128+        if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
24129+            (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
24130+            mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
24131+    }
24132+
24133+    if (mva == NULL) {
24134+        mva = mvb;
24135+        mvb = NULL;
24136+    }
24137+
24138+    if (mvb != NULL && *mva == *mvb)  // If A == B then ignore B
24139+        mvb = NULL;
24140+
24141+    if (mvp_lx_flag == 0 && mva != NULL) {
24142+        goto use_mva;
24143+    }
24144+    else if (mvp_lx_flag != 0 && mvb != NULL) {
24145+        *mv_rv = *mvb;
24146+    }
24147+    else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
24148+        temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
24149+                                    nPbH, mv->ref_idx[LX],
24150+                                    mv_rv, LX);
24151+    }
24152+    return;
24153+
24154+use_mva:
24155+    *mv_rv = *mva;
24156+    return;
24157+}
24158+
24159--- /dev/null
24160+++ b/libavcodec/rpi_hevc_parse.c
24161@@ -0,0 +1,143 @@
24162+/*
24163+ * This file is part of FFmpeg.
24164+ *
24165+ * FFmpeg is free software; you can redistribute it and/or
24166+ * modify it under the terms of the GNU Lesser General Public
24167+ * License as published by the Free Software Foundation; either
24168+ * version 2.1 of the License, or (at your option) any later version.
24169+ *
24170+ * FFmpeg is distributed in the hope that it will be useful,
24171+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
24172+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24173+ * Lesser General Public License for more details.
24174+ *
24175+ * You should have received a copy of the GNU Lesser General Public
24176+ * License along with FFmpeg; if not, write to the Free Software
24177+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24178+ */
24179+
24180+#include "bytestream.h"
24181+#include "h2645_parse.h"
24182+#include "hevc.h"
24183+#include "rpi_hevc_parse.h"
24184+
24185+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
24186+                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
24187+                                 int err_recognition, int apply_defdispwin, void *logctx) /* split buf into NALs and decode any VPS/SPS/PPS/SEI into ps/sei */
24188+{
24189+    int i;
24190+    int ret = 0;
24191+    H2645Packet pkt = { 0 }; /* zero-init so uninit below is safe even if split fails */
24192+
24193+    ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
24194+                                nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
24195+    if (ret < 0) {
24196+        goto done;
24197+    }
24198+
24199+    for (i = 0; i < pkt.nb_nals; i++) {
24200+        H2645NAL *nal = &pkt.nals[i];
24201+
24202+        /* ignore everything except parameter sets and VCL NALUs */
24203+        switch (nal->type) {
24204+        case HEVC_NAL_VPS:
24205+            ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
24206+            if (ret < 0)
24207+                goto done;
24208+            break;
24209+        case HEVC_NAL_SPS:
24210+            ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
24211+            if (ret < 0)
24212+                goto done;
24213+            break;
24214+        case HEVC_NAL_PPS:
24215+            ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
24216+            if (ret < 0)
24217+                goto done;
24218+            break;
24219+        case HEVC_NAL_SEI_PREFIX:
24220+        case HEVC_NAL_SEI_SUFFIX:
24221+            ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
24222+            if (ret < 0)
24223+                goto done;
24224+            break;
24225+        default:
24226+            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
24227+            break;
24228+        }
24229+    }
24230+
24231+done:
24232+    ff_h2645_packet_uninit(&pkt);
24233+    if (err_recognition & AV_EF_EXPLODE) /* propagate errors only when caller asked to explode */
24234+        return ret;
24235+
24236+    return 0; /* otherwise best-effort: parse failures in extradata are swallowed */
24237+}
24238+
24239+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
24240+                             HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
24241+                             int err_recognition, int apply_defdispwin, void *logctx) /* auto-detects hvcC vs annex-b extradata; updates *is_nalff and *nal_length_size */
24242+{
24243+    int ret = 0;
24244+    GetByteContext gb;
24245+
24246+    bytestream2_init(&gb, data, size);
24247+
24248+    if (size > 3 && (data[0] || data[1] || data[2] > 1)) { /* cannot be an annex-b start code -> treat as hvcC */
24249+        /* It seems the extradata is encoded as hvcC format.
24250+         * Temporarily, we support configurationVersion==0 until 14496-15 3rd
24251+         * is finalized. When finalized, configurationVersion will be 1 and we
24252+         * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
24253+        int i, j, num_arrays, nal_len_size;
24254+
24255+        *is_nalff = 1;
24256+
24257+        bytestream2_skip(&gb, 21); /* skip fixed hvcC header bytes preceding lengthSizeMinusOne */
24258+        nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
24259+        num_arrays   = bytestream2_get_byte(&gb);
24260+
24261+        /* nal units in the hvcC always have length coded with 2 bytes,
24262+         * so put a fake nal_length_size = 2 while parsing them */
24263+        *nal_length_size = 2;
24264+
24265+        /* Decode nal units from hvcC. */
24266+        for (i = 0; i < num_arrays; i++) {
24267+            int type = bytestream2_get_byte(&gb) & 0x3f;
24268+            int cnt  = bytestream2_get_be16(&gb);
24269+
24270+            for (j = 0; j < cnt; j++) {
24271+                // +2 for the nal size field
24272+                int nalsize = bytestream2_peek_be16(&gb) + 2;
24273+                if (bytestream2_get_bytes_left(&gb) < nalsize) {
24274+                    av_log(logctx, AV_LOG_ERROR,
24275+                           "Invalid NAL unit size in extradata.\n");
24276+                    return AVERROR_INVALIDDATA;
24277+                }
24278+
24279+                ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
24280+                                            *nal_length_size, err_recognition, apply_defdispwin,
24281+                                            logctx);
24282+                if (ret < 0) {
24283+                    av_log(logctx, AV_LOG_ERROR,
24284+                           "Decoding nal unit %d %d from hvcC failed\n",
24285+                           type, i);
24286+                    return ret;
24287+                }
24288+                bytestream2_skip(&gb, nalsize); /* advance past the NAL just decoded (incl. its 2-byte size field) */
24289+            }
24290+        }
24291+
24292+        /* Now store right nal length size, that will be used to parse
24293+         * all other nals */
24294+        *nal_length_size = nal_len_size;
24295+    } else {
24296+        *is_nalff = 0;
24297+        ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
24298+                                    err_recognition, apply_defdispwin, logctx);
24299+        if (ret < 0)
24300+            return ret;
24301+    }
24302+
24303+    return ret;
24304+}
24305--- /dev/null
24306+++ b/libavcodec/rpi_hevc_parse.h
24307@@ -0,0 +1,36 @@
24308+/*
24309+ * This file is part of FFmpeg.
24310+ *
24311+ * FFmpeg is free software; you can redistribute it and/or
24312+ * modify it under the terms of the GNU Lesser General Public
24313+ * License as published by the Free Software Foundation; either
24314+ * version 2.1 of the License, or (at your option) any later version.
24315+ *
24316+ * FFmpeg is distributed in the hope that it will be useful,
24317+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
24318+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24319+ * Lesser General Public License for more details.
24320+ *
24321+ * You should have received a copy of the GNU Lesser General Public
24322+ * License along with FFmpeg; if not, write to the Free Software
24323+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24324+ */
24325+
24326+/**
24327+ * @file
24328+ * H.265 parser code
24329+ */
24330+
24331+#ifndef AVCODEC_RPI_HEVC_PARSE_H
24332+#define AVCODEC_RPI_HEVC_PARSE_H
24333+
24334+#include <stdint.h>
24335+
24336+#include "rpi_hevc_ps.h"
24337+#include "rpi_hevc_sei.h"
24338+
24339+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
24340+                             HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
24341+                             int err_recognition, int apply_defdispwin, void *logctx); /* decode hvcC or annex-b extradata into ps/sei; updates *is_nalff and *nal_length_size */
24342+
24343+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
24344--- /dev/null
24345+++ b/libavcodec/rpi_hevc_ps.c
24346@@ -0,0 +1,1938 @@
24347+/*
24348+ * HEVC Parameter Set decoding
24349+ *
24350+ * Copyright (C) 2012 - 2013 Guillaume Martres
24351+ * Copyright (C) 2012 - 2013 Mickael Raulet
24352+ * Copyright (C) 2012 - 2013 Gildas Cocherel
24353+ * Copyright (C) 2013 Vittorio Giovara
24354+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
24355+ *
24356+ * This file is part of FFmpeg.
24357+ *
24358+ * FFmpeg is free software; you can redistribute it and/or
24359+ * modify it under the terms of the GNU Lesser General Public
24360+ * License as published by the Free Software Foundation; either
24361+ * version 2.1 of the License, or (at your option) any later version.
24362+ *
24363+ * FFmpeg is distributed in the hope that it will be useful,
24364+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
24365+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24366+ * Lesser General Public License for more details.
24367+ *
24368+ * You should have received a copy of the GNU Lesser General Public
24369+ * License along with FFmpeg; if not, write to the Free Software
24370+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24371+ */
24372+
24373+#include "libavutil/imgutils.h"
24374+#include "golomb.h"
24375+#include "rpi_hevc_data.h"
24376+#include "rpi_hevc_ps.h"
24377+#include "rpi_hevcdec.h"
24378+
24379+static const uint8_t default_scaling_list_intra[] = {
24380+    16, 16, 16, 16, 17, 18, 21, 24,
24381+    16, 16, 16, 16, 17, 19, 22, 25,
24382+    16, 16, 17, 18, 20, 22, 25, 29,
24383+    16, 16, 18, 21, 24, 27, 31, 36,
24384+    17, 17, 20, 24, 30, 35, 41, 47,
24385+    18, 19, 22, 27, 35, 44, 54, 65,
24386+    21, 22, 25, 31, 41, 54, 70, 88,
24387+    24, 25, 29, 36, 47, 65, 88, 115
24388+};
24389+
24390+static const uint8_t default_scaling_list_inter[] = {
24391+    16, 16, 16, 16, 17, 18, 20, 24,
24392+    16, 16, 16, 17, 18, 20, 24, 25,
24393+    16, 16, 17, 18, 20, 24, 25, 28,
24394+    16, 17, 18, 20, 24, 25, 28, 33,
24395+    17, 18, 20, 24, 25, 28, 33, 41,
24396+    18, 20, 24, 25, 28, 33, 41, 54,
24397+    20, 24, 25, 28, 33, 41, 54, 71,
24398+    24, 25, 28, 33, 41, 54, 71, 91
24399+};
24400+
24401+static const AVRational vui_sar[] = {
24402+    {  0,   1 },
24403+    {  1,   1 },
24404+    { 12,  11 },
24405+    { 10,  11 },
24406+    { 16,  11 },
24407+    { 40,  33 },
24408+    { 24,  11 },
24409+    { 20,  11 },
24410+    { 32,  11 },
24411+    { 80,  33 },
24412+    { 18,  11 },
24413+    { 15,  11 },
24414+    { 64,  33 },
24415+    { 160, 99 },
24416+    {  4,   3 },
24417+    {  3,   2 },
24418+    {  2,   1 },
24419+};
24420+
24421+
24422+// pps_cb_qp_offset: -12,+12
24423+// slice_cb_qp_offset: -12,+12 also
24424+//   "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
24425+// cr_qp_offset_list[n]: -12,+12
24426+// So worst case total offset: -24,+24
24427+
24428+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6)
24429+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n))
24430+#define M(B,n) C(B,(-n))
24431+
24432+// Sizeof the QP_START_BLOCK
24433+#define QP_OFFSET_0 (8*6 + 12*2)
24434+#define QP_START(B) \
24435+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
24436+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
24437+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
24438+    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
24439+\
24440+    M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
24441+    M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
24442+    M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
24443+    M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
24444+    M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
24445+    M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
24446+    M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
24447+    M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
24448+#define QP_END(B) \
24449+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
24450+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
24451+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
24452+
24453+#define T1(B)\
24454+{\
24455+    QP_START(B),\
24456+    C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
24457+    C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
24458+    C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
24459+    C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
24460+    C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
24461+    C(B,44), C(B,45),\
24462+    C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
24463+    QP_END(B)\
24464+}
24465+#define T0(B)\
24466+{\
24467+    QP_START(B),\
24468+    C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
24469+    C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
24470+    C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
24471+    C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
24472+    C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
24473+    C(B,50), C(B,51),\
24474+    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
24475+    QP_END(B)\
24476+}
24477+
24478+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2)
24479+
24480+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
24481+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
24482+
24483+#undef T
24484+#undef C
24485+#undef QP_END
24486+
24487+#define C(B,n) ((n)<0?0:(n)>51?51:(n))
24488+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
24489+#define QP_DBLK_OFFSET_0 QP_OFFSET_0
24490+#define QP_END(B)\
24491+ 51, 51, 51, 51, 51, 51
24492+
24493+// These don't need all the padding we have here (12 top/bottom would be enough)
24494+static const uint8_t qp_c_dblk_0[] = T0(0);
24495+static const uint8_t qp_c_dblk_1[] = T1(0);
24496+
24497+#undef T
24498+#undef M
24499+#undef C
24500+#undef QP_END
24501+#undef QP_START
24502+
24503+
24504+static void remove_pps(HEVCRpiParamSets * const s, const int id)
24505+{
24506+    if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data)
24507+        s->pps = NULL;
24508+    av_buffer_unref(&s->pps_list[id]);
24509+}
24510+
24511+static void remove_sps(HEVCRpiParamSets * const s, const int id)
24512+{
24513+    int i;
24514+    if (s->sps_list[id]) {
24515+        if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data)
24516+            s->sps = NULL;
24517+
24518+        /* drop all PPS that depend on this SPS */
24519+        for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
24520+            if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id)
24521+                remove_pps(s, i);
24522+
24523+        av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data));
24524+    }
24525+    av_buffer_unref(&s->sps_list[id]);
24526+}
24527+
24528+static void remove_vps(HEVCRpiParamSets * const s, const int id)
24529+{
24530+    int i;
24531+    if (s->vps_list[id]) {
24532+        if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data)
24533+            s->vps = NULL;
24534+
24535+        for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++)
24536+            if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id)
24537+                remove_sps(s, i);
24538+    }
24539+    av_buffer_unref(&s->vps_list[id]);
24540+}
24541+
24542+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
24543+                                  ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
24544+{
24545+    uint8_t rps_predict = 0;
24546+    int delta_poc;
24547+    int k0 = 0;
24548+    int k1 = 0;
24549+    int k  = 0;
24550+    int i;
24551+
24552+    if (rps != sps->st_rps && sps->nb_st_rps)
24553+        rps_predict = get_bits1(gb);
24554+
24555+    if (rps_predict) {
24556+        const ShortTermRPS *rps_ridx;
24557+        int delta_rps;
24558+        unsigned abs_delta_rps;
24559+        uint8_t use_delta_flag = 0;
24560+        uint8_t delta_rps_sign;
24561+
24562+        if (is_slice_header) {
24563+            unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
24564+            if (delta_idx > sps->nb_st_rps) {
24565+                av_log(avctx, AV_LOG_ERROR,
24566+                       "Invalid value of delta_idx in slice header RPS: %u > %u.\n",
24567+                       delta_idx, sps->nb_st_rps);
24568+                return AVERROR_INVALIDDATA;
24569+            }
24570+            rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
24571+            rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
24572+        } else
24573+            rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
24574+
24575+        delta_rps_sign = get_bits1(gb);
24576+        abs_delta_rps  = get_ue_golomb_long(gb) + 1;
24577+        if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
24578+            av_log(avctx, AV_LOG_ERROR,
24579+                   "Invalid value of abs_delta_rps: %u\n",
24580+                   abs_delta_rps);
24581+            return AVERROR_INVALIDDATA;
24582+        }
24583+        delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
24584+        for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
24585+            int used = rps->used[k] = get_bits1(gb);
24586+
24587+            if (!used)
24588+                use_delta_flag = get_bits1(gb);
24589+
24590+            if (used || use_delta_flag) {
24591+                if (i < rps_ridx->num_delta_pocs)
24592+                    delta_poc = delta_rps + rps_ridx->delta_poc[i];
24593+                else
24594+                    delta_poc = delta_rps;
24595+                rps->delta_poc[k] = delta_poc;
24596+                if (delta_poc < 0)
24597+                    k0++;
24598+                else
24599+                    k1++;
24600+                k++;
24601+            }
24602+        }
24603+
24604+        if (k >= FF_ARRAY_ELEMS(rps->used)) {
24605+            av_log(avctx, AV_LOG_ERROR,
24606+                   "Invalid num_delta_pocs: %d\n", k);
24607+            return AVERROR_INVALIDDATA;
24608+        }
24609+
24610+        rps->num_delta_pocs    = k;
24611+        rps->num_negative_pics = k0;
24612+        // sort in increasing order (smallest first)
24613+        if (rps->num_delta_pocs != 0) {
24614+            int used, tmp;
24615+            for (i = 1; i < rps->num_delta_pocs; i++) {
24616+                delta_poc = rps->delta_poc[i];
24617+                used      = rps->used[i];
24618+                for (k = i - 1; k >= 0; k--) {
24619+                    tmp = rps->delta_poc[k];
24620+                    if (delta_poc < tmp) {
24621+                        rps->delta_poc[k + 1] = tmp;
24622+                        rps->used[k + 1]      = rps->used[k];
24623+                        rps->delta_poc[k]     = delta_poc;
24624+                        rps->used[k]          = used;
24625+                    }
24626+                }
24627+            }
24628+        }
24629+        if ((rps->num_negative_pics >> 1) != 0) {
24630+            int used;
24631+            k = rps->num_negative_pics - 1;
24632+            // flip the negative values to largest first
24633+            for (i = 0; i < rps->num_negative_pics >> 1; i++) {
24634+                delta_poc         = rps->delta_poc[i];
24635+                used              = rps->used[i];
24636+                rps->delta_poc[i] = rps->delta_poc[k];
24637+                rps->used[i]      = rps->used[k];
24638+                rps->delta_poc[k] = delta_poc;
24639+                rps->used[k]      = used;
24640+                k--;
24641+            }
24642+        }
24643+    } else {
24644+        unsigned int prev, nb_positive_pics;
24645+        rps->num_negative_pics = get_ue_golomb_long(gb);
24646+        nb_positive_pics       = get_ue_golomb_long(gb);
24647+
24648+        if (rps->num_negative_pics >= HEVC_MAX_REFS ||
24649+            nb_positive_pics >= HEVC_MAX_REFS) {
24650+            av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
24651+            return AVERROR_INVALIDDATA;
24652+        }
24653+
24654+        rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
24655+        if (rps->num_delta_pocs) {
24656+            prev = 0;
24657+            for (i = 0; i < rps->num_negative_pics; i++) {
24658+                delta_poc = get_ue_golomb_long(gb) + 1;
24659+                if (delta_poc < 1 || delta_poc > 32768) {
24660+                    av_log(avctx, AV_LOG_ERROR,
24661+                        "Invalid value of delta_poc: %d\n",
24662+                        delta_poc);
24663+                    return AVERROR_INVALIDDATA;
24664+                }
24665+                prev -= delta_poc;
24666+                rps->delta_poc[i] = prev;
24667+                rps->used[i]      = get_bits1(gb);
24668+            }
24669+            prev = 0;
24670+            for (i = 0; i < nb_positive_pics; i++) {
24671+                delta_poc = get_ue_golomb_long(gb) + 1;
24672+                if (delta_poc < 1 || delta_poc > 32768) {
24673+                    av_log(avctx, AV_LOG_ERROR,
24674+                        "Invalid value of delta_poc: %d\n",
24675+                        delta_poc);
24676+                    return AVERROR_INVALIDDATA;
24677+                }
24678+                prev += delta_poc;
24679+                rps->delta_poc[rps->num_negative_pics + i] = prev;
24680+                rps->used[rps->num_negative_pics + i]      = get_bits1(gb);
24681+            }
24682+        }
24683+    }
24684+    return 0;
24685+}
24686+
24687+
24688+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
24689+                                      PTLCommon * const ptl)
24690+{
24691+    int i;
24692+
24693+    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
24694+        return -1;
24695+
24696+    ptl->profile_space = get_bits(gb, 2);
24697+    ptl->tier_flag     = get_bits1(gb);
24698+    ptl->profile_idc   = get_bits(gb, 5);
24699+    if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN)
24700+        av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
24701+    else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10)
24702+        av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
24703+    else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
24704+        av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
24705+    else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
24706+        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
24707+    else
24708+        av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
24709+
24710+    for (i = 0; i < 32; i++) {
24711+        ptl->profile_compatibility_flag[i] = get_bits1(gb);
24712+
24713+        if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i])
24714+            ptl->profile_idc = i;
24715+    }
24716+    ptl->progressive_source_flag    = get_bits1(gb);
24717+    ptl->interlaced_source_flag     = get_bits1(gb);
24718+    ptl->non_packed_constraint_flag = get_bits1(gb);
24719+    ptl->frame_only_constraint_flag = get_bits1(gb);
24720+
24721+    skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
24722+    skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
24723+    skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
24724+
24725+    return 0;
24726+}
24727+
24728+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
24729+                      PTL * const ptl, const int max_num_sub_layers)
24730+{
24731+    int i;
24732+    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
24733+        get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) {
24734+        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
24735+        return -1;
24736+    }
24737+
24738+    ptl->general_ptl.level_idc = get_bits(gb, 8);
24739+
24740+    for (i = 0; i < max_num_sub_layers - 1; i++) {
24741+        ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
24742+        ptl->sub_layer_level_present_flag[i]   = get_bits1(gb);
24743+    }
24744+
24745+    if (max_num_sub_layers - 1 > 0)
24746+        for (i = max_num_sub_layers - 1; i < 8; i++)
24747+            skip_bits(gb, 2); // reserved_zero_2bits[i]
24748+    for (i = 0; i < max_num_sub_layers - 1; i++) {
24749+        if (ptl->sub_layer_profile_present_flag[i] &&
24750+            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
24751+            av_log(avctx, AV_LOG_ERROR,
24752+                   "PTL information for sublayer %i too short\n", i);
24753+            return -1;
24754+        }
24755+        if (ptl->sub_layer_level_present_flag[i]) {
24756+            if (get_bits_left(gb) < 8) {
24757+                av_log(avctx, AV_LOG_ERROR,
24758+                       "Not enough data for sublayer %i level_idc\n", i);
24759+                return -1;
24760+            } else
24761+                ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
24762+        }
24763+    }
24764+
24765+    return 0;
24766+}
24767+
24768+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
24769+                                const int subpic_params_present)
24770+{
24771+    int i;
24772+
24773+    for (i = 0; i < nb_cpb; i++) {
24774+        get_ue_golomb_long(gb); // bit_rate_value_minus1
24775+        get_ue_golomb_long(gb); // cpb_size_value_minus1
24776+
24777+        if (subpic_params_present) {
24778+            get_ue_golomb_long(gb); // cpb_size_du_value_minus1
24779+            get_ue_golomb_long(gb); // bit_rate_du_value_minus1
24780+        }
24781+        skip_bits1(gb); // cbr_flag
24782+    }
24783+}
24784+
24785+static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
24786+                      const int max_sublayers)
24787+{
24788+    int nal_params_present = 0, vcl_params_present = 0;
24789+    int subpic_params_present = 0;
24790+    int i;
24791+
24792+    if (common_inf_present) {
24793+        nal_params_present = get_bits1(gb);
24794+        vcl_params_present = get_bits1(gb);
24795+
24796+        if (nal_params_present || vcl_params_present) {
24797+            subpic_params_present = get_bits1(gb);
24798+
24799+            if (subpic_params_present) {
24800+                skip_bits(gb, 8); // tick_divisor_minus2
24801+                skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
24802+                skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
24803+                skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
24804+            }
24805+
24806+            skip_bits(gb, 4); // bit_rate_scale
24807+            skip_bits(gb, 4); // cpb_size_scale
24808+
24809+            if (subpic_params_present)
24810+                skip_bits(gb, 4);  // cpb_size_du_scale
24811+
24812+            skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
24813+            skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
24814+            skip_bits(gb, 5); // dpb_output_delay_length_minus1
24815+        }
24816+    }
24817+
24818+    for (i = 0; i < max_sublayers; i++) {
24819+        int low_delay = 0;
24820+        unsigned int nb_cpb = 1;
24821+        int fixed_rate = get_bits1(gb);
24822+
24823+        if (!fixed_rate)
24824+            fixed_rate = get_bits1(gb);
24825+
24826+        if (fixed_rate)
24827+            get_ue_golomb_long(gb);  // elemental_duration_in_tc_minus1
24828+        else
24829+            low_delay = get_bits1(gb);
24830+
24831+        if (!low_delay) {
24832+            nb_cpb = get_ue_golomb_long(gb) + 1;
24833+            if (nb_cpb < 1 || nb_cpb > 32) {
24834+                av_log(NULL, AV_LOG_ERROR, "nb_cpb %u invalid\n", nb_cpb);
24835+                return AVERROR_INVALIDDATA;
24836+            }
24837+        }
24838+
24839+        if (nal_params_present)
24840+            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
24841+        if (vcl_params_present)
24842+            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
24843+    }
24844+    return 0;
24845+}
24846+
24847+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
24848+                           HEVCRpiParamSets * const ps)
24849+{
24850+    int i,j;
24851+    int vps_id = 0;
24852+    ptrdiff_t nal_size;
24853+    HEVCRpiVPS *vps;
24854+    AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
24855+
24856+    if (!vps_buf)
24857+        return AVERROR(ENOMEM);
24858+    vps = (HEVCRpiVPS*)vps_buf->data;
24859+
24860+    av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
24861+
24862+    nal_size = gb->buffer_end - gb->buffer;
24863+    if (nal_size > sizeof(vps->data)) {
24864+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
24865+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
24866+               nal_size, sizeof(vps->data));
24867+        vps->data_size = sizeof(vps->data);
24868+    } else {
24869+        vps->data_size = nal_size;
24870+    }
24871+    memcpy(vps->data, gb->buffer, vps->data_size);
24872+
24873+    vps_id = get_bits(gb, 4);
24874+    if (vps_id >= HEVC_MAX_VPS_COUNT) {
24875+        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
24876+        goto err;
24877+    }
24878+
24879+    if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
24880+        av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
24881+        goto err;
24882+    }
24883+
24884+    vps->vps_max_layers               = get_bits(gb, 6) + 1;
24885+    vps->vps_max_sub_layers           = get_bits(gb, 3) + 1;
24886+    vps->vps_temporal_id_nesting_flag = get_bits1(gb);
24887+
24888+    if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
24889+        av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
24890+        goto err;
24891+    }
24892+
24893+    if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
24894+        av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
24895+               vps->vps_max_sub_layers);
24896+        goto err;
24897+    }
24898+
24899+    if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
24900+        goto err;
24901+
24902+    vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
24903+
24904+    i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
24905+    for (; i < vps->vps_max_sub_layers; i++) {
24906+        vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
24907+        vps->vps_num_reorder_pics[i]      = get_ue_golomb_long(gb);
24908+        vps->vps_max_latency_increase[i]  = get_ue_golomb_long(gb) - 1;
24909+
24910+        if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
24911+            av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
24912+                   vps->vps_max_dec_pic_buffering[i] - 1);
24913+            goto err;
24914+        }
24915+        if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) {
24916+            av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
24917+                   vps->vps_num_reorder_pics[i]);
24918+            if (avctx->err_recognition & AV_EF_EXPLODE)
24919+                goto err;
24920+        }
24921+    }
24922+
24923+    vps->vps_max_layer_id   = get_bits(gb, 6);
24924+    vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
24925+    if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
24926+        (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
24927+        av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
24928+        goto err;
24929+    }
24930+
24931+    for (i = 1; i < vps->vps_num_layer_sets; i++)
24932+        for (j = 0; j <= vps->vps_max_layer_id; j++)
24933+            skip_bits(gb, 1);  // layer_id_included_flag[i][j]
24934+
24935+    vps->vps_timing_info_present_flag = get_bits1(gb);
24936+    if (vps->vps_timing_info_present_flag) {
24937+        vps->vps_num_units_in_tick               = get_bits_long(gb, 32);
24938+        vps->vps_time_scale                      = get_bits_long(gb, 32);
24939+        vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
24940+        if (vps->vps_poc_proportional_to_timing_flag)
24941+            vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
24942+        vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
24943+        if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
24944+            av_log(avctx, AV_LOG_ERROR,
24945+                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
24946+            goto err;
24947+        }
24948+        for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
24949+            int common_inf_present = 1;
24950+
24951+            get_ue_golomb_long(gb); // hrd_layer_set_idx
24952+            if (i)
24953+                common_inf_present = get_bits1(gb);
24954+            decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers);
24955+        }
24956+    }
24957+    get_bits1(gb); /* vps_extension_flag */
24958+
24959+    if (get_bits_left(gb) < 0) {
24960+        av_log(avctx, AV_LOG_ERROR,
24961+               "Overread VPS by %d bits\n", -get_bits_left(gb));
24962+        if (ps->vps_list[vps_id])
24963+            goto err;
24964+    }
24965+
24966+    if (ps->vps_list[vps_id] &&
24967+        !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
24968+        av_buffer_unref(&vps_buf);
24969+    } else {
24970+        remove_vps(ps, vps_id);
24971+        ps->vps_list[vps_id] = vps_buf;
24972+    }
24973+
24974+    return 0;
24975+
24976+err:
24977+    av_buffer_unref(&vps_buf);
24978+    return AVERROR_INVALIDDATA;
24979+}
24980+
24981+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx,
24982+                       const int apply_defdispwin, HEVCRpiSPS * const sps)
24983+{
24984+    VUI backup_vui, * const vui = &sps->vui;
24985+    GetBitContext backup;
24986+    int sar_present, alt = 0;
24987+
24988+    av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
24989+
24990+    sar_present = get_bits1(gb);
24991+    if (sar_present) {
24992+        uint8_t sar_idx = get_bits(gb, 8);
24993+        if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
24994+            vui->sar = vui_sar[sar_idx];
24995+        else if (sar_idx == 255) {
24996+            vui->sar.num = get_bits(gb, 16);
24997+            vui->sar.den = get_bits(gb, 16);
24998+        } else
24999+            av_log(avctx, AV_LOG_WARNING,
25000+                   "Unknown SAR index: %u.\n", sar_idx);
25001+    }
25002+
25003+    vui->overscan_info_present_flag = get_bits1(gb);
25004+    if (vui->overscan_info_present_flag)
25005+        vui->overscan_appropriate_flag = get_bits1(gb);
25006+
25007+    vui->video_signal_type_present_flag = get_bits1(gb);
25008+    if (vui->video_signal_type_present_flag) {
25009+        vui->video_format                    = get_bits(gb, 3);
25010+        vui->video_full_range_flag           = get_bits1(gb);
25011+        vui->colour_description_present_flag = get_bits1(gb);
25012+        if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
25013+            sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
25014+        if (vui->colour_description_present_flag) {
25015+            vui->colour_primaries        = get_bits(gb, 8);
25016+            vui->transfer_characteristic = get_bits(gb, 8);
25017+            vui->matrix_coeffs           = get_bits(gb, 8);
25018+
25019+            // Set invalid values to "unspecified"
25020+            if (!av_color_primaries_name(vui->colour_primaries))
25021+                vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
25022+            if (!av_color_transfer_name(vui->transfer_characteristic))
25023+                vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
25024+            if (!av_color_space_name(vui->matrix_coeffs))
25025+                vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
25026+            if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
25027+                switch (sps->pix_fmt) {
25028+                case AV_PIX_FMT_YUV444P:
25029+                    sps->pix_fmt = AV_PIX_FMT_GBRP;
25030+                    break;
25031+                case AV_PIX_FMT_YUV444P10:
25032+                    sps->pix_fmt = AV_PIX_FMT_GBRP10;
25033+                    break;
25034+                case AV_PIX_FMT_YUV444P12:
25035+                    sps->pix_fmt = AV_PIX_FMT_GBRP12;
25036+                    break;
25037+                }
25038+            }
25039+        }
25040+    }
25041+
25042+    vui->chroma_loc_info_present_flag = get_bits1(gb);
25043+    if (vui->chroma_loc_info_present_flag) {
25044+        vui->chroma_sample_loc_type_top_field    = get_ue_golomb_long(gb);
25045+        vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
25046+    }
25047+
25048+    vui->neutra_chroma_indication_flag = get_bits1(gb);
25049+    vui->field_seq_flag                = get_bits1(gb);
25050+    vui->frame_field_info_present_flag = get_bits1(gb);
25051+
25052+    // Backup context in case an alternate header is detected
25053+    memcpy(&backup, gb, sizeof(backup));
25054+    memcpy(&backup_vui, vui, sizeof(backup_vui));
25055+    if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
25056+        vui->default_display_window_flag = 0;
25057+        av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
25058+    } else
25059+        vui->default_display_window_flag = get_bits1(gb);
25060+
25061+    if (vui->default_display_window_flag) {
25062+        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
25063+        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
25064+        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
25065+        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
25066+        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
25067+        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
25068+
25069+        if (apply_defdispwin &&
25070+            avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
25071+            av_log(avctx, AV_LOG_DEBUG,
25072+                   "discarding vui default display window, "
25073+                   "original values are l:%u r:%u t:%u b:%u\n",
25074+                   vui->def_disp_win.left_offset,
25075+                   vui->def_disp_win.right_offset,
25076+                   vui->def_disp_win.top_offset,
25077+                   vui->def_disp_win.bottom_offset);
25078+
25079+            vui->def_disp_win.left_offset   =
25080+            vui->def_disp_win.right_offset  =
25081+            vui->def_disp_win.top_offset    =
25082+            vui->def_disp_win.bottom_offset = 0;
25083+        }
25084+    }
25085+
25086+timing_info:
25087+    vui->vui_timing_info_present_flag = get_bits1(gb);
25088+
25089+    if (vui->vui_timing_info_present_flag) {
25090+        if( get_bits_left(gb) < 66 && !alt) {
25091+            // The alternate syntax seem to have timing info located
25092+            // at where def_disp_win is normally located
25093+            av_log(avctx, AV_LOG_WARNING,
25094+                   "Strange VUI timing information, retrying...\n");
25095+            memcpy(vui, &backup_vui, sizeof(backup_vui));
25096+            memcpy(gb, &backup, sizeof(backup));
25097+            alt = 1;
25098+            goto timing_info;
25099+        }
25100+        vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
25101+        vui->vui_time_scale                      = get_bits_long(gb, 32);
25102+        if (alt) {
25103+            av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
25104+                   vui->vui_time_scale, vui->vui_num_units_in_tick);
25105+        }
25106+        vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
25107+        if (vui->vui_poc_proportional_to_timing_flag)
25108+            vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
25109+        vui->vui_hrd_parameters_present_flag = get_bits1(gb);
25110+        if (vui->vui_hrd_parameters_present_flag)
25111+            decode_hrd(gb, 1, sps->max_sub_layers);
25112+    }
25113+
25114+    vui->bitstream_restriction_flag = get_bits1(gb);
25115+    if (vui->bitstream_restriction_flag) {
25116+        if (get_bits_left(gb) < 8 && !alt) {
25117+            av_log(avctx, AV_LOG_WARNING,
25118+                   "Strange VUI bitstream restriction information, retrying"
25119+                   " from timing information...\n");
25120+            memcpy(vui, &backup_vui, sizeof(backup_vui));
25121+            memcpy(gb, &backup, sizeof(backup));
25122+            alt = 1;
25123+            goto timing_info;
25124+        }
25125+        vui->tiles_fixed_structure_flag              = get_bits1(gb);
25126+        vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
25127+        vui->restricted_ref_pic_lists_flag           = get_bits1(gb);
25128+        vui->min_spatial_segmentation_idc            = get_ue_golomb_long(gb);
25129+        vui->max_bytes_per_pic_denom                 = get_ue_golomb_long(gb);
25130+        vui->max_bits_per_min_cu_denom               = get_ue_golomb_long(gb);
25131+        vui->log2_max_mv_length_horizontal           = get_ue_golomb_long(gb);
25132+        vui->log2_max_mv_length_vertical             = get_ue_golomb_long(gb);
25133+    }
25134+
25135+    if (get_bits_left(gb) < 1 && !alt) {
25136+        // XXX: Alternate syntax when sps_range_extension_flag != 0?
25137+        av_log(avctx, AV_LOG_WARNING,
25138+               "Overread in VUI, retrying from timing information...\n");
25139+        memcpy(vui, &backup_vui, sizeof(backup_vui));
25140+        memcpy(gb, &backup, sizeof(backup));
25141+        alt = 1;
25142+        goto timing_info;
25143+    }
25144+}
25145+
25146+static void set_default_scaling_list_data(ScalingList * const sl)
25147+{
25148+    int matrixId;
25149+
25150+    for (matrixId = 0; matrixId < 6; matrixId++) {
25151+        // 4x4 default is 16
25152+        memset(sl->sl[0][matrixId], 16, 16);
25153+        sl->sl_dc[0][matrixId] = 16; // default for 16x16
25154+        sl->sl_dc[1][matrixId] = 16; // default for 32x32
25155+    }
25156+
25157+    memcpy(sl->sl[1][0], default_scaling_list_intra, 64);
25158+    memcpy(sl->sl[1][1], default_scaling_list_intra, 64);
25159+    memcpy(sl->sl[1][2], default_scaling_list_intra, 64);
25160+
25161+    memcpy(sl->sl[1][3], default_scaling_list_inter, 64);
25162+    memcpy(sl->sl[1][4], default_scaling_list_inter, 64);
25163+    memcpy(sl->sl[1][5], default_scaling_list_inter, 64);
25164+
25165+    memcpy(sl->sl[2][0], default_scaling_list_intra, 64);
25166+    memcpy(sl->sl[2][1], default_scaling_list_intra, 64);
25167+    memcpy(sl->sl[2][2], default_scaling_list_intra, 64);
25168+
25169+    memcpy(sl->sl[2][3], default_scaling_list_inter, 64);
25170+    memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
25171+    memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
25172+
25173+    memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
25174+    memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
25175+    memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
25176+
25177+    memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
25178+    memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
25179+    memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
25180+}
25181+
25182+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
25183+                             const HEVCRpiSPS * const sps)
25184+{
25185+    uint8_t scaling_list_pred_mode_flag;
25186+    int32_t scaling_list_dc_coef[2][6];
25187+    int size_id, matrix_id, pos;
25188+    int i;
25189+
25190+    for (size_id = 0; size_id < 4; size_id++)
25191+        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
25192+            scaling_list_pred_mode_flag = get_bits1(gb);
25193+            if (!scaling_list_pred_mode_flag) {
25194+                unsigned int delta = get_ue_golomb_long(gb);
25195+                /* Only need to handle non-zero delta. Zero means default,
25196+                 * which should already be in the arrays. */
25197+                if (delta) {
25198+                    // Copy from previous array.
25199+                    delta *= (size_id == 3) ? 3 : 1;
25200+                    if (matrix_id < delta) {
25201+                        av_log(avctx, AV_LOG_ERROR,
25202+                               "Invalid delta in scaling list data: %d.\n", delta);
25203+                        return AVERROR_INVALIDDATA;
25204+                    }
25205+
25206+                    memcpy(sl->sl[size_id][matrix_id],
25207+                           sl->sl[size_id][matrix_id - delta],
25208+                           size_id > 0 ? 64 : 16);
25209+                    if (size_id > 1)
25210+                        sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
25211+                }
25212+            } else {
25213+                int next_coef, coef_num;
25214+                int32_t scaling_list_delta_coef;
25215+
25216+                next_coef = 8;
25217+                coef_num  = FFMIN(64, 1 << (4 + (size_id << 1)));
25218+                if (size_id > 1) {
25219+                    scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8;
25220+                    next_coef = scaling_list_dc_coef[size_id - 2][matrix_id];
25221+                    sl->sl_dc[size_id - 2][matrix_id] = next_coef;
25222+                }
25223+                for (i = 0; i < coef_num; i++) {
25224+                    if (size_id == 0)
25225+                        pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
25226+                                  ff_hevc_rpi_diag_scan4x4_x[i];
25227+                    else
25228+                        pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
25229+                                  ff_hevc_rpi_diag_scan8x8_x[i];
25230+
25231+                    scaling_list_delta_coef = get_se_golomb(gb);
25232+                    next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
25233+                    sl->sl[size_id][matrix_id][pos] = next_coef;
25234+                }
25235+            }
25236+        }
25237+
25238+    if (sps->chroma_format_idc == 3) {
25239+        for (i = 0; i < 64; i++) {
25240+            sl->sl[3][1][i] = sl->sl[2][1][i];
25241+            sl->sl[3][2][i] = sl->sl[2][2][i];
25242+            sl->sl[3][4][i] = sl->sl[2][4][i];
25243+            sl->sl[3][5][i] = sl->sl[2][5][i];
25244+        }
25245+        sl->sl_dc[1][1] = sl->sl_dc[0][1];
25246+        sl->sl_dc[1][2] = sl->sl_dc[0][2];
25247+        sl->sl_dc[1][4] = sl->sl_dc[0][4];
25248+        sl->sl_dc[1][5] = sl->sl_dc[0][5];
25249+    }
25250+
25251+
25252+    return 0;
25253+}
25254+
25255+static int map_pixel_format(HEVCRpiSPS * const sps)
25256+{
25257+    const int cfmt = sps->chroma_format_idc;
25258+
25259+    sps->pix_fmt = AV_PIX_FMT_NONE;
25260+    switch (sps->bit_depth) {
25261+    case 8:
25262+        if (cfmt == 1)
25263+            sps->pix_fmt = AV_PIX_FMT_SAND128;
25264+        break;
25265+    case 10:
25266+        if (cfmt == 1)
25267+            sps->pix_fmt = AV_PIX_FMT_SAND64_10;
25268+        break;
25269+    default:
25270+        break;
25271+    }
25272+
25273+    sps->hshift[0] = sps->vshift[0] = 0;
25274+    sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4
25275+    sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2
25276+
25277+    sps->pixel_shift = sps->bit_depth > 8 ? 1 : 0;
25278+
25279+    return 0;
25280+}
25281+
25282+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
25283+                      const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
25284+{
25285+    HEVCRpiWindow *ow;
25286+    int ret = 0;
25287+    int log2_diff_max_min_transform_block_size;
25288+    int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
25289+    int i;
25290+
25291+    // Coded parameters
25292+
25293+    sps->vps_id = get_bits(gb, 4);
25294+    if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
25295+        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
25296+        return AVERROR_INVALIDDATA;
25297+    }
25298+
25299+    if (vps_list && !vps_list[sps->vps_id]) {
25300+        av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
25301+               sps->vps_id);
25302+        return AVERROR_INVALIDDATA;
25303+    }
25304+
25305+    sps->max_sub_layers = get_bits(gb, 3) + 1;
25306+    if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
25307+        av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
25308+               sps->max_sub_layers);
25309+        return AVERROR_INVALIDDATA;
25310+    }
25311+
25312+    sps->temporal_id_nesting_flag = get_bits(gb, 1);
25313+
25314+    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
25315+        return ret;
25316+
25317+    *sps_id = get_ue_golomb_long(gb);
25318+    if (*sps_id >= HEVC_MAX_SPS_COUNT) {
25319+        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
25320+        return AVERROR_INVALIDDATA;
25321+    }
25322+
25323+    sps->chroma_format_idc = get_ue_golomb_long(gb);
25324+    if (sps->chroma_format_idc > 3U) {
25325+        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
25326+        return AVERROR_INVALIDDATA;
25327+    }
25328+
25329+    if (sps->chroma_format_idc == 3)
25330+        sps->separate_colour_plane_flag = get_bits1(gb);
25331+
25332+    if (sps->separate_colour_plane_flag)
25333+        sps->chroma_format_idc = 0;
25334+
25335+    sps->width  = get_ue_golomb_long(gb);
25336+    sps->height = get_ue_golomb_long(gb);
25337+    if ((ret = av_image_check_size(sps->width,
25338+                                   sps->height, 0, avctx)) < 0)
25339+        return ret;
25340+
25341+    if (get_bits1(gb)) { // pic_conformance_flag
25342+        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
25343+        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
25344+        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
25345+        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
25346+        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
25347+        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
25348+
25349+        if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
25350+            av_log(avctx, AV_LOG_DEBUG,
25351+                   "discarding sps conformance window, "
25352+                   "original values are l:%u r:%u t:%u b:%u\n",
25353+                   sps->pic_conf_win.left_offset,
25354+                   sps->pic_conf_win.right_offset,
25355+                   sps->pic_conf_win.top_offset,
25356+                   sps->pic_conf_win.bottom_offset);
25357+
25358+            sps->pic_conf_win.left_offset   =
25359+            sps->pic_conf_win.right_offset  =
25360+            sps->pic_conf_win.top_offset    =
25361+            sps->pic_conf_win.bottom_offset = 0;
25362+        }
25363+        sps->output_window = sps->pic_conf_win;
25364+    }
25365+
25366+    sps->bit_depth   = get_ue_golomb_long(gb) + 8;
25367+    bit_depth_chroma = get_ue_golomb_long(gb) + 8;
25368+    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
25369+        av_log(avctx, AV_LOG_ERROR,
25370+               "Luma bit depth (%d) is different from chroma bit depth (%d), "
25371+               "this is unsupported.\n",
25372+               sps->bit_depth, bit_depth_chroma);
25373+        return AVERROR_INVALIDDATA;
25374+    }
25375+
25376+    ret = map_pixel_format(sps);
25377+    if (ret < 0)
25378+        return ret;
25379+
25380+    sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
25381+    if (sps->log2_max_poc_lsb > 16) {
25382+        av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
25383+               sps->log2_max_poc_lsb - 4);
25384+        return AVERROR_INVALIDDATA;
25385+    }
25386+
25387+    sublayer_ordering_info = get_bits1(gb);
25388+    start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
25389+    for (i = start; i < sps->max_sub_layers; i++) {
25390+        sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
25391+        sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb_long(gb);
25392+        sps->temporal_layer[i].max_latency_increase  = get_ue_golomb_long(gb) - 1;
25393+        if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
25394+            av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
25395+                   sps->temporal_layer[i].max_dec_pic_buffering - 1U);
25396+            return AVERROR_INVALIDDATA;
25397+        }
25398+        if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
25399+            av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
25400+                   sps->temporal_layer[i].num_reorder_pics);
25401+            if (avctx->err_recognition & AV_EF_EXPLODE ||
25402+                sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
25403+                return AVERROR_INVALIDDATA;
25404+            }
25405+            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
25406+        }
25407+    }
25408+
25409+    if (!sublayer_ordering_info) {
25410+        for (i = 0; i < start; i++) {
25411+            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
25412+            sps->temporal_layer[i].num_reorder_pics      = sps->temporal_layer[start].num_reorder_pics;
25413+            sps->temporal_layer[i].max_latency_increase  = sps->temporal_layer[start].max_latency_increase;
25414+        }
25415+    }
25416+
25417+    sps->log2_min_cb_size                    = get_ue_golomb_long(gb) + 3;
25418+    sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
25419+    sps->log2_min_tb_size                    = get_ue_golomb_long(gb) + 2;
25420+    log2_diff_max_min_transform_block_size   = get_ue_golomb_long(gb);
25421+    sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
25422+                                               sps->log2_min_tb_size;
25423+
25424+    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
25425+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size);
25426+        return AVERROR_INVALIDDATA;
25427+    }
25428+
25429+    if (sps->log2_diff_max_min_coding_block_size > 30) {
25430+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size);
25431+        return AVERROR_INVALIDDATA;
25432+    }
25433+
25434+    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
25435+        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
25436+        return AVERROR_INVALIDDATA;
25437+    }
25438+
25439+    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
25440+        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size);
25441+        return AVERROR_INVALIDDATA;
25442+    }
25443+
25444+    {
25445+        const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
25446+        // Not a bitstream limitation, but all profiles
25447+        if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
25448+            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY);
25449+            return AVERROR_INVALIDDATA;
25450+        }
25451+
25452+        if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
25453+            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size);
25454+            return AVERROR_INVALIDDATA;
25455+        }
25456+
25457+        // Inferred parameters
25458+        sps->log2_ctb_size = CtbLog2SizeY;
25459+//        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
25460+    }
25461+
25462+    sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
25463+    sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
25464+
25465+    sps->scaling_list_enable_flag = get_bits1(gb);
25466+    if (sps->scaling_list_enable_flag) {
25467+        set_default_scaling_list_data(&sps->scaling_list);
25468+
25469+        if (get_bits1(gb)) {
25470+            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
25471+            if (ret < 0)
25472+                return ret;
25473+        }
25474+    }
25475+
25476+    sps->amp_enabled_flag = get_bits1(gb);
25477+    sps->sao_enabled      = get_bits1(gb);
25478+
25479+    // Set pcm defaults (0) so we don't have to test _enabled when we
25480+    // want to use them
25481+    memset(&sps->pcm, 0, sizeof(sps->pcm));
25482+
25483+    if (get_bits1(gb))  // pcm_enabled_flag
25484+    {
25485+        const unsigned int limit_max_pcm = FFMIN(5,
25486+            sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
25487+        sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
25488+        sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1;
25489+        sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
25490+        sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
25491+                                        get_ue_golomb_long(gb);
25492+        if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
25493+            av_log(avctx, AV_LOG_ERROR,
25494+                   "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
25495+                   sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
25496+            return AVERROR_INVALIDDATA;
25497+        }
25498+        if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
25499+            sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
25500+            av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)",
25501+                   sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
25502+            return AVERROR_INVALIDDATA;
25503+        }
25504+
25505+        sps->pcm.loop_filter_disable_flag = get_bits1(gb);
25506+    }
25507+
25508+    // Could be based on min_pcm_cb_size but much easier logic if we just stick
25509+    // with 8 (and costs us little)
25510+    sps->pcm_width = (sps->width + 63) >> 6;  // 8 for min size, 8 bits per byte - round up
25511+    sps->pcm_height = (sps->height + 7) >> 3;
25512+
25513+    sps->nb_st_rps = get_ue_golomb_long(gb);
25514+    if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
25515+        av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
25516+               sps->nb_st_rps);
25517+        return AVERROR_INVALIDDATA;
25518+    }
25519+    for (i = 0; i < sps->nb_st_rps; i++) {
25520+        if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
25521+                                                 sps, 0)) < 0)
25522+            return ret;
25523+    }
25524+
25525+    sps->long_term_ref_pics_present_flag = get_bits1(gb);
25526+    if (sps->long_term_ref_pics_present_flag) {
25527+        sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
25528+        if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
25529+            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
25530+                   sps->num_long_term_ref_pics_sps);
25531+            return AVERROR_INVALIDDATA;
25532+        }
25533+        for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
25534+            sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
25535+            sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
25536+        }
25537+    }
25538+
25539+    sps->sps_temporal_mvp_enabled_flag          = get_bits1(gb);
25540+    sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag
25541+    sps->vui.sar = (AVRational){0, 1};
25542+    vui_present = get_bits1(gb);
25543+    if (vui_present)
25544+        decode_vui(gb, avctx, apply_defdispwin, sps);
25545+
25546+    if (get_bits1(gb)) { // sps_extension_flag
25547+        int sps_extension_flag[1];
25548+        for (i = 0; i < 1; i++)
25549+            sps_extension_flag[i] = get_bits1(gb);
25550+        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
25551+        if (sps_extension_flag[0]) {
25552+            int extended_precision_processing_flag;
25553+            int cabac_bypass_alignment_enabled_flag;
25554+
25555+            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
25556+            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
25557+            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
25558+
25559+            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
25560+
25561+            extended_precision_processing_flag = get_bits1(gb);
25562+            if (extended_precision_processing_flag)
25563+                av_log(avctx, AV_LOG_WARNING,
25564+                   "extended_precision_processing_flag not yet implemented\n");
25565+
25566+            if (get_bits1(gb))          // sps->intra_smoothing_disabled_flag
25567+                sps->intra_filters_disable |= FILTER_EITHER;
25568+            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
25569+            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
25570+
25571+            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
25572+            if (cabac_bypass_alignment_enabled_flag)
25573+                av_log(avctx, AV_LOG_WARNING,
25574+                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
25575+        }
25576+    }
25577+    if (apply_defdispwin) {
25578+        sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
25579+        sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
25580+        sps->output_window.top_offset    += sps->vui.def_disp_win.top_offset;
25581+        sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
25582+    }
25583+
25584+    ow = &sps->output_window;
25585+    if (ow->left_offset >= INT_MAX - ow->right_offset     ||
25586+        ow->top_offset  >= INT_MAX - ow->bottom_offset    ||
25587+        ow->left_offset + ow->right_offset  >= sps->width ||
25588+        ow->top_offset  + ow->bottom_offset >= sps->height) {
25589+        av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
25590+               ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
25591+        if (avctx->err_recognition & AV_EF_EXPLODE) {
25592+            return AVERROR_INVALIDDATA;
25593+        }
25594+        av_log(avctx, AV_LOG_WARNING,
25595+               "Displaying the whole video surface.\n");
25596+        memset(ow, 0, sizeof(*ow));
25597+        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
25598+    }
25599+
25600+    // Inferred parameters
25601+
25602+    sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
25603+    sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
25604+    sps->ctb_size   = sps->ctb_width * sps->ctb_height;
25605+
25606+    sps->min_cb_width  = sps->width  >> sps->log2_min_cb_size;
25607+    sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
25608+    sps->min_tb_width  = sps->width  >> sps->log2_min_tb_size;
25609+    sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
25610+    sps->min_pu_width  = sps->width  >> LOG2_MIN_PU_SIZE;
25611+    sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
25612+    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
25613+
25614+    sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
25615+    sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
25616+
25617+    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
25618+        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
25619+        av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
25620+        return AVERROR_INVALIDDATA;
25621+    }
25622+
25623+    if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
25624+        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
25625+               sps->max_transform_hierarchy_depth_inter);
25626+        return AVERROR_INVALIDDATA;
25627+    }
25628+    if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
25629+        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
25630+               sps->max_transform_hierarchy_depth_intra);
25631+        return AVERROR_INVALIDDATA;
25632+    }
25633+    if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
25634+        av_log(avctx, AV_LOG_ERROR,
25635+               "max transform block size out of range: %d\n",
25636+               sps->log2_max_trafo_size);
25637+        return AVERROR_INVALIDDATA;
25638+    }
25639+
25640+    if (get_bits_left(gb) < 0) {
25641+        av_log(avctx, AV_LOG_ERROR,
25642+               "Overread SPS by %d bits\n", -get_bits_left(gb));
25643+        return AVERROR_INVALIDDATA;
25644+    }
25645+
25646+    return 0;
25647+}
25648+
25649+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
25650+                           HEVCRpiParamSets *ps, int apply_defdispwin)
25651+{
25652+    HEVCRpiSPS *sps;
25653+    AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
25654+    unsigned int sps_id;
25655+    int ret;
25656+    ptrdiff_t nal_size;
25657+
25658+    if (!sps_buf)
25659+        return AVERROR(ENOMEM);
25660+    sps = (HEVCRpiSPS*)sps_buf->data;
25661+
25662+    av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
25663+
25664+    nal_size = gb->buffer_end - gb->buffer;
25665+    if (nal_size > sizeof(sps->data)) {
25666+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
25667+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
25668+               nal_size, sizeof(sps->data));
25669+        sps->data_size = sizeof(sps->data);
25670+    } else {
25671+        sps->data_size = nal_size;
25672+    }
25673+    memcpy(sps->data, gb->buffer, sps->data_size);
25674+
25675+    ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id,
25676+                            apply_defdispwin,
25677+                            ps->vps_list, avctx);
25678+    if (ret < 0) {
25679+        av_buffer_unref(&sps_buf);
25680+        return ret;
25681+    }
25682+
25683+    if (avctx->debug & FF_DEBUG_BITSTREAM) {
25684+        av_log(avctx, AV_LOG_DEBUG,
25685+               "Parsed SPS: id %d; coded wxh: %dx%d; "
25686+               "cropped wxh: %dx%d; pix_fmt: %s.\n",
25687+               sps_id, sps->width, sps->height,
25688+               sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
25689+               sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
25690+               av_get_pix_fmt_name(sps->pix_fmt));
25691+    }
25692+
25693+    /* check if this is a repeat of an already parsed SPS, then keep the
25694+     * original one.
25695+     * otherwise drop all PPSes that depend on it */
25696+    if (ps->sps_list[sps_id] &&
25697+        !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) {
25698+        av_buffer_unref(&sps_buf);
25699+    } else {
25700+        remove_sps(ps, sps_id);
25701+        ps->sps_list[sps_id] = sps_buf;
25702+    }
25703+
25704+    return 0;
25705+}
25706+
25707+static void hevc_pps_free(void *opaque, uint8_t *data)
25708+{
25709+    HEVCRpiPPS *pps = (HEVCRpiPPS*)data;
25710+
25711+    av_freep(&pps->column_width);
25712+    av_freep(&pps->row_height);
25713+    av_freep(&pps->col_bd);
25714+    av_freep(&pps->row_bd);
25715+    av_freep(&pps->col_idxX);
25716+    av_freep(&pps->ctb_addr_rs_to_ts);
25717+    av_freep(&pps->ctb_addr_ts_to_rs);
25718+    av_freep(&pps->tile_pos_ts);
25719+    av_freep(&pps->tile_size);
25720+    av_freep(&pps->tile_id);
25721+    av_freep(&pps->ctb_ts_flags);
25722+
25723+    av_freep(&pps);
25724+}
25725+
25726+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
25727+{
25728+    do
25729+    {
25730+        const int offset = get_se_golomb_long(gb);
25731+        if (offset < -12 || offset > 12) {
25732+            av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset);
25733+            return AVERROR_INVALIDDATA;
25734+        }
25735+        *offsets++ = offset;
25736+    } while (n_minus_1-- != 0);
25737+    return 0;
25738+}
25739+
25740+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
25741+                                HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
25742+{
25743+    if (pps->transform_skip_enabled_flag) {
25744+        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
25745+    }
25746+    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
25747+    if (pps->cross_component_prediction_enabled_flag &&
25748+        (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
25749+    {
25750+        av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
25751+        return AVERROR_INVALIDDATA;
25752+    }
25753+    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
25754+    if (pps->chroma_qp_offset_list_enabled_flag) {
25755+        int err;
25756+
25757+        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
25758+        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
25759+        if (pps->chroma_qp_offset_list_len_minus1 > 5) {
25760+            av_log(avctx, AV_LOG_ERROR,
25761+                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
25762+            return AVERROR_INVALIDDATA;
25763+        }
25764+        av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
25765+
25766+        if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
25767+            (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
25768+            return err;
25769+    }
25770+
25771+    {
25772+        const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;
25773+
25774+        pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
25775+        if (pps->log2_sao_offset_scale_luma > max_offset) {
25776+            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid");
25777+            return AVERROR_INVALIDDATA;
25778+        }
25779+        pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
25780+        if (pps->log2_sao_offset_scale_chroma > max_offset) {
25781+            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid");
25782+            return AVERROR_INVALIDDATA;
25783+        }
25784+    }
25785+
25786+    return(0);
25787+}
25788+
25789+static inline int setup_pps(AVCodecContext * const avctx,
25790+                            HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
25791+{
25792+    int pic_area_in_ctbs;
25793+    int i, j, x, y, ctb_addr_rs, tile_id;
25794+
25795+    // Inferred parameters
25796+
25797+    // qp_y -> qp_u/qp_v tables
25798+    // The tables have at least -24,+24 overrun after adding offset here
25799+    // which should allow for clipless offsetting
25800+
25801+    pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0;  // No offset for luma, but may be useful for general code
25802+    pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
25803+
25804+    if (sps->chroma_format_idc == 1) {
25805+        pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
25806+        pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
25807+        pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
25808+        pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
25809+    }
25810+    else
25811+    {
25812+        pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
25813+        pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
25814+        pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
25815+        pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
25816+    }
25817+
25818+    pps->col_bd   = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
25819+    pps->row_bd   = av_malloc_array(pps->num_tile_rows + 1,    sizeof(*pps->row_bd));
25820+    pps->col_idxX = av_malloc_array(sps->ctb_width,    sizeof(*pps->col_idxX));
25821+    if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
25822+        return AVERROR(ENOMEM);
25823+
25824+    if (pps->uniform_spacing_flag) {
25825+        if (!pps->column_width) {
25826+            pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
25827+            pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
25828+        }
25829+        if (!pps->column_width || !pps->row_height)
25830+            return AVERROR(ENOMEM);
25831+
25832+        for (i = 0; i < pps->num_tile_columns; i++) {
25833+            pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
25834+                                   (i * sps->ctb_width) / pps->num_tile_columns;
25835+        }
25836+
25837+        for (i = 0; i < pps->num_tile_rows; i++) {
25838+            pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
25839+                                 (i * sps->ctb_height) / pps->num_tile_rows;
25840+        }
25841+    }
25842+
25843+    {
25844+        const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
25845+        pps->col_bd[0] = 0;
25846+        pps->tile_wpp_inter_disable = 0;
25847+        for (i = 0; i < pps->num_tile_columns; i++)
25848+        {
25849+            pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
25850+
25851+            // Avoid trying tile parallel if the columns don't fall on cache boundaries
25852+            // (this causes too much pain syncing flushes with the QPU)
25853+            // Ignore the final (RHS of pic) tile boundary
25854+            if ((pps->col_bd[i] & td_mask) != 0) {
25855+                pps->tile_wpp_inter_disable = 1;
25856+            }
25857+        }
25858+
25859+        // If we can start the next row before finishing the first line of
25860+        // this one then we must wait at the end of the tile
25861+        // * if this happens a lot then there are better but more complicated
25862+        //   conditions that we could apply
25863+        if (pps->tile_wpp_inter_disable) {
25864+            for (i = 0; i < pps->num_tile_rows; i++)
25865+            {
25866+                if (pps->row_height[i] <= RPI_MAX_JOBS) {
25867+                    pps->tile_wpp_inter_disable = 2;
25868+                    break;
25869+                }
25870+            }
25871+        }
25872+    }
25873+
25874+    pps->row_bd[0] = 0;
25875+    for (i = 0; i < pps->num_tile_rows; i++)
25876+        pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
25877+
25878+    for (i = 0, j = 0; i < sps->ctb_width; i++) {
25879+        if (i >= pps->col_bd[j + 1])
25880+            j++;
25881+        pps->col_idxX[i] = j;
25882+    }
25883+
25884+    /**
25885+     * 6.5
25886+     */
25887+    pic_area_in_ctbs     = sps->ctb_size;
25888+
25889+    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
25890+    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
25891+    pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
25892+    pps->tile_size         = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
25893+    pps->tile_pos_ts       = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
25894+    pps->ctb_ts_flags      = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_ts_flags));
25895+    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
25896+        !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
25897+        return AVERROR(ENOMEM);
25898+    }
25899+
25900+    memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
25901+
25902+    for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
25903+        int tb_x   = ctb_addr_rs % sps->ctb_width;
25904+        int tb_y   = ctb_addr_rs / sps->ctb_width;
25905+        int tile_x = 0;
25906+        int tile_y = 0;
25907+        int val    = 0;
25908+
25909+        for (i = 0; i < pps->num_tile_columns; i++) {
25910+            if (tb_x < pps->col_bd[i + 1]) {
25911+                tile_x = i;
25912+                break;
25913+            }
25914+        }
25915+
25916+        for (i = 0; i < pps->num_tile_rows; i++) {
25917+            if (tb_y < pps->row_bd[i + 1]) {
25918+                tile_y = i;
25919+                break;
25920+            }
25921+        }
25922+
25923+        for (i = 0; i < tile_x; i++)
25924+            val += pps->row_height[tile_y] * pps->column_width[i];
25925+        for (i = 0; i < tile_y; i++)
25926+            val += sps->ctb_width * pps->row_height[i];
25927+
25928+        val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
25929+               tb_x - pps->col_bd[tile_x];
25930+
25931+        pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
25932+        pps->ctb_addr_ts_to_rs[val]         = ctb_addr_rs;
25933+    }
25934+
25935+    {
25936+        uint8_t * pflags = pps->ctb_ts_flags;
25937+        uint16_t * ptid = pps->tile_id;
25938+
25939+        for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
25940+        {
25941+            for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
25942+            {
25943+                const unsigned int tile_w = pps->column_width[i];
25944+
25945+                pflags[0] |= CTB_TS_FLAGS_CIREQ;
25946+
25947+                for (x = 0; x != tile_w; ++x) {
25948+                    pflags[x] |= CTB_TS_FLAGS_TOT;
25949+                }
25950+
25951+                for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
25952+                {
25953+                    pflags[0] |= CTB_TS_FLAGS_SOTL;
25954+
25955+                    if (pps->entropy_coding_sync_enabled_flag)
25956+                    {
25957+                        if (pps->column_width[i] != 1)
25958+                            pflags[1] |= CTB_TS_FLAGS_CSAVE;
25959+                        else
25960+                            pflags[0] |= CTB_TS_FLAGS_CIREQ;
25961+
25962+                        if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
25963+                            pflags[0] |= CTB_TS_FLAGS_CLOAD;
25964+                    }
25965+
25966+                    for (x = 0; x != tile_w; ++x)
25967+                        *ptid++ = tile_id;
25968+
25969+                    pflags += tile_w;
25970+                    pflags[-1] |= CTB_TS_FLAGS_EOTL;
25971+                    if (i + 1 == pps->num_tile_columns)
25972+                        pflags[-1] |= CTB_TS_FLAGS_EOL;
25973+                }
25974+
25975+                pflags[-1] |= CTB_TS_FLAGS_EOT;
25976+            }
25977+        }
25978+    }
25979+
25980+    {
25981+        unsigned int ts = 0;
25982+        for (j = 0; j < pps->num_tile_rows; j++)
25983+            for (i = 0; i < pps->num_tile_columns; i++)
25984+            {
25985+                const unsigned int size = pps->column_width[i] * pps->row_height[j];
25986+                pps->tile_size[j * pps->num_tile_columns + i] = size;
25987+                pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
25988+                ts += size;
25989+            }
25990+    }
25991+
25992+    return 0;
25993+}
25994+
25995+int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
25996+                           HEVCRpiParamSets * const ps)
25997+{
25998+    const HEVCRpiSPS *sps = NULL;
25999+    int i, ret = 0;
26000+    unsigned int pps_id = 0;
26001+    ptrdiff_t nal_size;
26002+    unsigned log2_parallel_merge_level_minus2;
26003+
26004+    AVBufferRef *pps_buf;
26005+    HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
26006+
26007+    if (!pps)
26008+        return AVERROR(ENOMEM);
26009+
26010+    pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps),
26011+                               hevc_pps_free, NULL, 0);
26012+    if (!pps_buf) {
26013+        av_freep(&pps);
26014+        return AVERROR(ENOMEM);
26015+    }
26016+
26017+    av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
26018+
26019+    nal_size = gb->buffer_end - gb->buffer;
26020+    if (nal_size > sizeof(pps->data)) {
26021+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
26022+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
26023+               nal_size, sizeof(pps->data));
26024+        pps->data_size = sizeof(pps->data);
26025+    } else {
26026+        pps->data_size = nal_size;
26027+    }
26028+    memcpy(pps->data, gb->buffer, pps->data_size);
26029+
26030+    // Default values
26031+    pps->loop_filter_across_tiles_enabled_flag = 1;
26032+    pps->num_tile_columns                      = 1;
26033+    pps->num_tile_rows                         = 1;
26034+    pps->uniform_spacing_flag                  = 1;
26035+    pps->disable_dbf                           = 0;
26036+    pps->beta_offset                           = 0;
26037+    pps->tc_offset                             = 0;
26038+    pps->log2_max_transform_skip_block_size    = 2;
26039+
26040+    // Coded parameters
26041+    pps_id = get_ue_golomb_long(gb);
26042+    if (pps_id >= HEVC_MAX_PPS_COUNT) {
26043+        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id);
26044+        ret = AVERROR_INVALIDDATA;
26045+        goto err;
26046+    }
26047+    pps->sps_id = get_ue_golomb_long(gb);
26048+    if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
26049+        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id);
26050+        ret = AVERROR_INVALIDDATA;
26051+        goto err;
26052+    }
26053+    if (!ps->sps_list[pps->sps_id]) {
26054+        av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
26055+        ret = AVERROR_INVALIDDATA;
26056+        goto err;
26057+    }
26058+    sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
26059+
26060+    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
26061+    pps->output_flag_present_flag              = get_bits1(gb);
26062+    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
26063+
26064+    pps->sign_data_hiding_flag = get_bits1(gb);
26065+
26066+    pps->cabac_init_present_flag = get_bits1(gb);
26067+
26068+    pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
26069+    if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
26070+        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
26071+        ret = AVERROR_INVALIDDATA;
26072+        goto err;
26073+    }
26074+    pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
26075+    if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
26076+        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
26077+        ret = AVERROR_INVALIDDATA;
26078+        goto err;
26079+    }
26080+
26081+    pps->pic_init_qp_minus26 = get_se_golomb(gb);
26082+    if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
26083+        av_log(avctx, AV_LOG_ERROR,
26084+               "init_qp_minus26 %d is outside the valid range "
26085+               "[%d, %d].\n",
26086+               pps->pic_init_qp_minus26,
26087+               -(26 + sps->qp_bd_offset), 25);
26088+        ret = AVERROR_INVALIDDATA;
26089+        goto err;
26090+    }
26091+
26092+    pps->constrained_intra_pred_flag = get_bits1(gb);
26093+    pps->transform_skip_enabled_flag = get_bits1(gb);
26094+
26095+    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
26096+    pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
26097+    if (pps->cu_qp_delta_enabled_flag)
26098+    {
26099+        const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
26100+
26101+        if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
26102+            av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n",
26103+                   diff_cu_qp_delta_depth);
26104+            ret = AVERROR_INVALIDDATA;
26105+            goto err;
26106+        }
26107+
26108+        pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
26109+    }
26110+
26111+    pps->cb_qp_offset = get_se_golomb(gb);
26112+    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
26113+        av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
26114+               pps->cb_qp_offset);
26115+        ret = AVERROR_INVALIDDATA;
26116+        goto err;
26117+    }
26118+    pps->cr_qp_offset = get_se_golomb(gb);
26119+    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
26120+        av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
26121+               pps->cr_qp_offset);
26122+        ret = AVERROR_INVALIDDATA;
26123+        goto err;
26124+    }
26125+    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
26126+
26127+    pps->weighted_pred_flag   = get_bits1(gb);
26128+    pps->weighted_bipred_flag = get_bits1(gb);
26129+
26130+    pps->transquant_bypass_enable_flag    = get_bits1(gb);
26131+    pps->tiles_enabled_flag               = get_bits1(gb);
26132+    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
26133+
26134+    if (pps->tiles_enabled_flag) {
26135+        pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
26136+        pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
26137+        if (pps->num_tile_columns <= 0 ||
26138+            pps->num_tile_columns >= sps->width) {
26139+            av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
26140+                   pps->num_tile_columns - 1);
26141+            ret = AVERROR_INVALIDDATA;
26142+            goto err;
26143+        }
26144+        if (pps->num_tile_rows <= 0 ||
26145+            pps->num_tile_rows >= sps->height) {
26146+            av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
26147+                   pps->num_tile_rows - 1);
26148+            ret = AVERROR_INVALIDDATA;
26149+            goto err;
26150+        }
26151+
26152+        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
26153+        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
26154+        if (!pps->column_width || !pps->row_height) {
26155+            ret = AVERROR(ENOMEM);
26156+            goto err;
26157+        }
26158+
26159+        pps->uniform_spacing_flag = get_bits1(gb);
26160+        if (!pps->uniform_spacing_flag) {
26161+            uint64_t sum = 0;
26162+            for (i = 0; i < pps->num_tile_columns - 1; i++) {
26163+                pps->column_width[i] = get_ue_golomb_long(gb) + 1;
26164+                sum                 += pps->column_width[i];
26165+            }
26166+            if (sum >= sps->ctb_width) {
26167+                av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
26168+                ret = AVERROR_INVALIDDATA;
26169+                goto err;
26170+            }
26171+            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
26172+
26173+            sum = 0;
26174+            for (i = 0; i < pps->num_tile_rows - 1; i++) {
26175+                pps->row_height[i] = get_ue_golomb_long(gb) + 1;
26176+                sum               += pps->row_height[i];
26177+            }
26178+            if (sum >= sps->ctb_height) {
26179+                av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
26180+                ret = AVERROR_INVALIDDATA;
26181+                goto err;
26182+            }
26183+            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
26184+        }
26185+        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
26186+    }
26187+
26188+    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
26189+
26190+    pps->deblocking_filter_control_present_flag = get_bits1(gb);
26191+    if (pps->deblocking_filter_control_present_flag) {
26192+        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
26193+        pps->disable_dbf                             = get_bits1(gb);
26194+        if (!pps->disable_dbf) {
26195+            int beta_offset_div2 = get_se_golomb(gb);
26196+            int tc_offset_div2   = get_se_golomb(gb) ;
26197+            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
26198+                av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
26199+                       beta_offset_div2);
26200+                ret = AVERROR_INVALIDDATA;
26201+                goto err;
26202+            }
26203+            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
26204+                av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
26205+                       tc_offset_div2);
26206+                ret = AVERROR_INVALIDDATA;
26207+                goto err;
26208+            }
26209+            pps->beta_offset = 2 * beta_offset_div2;
26210+            pps->tc_offset   = 2 *   tc_offset_div2;
26211+        }
26212+    }
26213+
26214+    pps->scaling_list_data_present_flag = get_bits1(gb);
26215+    if (pps->scaling_list_data_present_flag) {
26216+        set_default_scaling_list_data(&pps->scaling_list);
26217+        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
26218+        if (ret < 0)
26219+            goto err;
26220+    }
26221+    pps->lists_modification_present_flag = get_bits1(gb);
26222+    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
26223+    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
26224+        av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n",
26225+               log2_parallel_merge_level_minus2);
26226+        ret = AVERROR_INVALIDDATA;
26227+        goto err;
26228+    }
26229+    pps->log2_parallel_merge_level       = log2_parallel_merge_level_minus2 + 2;
26230+
26231+    pps->slice_header_extension_present_flag = get_bits1(gb);
26232+
26233+    if (get_bits1(gb)) { // pps_extension_present_flag
26234+        int pps_range_extensions_flag = get_bits1(gb);
26235+        skip_bits(gb, 7); // pps_extension_7bits
26236+        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
26237+            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
26238+                goto err;
26239+        }
26240+    }
26241+
26242+    ret = setup_pps(avctx, pps, sps);
26243+    if (ret < 0)
26244+        goto err;
26245+
26246+    if (get_bits_left(gb) < 0) {
26247+        av_log(avctx, AV_LOG_ERROR,
26248+               "Overread PPS by %d bits\n", -get_bits_left(gb));
26249+        ret = AVERROR_INVALIDDATA;
26250+        goto err;
26251+    }
26252+
26253+    remove_pps(ps, pps_id);
26254+    ps->pps_list[pps_id] = pps_buf;
26255+
26256+    return 0;
26257+
26258+err:
26259+    av_buffer_unref(&pps_buf);
26260+    return ret;
26261+}
26262+
26263+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
26264+{
26265+    int max_poc_lsb  = 1 << sps->log2_max_poc_lsb;
26266+    int prev_poc_lsb = pocTid0 % max_poc_lsb;
26267+    int prev_poc_msb = pocTid0 - prev_poc_lsb;
26268+    int poc_msb;
26269+
26270+    if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
26271+        poc_msb = prev_poc_msb + max_poc_lsb;
26272+    else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
26273+        poc_msb = prev_poc_msb - max_poc_lsb;
26274+    else
26275+        poc_msb = prev_poc_msb;
26276+
26277+    // For BLA picture types, POCmsb is set to 0.
26278+    if (nal_unit_type == HEVC_NAL_BLA_W_LP   ||
26279+        nal_unit_type == HEVC_NAL_BLA_W_RADL ||
26280+        nal_unit_type == HEVC_NAL_BLA_N_LP)
26281+        poc_msb = 0;
26282+
26283+    return poc_msb + poc_lsb;
26284+}
26285--- /dev/null
26286+++ b/libavcodec/rpi_hevc_ps.h
26287@@ -0,0 +1,449 @@
26288+/*
26289+ * HEVC parameter set parsing
26290+ *
26291+ * This file is part of FFmpeg.
26292+ *
26293+ * FFmpeg is free software; you can redistribute it and/or
26294+ * modify it under the terms of the GNU Lesser General Public
26295+ * License as published by the Free Software Foundation; either
26296+ * version 2.1 of the License, or (at your option) any later version.
26297+ *
26298+ * FFmpeg is distributed in the hope that it will be useful,
26299+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26300+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26301+ * Lesser General Public License for more details.
26302+ *
26303+ * You should have received a copy of the GNU Lesser General Public
26304+ * License along with FFmpeg; if not, write to the Free Software
26305+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26306+ */
26307+
26308+#ifndef AVCODEC_RPI_HEVC_PS_H
26309+#define AVCODEC_RPI_HEVC_PS_H
26310+
26311+#include <stdint.h>
26312+
26313+#include "libavutil/buffer.h"
26314+#include "libavutil/pixfmt.h"
26315+#include "libavutil/rational.h"
26316+
26317+#include "avcodec.h"
26318+#include "get_bits.h"
26319+#include "hevc.h"
26320+
26321+typedef struct ShortTermRPS {
26322+    unsigned int num_negative_pics;
26323+    int num_delta_pocs;
26324+    int rps_idx_num_delta_pocs;
26325+    int32_t delta_poc[32];
26326+    uint8_t used[32];
26327+} ShortTermRPS;
26328+
26329+typedef struct LongTermRPS {
26330+    int     poc[32];
26331+    uint8_t used[32];
26332+    uint8_t nb_refs;
26333+} LongTermRPS;
26334+
26335+typedef struct RpiSliceHeader {
26336+    unsigned int pps_id;
26337+
26338+    ///< address (in raster order) of the first block in the current slice segment
26339+    unsigned int   slice_segment_addr;
26340+    ///< address (in raster order) of the first block in the current slice
26341+    unsigned int   slice_addr;
26342+
26343+    enum HEVCSliceType slice_type;
26344+
26345+    int pic_order_cnt_lsb;
26346+
26347+    uint8_t first_slice_in_pic_flag;
26348+    uint8_t dependent_slice_segment_flag;
26349+    uint8_t pic_output_flag;
26350+    uint8_t colour_plane_id;
26351+
26352+    ///< RPS coded in the slice header itself is stored here
26353+    int short_term_ref_pic_set_sps_flag;
26354+    int short_term_ref_pic_set_size;
26355+    ShortTermRPS slice_rps;
26356+    const ShortTermRPS *short_term_rps;
26357+    int long_term_ref_pic_set_size;
26358+    LongTermRPS long_term_rps;
26359+    unsigned int list_entry_lx[2][32];
26360+
26361+    uint8_t rpl_modification_flag[2];
26362+    uint8_t no_output_of_prior_pics_flag;
26363+    uint8_t slice_temporal_mvp_enabled_flag;
26364+
26365+    unsigned int nb_refs[2];
26366+
26367+    uint8_t slice_sample_adaptive_offset_flag[3];
26368+    uint8_t mvd_l1_zero_flag;
26369+
26370+    uint8_t cabac_init_flag;
26371+    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
26372+    uint8_t slice_loop_filter_across_slices_enabled_flag;
26373+    uint8_t collocated_list;
26374+
26375+    uint8_t no_dblk_boundary_flags;
26376+
26377+    unsigned int collocated_ref_idx;
26378+
26379+    int slice_qp_delta;
26380+    int slice_cb_qp_offset;  // -12, +12
26381+    int slice_cr_qp_offset;  // -12, +12
26382+
26383+    uint8_t cu_chroma_qp_offset_enabled_flag;
26384+
26385+    int beta_offset;    ///< beta_offset_div2 * 2
26386+    int tc_offset;      ///< tc_offset_div2 * 2
26387+
26388+    unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
26389+
26390+    unsigned *entry_point_offset;
26391+    int * offset;
26392+    int * size;
26393+    int num_entry_point_offsets;
26394+    int offsets_allocated;
26395+
26396+    uint8_t offload_wpp;
26397+    uint8_t offload_tiles;
26398+
26399+    int8_t slice_qp;
26400+
26401+    uint8_t luma_log2_weight_denom;
26402+    uint8_t chroma_log2_weight_denom;
26403+
26404+    int16_t luma_weight_l0[16];     // -128, +255
26405+    int16_t luma_offset_l0[16];
26406+    int16_t chroma_weight_l0[16][2];
26407+    int16_t chroma_offset_l0[16][2];
26408+
26409+    int16_t luma_weight_l1[16];
26410+    int16_t luma_offset_l1[16];
26411+    int16_t chroma_weight_l1[16][2];
26412+    int16_t chroma_offset_l1[16][2];
26413+
26414+} RpiSliceHeader;
26415+
26416+typedef struct HEVCRpiWindow {
26417+    uint16_t left_offset;
26418+    uint16_t right_offset;
26419+    uint16_t top_offset;
26420+    uint16_t bottom_offset;
26421+} HEVCRpiWindow;
26422+
26423+typedef struct VUI {
26424+    AVRational sar;
26425+
26426+    int overscan_info_present_flag;
26427+    int overscan_appropriate_flag;
26428+
26429+    int video_signal_type_present_flag;
26430+    int video_format;
26431+    int video_full_range_flag;
26432+    int colour_description_present_flag;
26433+    uint8_t colour_primaries;
26434+    uint8_t transfer_characteristic;
26435+    uint8_t matrix_coeffs;
26436+
26437+    int chroma_loc_info_present_flag;
26438+    int chroma_sample_loc_type_top_field;
26439+    int chroma_sample_loc_type_bottom_field;
26440+    int neutra_chroma_indication_flag;
26441+
26442+    int field_seq_flag;
26443+    int frame_field_info_present_flag;
26444+
26445+    int default_display_window_flag;
26446+    HEVCRpiWindow def_disp_win;
26447+
26448+    int vui_timing_info_present_flag;
26449+    uint32_t vui_num_units_in_tick;
26450+    uint32_t vui_time_scale;
26451+    int vui_poc_proportional_to_timing_flag;
26452+    int vui_num_ticks_poc_diff_one_minus1;
26453+    int vui_hrd_parameters_present_flag;
26454+
26455+    int bitstream_restriction_flag;
26456+    int tiles_fixed_structure_flag;
26457+    int motion_vectors_over_pic_boundaries_flag;
26458+    int restricted_ref_pic_lists_flag;
26459+    int min_spatial_segmentation_idc;
26460+    int max_bytes_per_pic_denom;
26461+    int max_bits_per_min_cu_denom;
26462+    int log2_max_mv_length_horizontal;
26463+    int log2_max_mv_length_vertical;
26464+} VUI;
26465+
26466+typedef struct PTLCommon {
26467+    uint8_t profile_space;
26468+    uint8_t tier_flag;
26469+    uint8_t profile_idc;
26470+    uint8_t profile_compatibility_flag[32];
26471+    uint8_t level_idc;
26472+    uint8_t progressive_source_flag;
26473+    uint8_t interlaced_source_flag;
26474+    uint8_t non_packed_constraint_flag;
26475+    uint8_t frame_only_constraint_flag;
26476+} PTLCommon;
26477+
26478+typedef struct PTL {
26479+    PTLCommon general_ptl;
26480+    PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
26481+
26482+    uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
26483+    uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
26484+} PTL;
26485+
26486+typedef struct HEVCRpiVPS {
26487+    uint8_t vps_temporal_id_nesting_flag;
26488+    int vps_max_layers;
26489+    int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
26490+
26491+    PTL ptl;
26492+    int vps_sub_layer_ordering_info_present_flag;
26493+    unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
26494+    unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
26495+    unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
26496+    int vps_max_layer_id;
26497+    int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
26498+    uint8_t vps_timing_info_present_flag;
26499+    uint32_t vps_num_units_in_tick;
26500+    uint32_t vps_time_scale;
26501+    uint8_t vps_poc_proportional_to_timing_flag;
26502+    int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
26503+    int vps_num_hrd_parameters;
26504+
26505+    uint8_t data[4096];
26506+    int data_size;
26507+} HEVCRpiVPS;
26508+
26509+typedef struct ScalingList {
26510+    /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
26511+     * and size ID 3 only has 2 arrays, not 6. */
26512+    uint8_t sl[4][6][64];
26513+    uint8_t sl_dc[2][6];
26514+} ScalingList;
26515+
26516+typedef struct HEVCRpiSPS {
26517+    unsigned vps_id;
26518+    uint8_t chroma_format_idc;
26519+    uint8_t separate_colour_plane_flag;
26520+
26521+    HEVCRpiWindow output_window;
26522+
26523+    HEVCRpiWindow pic_conf_win;
26524+
26525+    uint16_t wp_offset_half_range;  // WpOffsetHalfRange
26526+
26527+    uint8_t bit_depth;
26528+
26529+//    int bit_depth_chroma;  // We only support lum_bit_depth = chroma_bit_depth
26530+    uint8_t pixel_shift;
26531+    enum AVPixelFormat pix_fmt;
26532+
26533+    unsigned int log2_max_poc_lsb;
26534+
26535+    int max_sub_layers;
26536+    struct {
26537+        int max_dec_pic_buffering;
26538+        int num_reorder_pics;
26539+        int max_latency_increase;
26540+    } temporal_layer[HEVC_MAX_SUB_LAYERS];
26541+    uint8_t temporal_id_nesting_flag;
26542+
26543+    uint8_t scaling_list_enable_flag;
26544+    ScalingList scaling_list;
26545+
26546+    unsigned int nb_st_rps;
26547+    ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
26548+
26549+    uint8_t amp_enabled_flag;
26550+    uint8_t sao_enabled;
26551+
26552+    uint8_t long_term_ref_pics_present_flag;
26553+    uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
26554+    uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
26555+    uint8_t num_long_term_ref_pics_sps;
26556+
26557+    struct {
26558+        uint8_t bit_depth;
26559+        uint8_t bit_depth_chroma;
26560+        uint8_t log2_min_pcm_cb_size;
26561+        uint8_t log2_max_pcm_cb_size;
26562+        uint8_t loop_filter_disable_flag;
26563+    } pcm;
26564+    char sps_temporal_mvp_enabled_flag;
26565+//    char sps_strong_intra_smoothing_enable_flag;  -> intra_filters_disable
26566+
26567+    uint8_t log2_min_cb_size;  // 3..6
26568+    uint8_t log2_diff_max_min_coding_block_size;
26569+    uint8_t log2_min_tb_size;  // 2..5
26570+    uint8_t log2_max_trafo_size;
26571+    uint8_t log2_ctb_size;     // 4..6
26572+//    unsigned int log2_min_pu_size;  // 2..5 (min_cb_size - 1)
26573+#define LOG2_MIN_PU_SIZE 2
26574+#define LOG2_MIN_CU_SIZE 3
26575+
26576+    uint8_t max_transform_hierarchy_depth_inter;
26577+    uint8_t max_transform_hierarchy_depth_intra;
26578+
26579+    char transform_skip_rotation_enabled_flag;
26580+    char transform_skip_context_enabled_flag;
26581+    char implicit_rdpcm_enabled_flag;
26582+    char explicit_rdpcm_enabled_flag;
26583+//    char intra_smoothing_disabled_flag;  -> intra_filters_disable
26584+    char high_precision_offsets_enabled_flag;
26585+    char persistent_rice_adaptation_enabled_flag;
26586+
26587+    uint8_t intra_filters_disable;
26588+
26589+    ///< coded frame dimension in various units
26590+    int width;
26591+    int height;
26592+    int ctb_width;
26593+    int ctb_height;
26594+    int ctb_size;   // Pic size in CTBs not size of a CTB
26595+    int min_cb_width;
26596+    int min_cb_height;
26597+    int min_tb_width;
26598+    int min_tb_height;
26599+    int min_pu_width;
26600+    int min_pu_height;
26601+    int pcm_width;
26602+    int pcm_height;
26603+    int tb_mask;
26604+
26605+    int hshift[3];
26606+    int vshift[3];
26607+
26608+    int qp_bd_offset;
26609+
26610+    uint8_t data[4096];
26611+    int data_size;
26612+
26613+    VUI vui;
26614+    PTL ptl;
26615+} HEVCRpiSPS;
26616+
26617+#define CTB_TS_FLAGS_SOTL       (1U << 0)       // X start of tile line
26618+#define CTB_TS_FLAGS_EOTL       (1U << 1)       // Last CTB of a tile line
26619+#define CTB_TS_FLAGS_EOL        (1U << 2)       // Last CTB of a complete line
26620+#define CTB_TS_FLAGS_EOT        (1U << 3)       // Last CTB of a tile
26621+#define CTB_TS_FLAGS_CSAVE      (1U << 4)
26622+#define CTB_TS_FLAGS_CIREQ      (1U << 5)       // Cabac init request
26623+#define CTB_TS_FLAGS_TOT        (1U << 6)       // CTB on top row of a tile
26624+#define CTB_TS_FLAGS_CLOAD      (1U << 7)
26625+
26626+typedef struct HEVCRpiPPS {
26627+    unsigned int sps_id; ///< seq_parameter_set_id
26628+
26629+    uint8_t sign_data_hiding_flag;
26630+
26631+    uint8_t cabac_init_present_flag;
26632+
26633+    int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
26634+    int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
26635+    int pic_init_qp_minus26;
26636+
26637+    uint8_t constrained_intra_pred_flag;
26638+    uint8_t transform_skip_enabled_flag;
26639+
26640+    uint8_t cu_qp_delta_enabled_flag;
26641+    uint8_t log2_min_cu_qp_delta_size;
26642+    int cb_qp_offset;   // -12..12
26643+    int cr_qp_offset;   // -12..12
26644+    const uint8_t * qp_dblk_x[3];
26645+    const int8_t * qp_bd_x[3];
26646+
26647+    uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
26648+    uint8_t weighted_pred_flag;
26649+    uint8_t weighted_bipred_flag;
26650+    uint8_t output_flag_present_flag;
26651+    uint8_t transquant_bypass_enable_flag;
26652+
26653+    uint8_t dependent_slice_segments_enabled_flag;
26654+    uint8_t tiles_enabled_flag;
26655+    uint8_t entropy_coding_sync_enabled_flag;
26656+
26657+    uint8_t tile_wpp_inter_disable;
26658+    int num_tile_columns;   ///< num_tile_columns_minus1 + 1
26659+    int num_tile_rows;      ///< num_tile_rows_minus1 + 1
26660+    uint8_t uniform_spacing_flag;
26661+    uint8_t loop_filter_across_tiles_enabled_flag;
26662+
26663+    uint8_t seq_loop_filter_across_slices_enabled_flag;
26664+
26665+    uint8_t deblocking_filter_control_present_flag;
26666+    uint8_t deblocking_filter_override_enabled_flag;
26667+    uint8_t disable_dbf;
26668+    int beta_offset;    ///< beta_offset_div2 * 2
26669+    int tc_offset;      ///< tc_offset_div2 * 2
26670+
26671+    uint8_t scaling_list_data_present_flag;
26672+    ScalingList scaling_list;
26673+
26674+    uint8_t lists_modification_present_flag;
26675+    int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
26676+    int num_extra_slice_header_bits;
26677+    uint8_t slice_header_extension_present_flag;
26678+    uint8_t log2_max_transform_skip_block_size;
26679+    uint8_t cross_component_prediction_enabled_flag;
26680+    uint8_t chroma_qp_offset_list_enabled_flag;
26681+    uint8_t diff_cu_chroma_qp_offset_depth;
26682+    uint8_t chroma_qp_offset_list_len_minus1;
26683+    int8_t  cb_qp_offset_list[6];
26684+    int8_t  cr_qp_offset_list[6];
26685+    uint8_t log2_sao_offset_scale_luma;
26686+    uint8_t log2_sao_offset_scale_chroma;
26687+
26688+    // Inferred parameters
26689+    uint16_t *column_width;  ///< ColumnWidth
26690+    uint16_t *row_height;    ///< RowHeight
26691+    uint16_t *col_bd;        ///< ColBd
26692+    uint16_t *row_bd;        ///< RowBd
26693+    uint16_t *col_idxX;
26694+
26695+    // We can limit these to uint16_t given our other size limits
26696+    uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
26697+    uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
26698+    uint16_t *tile_id;           ///< TileId
26699+    uint16_t *tile_pos_ts;       ///< TilePosRS
26700+    uint16_t *tile_size;         ///< TileSize
26701+    uint8_t * ctb_ts_flags;
26702+
26703+    uint8_t data[4096];
26704+    int data_size;
26705+} HEVCRpiPPS;
26706+
26707+typedef struct HEVCRpiParamSets {
26708+    /* currently active parameter sets */
26709+    const HEVCRpiVPS *vps;
26710+    const HEVCRpiSPS *sps;
26711+    const HEVCRpiPPS *pps;
26712+
26713+    AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
26714+    AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
26715+    AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
26716+} HEVCRpiParamSets;
26717+
26718+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
26719+                           HEVCRpiParamSets *ps);
26720+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
26721+                           HEVCRpiParamSets *ps, int apply_defdispwin);
26722+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
26723+                           HEVCRpiParamSets *ps);
26724+
26725+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
26726+                                  ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
26727+
26728+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
26729+                           uint8_t *buf, int buf_size);
26730+
26731+/**
26732+ * Compute POC of the current frame and return it.
26733+ */
26734+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
26735+
26736+#endif /* AVCODEC_RPI_HEVC_PS_H */
26737--- /dev/null
26738+++ b/libavcodec/rpi_hevc_refs.c
26739@@ -0,0 +1,485 @@
26740+/*
26741+ * HEVC video decoder
26742+ *
26743+ * Copyright (C) 2012 - 2013 Guillaume Martres
26744+ * Copyright (C) 2012 - 2013 Gildas Cocherel
26745+ *
26746+ * This file is part of FFmpeg.
26747+ *
26748+ * FFmpeg is free software; you can redistribute it and/or
26749+ * modify it under the terms of the GNU Lesser General Public
26750+ * License as published by the Free Software Foundation; either
26751+ * version 2.1 of the License, or (at your option) any later version.
26752+ *
26753+ * FFmpeg is distributed in the hope that it will be useful,
26754+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26755+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26756+ * Lesser General Public License for more details.
26757+ *
26758+ * You should have received a copy of the GNU Lesser General Public
26759+ * License along with FFmpeg; if not, write to the Free Software
26760+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26761+ */
26762+
26763+#include "libavutil/avassert.h"
26764+#include "libavutil/pixdesc.h"
26765+#include "libavutil/rpi_sand_fns.h"
26766+#include "internal.h"
26767+#include "thread.h"
26768+#include "hevc.h"
26769+#include "rpi_hevcdec.h"
26770+
26771+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
26772+{
26773+    /* frame->frame can be NULL if context init failed */
26774+    if (!frame->frame || !frame->frame->buf[0])
26775+        return;
26776+
26777+    frame->flags &= ~flags;
26778+    if (!frame->flags) {
26779+        ff_thread_release_buffer(s->avctx, &frame->tf);
26780+
26781+        av_buffer_unref(&frame->col_mvf_buf);  // OK if already NULL
26782+        frame->col_mvf = NULL;
26783+
26784+        frame->collocated_ref = NULL;
26785+    }
26786+}
26787+
26788+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
26789+{
26790+    int i;
26791+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
26792+        ff_hevc_rpi_unref_frame(s, &s->DPB[i],
26793+                            HEVC_FRAME_FLAG_SHORT_REF |
26794+                            HEVC_FRAME_FLAG_LONG_REF);
26795+}
26796+
26797+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
26798+{
26799+    int i;
26800+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
26801+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
26802+}
26803+
26804+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
26805+{
26806+    int i, ret;
26807+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26808+        HEVCRpiFrame * const frame = &s->DPB[i];
26809+        if (frame->frame->buf[0])
26810+            continue;
26811+
26812+        ret = ff_thread_get_buffer(s->avctx, &frame->tf,
26813+                                   AV_GET_BUFFER_FLAG_REF);
26814+        if (ret < 0)
26815+            return NULL;
26816+
26817+        frame->col_mvf = NULL;
26818+        frame->col_mvf_buf = NULL;
26819+        if (s->used_for_ref && !s->is_irap)
26820+        {
26821+            frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
26822+            if (!frame->col_mvf_buf)
26823+                goto fail;
26824+            frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data;
26825+        }
26826+
26827+        frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
26828+        frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
26829+
26830+        return frame;
26831+
26832+fail:
26833+        ff_hevc_rpi_unref_frame(s, frame, ~0);
26834+        return NULL;
26835+    }
26836+    av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
26837+    return NULL;
26838+}
26839+
26840+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
26841+{
26842+    HEVCRpiFrame *ref;
26843+    int i;
26844+
26845+    /* check that this POC doesn't already exist */
26846+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26847+        HEVCRpiFrame *frame = &s->DPB[i];
26848+
26849+        if (frame->frame->buf[0] && frame->sequence == s->seq_decode &&
26850+            frame->poc == poc) {
26851+            av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
26852+                   poc);
26853+            return AVERROR_INVALIDDATA;
26854+        }
26855+    }
26856+
26857+    ref = alloc_frame(s);
26858+    if (!ref)
26859+        return AVERROR(ENOMEM);
26860+
26861+    *frame = ref->frame;
26862+    s->ref = ref;
26863+
26864+    if (s->sh.pic_output_flag)
26865+        ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF;
26866+    else
26867+        ref->flags = HEVC_FRAME_FLAG_SHORT_REF;
26868+
26869+    ref->poc      = poc;
26870+    ref->sequence = s->seq_decode;
26871+    ref->frame->crop_left   = s->ps.sps->output_window.left_offset;
26872+    ref->frame->crop_right  = s->ps.sps->output_window.right_offset;
26873+    ref->frame->crop_top    = s->ps.sps->output_window.top_offset;
26874+    ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset;
26875+
26876+    return 0;
26877+}
26878+
26879+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
26880+{
26881+    do {
26882+        int nb_output = 0;
26883+        int min_poc   = INT_MAX;
26884+        int i, min_idx, ret;
26885+
26886+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
26887+            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26888+                HEVCRpiFrame *frame = &s->DPB[i];
26889+                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
26890+                        frame->sequence == s->seq_output) {
26891+                    ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
26892+                }
26893+            }
26894+        }
26895+
26896+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26897+            HEVCRpiFrame *frame = &s->DPB[i];
26898+            if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
26899+                frame->sequence == s->seq_output) {
26900+                nb_output++;
26901+                if (frame->poc < min_poc || nb_output == 1) {
26902+                    min_poc = frame->poc;
26903+                    min_idx = i;
26904+                }
26905+            }
26906+        }
26907+
26908+        /* wait for more frames before output */
26909+        if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
26910+            nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
26911+            return 0;
26912+
26913+        if (nb_output) {
26914+            HEVCRpiFrame *frame = &s->DPB[min_idx];
26915+            if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
26916+                return 0;
26917+
26918+            ret = av_frame_ref(out, frame->frame);
26919+            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
26920+                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
26921+            else
26922+                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
26923+            if (ret < 0)
26924+                return ret;
26925+            av_log(s->avctx, AV_LOG_DEBUG,
26926+                   "Output frame with POC %d.\n", frame->poc);
26927+            return 1;
26928+        }
26929+
26930+        if (s->seq_output != s->seq_decode)
26931+            s->seq_output = (s->seq_output + 1) & 0xff;
26932+        else
26933+            break;
26934+    } while (1);
26935+
26936+    return 0;
26937+}
26938+
26939+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
26940+{
26941+    int dpb = 0;
26942+    int min_poc = INT_MAX;
26943+    int i;
26944+
26945+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26946+        HEVCRpiFrame *frame = &s->DPB[i];
26947+        if ((frame->flags) &&
26948+            frame->sequence == s->seq_output &&
26949+            frame->poc != s->poc) {
26950+            dpb++;
26951+        }
26952+    }
26953+
26954+    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
26955+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26956+            HEVCRpiFrame *frame = &s->DPB[i];
26957+            if ((frame->flags) &&
26958+                frame->sequence == s->seq_output &&
26959+                frame->poc != s->poc) {
26960+                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
26961+                    min_poc = frame->poc;
26962+                }
26963+            }
26964+        }
26965+
26966+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
26967+            HEVCRpiFrame *frame = &s->DPB[i];
26968+            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
26969+                frame->sequence == s->seq_output &&
26970+                frame->poc <= min_poc) {
26971+                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
26972+            }
26973+        }
26974+
26975+        dpb--;
26976+    }
26977+}
26978+
26979+static int init_slice_rpl(HEVCRpiContext *s)
26980+{
26981+    if (s->slice_idx >= s->rpl_tab_size)
26982+        return AVERROR_INVALIDDATA;
26983+
26984+    s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0;
26985+    return 0;
26986+}
26987+
26988+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
26989+{
26990+    RpiSliceHeader *sh = &s->sh;
26991+
26992+    uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
26993+    uint8_t list_idx;
26994+    int i, j, ret;
26995+
26996+    ret = init_slice_rpl(s);
26997+    if (ret < 0)
26998+        return ret;
26999+
27000+    if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
27001+          s->rps[LT_CURR].nb_refs)) {
27002+        av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
27003+        return AVERROR_INVALIDDATA;
27004+    }
27005+
27006+    for (list_idx = 0; list_idx < nb_list; list_idx++) {
27007+        RefPicList  rpl_tmp = { { 0 } };
27008+        RefPicList *rpl     = &s->refPicList[list_idx];
27009+
27010+        /* The order of the elements is
27011+         * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
27012+         * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
27013+        int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
27014+                              list_idx ? ST_CURR_BEF : ST_CURR_AFT,
27015+                              LT_CURR };
27016+
27017+        /* concatenate the candidate lists for the current frame */
27018+        while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
27019+            for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
27020+                RefPicList *rps = &s->rps[cand_lists[i]];
27021+                for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
27022+                    rpl_tmp.list[rpl_tmp.nb_refs]       = rps->list[j];
27023+                    rpl_tmp.ref[rpl_tmp.nb_refs]        = rps->ref[j];
27024+                    rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
27025+                    rpl_tmp.nb_refs++;
27026+                }
27027+            }
27028+        }
27029+
27030+        /* reorder the references if necessary */
27031+        if (sh->rpl_modification_flag[list_idx]) {
27032+            for (i = 0; i < sh->nb_refs[list_idx]; i++) {
27033+                int idx = sh->list_entry_lx[list_idx][i];
27034+
27035+                if (idx >= rpl_tmp.nb_refs) {
27036+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
27037+                    return AVERROR_INVALIDDATA;
27038+                }
27039+
27040+                rpl->list[i]       = rpl_tmp.list[idx];
27041+                rpl->ref[i]        = rpl_tmp.ref[idx];
27042+                rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
27043+                rpl->nb_refs++;
27044+            }
27045+        } else {
27046+            memcpy(rpl, &rpl_tmp, sizeof(*rpl));
27047+            rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
27048+        }
27049+
27050+        if (sh->collocated_list == list_idx &&
27051+            sh->collocated_ref_idx < rpl->nb_refs)
27052+            s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
27053+    }
27054+
27055+    return 0;
27056+}
27057+
27058+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
27059+{
27060+    int i;
27061+    int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
27062+
27063+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
27064+        HEVCRpiFrame *ref = &s->DPB[i];
27065+        if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
27066+            if ((ref->poc & LtMask) == poc)
27067+                return ref;
27068+        }
27069+    }
27070+
27071+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
27072+        HEVCRpiFrame *ref = &s->DPB[i];
27073+        if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
27074+            if (ref->poc == poc || (ref->poc & LtMask) == poc)
27075+                return ref;
27076+        }
27077+    }
27078+
27079+    if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
27080+        av_log(s->avctx, AV_LOG_ERROR,
27081+               "Could not find ref with POC %d\n", poc);
27082+    return NULL;
27083+}
27084+
27085+static void mark_ref(HEVCRpiFrame *frame, int flag)
27086+{
27087+    frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF);
27088+    frame->flags |= flag;
27089+}
27090+
27091+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
27092+{
27093+    HEVCRpiFrame *frame;
27094+    int i, x, y;
27095+
27096+    frame = alloc_frame(s);
27097+    if (!frame)
27098+        return NULL;
27099+
27100+    if (!s->ps.sps->pixel_shift) {
27101+        for (i = 0; frame->frame->buf[i]; i++)
27102+            memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
27103+                   frame->frame->buf[i]->size);
27104+    } else {
27105+        for (i = 0; frame->frame->data[i]; i++)
27106+            for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
27107+                for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
27108+                    AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
27109+                            1 << (s->ps.sps->bit_depth - 1));
27110+                }
27111+    }
27112+
27113+    frame->poc      = poc;
27114+    frame->sequence = s->seq_decode;
27115+    frame->flags    = 0;
27116+
27117+    ff_hevc_rpi_progress_set_all_done(frame);
27118+
27119+    return frame;
27120+}
27121+
27122+/* add a reference with the given poc to the list and mark it as used in DPB */
27123+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
27124+                             int poc, int ref_flag)
27125+{
27126+    HEVCRpiFrame *ref = find_ref_idx(s, poc);
27127+
27128+    if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
27129+        return AVERROR_INVALIDDATA;
27130+
27131+    if (!ref) {
27132+        ref = generate_missing_ref(s, poc);
27133+        if (!ref)
27134+            return AVERROR(ENOMEM);
27135+    }
27136+
27137+    list->list[list->nb_refs] = ref->poc;
27138+    list->ref[list->nb_refs]  = ref;
27139+    list->nb_refs++;
27140+
27141+    mark_ref(ref, ref_flag);
27142+    return 0;
27143+}
27144+
27145+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
27146+{
27147+    const ShortTermRPS *short_rps = s->sh.short_term_rps;
27148+    const LongTermRPS  *long_rps  = &s->sh.long_term_rps;
27149+    RefPicList               *rps = s->rps;
27150+    int i, ret = 0;
27151+
27152+    if (!short_rps) {
27153+        rps[0].nb_refs = rps[1].nb_refs = 0;
27154+        return 0;
27155+    }
27156+
27157+    /* clear the reference flags on all frames except the current one */
27158+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
27159+        HEVCRpiFrame *frame = &s->DPB[i];
27160+
27161+        if (frame == s->ref)
27162+            continue;
27163+
27164+        mark_ref(frame, 0);
27165+    }
27166+
27167+    for (i = 0; i < NB_RPS_TYPE; i++)
27168+        rps[i].nb_refs = 0;
27169+
27170+    /* add the short refs */
27171+    for (i = 0; i < short_rps->num_delta_pocs; i++) {
27172+        int poc = s->poc + short_rps->delta_poc[i];
27173+        int list;
27174+
27175+        if (!short_rps->used[i])
27176+            list = ST_FOLL;
27177+        else if (i < short_rps->num_negative_pics)
27178+            list = ST_CURR_BEF;
27179+        else
27180+            list = ST_CURR_AFT;
27181+
27182+        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
27183+        if (ret < 0)
27184+            goto fail;
27185+    }
27186+
27187+    /* add the long refs */
27188+    for (i = 0; i < long_rps->nb_refs; i++) {
27189+        int poc  = long_rps->poc[i];
27190+        int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
27191+
27192+        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
27193+        if (ret < 0)
27194+            goto fail;
27195+    }
27196+
27197+fail:
27198+    /* release any frames that are now unused */
27199+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
27200+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
27201+
27202+    return ret;
27203+}
27204+
27205+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
27206+{
27207+    int ret = 0;
27208+    int i;
27209+    const ShortTermRPS *rps = s->sh.short_term_rps;
27210+    LongTermRPS *long_rps   = &s->sh.long_term_rps;
27211+
27212+    if (rps) {
27213+        for (i = 0; i < rps->num_negative_pics; i++)
27214+            ret += !!rps->used[i];
27215+        for (; i < rps->num_delta_pocs; i++)
27216+            ret += !!rps->used[i];
27217+    }
27218+
27219+    if (long_rps) {
27220+        for (i = 0; i < long_rps->nb_refs; i++)
27221+            ret += !!long_rps->used[i];
27222+    }
27223+    return ret;
27224+}
27225--- /dev/null
27226+++ b/libavcodec/rpi_hevc_sei.c
27227@@ -0,0 +1,368 @@
27228+/*
27229+ * HEVC Supplementary Enhancement Information messages
27230+ *
27231+ * Copyright (C) 2012 - 2013 Guillaume Martres
27232+ * Copyright (C) 2012 - 2013 Gildas Cocherel
27233+ * Copyright (C) 2013 Vittorio Giovara
27234+ *
27235+ * This file is part of FFmpeg.
27236+ *
27237+ * FFmpeg is free software; you can redistribute it and/or
27238+ * modify it under the terms of the GNU Lesser General Public
27239+ * License as published by the Free Software Foundation; either
27240+ * version 2.1 of the License, or (at your option) any later version.
27241+ *
27242+ * FFmpeg is distributed in the hope that it will be useful,
27243+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
27244+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27245+ * Lesser General Public License for more details.
27246+ *
27247+ * You should have received a copy of the GNU Lesser General Public
27248+ * License along with FFmpeg; if not, write to the Free Software
27249+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27250+ */
27251+
27252+#include "golomb.h"
27253+#include "rpi_hevc_ps.h"
27254+#include "rpi_hevc_sei.h"
27255+
27256+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
27257+{
27258+    int cIdx, i;
27259+    uint8_t hash_type;
27260+    //uint16_t picture_crc;
27261+    //uint32_t picture_checksum;
27262+    hash_type = get_bits(gb, 8);
27263+
27264+    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
27265+        if (hash_type == 0) {
27266+            s->is_md5 = 1;
27267+            for (i = 0; i < 16; i++)
27268+                s->md5[cIdx][i] = get_bits(gb, 8);
27269+        } else if (hash_type == 1) {
27270+            // picture_crc = get_bits(gb, 16);
27271+            skip_bits(gb, 16);
27272+        } else if (hash_type == 2) {
27273+            // picture_checksum = get_bits_long(gb, 32);
27274+            skip_bits(gb, 32);
27275+        }
27276+    }
27277+    return 0;
27278+}
27279+
27280+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
27281+{
27282+    int i;
27283+    // Mastering primaries
27284+    for (i = 0; i < 3; i++) {
27285+        s->display_primaries[i][0] = get_bits(gb, 16);
27286+        s->display_primaries[i][1] = get_bits(gb, 16);
27287+    }
27288+    // White point (x, y)
27289+    s->white_point[0] = get_bits(gb, 16);
27290+    s->white_point[1] = get_bits(gb, 16);
27291+
27292+    // Max and min luminance of mastering display
27293+    s->max_luminance = get_bits_long(gb, 32);
27294+    s->min_luminance = get_bits_long(gb, 32);
27295+
27296+    // As this SEI message comes before the first frame that references it,
27297+    // initialize the flag to 2 and decrement on IRAP access unit so it
27298+    // persists for the coded video sequence (e.g., between two IRAPs)
27299+    s->present = 2;
27300+    return 0;
27301+}
27302+
27303+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
27304+{
27305+    // Max and average light levels
27306+    s->max_content_light_level     = get_bits_long(gb, 16);
27307+    s->max_pic_average_light_level = get_bits_long(gb, 16);
27308+    // As this SEI message comes before the first frame that references it,
27309+    // initialize the flag to 2 and decrement on IRAP access unit so it
27310+    // persists for the coded video sequence (e.g., between two IRAPs)
27311+    s->present = 2;
27312+    return  0;
27313+}
27314+
27315+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
27316+{
27317+    get_ue_golomb_long(gb);             // frame_packing_arrangement_id
27318+    s->present = !get_bits1(gb);
27319+
27320+    if (s->present) {
27321+        s->arrangement_type               = get_bits(gb, 7);
27322+        s->quincunx_subsampling           = get_bits1(gb);
27323+        s->content_interpretation_type    = get_bits(gb, 6);
27324+
27325+        // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
27326+        skip_bits(gb, 3);
27327+        s->current_frame_is_frame0_flag = get_bits1(gb);
27328+        // frame0_self_contained_flag, frame1_self_contained_flag
27329+        skip_bits(gb, 2);
27330+
27331+        if (!s->quincunx_subsampling && s->arrangement_type != 5)
27332+            skip_bits(gb, 16);  // frame[01]_grid_position_[xy]
27333+        skip_bits(gb, 8);       // frame_packing_arrangement_reserved_byte
27334+        skip_bits1(gb);         // frame_packing_arrangement_persistence_flag
27335+    }
27336+    skip_bits1(gb);             // upsampled_aspect_ratio_flag
27337+    return 0;
27338+}
27339+
27340+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
27341+{
27342+    s->present = !get_bits1(gb);
27343+
27344+    if (s->present) {
27345+        s->hflip = get_bits1(gb);     // hor_flip
27346+        s->vflip = get_bits1(gb);     // ver_flip
27347+
27348+        s->anticlockwise_rotation = get_bits(gb, 16);
27349+        skip_bits1(gb);     // display_orientation_persistence_flag
27350+    }
27351+
27352+    return 0;
27353+}
27354+
27355+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
27356+                                     void *logctx, int size)
27357+{
27358+    HEVCSEIPictureTiming *h = &s->picture_timing;
27359+    HEVCRpiSPS *sps;
27360+
27361+    if (!ps->sps_list[s->active_seq_parameter_set_id])
27362+        return(AVERROR(ENOMEM));
27363+    sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
27364+
27365+    if (sps->vui.frame_field_info_present_flag) {
27366+        int pic_struct = get_bits(gb, 4);
27367+        h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
27368+        if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
27369+            av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
27370+            h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
27371+        } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
27372+            av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
27373+            h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
27374+        }
27375+        get_bits(gb, 2);                   // source_scan_type
27376+        get_bits(gb, 1);                   // duplicate_flag
27377+        skip_bits1(gb);
27378+        size--;
27379+    }
27380+    skip_bits_long(gb, 8 * size);
27381+
27382+    return 0;
27383+}
27384+
27385+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
27386+                                                      int size)
27387+{
27388+    int flag;
27389+    int user_data_type_code;
27390+    int cc_count;
27391+
27392+    if (size < 3)
27393+       return AVERROR(EINVAL);
27394+
27395+    user_data_type_code = get_bits(gb, 8);
27396+    if (user_data_type_code == 0x3) {
27397+        skip_bits(gb, 1); // reserved
27398+
27399+        flag = get_bits(gb, 1); // process_cc_data_flag
27400+        if (flag) {
27401+            skip_bits(gb, 1);
27402+            cc_count = get_bits(gb, 5);
27403+            skip_bits(gb, 8); // reserved
27404+            size -= 2;
27405+
27406+            if (cc_count && size >= cc_count * 3) {
27407+                const uint64_t new_size = (s->a53_caption_size + cc_count
27408+                                           * UINT64_C(3));
27409+                int i, ret;
27410+
27411+                if (new_size > INT_MAX)
27412+                    return AVERROR(EINVAL);
27413+
27414+                /* Allow merging of the cc data from two fields. */
27415+                ret = av_reallocp(&s->a53_caption, new_size);
27416+                if (ret < 0)
27417+                    return ret;
27418+
27419+                for (i = 0; i < cc_count; i++) {
27420+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
27421+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
27422+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
27423+                }
27424+                skip_bits(gb, 8); // marker_bits
27425+            }
27426+        }
27427+    } else {
27428+        int i;
27429+        for (i = 0; i < size - 1; i++)
27430+            skip_bits(gb, 8);
27431+    }
27432+
27433+    return 0;
27434+}
27435+
27436+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
27437+                                                         int size)
27438+{
27439+    uint32_t country_code;
27440+    uint32_t user_identifier;
27441+
27442+    if (size < 7)
27443+        return AVERROR(EINVAL);
27444+    size -= 7;
27445+
27446+    country_code = get_bits(gb, 8);
27447+    if (country_code == 0xFF) {
27448+        skip_bits(gb, 8);
27449+        size--;
27450+    }
27451+
27452+    skip_bits(gb, 8);
27453+    skip_bits(gb, 8);
27454+
27455+    user_identifier = get_bits_long(gb, 32);
27456+
27457+    switch (user_identifier) {
27458+        case MKBETAG('G', 'A', '9', '4'):
27459+            return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
27460+        default:
27461+            skip_bits_long(gb, size * 8);
27462+            break;
27463+    }
27464+    return 0;
27465+}
27466+
27467+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
27468+{
27469+    int num_sps_ids_minus1;
27470+    int i;
27471+    unsigned active_seq_parameter_set_id;
27472+
27473+    get_bits(gb, 4); // active_video_parameter_set_id
27474+    get_bits(gb, 1); // self_contained_cvs_flag
27475+    get_bits(gb, 1); // no_parameter_set_update_flag
27476+    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
27477+
27478+    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
27479+        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
27480+        return AVERROR_INVALIDDATA;
27481+    }
27482+
27483+    active_seq_parameter_set_id = get_ue_golomb_long(gb);
27484+    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
27485+        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
27486+        return AVERROR_INVALIDDATA;
27487+    }
27488+    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
27489+
27490+    for (i = 1; i <= num_sps_ids_minus1; i++)
27491+        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
27492+
27493+    return 0;
27494+}
27495+
27496+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
27497+{
27498+    s->present = 1;
27499+    s->preferred_transfer_characteristics = get_bits(gb, 8);
27500+    return 0;
27501+}
27502+
27503+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
27504+                                 int type, int size)
27505+{
27506+    switch (type) {
27507+    case 256:  // Mismatched value from HM 8.1
27508+        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
27509+    case HEVC_SEI_TYPE_FRAME_PACKING:
27510+        return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
27511+    case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
27512+        return decode_nal_sei_display_orientation(&s->display_orientation, gb);
27513+    case HEVC_SEI_TYPE_PICTURE_TIMING:
27514+        return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
27515+    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
27516+        return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
27517+    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
27518+        return decode_nal_sei_content_light_info(&s->content_light, gb);
27519+    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
27520+        return decode_nal_sei_active_parameter_sets(s, gb, logctx);
27521+    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
27522+        return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
27523+    case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
27524+        return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
27525+    default:
27526+        av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
27527+        skip_bits_long(gb, 8 * size);
27528+        return 0;
27529+    }
27530+}
27531+
27532+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
27533+                                 int type, int size)
27534+{
27535+    switch (type) {
27536+    case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
27537+        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
27538+    default:
27539+        av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
27540+        skip_bits_long(gb, 8 * size);
27541+        return 0;
27542+    }
27543+}
27544+
27545+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
27546+                                  const HEVCRpiParamSets * const ps, const int nal_unit_type)
27547+{
27548+    int payload_type = 0;
27549+    int payload_size = 0;
27550+    int byte = 0xFF;
27551+    av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
27552+
27553+    while (byte == 0xFF) {
27554+       if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
27555+           return AVERROR_INVALIDDATA;
27556+        byte          = get_bits(gb, 8);
27557+        payload_type += byte;
27558+    }
27559+    byte = 0xFF;
27560+    while (byte == 0xFF) {
27561+        if (get_bits_left(gb) < 8 + 8LL*payload_size)
27562+            return AVERROR_INVALIDDATA;
27563+         byte          = get_bits(gb, 8);
27564+        payload_size += byte;
27565+    }
27566+    if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
27567+        return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
27568+    } else { /* nal_unit_type == NAL_SEI_SUFFIX */
27569+        return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
27570+    }
27571+}
27572+
27573+static int more_rbsp_data(GetBitContext *gb)
27574+{
27575+    return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80;
27576+}
27577+
27578+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
27579+                           const HEVCRpiParamSets *ps, int type)
27580+{
27581+    int ret;
27582+
27583+    do {
27584+        ret = decode_nal_sei_message(gb, logctx, s, ps, type);
27585+        if (ret < 0)
27586+            return ret;
27587+    } while (more_rbsp_data(gb));
27588+    return 1;
27589+}
27590+
27591+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
27592+{
27593+    s->a53_caption.a53_caption_size = 0;
27594+    av_freep(&s->a53_caption.a53_caption);
27595+}
27596--- /dev/null
27597+++ b/libavcodec/rpi_hevc_sei.h
27598@@ -0,0 +1,135 @@
27599+/*
27600+ * HEVC Supplementary Enhancement Information messages
27601+ *
27602+ * This file is part of FFmpeg.
27603+ *
27604+ * FFmpeg is free software; you can redistribute it and/or
27605+ * modify it under the terms of the GNU Lesser General Public
27606+ * License as published by the Free Software Foundation; either
27607+ * version 2.1 of the License, or (at your option) any later version.
27608+ *
27609+ * FFmpeg is distributed in the hope that it will be useful,
27610+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
27611+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27612+ * Lesser General Public License for more details.
27613+ *
27614+ * You should have received a copy of the GNU Lesser General Public
27615+ * License along with FFmpeg; if not, write to the Free Software
27616+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27617+ */
27618+
27619+#ifndef AVCODEC_RPI_HEVC_SEI_H
27620+#define AVCODEC_RPI_HEVC_SEI_H
27621+
27622+#include <stdint.h>
27623+
27624+#include "libavutil/md5.h"
27625+
27626+#include "get_bits.h"
27627+
27628+/**
27629+ * SEI message types
27630+ */
27631+typedef enum {
27632+    HEVC_SEI_TYPE_BUFFERING_PERIOD                     = 0,
27633+    HEVC_SEI_TYPE_PICTURE_TIMING                       = 1,
27634+    HEVC_SEI_TYPE_PAN_SCAN_RECT                        = 2,
27635+    HEVC_SEI_TYPE_FILLER_PAYLOAD                       = 3,
27636+    HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35       = 4,
27637+    HEVC_SEI_TYPE_USER_DATA_UNREGISTERED               = 5,
27638+    HEVC_SEI_TYPE_RECOVERY_POINT                       = 6,
27639+    HEVC_SEI_TYPE_SCENE_INFO                           = 9,
27640+    HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT                  = 15,
27641+    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
27642+    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
27643+    HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS           = 19,
27644+    HEVC_SEI_TYPE_POST_FILTER_HINT                     = 22,
27645+    HEVC_SEI_TYPE_TONE_MAPPING_INFO                    = 23,
27646+    HEVC_SEI_TYPE_FRAME_PACKING                        = 45,
27647+    HEVC_SEI_TYPE_DISPLAY_ORIENTATION                  = 47,
27648+    HEVC_SEI_TYPE_SOP_DESCRIPTION                      = 128,
27649+    HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS                = 129,
27650+    HEVC_SEI_TYPE_DECODING_UNIT_INFO                   = 130,
27651+    HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX                = 131,
27652+    HEVC_SEI_TYPE_DECODED_PICTURE_HASH                 = 132,
27653+    HEVC_SEI_TYPE_SCALABLE_NESTING                     = 133,
27654+    HEVC_SEI_TYPE_REGION_REFRESH_INFO                  = 134,
27655+    HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO               = 137,
27656+    HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO             = 144,
27657+    HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
27658+} HEVC_SEI_Type;
27659+
27660+typedef struct HEVCSEIPictureHash {
27661+    uint8_t       md5[3][16];
27662+    uint8_t is_md5;
27663+} HEVCSEIPictureHash;
27664+
27665+typedef struct HEVCSEIFramePacking {
27666+    int present;
27667+    int arrangement_type;
27668+    int content_interpretation_type;
27669+    int quincunx_subsampling;
27670+    int current_frame_is_frame0_flag;
27671+} HEVCSEIFramePacking;
27672+
27673+typedef struct HEVCSEIDisplayOrientation {
27674+    int present;
27675+    int anticlockwise_rotation;
27676+    int hflip, vflip;
27677+} HEVCSEIDisplayOrientation;
27678+
27679+typedef struct HEVCSEIPictureTiming {
27680+    int picture_struct;
27681+} HEVCSEIPictureTiming;
27682+
27683+typedef struct HEVCSEIA53Caption {
27684+    int a53_caption_size;
27685+    uint8_t *a53_caption;
27686+} HEVCSEIA53Caption;
27687+
27688+typedef struct HEVCSEIMasteringDisplay {
27689+    int present;
27690+    uint16_t display_primaries[3][2];
27691+    uint16_t white_point[2];
27692+    uint32_t max_luminance;
27693+    uint32_t min_luminance;
27694+} HEVCSEIMasteringDisplay;
27695+
27696+typedef struct HEVCSEIContentLight {
27697+    int present;
27698+    uint16_t max_content_light_level;
27699+    uint16_t max_pic_average_light_level;
27700+} HEVCSEIContentLight;
27701+
27702+typedef struct HEVCSEIAlternativeTransfer {
27703+    int present;
27704+    int preferred_transfer_characteristics;
27705+} HEVCSEIAlternativeTransfer;
27706+
27707+typedef struct HEVCSEIContext {
27708+    HEVCSEIPictureHash picture_hash;
27709+    HEVCSEIFramePacking frame_packing;
27710+    HEVCSEIDisplayOrientation display_orientation;
27711+    HEVCSEIPictureTiming picture_timing;
27712+    HEVCSEIA53Caption a53_caption;
27713+    HEVCSEIMasteringDisplay mastering_display;
27714+    HEVCSEIContentLight content_light;
27715+    int active_seq_parameter_set_id;
27716+    HEVCSEIAlternativeTransfer alternative_transfer;
27717+} HEVCSEIContext;
27718+
27719+struct HEVCRpiParamSets;
27720+
27721+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
27722+                           const struct HEVCRpiParamSets *ps, int type);
27723+
27724+/**
27725+ * Reset SEI values that are stored on the Context.
27726+ * e.g. Caption data that was extracted during NAL
27727+ * parsing.
27728+ *
27729+ * @param s HEVCRpiContext.
27730+ */
27731+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
27732+
27733+#endif /* AVCODEC_RPI_HEVC_SEI_H */
27734--- /dev/null
27735+++ b/libavcodec/rpi_hevc_shader.c
27736@@ -0,0 +1,1537 @@
27737+#include "rpi_hevc_shader.h"
27738+
27739+#ifdef _MSC_VER
27740+   #include <stdint.h>
27741+   /* cast through uintptr_t to avoid warnings */
27742+   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
27743+#else
27744+   #define POINTER_TO_UINT(X) ((unsigned int)(X))
27745+#endif
27746+
27747+#ifdef __cplusplus
27748+extern "C" { /* the types are probably wrong... */
27749+#endif
27750+#ifdef __cplusplus
27751+}
27752+#endif
27753+
27754+#ifdef _MSC_VER
27755+__declspec(align(8))
27756+#elif defined(__GNUC__)
27757+__attribute__((aligned(8)))
27758+#endif
27759+unsigned int ff_hevc_rpi_shader[] = {
27760+// ::mc_setup_c_q0
27761+// ::mc_start
27762+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
27763+// ::mc_setup_c_qn
27764+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
27765+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
27766+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30      ; mov ra_base, unif
27767+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
27768+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
27769+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
27770+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
27771+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
27772+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
27773+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
27774+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
27775+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
27776+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
27777+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
27778+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
27779+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
27780+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop                           ; mul24 r0, r0, 5
27781+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
27782+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
27783+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
27784+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0                 ; mov ra_y, ra0.16a
27785+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
27786+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
27787+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
27788+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
27789+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
27790+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
27791+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1                ; mov ra0, unif
27792+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
27793+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
27794+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
27795+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
27796+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
27797+/* [0x00000110] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
27798+/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
27799+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
27800+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
27801+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
27802+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
27803+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
27804+/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a
27805+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0                 ; mov rb_base2, unif
27806+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
27807+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
27808+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
27809+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
27810+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1                ; mov r3, PREREAD
27811+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
27812+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1                ; mov r2, ra_y2
27813+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0    ; mov r0, ra_y
27814+// :1
27815+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
27816+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
27817+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
27818+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
27819+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
27820+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
27821+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
27822+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
27823+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
27824+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
27825+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
27826+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0                    ; mov rb4, 0
27827+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
27828+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0                    ; mov rb5, 0
27829+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0                    ; mov rb6, 0
27830+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0                    ; mov rb7, 0
27831+// ::mc_filter_c_p
27832+/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
27833+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
27834+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
27835+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
27836+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
27837+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
27838+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
27839+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
27840+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
27841+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
27842+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
27843+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
27844+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
27845+/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
27846+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
27847+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
27848+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
27849+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
27850+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
27851+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
27852+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
27853+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
27854+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
27855+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
27856+// :1
27857+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
27858+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
27859+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
27860+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
27861+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
27862+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
27863+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
27864+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
27865+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask
27866+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
27867+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
27868+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
27869+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
27870+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
27871+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
27872+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
27873+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
27874+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
27875+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
27876+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
27877+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
27878+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
27879+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
27880+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
27881+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
27882+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
27883+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
27884+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
27885+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
27886+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
27887+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
27888+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
27889+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
27890+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
27891+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
27892+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
27893+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
27894+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
27895+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
27896+// ::mc_filter_c_p_l1
27897+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
27898+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
27899+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
27900+/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
27901+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
27902+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
27903+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
27904+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
27905+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
27906+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
27907+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
27908+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
27909+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
27910+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
27911+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
27912+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
27913+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
27914+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
27915+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
27916+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
27917+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
27918+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
27919+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
27920+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
27921+// :1
27922+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
27923+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
27924+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
27925+/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next
27926+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
27927+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
27928+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
27929+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
27930+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax
27931+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
27932+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
27933+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
27934+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
27935+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
27936+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
27937+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
27938+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
27939+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
27940+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
27941+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
27942+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
27943+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
27944+/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
27945+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
27946+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
27947+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
27948+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
27949+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
27950+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
27951+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
27952+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
27953+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
27954+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
27955+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
27956+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
27957+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
27958+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
27959+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
27960+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
27961+// ::mc_filter_c_b
27962+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
27963+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
27964+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1
27965+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
27966+/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch          ; mov ra_width_height, unif
27967+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
27968+/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x          ; mov ra0, unif
27969+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
27970+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4                ; mov ra2, unif
27971+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
27972+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
27973+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1                ; mov r1, ra_height
27974+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next
27975+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
27976+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
27977+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
27978+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift     ; mov ra3, unif
27979+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2                ; mov r3, unif
27980+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a
27981+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
27982+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift         ; mov ra1, unif
27983+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x         ; mov ra3, unif
27984+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif
27985+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5                ; mov ra9, rb_max_y
27986+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
27987+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
27988+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif
27989+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1                ; mov r5rep, -4
27990+/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
27991+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
27992+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
27993+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
27994+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
27995+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1                ; mov r1, ra_wt_off_l1
27996+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
27997+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0         ; mov ra_link, unif
27998+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
27999+// :1
28000+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
28001+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
28002+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
28003+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next
28004+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y             ; mov r3, ra_y
28005+/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0             ; mov      r0, r1 << 15
28006+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
28007+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
28008+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask
28009+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
28010+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
28011+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
28012+/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
28013+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
28014+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
28015+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
28016+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10              ; mov rb5, rb6
28017+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift         ; mov r3, ra_y2
28018+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7
28019+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
28020+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
28021+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
28022+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
28023+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax
28024+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
28025+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
28026+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
28027+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
28028+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
28029+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
28030+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
28031+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
28032+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
28033+/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
28034+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b
28035+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount     ; mov r0, ra4
28036+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
28037+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra7,  rb7
28038+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c
28039+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0                ; mul24 r0, ra11, rb11
28040+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
28041+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
28042+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
28043+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
28044+/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
28045+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2                ; mov r3, ra_blk_height
28046+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
28047+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
28048+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
28049+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28050+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28051+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28052+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28053+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28054+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28055+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28056+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
28057+/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28058+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28059+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28060+// ::mc_sync_q0
28061+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28062+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28063+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
28064+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
28065+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
28066+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28067+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
28068+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov  dst, srel(i)
28069+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov  dst, srel(i)
28070+// ::mc_sync_q1
28071+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28072+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28073+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28074+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
28075+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov  dst, sacq(i)
28076+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov  dst, srel(i)
28077+// ::mc_sync_q2
28078+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28079+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28080+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28081+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
28082+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov  dst, sacq(i)
28083+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov  dst, srel(i)
28084+// ::mc_sync_q3
28085+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28086+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28087+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28088+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
28089+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov  dst, sacq(i)
28090+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
28091+// ::mc_sync_q4
28092+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28093+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28094+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
28095+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
28096+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
28097+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28098+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov  dst, sacq(i)
28099+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov  dst, srel(i)
28100+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov  dst, srel(i)
28101+// ::mc_sync_q5
28102+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28103+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28104+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28105+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
28106+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov  dst, sacq(i)
28107+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov  dst, srel(i)
28108+// ::mc_sync_q6
28109+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28110+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28111+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28112+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
28113+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov  dst, sacq(i)
28114+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov  dst, srel(i)
28115+// ::mc_sync_q7
28116+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28117+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28118+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28119+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
28120+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov  dst, sacq(i)
28121+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
28122+// ::mc_sync_q8
28123+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28124+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28125+/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
28126+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
28127+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
28128+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28129+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov  dst, sacq(i)
28130+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov  dst, srel(i)
28131+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
28132+// ::mc_sync_q9
28133+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28134+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28135+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28136+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
28137+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov  dst, sacq(i)
28138+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov  dst, srel(i)
28139+// ::mc_sync_q10
28140+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28141+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28142+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28143+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
28144+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov  dst, sacq(i)
28145+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov  dst, srel(i)
28146+// ::mc_sync_q11
28147+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28148+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28149+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28150+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
28151+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov  dst, sacq(i)
28152+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
28153+// ::mc_exit_c_qn
28154+// ::mc_exit_y_qn
28155+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
28156+// :1
28157+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
28158+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
28159+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
28160+/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28161+/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
28162+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
28163+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
28164+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
28165+// ::mc_exit_c_q0
28166+// ::mc_exit_y_q0
28167+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
28168+// :1
28169+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
28170+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
28171+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
28172+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28173+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
28174+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
28175+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
28176+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
28177+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
28178+// ::mc_setup_y_q0
28179+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
28180+// ::mc_setup_y_qn
28181+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
28182+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
28183+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
28184+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
28185+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30      ; mov ra11, unif
28186+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
28187+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
28188+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
28189+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
28190+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
28191+/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
28192+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
28193+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
28194+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
28195+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
28196+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
28197+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
28198+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num              ; mov rb_pitch, unif
28199+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
28200+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or  rb_dma1_base, r1, rb_pitch
28201+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
28202+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
28203+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28204+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
28205+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4                ; v8subs r2, r2, r2
28206+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
28207+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
28208+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28209+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
28210+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
28211+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
28212+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
28213+/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28214+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
28215+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
28216+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
28217+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28218+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
28219+/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
28220+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop                           ; mov r0, ra0.16a
28221+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD               ; mov r2, ra1.16a
28222+// :1
28223+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28224+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
28225+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
28226+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
28227+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
28228+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
28229+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
28230+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
28231+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
28232+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
28233+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
28234+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
28235+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
28236+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
28237+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
28238+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
28239+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
28240+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
28241+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
28242+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
28243+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28244+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8,  0                   ; mov rb8,  0
28245+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28246+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9,  0                   ; mov rb9,  0
28247+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0                   ; mov rb10, 0
28248+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0                   ; mov rb11, 0
28249+// :per_block_setup_8
28250+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
28251+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28252+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
28253+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
28254+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch          ; mov ra_base_next, unif
28255+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2                ; mov ra_y_next, ra0.16a
28256+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28257+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1                ; mov ra1, unif
28258+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
28259+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
28260+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5                ; mov ra_y2_next, ra1.16a
28261+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x          ; mov rb_base2_next, unif
28262+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
28263+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4                ; mov ra_width_height, unif
28264+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2                ; mov vw_setup, rb_vpm_init
28265+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28266+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul
28267+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
28268+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
28269+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
28270+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
28271+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add
28272+/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
28273+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val
28274+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
28275+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif
28276+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3                ; mov rb5, ra_k255
28277+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
28278+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
28279+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
28280+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
28281+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
28282+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
28283+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif
28284+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
28285+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
28286+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d            ; mov ra_dest, unif
28287+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
28288+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
28289+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
28290+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
28291+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
28292+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
28293+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
28294+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8
28295+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28296+/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
28297+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d            ; mov ra_link, unif
28298+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
28299+// ::mc_filter_y_pxx
28300+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
28301+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
28302+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
28303+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
28304+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
28305+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
28306+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
28307+// :1
28308+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
28309+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
28310+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
28311+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
28312+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
28313+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
28314+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
28315+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
28316+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
28317+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
28318+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
28319+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
28320+/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
28321+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
28322+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
28323+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
28324+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
28325+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
28326+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
28327+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
28328+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
28329+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
28330+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
28331+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
28332+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
28333+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
28334+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
28335+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
28336+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
28337+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
28338+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
28339+/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
28340+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
28341+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
28342+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
28343+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
28344+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height
28345+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
28346+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
28347+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
28348+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
28349+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
28350+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
28351+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
28352+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
28353+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28354+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28355+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28356+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28357+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28358+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28359+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28360+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
28361+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28362+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28363+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28364+// ::mc_filter_y_bxx
28365+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
28366+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
28367+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
28368+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
28369+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
28370+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
28371+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
28372+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
28373+// :1
28374+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
28375+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
28376+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
28377+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
28378+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
28379+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
28380+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
28381+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
28382+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
28383+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
28384+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
28385+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
28386+/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
28387+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
28388+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
28389+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
28390+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
28391+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
28392+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
28393+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
28394+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
28395+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
28396+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
28397+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
28398+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
28399+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
28400+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
28401+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
28402+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
28403+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
28404+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
28405+/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
28406+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
28407+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
28408+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
28409+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
28410+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
28411+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0                ; mov r2, rb_wt_off
28412+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
28413+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
28414+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
28415+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
28416+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
28417+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2                ; mov r0, r1 << 8
28418+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0                ; mov r3, ra_blk_height
28419+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
28420+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch
28421+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28422+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0                ; v8subs r0, ra_height, r3
28423+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28424+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28425+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28426+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28427+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28428+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
28429+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28430+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28431+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28432+// ::mc_filter_y_p00
28433+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif                 ; mov r0, elem_num
28434+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
28435+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0           ; mov ra_base_next, unif
28436+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5                ; mov ra_y_next, ra0.16a
28437+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x          ; mov ra_width_height, unif
28438+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
28439+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
28440+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif
28441+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
28442+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28443+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
28444+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
28445+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
28446+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
28447+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
28448+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
28449+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
28450+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
28451+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif
28452+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
28453+// :1
28454+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
28455+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
28456+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
28457+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
28458+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
28459+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
28460+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
28461+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
28462+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
28463+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
28464+/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
28465+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
28466+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28467+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28468+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
28469+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28470+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3        ; mov vw_setup, rb_dma1
28471+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3        ; mov vw_addr, ra_dest
28472+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28473+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
28474+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28475+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28476+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28477+// ::mc_filter_y_b00
28478+/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
28479+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
28480+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
28481+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
28482+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
28483+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0
28484+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
28485+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
28486+// :1
28487+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
28488+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
28489+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
28490+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
28491+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
28492+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
28493+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
28494+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
28495+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
28496+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
28497+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax
28498+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
28499+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
28500+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
28501+/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
28502+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
28503+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
28504+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
28505+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28506+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28507+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28508+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28509+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28510+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28511+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28512+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
28513+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28514+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28515+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28516+// ::mc_setup_c10_q0
28517+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
28518+// ::mc_setup_c10_qn
28519+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
28520+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
28521+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30      ; mov ra_base, unif
28522+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
28523+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
28524+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
28525+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
28526+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
28527+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
28528+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
28529+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
28530+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
28531+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
28532+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
28533+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
28534+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
28535+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop                           ; mul24 r0, r0, 5
28536+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
28537+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
28538+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
28539+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
28540+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0                 ; mov ra_y, ra0.16a
28541+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28542+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0         ; mov rb_xshift2_next, 0
28543+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
28544+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
28545+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28546+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1                ; mov ra0, unif
28547+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
28548+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
28549+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
28550+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
28551+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
28552+/* [0x00001770] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
28553+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
28554+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
28555+/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
28556+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
28557+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
28558+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
28559+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a
28560+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0                 ; mov rb_base2, unif
28561+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28562+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
28563+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1                ; mov r3, PREREAD
28564+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28565+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1                ; mov r2, ra_y2
28566+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0    ; mov r0, ra_y
28567+// :1
28568+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28569+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
28570+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
28571+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
28572+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
28573+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
28574+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
28575+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
28576+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
28577+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
28578+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28579+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0                    ; mov rb4, 0
28580+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28581+/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0                    ; mov rb5, 0
28582+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0                    ; mov rb6, 0
28583+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0                    ; mov rb7, 0
28584+// ::mc_filter_c10_p
28585+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
28586+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
28587+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
28588+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
28589+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
28590+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
28591+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
28592+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
28593+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28594+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
28595+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
28596+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
28597+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
28598+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
28599+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
28600+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
28601+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
28602+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
28603+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
28604+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
28605+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
28606+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
28607+// :1
28608+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
28609+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
28610+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
28611+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
28612+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
28613+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
28614+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
28615+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
28616+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask
28617+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
28618+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
28619+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
28620+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
28621+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
28622+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
28623+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
28624+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
28625+/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
28626+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
28627+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
28628+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
28629+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
28630+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
28631+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
28632+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
28633+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
28634+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
28635+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
28636+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28637+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28638+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28639+/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28640+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28641+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28642+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28643+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
28644+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28645+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28646+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28647+// ::mc_filter_c10_p_l1
28648+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
28649+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
28650+/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
28651+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
28652+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
28653+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
28654+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
28655+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
28656+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28657+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
28658+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
28659+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
28660+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
28661+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
28662+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
28663+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
28664+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
28665+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
28666+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
28667+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
28668+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
28669+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
28670+// :1
28671+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
28672+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
28673+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
28674+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next
28675+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
28676+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
28677+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
28678+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
28679+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax
28680+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
28681+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
28682+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
28683+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
28684+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
28685+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
28686+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
28687+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
28688+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
28689+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
28690+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
28691+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
28692+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
28693+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
28694+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
28695+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
28696+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
28697+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
28698+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
28699+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28700+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28701+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28702+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28703+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28704+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28705+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28706+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
28707+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28708+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28709+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28710+// ::mc_filter_c10_b
28711+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
28712+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
28713+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1
28714+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
28715+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch          ; mov ra_width_height, unif
28716+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
28717+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x          ; mov ra0, unif
28718+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4                ; mov ra2, unif
28719+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
28720+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28721+/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1                ; mov r1, ra_height
28722+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next
28723+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
28724+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
28725+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
28726+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift     ; mov ra3, unif
28727+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2                ; mov r3, unif
28728+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a
28729+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
28730+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift         ; mov ra1, unif
28731+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x         ; mov ra3, unif
28732+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif
28733+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5                ; mov ra9, rb_max_y
28734+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
28735+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif
28736+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1                ; mov r5rep, -4
28737+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28738+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
28739+/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
28740+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
28741+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
28742+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1                ; mov r1, ra_wt_off_l1
28743+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
28744+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0         ; mov ra_link, unif
28745+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
28746+// :1
28747+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
28748+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
28749+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
28750+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next
28751+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y             ; mov r3, ra_y
28752+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0             ; mov      r0, r1 << 15
28753+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
28754+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
28755+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask
28756+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
28757+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
28758+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
28759+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
28760+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
28761+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
28762+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
28763+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10              ; mov rb5, rb6
28764+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift         ; mov r3, ra_y2
28765+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7
28766+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
28767+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
28768+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
28769+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
28770+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax
28771+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
28772+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
28773+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
28774+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
28775+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
28776+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
28777+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
28778+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
28779+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
28780+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
28781+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b
28782+/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount     ; mov r0, ra4
28783+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
28784+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra7,  rb7
28785+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c
28786+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0                ; mul24 r0, ra11, rb11
28787+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
28788+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
28789+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
28790+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
28791+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
28792+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2                ; mov r3, ra_blk_height
28793+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
28794+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
28795+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
28796+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
28797+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
28798+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
28799+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
28800+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
28801+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
28802+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
28803+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
28804+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
28805+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
28806+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
28807+// ::mc_sync10_q0
28808+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28809+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28810+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
28811+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
28812+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
28813+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28814+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
28815+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov  dst, srel(i)
28816+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov  dst, srel(i)
28817+// ::mc_sync10_q1
28818+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28819+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28820+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28821+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
28822+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov  dst, sacq(i)
28823+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov  dst, srel(i)
28824+// ::mc_sync10_q2
28825+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28826+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28827+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28828+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
28829+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov  dst, sacq(i)
28830+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov  dst, srel(i)
28831+// ::mc_sync10_q3
28832+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28833+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28834+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28835+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
28836+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov  dst, sacq(i)
28837+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
28838+// ::mc_sync10_q4
28839+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28840+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28841+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
28842+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
28843+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
28844+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28845+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov  dst, sacq(i)
28846+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov  dst, srel(i)
28847+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov  dst, srel(i)
28848+// ::mc_sync10_q5
28849+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28850+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28851+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28852+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
28853+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov  dst, sacq(i)
28854+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov  dst, srel(i)
28855+// ::mc_sync10_q6
28856+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28857+/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28858+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28859+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
28860+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov  dst, sacq(i)
28861+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov  dst, srel(i)
28862+// ::mc_sync10_q7
28863+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28864+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28865+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28866+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
28867+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov  dst, sacq(i)
28868+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
28869+// ::mc_sync10_q8
28870+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28871+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28872+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
28873+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
28874+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
28875+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28876+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov  dst, sacq(i)
28877+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov  dst, srel(i)
28878+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
28879+// ::mc_sync10_q9
28880+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28881+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28882+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28883+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
28884+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov  dst, sacq(i)
28885+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov  dst, srel(i)
28886+// ::mc_sync10_q10
28887+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28888+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28889+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28890+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
28891+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov  dst, sacq(i)
28892+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov  dst, srel(i)
28893+// ::mc_sync10_q11
28894+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28895+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
28896+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28897+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
28898+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov  dst, sacq(i)
28899+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
28900+// ::mc_exit_c10_q0
28901+// ::mc_exit_y10_q0
28902+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
28903+// :1
28904+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
28905+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
28906+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
28907+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28908+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
28909+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
28910+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
28911+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
28912+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
28913+// ::mc_exit_c10_qn
28914+// ::mc_exit_y10_qn
28915+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
28916+// :1
28917+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
28918+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
28919+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
28920+/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28921+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
28922+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
28923+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
28924+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
28925+// ::mc_setup_y10_q0
28926+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
28927+// ::mc_setup_y10_qn
28928+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
28929+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
28930+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
28931+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
28932+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30      ; mov ra11, unif
28933+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
28934+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
28935+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
28936+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
28937+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
28938+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
28939+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
28940+/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
28941+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
28942+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
28943+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
28944+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
28945+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
28946+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num              ; mov rb_pitch, unif
28947+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
28948+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or  rb_dma1_base, r1, rb_pitch
28949+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
28950+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
28951+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
28952+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28953+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
28954+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4                ; v8subs r2, r2, r2
28955+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
28956+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
28957+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28958+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
28959+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
28960+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
28961+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
28962+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
28963+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
28964+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
28965+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
28966+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
28967+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
28968+/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
28969+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
28970+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop                           ; mov r0, ra0.16a
28971+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD               ; mov r2, ra1.16a
28972+// :1
28973+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
28974+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
28975+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
28976+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
28977+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
28978+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
28979+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
28980+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
28981+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
28982+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
28983+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
28984+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
28985+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
28986+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
28987+/* [0x00002428] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
28988+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
28989+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
28990+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
28991+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
28992+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
28993+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
28994+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8,  0                   ; mov rb8,  0
28995+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
28996+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9,  0                   ; mov rb9,  0
28997+/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0                   ; mov rb10, 0
28998+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0                   ; mov rb11, 0
28999+// :per_block_setup_10
29000+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
29001+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
29002+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
29003+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
29004+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
29005+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch          ; mov ra_base_next, unif
29006+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2                ; mov ra_y_next, ra0.16a
29007+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
29008+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1                ; mov ra1, unif
29009+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
29010+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
29011+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
29012+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5                ; mov ra_y2_next, ra1.16a
29013+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x          ; mov rb_base2_next, unif
29014+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
29015+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4                ; mov ra_width_height, unif
29016+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2                ; mov vw_setup, rb_vpm_init
29017+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
29018+/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul
29019+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
29020+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
29021+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
29022+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
29023+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add
29024+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
29025+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val
29026+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
29027+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif
29028+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3                ; mov rb5, ra_k255
29029+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
29030+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
29031+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
29032+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
29033+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
29034+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
29035+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif
29036+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
29037+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
29038+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d            ; mov ra_dest, unif
29039+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
29040+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
29041+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
29042+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
29043+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
29044+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
29045+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
29046+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8
29047+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
29048+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
29049+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d            ; mov ra_link, unif
29050+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
29051+// ::mc_filter_y10_pxx
29052+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
29053+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
29054+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
29055+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
29056+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
29057+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
29058+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
29059+// :1
29060+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
29061+/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
29062+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
29063+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
29064+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
29065+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
29066+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
29067+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
29068+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
29069+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
29070+/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
29071+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
29072+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
29073+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
29074+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
29075+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
29076+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
29077+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
29078+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
29079+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
29080+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
29081+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
29082+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
29083+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
29084+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
29085+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
29086+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
29087+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
29088+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
29089+/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
29090+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
29091+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
29092+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
29093+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
29094+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
29095+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
29096+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height
29097+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
29098+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
29099+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
29100+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
29101+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
29102+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
29103+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
29104+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
29105+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
29106+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
29107+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
29108+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
29109+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
29110+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
29111+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
29112+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
29113+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
29114+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
29115+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
29116+// ::mc_filter_y10_p00
29117+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif                 ; mov r0, elem_num
29118+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
29119+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0           ; mov ra_base_next, unif
29120+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
29121+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5                ; mov ra_y_next, ra0.16a
29122+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x          ; mov ra_width_height, unif
29123+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
29124+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
29125+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif
29126+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
29127+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
29128+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
29129+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
29130+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
29131+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
29132+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
29133+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
29134+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
29135+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
29136+/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif
29137+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
29138+// :1
29139+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
29140+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
29141+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
29142+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
29143+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
29144+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
29145+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
29146+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
29147+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
29148+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
29149+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
29150+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
29151+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
29152+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
29153+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
29154+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
29155+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3        ; mov vw_setup, rb_dma1
29156+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3        ; mov vw_addr, ra_dest
29157+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
29158+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
29159+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
29160+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
29161+/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
29162+// ::mc_filter_y10_bxx
29163+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
29164+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
29165+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
29166+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
29167+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
29168+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
29169+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
29170+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
29171+// :1
29172+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
29173+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
29174+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
29175+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
29176+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
29177+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
29178+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
29179+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
29180+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
29181+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
29182+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
29183+/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
29184+/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
29185+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
29186+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
29187+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
29188+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
29189+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
29190+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
29191+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
29192+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
29193+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
29194+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
29195+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
29196+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
29197+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
29198+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
29199+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
29200+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
29201+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
29202+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
29203+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
29204+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
29205+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
29206+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
29207+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
29208+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
29209+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0                ; mov r2, rb_wt_off
29210+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
29211+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
29212+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
29213+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
29214+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
29215+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2                ; mov r0, r1 << 8
29216+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0                ; mov r3, ra_blk_height
29217+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
29218+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch
29219+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
29220+/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0                ; v8subs r0, ra_height, r3
29221+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
29222+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
29223+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
29224+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
29225+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
29226+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
29227+/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
29228+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
29229+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
29230+// ::mc_filter_y10_b00
29231+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
29232+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
29233+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
29234+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
29235+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
29236+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0
29237+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
29238+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
29239+// :1
29240+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
29241+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
29242+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
29243+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
29244+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
29245+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
29246+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
29247+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
29248+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
29249+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
29250+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax
29251+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
29252+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
29253+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
29254+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
29255+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
29256+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
29257+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
29258+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
29259+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
29260+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
29261+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
29262+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
29263+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
29264+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
29265+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
29266+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
29267+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
29268+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
29269+// ::mc_end
29270+};
29271+#ifdef __HIGHC__
29272+#pragma Align_to(8, ff_hevc_rpi_shader)
29273+#endif
29274--- /dev/null
29275+++ b/libavcodec/rpi_hevc_shader.h
29276@@ -0,0 +1,63 @@
29277+#ifndef rpi_hevc_shader_H
29278+#define rpi_hevc_shader_H
29279+
29280+extern unsigned int ff_hevc_rpi_shader[];
29281+
29282+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
29283+#define mc_start (ff_hevc_rpi_shader + 0)
29284+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
29285+#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
29286+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
29287+#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
29288+#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
29289+#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
29290+#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
29291+#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
29292+#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
29293+#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
29294+#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
29295+#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
29296+#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
29297+#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
29298+#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
29299+#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
29300+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
29301+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
29302+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
29303+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
29304+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
29305+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
29306+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
29307+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
29308+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
29309+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
29310+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
29311+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
29312+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
29313+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
29314+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
29315+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
29316+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
29317+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
29318+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
29319+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
29320+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
29321+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
29322+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
29323+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
29324+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
29325+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
29326+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
29327+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
29328+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
29329+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
29330+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
29331+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
29332+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
29333+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
29334+#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
29335+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
29336+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
29337+#define mc_end (ff_hevc_rpi_shader + 2860)
29338+
29339+#endif
29340--- /dev/null
29341+++ b/libavcodec/rpi_hevc_shader.qasm
29342@@ -0,0 +1,1850 @@
29343+# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
29344+# All rights reserved.
29345+#
29346+# Redistribution and use in source and binary forms, with or without
29347+# modification, are permitted provided that the following conditions are met:
29348+#     * Redistributions of source code must retain the above copyright
29349+#       notice, this list of conditions and the following disclaimer.
29350+#     * Redistributions in binary form must reproduce the above copyright
29351+#       notice, this list of conditions and the following disclaimer in the
29352+#       documentation and/or other materials provided with the distribution.
29353+#     * Neither the name of the copyright holder nor the
29354+#       names of its contributors may be used to endorse or promote products
29355+#       derived from this software without specific prior written permission.
29356+#
29357+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
29358+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
29359+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
29360+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
29361+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29362+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29363+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
29364+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29365+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29366+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29367+#
29368+# Written by Peter de Rivaz, John Cox
29369+
29370+
29371+
29372+# Inter pred asm
29373+#
29374+# Logic here should be good to 14 bits without modification
29375+# but only 8 & 10 are currently instantiated & tested
29376+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
29377+# in _p00 & _b00
29378+
29379+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
29380+# the warning that we are using rotation & ra/rb registers. r0..3 can be
29381+# rotated through all 16 elems ra regs can only be rotated through their
29382+# local 4.  As it happens this is what is wanted here as we do not want the
29383+# constants from the other half of the calc.
29384+
29385+# Number limits in P/B calculation
29386+#
29387+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
29388+# we offset our intermediates s.t. they always end up +ve before the next
29389+# multiply (may be -ve whilst summing but that doesn't matter).
29390+#
29391+# Range calc for up to 14 bits (Y-B pred):
29392+#
29393+# denom: [0, 7]
29394+# bmax = (1 << bits) - 1
29395+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
29396+#
29397+# wt_mul: [-128, 255]
29398+# wt_off = off * 2 + 1: [-bmax, bmax]
29399+#
29400+# pel: [0, bmax]
29401+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
29402+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
29403+# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
29404+# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
29405+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
29406+#  [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
29407+#
29408+# This all looks good and is mostly bit depth independant - and as we manage
29409+# to do unsigned multiplies everywhere (now) this should be good for any bit
29410+# depth up to 14 (we could probably do 16 - but that requires a few tweaks
29411+# to the shifts we don't currently have logic for)
29412+
29413+# PREREAD is the number of requests that we have sitting in the TMU request
29414+# queue.
29415+#
29416+# There are 8 slots availible in the TMU request Q for tm0s requests, but
29417+# only 4 output FIFO entries and overflow is bad (corruption or crash)
29418+# (If threaded then only 2 out FIFO entries, but we aren't.)
29419+# In s/w we are effectively limited to the min vertical read which is >= 4
29420+# so output FIFO is the limit.
29421+#
29422+# As the test for read-next is is the main part of the Luma loop (rather than
29423+# the preload FIFO part) we are limited to min_luma_height - 1
29424+# Min_luma_height is 4 so we can only have a preload of 3
29425+# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick
29426+# in chroma without abandoning preload pretty much entirely (which would be bad)
29427+#
29428+# Timing tests vs preload of 4 suggests this doesn't hurt us much
29429+# Could have preread 4 for Chroma but when tested it didn't help
29430+
29431+.set PREREAD,                      3
29432+
29433+# Offset added (effectively) at the exit of the H FIR filter
29434+# This is enough to force the result +ve
29435+# Is good if it is a power of 2 as that allows for >> without loss
29436+#
29437+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
29438+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
29439+# Round up to next power of 2
29440+
29441+.set FIR_OFFSET,                   0x4000
29442+
29443+# Block heights - 8 & 16 are the only numbers we currently support
29444+
29445+.set C_BLK_HEIGHT_8,               16
29446+.set C_BLK_HEIGHT_16,              8
29447+.set Y_BLK_HEIGHT_8,               16
29448+.set Y_BLK_HEIGHT_16,              8
29449+
29450+# QPU counts - depend on block size
29451+# If we have a 2-byte format & block_size > 8 then can only afford
29452+# 8 QPUs
29453+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
29454+
29455+.set N_QPU_8,                      12
29456+.set N_QPU_16,                     12
29457+
29458+# Value to add to the weight multiplier to convert it into an unsigned value
29459+# Should be power of two for convienience
29460+
29461+.set LOG2_MUL_ADD,                 14
29462+.set MUL_ADD,                      (1 << LOG2_MUL_ADD)
29463+
29464+# Fixed denom (max that it can be set to)
29465+.set DENOM,                        7
29466+
29467+# register allocation
29468+#
29469+
29470+# ra0-3
29471+# Used as temp and may be loop filter coeffs (split into .8s)
29472+# or temp in loop. Check usage on an individual basis.
29473+
29474+# ra4-11
29475+# V FIFO / temp / free
29476+
29477+# -- free --                       ra12
29478+
29479+# -- free --                       ra13
29480+
29481+# -- free --                       ra14
29482+
29483+# -- free --                       ra15
29484+
29485+# uniform: width:height
29486+.set ra_width_height,              ra16
29487+.set ra_width,                     ra16.16b
29488+.set ra_height,                    ra16.16a
29489+
29490+# y:y2 same layout as y_y2_next so we can update both together
29491+.set ra_y_y2,                      ra17
29492+.set ra_y2,                        ra17.16a
29493+.set ra_y,                         ra17.16b
29494+
29495+# uniform: L1 weight (U on left, V on right)
29496+# Only used in Y B
29497+.set ra_wt_off_mul_l1,             ra18
29498+.set ra_wt_off_l1,                 ra18.16b
29499+.set ra_wt_mul_l1,                 ra18.16a
29500+
29501+# y_next:y2_next same layout as y_y2 so we can update both together
29502+.set ra_y_y2_next,                 ra19
29503+.set ra_y_next,                    ra19.16b
29504+.set ra_y2_next,                   ra19.16a
29505+
29506+# Setup: consts - subdivide a single register
29507+.set ra_kff800100,                 ra20
29508+.set ra_k256,                      ra20.16a
29509+.set ra_k0,                        ra20.8a
29510+.set ra_k1,                        ra20.8b
29511+.set ra_k128,                      ra20.8c
29512+.set ra_k255,                      ra20.8d
29513+
29514+# Loop: xshifts
29515+.set ra_xshift,                    ra21.16a
29516+.set ra_xshift_next,               ra21.16b
29517+
29518+# Loop var: L0 weight (U on left, V on right)
29519+# _off_ is not used in loop as we want to modify it before use
29520+.set ra_wt_off_mul_l0,             ra22
29521+.set ra_wt_mul_l0,                 ra22.16a
29522+.set ra_wt_off_l0,                 ra22.16b
29523+
29524+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
29525+# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
29526+#   2nd byte   but as the source should never be > 3 there 0x3ff should do
29527+.set ra_blk_height_pmax,           ra23
29528+.set ra_pmax,                      ra23.16a
29529+.set ra_blk_height,                ra23.8c
29530+# --free --                        ra23.8d
29531+
29532+# Loop:  src frame base (L0)
29533+.set ra_base,                      ra24
29534+
29535+# Misc  offsets
29536+.set ra_fir_off_val_wt_den_p7,     ra25
29537+.set ra_wt_den_p7,                 ra25.8a
29538+# -- free --                       ra25.8b
29539+.set ra_fir_off_val,               ra25.16b
29540+
29541+# As it happens these constants are the same
29542+.if FIR_OFFSET == MUL_ADD
29543+# Weight multiplier unsigned add
29544+.set ra_kmul_add,                  ra_fir_off_val
29545+.else
29546+.error "FIR_OFFSET != MUL_ADD: Need new register & init"
29547+.endif
29548+
29549+# Loop: next src frame base (L0)
29550+.set ra_base_next,                 ra26
29551+
29552+# Loop: height<<23 + width<<16 + vdw_setup_0
29553+.set ra_dma0,                      ra27
29554+
29555+# Loop: destination address
29556+.set ra_dest,                      ra28
29557+
29558+# Setup: Dup of rb_ef
29559+# Lo bits are used as Y coeff 0 as that lefts us combine test & coeff mul
29560+# (top bits are ignored by mul24)
29561+.set ra_ef,                        ra29
29562+
29563+# Use an even numbered register as a link register to avoid corrupting flags
29564+.set ra_link,                      ra30
29565+
29566+# -- free --                       ra31
29567+
29568+.set rb_xshift2,                   rb0
29569+.set rb_xshift2_next,              rb1
29570+
29571+# C:  (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
29572+.set rb_elem_x,                    rb2
29573+
29574+# El Flags
29575+# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n
29576+# Duped into ra_ef as sometimes that is easier to use
29577+.set rb_ef,                        rb3
29578+
29579+# rb4-11
29580+# Loop: V filter FIFO or V filter coeff
29581+
29582+# Loop var: offset to add before shift (round + weighting offsets)
29583+# Exact value varies by loop
29584+.set rb_wt_off,                    rb12
29585+
29586+# -- free --                       rb13
29587+
29588+# -- free --                       rb14
29589+
29590+# Loop: src frame base (L1)
29591+.set rb_base2,                     rb15
29592+
29593+# Line pitch (128 for sand128)
29594+.set rb_pitch,                     rb16
29595+
29596+# Loop count - 2 (set up TMU for next xfer)
29597+.set rb_i_tmu,                     rb17
29598+
29599+# Loop count for min(height, 16)
29600+# Y will reset & loop again if height > 16
29601+.set rb_lcount,                    rb18
29602+
29603+# frame_base2_next
29604+.set rb_base2_next,                rb19
29605+
29606+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
29607+# offset to the slice
29608+.set rb_xpitch,                    rb20
29609+
29610+# These 3 consts each save 1 instruction in Y loop setup
29611+# so whilst they are worthwhile they should be the 1st to die if we need
29612+# another b reg
29613+.set rb_y_coeffs_2,                rb21                         # 0x050b0a00
29614+.set rb_y_coeffs_3,                rb22                         # 0x11283a40
29615+.set rb_y_coeffs_5,                rb23                         # 0x0a0b0500
29616+
29617+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
29618+.set rb_pmask,                     rb24
29619+
29620+# vdw_setup_1(dst_pitch)
29621+.set rb_dma1_base,                 rb25
29622+
29623+# Setup: pic width - 1
29624+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
29625+.set rb_max_x,                     rb26
29626+
29627+# vdw_setup_0 (depends on QPU number)
29628+.set rb_dma0_base,                 rb27
29629+
29630+# Setup: vw_setup value to reset VPM write pointer
29631+.set rb_vpm_init,                  rb28
29632+
29633+# Loop: vdw_setup_1(dst_pitch-width) = stride
29634+.set rb_dma1,                      rb29
29635+
29636+# Setup: pic_height - 1
29637+.set rb_max_y,                     rb30
29638+
29639+# Setup: FIR H offset
29640+.set rb_fir_off_h,                 rb31
29641+
29642+
29643+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
29644+.set i_shift16,                    -16
29645+.set i_shift21,                    -11
29646+.set i_shift23,                     -9
29647+.set i_shift30,                     -2
29648+
29649+# Much of the setup code is common between Y & C
29650+# Macros that express this - obviously these can't be overlapped
29651+# so are probably unsuitable for loop code
29652+
29653+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
29654+  mov r2, qpu_num
29655+.if v_bit_depth <= 8
29656+  # 8 bit version
29657+  asr r1, r2, 2
29658+  shl r1, r1, 6
29659+  and r0, r2, 3
29660+  or  r0, r0, r1
29661+
29662+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
29663+  add r_vpm, r0, r1  # VPM 8bit storage
29664+
29665+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
29666+  shl r0, r0, 5
29667+
29668+.else
29669+  # 16 bit version
29670+  # Limited to 8 QPUs if blk height > 8
29671+  asr r1, r2, 1
29672+.if v_blk_height <= 8
29673+  shl r1, r1, 4
29674+.else
29675+  shl r1, r1, 5
29676+.endif
29677+  and r0, r2, 1
29678+  or  r0, r0, r1
29679+
29680+  mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR
29681+  add r_vpm, r0, r1
29682+
29683+  # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
29684+  # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
29685+  mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))    # height,width added later
29686+  shl r0, r0, 6
29687+.endif
29688+  add r_dma, r0, r1  # DMA out
29689+.endm
29690+
29691+
29692+.macro m_setup_q0
29693+  srel -, 12
29694+.endm
29695+
29696+# Code start label
29697+::mc_start
29698+
29699+################################################################################
29700+# mc_setup_c
29701+#
29702+# typedef struct qpu_mc_pred_c_s_s {
29703+#     int16_t y;
29704+#     int16_t x;
29705+#     uint32_t base;
29706+#     uint32_t pic_cw;            // C Width (== Y width / 2)
29707+#     uint32_t pic_ch;            // C Height (== Y Height / 2)
29708+#     uint32_t stride2;
29709+#     uint32_t stride1;
29710+#     uint32_t wdenom;
29711+#     int16_t y2;
29712+#     int16_t x2;
29713+#     uint32_t base2;
29714+#     uint32_t next_fn;
29715+# } qpu_mc_pred_c_s_t;
29716+
29717+.macro m_setup_c, v_bit_depth
29718+
29719+# Cannot use mul24 on x as x might be -ve, so must use shift
29720+.if v_bit_depth <= 8
29721+.set v_x_shift,         1
29722+.set v_pmask,           0xff
29723+.set v_blk_height,      C_BLK_HEIGHT_8
29724+.else
29725+.set v_x_shift,         2
29726+.set v_pmask,           0xffff
29727+.set v_blk_height,      C_BLK_HEIGHT_16
29728+.endif
29729+
29730+  mov tmurs, 1                  ; mov ra0, unif                 # No TMU swap ; x_y
29731+
29732+  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
29733+  shl rb_ef, r0, i_shift30      ; mov ra_base, unif             # ; ref_c_base
29734+
29735+# Read image dimensions
29736+  sub r0, unif, 1                                               # pic c width
29737+  shl rb_max_x, r0, v_x_shift                                   # rb_max_x in bytes
29738+  sub rb_max_y, unif, 1                                         # pic c height
29739+
29740+# load constants
29741+  mov ra_kff800100, 0xff800100
29742+  mov rb_pmask, v_pmask
29743+  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
29744+  mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
29745+  mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
29746+
29747+# get source pitch
29748+  mov ra_ef, rb_ef              ; mov rb_xpitch, unif           # ; stride2
29749+  mov rb_pitch, unif                                            # stride1
29750+  mov r1, vdw_setup_1(0)                                        # [rb_pitch delay] Merged with dst_stride shortly
29751+  add rb_dma1_base, r1, rb_pitch                                # vdw_setup_1
29752+
29753+  and r0, 1, elem_num
29754+  nop                           ; mul24 r0, r0, 5
29755+.if v_bit_depth <= 8
29756+  add rb_elem_x, r0, elem_num
29757+.else
29758+  add r0, r0, elem_num
29759+  add rb_elem_x, r0, r0
29760+.endif
29761+
29762+# Compute base address for first and second access
29763+# ra_base ends up with t0s base
29764+# ra_base2 ends up with t1s base
29765+
29766+  shl r0, ra0.16b, v_x_shift                                    # [rb_elem_x delay]
29767+  add r0, r0, rb_elem_x                                         # Add elem no to x to get X for this slice
29768+  max r0, r0, 0                 ; mov ra_y, ra0.16a             # ; stash Y
29769+  min r0, r0, rb_max_x
29770+
29771+# Get shift
29772+# Shift will always calculate as 0 for 9+ bit
29773+# Ideally we can optimize the shift out of the code in these cases but for now
29774+# it is tidier to leave it in
29775+.if v_bit_depth <= 8
29776+  shl ra_xshift_next, r0, 3
29777+.else
29778+  mov ra_xshift_next, 0         ; mov rb_xshift2_next, 0
29779+.endif
29780+
29781+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
29782+
29783+.if v_bit_depth <= 8
29784+  and r0, r0, -4
29785+.endif
29786+  sub r1, ra_k0, rb_pitch
29787+  and r1, r0, r1
29788+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
29789+  add r0, r0, r1                ; mov ra0, unif                 # ; next_x2_y2
29790+  add ra_base, ra_base, r0
29791+
29792+# Compute part of VPM to use for DMA output
29793+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
29794+  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
29795+
29796+# And again for L1, but only worrying about frame2 stuff
29797+
29798+# Compute base address for first and second access
29799+# ra_base ends up with t0s base
29800+# rb_base2 ends up with t1s base
29801+
29802+  shl r0, ra0.16b, v_x_shift
29803+  add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a            # Add QPU slice offset
29804+  max r0, r0, 0                 ; mov rb_base2, unif            # ref_c_base2
29805+  min r0, r0, rb_max_x
29806+
29807+# Get shift (already zero if 9+ bit so ignore)
29808+.if v_bit_depth <= 8
29809+  shl rb_xshift2_next, r0, 3
29810+.endif
29811+
29812+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
29813+
29814+.if v_bit_depth <= 8
29815+  and r0, r0, -4
29816+.endif
29817+  sub r1, ra_k0, rb_pitch
29818+  and r1, r0, r1                ; mov r3, PREREAD
29819+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
29820+  add r0, r0, r1                ; mov r2, ra_y2
29821+  add rb_base2, rb_base2, r0    ; mov r0, ra_y
29822+
29823+# Do preloads
29824+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
29825+
29826+:1
29827+  sub.setf r3, r3, 1
29828+  max r1, r0, 0
29829+  min r1, r1, rb_max_y
29830+  add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
29831+  add t0s, ra_base, r1          ; mov ra_y, r0
29832+
29833+  max r1, r2, 0
29834+  brr.anynz -, r:1b
29835+  min r1, r1, rb_max_y
29836+  add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
29837+  add t1s, rb_base2, r1         ; mov ra_y2, r2
29838+# >>> .anynz 1b
29839+
29840+  mov ra_link, unif                                             # link
29841+# touch registers to keep simulator happy (and fills in delay slots)
29842+  mov ra4, 0                    ; mov rb4, 0
29843+  bra -, ra_link
29844+  mov ra5, 0                    ; mov rb5, 0
29845+  mov ra6, 0                    ; mov rb6, 0
29846+  mov ra7, 0                    ; mov rb7, 0
29847+# >>> ra_link
29848+.endm
29849+
29850+::mc_setup_c_q0
29851+  m_setup_q0
29852+::mc_setup_c_qn
29853+  m_setup_c 8
29854+
29855+################################################################################
29856+#
29857+# mc_filter_c_p
29858+#
29859+# typedef struct qpu_mc_pred_c_p_s {
29860+#     int16_t y;
29861+#     int16_t x;
29862+#     uint32_t base;
29863+#     uint16_t h;
29864+#     uint16_t w;
29865+#     uint32_t coeffs_x;
29866+#     uint32_t coeffs_y;
29867+#     uint32_t wo_u;
29868+#     uint32_t wo_v;
29869+#     uint32_t dst_addr_c;
29870+#     uint32_t next_fn;
29871+# } qpu_mc_pred_c_p_t;
29872+
29873+.macro m_filter_c_p, v_tmu, v_bit_depth
29874+
29875+.if v_bit_depth <= 8
29876+.set v_x_shift,         1
29877+.set v_x_mul,           2
29878+.set v_v_shift,         8
29879+# Shifts to get width & height in the right place in rb_dma0
29880+.set v_dma_h_shift,     7
29881+.set v_dma_wh_shift,    i_shift16
29882+.else
29883+.set v_x_shift,         2
29884+.set v_x_mul,           4
29885+.set v_v_shift,         i_shift16
29886+# Shifts to get width & height in the right place in rb_dma0
29887+.set v_dma_h_shift,     8
29888+.set v_dma_wh_shift,    15
29889+.endif
29890+
29891+.if v_tmu == 0
29892+.set vrx_xshift,        rb_xshift2              # b side more convenient
29893+.set vrx_xshift_next,   ra_xshift_next
29894+.set vra_y_next,        ra_y_next
29895+.set vrx_base_next,     ra_base_next
29896+.set vra_y,             ra_y
29897+.set vra_base,          ra_base
29898+.set vr_txs,            t0s
29899+.else
29900+.set vrx_xshift,        ra_xshift               # a side more convenient
29901+.set vrx_xshift_next,   rb_xshift2_next
29902+.set vra_y_next,        ra_y2_next
29903+.set vrx_base_next,     rb_base2_next
29904+.set vra_y,             ra_y2
29905+.set vra_base,          rb_base2
29906+.set vr_txs,            t1s
29907+.endif
29908+
29909+# denom shift values
29910+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
29911+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
29912+
29913+# per-channel shifts were calculated on the *previous* invocation
29914+# get base addresses and per-channel shifts for *next* invocation
29915+  mov vw_setup, rb_vpm_init     ; mov ra2, unif                 # ; x_y
29916+
29917+  add.setf -, rb_ef, rb_ef      ; mov r3, unif                  # [ra2 delay] ; base
29918+
29919+  shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0          # r5 = 0
29920+  add r0, r0, rb_elem_x         ; mov ra_width_height, unif     # r1=pitch2 mask ; width_height
29921+  sub r1, r5, rb_pitch          ; mov ra0, unif                 # ; H filter coeffs
29922+  max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
29923+  min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
29924+
29925+.if v_bit_depth <= 8
29926+  shl vrx_xshift_next, r0, 3
29927+  and r0, r0, -4
29928+.endif
29929+  and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul   # r2=w*2 (we are working in pel pairs)  ** x*2 already calced!
29930+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
29931+  add r0, r0, r1                ; mov ra3, unif                 # ; V filter coeffs
29932+  add vrx_base_next, r3, r0     ; mov r1, ra_height
29933+
29934+# set up VPM write
29935+  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif    # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
29936+  add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
29937+  add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
29938+
29939+# Misc final setup...
29940+
29941+  shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif             # ; dst_addr
29942+  add r0, r0, r2                ; mov r2, ra_fir_off_val        # Combine width and height of destination area (r0=h<<8, r2=w*2)
29943+  shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c              # Shift into bits 16 upwards of the vdw_setup0 register
29944+  add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0          # ; r1=weight
29945+  shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
29946+  sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
29947+  add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4            # ; loop counter (V FIFO fill = 4)
29948+  mov rb11, ra3.8d              ; mov ra_link, unif             # ; Link
29949+
29950+# r5           = -4                     (loop counter)
29951+# ra_wt_mul_l0 = weight L0 + 128        (now unsigned)
29952+# rb_wt_off    = (offset * 2 + 1) << (wt_den + 5)
29953+# rb31         = FIR value offset
29954+
29955+# FIFO: rb4, ra5, rb6, ra7
29956+# Coeffs in ra3.8a, ra3.8b, rb10, rb11
29957+
29958+# We want (r0r1)
29959+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
29960+# We fetch (after shift)
29961+#  C0  :  C3  :  C1  :  C4  :  C2  :  C5  : ...
29962+
29963+:1
29964+# retrieve texture results and pick out bytes
29965+# then submit two more texture requests
29966+
29967+.if v_tmu == 0
29968+  sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
29969+  shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
29970+  shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
29971+  add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
29972+.else
29973+  sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
29974+  shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
29975+  shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
29976+  add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next       # [r1 << delay]
29977+.endif
29978+
29979+  add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
29980+  max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
29981+  min r3, r3, rb_max_y          ; mov.ifnc r0, r2
29982+
29983+  and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
29984+.if v_tmu == 0
29985+  add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask        # ; mask bytes
29986+.else
29987+  add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax         # ; mask bytes
29988+.endif
29989+
29990+# apply horizontal filter
29991+# The filter coeffs for the two halves of this are the same (unlike in the
29992+# Y case) so it doesn't matter which ra0 we get them from
29993+# Also as the two halves are locked together we don't need to separate the 1st
29994+# r0 mul or the last r1 mul as they are valid for all QPUs
29995+
29996+  add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
29997+  sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
29998+  sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
29999+  nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
30000+  add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
30001+  add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
30002+
30003+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
30004+# We would like to save the r5->r4 shift but we need a delay slot
30005+# for both r7 & r6 which we can't find anything to put in if we have
30006+# already multiplied r4 & r5!
30007+  brr.anyn -, r:1b
30008+  add r2, r2, r3                ; mul24 r0, ra7, rb10           # r6 post
30009+  mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b         # r5 post
30010+  asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
30011+# >>> .anyn 1b
30012+
30013+  add r1, r1, r0                ; mul24 r0, rb4, ra3.8a         # [ra7 delay]
30014+  sub r1, r1, r0                ; mul24 r0, ra7, rb11
30015+  sub r1, r1, r0
30016+
30017+  asr r1, r1, 6                 ; mov r3, ra_blk_height         # ; NxtLoop
30018+  sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
30019+  add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
30020+  sub r1, r0, r1                ; v8subs r0, ra_height, r3      # ; NxtLoop
30021+  brr.anyn -, r:1b
30022+  asr r1, r1, i_wt_den_p6
30023+  min r1, r1, ra_pmax           ; mov -, vw_wait
30024+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch        # ; NxtLoop
30025+# >>> .anyn 1b
30026+
30027+# r0 = remaining height (min 0)
30028+# r2 = r3 * rb_pitch
30029+# r3 = block_height
30030+
30031+# If looping again then we consumed 16 height last loop
30032+# rb_dma1 (stride) remains constant
30033+# rb_i_tmu remains const (based on total height)
30034+# recalc ra_dma0, rb_lcount based on new segment height
30035+
30036+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # VDW setup 0
30037+
30038+# DMA out
30039+  bra.anyz -, ra_link
30040+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # Stride
30041+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # start the VDW
30042+  shl r1, r1, i_shift23
30043+# >>> .anyz ra_link
30044+
30045+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
30046+# We add to dma0 to reduce the number of output lines in the final block
30047+  brr -, r:1b
30048+  add rb_lcount, rb_lcount, r0
30049+  add ra_dma0, ra_dma0, r1
30050+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
30051+# >>> 1b
30052+.endm
30053+
30054+::mc_filter_c_p
30055+  m_filter_c_p 0, 8
30056+
30057+::mc_filter_c_p_l1
30058+  m_filter_c_p 1, 8
30059+
30060+################################################################################
30061+#
30062+# mc_filter_c_b
30063+#
30064+# typedef struct qpu_mc_pred_c_b_s {
30065+#     int16_t y;
30066+#     int16_t x;
30067+#     uint32_t base;
30068+#     uint16_t h;
30069+#     uint16_t w;
30070+#     uint32_t coeffs_x1;
30071+#     uint32_t coeffs_y1;
30072+#     int16_t weight_u1;
30073+#     int16_t weight_v1;
30074+#     int16_t y2;
30075+#     int16_t x2;
30076+#     uint32_t base2;
30077+#     uint32_t coeffs_x2;
30078+#     uint32_t coeffs_y2;
30079+#     uint32_t wo_u2;
30080+#     uint32_t wo_v2;
30081+#     uint32_t dst_addr_c;
30082+#     uint32_t next_fn;
30083+# } qpu_mc_pred_c_b_t;
30084+
30085+.macro m_filter_c_b, v_bit_depth
30086+
30087+.if v_bit_depth <= 8
30088+.set v_x_shift,         1
30089+.set v_v_shift,         8
30090+# Shifts to get width & height in the right place in ra_dma0
30091+.set v_dma_h_shift,     7
30092+.set v_dma_wh_shift,    i_shift16
30093+.else
30094+.set v_x_shift,         2
30095+.set v_v_shift,         i_shift16
30096+# Shifts to get width & height in the right place in ra_dma0
30097+.set v_dma_h_shift,     8
30098+.set v_dma_wh_shift,    15
30099+.endif
30100+.set v_x_mul,           (1 << v_x_shift)
30101+
30102+# denom shift values
30103+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
30104+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
30105+
30106+# per-channel shifts were calculated on the *previous* invocation
30107+
30108+# get base addresses and per-channel shifts for *next* invocation
30109+  mov vw_setup, rb_vpm_init     ; mov ra2, unif                 # ; x_y
30110+
30111+  add.setf -, rb_ef, rb_ef      ; mov r3, unif                  # [ra2 delay] ; r3=base
30112+
30113+  shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1          # x ; r5=0
30114+  add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
30115+  sub r1, r5, rb_pitch          ; mov ra_width_height, unif     # r1=pitch2 mask ; width_height
30116+  max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
30117+  min r0, r0, rb_max_x          ; mov ra0, unif                 # ; L0 H filter coeffs
30118+
30119+.if v_bit_depth <= 8
30120+  shl ra_xshift_next, r0, 3
30121+.endif
30122+
30123+  and r0, r0, -4                ; mov ra2, unif                 # ; L0 V filter coeffs
30124+  and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul   # r2=w*2 (we are working in pel pairs)
30125+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30126+  add r0, r0, r1                ; mov r1, ra_height             # Add stripe offsets ; r1=height
30127+  add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
30128+
30129+# set up VPM write
30130+
30131+  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif    # Compute vdw_setup1(dst_pitch-width) ; U weight
30132+  add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
30133+  add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
30134+
30135+  shl r0, r1, v_dma_h_shift     ; mov ra3, unif                 # ; x2_y2
30136+  add r0, r0, r2                ; mov r3, unif                  # [ra3 delay] ; base
30137+  shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a       # Shift into bits 16 upwards of the vdw_setup0 register
30138+  add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b               # r0=x
30139+
30140+# L1 - uniform layout could possibly be optimized
30141+
30142+  shl r0, r0, v_x_shift         ; mov ra1, unif                 # r0=x<<shift ; L1 H filter coeffs
30143+  add r0, r0, rb_elem_x         ; mov ra3, unif                 # ; L1 V filter coeffs
30144+  sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif    # [ra3 delay] r1=pitch2 mask ; U offset/weight
30145+  max r0, r0, r5                ; mov ra9, rb_max_y
30146+  min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
30147+
30148+.if v_bit_depth <= 8
30149+  shl rb_xshift2_next, r0, 3
30150+.endif
30151+
30152+  and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
30153+  and r1, r0, r1                ; mov r5rep, -4
30154+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30155+  add r0, r0, r1                ; mov ra_dest, unif             #  Add stripe offsets ; dst_addr
30156+  add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
30157+
30158+  add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
30159+  add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
30160+  add r0, r0, r1                ; mov r1, ra_wt_off_l1          # ; L0 off unset
30161+  shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
30162+  sub rb_wt_off, r1, r0         ; mov ra_link, unif             # ; link
30163+
30164+  mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
30165+
30166+# r5        loop counter (-4)
30167+# ra0       H coeffs L0
30168+# ra1       H coeffs L1
30169+# ra2       V coeffs L0
30170+# ra3       V coeffs L1
30171+# ra9       rb_max_y alias
30172+# ra10      rb_xshift2 alias
30173+
30174+:1
30175+# retrieve texture results and pick out bytes
30176+# then submit two more texture requests
30177+  sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
30178+  shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
30179+  shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
30180+  add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next # [ra_y delay]
30181+  add ra_y, 1, ra_y             ; mov r3, ra_y
30182+
30183+  max r3, r3, ra_k0             ; mov      r0, r1 << 15
30184+  min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
30185+
30186+  mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
30187+  add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask        # ; masks bytes
30188+
30189+# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
30190+
30191+  and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
30192+  sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
30193+  sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
30194+  nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
30195+  add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
30196+  nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
30197+
30198+  add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
30199+
30200+  shr r2, r4, ra10              ; mov rb5, rb6
30201+  shr r1, r2, v_v_shift         ; mov r3, ra_y2
30202+  shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7                  # [r1 << delay]
30203+
30204+  add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
30205+  max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
30206+  min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
30207+
30208+  mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
30209+  add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax         # ; masks bytes
30210+
30211+# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
30212+
30213+  add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
30214+  sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
30215+  sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
30216+  nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
30217+  add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
30218+  add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
30219+
30220+  brr.anyn -, r:1b
30221+  add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
30222+  mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
30223+  shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
30224+# >>> .anyn 1b
30225+
30226+  sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b        # L1 ; L0
30227+  sub.setf -, r5, rb_lcount     ; mov r0, ra4
30228+  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
30229+  add r1, r1, r0                ; mul24 r0, ra7,  rb7
30230+
30231+  sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c        # L1
30232+  add r2, r2, r0                ; mul24 r0, ra11, rb11          # L1
30233+  sub r2, r2, r0
30234+
30235+  shr r1, r1, 6
30236+  shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
30237+  add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
30238+  add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
30239+  sub r1, r1, r2                ; mov r3, ra_blk_height         # ; NxtLoop
30240+  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3      # ; NxtLoop
30241+
30242+  brr.anyn -, r:1b
30243+  asr r1, r1, ra_wt_den_p7
30244+  min r1, r1, ra_pmax           ; mov -, vw_wait
30245+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch        # ; NxtLoop
30246+# >>> .anyn 1b
30247+
30248+# r0 = remaining height (min 0)
30249+# r2 = r3 * rb_pitch
30250+# r3 = block_height
30251+
30252+# If looping again then we consumed 16 height last loop
30253+# rb_dma1 (stride) remains constant
30254+# rb_i_tmu remains const (based on total height)
30255+# recalc ra_dma0, rb_lcount based on new segment height
30256+
30257+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # ; VDW setup 0
30258+
30259+# DMA out
30260+  bra.anyz -, ra_link
30261+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # ; Stride
30262+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # ; start the VDW
30263+  shl r1, r1, i_shift23
30264+# >>> .anyz ra_link
30265+
30266+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
30267+# We add to dma0 to reduce the number of output lines in the final block
30268+  brr -, r:1b
30269+  add rb_lcount, rb_lcount, r0
30270+  add ra_dma0, ra_dma0, r1
30271+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
30272+# >>> 1b
30273+.endm
30274+
30275+::mc_filter_c_b
30276+  m_filter_c_b 8
30277+
30278+################################################################################
30279+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
30280+# conflicts
30281+
30282+.macro m_exit_drain
30283+.if PREREAD == 2
30284+# Special case 2 as loop is wasteful
30285+  nop                   ; nop           ; ldtmu0
30286+  nop                   ; nop           ; ldtmu1
30287+  nop                   ; nop           ; ldtmu0
30288+  mov -, vw_wait        ; nop           ; ldtmu1
30289+.else
30290+  mov.setf r3, PREREAD - 1
30291+:1
30292+  brr.anynz -, r:1b
30293+  nop                   ; nop           ; ldtmu0
30294+  nop                   ; nop           ; ldtmu1
30295+  sub.setf r3, r3, 1
30296+ # >>>
30297+  mov  -, vw_wait
30298+.endif
30299+.endm
30300+
30301+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
30302+# All qpus start at the beginning and after that (group - 1) must have finished
30303+# before (group) can start
30304+#
30305+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
30306+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
30307+# lockup otherwise)
30308+#
30309+# There is some, currently ill defined, potential lockup if we have the VDM active
30310+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
30311+#
30312+# The code stalled when I had many waiters on a single sem so we have a
30313+# "ripple" of srels to restart.  Unsure why, may have been bug, but this works
30314+# and we currently have both the memory & sems to support it.
30315+.macro m_sync_q, n_qpu, n_quads
30316+# Do not generate code for qpu >= quads * 4 -  fns should never be called
30317+.if n_qpu < n_quads * 4
30318+  mov ra_link, unif     # Can only branch to an a reg (not r0)
30319+  mov -, vw_wait        # [ra_link delay]
30320+
30321+.set n_sem_sync, n_qpu - (n_qpu % 4)
30322+.set n_sem_in, n_qpu
30323+.set n_sem_out, n_qpu + 1
30324+
30325+.if n_qpu % 4 == 0
30326+
30327+.set n_sem_quad_in,  12 + n_qpu / 4
30328+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
30329+
30330+  sacq -, n_sem_sync
30331+  sacq -, n_sem_sync
30332+  sacq -, n_sem_sync
30333+  bra -, ra_link
30334+  sacq -, n_sem_quad_in
30335+  srel -, n_sem_out
30336+  srel -, n_sem_quad_out
30337+
30338+.else
30339+  bra -, ra_link
30340+  srel -, n_sem_sync
30341+  sacq -, n_sem_in
30342+.if n_sem_out % 4 != 0
30343+  srel -, n_sem_out
30344+.else
30345+  nop
30346+.endif
30347+.endif
30348+.endif
30349+.endm
30350+
30351+.set v_quads8, N_QPU_8 / 4
30352+
30353+::mc_sync_q0
30354+  m_sync_q 0, v_quads8
30355+::mc_sync_q1
30356+  m_sync_q 1, v_quads8
30357+::mc_sync_q2
30358+  m_sync_q 2, v_quads8
30359+::mc_sync_q3
30360+  m_sync_q 3, v_quads8
30361+::mc_sync_q4
30362+  m_sync_q 4, v_quads8
30363+::mc_sync_q5
30364+  m_sync_q 5, v_quads8
30365+::mc_sync_q6
30366+  m_sync_q 6, v_quads8
30367+::mc_sync_q7
30368+  m_sync_q 7, v_quads8
30369+::mc_sync_q8
30370+  m_sync_q 8, v_quads8
30371+::mc_sync_q9
30372+  m_sync_q 9, v_quads8
30373+::mc_sync_q10
30374+  m_sync_q 10, v_quads8
30375+::mc_sync_q11
30376+  m_sync_q 11, v_quads8
30377+
30378+# mc_exit()
30379+# Chroma & Luma the same now
30380+
30381+.macro m_exit_qn
30382+  m_exit_drain
30383+  nop                   ; nop           ; thrend
30384+  nop
30385+  nop
30386+# >>> thrend <<<
30387+.endm
30388+
30389+::mc_exit_c_qn
30390+::mc_exit_y_qn
30391+  m_exit_qn
30392+
30393+
30394+
30395+# mc_interrupt_exit12()
30396+
30397+.macro m_exit_q0
30398+  m_exit_drain
30399+  sacq -, 12
30400+  nop                   ; nop           ; thrend
30401+  mov interrupt, 1
30402+  nop
30403+# >>> thrend <<<
30404+.endm
30405+
30406+::mc_exit_c_q0
30407+::mc_exit_y_q0
30408+  m_exit_q0
30409+
30410+# LUMA CODE
30411+
30412+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
30413+# For P frames we make the second x,y coordinates offset by +8
30414+
30415+
30416+################################################################################
30417+# mc_setup
30418+#
30419+# typedef struct qpu_mc_pred_y_s_s {
30420+#    qpu_mc_src_t next_src1;
30421+#    qpu_mc_src_t next_src2;
30422+#    uint16_t pic_h;
30423+#    uint16_t pic_w;
30424+#    uint32_t stride2;
30425+#    uint32_t stride1;
30426+#    uint32_t wdenom;
30427+#    uint32_t next_fn;
30428+# } qpu_mc_pred_y_s_t;
30429+
30430+.macro m_setup_y, v_bit_depth
30431+
30432+# Cannot use mul24 on x as x might be -ve, so must use shift
30433+.if v_bit_depth <= 8
30434+.set v_x_shift,         0
30435+.set v_pmask,           0xff
30436+.set v_blk_height,      Y_BLK_HEIGHT_8
30437+.else
30438+.set v_x_shift,         1
30439+.set v_pmask,           0xffff
30440+.set v_blk_height,      Y_BLK_HEIGHT_16
30441+.endif
30442+
30443+
30444+  # Need to save these because we need to know the frame dimensions before computing texture coordinates
30445+  mov tmurs, 1                  ; mov ra0, unif                 # No TMU swap ; x_y
30446+  mov ra9, unif                                                 # ref_y_base
30447+  mov ra1, unif                                                 # x2_y2
30448+
30449+
30450+# load constants
30451+  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
30452+  shl rb_ef, r0, i_shift30      ; mov ra11, unif                # ; ref_y2_base
30453+
30454+  mov ra_kff800100, 0xff800100
30455+  mov rb_pmask, v_pmask
30456+  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
30457+  mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
30458+  mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
30459+  mov rb_y_coeffs_2, 0x050b0a00
30460+  mov rb_y_coeffs_3, 0x11283a40
30461+  mov rb_y_coeffs_5, 0x0a0b0500
30462+
30463+# Compute part of VPM to use
30464+
30465+# Read image dimensions
30466+  mov ra3, unif                                                 # width_height
30467+  mov ra_ef, rb_ef              ; mov rb_xpitch, unif           # [ra3 delay] ; stride2
30468+.if v_x_shift == 0
30469+  sub rb_max_x, ra3.16b, 1
30470+.else
30471+  sub r0, ra3.16b, 1
30472+  shl rb_max_x, r0, v_x_shift
30473+.endif
30474+  sub rb_max_y, ra3.16a, 1
30475+  mov r3, elem_num              ; mov rb_pitch, unif            # stride1
30476+
30477+# get destination pitch
30478+  mov r1, vdw_setup_1(0)                                        # [rb_pitch delay]
30479+  or  rb_dma1_base, r1, rb_pitch
30480+
30481+# Compute base address for first and second access
30482+  add r0, ra0.16b, r3                                           # Load x + elem_num
30483+.if v_x_shift != 0
30484+  shl r0, r0, v_x_shift
30485+.endif
30486+  max r0, r0, 0
30487+  min r0, r0, rb_max_x
30488+  shl ra_xshift_next, r0, 3                                     # Compute shifts
30489+
30490+# X is byte offset - we can only load words - mask
30491+
30492+  and r0, r0, -4                ; v8subs r2, r2, r2
30493+  sub r2, r2, rb_pitch
30494+  and r1, r0, r2
30495+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30496+  add r0, r0, r1                                                # Add stripe offsets
30497+  add ra_base, ra9, r0
30498+
30499+  # r3 still contains elem_num
30500+  add r0, ra1.16b, r3                                           # Load x
30501+.if v_x_shift != 0
30502+  shl r0, r0, v_x_shift
30503+.endif
30504+  max r0, r0, 0
30505+  min r0, r0, rb_max_x
30506+  shl rb_xshift2_next, r0, 3                                    # Compute shifts
30507+
30508+  # r2 still contains mask
30509+  and r0, r0, -4
30510+  and r1, r0, r2
30511+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30512+  add r0, r0, r1                                                # Add stripe offsets
30513+  add rb_base2, ra11, r0
30514+
30515+# Do preloads
30516+  nop                           ; mov r0, ra0.16a               # ; r0 = y
30517+  mov r3, PREREAD               ; mov r2, ra1.16a               # ; r2 = y2
30518+
30519+:1
30520+  sub.setf r3, r3, 1
30521+  max r1, r0, 0
30522+  min r1, r1, rb_max_y
30523+  add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
30524+  add t0s, ra_base, r1          ; mov ra_y, r0
30525+
30526+  max r1, r2, 0
30527+  brr.anynz -, r:1b
30528+  min r1, r1, rb_max_y
30529+  add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
30530+  add t1s, rb_base2, r1         ; mov ra_y2, r2
30531+# >>> .anynz 1b
30532+
30533+  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
30534+
30535+  mov ra_link, unif                                             # Next fn
30536+
30537+# touch vertical context to keep simulator happy
30538+  mov ra8,  0                   ; mov rb8,  0                   # [ra_link delay]
30539+  bra -, ra_link
30540+  mov ra9,  0                   ; mov rb9,  0
30541+  mov ra10, 0                   ; mov rb10, 0
30542+  mov ra11, 0                   ; mov rb11, 0
30543+# >>> ra_link
30544+.endm
30545+
30546+::mc_setup_y_q0
30547+  m_setup_q0
30548+::mc_setup_y_qn
30549+  m_setup_y 8
30550+
30551+################################################################################
30552+#
30553+# Start of per-block setup code
30554+# P and B blocks share the same setup code to save on Icache space
30555+
30556+# get base addresses and per-channel shifts for *next* invocation
30557+# per-channel shifts were calculated on the *previous* invocation
30558+
30559+# 1st 3 instructions of per_block-setup in branch delay
30560+#
30561+# typedef struct qpu_mc_pred_y_p_s {
30562+#    qpu_mc_src_t next_src1;
30563+#    qpu_mc_src_t next_src2;
30564+#    uint16_t h;
30565+#    uint16_t w;
30566+#    uint32_t mymx21;
30567+#    uint32_t wo1;
30568+#    uint32_t wo2;
30569+#    uint32_t dst_addr;
30570+#    uint32_t next_fn;
30571+# } qpu_mc_pred_y_p_t;
30572+#
30573+
30574+.macro m_luma_setup, v_bit_depth
30575+# Hack - QASM may well have label pasting but I have no idea how...
30576+.if v_bit_depth == 8
30577+  brr ra_link, r:per_block_setup_8
30578+.elif v_bit_depth == 10
30579+  brr ra_link, r:per_block_setup_10
30580+.endif
30581+  mov ra0, unif                 ; mov r3, elem_num              # y_x ; elem_num has implicit unpack??
30582+  add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2          # [ra0 delay] ; r5 = 0
30583+  add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
30584+.endm
30585+
30586+.macro m_per_block_setup, v_bit_depth
30587+
30588+.if v_bit_depth <= 8
30589+.set v_x_shift,         0
30590+.set v_x_mul,           1
30591+# Shifts to get width & height in the right place in ra_dma0
30592+.set v_dma_h_shift,     7
30593+.set v_dma_wh_shift,    i_shift16
30594+.else
30595+.set v_x_shift,         1
30596+.set v_x_mul,           2
30597+# Shifts to get width & height in the right place in ra_dma0
30598+.set v_dma_h_shift,     8
30599+.set v_dma_wh_shift,    15
30600+.endif
30601+
30602+.if v_x_shift != 0
30603+  shl r0, r0, v_x_shift
30604+.endif
30605+  max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
30606+  min r0, r0, rb_max_x
30607+
30608+  shl ra_xshift_next, r0, 3                                     # Compute shifts
30609+  and r0, r0, -4
30610+  sub r2, r5, rb_pitch          ; mov ra_base_next, unif        # ; src1.base
30611+  and r1, r0, r2                ; mov ra_y_next, ra0.16a
30612+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30613+  add r0, r0, r1                ; mov ra1, unif                 # Add stripe offsets ; src2.x_y
30614+  add ra_base_next, ra_base_next, r0                            # [ra1 delay]
30615+
30616+  add r0, ra1.16b, r3                                           # Load x2
30617+.if v_x_shift != 0
30618+  shl r0, r0, v_x_shift
30619+.endif
30620+  max r0, r0, r5                ; mov ra_y2_next, ra1.16a
30621+  min r0, r0, rb_max_x          ; mov rb_base2_next, unif       # ; src2.base
30622+  shl rb_xshift2_next, r0, 3                                    # Compute shifts
30623+  and r0, r0, -4                ; mov ra_width_height, unif     # ; width_height
30624+  and r1, r0, r2                ; mov vw_setup, rb_vpm_init     # ; set up VPM write
30625+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30626+  add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul   # Add stripe offsets ; r1 = x in bytes
30627+  add rb_base2_next, rb_base2_next, r0
30628+
30629+# get width,height of block (unif load above), r1 = width * pel_size
30630+  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height             # Compute vdw_setup1(dst_pitch-width)
30631+  add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
30632+  add rb_lcount, r0, (7-8)
30633+  shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add           # ; r3 return val
30634+  add r0, r0, r1                                                # Combine width and height of destination area
30635+  shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val        # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
30636+  add ra_dma0, r0, rb_dma0_base ; mov r0, unif                  # ; Packed filter offsets
30637+
30638+# get filter coefficients and discard unused B frame values
30639+  shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif    #  Pick half to use ; L0 offset/weight
30640+  shl ra8, r0, 3                ; mov rb5, ra_k255
30641+
30642+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
30643+
30644+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
30645+# but I can't see a way of doing that that is cheap enough to be worth it
30646+
30647+# Picked out in a slightly random order to space out uniform loads
30648+
30649+  # 1
30650+  mov r1, 0x01040400            # [ra8 delay]
30651+  ror ra2.8b, r1, ra8.8d
30652+  ror ra0.8b, r1, ra8.8c
30653+  # 2
30654+  ror ra2.8c, rb_y_coeffs_2, ra8.8d
30655+  ror ra0.8c, rb_y_coeffs_2, ra8.8c
30656+  # 0
30657+  mov r1,0x00010100             # -ve  [ra8 delay]
30658+  ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif    # ; L1 Wt/Offset
30659+  ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
30660+  # 7
30661+  shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
30662+  ror r0, r1, ra8.8d            ; mov ra_dest, unif             # ; Destination address
30663+  ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
30664+  # 3
30665+  ror ra2.8d, rb_y_coeffs_3, ra8.8d
30666+  ror ra0.8d, rb_y_coeffs_3, ra8.8c
30667+  # 5
30668+  ror ra3.8b, rb_y_coeffs_5, ra8.8d
30669+  ror ra1.8b, rb_y_coeffs_5, ra8.8c
30670+  # 6
30671+  mov r1,0x04040100
30672+  ror ra3.8c, r1, ra8.8d
30673+  ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8                 # ; r5 return val
30674+
30675+  bra -, ra_link
30676+  # 4
30677+  mov r1,0x3a281100
30678+  ror r0, r1, ra8.8d            ; mov ra_link, unif             # ; link - load after we've used its previous val
30679+  ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
30680+# >>> branch ra_link
30681+
30682+# r5 = -8
30683+# r2 = fir_off_val
30684+# r3 = 128
30685+.endm
30686+
30687+:per_block_setup_8
30688+  m_per_block_setup 8
30689+
30690+
30691+
30692+################################################################################
30693+#
30694+# mc_filter_y_pxx
30695+#
30696+# Setup (& therefore uniform struct) shared with _bxx
30697+# Struct in m_luma_setup
30698+#
30699+# We can have 2 separate P reqs here as long as they mate to generate a
30700+# rectangular output block (i.e. h0 = h1, w0 = 8)
30701+#
30702+# At this point we have already issued PREREAD pairs of texture requests for the current block
30703+
30704+.macro m_filter_y_pxx, v_bit_depth
30705+
30706+# denom shift values
30707+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
30708+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
30709+
30710+  m_luma_setup v_bit_depth
30711+
30712+  shl r1, ra_wt_off_l0, i_wt_den_p5
30713+  add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
30714+  sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
30715+
30716+# retrieve texture results and pick out bytes
30717+# then submit two more texture requests
30718+
30719+# This loop is identical to the B loop from here --->
30720+:1
30721+  add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
30722+
30723+  max r2, ra_y, 0               ; mov r1, 0
30724+  min r2, r2, rb_max_y          ; mov r3, ra_k1
30725+  add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
30726+  add t0s, ra_base, r2          ; mov rb5,  rb6
30727+  shr r0, r4, ra_xshift         ; mov rb6,  rb7
30728+
30729+  max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1 # ; masks out all but wanted bytes
30730+  shr r1, r4, rb_xshift2        ; mov rb7, ra8
30731+  min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
30732+  add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
30733+  add t1s, rb_base2, r2         ; mov ra8,  ra9
30734+
30735+# apply horizontal filter
30736+  add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
30737+  mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
30738+  sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
30739+  nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
30740+  add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
30741+  nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
30742+  sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
30743+  nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
30744+  add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
30745+  nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
30746+  add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
30747+  nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
30748+  sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
30749+  nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
30750+  add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
30751+  add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
30752+
30753+  brr.anyn -, r:1b
30754+  sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
30755+  mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
30756+  asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
30757+  # >>> .anyn 1b (r5 + r5)
30758+
30759+  # apply vertical filter and write to VPM
30760+  # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
30761+
30762+  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
30763+  sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
30764+  add r1, r1, r0                ; mul24 r0, ra8,  rb8
30765+  add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
30766+  add r1, r1, r0                ; mul24 r0, ra11, rb11
30767+# <--- to here
30768+  sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height                 # ; NxtLoop: r3 = block height
30769+  sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
30770+  sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
30771+
30772+  asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
30773+  sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
30774+  add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
30775+  sub r1, r0, r1                ; v8subs r0, ra_height, r3              # ; NxtLoop: r0 = remaining height (0 saturate)
30776+
30777+  brr.anyn -, r:1b
30778+  asr r1, r1, i_wt_den_p6
30779+  min r1, r1, ra_pmax           ; mov -, vw_wait
30780+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch                # ; NxtLoop
30781+# >>> branch.anyn 1b (r5 - rb_lcount)
30782+
30783+# r0 = remaining height (min 0)
30784+# r2 = r3 * rb_pitch
30785+# r3 = block_height
30786+
30787+# If looping again then we consumed 16 height last loop
30788+# rb_dma1 (stride) remains constant
30789+# rb_i_tmu remains const (based on total height)
30790+# recalc ra_dma0, rb_lcount based on new segment height
30791+
30792+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0 # VDW setup 0
30793+
30794+# DMA out
30795+  bra.anyz -, ra_link
30796+  min r0, r0, r3                ; mov vw_setup, rb_dma1 # Stride
30797+  sub r1, r0, r3                ; mov vw_addr, ra_dest  # start the VDW
30798+  shl r1, r1, i_shift23
30799+# >>> .anyz ra_link
30800+
30801+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
30802+# We add to dma0 to reduce the number of output lines in the final block
30803+  brr -, r:1b
30804+  add rb_lcount, rb_lcount, r0
30805+  add ra_dma0, ra_dma0, r1
30806+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
30807+# >>> 1b
30808+.endm
30809+
30810+::mc_filter_y_pxx
30811+  m_filter_y_pxx 8
30812+
30813+
30814+################################################################################
30815+
30816+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
30817+#
30818+# Setup (& therefore uniform struct) shared with _pxx
30819+# Struct in m_luma_setup
30820+#
30821+# l0 calc in els 0-7, L1 in 8-15
30822+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh)
30823+#
30824+# At this point we have already issued PREREAD pairs of texture requests for the current block
30825+
30826+.macro m_filter_y_bxx, v_bit_depth
30827+
30828+# denom shift values
30829+.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
30830+.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
30831+
30832+  m_luma_setup v_bit_depth
30833+
30834+  shl r1, ra_wt_off_l0, i_wt_den_p6
30835+  add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
30836+  sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
30837+  sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
30838+
30839+# This loop is identical to the P loop from here --->
30840+:1
30841+  add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
30842+
30843+  max r2, ra_y, 0               ; mov r1, 0
30844+  min r2, r2, rb_max_y          ; mov r3, ra_k1
30845+  add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
30846+  add t0s, ra_base, r2          ; mov rb5,  rb6
30847+  shr r0, r4, ra_xshift         ; mov rb6,  rb7
30848+
30849+  max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1 # ; masks out all but wanted bytes
30850+  shr r1, r4, rb_xshift2        ; mov rb7, ra8
30851+  min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
30852+  add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
30853+  add t1s, rb_base2, r2         ; mov ra8,  ra9
30854+
30855+# apply horizontal filter
30856+  add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
30857+  mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
30858+  sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
30859+  nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
30860+  add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
30861+  nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
30862+  sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
30863+  nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
30864+  add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
30865+  nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
30866+  add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
30867+  nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
30868+  sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
30869+  nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
30870+  add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
30871+  add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
30872+
30873+  brr.anyn -, r:1b
30874+  sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
30875+  mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
30876+  asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
30877+  # >>> .anyn 1b (r5 + r5)
30878+
30879+  # apply vertical filter and write to VPM
30880+  # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
30881+
30882+  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
30883+  sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
30884+  add r1, r1, r0                ; mul24 r0, ra8,  rb8
30885+  add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
30886+  add r1, r1, r0                ; mul24 r0, ra11, rb11
30887+# <--- to here
30888+  sub r1, r1, ra4
30889+  sub r1, r1, r0                ; mov r2, rb_wt_off
30890+
30891+  asr r1, r1, 6
30892+  sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
30893+  mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
30894+  sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
30895+  sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
30896+  add r1, r1, r2                ; mov r0, r1 << 8
30897+  add r1, r1, r0                ; mov r3, ra_blk_height         # ; NxtLoop: r3 = block height
30898+
30899+  brr.anyn -, r:1b
30900+  asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch        # ; NxtLoop
30901+  min r1, r1, ra_pmax           ; mov -, vw_wait
30902+  max vpm, r1, 0                ; v8subs r0, ra_height, r3      # ; NxtLoop: r0 = remaining height (0 saturate)
30903+# >>> branch.anyn 1b (r5 - rb_lcount)
30904+
30905+# r0 = remaining height (min 0)
30906+# r2 = r3 * rb_pitch
30907+# r3 = block_height
30908+
30909+# If looping again then we consumed block_height last loop
30910+# rb_dma1 (stride) remains constant
30911+# rb_i_tmu remains const (based on total height)
30912+# recalc ra_dma0, rb_lcount based on new segment height
30913+
30914+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # VDW setup 0
30915+
30916+# DMA out
30917+  bra.anyz -, ra_link
30918+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # Stride
30919+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # start the VDW
30920+  shl r1, r1, i_shift23
30921+# >>> .anyz ra_link (ra_height - remaining height)
30922+
30923+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
30924+# We add to dma0 to reduce the number of output lines in the final block
30925+  brr -, r:1b
30926+  add rb_lcount, rb_lcount, r0
30927+  add ra_dma0, ra_dma0, r1
30928+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
30929+# >>> 1b
30930+.endm
30931+
30932+::mc_filter_y_bxx
30933+  m_filter_y_bxx 8
30934+
30935+################################################################################
30936+#
30937+# typedef struct qpu_mc_pred_y_p00_s {
30938+#    qpu_mc_src_t next_src1;
30939+#    uint16_t h;
30940+#    uint16_t w;
30941+#    uint32_t wo1;
30942+#    uint32_t dst_addr;
30943+#    uint32_t next_fn;
30944+# } qpu_mc_pred_y_p00_t;
30945+
30946+.macro m_filter_y_p00, v_bit_depth
30947+
30948+.if v_bit_depth <= 8
30949+.set v_x_shift,         0
30950+.set v_x_mul,           1
30951+# Shifts to get width & height in the right place in ra_dma0
30952+.set v_dma_h_shift,     7
30953+.set v_dma_wh_shift,    i_shift16
30954+.else
30955+.set v_x_shift,         1
30956+.set v_x_mul,           2
30957+# Shifts to get width & height in the right place in ra_dma0
30958+.set v_dma_h_shift,     8
30959+.set v_dma_wh_shift,    15
30960+.endif
30961+
30962+  mov ra0, unif                 ; mov r0, elem_num              # y_x
30963+  mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5          # [ra0 delay] ; r5 = 0
30964+  add r0, ra0.16b, r0           ; mov ra_base_next, unif        # ; src1.base
30965+.if v_x_shift != 0
30966+  shl r0, r0, v_x_shift
30967+.endif
30968+
30969+  max r0, r0, r5                ; mov ra_y_next, ra0.16a        # ; width_height
30970+  min r0, r0, rb_max_x          ; mov ra_width_height, unif
30971+
30972+  shl ra_xshift_next, r0, 3                                     # Compute shifts
30973+  and r0, r0, -4
30974+  sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif    # ; weight_offset
30975+  and r1, r0, r2
30976+  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
30977+  add r0, r0, r1                ; mov ra_dest, unif             # Add stripe offsets ; dest addr
30978+  add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init  # [ra_width delay] ; set up VPM write
30979+
30980+# get width,height of block (unif load above)
30981+# Compute vdw_setup1(dst_pitch-width)
30982+  shl r1, ra_width, v_x_shift
30983+  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
30984+  sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
30985+  shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
30986+  add r0, r0, r1                                                # Combine width and height of destination area
30987+  shl rb_wt_off, ra_wt_off_l0, DENOM + 7
30988+  shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif             # Shift into bits 16 upwards of the vdw_setup0 register ; link
30989+  add ra_dma0, r0, rb_dma0_base
30990+
30991+:1
30992+  sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
30993+  nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
30994+  shr r0, r4, ra_xshift         ; mov r3, rb_pitch
30995+
30996+  max r2, ra_y, 0  # y
30997+  min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
30998+  add ra_y, ra_y, 1             ; mul24 r2, r2, r3
30999+  add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
31000+
31001+  sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
31002+  shl r1, r1, 8                 ; mov r3, ra_blk_height
31003+  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
31004+
31005+  brr.anyn -, r:1b
31006+  asr r1, r1, DENOM + 8
31007+  min r1, r1, ra_pmax           ; mov -, vw_wait
31008+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
31009+# >>> branch.anyn 1b
31010+
31011+# r0 = remaining height (min 0)
31012+# r2 = r3 * rb_pitch
31013+# r3 = block_height
31014+
31015+# If looping again then we consumed 16 height last loop
31016+# rb_dma1 (stride) remains constant
31017+# rb_i_tmu remains const (based on total height)
31018+# recalc ra_dma0, rb_lcount based on new segment height
31019+
31020+  mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
31021+
31022+# DMA out
31023+  bra.anyz -, ra_link
31024+  min r0, r0, r3        ; mov vw_setup, rb_dma1 # Stride
31025+  sub r1, r0, r3        ; mov vw_addr, ra_dest  # start the VDW
31026+  shl r1, r1, i_shift23
31027+# >>> .anyz ra_link
31028+
31029+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
31030+# We add to dma0 to reduce the number of output lines in the final block
31031+  brr -, r:1b
31032+  add rb_lcount, rb_lcount, r0
31033+  add ra_dma0, ra_dma0, r1
31034+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
31035+# >>> 1b
31036+.endm
31037+
31038+::mc_filter_y_p00
31039+  m_filter_y_p00 8
31040+
31041+################################################################################
31042+
31043+.macro m_filter_y_b00, v_bit_depth
31044+# luma setup does a fair bit more than we need calculating filter coeffs
31045+# that we will never use but it saves I-cache to use it (also simple!)
31046+  m_luma_setup v_bit_depth
31047+
31048+# Fix up vals that were expecting a filter (somewhat icky)
31049+  mov r2, 1
31050+  add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0      # Need in rX rather than raX for <<8 to do what we want
31051+  shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
31052+  nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
31053+
31054+:1
31055+  sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
31056+  shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
31057+  shr r0, r4, ra_xshift         ; mov r3, rb_pitch
31058+
31059+  max r2, ra_y, 0  # y
31060+  min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
31061+  add ra_y, ra_y, 1             ; mul24 r2, r2, r3
31062+  add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
31063+
31064+  max r2, ra_y2, 0
31065+  min r2, r2, rb_max_y
31066+  add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
31067+  add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax         # v8min masks out all but bottom byte
31068+  and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
31069+
31070+  sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
31071+  add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
31072+
31073+  shl r1, r1, 8                 ; mov r3, ra_blk_height
31074+  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
31075+
31076+  brr.anyn -, r:1b
31077+  asr r1, r1, (DENOM + 9) - 32                                  # -32 to get valid shift immediate
31078+  min r1, r1, ra_pmax           ; mov -, vw_wait
31079+  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
31080+# >>> branch.anyn 1b
31081+
31082+# r0 = remaining height (min 0)
31083+# r2 = r3 * rb_pitch
31084+# r3 = block_height
31085+
31086+# If looping again then we consumed 16 height last loop
31087+# rb_dma1 (stride) remains constant
31088+# rb_i_tmu remains const (based on total height)
31089+# recalc ra_dma0, rb_lcount based on new segment height
31090+
31091+  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # ; VDW setup 0
31092+
31093+# DMA out
31094+  bra.anyz -, ra_link
31095+  min r0, r0, r3                ; mov vw_setup, rb_dma1         # ; Stride
31096+  sub r1, r0, r3                ; mov vw_addr, ra_dest          # ; start the VDW
31097+  shl r1, r1, i_shift23
31098+# >>> .anyz ra_link
31099+
31100+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
31101+# We add to dma0 to reduce the number of output lines in the final block
31102+  brr -, r:1b
31103+  add rb_lcount, rb_lcount, r0
31104+  add ra_dma0, ra_dma0, r1
31105+  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
31106+# >>> 1b
31107+.endm
31108+
31109+::mc_filter_y_b00
31110+  m_filter_y_b00 8
31111+
31112+################################################################################
31113+################################################################################
31114+# 10 BIT
31115+
31116+::mc_setup_c10_q0
31117+  m_setup_q0
31118+::mc_setup_c10_qn
31119+  m_setup_c 10
31120+
31121+::mc_filter_c10_p
31122+  m_filter_c_p 0, 10
31123+
31124+::mc_filter_c10_p_l1
31125+  m_filter_c_p 1, 10
31126+
31127+
31128+::mc_filter_c10_b
31129+  m_filter_c_b 10
31130+
31131+# Even if these fns are the same as for other bit depths we want our own copy
31132+# to keep the code we are using in a single lump to avoid (direct map) cache
31133+# thrashing
31134+.set v_quads10, N_QPU_16 / 4
31135+
31136+::mc_sync10_q0
31137+  m_sync_q 0, v_quads10
31138+::mc_sync10_q1
31139+  m_sync_q 1, v_quads10
31140+::mc_sync10_q2
31141+  m_sync_q 2, v_quads10
31142+::mc_sync10_q3
31143+  m_sync_q 3, v_quads10
31144+::mc_sync10_q4
31145+  m_sync_q 4, v_quads10
31146+::mc_sync10_q5
31147+  m_sync_q 5, v_quads10
31148+::mc_sync10_q6
31149+  m_sync_q 6, v_quads10
31150+::mc_sync10_q7
31151+  m_sync_q 7, v_quads10
31152+::mc_sync10_q8
31153+  m_sync_q 8, v_quads10
31154+::mc_sync10_q9
31155+  m_sync_q 9, v_quads10
31156+::mc_sync10_q10
31157+  m_sync_q 10, v_quads10
31158+::mc_sync10_q11
31159+  m_sync_q 11, v_quads10
31160+
31161+::mc_exit_y10_q0
31162+::mc_exit_c10_q0
31163+  m_exit_q0
31164+
31165+::mc_exit_y10_qn
31166+::mc_exit_c10_qn
31167+  m_exit_qn
31168+
31169+::mc_setup_y10_q0
31170+  m_setup_q0
31171+::mc_setup_y10_qn
31172+  m_setup_y 10
31173+
31174+:per_block_setup_10
31175+  m_per_block_setup 10
31176+
31177+::mc_filter_y10_pxx
31178+  m_filter_y_pxx 10
31179+
31180+::mc_filter_y10_p00
31181+  m_filter_y_p00 10
31182+
31183+::mc_filter_y10_bxx
31184+  m_filter_y_bxx 10
31185+
31186+::mc_filter_y10_b00
31187+  m_filter_y_b00 10
31188+
31189+
31190+
31191+::mc_end
31192+# Do not add code here because mc_end must appear after all other code.
31193--- /dev/null
31194+++ b/libavcodec/rpi_hevc_shader_cmd.h
31195@@ -0,0 +1,165 @@
31196+/*
31197+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
31198+All rights reserved.
31199+
31200+Redistribution and use in source and binary forms, with or without
31201+modification, are permitted provided that the following conditions are met:
31202+    * Redistributions of source code must retain the above copyright
31203+      notice, this list of conditions and the following disclaimer.
31204+    * Redistributions in binary form must reproduce the above copyright
31205+      notice, this list of conditions and the following disclaimer in the
31206+      documentation and/or other materials provided with the distribution.
31207+    * Neither the name of the copyright holder nor the
31208+      names of its contributors may be used to endorse or promote products
31209+      derived from this software without specific prior written permission.
31210+
31211+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
31212+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31213+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31214+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
31215+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31216+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31217+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
31218+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31219+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31220+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31221+*/
31222+
31223+#ifndef RPI_SHADER_CMD_H
31224+#define RPI_SHADER_CMD_H
31225+
31226+#pragma pack(push, 4)
31227+
31228+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
31229+// If mixed then we are just confused and get a lot of warnings....
31230+typedef const uint8_t * qpu_mc_src_addr_t;
31231+typedef uint8_t * qpu_mc_dst_addr_t;
31232+#else
31233+typedef uint32_t qpu_mc_src_addr_t;
31234+typedef uint32_t qpu_mc_dst_addr_t;
31235+#endif
31236+
31237+typedef struct qpu_mc_src_s
31238+{
31239+    int16_t y;
31240+    int16_t x;
31241+    qpu_mc_src_addr_t base;
31242+} qpu_mc_src_t;
31243+
31244+
31245+typedef struct qpu_mc_pred_c_p_s {
31246+    qpu_mc_src_t next_src;
31247+    uint16_t h;
31248+    uint16_t w;
31249+    uint32_t coeffs_x;
31250+    uint32_t coeffs_y;
31251+    uint32_t wo_u;
31252+    uint32_t wo_v;
31253+    qpu_mc_dst_addr_t dst_addr_c;
31254+    uint32_t next_fn;
31255+} qpu_mc_pred_c_p_t;
31256+
31257+typedef struct qpu_mc_pred_c_b_s {
31258+    qpu_mc_src_t next_src1;
31259+    uint16_t h;
31260+    uint16_t w;
31261+    uint32_t coeffs_x1;
31262+    uint32_t coeffs_y1;
31263+    int16_t weight_u1;
31264+    int16_t weight_v1;
31265+    qpu_mc_src_t next_src2;
31266+    uint32_t coeffs_x2;
31267+    uint32_t coeffs_y2;
31268+    uint32_t wo_u2;
31269+    uint32_t wo_v2;
31270+    qpu_mc_dst_addr_t dst_addr_c;
31271+    uint32_t next_fn;
31272+} qpu_mc_pred_c_b_t;
31273+
31274+typedef struct qpu_mc_pred_c_s_s {
31275+    qpu_mc_src_t next_src1;
31276+    uint32_t pic_cw;            // C Width (== Y width / 2)
31277+    uint32_t pic_ch;            // C Height (== Y Height / 2)
31278+    uint32_t stride2;
31279+    uint32_t stride1;
31280+    qpu_mc_src_t next_src2;
31281+    uint32_t next_fn;
31282+} qpu_mc_pred_c_s_t;
31283+
31284+typedef struct qpu_mc_pred_c_s {
31285+    union {
31286+        qpu_mc_pred_c_p_t p;
31287+        qpu_mc_pred_c_b_t b;
31288+        qpu_mc_pred_c_s_t s;
31289+    };
31290+} qpu_mc_pred_c_t;
31291+
31292+
31293+typedef struct qpu_mc_pred_y_p_s {
31294+    qpu_mc_src_t next_src1;
31295+    qpu_mc_src_t next_src2;
31296+    uint16_t h;
31297+    uint16_t w;
31298+    uint32_t mymx21;
31299+    uint32_t wo1;
31300+    uint32_t wo2;
31301+    qpu_mc_dst_addr_t dst_addr;
31302+    uint32_t next_fn;
31303+} qpu_mc_pred_y_p_t;
31304+
31305+typedef struct qpu_mc_pred_y_p00_s {
31306+    qpu_mc_src_t next_src1;
31307+    uint16_t h;
31308+    uint16_t w;
31309+    uint32_t wo1;
31310+    qpu_mc_dst_addr_t dst_addr;
31311+    uint32_t next_fn;
31312+} qpu_mc_pred_y_p00_t;
31313+
31314+typedef struct qpu_mc_pred_y_s_s {
31315+    qpu_mc_src_t next_src1;
31316+    qpu_mc_src_t next_src2;
31317+    uint16_t pic_h;
31318+    uint16_t pic_w;
31319+    uint32_t stride2;
31320+    uint32_t stride1;
31321+    uint32_t next_fn;
31322+} qpu_mc_pred_y_s_t;
31323+
31324+typedef struct qpu_mc_pred_sync_s {
31325+    uint32_t next_fn;
31326+} qpu_mc_pred_sync_t;
31327+
31328+// Only a useful structure in that it allows us to return something other than a void *
31329+typedef struct qpu_mc_pred_y_s {
31330+    union {
31331+        qpu_mc_pred_y_p_t p;
31332+        qpu_mc_pred_y_p00_t p00;
31333+        qpu_mc_pred_y_s_t s;
31334+    };
31335+} qpu_mc_pred_y_t;
31336+
31337+typedef union qpu_mc_pred_cmd_u {
31338+    qpu_mc_pred_y_t y;
31339+    qpu_mc_pred_c_t c;
31340+    qpu_mc_pred_sync_t sync;
31341+} qpu_mc_pred_cmd_t;
31342+
31343+static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn)
31344+{
31345+    // Link is last el of previous cmd
31346+    ((uint32_t *)cmd)[-1] = fn;
31347+}
31348+
31349+#define QPU_MC_PRED_N_Y8        12
31350+#define QPU_MC_PRED_N_C8        12
31351+
31352+#define QPU_MC_PRED_N_Y10       12
31353+#define QPU_MC_PRED_N_C10       12
31354+
31355+#define QPU_MC_DENOM            7
31356+
31357+#pragma pack(pop)
31358+
31359+#endif
31360+
31361--- /dev/null
31362+++ b/libavcodec/rpi_hevc_shader_template.c
31363@@ -0,0 +1,88 @@
31364+/*
31365+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
31366+All rights reserved.
31367+
31368+Redistribution and use in source and binary forms, with or without
31369+modification, are permitted provided that the following conditions are met:
31370+    * Redistributions of source code must retain the above copyright
31371+      notice, this list of conditions and the following disclaimer.
31372+    * Redistributions in binary form must reproduce the above copyright
31373+      notice, this list of conditions and the following disclaimer in the
31374+      documentation and/or other materials provided with the distribution.
31375+    * Neither the name of the copyright holder nor the
31376+      names of its contributors may be used to endorse or promote products
31377+      derived from this software without specific prior written permission.
31378+
31379+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
31380+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31381+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31382+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
31383+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31384+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31385+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
31386+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31387+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31388+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31389+*/
31390+
31391+#include "hevc.h"
31392+#include "rpi_hevcdec.h"
31393+#include "libavutil/rpi_sand_fns.h"
31394+#include "rpi_hevc_shader_cmd.h"
31395+#include "rpi_hevc_shader_template.h"
31396+
31397+typedef struct shader_track_s
31398+{
31399+    const union qpu_mc_pred_cmd_u *qpu_mc_curr;
31400+    const struct qpu_mc_src_s *last_l0;
31401+    const struct qpu_mc_src_s *last_l1;
31402+    uint32_t width;  // pic_width * PW
31403+    uint32_t height;
31404+    uint32_t stride2;
31405+    uint32_t stride1;
31406+} shader_track_t;
31407+
31408+static int wtoidx(const unsigned int w)
31409+{
31410+    static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
31411+    return pel_weight[w];
31412+}
31413+
31414+static int fctom(uint32_t x)
31415+{
31416+    int rv;
31417+    // As it happens we can take the 2nd filter term & divide it by 8
31418+    // (dropping fractions) to get the fractional move
31419+    rv = 8 - ((x >> 11) & 0xf);
31420+    av_assert2(rv >= 0 && rv <= 7);
31421+    return rv;
31422+}
31423+
31424+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
31425+{
31426+    return (int32_t)((uint32_t)x << shl) >> shr;  // shift left as unsigned: left-shifting a negative int is UB; arithmetic >> (sign-extend) is the intent
31427+}
31428+
31429+static inline int woff_p(HEVCRpiContext *const s, int32_t x)
31430+{
31431+    return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
31432+}
31433+
31434+static inline int woff_b(HEVCRpiContext *const s, int32_t x)
31435+{
31436+    return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
31437+}
31438+
31439+static inline int wweight(int32_t x)
31440+{
31441+    return ext(x, 16, 16);
31442+}
31443+
31444+
31445+#define PW 1
31446+#include "rpi_hevc_shader_template_fn.h"
31447+
31448+#undef PW
31449+#define PW 2
31450+#include "rpi_hevc_shader_template_fn.h"
31451+
31452--- /dev/null
31453+++ b/libavcodec/rpi_hevc_shader_template.h
31454@@ -0,0 +1,49 @@
31455+/*
31456+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
31457+All rights reserved.
31458+
31459+Redistribution and use in source and binary forms, with or without
31460+modification, are permitted provided that the following conditions are met:
31461+    * Redistributions of source code must retain the above copyright
31462+      notice, this list of conditions and the following disclaimer.
31463+    * Redistributions in binary form must reproduce the above copyright
31464+      notice, this list of conditions and the following disclaimer in the
31465+      documentation and/or other materials provided with the distribution.
31466+    * Neither the name of the copyright holder nor the
31467+      names of its contributors may be used to endorse or promote products
31468+      derived from this software without specific prior written permission.
31469+
31470+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
31471+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31472+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31473+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
31474+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31475+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31476+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
31477+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31478+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31479+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31480+*/
31481+
31482+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
31483+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
31484+
31485+struct HEVCRpiContext;
31486+struct HEVCRpiInterPredEnv;
31487+
31488+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
31489+                  const struct HEVCRpiInterPredEnv *const ipe_y,
31490+                  const struct HEVCRpiInterPredEnv *const ipe_c);
31491+
31492+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
31493+                  const struct HEVCRpiInterPredEnv *const ipe_y,
31494+                  const struct HEVCRpiInterPredEnv *const ipe_c);
31495+
31496+void rpi_sand_dump8(const char * const name,
31497+                    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
31498+
31499+void rpi_sand_dump16(const char * const name,
31500+                     const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
31501+
31502+#endif
31503+
31504--- /dev/null
31505+++ b/libavcodec/rpi_hevc_shader_template_fn.h
31506@@ -0,0 +1,502 @@
31507+/*
31508+Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
31509+All rights reserved.
31510+
31511+Redistribution and use in source and binary forms, with or without
31512+modification, are permitted provided that the following conditions are met:
31513+    * Redistributions of source code must retain the above copyright
31514+      notice, this list of conditions and the following disclaimer.
31515+    * Redistributions in binary form must reproduce the above copyright
31516+      notice, this list of conditions and the following disclaimer in the
31517+      documentation and/or other materials provided with the distribution.
31518+    * Neither the name of the copyright holder nor the
31519+      names of its contributors may be used to endorse or promote products
31520+      derived from this software without specific prior written permission.
31521+
31522+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
31523+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31524+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31525+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
31526+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31527+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31528+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
31529+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31530+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31531+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31532+*/
31533+
31534+#define STRCAT(x,y) x##y
31535+
31536+#if PW == 1
31537+#define pixel uint8_t
31538+#define FUNC(f) STRCAT(f, 8)
31539+#elif PW == 2
31540+#define pixel uint16_t
31541+#define FUNC(f) STRCAT(f, 16)
31542+#else
31543+#error Unexpected PW
31544+#endif
31545+
31546+#define PATCH_STRIDE (16 * PW)
31547+
31548+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
31549+{
31550+    for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
31551+        const pixel s = *(const pixel *)src;
31552+        pixel * d = (pixel *)dst;
31553+        for (unsigned int j = 0; j < w; j += PW) {
31554+            *d++ = s;
31555+        }
31556+    }
31557+}
31558+
31559+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
31560+{
31561+    for (unsigned int i = 0; i != h; ++i, dst += stride) {
31562+        memcpy(dst, src, w);
31563+    }
31564+}
31565+
31566+static void FUNC(get_patch_y)(const shader_track_t * const st,
31567+                         uint8_t * dst, const unsigned int dst_stride,
31568+                         const qpu_mc_src_t *src,
31569+                         unsigned int _w, unsigned int _h)
31570+{
31571+    int x = src->x * PW;
31572+    int y = src->y;
31573+    int w = _w * PW;
31574+    int h = _h;
31575+    int dl = 0;
31576+    int dr = 0;
31577+    int dt = 0;
31578+    int db = 0;
31579+
31580+    if (x < 0) {
31581+        if (-x >= w)
31582+            x = PW - w;
31583+        dl = -x;
31584+        w += x;
31585+        x = 0;
31586+    }
31587+    if (x + w > st->width) {
31588+        if (x >= st->width)
31589+            x = st->width - PW;
31590+        dr = (x + w) - st->width;
31591+        w = st->width - x;
31592+    }
31593+
31594+    // Y
31595+    if (y < 0) {
31596+        if (-y >= h)
31597+            y = 1 - h;
31598+        dt = -y;
31599+        h += y;
31600+        y = 0;
31601+    }
31602+    if (y + h > st->height) {
31603+        if (y >= st->height)
31604+            y = st->height - 1;
31605+        db = (y + h) - st->height;
31606+        h = st->height - y;
31607+    }
31608+
31609+    dst += dl + dt * dst_stride;
31610+    FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
31611+
31612+    // Edge dup
31613+    if (dl != 0)
31614+        FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
31615+    if (dr != 0)
31616+        FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
31617+    w += dl + dr;
31618+    dst -= dl;
31619+
31620+    if (dt != 0)
31621+        FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
31622+    if (db != 0)
31623+        FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
31624+}
31625+
31626+
31627+
31628+static void FUNC(get_patch_c)(const shader_track_t * const st,
31629+                         uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
31630+                         const qpu_mc_src_t *src,
31631+                         unsigned int _w, unsigned int _h)
31632+{
31633+    int x = src->x * PW;
31634+    int y = src->y;
31635+    int w = _w * PW;
31636+    int h = _h;
31637+    int dl = 0;
31638+    int dr = 0;
31639+    int dt = 0;
31640+    int db = 0;
31641+    const int width = st->width;
31642+    const int height = st->height;
31643+
31644+    if (x < 0) {
31645+        if (-x >= w)
31646+            x = PW - w;
31647+        dl = -x;
31648+        w += x;
31649+        x = 0;
31650+    }
31651+    if (x + w > width) {
31652+        if (x >= width)
31653+            x = width - PW;
31654+        dr = (x + w) - width;
31655+        w = width - x;
31656+    }
31657+
31658+    // Y
31659+    if (y < 0) {
31660+        if (-y >= h)
31661+            y = 1 - h;
31662+        dt = -y;
31663+        h += y;
31664+        y = 0;
31665+    }
31666+    if (y + h > height) {
31667+        if (y >= height)
31668+            y = height - 1;
31669+        db = (y + h) - height;
31670+        h = height - y;
31671+    }
31672+
31673+    dst_u += dl + dt * dst_stride;
31674+    dst_v += dl + dt * dst_stride;
31675+    FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
31676+
31677+    // Edge dup
31678+    if (dl != 0)
31679+    {
31680+        FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
31681+        FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
31682+    }
31683+    if (dr != 0)
31684+    {
31685+        FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
31686+        FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
31687+    }
31688+    w += dl + dr;
31689+    dst_u -= dl;
31690+    dst_v -= dl;
31691+
31692+    if (dt != 0)
31693+    {
31694+        FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
31695+        FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
31696+    }
31697+    if (db != 0)
31698+    {
31699+        FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
31700+        FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
31701+    }
31702+}
31703+
31704+// x, y, w, h in pixels
31705+// stride1, stride2 in bytes
31706+void FUNC(rpi_sand_dump)(const char * const name,
31707+                         const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
31708+{
31709+    const int mask = stride2 == 0 ? ~0 : stride1 - 1;
31710+
31711+    printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
31712+
31713+    if (is_c) {
31714+        x *= 2;
31715+        w *= 2;
31716+    }
31717+
31718+    for (int i = y; i != y + h; ++i) {
31719+        for (int j = x; j != x + w; ++j) {
31720+            const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
31721+            char sep = is_c && (j & 1) == 0 ? ':' : ' ';
31722+#if PW == 1
31723+            if (j < 0 || i < 0)
31724+                printf("..%c", sep);
31725+            else
31726+                printf("%02x%c", *(const pixel*)p, sep);
31727+#else
31728+            if (j < 0 || i < 0)
31729+                printf("...%c", sep);
31730+            else
31731+                printf("%03x%c", *(const pixel*)p, sep);
31732+#endif
31733+        }
31734+        printf("\n");
31735+    }
31736+}
31737+
31738+
31739+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
31740+                  const HEVCRpiInterPredEnv *const ipe_y,
31741+                  const HEVCRpiInterPredEnv *const ipe_c)
31742+{
31743+    for (int c_idx = 0; c_idx < 2; ++c_idx)
31744+    {
31745+        const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
31746+        shader_track_t tracka[QPU_N_MAX] = {{NULL}};
31747+        unsigned int exit_n = 0;
31748+
31749+        if (ipe == NULL || !ipe->used) {
31750+            continue;
31751+        }
31752+
31753+        do {
31754+            for (unsigned int i = 0; i != ipe->n; ++i) {
31755+                const HEVCRpiInterPredQ * const q = ipe->q + i;
31756+                shader_track_t * const st = tracka + i;
31757+                const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
31758+
31759+                for (;;) {
31760+                    const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
31761+
31762+                    if (link == q->code_setup) {
31763+                        if (c_idx == 0) {
31764+                            // Luma
31765+                            const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
31766+
31767+                            st->height = c->pic_h;
31768+                            st->width = c->pic_w * PW;
31769+                            st->stride1 = c->stride1;
31770+                            st->stride2 = c->stride2;
31771+                            st->last_l0 = &c->next_src1;
31772+                            st->last_l1 = &c->next_src2;
31773+                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31774+                        }
31775+                        else {
31776+                            // Chroma
31777+                            const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
31778+
31779+                            st->height = c->pic_ch;
31780+                            st->width = c->pic_cw * PW;
31781+                            st->stride1 = c->stride1;
31782+                            st->stride2 = c->stride2;
31783+                            st->last_l0 = &c->next_src1;
31784+                            st->last_l1 = &c->next_src2;
31785+                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31786+                        }
31787+                    }
31788+                    else if (link == s->qpu.y_pxx) {
31789+                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
31790+                        const int w1 = FFMIN(c->w, 8);
31791+                        const int w2 = c->w - w1;
31792+
31793+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31794+                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31795+
31796+                        FUNC(get_patch_y)(st,
31797+                                    patch_y1, PATCH_STRIDE,
31798+                                    st->last_l0,
31799+                                    16, c->h + 7);
31800+                        if (w2 > 0) {
31801+                            FUNC(get_patch_y)(st,
31802+                                        patch_y2, PATCH_STRIDE,
31803+                                        st->last_l1,
31804+                                        16, c->h + 7);
31805+                        }
31806+
31807+                        // wo[offset] = offset*2+1
31808+                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
31809+                            (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
31810+                            c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
31811+                        if (w2 > 0) {
31812+                            s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
31813+                                (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
31814+                                c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
31815+                        }
31816+                        st->last_l0 = &c->next_src1;
31817+                        st->last_l1 = &c->next_src2;
31818+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31819+                    }
31820+                    else if (link == s->qpu.y_bxx) {
31821+                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
31822+
31823+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31824+                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31825+                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
31826+
31827+                        FUNC(get_patch_y)(st,
31828+                                    patch_y1, PATCH_STRIDE,
31829+                                    st->last_l0,
31830+                                    16, c->h + 7);
31831+                        FUNC(get_patch_y)(st,
31832+                                    patch_y2, PATCH_STRIDE,
31833+                                    st->last_l1,
31834+                                    16, c->h + 7);
31835+
31836+                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
31837+                           patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
31838+                           c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
31839+
31840+                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
31841+                            (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
31842+                            c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
31843+                            0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
31844+                        st->last_l0 = &c->next_src1;
31845+                        st->last_l1 = &c->next_src2;
31846+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31847+                    }
31848+                    else if (link == s->qpu.y_p00) {
31849+                        const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
31850+
31851+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31852+
31853+                        FUNC(get_patch_y)(st,
31854+                                    patch_y1, PATCH_STRIDE,
31855+                                    st->last_l0,
31856+                                    16, c->h + 7);
31857+
31858+                        // wo[offset] = offset*2+1
31859+                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
31860+                            (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
31861+                            c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
31862+
31863+                        st->last_l0 = &c->next_src1;
31864+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31865+                    }
31866+                    else if (link == s->qpu.y_b00) {
31867+                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
31868+
31869+                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31870+                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31871+                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
31872+
31873+                        av_assert0(c->w <= 16 && c->h <= 64);
31874+
31875+                        FUNC(get_patch_y)(st,
31876+                                    patch_y1, PATCH_STRIDE,
31877+                                    st->last_l0,
31878+                                    16, c->h);
31879+                        FUNC(get_patch_y)(st,
31880+                                    patch_y2, PATCH_STRIDE,
31881+                                    st->last_l1,
31882+                                    16, c->h);
31883+
31884+                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
31885+                           patch_y3, patch_y1, PATCH_STRIDE,
31886+                           c->h, 0, 0, c->w);
31887+
31888+                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
31889+                            (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
31890+                            c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
31891+                            0, woff_b(s, c->wo2), 0, 0, c->w);
31892+                        st->last_l0 = &c->next_src1;
31893+                        st->last_l1 = &c->next_src2;
31894+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31895+                    }
31896+                    else if (link == s->qpu.c_pxx) {
31897+                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
31898+                        const int mx = fctom(c->coeffs_x);
31899+                        const int my = fctom(c->coeffs_y);
31900+
31901+                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31902+                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31903+                        uint8_t patch_u3[8 * 16 * PW];
31904+                        uint8_t patch_v3[8 * 16 * PW];
31905+
31906+                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
31907+
31908+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
31909+                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
31910+                            c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
31911+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
31912+                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
31913+                            c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
31914+
31915+                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
31916+
31917+                        st->last_l0 = &c->next_src;
31918+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31919+                    }
31920+                    else if (link == s->qpu.c_pxx_l1) {
31921+                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
31922+                        const int mx = fctom(c->coeffs_x);
31923+                        const int my = fctom(c->coeffs_y);
31924+
31925+                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31926+                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
31927+                        uint8_t patch_u3[8 * 16 * PW];
31928+                        uint8_t patch_v3[8 * 16 * PW];
31929+
31930+                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
31931+
31932+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
31933+                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
31934+                            c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
31935+                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
31936+                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
31937+                            c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
31938+
31939+                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
31940+
31941+                        st->last_l1 = &c->next_src;
31942+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31943+                    }
31944+                    else if (link == s->qpu.c_bxx) {
31945+                        const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
31946+                        const int mx1 = fctom(c->coeffs_x1);
31947+                        const int my1 = fctom(c->coeffs_y1);
31948+                        const int mx2 = fctom(c->coeffs_x2);
31949+                        const int my2 = fctom(c->coeffs_y2);
31950+
31951+                        uint8_t patch_u1[PATCH_STRIDE * 72];
31952+                        uint8_t patch_v1[PATCH_STRIDE * 72];
31953+                        uint8_t patch_u2[PATCH_STRIDE * 72];
31954+                        uint8_t patch_v2[PATCH_STRIDE * 72];
31955+                        uint8_t patch_u3[8 * 16 * PW];
31956+                        uint8_t patch_v3[8 * 16 * PW];
31957+                        uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
31958+                        uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
31959+
31960+                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
31961+                        FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
31962+
31963+                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
31964+                           patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
31965+                           c->h, mx1, my1, c->w);
31966+                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
31967+                           patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
31968+                           c->h, mx1, my1, c->w);
31969+
31970+                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
31971+                            patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
31972+                            c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
31973+                            0, woff_b(s, c->wo_u2), mx2, my2, c->w);
31974+                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
31975+                            patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
31976+                            c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
31977+                            0, woff_b(s, c->wo_v2), mx2, my2, c->w);
31978+
31979+                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
31980+
31981+                        st->last_l0 = &c->next_src1;
31982+                        st->last_l1 = &c->next_src2;
31983+                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
31984+                    }
31985+                    else if (link == q->code_sync) {
31986+                        cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
31987+                        break;
31988+                    }
31989+                    else if (link == q->code_exit) {
31990+                        // We expect exit to occur without other sync
31991+                        av_assert0(i == exit_n);
31992+                        ++exit_n;
31993+                        break;
31994+                    }
31995+                    else {
31996+                        av_assert0(0);
31997+                    }
31998+                }
31999+
32000+                st->qpu_mc_curr = cmd;
32001+            }
32002+        } while (exit_n == 0);
32003+    }
32004+}
32005+
32006+#undef FUNC
32007+#undef pixel
32008+
32009--- /dev/null
32010+++ b/libavcodec/rpi_hevc_transform.s
32011@@ -0,0 +1,444 @@
32012+# ******************************************************************************
32013+# Argon Design Ltd.
32014+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
32015+#
32016+# Module : HEVC
32017+# Author : Peter de Rivaz
32018+# ******************************************************************************
32019+
32020+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
32021+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
32022+.set USE_STACK, 0
32023+
32024+# Lines that fail to assemble start with #:
32025+# The script insert_magic_opcodes.sh inserts the machine code directly for these.
32026+# HEVC VPU Transform
32027+#
32028+# Transform matrix can be thought of as
32029+#   output row vector = input row vector * transMatrix2
32030+#
32031+# The even rows of the matrix are symmetric
32032+# The odd rows of the matrix are antisymmetric
32033+#
32034+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
32035+#
32036+# EXAMPLE
32037+#   (a b c d) (1 2  2  1)
32038+#             (3 4 -4 -3)
32039+#             (5 6  6  5)
32040+#             (7 8 -8 -7)
32041+#
32042+#  x=(a c)(1 2) = 1a+5c 2a+6c
32043+#         (5 6)
32044+#
32045+#  y=(b d)(3 4) = 3b+7d 4b+8d
32046+#         (7 8)
32047+#
32048+#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
32049+#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
32050+#
32051+#  Final results are (u , v[::-1])
32052+#
32053+#
32054+#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
32055+#  Apply the even matrix first and stop before rounding
32056+#  Then apply the odd matrix in a full manner:
32057+#
32058+#   First step is to compute partial products with the first input (16 cycles)
32059+#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
32060+#   2a 4b 6c 8d
32061+#   2a -4b 6c -8d
32062+#   1a -3b 5c -7d
32063+#
32064+#   Second step is to sum partial products into final position (8 cycles)
32065+#   1a+3b+5c+7d
32066+#   2a+4b+6c+8d
32067+#   2a-4b+6c-8d
32068+#   1a-3b+5c-7d
32069+#
32070+#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
32071+#
32072+#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
32073+#
32074+#   For 8x8 we could compute two in parallel.
32075+#
32076+#
32077+
32078+# Columns are transformed first
32079+#
32080+# Store top left half of transMatrix2 in
32081+# Store bottom left half of transMatrix2 in HX(32,32)
32082+#
32083+# For 16x16
32084+# HX(0:15,0) contains input data before transform
32085+# HY(0:15,0) contains 32bit output data after transform
32086+# HX(32,0) contains even rows of left half of transMatrix2
32087+# HX(32,32) contains odd rows of left half of transMatrix2
32088+# HY(48,0) contains partial products ready for summing
32089+#
32090+
32091+
32092+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
32093+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
32094+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
32095+# num: number of 16x16 transforms to be done
32096+# coeffs32
32097+# num32: number of 32x32 transforms
32098+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
32099+#
32100+
32101+.equ TRANS_SHIFT, 20 - BIT_DEPTH
32102+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
32103+.equ TRANS_ASL2, 16 - TRANS_SHIFT
32104+
32105+
32106+hevc_trans_16x16:
32107+  push r6-r15, lr # TODO cut down number of used registers
32108+  mov r14,r3 # coeffs32
32109+  mov r15,r4 # num32
32110+  mov r3, 16*2 # Stride of transMatrix2 in bytes
32111+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
32112+
32113+  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
32114+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
32115+
32116+  # Now use r0 to describe which matrix we are working on.
32117+  # Allows us to prefetch the next block of coefficients for efficiency.
32118+  mov r0,0 # This describes the location where we read our coefficients from
32119+  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
32120+  mov r7,16*16*2 # Total block size
32121+  mov r8,64*16 # Value used to swap from current to next VRF location
32122+  mov r4,64 # Constant used for rounding first pass
32123+  mov r5,TRANS_RND2 # Constant used for rounding second pass
32124+
32125+  sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack
32126+
32127+  add r11,sp,64 # Space for 32 bytes before, and rounding
32128+  lsr r11,5
32129+  lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
32130+
32131+  lsr r10, r2, 16 # Number of compressed blocks stored in top short
32132+  extu r2,16
32133+  # At start of block r0,r1 point to the current block (that has already been loaded)
32134+  # r0 VRF location of current block
32135+  # r1 address of current block
32136+  # r2 number of 16*16 transforms to do
32137+  # r3 Stride of coefficients (==32)
32138+  # r4 TRANS_RND1 (64)
32139+  # r5 TRANS_RND2
32140+  # r6 temporary used inside col_trans16
32141+  # r7 16*16*2 total bytes in block
32142+  # r8 64*16 VRF switch locations
32143+  # r9 temporary in unpack_coeff for index
32144+  # r10 number of 16x16 transforms using compression
32145+  # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
32146+  # r12 temporary counter in unpack_coeff
32147+  # r13
32148+  # r14 Save information for 32 bit transform (coeffs location)
32149+  # r15 Save information for 32 bit transform (number of transforms)
32150+  cmp r2,0
32151+  beq done16x16s
32152+block_loop:
32153+  # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
32154+  cmp r10,0
32155+  mov r6, r1
32156+  beq not_compressed
32157+  sub r10, 1
32158+  bl unpack16x16
32159+not_compressed:
32160+  #mov r6,r1 # DEBUG without compress
32161+  vldh HX(0++,0)+r0,(r6 += r3) REP 16
32162+  #eor r0,r8
32163+  #add r1,r7
32164+  # Prefetch the next block
32165+  #bl unpack16x16
32166+  #vldh HX(0++,0)+r0,(r6 += r3) REP 16
32167+  #vmov HX(0++,0)+r0,0 REP 16  # DEBUG
32168+  #eor r0,r8
32169+  #sub r1,r7
32170+
32171+  # Transform the current block
32172+  bl col_trans_16
32173+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
32174+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
32175+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
32176+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
32177+
32178+  bl col_trans_16
32179+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
32180+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
32181+  vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
32182+
32183+  # Save results - note there has been a transposition during the processing so we save columns
32184+  vsth VX(0,32++)+r0, (r1 += r3) REP 16
32185+
32186+  # Move onto next block
32187+  eor r0,r8
32188+  add r1,r7
32189+
32190+  addcmpbgt r2,-1,0,block_loop
32191+done16x16s:
32192+
32193+  add sp,sp,64+16*16*2 # Restore stack pointer (space was reserved in case an interrupt used the stack)
32194+  # Now go and do any 32x32 transforms
32195+  b hevc_trans_32x32
32196+
32197+  pop r6-r15, pc
32198+# This returns a value in r6 that says where to load the data from.
32199+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
32200+unpack16x16:
32201+# Clear out destination
32202+  vmov HX(0,0)+r0,0
32203+  mov r6, r11
32204+  vsth HX(0,0)+r0,(r6 += r3) REP 16
32205+  mov r5, r1 # Moving pointer to input coefficients
32206+unpack_outer_loop:
32207+  # Loop until we find the end
32208+  vldh HX(0,0)+r0,(r5)  # TODO would prefetch help here while unpacking previous?
32209+  sub r6,r11,32
32210+  #add r6,pc,packed_data-$ # Packed data
32211+  vsth HX(0,0)+r0,(r6)  # Store into packed data
32212+  mov r12,0
32213+unpack_loop:
32214+  ld r4,(r6)
32215+  add r6,r6,4
32216+  lsr r9,r4,16 # r9 is destination value
32217+  cmp r4,0 # {value,index}
32218+  extu r4,8
32219+  beq done_unpack
32220+  sth r9,(r11, r4)
32221+  addcmpblt r12,1,8,unpack_loop
32222+#  # Read next 16
32223+  add r5,32
32224+  b unpack_outer_loop
32225+done_unpack:
32226+#  # Set new load location
32227+  mov r6, r11
32228+  #add r6,pc,unpacked_data-$
32229+#  # Restore constants
32230+  mov r4,64
32231+  mov r5,TRANS_RND2
32232+#  pop r6-r15, pc
32233+  b lr
32234+
32235+# r1,r2,r3 r7,r8 should be preserved
32236+# HX(0++,0)+r0 is the block to be transformed
32237+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
32238+# Use HY(48,0) for intermediate results
32239+# r0 can be used, but should be returned to its original value at the end
32240+col_trans_16:
32241+  add r6,r0,16 # Final value for this loop
32242+col_trans_16_loop:
32243+  # First compute partial products for a single column
32244+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
32245+  # Then sum up the results and place back
32246+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
32247+  addcmpblt r0,1,r6,col_trans_16_loop
32248+  sub r0,16  # put r0 back to its original value
32249+  b lr
32250+
32251+col_trans_odd_16:
32252+  add r6,r0,16 # Final value for this loop
32253+col_trans_odd_16_loop:
32254+  # First compute partial products for a single column
32255+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
32256+  # Then sum up the results and place back
32257+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
32258+  addcmpblt r0,1,r6,col_trans_odd_16_loop
32259+  sub r0,16  # put r0 back to its original value
32260+  b lr
32261+
32262+# r1/r10 input pointer
32263+# r0,r4,r5,r6 free
32264+# r8/r9 output storage
32265+#
32266+# Store packed coefficients at r9-32
32267+# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows)
32268+unpack32x32:
32269+# Clear out destination
32270+  vmov HX(0,0),0
32271+  add r0, r9, 32*32*2 # Unpacked buffer
32272+  mov r4, 32
32273+  vsth HX(0,0),(r0 += r4) REP 64
32274+unpack_outer_loop32:
32275+  # Loop until we find the end
32276+  vldh HX(0,0),(r1)  # TODO would prefetch help here while unpacking previous?
32277+  sub r6,r9,32
32278+  #add r6,pc,packed_data-$ # Packed data
32279+  vsth HX(0,0),(r6)  # Store into packed data
32280+  mov r8,0
32281+unpack_loop32:
32282+  ld r4,(r6)
32283+  add r6,r6,4
32284+  lsr r5,r4,16 # r5 is destination value
32285+  cmp r4,0 # {value,index}
32286+  extu r4,10
32287+  beq done_unpack # NOTE(review): label done_unpack32 below is never referenced — confirm this branch should not target done_unpack32 (done_unpack reloads r4/r5 and uses r11, which the 32x32 path does not set up)
32288+  sth r5,(r0, r4)
32289+  addcmpblt r8,1,8,unpack_loop32
32290+#  # Read next 16
32291+  add r1,32
32292+  b unpack_outer_loop32
32293+done_unpack32:
32294+  b lr
32295+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
32296+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
32297+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
32298+# num: number of 16x16 transforms to be done in low 16, number of packed in high 16
32299+#
32300+# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first!
32301+hevc_trans_32x32:
32302+  mov r1,r14 # coeffs
32303+  mov r2,r15 # num
32304+  lsr r15,r15,16 # Number that are packed
32305+  extu r2,16 # Total number
32306+
32307+  # Fetch odd transform matrix
32308+  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
32309+  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
32310+  #add r0, 16*16*2
32311+  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
32312+
32313+  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
32314+  mov r7, 16*16*2 # Total block size
32315+
32316+.if USE_STACK
32317+  # Stack base allocation
32318+  sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
32319+  # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
32320+  add r8,sp,63
32321+  lsr r8,5
32322+  lsl r8,5
32323+.else
32324+#:version r8
32325+  .half 0x00e8 #AUTOINSERTED
32326+  btst r8,16
32327+#:add r8,pc,intermediate_results-$
32328+  .half 0xbfe8
32329+  .half intermediate_results-($-2)
32330+  beq on_vpu1
32331+  add r8,r8,32*32*2*2+16*2 # Move to secondary storage
32332+on_vpu1:
32333+.endif
32334+  mov r9,r8  # Backup of the temporary storage
32335+  mov r10,r1 # Backup of the coefficient buffer
32336+
32337+  cmp r2,0
32338+  beq done32x32s
32339+block_loop32:
32340+
32341+  # Transform the first 16 columns
32342+  mov r1,r10  # Input Coefficient buffer
32343+  mov r8,r9   # Output temporary storage
32344+  # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
32345+  cmp r2,r15
32346+  bgt not_compressed_32
32347+  bl unpack32x32
32348+  add r1,r9,32*32*2   # Uncompressed into temporary storage
32349+  mov r8,r9           # Transform into here
32350+not_compressed_32:
32351+  # COLUMN TRANSFORM
32352+  mov r4, 64 # Constant used for rounding first pass
32353+  mov r5, 9 # left shift used for rounding first pass
32354+
32355+  bl trans32
32356+  # Transform the second 16 columns
32357+  add r8,32*16*2
32358+  add r1,32
32359+  bl trans32
32360+
32361+  # ROW TRANSFORM
32362+  mov r4, TRANS_RND2 # Constant used for rounding second pass
32363+  mov r5, TRANS_ASL2 # left shift used for rounding second pass
32364+
32365+  mov r1,r9  # Input temporary storage
32366+  mov r8,r10   # Output Coefficient buffer
32367+  bl trans32
32368+  # Transform the second 16 columns
32369+  add r8,32*16*2
32370+  add r1,32
32371+  bl trans32
32372+
32373+  add r10, 32*32*2 # move onto next block of coefficients
32374+  addcmpbgt r2,-1,0,block_loop32
32375+done32x32s:
32376+
32377+.if USE_STACK
32378+  add sp,sp,32*32*4+64# Restore stack
32379+.endif
32380+
32381+  pop r6-r15, pc
32382+
32383+trans32:
32384+  push lr
32385+  # We can no longer afford the VRF space to do prefetching when doing 32x32
32386+  # Fetch the even rows
32387+  vldh HX(0++,0),(r1 += r3) REP 16
32388+  # Fetch the odd rows
32389+  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
32390+
32391+  # Transform the even rows using even matrix
32392+  mov r0, 0 # Even rows
32393+  bl col_trans_16
32394+
32395+  # Now transform the odd rows using odd matrix
32396+  mov r0, 64*16 # Odd rows
32397+  bl col_trans_odd_16
32398+
32399+  # Now apply butterfly to compute the first 16 results
32400+  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
32401+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
32402+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
32403+  # 16bit results now in HX(48,32)
32404+  mov r0,r8
32405+  mov r6,32*2
32406+  vsth VX(48,32++),(r0+=r6) REP 16
32407+
32408+  # Now apply butterfly to compute the second 16 results (in reverse order)
32409+  vsub HY(63,0),HY(0 ,0),HY(16,0)
32410+  vsub HY(62,0),HY(1 ,0),HY(17,0)
32411+  vsub HY(61,0),HY(2 ,0),HY(18,0)
32412+  vsub HY(60,0),HY(3 ,0),HY(19,0)
32413+  vsub HY(59,0),HY(4 ,0),HY(20,0)
32414+  vsub HY(58,0),HY(5 ,0),HY(21,0)
32415+  vsub HY(57,0),HY(6 ,0),HY(22,0)
32416+  vsub HY(56,0),HY(7 ,0),HY(23,0)
32417+  vsub HY(55,0),HY(8 ,0),HY(24,0)
32418+  vsub HY(54,0),HY(9 ,0),HY(25,0)
32419+  vsub HY(53,0),HY(10,0),HY(26,0)
32420+  vsub HY(52,0),HY(11,0),HY(27,0)
32421+  vsub HY(51,0),HY(12,0),HY(28,0)
32422+  vsub HY(50,0),HY(13,0),HY(29,0)
32423+  vsub HY(49,0),HY(14,0),HY(30,0)
32424+  vsub HY(48,0),HY(15,0),HY(31,0)
32425+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
32426+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
32427+  add r0,r8,32
32428+  vsth VX(48,32++),(r0+=r6) REP 16
32429+  pop pc
32430+
32431+.if USE_STACK == 0
32432+  .balign 32
32433+
32434+# .space directives generate 0's in the bin so avoid unnecessary padding by
32435+# just setting to appropriate value
32436+.equ intermediate_results, $+16*2
32437+
32438+# Layout goes:
32439+#
32440+#packed_buffer:
32441+#  .space 16*2
32442+#intermediate_results:
32443+#  .space 32*32*2
32444+#unpacked_buffer:
32445+#  .space 32*32*2
32446+#
32447+#packed_buffer2:
32448+#  .space 16*2
32449+#intermediate_results2:
32450+#  .space 32*32*2
32451+#unpacked_buffer2:
32452+#  .space 32*32*2
32453+.endif
32454+
32455+
32456--- /dev/null
32457+++ b/libavcodec/rpi_hevc_transform10.h
32458@@ -0,0 +1,94 @@
32459+static const unsigned char rpi_hevc_transform10 [] = {
32460+0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
32461+0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
32462+0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
32463+0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
32464+0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
32465+0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
32466+0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x02,   // 0030
32467+0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
32468+0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
32469+0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
32470+0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
32471+0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
32472+0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
32473+0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
32474+0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
32475+0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
32476+0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
32477+0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
32478+0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x06,  0x04,   // 0090
32479+0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
32480+0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
32481+0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
32482+0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
32483+0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
32484+0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
32485+0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
32486+0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
32487+0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
32488+0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
32489+0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
32490+0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
32491+0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
32492+0x00,  0x02,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
32493+0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
32494+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
32495+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
32496+0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
32497+0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
32498+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
32499+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
32500+0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
32501+0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
32502+0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
32503+0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
32504+0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
32505+0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
32506+0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
32507+0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
32508+0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
32509+0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
32510+0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
32511+0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
32512+0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
32513+0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
32514+0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
32515+0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
32516+0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
32517+0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
32518+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
32519+0x04,  0xb0,  0x00,  0x02,  0x65,  0x60,  0x91,  0x40,   // 01d8
32520+0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
32521+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
32522+0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
32523+0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
32524+0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
32525+0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
32526+0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
32527+0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
32528+0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
32529+0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
32530+0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
32531+0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
32532+0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
32533+0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
32534+0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
32535+0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
32536+0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
32537+0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
32538+0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
32539+0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
32540+0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
32541+0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
32542+0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
32543+0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
32544+0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
32545+0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
32546+0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
32547+0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
32548+0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
32549+0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
32550+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
32551+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
32552+};
32553--- /dev/null
32554+++ b/libavcodec/rpi_hevc_transform8.h
32555@@ -0,0 +1,94 @@
32556+static const unsigned char rpi_hevc_transform8 [] = {
32557+0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
32558+0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
32559+0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
32560+0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
32561+0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
32562+0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
32563+0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x08,   // 0030
32564+0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
32565+0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
32566+0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
32567+0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
32568+0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
32569+0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
32570+0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
32571+0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
32572+0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
32573+0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
32574+0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
32575+0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x04,  0x04,   // 0090
32576+0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
32577+0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
32578+0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
32579+0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
32580+0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
32581+0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
32582+0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
32583+0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
32584+0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
32585+0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
32586+0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
32587+0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
32588+0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
32589+0x00,  0x08,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
32590+0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
32591+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
32592+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
32593+0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
32594+0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
32595+0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
32596+0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
32597+0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
32598+0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
32599+0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
32600+0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
32601+0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
32602+0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
32603+0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
32604+0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
32605+0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
32606+0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
32607+0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
32608+0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
32609+0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
32610+0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
32611+0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
32612+0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
32613+0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
32614+0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
32615+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
32616+0x04,  0xb0,  0x00,  0x08,  0x45,  0x60,  0x91,  0x40,   // 01d8
32617+0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
32618+0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
32619+0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
32620+0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
32621+0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
32622+0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
32623+0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
32624+0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
32625+0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
32626+0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
32627+0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
32628+0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
32629+0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
32630+0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
32631+0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
32632+0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
32633+0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
32634+0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
32635+0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
32636+0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
32637+0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
32638+0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
32639+0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
32640+0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
32641+0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
32642+0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
32643+0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
32644+0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
32645+0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
32646+0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
32647+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
32648+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
32649+};
32650--- /dev/null
32651+++ b/libavcodec/rpi_hevcdec.c
32652@@ -0,0 +1,6134 @@
32653+/*
32654+ * HEVC video Decoder
32655+ *
32656+ * Copyright (C) 2012 - 2013 Guillaume Martres
32657+ * Copyright (C) 2012 - 2013 Mickael Raulet
32658+ * Copyright (C) 2012 - 2013 Gildas Cocherel
32659+ * Copyright (C) 2012 - 2013 Wassim Hamidouche
32660+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
32661+ *
32662+ * This file is part of FFmpeg.
32663+ *
32664+ * FFmpeg is free software; you can redistribute it and/or
32665+ * modify it under the terms of the GNU Lesser General Public
32666+ * License as published by the Free Software Foundation; either
32667+ * version 2.1 of the License, or (at your option) any later version.
32668+ *
32669+ * FFmpeg is distributed in the hope that it will be useful,
32670+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
32671+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
32672+ * Lesser General Public License for more details.
32673+ *
32674+ * You should have received a copy of the GNU Lesser General Public
32675+ * License along with FFmpeg; if not, write to the Free Software
32676+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32677+ */
32678+
32679+#include "libavutil/attributes.h"
32680+#include "libavutil/common.h"
32681+#include "libavutil/display.h"
32682+#include "libavutil/internal.h"
32683+#include "libavutil/mastering_display_metadata.h"
32684+#include "libavutil/md5.h"
32685+#include "libavutil/opt.h"
32686+#include "libavutil/pixdesc.h"
32687+#include "libavutil/stereo3d.h"
32688+
32689+#include "decode.h"
32690+#include "bswapdsp.h"
32691+#include "bytestream.h"
32692+#include "golomb.h"
32693+#include "hevc.h"
32694+#include "rpi_hevc_data.h"
32695+#include "rpi_hevc_parse.h"
32696+#include "rpi_hevcdec.h"
32697+#include "rpi_hevc_cabac_fns.h"
32698+#include "profiles.h"
32699+#include "hwconfig.h"
32700+
32701+#include "rpi_zc_frames.h"
32702+#include "rpi_qpu.h"
32703+#include "rpi_hevc_shader.h"
32704+#include "rpi_hevc_shader_cmd.h"
32705+#include "rpi_hevc_shader_template.h"
32706+#include "rpi_zc.h"
32707+#include "libavutil/rpi_sand_fns.h"
32708+
32709+#include "pthread.h"
32710+#include <stdatomic.h>
32711+
32712+#define DEBUG_DECODE_N 0   // 0 = do all, n = frames idr onwards
32713+
32714+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
32715+
32716+#ifndef av_mod_uintp2
32717+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
32718+{
32719+    return a & ((1 << p) - 1);
32720+}
32721+#   define av_mod_uintp2   av_mod_uintp2_c
32722+#endif
32723+
32724+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
32725+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
32726+
32727+#define MC_DUMMY_X (-32)
32728+#define MC_DUMMY_Y (-32)
32729+
32730+// UV & Y both have min 4x4 pred (no 2x2 chroma)
32731+// Allow for even spread +1 for setup, +1 for rounding
32732+// As we have load sharing this can (in theory) be exceeded so we have to
32733+// check after each CTU, but it is a good base size
32734+
32735+// Worst case (all 4x4) commands per CTU
32736+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
32737+#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
32738+
32739+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
32740+
32741+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
32742+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
32743+
32744+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
32745+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
32746+
32747+// Total cmds to allocate - allow for slack & setup
32748+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
32749+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
32750+
32751+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
32752+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
32753+
32754+// The QPU code for UV blocks only works up to a block width of 8
32755+#define RPI_CHROMA_BLOCK_WIDTH 8
32756+
32757+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
32758+
32759+
32760+// Actual filter goes -ve, +ve, +ve, -ve using these values
32761+static const uint32_t rpi_filter_coefs[8] = {
32762+        ENCODE_COEFFS(  0,  64,   0,  0),
32763+        ENCODE_COEFFS(  2,  58,  10,  2),
32764+        ENCODE_COEFFS(  4,  54,  16,  2),
32765+        ENCODE_COEFFS(  6,  46,  28,  4),
32766+        ENCODE_COEFFS(  4,  36,  36,  4),
32767+        ENCODE_COEFFS(  4,  28,  46,  6),
32768+        ENCODE_COEFFS(  2,  16,  54,  4),
32769+        ENCODE_COEFFS(  2,  10,  58,  2)
32770+};
32771+
32772+// Function arrays by QPU
32773+
32774+static const int * const inter_pred_setup_c_qpu[12] = {
32775+    mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
32776+    mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
32777+    mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
32778+};
32779+
32780+static const int * const inter_pred_setup_c10_qpu[12] = {
32781+    mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
32782+    mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
32783+    mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
32784+};
32785+
32786+static const int * const inter_pred_setup_y_qpu[12] = {
32787+    mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
32788+    mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
32789+    mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
32790+};
32791+
32792+static const int * const inter_pred_setup_y10_qpu[12] = {
32793+    mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
32794+    mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
32795+    mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
32796+};
32797+
32798+static const int * const inter_pred_sync_qpu[12] = {
32799+    mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
32800+    mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
32801+    mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
32802+};
32803+
32804+static const int * const inter_pred_sync10_qpu[12] = {
32805+    mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
32806+    mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
32807+    mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
32808+};
32809+
32810+static const int * const inter_pred_exit_c_qpu[12] = {
32811+    mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
32812+    mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
32813+    mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
32814+};
32815+
32816+static const int * const inter_pred_exit_c10_qpu[12] = {
32817+    mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
32818+    mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
32819+    mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
32820+};
32821+
32822+static const int * const inter_pred_exit_y_qpu[12] = {
32823+    mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
32824+    mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
32825+    mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
32826+};
32827+
32828+static const int * const inter_pred_exit_y10_qpu[12] = {
32829+    mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
32830+    mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
32831+    mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
32832+};
32833+
32834+typedef struct ipe_chan_info_s
32835+{
32836+    const uint8_t bit_depth;
32837+    const uint8_t n;
32838+    const int * const * setup_fns;
32839+    const int * const * sync_fns;
32840+    const int * const * exit_fns;
32841+} ipe_chan_info_t;
32842+
32843+typedef struct ipe_init_info_s
32844+{
32845+    ipe_chan_info_t luma;
32846+    ipe_chan_info_t chroma;
32847+} ipe_init_info_t;
32848+
32849+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
32850+{
32851+    switch (ln)
32852+    {
32853+        default:  // normally 0
32854+            *b = a;
32855+            break;
32856+        case 1:
32857+            a |= a << 8;
32858+            *(uint16_t *)b = a;
32859+            b += stride;
32860+            *(uint16_t *)b = a;
32861+            break;
32862+        case 2:
32863+            a |= a << 8;
32864+            a |= a << 16;
32865+            *(uint32_t *)b = a;
32866+            b += stride;
32867+            *(uint32_t *)b = a;
32868+            b += stride;
32869+            *(uint32_t *)b = a;
32870+            b += stride;
32871+            *(uint32_t *)b = a;
32872+            break;
32873+        case 3:
32874+        {
32875+            unsigned int i;
32876+            uint64_t d;
32877+            a |= a << 8;
32878+            a |= a << 16;
32879+            d = ((uint64_t)a << 32) | a;
32880+            for (i = 0; i != 8; ++i, b += stride)
32881+                *(uint64_t *)b = d;
32882+            break;
32883+        }
32884+        case 4:
32885+        {
32886+            unsigned int i;
32887+            uint64_t d;
32888+            a |= a << 8;
32889+            a |= a << 16;
32890+            d = ((uint64_t)a << 32) | a;
32891+            for (i = 0; i != 16; ++i, b += stride)
32892+            {
32893+                *(uint64_t *)b = d;
32894+                *(uint64_t *)(b + 8) = d;
32895+            }
32896+            break;
32897+        }
32898+    }
32899+}
32900+
32901+// We expect this to be called with ln = (log2_cb_size - 3) so range =  -1..3
32902+// (4 not required)
32903+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
32904+{
32905+    switch (ln)
32906+    {
32907+        default:  // 0 or -1
32908+            *b_u = a;
32909+            *b_l = a;
32910+            break;
32911+        case 1:
32912+            a |= a << 8;
32913+            *(uint16_t *)b_u = a;
32914+            *(uint16_t *)b_l = a;
32915+            break;
32916+        case 2:
32917+            a |= a << 8;
32918+            a |= a << 16;
32919+            *(uint32_t *)b_u = a;
32920+            *(uint32_t *)b_l = a;
32921+            break;
32922+        case 3:
32923+            a |= a << 8;
32924+            a |= a << 16;
32925+            *(uint32_t *)b_u = a;
32926+            *(uint32_t *)(b_u + 4) = a;
32927+            *(uint32_t *)b_l = a;
32928+            *(uint32_t *)(b_l + 4) = a;
32929+            break;
32930+        case 4:
32931+            a |= a << 8;
32932+            a |= a << 16;
32933+            *(uint32_t *)b_u = a;
32934+            *(uint32_t *)(b_u + 4) = a;
32935+            *(uint32_t *)(b_u + 8) = a;
32936+            *(uint32_t *)(b_u + 12) = a;
32937+            *(uint32_t *)b_l = a;
32938+            *(uint32_t *)(b_l + 4) = a;
32939+            *(uint32_t *)(b_l + 8) = a;
32940+            *(uint32_t *)(b_l + 12) = a;
32941+            break;
32942+    }
32943+}
32944+
32945+static void zap_cabac_stash(uint8_t * b, const int ln)
32946+{
32947+    switch (ln)
32948+    {
32949+        default:  // 0
32950+            *b = 0;
32951+            break;
32952+        case 1:
32953+            *(uint16_t *)b = 0;
32954+            break;
32955+        case 2:
32956+            *(uint32_t *)b = 0;
32957+            break;
32958+        case 3:
32959+            *(uint32_t *)b = 0;
32960+            *(uint32_t *)(b + 4) = 0;
32961+            break;
32962+    }
32963+}
32964+
32965+
32966+
32967+// Set a small square block of bits in a bitmap
32968+// Bits must be aligned on their size boundry (which will be true of all split CBs)
32969+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
32970+{
32971+    unsigned int n;
32972+    const unsigned int sh = (x & 7);
32973+
32974+    f += (x >> 3);
32975+
32976+    av_assert2(ln <= 3);
32977+    av_assert2((x & ((1 << ln) - 1)) == 0);
32978+
32979+    switch (ln)
32980+    {
32981+        default:  // 1
32982+            f[0] |= 1 << sh;
32983+            break;
32984+        case 1:  // 3 * 2
32985+            n = 3 << sh;
32986+            f[0] |= n;
32987+            f[stride] |= n;
32988+            break;
32989+        case 2:  // 0xf * 4
32990+            n = 0xf << sh;
32991+            f[0] |= n;
32992+            f[stride] |= n;
32993+            f[stride * 2] |= n;
32994+            f[stride * 3] |= n;
32995+            break;
32996+        case 3:  // 0xff * 8
32997+            for (n = 0; n != 8; ++n, f += stride)
32998+                *f = 0xff;
32999+            break;
33000+    }
33001+}
33002+
33003+static const ipe_init_info_t ipe_init_infos[9] = {  // Alloc for bit depths of 8-16
33004+   {  // 8
33005+      .luma =   {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
33006+      .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
33007+   },
33008+   {  // 9
33009+      .luma =   {0},
33010+      .chroma = {0}
33011+   },
33012+   {  // 10
33013+      .luma =   {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
33014+      .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
33015+   }
33016+
33017+};
33018+
33019+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
33020+{
33021+    const unsigned int n = ici->n;
33022+    const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3;  // Round down to word
33023+
33024+    ipe->n = n;
33025+    ipe->max_fill = q1_size - ipe->min_gap;
33026+    for(unsigned int i = 0; i < n; i++) {
33027+        HEVCRpiInterPredQ * const q = ipe->q + i;
33028+        q->qpu_mc_curr = q->qpu_mc_base =
33029+            (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
33030+        q->code_setup = qpu_fn(ici->setup_fns[i]);
33031+        q->code_sync = qpu_fn(ici->sync_fns[i]);
33032+        q->code_exit = qpu_fn(ici->exit_fns[i]);
33033+    }
33034+}
33035+
33036+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
33037+{
33038+    av_assert0(bit_depth >= 8 && bit_depth <= 16);
33039+
33040+    rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
33041+}
33042+
33043+// Unsigned Trivial MOD
33044+static inline unsigned int utmod(const unsigned int x, const unsigned int n)
33045+{
33046+    return x >= n ? x - n : x;
33047+}
33048+
33049+// returns pq->job_n++
33050+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
33051+{
33052+    unsigned int const x2 = pq->job_n;
33053+    pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
33054+    return x2;
33055+}
33056+
33057+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
33058+{
33059+    pq->terminate = 0;
33060+    pq->job_n = 0;
33061+    pq->context = s;
33062+    pq->worker = worker;
33063+    pq->psem_out = psem_out;
33064+    pq->pass_n = n;
33065+    pq->started = 0;
33066+    sem_init(&pq->sem_in, 0, 0);
33067+}
33068+
33069+static void pass_queue_kill(HEVCRpiPassQueue * const pq)
33070+{
33071+    sem_destroy(&pq->sem_in);
33072+}
33073+
33074+static inline void rpi_sem_wait(sem_t * const sem)
33075+{
33076+    while (sem_wait(sem) != 0) {
33077+        av_assert0(errno == EINTR);
33078+    }
33079+}
33080+
33081+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
33082+{
33083+    sem_post(&pq->sem_in);
33084+}
33085+
33086+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
33087+{
33088+    // Do the various passes - common with the worker code
33089+    for (unsigned int i = 0; i != RPI_PASSES; ++i) {
33090+        s->passq[i].worker(s, jb);
33091+    }
33092+}
33093+
33094+
33095+#if 0
33096+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
33097+{
33098+    int x;
33099+    sem_getvalue((sem_t *)&jbc->sem_out, &x);
33100+    printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
33101+}
33102+#endif
33103+
33104+
33105+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
33106+{
33107+    HEVCRpiJob * jb;
33108+    HEVCRpiJobGlobal * const jbg = jbc->jbg;
33109+
33110+    pthread_mutex_lock(&jbg->lock);
33111+    // Check local 1st
33112+    if ((jb = jbc->jb1) != NULL)
33113+    {
33114+        // Only 1 - very easy :-)
33115+        jbc->jb1 = NULL;
33116+    }
33117+    else
33118+    {
33119+        // Now look for global free chain
33120+        if ((jb = jbg->free1) != NULL)
33121+        {
33122+            // Found one - unlink it
33123+            jbg->free1 = jb->next;
33124+            jb->next = NULL;
33125+        }
33126+        else
33127+        {
33128+            // Out of places to look - wait for one to become free - add to Qs
33129+
33130+            // Global
33131+            // If "good" lc then add after the last "good" el in the chain
33132+            // otherwise add to the tail
33133+            if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
33134+            {
33135+                // Add to end as we had to wait last time or wait Q empty
33136+                if ((lc->jw_prev = jbg->wait_tail) == NULL)
33137+                    jbg->wait_head = lc;
33138+                else
33139+                    lc->jw_prev->jw_next = lc;
33140+                lc->jw_next = NULL;
33141+                jbg->wait_tail = lc;
33142+            }
33143+            else
33144+            {
33145+                // This is a "good" lc that we need to poke into the middle
33146+                // of the Q
33147+                // We know that the Q isn't empty and there is at least one
33148+                // !last_progess_good el in it from the previous test
33149+
33150+                HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
33151+
33152+                if (p == NULL)
33153+                {
33154+                    // No current good els - add to head
33155+                    lc->jw_next = jbg->wait_head;
33156+                    jbg->wait_head = lc;
33157+                }
33158+                else
33159+                {
33160+                    lc->jw_next = p->jw_next;
33161+                    p->jw_next = lc;
33162+                }
33163+
33164+                lc->jw_next->jw_prev = lc;
33165+                lc->jw_prev = p;
33166+            }
33167+
33168+            // If "good" then we are now the last good waiting el
33169+            if (lc->last_progress_good)
33170+                jbg->wait_good = lc;
33171+
33172+            // Local
33173+            if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
33174+                jbc->lcw_head = lc;
33175+            else
33176+                lc->ljw_prev->ljw_next = lc;
33177+            lc->ljw_next = NULL;
33178+            jbc->lcw_tail = lc;
33179+        }
33180+    }
33181+
33182+    pthread_mutex_unlock(&jbg->lock);
33183+
33184+    if (jb == NULL)  // Need to wait
33185+    {
33186+        rpi_sem_wait(&lc->jw_sem);
33187+        jb = lc->jw_job;  // Set by free code
33188+    }
33189+
33190+    return jb;
33191+}
33192+
33193+
33194+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
33195+{
33196+    HEVCRpiJobGlobal * const jbg = jbc0->jbg;  // This jbc only used to find jbg so we can get the lock
33197+    HEVCRpiJobCtl * jbc = jb->jbc_local;
33198+    HEVCRpiLocalContext * lc = NULL;
33199+
33200+    pthread_mutex_lock(&jbg->lock);
33201+
33202+    if (jbc != NULL)
33203+    {
33204+        av_assert1(jbc->jb1 == NULL);
33205+
33206+        // Release to Local if nothing waiting there
33207+        if ((lc = jbc->lcw_head) == NULL)
33208+            jbc->jb1 = jb;
33209+    }
33210+    else
33211+    {
33212+        // Release to global if nothing waiting there
33213+        if ((lc = jbg->wait_head) == NULL)
33214+        {
33215+            jb->next = jbg->free1;
33216+            jbg->free1 = jb;
33217+        }
33218+        else
33219+        {
33220+            // ? seems somehow mildy ugly...
33221+            jbc = lc->context->jbc;
33222+        }
33223+    }
33224+
33225+    if (lc != NULL)
33226+    {
33227+        // Something was waiting
33228+
33229+        // Unlink
33230+        // Global
33231+        if (lc->jw_next == NULL)
33232+            jbg->wait_tail = lc->jw_prev;
33233+        else
33234+            lc->jw_next->jw_prev = lc->jw_prev;
33235+
33236+        if (lc->jw_prev == NULL)
33237+            jbg->wait_head = lc->jw_next;
33238+        else
33239+            lc->jw_prev->jw_next = lc->jw_next;
33240+
33241+        // Local
33242+        if (lc->ljw_next == NULL)
33243+            jbc->lcw_tail = lc->ljw_prev;
33244+        else
33245+            lc->ljw_next->ljw_prev = lc->ljw_prev;
33246+
33247+        if (lc->ljw_prev == NULL)
33248+            jbc->lcw_head = lc->ljw_next;
33249+        else
33250+            lc->ljw_prev->ljw_next = lc->ljw_next;
33251+
33252+        // Update good if required
33253+        if (jbg->wait_good == lc)
33254+            jbg->wait_good = lc->jw_prev;
33255+
33256+        // Prod
33257+        lc->jw_job = jb;
33258+        sem_post(&lc->jw_sem);
33259+    }
33260+
33261+    pthread_mutex_unlock(&jbg->lock);
33262+}
33263+
33264+static void job_lc_kill(HEVCRpiLocalContext * const lc)
33265+{
33266+    sem_destroy(&lc->jw_sem);
33267+}
33268+
33269+static void job_lc_init(HEVCRpiLocalContext * const lc)
33270+{
33271+    lc->jw_next = NULL;
33272+    lc->jw_prev = NULL;
33273+    lc->ljw_next = NULL;
33274+    lc->ljw_prev = NULL;
33275+    lc->jw_job = NULL;
33276+    sem_init(&lc->jw_sem,  0, 0);
33277+}
33278+
33279+// Returns:
33280+//  0 if we have waited for MV or expect to wait for recon
33281+//  1 if we haven't waited for MV & do not need to wait for recon
33282+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
33283+{
33284+    if (jb->waited) // reset by rpi_begin
33285+        return 0;
33286+    for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
33287+    {
33288+        if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
33289+                ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
33290+            return 0;
33291+    }
33292+    return 1;
33293+}
33294+
33295+// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
33296+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
33297+{
33298+    HEVCRpiJobCtl *const jbc = s->jbc;
33299+    HEVCRpiJob * const jb = lc->jb0;
33300+
33301+    av_assert1(jb != NULL);
33302+
33303+    if (jb->ctu_ts_last < 0) {
33304+        return;
33305+    }
33306+
33307+    lc->last_progress_good = progress_good(s, jb);
33308+    jb->waited = !lc->last_progress_good;
33309+    lc->jb0 = NULL;
33310+
33311+    if (s->offload_recon)
33312+    {
33313+        pthread_mutex_lock(&jbc->in_lock);
33314+        jbc->offloadq[jbc->offload_in] = jb;
33315+        jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
33316+        pthread_mutex_unlock(&jbc->in_lock);
33317+
33318+        pass_queue_submit_job(s->passq + 0);  // Consumes job eventually
33319+    }
33320+    else
33321+    {
33322+        pass_queue_do_all(s, jb);  // Consumes job before return
33323+    }
33324+}
33325+
33326+
33327+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
33328+// available to receive the next job.
33329+//
33330+// Now safe against multiple callers - needed for tiles
33331+// "normal" and WPP will only call here one at a time
33332+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
33333+{
33334+    HEVCRpiJobCtl * const jbc = s->jbc;
33335+
33336+    // It is legit for us to already have a job allocated - do nothing in this case
33337+    if (lc->jb0 != NULL)
33338+        return;
33339+
33340+    if (s->offload_recon)
33341+        rpi_sem_wait(&jbc->sem_out);  // This sem will stop this frame grabbing too much
33342+
33343+    lc->jb0 = job_alloc(jbc, lc);
33344+
33345+    rpi_begin(s, lc->jb0, lc->ts);
33346+}
33347+
33348+// Free up a job without submission
33349+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
33350+{
33351+    HEVCRpiJobCtl * const jbc = s->jbc;
33352+    HEVCRpiJob * const jb = lc->jb0;
33353+
33354+    if (jb == NULL) {
33355+        return;
33356+    }
33357+
33358+    lc->jb0 = NULL;
33359+
33360+    job_free(jbc, jb);
33361+
33362+    // If offload then poke sem_out too
33363+    if (s->offload_recon) {
33364+        sem_post(&jbc->sem_out);
33365+    }
33366+}
33367+
33368+
33369+// Call this to wait for all jobs to have completed at the end of a frame
33370+// Slightly icky as there is no clean way to wait for a sem to count up
33371+// Not reentrant - call on main thread only
33372+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
33373+{
33374+    HEVCRpiJobCtl * const jbc = s->jbc;
33375+    int i = 0;
33376+
33377+    // We shouldn't reach here with an unsubmitted job
33378+    av_assert1(lc->jb0 == NULL);
33379+
33380+    // If no offload then there can't be anything to wait for
33381+    if (!s->offload_recon) {
33382+        return;
33383+    }
33384+
33385+    if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)
33386+    {
33387+        for (i = 0; i != RPI_MAX_JOBS; ++i) {
33388+            rpi_sem_wait(&jbc->sem_out);
33389+        }
33390+        for (i = 0; i != RPI_MAX_JOBS; ++i) {
33391+            sem_post(&jbc->sem_out);
33392+        }
33393+    }
33394+}
33395+
33396+static void * pass_worker(void *arg)
33397+{
33398+    HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
33399+    HEVCRpiContext *const s = pq->context;
33400+
33401+    for (;;)
33402+    {
33403+        rpi_sem_wait(&pq->sem_in);
33404+
33405+        if (pq->terminate)
33406+            break;
33407+
33408+        pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
33409+        // * should really set jb->passes_done here
33410+
33411+        sem_post(pq->psem_out);
33412+    }
33413+    return NULL;
33414+}
33415+
33416+static void pass_queues_start_all(HEVCRpiContext *const s)
33417+{
33418+    unsigned int i;
33419+    HEVCRpiPassQueue * const pqs = s->passq;
33420+
33421+    for (i = 0; i != RPI_PASSES; ++i)
33422+    {
33423+        av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
33424+        pqs[i].started = 1;
33425+    }
33426+}
33427+
33428+static void pass_queues_term_all(HEVCRpiContext *const s)
33429+{
33430+    unsigned int i;
33431+    HEVCRpiPassQueue * const pqs = s->passq;
33432+
33433+    for (i = 0; i != RPI_PASSES; ++i)
33434+        pqs[i].terminate = 1;
33435+    for (i = 0; i != RPI_PASSES; ++i)
33436+    {
33437+        if (pqs[i].started)
33438+            sem_post(&pqs[i].sem_in);
33439+    }
33440+    for (i = 0; i != RPI_PASSES; ++i)
33441+    {
33442+        if (pqs[i].started) {
33443+            pthread_join(pqs[i].thread, NULL);
33444+            pqs[i].started = 0;
33445+        }
33446+    }
33447+}
33448+
33449+static void pass_queues_kill_all(HEVCRpiContext *const s)
33450+{
33451+    unsigned int i;
33452+    HEVCRpiPassQueue * const pqs = s->passq;
33453+
33454+    for (i = 0; i != RPI_PASSES; ++i)
33455+        pass_queue_kill(pqs + i);
33456+}
33457+
33458+
33459+static void worker_pic_free_one(HEVCRpiJob * const jb)
33460+{
33461+    // Free coeff stuff - allocation not the same for all buffers
33462+    HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
33463+
33464+    if (cf->s[0].buf != NULL)
33465+        av_freep(&cf->mptr);
33466+    if (cf->s[2].buf != NULL)
33467+        gpu_free(&cf->gptr);
33468+    memset(cf, 0, sizeof(*cf));
33469+}
33470+
+// Allocate the coefficient buffers for one job:
+//  - s[2] & s[3] share one cached GPU allocation (gptr), with s[3]
+//    starting coeff_count int16s above s[2] (it is filled downwards -
+//    see rpi_alloc_coeff_buf) and 32*32 coeffs of slack;
+//  - s[0] comes from a normal malloc (mptr), over-allocated by 63 bytes
+//    and rounded up to 64-byte alignment for the zero-zapping code.
+// Returns 0 on success, -1 on allocation failure (partial allocations
+// are released via worker_pic_free_one).
+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
+{
+    HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
+
+    if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
+        goto fail;
+    cf->s[2].buf = (int16_t *)cf->gptr.arm;
+    cf->s[3].buf = cf->s[2].buf + coeff_count;
+
+    // Must be 64 byte aligned for our zero zapping code so over-allocate &
+    // round
+    if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
+        goto fail;
+    cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
+    return 0;
+
+fail:
+    av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
+    worker_pic_free_one(jb);
+    return -1;
+}
33492+
+// Mark all four coefficient buffers as empty ready for a new job;
+// the underlying allocations are kept.
+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
+{
+    unsigned int i;
+    for (i = 0; i != 4; ++i) {
+        cf->s[i].n = 0;
+#if RPI_COMPRESS_COEFFS
+        cf->s[i].packed = 1;
+        cf->s[i].packed_n = 0;
+#endif
+    }
+}
33504+
+// Reserve space for n int16 coefficients in buffer buf_no of the job
+// and return a pointer to the reserved region.
+// Buffers 0-2 are filled upwards from buf; buffer 3 is filled downwards
+// from its exclusive end so it can share an allocation with buffer 2
+// (see worker_pic_alloc_one).
+// NOTE(review): no bounds check against the allocated size here -
+// presumably callers bound n; confirm.
+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
+{
+    HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
+    int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
+    cfe->n += n;
+    return coeffs;
+}
33512+
+// Block until decode progress on 'field' of reference frame 'ref'
+// reaches at least 'val'.
+// Fast path: read the shared progress word without locking and return
+// if already satisfied. Slow path: re-check under the owning context's
+// progress lock (double-checked), append this job's wait record to the
+// per-field FIFO and sleep on the record's semaphore until
+// ff_hevc_rpi_progress_signal_field posts it.
+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
+                                     const HEVCRpiFrame * const ref, const int val, const int field)
+{
+    if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
+        HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
+        HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
+        sem_t * sem = NULL;
+
+        av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+        // Re-read through volatile under the lock: the signaller updates
+        // the word before taking the same lock
+        if (((volatile int *)ref->tf.progress->data)[field] < val) {
+            HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
+
+            av_assert1(pwait->req == -1 && pwait->next == NULL);
+            jb->waited = 1;  // Remember that we had to wait for later scheduling
+
+            pwait->req = val;
+            pwait->next = NULL;
+            if (pstate->first == NULL)
+                pstate->first = pwait;
+            else
+                pstate->last->next = pwait;
+            pstate->last = pwait;
+            sem = &pwait->sem;
+        }
+        pthread_mutex_unlock(&pstate->lock);
+
+        // Wait outside the lock so the signaller can make progress
+        if (sem != NULL) {
+            rpi_sem_wait(sem);
+        }
+    }
+}
33544+
+// Publish decode progress 'val' for 'field' of the current frame and
+// wake every queued waiter whose requested progress is now met
+// (req <= val). Unsatisfied waiters stay queued; pstate->last is
+// rebuilt as the list is walked.
+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
+{
+    HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
+
+    // Store the new progress before taking the lock - pairs with the
+    // locked re-check in ff_hevc_rpi_progress_wait_field
+    ((int *)s->ref->tf.progress->data)[field] = val;
+
+    av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
+    {
+        HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
+        HEVCRpiFrameProgressWait * pwait;
+
+        while ((pwait = *ppwait) != NULL) {
+            if (pwait->req > val)
+            {
+                // Not yet satisfied - keep on list
+                ppwait = &pwait->next;
+                pstate->last = pwait;
+            }
+            else
+            {
+                // Satisfied - unlink, reset and wake
+                *ppwait = pwait->next;
+                pwait->req = -1;
+                pwait->next = NULL;
+                sem_post(&pwait->sem);
+            }
+        }
+    }
+    pthread_mutex_unlock(&pstate->lock);
+}
33573+
+// Initialise an empty per-field progress wait list and its lock.
+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
+{
+    pstate->first = NULL;
+    pstate->last = NULL;
+    pthread_mutex_init(&pstate->lock, NULL);
+}
33580+
+// Initialise a job's progress wait record: no outstanding request
+// (req == -1), unlinked, semaphore at 0.
+static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
+{
+    pwait->req = -1;
+    pwait->next = NULL;
+    sem_init(&pwait->sem, 0, 0);
+}
33587+
+// Destroy the per-field progress state; the wait list must already be
+// empty (asserted in debug builds).
+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
+{
+    av_assert1(pstate->first == NULL);
+    pthread_mutex_destroy(&pstate->lock);
+}
33593+
+// Destroy the wait record's semaphore (counterpart of
+// ff_hevc_rpi_progress_init_wait).
+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
+{
+    sem_destroy(&pwait->sem);
+}
33598+
33599+
33600+/**
33601+ * NOTE: Each function hls_foo correspond to the function foo in the
33602+ * specification (HLS stands for High Level Syntax).
33603+ */
33604+
33605+/**
33606+ * Section 5.7
33607+ */
33608+
+// Realloc the entry point arrays
+// (Re)size the three per-entry-point arrays in the slice header so they
+// can hold n entries. The allocation is rounded up to a multiple of 32
+// so repeated small growths don't reallocate; n == 0 frees the arrays.
+// Previous contents are always discarded. Returns 0 or AVERROR(ENOMEM)
+// (on failure all three arrays are left freed / unusable with
+// offsets_allocated == 0).
+static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
+{
+    if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0)
+    {
+        // Round up alloc to multiple of 32
+        int a = (n + 31) & ~31;
+
+        // We don't care about the previous contents so probably fastest to simply discard
+        av_freep(&sh->entry_point_offset);
+        av_freep(&sh->offset);
+        av_freep(&sh->size);
+
+        if (a != 0)
+        {
+            sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned));
+            sh->offset = av_malloc_array(a, sizeof(int));
+            sh->size = av_malloc_array(a, sizeof(int));
+
+            if (!sh->entry_point_offset || !sh->offset || !sh->size) {
+                sh->num_entry_point_offsets = 0;
+                sh->offsets_allocated = 0;
+                return AVERROR(ENOMEM);
+            }
+        }
+
+        sh->offsets_allocated = a;
+    }
+
+    return 0;
+}
33640+
+/* free everything allocated  by pic_arrays_init() */
+// Safe on a partially-initialised context: av_freep and
+// av_buffer_pool_uninit accept NULL, and pointers that alias another
+// allocation (cabac_stash_left, is_intra, bs_vertical) are only NULLed,
+// never freed directly.
+static void pic_arrays_free(HEVCRpiContext *s)
+{
+    av_freep(&s->sao);
+    av_freep(&s->deblock);
+
+    av_freep(&s->cabac_stash_up);
+    s->cabac_stash_left = NULL;  // freed with _up
+
+    av_freep(&s->mvf_up);
+    av_freep(&s->mvf_left);
+
+    av_freep(&s->is_pcm);
+    av_freep(&s->is_intra_store);
+    s->is_intra = NULL;  // aliases is_intra_store - freed above
+    av_freep(&s->rpl_tab);
+    s->rpl_tab_size = 0;
+
+    av_freep(&s->qp_y_tab);
+    av_freep(&s->tab_slice_address);
+    av_freep(&s->filter_slice_edges);
+
+    av_freep(&s->bs_horizontal);
+    s->bs_vertical = NULL;  // freed with H
+    av_freep(&s->bsf_stash_left);
+    av_freep(&s->bsf_stash_up);
+
+    av_freep(&s->rpl_up);
+    av_freep(&s->rpl_left);
+
+    // n == 0 frees the slice-header entry point arrays
+    alloc_entry_points(&s->sh, 0);
+
+    av_buffer_pool_uninit(&s->col_mvf_pool);
+}
33675+
+/* allocate arrays that depend on frame dimensions */
+// Returns 0 on success or AVERROR(ENOMEM); on failure everything
+// partially allocated is released via pic_arrays_free.
+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
+{
+    const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
+    const unsigned int width            = sps->width;
+    const unsigned int height           = sps->height;
+    const unsigned int pic_size_in_cb   = ((width  >> log2_min_cb_size) + 1) *
+                           ((height >> log2_min_cb_size) + 1);
+    const unsigned int ctb_count        = sps->ctb_size;
+
+    {
+        // Boundary-strength geometry: width rounded up to the stride1
+        // granularity, height rounded up to a multiple of 16
+        unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
+        unsigned int h = ((height + 15) & ~15);
+
+        s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
+        s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
+    }
+
+    s->sao           = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
+    s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
+    if (!s->sao || !s->deblock)
+        goto fail;
+
+    // Up & left stashes share one allocation: up first, then left.
+    // Check the allocation before deriving the left pointer - pointer
+    // arithmetic on a NULL result is undefined behaviour
+    s->cabac_stash_up  = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3));
+    if (s->cabac_stash_up == NULL)
+        goto fail;
+    s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3);
+
+    // Round width up to max ctb size
+    s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
+    // * Only needed if we have H tiles
+    s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
+    // These allocations were previously unchecked - a failure would have
+    // been dereferenced later during decode
+    if (s->mvf_up == NULL || s->mvf_left == NULL)
+        goto fail;
+
+    // We can overread by 1 line & one byte in deblock so alloc & zero
+    // We don't need to zero the extra @ start of frame as it will never be
+    // written
+    s->is_pcm   = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
+    s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
+    if (s->is_pcm == NULL || s->is_intra_store == NULL)
+        goto fail;
+
+    s->filter_slice_edges = av_mallocz(ctb_count);
+    s->tab_slice_address  = av_malloc_array(ctb_count,
+                                      sizeof(*s->tab_slice_address));
+    s->qp_y_tab           = av_malloc_array(pic_size_in_cb,
+                                      sizeof(*s->qp_y_tab));
+    if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
+        goto fail;
+
+    // H & V share one allocation; as above, check before deriving the
+    // V pointer from the H one
+    s->bs_horizontal = av_mallocz(s->bs_size * 2);
+    if (s->bs_horizontal == NULL)
+        goto fail;
+    s->bs_vertical   = s->bs_horizontal + s->bs_size;
+
+    s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
+    s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
+    if (s->rpl_left == NULL || s->rpl_up == NULL)
+        goto fail;
+
+    if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
+        (s->bsf_stash_up   = av_mallocz(((width + 63) & ~63) >> 4)) == NULL)
+        goto fail;
+
+    s->col_mvf_stride = (width + 15) >> 4;
+    s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
+                                          av_buffer_allocz);
+    if (s->col_mvf_pool == NULL)
+        goto fail;
+
+    return 0;
+
+fail:
+    pic_arrays_free(s);
+    return AVERROR(ENOMEM);
+}
33751+
+// Install the default weighted-prediction table: both denoms 0, every
+// weight set to unity in QPU_MC_DENOM fixed point, every offset 0,
+// for all active L0 and L1 references.
+static void default_pred_weight_table(HEVCRpiContext * const s)
+{
+  unsigned int i;
+  const unsigned int wt = 1 << QPU_MC_DENOM;
+  s->sh.luma_log2_weight_denom = 0;
+  s->sh.chroma_log2_weight_denom = 0;
+  for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+      s->sh.luma_weight_l0[i] = wt;
+      s->sh.luma_offset_l0[i] = 0;
+      s->sh.chroma_weight_l0[i][0] = wt;
+      s->sh.chroma_weight_l0[i][1] = wt;
+      s->sh.chroma_offset_l0[i][0] = 0;
+      s->sh.chroma_offset_l0[i][1] = 0;
+  }
+  for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+      s->sh.luma_weight_l1[i] = wt;
+      s->sh.luma_offset_l1[i] = 0;
+      s->sh.chroma_weight_l1[i][0] = wt;
+      s->sh.chroma_weight_l1[i][1] = wt;
+      s->sh.chroma_offset_l1[i][0] = 0;
+      s->sh.chroma_offset_l1[i][1] = 0;
+  }
+}
33775+
+// Parse the explicit weight/offset entries of pred_weight_table() for
+// one reference list of 'refs' entries.
+// The per-ref luma/chroma presence flags are read up front as bit
+// fields; bit (1 << (refs - 1)) is ref 0, so the loop walks the mask i
+// from MSB down and the output arrays are filled in ref order.
+// Weights are rescaled into QPU_MC_DENOM fixed point; offsets are
+// shifted by the bit depth unless high-precision offsets are enabled.
+// Returns 0, or AVERROR_INVALIDDATA on out-of-range coded values.
+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
+                       const unsigned int refs,
+                       int16_t * luma_weight,   int16_t * luma_offset,
+                       int16_t * chroma_weight, int16_t * chroma_offset)
+{
+    unsigned int luma_flags;
+    unsigned int chroma_flags;
+    unsigned int i;
+    const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
+    const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
+    const unsigned int luma_weight_base    = 1 << QPU_MC_DENOM;
+    const unsigned int chroma_weight_base  = 1 << QPU_MC_DENOM;
+    const unsigned int luma_weight_shift   = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
+    const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
+
+    if (refs == 0)
+        return 0;
+
+    luma_flags = get_bits(gb, refs);
+    chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
+    i = 1 << (refs - 1);
+
+    // NOTE(review): delta_weight may be negative, so the << rescales
+    // below are formally UB on a negative operand - relies on the usual
+    // two's-complement compiler behaviour (common throughout ffmpeg)
+    do
+    {
+        if ((luma_flags & i) != 0)
+        {
+            const int delta_weight = get_se_golomb(gb);
+            const int offset = get_se_golomb(gb);
+            if (delta_weight < -128 || delta_weight > 127 ||
+                offset < -wp_offset_half_range || offset >= wp_offset_half_range)
+            {
+                return AVERROR_INVALIDDATA;
+            }
+            *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift);
+            *luma_offset++ = offset << wp_offset_bd_shift;
+        }
+        else
+        {
+            // Flag clear: default (unity) weight, zero offset
+            *luma_weight++ = luma_weight_base;
+            *luma_offset++ = 0;
+        }
+
+        if ((chroma_flags & i) != 0)
+        {
+            unsigned int j;
+            for (j = 0; j != 2; ++j)   // Cb then Cr
+            {
+                const int delta_weight = get_se_golomb(gb);
+                const int delta_offset = get_se_golomb(gb);
+
+                if (delta_weight < -128 || delta_weight > 127 ||
+                    delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
+                {
+                    return AVERROR_INVALIDDATA;
+                }
+
+                *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift);
+                // Chroma offset reconstruction per the spec's
+                // ChromaOffset derivation, clipped to the offset range
+                *chroma_offset++ = av_clip(
+                    wp_offset_half_range + delta_offset -
+                        ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
+                    -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift;
+            }
+        }
+        else
+        {
+            // Flag clear: default weights/offsets for both chroma planes
+            *chroma_weight++ = chroma_weight_base;
+            *chroma_weight++ = chroma_weight_base;
+            *chroma_offset++ = 0;
+            *chroma_offset++ = 0;
+        }
+    } while ((i >>= 1) != 0);
+
+    return 0;
+}
33850+
+// Parse pred_weight_table() from the slice header: the luma/chroma
+// log2 weight denominators followed by the L0 and L1 weight lists
+// (via get_weights). Returns 0 or AVERROR_INVALIDDATA.
+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
+{
+    int err;
+    const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb);
+    // chroma denom is coded as a signed delta from the luma denom;
+    // a negative result wraps (unsigned) and is caught by the > 7 check
+    const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb);
+
+    if (luma_log2_weight_denom > 7 ||
+        chroma_log2_weight_denom > 7)
+    {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
+               luma_log2_weight_denom, chroma_log2_weight_denom);
+        return AVERROR_INVALIDDATA;
+    }
+
+    s->sh.luma_log2_weight_denom = luma_log2_weight_denom;
+    s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
+
+    if ((err = get_weights(s, gb, s->sh.nb_refs[L0],
+                s->sh.luma_weight_l0,      s->sh.luma_offset_l0,
+                s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 ||
+        (err = get_weights(s, gb, s->sh.nb_refs[L1],
+                s->sh.luma_weight_l1,      s->sh.luma_offset_l1,
+                s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0)
+    {
+        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
+        return err;
+    }
+
+    return 0;
+}
33881+
+// Parse the long-term reference picture set from the slice header.
+// The set combines nb_sps entries selected from the SPS long-term
+// table with nb_sh entries coded explicitly in the slice header; for
+// entries with delta_poc_msb_present the coded POC LSB is turned into
+// a full POC. Returns 0 or AVERROR_INVALIDDATA.
+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
+{
+    const HEVCRpiSPS *sps = s->ps.sps;
+    int max_poc_lsb    = 1 << sps->log2_max_poc_lsb;
+    int prev_delta_msb = 0;
+    unsigned int nb_sps = 0, nb_sh;
+    int i;
+
+    rps->nb_refs = 0;
+    if (!sps->long_term_ref_pics_present_flag)
+        return 0;
+
+    if (sps->num_long_term_ref_pics_sps > 0)
+        nb_sps = get_ue_golomb_long(gb);
+    nb_sh = get_ue_golomb_long(gb);
+
+    // Bound checks: SPS count within the SPS table, total within rps->poc
+    // (the uint64_t cast avoids overflow in the sum)
+    if (nb_sps > sps->num_long_term_ref_pics_sps)
+        return AVERROR_INVALIDDATA;
+    if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
+        return AVERROR_INVALIDDATA;
+
+    rps->nb_refs = nb_sh + nb_sps;
+
+    for (i = 0; i < rps->nb_refs; i++) {
+        uint8_t delta_poc_msb_present;
+
+        if (i < nb_sps) {
+            // Entry taken from the SPS long-term table
+            uint8_t lt_idx_sps = 0;
+
+            if (sps->num_long_term_ref_pics_sps > 1)
+                lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
+
+            rps->poc[i]  = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
+            rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
+        } else {
+            // Entry coded explicitly in the slice header
+            rps->poc[i]  = get_bits(gb, sps->log2_max_poc_lsb);
+            rps->used[i] = get_bits1(gb);
+        }
+
+        delta_poc_msb_present = get_bits1(gb);
+        if (delta_poc_msb_present) {
+            int64_t delta = get_ue_golomb_long(gb);
+            int64_t poc;
+
+            // delta is coded differentially within each of the
+            // sps-derived and slice-coded groups (restarts at i == nb_sps)
+            if (i && i != nb_sps)
+                delta += prev_delta_msb;
+
+            poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
+            // Reject POCs that don't fit in 32 bits
+            if (poc != (int32_t)poc)
+                return AVERROR_INVALIDDATA;
+            rps->poc[i] = poc;
+            prev_delta_msb = delta;
+        }
+    }
+
+    return 0;
+}
33939+
+// Copy stream-level parameters from the active SPS/VPS into the
+// AVCodecContext: coded & cropped dimensions, reorder depth, profile/
+// level, sample aspect ratio, colour description and frame rate.
+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
+                                 const HEVCRpiSPS *sps)
+{
+    const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
+    const HEVCRpiWindow *ow = &sps->output_window;
+    unsigned int num = 0, den = 0;
+
+    avctx->pix_fmt             = sps->pix_fmt;
+    avctx->coded_width         = sps->width;
+    avctx->coded_height        = sps->height;
+    // Display size = coded size minus the conformance/output window
+    avctx->width               = sps->width  - ow->left_offset - ow->right_offset;
+    avctx->height              = sps->height - ow->top_offset  - ow->bottom_offset;
+    avctx->has_b_frames        = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
+    avctx->profile             = sps->ptl.general_ptl.profile_idc;
+    avctx->level               = sps->ptl.general_ptl.level_idc;
+
+    ff_set_sar(avctx, sps->vui.sar);
+
+    if (sps->vui.video_signal_type_present_flag)
+        avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
+                                                            : AVCOL_RANGE_MPEG;
+    else
+        avctx->color_range = AVCOL_RANGE_MPEG;
+
+    if (sps->vui.colour_description_present_flag) {
+        avctx->color_primaries = sps->vui.colour_primaries;
+        avctx->color_trc       = sps->vui.transfer_characteristic;
+        avctx->colorspace      = sps->vui.matrix_coeffs;
+    } else {
+        avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
+        avctx->color_trc       = AVCOL_TRC_UNSPECIFIED;
+        avctx->colorspace      = AVCOL_SPC_UNSPECIFIED;
+    }
+
+    // VPS timing info takes precedence over VUI timing info
+    if (vps->vps_timing_info_present_flag) {
+        num = vps->vps_num_units_in_tick;
+        den = vps->vps_time_scale;
+    } else if (sps->vui.vui_timing_info_present_flag) {
+        num = sps->vui.vui_num_units_in_tick;
+        den = sps->vui.vui_time_scale;
+    }
+
+    // framerate = time_scale / num_units_in_tick, hence (num, den) land
+    // in (framerate.den, framerate.num)
+    if (num != 0 && den != 0)
+        av_reduce(&avctx->framerate.den, &avctx->framerate.num,
+                  num, den, 1 << 30);
+}
33986+
+// Negotiate the output pixel format. No hardware formats are offered -
+// the candidate list is just the SPS software format - so this normally
+// resolves to sps->pix_fmt via ff_thread_get_format.
+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
+{
+    enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts;
+
+    // Admit to no h/w formats
+
+    *fmt++ = sps->pix_fmt;
+    *fmt = AV_PIX_FMT_NONE;
+
+    return pix_fmts[0] == AV_PIX_FMT_NONE ? AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts);
+}
33998+
+// This decoder only supports Rpi "sand" pixel layouts, and only up to
+// the compiled-in maximum dimensions.
+static int is_sps_supported(const HEVCRpiSPS * const sps)
+{
+    return av_rpi_is_sand_format(sps->pix_fmt) &&
+           sps->width <= HEVC_RPI_MAX_WIDTH &&
+           sps->height <= HEVC_RPI_MAX_HEIGHT;
+}
34005+
+// Switch the decoder to a new SPS (or clear the active one if
+// sps == NULL). Frees and reallocates all frame-size-dependent arrays,
+// reinitialises the prediction/DSP/QPU function tables and the SAO edge
+// buffers. Returns 0 or a negative AVERROR; on failure the context is
+// left with no active SPS.
+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
+                   const enum AVPixelFormat pix_fmt)
+{
+    int ret;
+
+    pic_arrays_free(s);
+    s->ps.sps = NULL;
+    s->ps.vps = NULL;
+
+    if (sps == NULL)
+        return 0;
+
+    if (!is_sps_supported(sps))
+        return AVERROR_DECODER_NOT_FOUND;
+
+    ret = pic_arrays_init(s, sps);
+    if (ret < 0)
+        goto fail;
+
+    export_stream_params(s->avctx, &s->ps, sps);
+
+    s->avctx->pix_fmt = pix_fmt;
+
+    ff_hevc_rpi_pred_init(&s->hpc,     sps->bit_depth);
+    ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
+
+    // * We don't support cross_component_prediction_enabled_flag but as that
+    //   must be 0 unless we have 4:4:4 there is no point testing for it as we
+    //   only deal with sand which is never 4:4:4
+    //   [support wouldn't be hard]
+
+    rpi_hevc_qpu_set_fns(s, sps->bit_depth);
+
+    av_freep(&s->sao_pixel_buffer_h[0]);
+    av_freep(&s->sao_pixel_buffer_v[0]);
+
+    if (sps->sao_enabled)
+    {
+        const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
+        unsigned int c_idx;
+        size_t vsize[3] = {0};
+        size_t hsize[3] = {0};
+
+        for(c_idx = 0; c_idx < c_count; c_idx++) {
+            int w = sps->width >> ctx_hshift(s, c_idx);
+            int h = sps->height >> ctx_vshift(s, c_idx);
+            // ctb height & width are a min of 8 so this must be a multiple of 16
+            // so no point rounding up!
+            hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
+            vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
+        }
+
+        // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
+        // when we have plaited chroma
+        s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
+        s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
+        // These were previously unchecked - a failed alloc would have
+        // been dereferenced by the SAO filter code
+        if (s->sao_pixel_buffer_h[0] == NULL || s->sao_pixel_buffer_v[0] == NULL)
+        {
+            av_freep(&s->sao_pixel_buffer_h[0]);
+            av_freep(&s->sao_pixel_buffer_v[0]);
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
+        s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
+        s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
+        s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
+    }
+
+    s->ps.sps = sps;
+    s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
+
+    return 0;
+
+fail:
+    pic_arrays_free(s);
+    s->ps.sps = NULL;
+    return ret;
+}
34078+
+// QP offset range check: valid chroma QP offsets are -12..+12.
+static inline int qp_offset_valid(const int qp_offset)
+{
+    return qp_offset >= -12 && qp_offset <= 12;
+}
34083+
34084+static int hls_slice_header(HEVCRpiContext * const s)
34085+{
34086+    GetBitContext * const gb = &s->HEVClc->gb;
34087+    RpiSliceHeader * const sh   = &s->sh;
34088+    int i, ret;
34089+
34090+    // Coded parameters
34091+    sh->first_slice_in_pic_flag = get_bits1(gb);
34092+    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
34093+        s->seq_decode = (s->seq_decode + 1) & 0xff;
34094+        s->max_ra     = INT_MAX;
34095+        if (IS_IDR(s))
34096+            ff_hevc_rpi_clear_refs(s);
34097+    }
34098+    sh->no_output_of_prior_pics_flag = 0;
34099+    if (IS_IRAP(s))
34100+        sh->no_output_of_prior_pics_flag = get_bits1(gb);
34101+
34102+    sh->pps_id = get_ue_golomb_long(gb);
34103+    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
34104+        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
34105+        return AVERROR_INVALIDDATA;
34106+    }
34107+    if (!sh->first_slice_in_pic_flag &&
34108+        s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
34109+        av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
34110+        return AVERROR_INVALIDDATA;
34111+    }
34112+    s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
34113+    if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
34114+        sh->no_output_of_prior_pics_flag = 1;
34115+
34116+    if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
34117+        const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
34118+        const HEVCRpiSPS *last_sps = s->ps.sps;
34119+        enum AVPixelFormat pix_fmt;
34120+
34121+        if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
34122+            if (sps->width != last_sps->width || sps->height != last_sps->height ||
34123+                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
34124+                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
34125+                sh->no_output_of_prior_pics_flag = 0;
34126+        }
34127+        ff_hevc_rpi_clear_refs(s);
34128+
34129+        ret = set_sps(s, sps, sps->pix_fmt);
34130+        if (ret < 0)
34131+            return ret;
34132+
34133+        pix_fmt = get_format(s, sps);
34134+        if (pix_fmt < 0)
34135+            return pix_fmt;
34136+
34137+//        ret = set_sps(s, sps, pix_fmt);
34138+//        if (ret < 0)
34139+//            return ret;
34140+
34141+        s->avctx->pix_fmt = pix_fmt;
34142+
34143+        s->seq_decode = (s->seq_decode + 1) & 0xff;
34144+        s->max_ra     = INT_MAX;
34145+    }
34146+
34147+    sh->dependent_slice_segment_flag = 0;
34148+    if (!sh->first_slice_in_pic_flag) {
34149+        int slice_address_length;
34150+
34151+        if (s->ps.pps->dependent_slice_segments_enabled_flag)
34152+            sh->dependent_slice_segment_flag = get_bits1(gb);
34153+
34154+        slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
34155+        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
34156+        if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
34157+            av_log(s->avctx, AV_LOG_ERROR,
34158+                   "Invalid slice segment address: %u.\n",
34159+                   sh->slice_segment_addr);
34160+            return AVERROR_INVALIDDATA;
34161+        }
34162+
34163+        if (!sh->dependent_slice_segment_flag) {
34164+            sh->slice_addr = sh->slice_segment_addr;
34165+            s->slice_idx++;
34166+        }
34167+    } else {
34168+        sh->slice_segment_addr = sh->slice_addr = 0;
34169+        s->slice_idx           = 0;
34170+        s->slice_initialized   = 0;
34171+    }
34172+
34173+    if (!sh->dependent_slice_segment_flag) {
34174+        s->slice_initialized = 0;
34175+
34176+        for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
34177+            skip_bits(gb, 1);  // slice_reserved_undetermined_flag[]
34178+
34179+        sh->slice_type = get_ue_golomb_long(gb);
34180+        if (!(sh->slice_type == HEVC_SLICE_I ||
34181+              sh->slice_type == HEVC_SLICE_P ||
34182+              sh->slice_type == HEVC_SLICE_B)) {
34183+            av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
34184+                   sh->slice_type);
34185+            return AVERROR_INVALIDDATA;
34186+        }
34187+        if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
34188+            av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
34189+            return AVERROR_INVALIDDATA;
34190+        }
34191+
34192+        // when flag is not present, picture is inferred to be output
34193+        sh->pic_output_flag = 1;
34194+        if (s->ps.pps->output_flag_present_flag)
34195+            sh->pic_output_flag = get_bits1(gb);
34196+
34197+        if (s->ps.sps->separate_colour_plane_flag)
34198+            sh->colour_plane_id = get_bits(gb, 2);
34199+
34200+        if (!IS_IDR(s)) {
34201+            int poc, pos;
34202+
34203+            sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
34204+            poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
34205+            if (!sh->first_slice_in_pic_flag && poc != s->poc) {
34206+                av_log(s->avctx, AV_LOG_WARNING,
34207+                       "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
34208+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
34209+                    return AVERROR_INVALIDDATA;
34210+                poc = s->poc;
34211+            }
34212+            s->poc = poc;
34213+
34214+            sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
34215+            pos = get_bits_left(gb);
34216+            if (!sh->short_term_ref_pic_set_sps_flag) {
34217+                ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
34218+                if (ret < 0)
34219+                    return ret;
34220+
34221+                sh->short_term_rps = &sh->slice_rps;
34222+            } else {
34223+                int numbits, rps_idx;
34224+
34225+                if (!s->ps.sps->nb_st_rps) {
34226+                    av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
34227+                    return AVERROR_INVALIDDATA;
34228+                }
34229+
34230+                numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
34231+                rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
34232+                sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
34233+            }
34234+            sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
34235+
34236+            pos = get_bits_left(gb);
34237+            ret = decode_lt_rps(s, &sh->long_term_rps, gb);
34238+            if (ret < 0) {
34239+                av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
34240+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
34241+                    return AVERROR_INVALIDDATA;
34242+            }
34243+            sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
34244+
34245+            if (s->ps.sps->sps_temporal_mvp_enabled_flag)
34246+                sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
34247+            else
34248+                sh->slice_temporal_mvp_enabled_flag = 0;
34249+        } else {
34250+            s->sh.short_term_rps = NULL;
34251+            s->poc               = 0;
34252+        }
34253+
34254+        /* 8.3.1 */
34255+        if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
34256+            s->nal_unit_type != HEVC_NAL_TRAIL_N &&
34257+            s->nal_unit_type != HEVC_NAL_TSA_N   &&
34258+            s->nal_unit_type != HEVC_NAL_STSA_N  &&
34259+            s->nal_unit_type != HEVC_NAL_RADL_N  &&
34260+            s->nal_unit_type != HEVC_NAL_RADL_R  &&
34261+            s->nal_unit_type != HEVC_NAL_RASL_N  &&
34262+            s->nal_unit_type != HEVC_NAL_RASL_R)
34263+            s->pocTid0 = s->poc;
34264+
34265+        if (s->ps.sps->sao_enabled) {
34266+            sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
34267+            if (ctx_cfmt(s) != 0) {
34268+                sh->slice_sample_adaptive_offset_flag[1] =
34269+                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
34270+            }
34271+        } else {
34272+            sh->slice_sample_adaptive_offset_flag[0] = 0;
34273+            sh->slice_sample_adaptive_offset_flag[1] = 0;
34274+            sh->slice_sample_adaptive_offset_flag[2] = 0;
34275+        }
34276+
34277+        sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
34278+        if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
34279+            int nb_refs;
34280+
34281+            sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
34282+            if (sh->slice_type == HEVC_SLICE_B)
34283+                sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
34284+
34285+            if (get_bits1(gb)) { // num_ref_idx_active_override_flag
34286+                sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
34287+                if (sh->slice_type == HEVC_SLICE_B)
34288+                    sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
34289+            }
34290+            if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
34291+                av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
34292+                       sh->nb_refs[L0], sh->nb_refs[L1]);
34293+                return AVERROR_INVALIDDATA;
34294+            }
34295+
34296+            sh->rpl_modification_flag[0] = 0;
34297+            sh->rpl_modification_flag[1] = 0;
34298+            nb_refs = ff_hevc_rpi_frame_nb_refs(s);
34299+            if (!nb_refs) {
34300+                av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
34301+                return AVERROR_INVALIDDATA;
34302+            }
34303+
34304+            if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
34305+                sh->rpl_modification_flag[0] = get_bits1(gb);
34306+                if (sh->rpl_modification_flag[0]) {
34307+                    for (i = 0; i < sh->nb_refs[L0]; i++)
34308+                        sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
34309+                }
34310+
34311+                if (sh->slice_type == HEVC_SLICE_B) {
34312+                    sh->rpl_modification_flag[1] = get_bits1(gb);
34313+                    if (sh->rpl_modification_flag[1] == 1)
34314+                        for (i = 0; i < sh->nb_refs[L1]; i++)
34315+                            sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
34316+                }
34317+            }
34318+
34319+            if (sh->slice_type == HEVC_SLICE_B)
34320+                sh->mvd_l1_zero_flag = get_bits1(gb);
34321+
34322+            if (s->ps.pps->cabac_init_present_flag)
34323+                sh->cabac_init_flag = get_bits1(gb);
34324+            else
34325+                sh->cabac_init_flag = 0;
34326+
34327+            sh->collocated_ref_idx = 0;
34328+            if (sh->slice_temporal_mvp_enabled_flag) {
34329+                sh->collocated_list = L0;
34330+                if (sh->slice_type == HEVC_SLICE_B)
34331+                    sh->collocated_list = !get_bits1(gb);
34332+
34333+                if (sh->nb_refs[sh->collocated_list] > 1) {
34334+                    sh->collocated_ref_idx = get_ue_golomb_long(gb);
34335+                    if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
34336+                        av_log(s->avctx, AV_LOG_ERROR,
34337+                               "Invalid collocated_ref_idx: %d.\n",
34338+                               sh->collocated_ref_idx);
34339+                        return AVERROR_INVALIDDATA;
34340+                    }
34341+                }
34342+            }
34343+
34344+            if ((s->ps.pps->weighted_pred_flag   && sh->slice_type == HEVC_SLICE_P) ||
34345+                (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
34346+            {
34347+                if ((ret = pred_weight_table(s, gb)) != 0)
34348+                    return ret;
34349+            }
34350+            else
34351+            {
34352+                // Give us unit weights
34353+                default_pred_weight_table(s);
34354+            }
34355+
34356+            sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
34357+            if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
34358+                av_log(s->avctx, AV_LOG_ERROR,
34359+                       "Invalid number of merging MVP candidates: %d.\n",
34360+                       sh->max_num_merge_cand);
34361+                return AVERROR_INVALIDDATA;
34362+            }
34363+        }
34364+
34365+        sh->slice_qp_delta = get_se_golomb(gb);
34366+
34367+        if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
34368+            sh->slice_cb_qp_offset = get_se_golomb(gb);
34369+            sh->slice_cr_qp_offset = get_se_golomb(gb);
34370+            if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
34371+                !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
34372+                !qp_offset_valid(sh->slice_cr_qp_offset) ||
34373+                !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
34374+            {
34375+                av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n",
34376+                       sh->slice_cr_qp_offset, sh->slice_cr_qp_offset,
34377+                       s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset);
34378+                return AVERROR_INVALIDDATA;
34379+            }
34380+        } else
34381+        {
34382+            sh->slice_cb_qp_offset = 0;
34383+            sh->slice_cr_qp_offset = 0;
34384+        }
34385+
34386+        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
34387+            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
34388+        else
34389+            sh->cu_chroma_qp_offset_enabled_flag = 0;
34390+
34391+        if (s->ps.pps->deblocking_filter_control_present_flag) {
34392+            int deblocking_filter_override_flag = 0;
34393+
34394+            if (s->ps.pps->deblocking_filter_override_enabled_flag)
34395+                deblocking_filter_override_flag = get_bits1(gb);
34396+
34397+            if (deblocking_filter_override_flag) {
34398+                sh->disable_deblocking_filter_flag = get_bits1(gb);
34399+                if (!sh->disable_deblocking_filter_flag) {
34400+                    int beta_offset_div2 = get_se_golomb(gb);
34401+                    int tc_offset_div2   = get_se_golomb(gb) ;
34402+                    if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
34403+                        tc_offset_div2   < -6 || tc_offset_div2   > 6) {
34404+                        av_log(s->avctx, AV_LOG_ERROR,
34405+                            "Invalid deblock filter offsets: %d, %d\n",
34406+                            beta_offset_div2, tc_offset_div2);
34407+                        return AVERROR_INVALIDDATA;
34408+                    }
34409+                    sh->beta_offset = beta_offset_div2 * 2;
34410+                    sh->tc_offset   =   tc_offset_div2 * 2;
34411+                }
34412+            } else {
34413+                sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
34414+                sh->beta_offset                    = s->ps.pps->beta_offset;
34415+                sh->tc_offset                      = s->ps.pps->tc_offset;
34416+            }
34417+        } else {
34418+            sh->disable_deblocking_filter_flag = 0;
34419+            sh->beta_offset                    = 0;
34420+            sh->tc_offset                      = 0;
34421+        }
34422+
34423+        if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
34424+            (sh->slice_sample_adaptive_offset_flag[0] ||
34425+             sh->slice_sample_adaptive_offset_flag[1] ||
34426+             !sh->disable_deblocking_filter_flag)) {
34427+            sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
34428+        } else {
34429+            sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
34430+        }
34431+        sh->no_dblk_boundary_flags =
34432+            (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
34433+                BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
34434+            (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
34435+                BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
34436+
34437+
34438+    } else if (!s->slice_initialized) {
34439+        av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
34440+        return AVERROR_INVALIDDATA;
34441+    }
34442+
34443+    sh->num_entry_point_offsets = 0;
34444+    sh->offload_wpp = 0;
34445+    sh->offload_tiles = 0;
34446+
34447+    if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
34448+        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
34449+        // It would be possible to bound this tighter but this here is simpler
34450+        if (num_entry_point_offsets > get_bits_left(gb)) {
34451+            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
34452+            return AVERROR_INVALIDDATA;
34453+        }
34454+
34455+        sh->num_entry_point_offsets = num_entry_point_offsets;
34456+        if (sh->num_entry_point_offsets > 0) {
34457+            int offset_len = get_ue_golomb_long(gb) + 1;
34458+
34459+            if (offset_len < 1 || offset_len > 32) {
34460+                sh->num_entry_point_offsets = 0;
34461+                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
34462+                return AVERROR_INVALIDDATA;
34463+            }
34464+
34465+            if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
34466+            {
34467+                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
34468+                return ret;
34469+            }
34470+
34471+            for (i = 0; i < sh->num_entry_point_offsets; i++) {
34472+                uint32_t val_minus1 = get_bits_long(gb, offset_len);
34473+                if (val_minus1 > (1 << 28))
34474+                {
34475+                    // We can declare offsets of > 2^28 bad without loss of generality
34476+                    // Will check actual bounds wrt NAL later, but this keeps
34477+                    // the values within bounds we can deal with easily
34478+                    av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
34479+                    return AVERROR_INVALIDDATA;
34480+                }
34481+                sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
34482+            }
34483+
34484+            // Do we want to offload this
34485+            if (s->threads_type != 0)
34486+            {
34487+                sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
34488+                    s->ps.pps->num_tile_columns > 1;
34489+                // * We only cope with WPP in a single column
34490+                //   Probably want to deal with that case as tiles rather than WPP anyway
34491+                // ?? Not actually sure that the main code deals with WPP + multi-col correctly
34492+                sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
34493+                    s->ps.pps->num_tile_columns == 1;
34494+            }
34495+        }
34496+    }
34497+
34498+    if (s->ps.pps->slice_header_extension_present_flag) {
34499+        unsigned int length = get_ue_golomb_long(gb);
34500+        if (length*8LL > get_bits_left(gb)) {
34501+            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
34502+            return AVERROR_INVALIDDATA;
34503+        }
34504+        for (i = 0; i < length; i++)
34505+            skip_bits(gb, 8);  // slice_header_extension_data_byte
34506+    }
34507+
34508+    // Inferred parameters
34509+    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
34510+    if (sh->slice_qp > 51 ||
34511+        sh->slice_qp < -s->ps.sps->qp_bd_offset) {
34512+        av_log(s->avctx, AV_LOG_ERROR,
34513+               "The slice_qp %d is outside the valid range "
34514+               "[%d, 51].\n",
34515+               sh->slice_qp,
34516+               -s->ps.sps->qp_bd_offset);
34517+        return AVERROR_INVALIDDATA;
34518+    }
34519+
34520+    if (get_bits_left(gb) < 0) {
34521+        av_log(s->avctx, AV_LOG_ERROR,
34522+               "Overread slice header by %d bits\n", -get_bits_left(gb));
34523+        return AVERROR_INVALIDDATA;
34524+    }
34525+
34526+    s->slice_initialized = 1;
34527+    return 0;
34528+}
34529+
34530+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
34531+{
34532+    RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
34533+    int c_idx, i;
34534+
34535+    if (s->sh.slice_sample_adaptive_offset_flag[0] ||
34536+        s->sh.slice_sample_adaptive_offset_flag[1]) {
34537+        if ((lc->ctb_avail & AVAIL_L) != 0)
34538+        {
34539+            const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
34540+            if (sao_merge_left_flag) {
34541+                *sao = sao[-1];
34542+                return;
34543+            }
34544+        }
34545+        if ((lc->ctb_avail & AVAIL_U) != 0)
34546+        {
34547+            const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
34548+            if (sao_merge_up_flag) {
34549+                *sao = sao[-(int)s->ps.sps->ctb_width];
34550+                return;
34551+            }
34552+        }
34553+    }
34554+
34555+    for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
34556+        const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
34557+                                                 s->ps.pps->log2_sao_offset_scale_chroma;
34558+        int offset_abs[4];
34559+        char offset_sign[4] = {0};
34560+
34561+        if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
34562+            sao->type_idx[c_idx] = SAO_NOT_APPLIED;
34563+            continue;
34564+        }
34565+
34566+        if (c_idx == 2) {
34567+            sao->type_idx[2] = sao->type_idx[1];
34568+            sao->eo_class[2] = sao->eo_class[1];
34569+        } else {
34570+            sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
34571+        }
34572+
34573+        // ** Could use BY22 here quite plausibly - this is all bypass stuff
34574+        //    though only per CTB so not very timing critical
34575+
34576+        if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
34577+            continue;
34578+
34579+        for (i = 0; i < 4; i++)
34580+            offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
34581+
34582+        if (sao->type_idx[c_idx] == SAO_BAND) {
34583+            for (i = 0; i < 4; i++) {
34584+                if (offset_abs[i] != 0)
34585+                    offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
34586+            }
34587+            sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
34588+        } else if (c_idx != 2) {
34589+            sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
34590+        }
34591+
34592+        // Inferred parameters
34593+        sao->offset_val[c_idx][0] = 0;
34594+        for (i = 0; i < 4; i++) {
34595+            sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
34596+            if (sao->type_idx[c_idx] == SAO_EDGE) {
34597+                if (i > 1)
34598+                    sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
34599+            } else if (offset_sign[i]) {
34600+                sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
34601+            }
34602+        }
34603+    }
34604+}
34605+
34606+#if 0
34607+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
34608+    int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx);  // 0..4
34609+
34610+    if (log2_res_scale_abs_plus1 !=  0) {
34611+        int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
34612+        lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
34613+                               (1 - 2 * res_scale_sign_flag);
34614+    } else {
34615+        lc->tu.res_scale_val = 0;
34616+    }
34617+
34618+
34619+    return 0;
34620+}
34621+#endif
34622+
34623+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
34624+{
34625+    return jb->intra.cmds + jb->intra.n++;
34626+}
34627+
34628+#define A0(x, y, U, L, UL, UR, DL) \
34629+    [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
34630+
34631+#define A1(x, y, U, L, UL, UR, DL) \
34632+    A0((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A0((x) + 1, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
34633+    A0((x) + 0, (y) + 1,  1,   (L),  (L),   1,   (DL)),  A0((x) + 1, (y) + 1,  1,    1,    1,    0,    0  )
34634+
34635+#define A2(x, y, U, L, UL, UR, DL) \
34636+    A1((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A1((x) + 2, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
34637+    A1((x) + 0, (y) + 2,  1,   (L),  (L),   1,   (DL)),  A1((x) + 2, (y) + 2,  1,    1,    1,    0,    0  )
34638+
34639+#define A3(x, y, U, L, UL, UR, DL) \
34640+    A2((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A2((x) + 4, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
34641+    A2((x) + 0, (y) + 4,  1,   (L),  (L),   1,   (DL)),  A2((x) + 4, (y) + 4,  1,    1,    1,    0,    0  )
34642+
34643+#define A4(x, y, U, L, UL, UR, DL) \
34644+    A3((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A3((x) + 8, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
34645+    A3((x) + 0, (y) + 8,  1,   (L),  (L),   1,   (DL)),  A3((x) + 8, (y) + 8,  1,    1,    1,    0,    0  )
34646+
34647+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
34648+
34649+unsigned int ff_hevc_rpi_tb_avail_flags(
34650+    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
34651+    const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
34652+{
34653+    const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
34654+    const unsigned int tb_x = x & ~ctb_mask;
34655+    const unsigned int tb_y = y & ~ctb_mask;
34656+    const unsigned int ctb_avail = lc->ctb_avail;
34657+
34658+    const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
34659+
34660+    unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
34661+
34662+    // This deals with both the U & L edges
34663+    if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
34664+        f |= AVAIL_UL;
34665+
34666+    if (x + w < lc->end_of_ctb_x)
34667+        f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
34668+    else if (tb_y == 0)
34669+        f |= (ctb_avail & AVAIL_UR);
34670+#if AVAIL_S_U - AVAIL_S_UR < 0
34671+#error Shift problem
34672+#endif
34673+
34674+    // Never any D if Y beyond eoctb
34675+    if (y + h < lc->end_of_ctb_y)
34676+        f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
34677+#if AVAIL_S_DL - AVAIL_S_L < 0
34678+#error Shift problem
34679+#endif
34680+
34681+//    printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
34682+//           lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
34683+//           lc->end_of_ctb_x, lc->end_of_ctb_y);
34684+
34685+    return f;
34686+}
34687+
34688+#undef A0
34689+#undef A1
34690+#undef A2
34691+#undef A3
34692+#undef A4
34693+
34694+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
34695+                          unsigned int avail)
34696+{
34697+    // If rpi_enabled then sand - U & V done on U call
34698+    if (c_idx <= 1)
34699+    {
34700+        HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
34701+        cmd->type = RPI_PRED_INTRA + c_idx;
34702+        cmd->size = log2_trafo_size;
34703+        cmd->avail = avail;
34704+        cmd->i_pred.x = x0;
34705+        cmd->i_pred.y = y0;
34706+        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
34707+
34708+//        printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
34709+    }
34710+}
34711+
34712+#define CBF_CB0_S 0
34713+#define CBF_CB1_S 1 // CB1 must be CB0 + 1
34714+#define CBF_CR0_S 2
34715+#define CBF_CR1_S 3
34716+
34717+#define CBF_CB0 (1 << CBF_CB0_S)
34718+#define CBF_CR0 (1 << CBF_CR0_S)
34719+#define CBF_CB1 (1 << CBF_CB1_S)
34720+#define CBF_CR1 (1 << CBF_CR1_S)
34721+
34722+// * Only good for chroma_idx == 1
34723+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
34724+                              const unsigned int x0, const unsigned int y0,
34725+                              const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
34726+                              const unsigned int blk_idx, const int cbf_luma,
34727+                              const unsigned int cbf_chroma)
34728+{
34729+    const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);
34730+    const unsigned int x0_c = x0 & ~7;
34731+    const unsigned int y0_c = y0 & ~7;
34732+
34733+    enum ScanType scan_idx   = SCAN_DIAG;
34734+    enum ScanType scan_idx_c = SCAN_DIAG;
34735+
34736+    if (lc->cu.pred_mode == MODE_INTRA)
34737+    {
34738+        const unsigned int trafo_size = 1 << log2_trafo_size;
34739+        const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
34740+
34741+        do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
34742+
34743+        if (log2_trafo_size > 2)
34744+            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
34745+        else if (blk_idx == 3)
34746+            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1,
34747+                          ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8));
34748+
34749+        if (log2_trafo_size < 4) {
34750+            if (lc->tu.intra_pred_mode >= 6 &&
34751+                lc->tu.intra_pred_mode <= 14) {
34752+                scan_idx = SCAN_VERT;
34753+            } else if (lc->tu.intra_pred_mode >= 22 &&
34754+                       lc->tu.intra_pred_mode <= 30) {
34755+                scan_idx = SCAN_HORIZ;
34756+            }
34757+
34758+            if (lc->tu.intra_pred_mode_c >=  6 &&
34759+                lc->tu.intra_pred_mode_c <= 14) {
34760+                scan_idx_c = SCAN_VERT;
34761+            } else if (lc->tu.intra_pred_mode_c >= 22 &&
34762+                       lc->tu.intra_pred_mode_c <= 30) {
34763+                scan_idx_c = SCAN_HORIZ;
34764+            }
34765+        }
34766+    }
34767+
34768+    if (!cbf_luma && cbf_chroma == 0)
34769+        return 0;
34770+
34771+    if (lc->tu.is_cu_qp_delta_wanted)
34772+    {
34773+        const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
34774+        const unsigned int cb_mask = ~0U << log2_cb_size;
34775+
34776+        if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
34777+            qp_delta >  (25 + (s->ps.sps->qp_bd_offset >> 1)))
34778+        {
34779+            av_log(s->avctx, AV_LOG_ERROR,
34780+                   "The cu_qp_delta %d is outside the valid range "
34781+                   "[%d, %d].\n",
34782+                   qp_delta,
34783+                   -(26 + (s->ps.sps->qp_bd_offset >> 1)),
34784+                    (25 + (s->ps.sps->qp_bd_offset >> 1)));
34785+            return AVERROR_INVALIDDATA;
34786+        }
34787+
34788+        lc->tu.is_cu_qp_delta_wanted = 0;
34789+        lc->tu.cu_qp_delta = qp_delta;
34790+        ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
34791+    }
34792+
34793+    // * Not main profile & untested due to no conform streams
34794+    if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
34795+        !lc->cu.cu_transquant_bypass_flag) {
34796+        int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
34797+        if (cu_chroma_qp_offset_flag) {
34798+            int cu_chroma_qp_offset_idx  = 0;
34799+            if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
34800+                cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
34801+            }
34802+            lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
34803+            lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
34804+        }
34805+        lc->tu.cu_chroma_qp_offset_wanted = 0;
34806+    }
34807+
34808+    if (cbf_luma)
34809+        ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
34810+
34811+    if (log2_trafo_size > 2 || blk_idx == 3)
34812+    {
34813+        if ((cbf_chroma & CBF_CB0) != 0)
34814+            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
34815+                                        log2_trafo_size_c, scan_idx_c, 1);
34816+        if ((cbf_chroma & CBF_CR0) != 0)
34817+            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
34818+                                        log2_trafo_size_c, scan_idx_c, 2);
34819+    }
34820+
34821+    return 0;
34822+}
34823+
34824+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
34825+{
34826+    set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3);
34827+}
34828+
34829+
34830+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
34831+                              const unsigned int x0, const unsigned int y0,
34832+                              const unsigned int log2_trafo_size,
34833+                              const unsigned int trafo_depth, const unsigned int blk_idx,
34834+                              const unsigned int cbf_c0)
34835+{
34836+    // When trafo_size == 2 hls_transform_unit uses c0 so put in c1
34837+    unsigned int cbf_c1 = cbf_c0;
34838+    int split_transform_flag;
34839+    int ret;
34840+
34841+    if (lc->cu.intra_split_flag) {
34842+        if (trafo_depth == 1) {
34843+            lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[blk_idx];
34844+            if (ctx_cfmt(s) == 3) {
34845+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
34846+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[blk_idx];
34847+            } else {
34848+                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
34849+                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
34850+            }
34851+        }
34852+    } else {
34853+        lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[0];
34854+        lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
34855+        lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
34856+    }
34857+
34858+    if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
34859+        log2_trafo_size >  s->ps.sps->log2_min_tb_size    &&
34860+        trafo_depth     < lc->cu.max_trafo_depth       &&
34861+        !(lc->cu.intra_split_flag && trafo_depth == 0))
34862+    {
34863+        split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
34864+    } else {
34865+        int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
34866+                          lc->cu.pred_mode == MODE_INTER &&
34867+                          lc->cu.part_mode != PART_2Nx2N &&
34868+                          trafo_depth == 0;
34869+
34870+        split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
34871+                               (lc->cu.intra_split_flag && trafo_depth == 0) ||
34872+                               inter_split;
34873+    }
34874+
34875+    if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
34876+    {
34877+        const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);
34878+        cbf_c1 = 0;
34879+
34880+        if ((cbf_c0 & CBF_CB0) != 0)
34881+        {
34882+            cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
34883+            if (wants_c1)
34884+                cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
34885+        }
34886+
34887+        if ((cbf_c0 & CBF_CR0) != 0)
34888+        {
34889+            cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
34890+            if (wants_c1)
34891+                cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
34892+        }
34893+    }
34894+
34895+    if (split_transform_flag) {
34896+        const int trafo_size_split = 1 << (log2_trafo_size - 1);
34897+        const int x1 = x0 + trafo_size_split;
34898+        const int y1 = y0 + trafo_size_split;
34899+
34900+#define SUBDIVIDE(x, y, idx)                                                    \
34901+do {                                                                            \
34902+    ret = hls_transform_tree(s, lc, x, y,                                       \
34903+                             log2_trafo_size - 1, trafo_depth + 1, idx,         \
34904+                             cbf_c1);                                           \
34905+    if (ret < 0)                                                                \
34906+        return ret;                                                             \
34907+} while (0)
34908+
34909+        SUBDIVIDE(x0, y0, 0);
34910+        SUBDIVIDE(x1, y0, 1);
34911+        SUBDIVIDE(x0, y1, 2);
34912+        SUBDIVIDE(x1, y1, 3);
34913+
34914+#undef SUBDIVIDE
34915+    } else {
34916+        // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
34917+        // trafo_size == 2 with depth == 0 the issue is moot
34918+        const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||
34919+            ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
34920+
34921+        ret = hls_transform_unit(s, lc, x0, y0,
34922+                                 log2_trafo_size + trafo_depth, log2_trafo_size,
34923+                                 blk_idx, cbf_luma, cbf_c1);
34924+        if (ret < 0)
34925+            return ret;
34926+
34927+        if (!s->sh.disable_deblocking_filter_flag) {
34928+            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
34929+        }
34930+    }
34931+    return 0;
34932+}
34933+
34934+
34935+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
34936+{
34937+    GetBitContext gb;
34938+    int ret;
34939+
34940+    ret = init_get_bits(&gb, pcm, length);
34941+    if (ret < 0)
34942+        return ret;
34943+
34944+    s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
34945+                       frame_stride1(s->frame, 0),
34946+                       cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
34947+
34948+    s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
34949+                       s->frame->linesize[1],
34950+                       cb_size >> ctx_hshift(s, 1),
34951+                       cb_size >> ctx_vshift(s, 1),
34952+                       &gb, s->ps.sps->pcm.bit_depth_chroma);
34953+
34954+    return 0;
34955+}
34956+
34957+
34958+// x * 2^(y*2)
34959+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
34960+{
34961+    return x << (y * 2);
34962+}
34963+
34964+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) // Parse an IPCM CU: skip its raw bytes in the CABAC stream and queue a PCM prediction command
34965+{
34966+    // Length in bits: luma plane plus the two chroma planes at their subsampled sizes
34967+    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
34968+        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
34969+        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); // NOTE(review): both chroma terms use ctx_vshift only (no hshift) - confirm intended for the sand format
34970+
34971+    const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); // advance CABAC past the byte-aligned PCM payload
34972+
34973+    if (!s->sh.disable_deblocking_filter_flag)
34974+        ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0)
34975+
34976+    // Copy coeffs
34977+    {
34978+        const int blen = (length + 7) >> 3; // payload length in bytes
34979+        // Round allocated bytes up to nearest 32 to avoid alignment confusion
34980+        // Allocation is in int16_t s
34981+        // As we are only using 1 byte per sample and the coeff buffer allows 2 per
34982+        // sample this rounding doesn't affect the total size we need to allocate for
34983+        // the coeff buffer
34984+        int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); // NOTE(review): result and `pcm` are used unchecked - confirm neither can be NULL here
34985+        memcpy(coeffs, pcm, blen);
34986+
34987+        // Our coeff stash assumes that any partially allocated 64byte lump
34988+        // is zeroed so make that true.
34989+        {
34990+            uint8_t * const eopcm = (uint8_t *)coeffs + blen;
34991+            if ((-(intptr_t)eopcm & 63) != 0) // bytes remaining to the next 64-byte boundary
34992+                memset(eopcm, 0, -(intptr_t)eopcm & 63);
34993+        }
34994+
34995+        // Add command
34996+        {
34997+            HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); // deferred: actual sample writes happen when the job runs
34998+            cmd->type = RPI_PRED_I_PCM;
34999+            cmd->size = log2_cb_size;
35000+            cmd->i_pcm.src = coeffs;
35001+            cmd->i_pcm.x = x0;
35002+            cmd->i_pcm.y = y0;
35003+            cmd->i_pcm.src_len = length; // bits, not bytes
35004+        }
35005+        return 0;
35006+    }
35007+}
35008+
35009+
35010+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, // Record the lowest decoded row of `ref` this job will need; max over all MVs is kept
35011+                                const MvXY xy, const int y0, const int height)
35012+{
35013+    if (s->threads_type != 0) { // only relevant when frame threading is active
35014+        const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); // integer-pel MV y + block bottom + filter margin
35015+
35016+        // Progress has to be attached to current job as the actual wait
35017+        // is in worker_core which can't use lc
35018+        int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
35019+        if (*pr < y) { // keep the per-reference maximum
35020+            *pr = y;
35021+        }
35022+    }
35023+}
35024+
35025+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Decode explicit (non-merge / AMVP) motion data for a PU into *mv: pred dir, ref idx, MVD + predictor per list
35026+                                  const int x0, const int y0, const int nPbW,
35027+                                  const int nPbH,
35028+                                  HEVCRpiMvField * const mv)
35029+{
35030+    enum InterPredIdc inter_pred_idc = PRED_L0; // P slices are always L0-only
35031+    int mvp_flag;
35032+    const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
35033+
35034+    mv->pred_flag = 0;
35035+    if (s->sh.slice_type == HEVC_SLICE_B) // only B slices signal the prediction direction
35036+        inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
35037+
35038+    if (inter_pred_idc != PRED_L1) { // L0 or BI: decode list-0 motion
35039+        MvXY mvd;
35040+
35041+        if (s->sh.nb_refs[L0])
35042+            mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
35043+
35044+        mv->pred_flag = PF_L0;
35045+        mvd = ff_hevc_rpi_hls_mvd_coding(lc);
35046+        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); // selects between the two AMVP candidates
35047+        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
35048+                                 mv, mvp_flag, 0);
35049+        mv->xy[0] = mvxy_add(mv->xy[0], mvd); // final MV = predictor + difference
35050+    }
35051+
35052+    if (inter_pred_idc != PRED_L0) { // L1 or BI: decode list-1 motion
35053+        MvXY mvd = 0;
35054+
35055+        if (s->sh.nb_refs[L1])
35056+            mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
35057+
35058+        if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) // mvd_l1_zero forces zero MVD only for BI PUs
35059+            mvd = ff_hevc_rpi_hls_mvd_coding(lc);
35060+
35061+        mv->pred_flag += PF_L1; // PF_L0 + PF_L1 == PF_BI
35062+        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
35063+        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
35064+                                 mv, mvp_flag, 1);
35065+        mv->xy[1] = mvxy_add(mv->xy[1], mvd);
35066+    }
35067+}
35068+
35069+
35070+static HEVCRpiInterPredQ * // Pick the least-loaded QPU queue in the current group, charge it `load_val`, and link command `fn` into it
35071+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
35072+{
35073+    HEVCRpiInterPredQ * yp = NULL; // NOTE(review): dereferenced below - relies on n_grp >= 1 so the loop always selects a queue
35074+    HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
35075+    const unsigned int max_fill = ipe->max_fill;
35076+    unsigned int load = UINT_MAX;
35077+
35078+    for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
35079+        // We will always have enough room between the Qs but if we are
35080+        // running critically low due to poor scheduling then use fill size
35081+        // rather than load to determine QPU.  This has obvious dire
35082+        // performance implications but (a) it is better than crashing
35083+        // and (b) it should (almost) never happen
35084+        const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; // bytes of commands already queued
35085+        const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load; // huge penalty forces fill-based choice when nearly full
35086+
35087+        if (tload < load)
35088+        {
35089+            yp = ypt;
35090+            load = tload;
35091+        }
35092+    }
35093+
35094+    yp->load += load_val;
35095+    ipe->used_grp = 1; // mark group dirty so next-ctu advances it
35096+    qpu_mc_link_set(yp->qpu_mc_curr, fn);
35097+
35098+    return yp;
35099+}
35100+
35101+
35102+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) // Append a sync command to every queue and rebase per-queue load on current fill
35103+{
35104+    for (unsigned int i = 0; i != ipe->n; ++i) {
35105+        HEVCRpiInterPredQ * const q = ipe->q + i;
35106+        const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; // bytes queued so far
35107+
35108+        qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
35109+        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1); // advance past the sync command just written
35110+        q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
35111+    }
35112+}
35113+
35114+// Returns 0 on success
35115+// We no longer check for Q fullness as we have emergency code in ctu alloc
35116+// * However it might be an idea to have some means of spotting that we've used it
35117+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) // Advance to the next queue group after a CTU; sync + wrap when all groups used
35118+{
35119+    if (!ipe->used_grp) // nothing queued this CTU - stay on the current group
35120+        return 0;
35121+
35122+    if ((ipe->curr += ipe->n_grp) >= ipe->n)
35123+    {
35124+        ipe->curr = 0; // wrapped: all queues touched, insert sync points
35125+        rpi_inter_pred_sync(ipe);
35126+    }
35127+    ipe->used = 1;
35128+    ipe->used_grp = 0;
35129+
35130+    return 0;
35131+}
35132+
35133+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) // Reset all queues to empty, ready for a new job
35134+{
35135+    unsigned int i;
35136+
35137+    ipe->curr = 0;
35138+    ipe->used = 0;
35139+    ipe->used_grp = 0;
35140+    for (i = 0; i != ipe->n; ++i) {
35141+        HEVCRpiInterPredQ * const q = ipe->q + i;
35142+        q->qpu_mc_curr = q->qpu_mc_base; // rewind write pointer
35143+        q->load = 0;
35144+        q->last_l0 = NULL; // no pending source-link slots
35145+        q->last_l1 = NULL;
35146+    }
35147+}
35148+
35149+static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, // Allocate queue array + GPU command memory for an inter-pred env; 0 on success, AVERROR on failure
35150+                                 const unsigned int n_max, const unsigned int n_grp,
35151+                                 const unsigned int total_size, const unsigned int min_gap)
35152+{
35153+    int rv;
35154+
35155+    memset(ipe, 0, sizeof(*ipe)); // zeroes fields not set here (n, max_fill, ...); presumably filled in later by the caller - TODO confirm
35156+    if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL)
35157+        return AVERROR(ENOMEM);
35158+
35159+    ipe->n_grp = n_grp;
35160+    ipe->min_gap = min_gap;
35161+
35162+    if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0)
35163+        av_freep(&ipe->q); // undo queue alloc on GPU-memory failure
35164+    return rv;
35165+}
35166+
35167+
35168+#if RPI_QPU_EMU_Y // emulation builds address plane data via ARM pointers; real builds use VC (GPU) bus addresses
35169+#define get_mc_address_y(f) ((f)->data[0])
35170+#else
35171+#define get_mc_address_y(f) get_vc_address_y(f)
35172+#endif
35173+#if RPI_QPU_EMU_C
35174+#define get_mc_address_u(f) ((f)->data[1])
35175+#else
35176+#define get_mc_address_u(f) get_vc_address_u(f)
35177+#endif
35178+
35179+static inline uint32_t pack_wo_p(const int off, const int mul) // Pack uni-pred weight/offset into one word: (2*off + 1, mul)
35180+{
35181+    return PACK2(off * 2 + 1, mul);
35182+}
35183+
35184+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) // Pack bi-pred weight/offset: (off0 + off1 + 1, mul)
35185+{
35186+    return PACK2(off0 + off1 + 1, mul);
35187+}
35188+
35189+
35190+static void // Queue QPU commands for uni-pred luma MC of an nPbW x nPbH block at (x0,y0) with motion vector mv_xy and weight/offset
35191+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
35192+           const int x0, const int y0,
35193+           const int nPbW, const int nPbH,
35194+           const MvXY mv_xy,
35195+           const int weight_mul,
35196+           const int weight_offset,
35197+           AVFrame *const src_frame)
35198+{
35199+    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
35200+    const unsigned int mx          = MV_X(mv_xy) & 3; // quarter-pel fractional parts
35201+    const unsigned int my          = MV_Y(mv_xy) & 3;
35202+    const unsigned int my_mx       = (my << 8) | mx;
35203+    const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx; // same fraction duplicated for both 8-wide halves
35204+    const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
35205+    qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
35206+    const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
35207+    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
35208+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
35209+
35210+    if (my_mx == 0) // integer-pel MV: no interpolation filter needed, use the cheap p00 path
35211+    {
35212+        const int x1 = x0 + (MV_X(mv_xy) >> 2);
35213+        const int y1 = y0 + (MV_Y(mv_xy) >> 2);
35214+        const int bh = nPbH;
35215+
35216+        for (int start_x = 0; start_x < nPbW; start_x += 16) // 16-wide chunks
35217+        {
35218+            const int bw = FFMIN(nPbW - start_x, 16);
35219+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
35220+            qpu_mc_src_t *const src1 = yp->last_l0; // fill in the src slot left by the previous command
35221+            qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
35222+
35223+#if RPI_TSTATS
35224+            {
35225+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
35226+                ++ts->y_pred1_x0y0;
35227+
35228+                if (nPbW > 8)
35229+                    ++ts->y_pred1_wgt8;
35230+                else
35231+                    ++ts->y_pred1_wle8;
35232+
35233+                if (nPbH > 16)
35234+                    ++ts->y_pred1_hgt16;
35235+                else
35236+                    ++ts->y_pred1_hle16;
35237+            }
35238+#endif
35239+
35240+            src1->x = x1 + start_x;
35241+            src1->y = y1;
35242+            src1->base = src_vc_address_y;
35243+            cmd_y->w = bw;
35244+            cmd_y->h = bh;
35245+            cmd_y->wo1 = wo;
35246+            cmd_y->dst_addr =  dst_addr + (start_x << xshl);
35247+            yp->last_l0 = &cmd_y->next_src1; // next command will patch this link
35248+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
35249+        }
35250+    }
35251+    else
35252+    {
35253+        const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; // 8-tap filter needs a 3-pel run-up
35254+        const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
35255+        const unsigned int bh = nPbH;
35256+        int start_x = 0;
35257+
35258+#if 1
35259+        // As Y-pred operates on two independant 8-wide src blocks we can merge
35260+        // this pred with the previous one if it the previous one is 8 pel wide,
35261+        // the same height as the current block, immediately to the left of our
35262+        // current dest block and mono-pred.
35263+
35264+        qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
35265+        if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
35266+        {
35267+            const int bw = FFMIN(nPbW, 8);
35268+            qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; // unused second-half slot of the previous 8-wide command
35269+
35270+            last_y8_src2->x = x1_m3;
35271+            last_y8_src2->y = y1_m3;
35272+            last_y8_src2->base = src_vc_address_y;
35273+            last_y8_p->w += bw;
35274+            last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); // our fraction in the upper half, previous block's in the lower
35275+            last_y8_p->wo2 = wo;
35276+
35277+            jb->last_y8_p = NULL; // merged - can't merge into it again
35278+            jb->last_y8_l1 = NULL;
35279+            start_x = bw; // first bw pels of this block already covered by the merge
35280+#if RPI_TSTATS
35281+            ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
35282+#endif
35283+        }
35284+#endif
35285+
35286+        for (; start_x < nPbW; start_x += 16)
35287+        {
35288+            const int bw = FFMIN(nPbW - start_x, 16);
35289+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); // +7 load: filter rows overhead
35290+            qpu_mc_src_t *const src1 = yp->last_l0;
35291+            qpu_mc_src_t *const src2 = yp->last_l1;
35292+            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
35293+#if RPI_TSTATS
35294+            {
35295+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
35296+                if (mx == 0 && my == 0)
35297+                    ++ts->y_pred1_x0y0;
35298+                else if (mx == 0)
35299+                    ++ts->y_pred1_x0;
35300+                else if (my == 0)
35301+                    ++ts->y_pred1_y0;
35302+                else
35303+                    ++ts->y_pred1_xy;
35304+
35305+                if (nPbW > 8)
35306+                    ++ts->y_pred1_wgt8;
35307+                else
35308+                    ++ts->y_pred1_wle8;
35309+
35310+                if (nPbH > 16)
35311+                    ++ts->y_pred1_hgt16;
35312+                else
35313+                    ++ts->y_pred1_hle16;
35314+            }
35315+#endif
35316+            src1->x = x1_m3 + start_x; // first 8-wide half
35317+            src1->y = y1_m3;
35318+            src1->base = src_vc_address_y;
35319+            if (bw <= 8) // second half unused - point it at a dummy frame so the QPU reads something valid
35320+            {
35321+                src2->x = MC_DUMMY_X;
35322+                src2->y = MC_DUMMY_Y;
35323+#if RPI_QPU_EMU_Y
35324+                src2->base = s->qpu_dummy_frame_emu;
35325+#else
35326+                src2->base = s->qpu_dummy_frame_qpu;
35327+#endif
35328+            }
35329+            else
35330+            {
35331+                src2->x = x1_m3 + start_x + 8; // second 8-wide half
35332+                src2->y = y1_m3;
35333+                src2->base = src_vc_address_y;
35334+            }
35335+            cmd_y->w = bw;
35336+            cmd_y->h = bh;
35337+            cmd_y->mymx21 = my2_mx2_my_mx;
35338+            cmd_y->wo1 = wo;
35339+            cmd_y->wo2 = wo;
35340+            cmd_y->dst_addr =  dst_addr + (start_x << xshl);
35341+            yp->last_l0 = &cmd_y->next_src1;
35342+            yp->last_l1 = &cmd_y->next_src2;
35343+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
35344+
35345+            if (bw == 8) { // remember this half-filled command as a merge candidate for the next block
35346+                jb->last_y8_l1 = src2;
35347+                jb->last_y8_p = cmd_y;
35348+            }
35349+        }
35350+    }
35351+}
35352+
35353+static void // Queue QPU commands for bi-pred luma MC: one L0 and one L1 source blended with slice weights/offsets
35354+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
35355+           const int x0, const int y0,
35356+           const int nPbW, const int nPbH,
35357+           const struct HEVCRpiMvField *const mv_field,
35358+           const AVFrame *const src_frame,
35359+           const AVFrame *const src_frame2)
35360+{
35361+    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
35362+    const MvXY mv  = mv_field->xy[0]; // L0 motion vector
35363+    const MvXY mv2 = mv_field->xy[1]; // L1 motion vector
35364+
35365+    const unsigned int mx          = MV_X(mv) & 3; // quarter-pel fractions, L0
35366+    const unsigned int my          = MV_Y(mv) & 3;
35367+    const unsigned int my_mx = (my<<8) | mx;
35368+    const unsigned int mx2          = MV_X(mv2) & 3; // quarter-pel fractions, L1
35369+    const unsigned int my2          = MV_Y(mv2) & 3;
35370+    const unsigned int my2_mx2 = (my2<<8) | mx2;
35371+    const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; // L1 fraction in upper half, L0 in lower
35372+    const unsigned int ref_idx0 = mv_field->ref_idx[0];
35373+    const unsigned int ref_idx1 = mv_field->ref_idx[1];
35374+    const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
35375+    const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
35376+
35377+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
35378+    qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
35379+    const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
35380+    const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
35381+    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
35382+
35383+    if (my2_mx2_my_mx == 0) // both MVs integer-pel: filter-free b00 path
35384+    {
35385+        const int x1 = x0 + (MV_X(mv) >> 2);
35386+        const int y1 = y0 + (MV_Y(mv) >> 2);
35387+        const int x2 = x0 + (MV_X(mv2) >> 2);
35388+        const int y2 = y0 + (MV_Y(mv2) >> 2);
35389+        const int bh = nPbH;
35390+
35391+        // Can do chunks a full 16 wide if we don't want the H filter
35392+        for (int start_x=0; start_x < nPbW; start_x += 16)
35393+        {
35394+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
35395+            qpu_mc_src_t *const src1 = yp->last_l0;
35396+            qpu_mc_src_t *const src2 = yp->last_l1;
35397+            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
35398+#if RPI_TSTATS
35399+            {
35400+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
35401+                ++ts->y_pred2_x0y0;
35402+
35403+                if (nPbH > 16)
35404+                    ++ts->y_pred2_hgt16;
35405+                else
35406+                    ++ts->y_pred2_hle16;
35407+            }
35408+#endif
35409+            src1->x = x1 + start_x; // L0 source
35410+            src1->y = y1;
35411+            src1->base = src1_base;
35412+            src2->x = x2 + start_x; // L1 source
35413+            src2->y = y2;
35414+            src2->base = src2_base;
35415+            cmd_y->w = FFMIN(nPbW - start_x, 16);
35416+            cmd_y->h = bh;
35417+            cmd_y->mymx21 = 0;
35418+            cmd_y->wo1 = wo1;
35419+            cmd_y->wo2 = wo2;
35420+            cmd_y->dst_addr =  dst + (start_x << xshl);
35421+            yp->last_l0 = &cmd_y->next_src1;
35422+            yp->last_l1 = &cmd_y->next_src2;
35423+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
35424+        }
35425+    }
35426+    else
35427+    {
35428+        // Filter requires a run-up of 3
35429+        const int x1 = x0 + (MV_X(mv) >> 2) - 3;
35430+        const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
35431+        const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
35432+        const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
35433+        const int bh = nPbH;
35434+
35435+        for (int start_x=0; start_x < nPbW; start_x += 8)
35436+        { // B blocks work 8 at a time
35437+            // B weights aren't doubled as the QPU code does the same
35438+            // amount of work as it does for P
35439+            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
35440+            qpu_mc_src_t *const src1 = yp->last_l0;
35441+            qpu_mc_src_t *const src2 = yp->last_l1;
35442+            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
35443+#if RPI_TSTATS
35444+            {
35445+                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
35446+                const unsigned int mmx = mx | mx2;
35447+                const unsigned int mmy = my | my2;
35448+                if (mmx == 0 && mmy == 0)
35449+                    ++ts->y_pred2_x0y0;
35450+                else if (mmx == 0)
35451+                    ++ts->y_pred2_x0;
35452+                else if (mmy == 0)
35453+                    ++ts->y_pred2_y0;
35454+                else
35455+                    ++ts->y_pred2_xy;
35456+
35457+                if (nPbH > 16)
35458+                    ++ts->y_pred2_hgt16;
35459+                else
35460+                    ++ts->y_pred2_hle16;
35461+            }
35462+#endif
35463+            src1->x = x1 + start_x; // L0 source (with run-up)
35464+            src1->y = y1;
35465+            src1->base = src1_base;
35466+            src2->x = x2 + start_x; // L1 source (with run-up)
35467+            src2->y = y2;
35468+            src2->base = src2_base;
35469+            cmd_y->w = FFMIN(nPbW - start_x, 8);
35470+            cmd_y->h = bh;
35471+            cmd_y->mymx21 = my2_mx2_my_mx;
35472+            cmd_y->wo1 = wo1;
35473+            cmd_y->wo2 = wo2;
35474+            cmd_y->dst_addr =  dst + (start_x << xshl);
35475+            yp->last_l0 = &cmd_y->next_src1;
35476+            yp->last_l1 = &cmd_y->next_src2;
35477+            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
35478+        }
35479+    }
35480+}
35481+
35482+// h/v shifts fixed at one as that is all the qasm copes with
35483+static void // Queue QPU commands for uni-pred chroma MC (interleaved U/V) from list `lx`
35484+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
35485+  const unsigned int lx, const int x0_c, const int y0_c,
35486+  const int nPbW_c, const int nPbH_c,
35487+  const MvXY mv,
35488+  const int16_t * const c_weights,
35489+  const int16_t * const c_offsets,
35490+  AVFrame * const src_frame)
35491+{
35492+    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
35493+    const int hshift = 1; // = s->ps.sps->hshift[1];
35494+    const int vshift = 1; // = s->ps.sps->vshift[1];
35495+
35496+    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; // -1 for the bilinear-style filter run-up
35497+    const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; // NOTE(review): uses hshift for Y; looks like it should be vshift - harmless while both are fixed at 1, but confirm
35498+    const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
35499+    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
35500+    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
35501+    const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
35502+    const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
35503+    qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
35504+    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
35505+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; // +1: U/V interleaved, 2 bytes-per-sample step
35506+    const unsigned int bh = nPbH_c;
35507+    const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; // separate QPU code per list
35508+
35509+    for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
35510+    {
35511+        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); // +3 load: chroma filter overhead
35512+        qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
35513+        qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
35514+        qpu_mc_src_t * const last_lx = *plast_lx; // patch the src slot left by the previous command
35515+        const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
35516+
35517+        last_lx->x = x1_c + start_x;
35518+        last_lx->y = y1_c;
35519+        last_lx->base = src_base_u;
35520+        cmd_c->h = bh;
35521+        cmd_c->w = bw;
35522+        cmd_c->coeffs_x = x_coeffs;
35523+        cmd_c->coeffs_y = y_coeffs;
35524+        cmd_c->wo_u = wo_u;
35525+        cmd_c->wo_v = wo_v;
35526+        cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
35527+        *plast_lx = &cmd_c->next_src;
35528+        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
35529+    }
35530+    return;
35531+}
35532+
35533+// h/v shifts fixed at one as that is all the qasm copes with
35534+static void // Queue QPU commands for bi-pred chroma MC: L0 + L1 sources blended with per-plane weights/offsets
35535+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
35536+  const int x0_c, const int y0_c,
35537+  const int nPbW_c, const int nPbH_c,
35538+  const struct HEVCRpiMvField * const mv_field,
35539+  const int16_t * const c_weights,
35540+  const int16_t * const c_offsets,
35541+  const int16_t * const c_weights2,
35542+  const int16_t * const c_offsets2,
35543+  AVFrame * const src_frame,
35544+  AVFrame * const src_frame2)
35545+{
35546+    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
35547+    const int hshift = 1; // s->ps.sps->hshift[1];
35548+    const int vshift = 1; // s->ps.sps->vshift[1];
35549+    const MvXY mv = mv_field->xy[0]; // L0
35550+    const MvXY mv2 = mv_field->xy[1]; // L1
35551+
35552+    const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
35553+    const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
35554+    const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
35555+    const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
35556+    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
35557+    const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; // NOTE(review): hshift used for Y (likely meant vshift); moot while both are 1 - confirm
35558+
35559+    const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
35560+    const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
35561+    const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
35562+    const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
35563+
35564+    const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
35565+    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; // NOTE(review): same hshift/vshift question as above
35566+
35567+    const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); // combined offsets with the L1 weight (L0 weight passed separately below)
35568+    const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
35569+
35570+    const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
35571+    const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
35572+    const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
35573+    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
35574+    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; // +1 for interleaved U/V
35575+    const unsigned int bh = nPbH_c;
35576+
35577+    for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
35578+    {
35579+        const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
35580+
35581+        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); // bi-pred: roughly twice the uni-pred load
35582+        qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
35583+        qpu_mc_src_t * const src_l0 = cp->last_l0;
35584+        qpu_mc_src_t * const src_l1 = cp->last_l1;
35585+
35586+        src_l0->x = x1_c + start_x;
35587+        src_l0->y = y1_c;
35588+        src_l0->base = src1_base;
35589+        src_l1->x = x2_c + start_x;
35590+        src_l1->y = y2_c;
35591+        src_l1->base = src2_base;
35592+
35593+        u[0].h = bh;
35594+        u[0].w = bw;
35595+        u[0].coeffs_x1 = coefs0_x;
35596+        u[0].coeffs_y1 = coefs0_y;
35597+        u[0].weight_u1 = c_weights[0]; // Weight L0 U
35598+        u[0].weight_v1 = c_weights[1]; // Weight L0 V
35599+        u[0].coeffs_x2 = coefs1_x;
35600+        u[0].coeffs_y2 = coefs1_y;
35601+        u[0].wo_u2 = wo_u2;
35602+        u[0].wo_v2 = wo_v2;
35603+        u[0].dst_addr_c = dst_base_u + (start_x << xshl);
35604+
35605+        cp->last_l0 = &u[0].next_src1;
35606+        cp->last_l1 = &u[0].next_src2;
35607+        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
35608+    }
35609+}
35610+
35611+
35612+static inline void // Store this PU's motion field into the collocated-MV buffer at 16x16 granularity (presumably for temporal MV prediction - confirm against reader)
35613+col_stash(const HEVCRpiContext * const s,
35614+          const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
35615+          const HEVCRpiMvField * const mvf)
35616+{
35617+    ColMvField * const col_mvf = s->ref->col_mvf;
35618+    const unsigned int x = (x0 + 15) >> 4; // first 16x16 cell whose top-left lies inside the PU
35619+    const unsigned int y = (y0 + 15) >> 4;
35620+    const unsigned int w = ((x0 + 15 + w0) >> 4) - x; // number of such cells covered horizontally
35621+    const unsigned int h = ((y0 + 15 + h0) >> 4) - y;
35622+
35623+    if (col_mvf != NULL && w != 0 && h != 0) // skip when no cell top-left falls in this PU
35624+    {
35625+        // Only record MV from the top left of the 16x16 block
35626+
35627+        const RefPicList * const rpl = s->refPicList;
35628+        const ColMvField cmv = {
35629+            .L = {
35630+                {
35631+                    .poc = (mvf->pred_flag & PF_L0) == 0 ?
35632+                            COL_POC_INTRA :
35633+                            COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
35634+                    .xy = mvf->xy[0]
35635+                },
35636+                {
35637+                    .poc = (mvf->pred_flag & PF_L1) == 0 ?
35638+                            COL_POC_INTRA :
35639+                            COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
35640+                    .xy = mvf->xy[1]
35641+                }
35642+            }
35643+        };
35644+
35645+        ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
35646+        const unsigned int stride = s->col_mvf_stride - w; // step from end of one row of cells to start of the next
35647+        unsigned int j = h;
35648+
35649+        do // fill the w x h rectangle of cells with the same motion record
35650+        {
35651+            unsigned int k = w;
35652+            do
35653+            {
35654+                *p++ = cmv;
35655+            } while (--k != 0);
35656+            p += stride;
35657+        } while (--j != 0);
35658+    }
35659+}
35660+
35661+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Parse one PU's motion data (merge/skip or AMVP), stash it, and queue luma+chroma MC commands
35662+                                const unsigned int x0, const unsigned int y0,
35663+                                const unsigned int nPbW, const unsigned int nPbH,
35664+                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
35665+{
35666+    HEVCRpiJob * const jb = lc->jb0;
35667+
35668+    struct HEVCRpiMvField current_mv = {{0}};
35669+    const RefPicList  *const refPicList = s->refPicList;
35670+    const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
35671+
35672+    if (lc->cu.pred_mode != MODE_SKIP) // skip CUs imply merge without a flag
35673+        lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
35674+
35675+    if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) {
35676+        const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 0 : // idx not coded when only one candidate
35677+            ff_hevc_rpi_merge_idx_decode(s, lc);
35678+
35679+        ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
35680+                                   partIdx, merge_idx, &current_mv);
35681+    } else {
35682+        hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv); // explicit AMVP path
35683+    }
35684+
35685+    { // replicate the motion field over every min-PU cell covered by this PU
35686+        HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
35687+        unsigned int i, j;
35688+
35689+        for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
35690+        {
35691+            for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
35692+                p[i] = current_mv;
35693+            p += MVF_STASH_WIDTH_PU;
35694+        }
35695+    }
35696+
35697+    col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
35698+
35699+    if (current_mv.pred_flag & PF_L0) {
35700+        ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
35701+        if (!ref0) // missing reference - nothing we can predict from
35702+            return;
35703+        hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); // record rows of ref0 this job depends on
35704+    }
35705+    if (current_mv.pred_flag & PF_L1) {
35706+        ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
35707+        if (!ref1)
35708+            return;
35709+        hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
35710+    }
35711+
35712+    if (current_mv.pred_flag == PF_L0) { // uni-pred from list 0
35713+        const int x0_c = x0 >> ctx_hshift(s, 1);
35714+        const int y0_c = y0 >> ctx_vshift(s, 1);
35715+        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
35716+        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
35717+
35718+        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
35719+          s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
35720+          ref0->frame);
35721+
35722+        if (ctx_cfmt(s) != 0) { // skip chroma for monochrome
35723+            rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
35724+              s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
35725+              ref0->frame);
35726+            return;
35727+        }
35728+    } else if (current_mv.pred_flag == PF_L1) { // uni-pred from list 1
35729+        const int x0_c = x0 >> ctx_hshift(s, 1);
35730+        const int y0_c = y0 >> ctx_vshift(s, 1);
35731+        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
35732+        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
35733+
35734+        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
35735+          s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
35736+          ref1->frame);
35737+
35738+        if (ctx_cfmt(s) != 0) {
35739+            rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
35740+              s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
35741+              ref1->frame);
35742+            return;
35743+        }
35744+    } else if (current_mv.pred_flag == PF_BI) { // bi-pred: both refs checked non-NULL above
35745+        const int x0_c = x0 >> ctx_hshift(s, 1);
35746+        const int y0_c = y0 >> ctx_vshift(s, 1);
35747+        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
35748+        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
35749+
35750+        rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
35751+
35752+        if (ctx_cfmt(s) != 0) {
35753+          rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
35754+                       &current_mv,
35755+                       s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
35756+                       s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
35757+                       s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
35758+                       s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
35759+                       ref0->frame,
35760+                       ref1->frame);
35761+            return;
35762+        }
35763+    }
35764+}
35765+
35766+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
35767+                    const unsigned int x0, const unsigned int y0,
35768+                    const unsigned int log2_cb_size,
35769+                    const unsigned int ipm)
35770+{
35771+    const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
35772+    const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
35773+
35774+    {
35775+        const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));
35776+        set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
35777+    }
35778+
35779+    // If IRAP then everything is Intra & we avoid ever looking at these
35780+    // stashes so don't bother setting them
35781+    if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA)
35782+    {
35783+        if (s->is_intra != NULL)
35784+        {
35785+            set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
35786+        }
35787+
35788+        {
35789+            HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
35790+            const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
35791+            unsigned int n = size_in_pus;
35792+
35793+            do
35794+            {
35795+                memset(p, 0, size_in_pus * sizeof(*p));
35796+                p += MVF_STASH_WIDTH_PU;
35797+            } while (--n != 0);
35798+        }
35799+
35800+
35801+        if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
35802+        {
35803+            // Only record top left stuff
35804+            // Blocks should always be aligned on size boundaries
35805+            // so cannot have overflow from a small block
35806+
35807+            ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
35808+            const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
35809+            const unsigned int stride = s->col_mvf_stride - size_in_col;
35810+            unsigned int j = size_in_col;
35811+
35812+            do
35813+            {
35814+                unsigned int k = size_in_col;
35815+                do
35816+                {
35817+                    p->L[0].poc = COL_POC_INTRA;
35818+                    p->L[0].xy = 0;
35819+                    p->L[1].poc = COL_POC_INTRA;
35820+                    p->L[1].xy = 0;
35821+                    ++p;
35822+                } while (--k != 0);
35823+                p += stride;
35824+            } while (--j != 0);
35825+        }
35826+    }
35827+}
35828+
35829+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
35830+                                                const unsigned int x0, const unsigned int y0,
35831+                                                const unsigned int log2_cb_size)
35832+{
35833+    set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC);
35834+}
35835+
35836+
35837+/**
35838+ * 8.4.1
35839+ */
35840+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
35841+                                int x0, int y0, int log2_pu_size,
35842+                                int prev_intra_luma_pred_flag,
35843+                                const unsigned int idx)
35844+{
35845+    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
35846+    const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
35847+    const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
35848+
35849+    // Up does not cross boundaries so as we always scan 1 slice-tile-line in an
35850+    // lc we can just keep 1 CTB lR stashes
35851+    // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
35852+    const unsigned int cand_up   = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
35853+    const unsigned int cand_left = lc->ipm_left[yb_pu];
35854+
35855+    unsigned int intra_pred_mode;
35856+    unsigned int a, b, c;
35857+
35858+    if (cand_left == cand_up) {
35859+        if (cand_left < 2) {
35860+            a = INTRA_PLANAR;
35861+            b = INTRA_DC;
35862+            c = INTRA_ANGULAR_26;
35863+        } else {
35864+            a = cand_left;
35865+            b = 2 + ((cand_left - 2 - 1 + 32) & 31);
35866+            c = 2 + ((cand_left - 2 + 1) & 31);
35867+        }
35868+    } else {
35869+        a = cand_left;
35870+        b = cand_up;
35871+        c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ?
35872+                INTRA_PLANAR :
35873+            (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
35874+                INTRA_DC :
35875+                INTRA_ANGULAR_26;
35876+    }
35877+
35878+    if (prev_intra_luma_pred_flag) {
35879+        intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c;
35880+    } else {
35881+        // Sort lowest 1st
35882+        if (a > b)
35883+            FFSWAP(int, a, b);
35884+        if (a > c)
35885+            FFSWAP(int, a, c);
35886+        if (b > c)
35887+            FFSWAP(int, b, c);
35888+
35889+        intra_pred_mode = idx;
35890+        if (intra_pred_mode >= a)
35891+            intra_pred_mode++;
35892+        if (intra_pred_mode >= b)
35893+            intra_pred_mode++;
35894+        if (intra_pred_mode >= c)
35895+            intra_pred_mode++;
35896+    }
35897+
35898+    /* write the intra prediction units into the mv array */
35899+    set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode);
35900+    return intra_pred_mode;
35901+}
35902+
35903+static const uint8_t tab_mode_idx[] = {
35904+     0,  1,  2,  2,  2,  2,  3,  5,  7,  8, 10, 12, 13, 15, 17, 18, 19, 20,
35905+    21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
35906+
35907+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
35908+                                  const unsigned int x0, const unsigned int y0,
35909+                                  const unsigned int log2_cb_size)
35910+{
35911+    static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
35912+    uint8_t prev_intra_luma_pred_flag[4];
35913+    int split   = lc->cu.part_mode == PART_NxN;
35914+    const unsigned int split_size = (1 << (log2_cb_size - 1));
35915+    int chroma_mode;
35916+    const unsigned int n = split ? 4 : 1;
35917+    unsigned int i;
35918+
35919+    for (i = 0; i != n; i++)
35920+        prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
35921+
35922+    for (i = 0; i < n; i++) {
35923+        // depending on mode idx is mpm or luma_pred_mode
35924+        const unsigned int idx = prev_intra_luma_pred_flag[i] ?
35925+            ff_hevc_rpi_mpm_idx_decode(lc) :
35926+            ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
35927+
35928+        lc->pu.intra_pred_mode[i] =
35929+            luma_intra_pred_mode(s, lc,
35930+                                 x0 + ((i & 1) == 0 ? 0 : split_size),
35931+                                 y0 + ((i & 2) == 0 ? 0 : split_size),
35932+                                 log2_cb_size - split,
35933+                                 prev_intra_luma_pred_flag[i], idx);
35934+    }
35935+
35936+    if (ctx_cfmt(s) == 3) {
35937+        for (i = 0; i < n; i++) {
35938+            lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
35939+            if (chroma_mode != 4) {
35940+                if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode])
35941+                    lc->pu.intra_pred_mode_c[i] = 34;
35942+                else
35943+                    lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode];
35944+            } else {
35945+                lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i];
35946+            }
35947+        }
35948+    } else if (ctx_cfmt(s) == 2) {
35949+        int mode_idx;
35950+        lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
35951+        if (chroma_mode != 4) {
35952+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
35953+                mode_idx = 34;
35954+            else
35955+                mode_idx = intra_chroma_table[chroma_mode];
35956+        } else {
35957+            mode_idx = lc->pu.intra_pred_mode[0];
35958+        }
35959+        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
35960+    } else if (ctx_cfmt(s) != 0) {
35961+        chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
35962+        if (chroma_mode != 4) {
35963+            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
35964+                lc->pu.intra_pred_mode_c[0] = 34;
35965+            else
35966+                lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
35967+        } else {
35968+            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
35969+        }
35970+    }
35971+}
35972+
35973+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
35974+                           const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
35975+{
35976+    const unsigned int cb_size          = 1 << log2_cb_size;
35977+    const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
35978+    const unsigned int min_cb_width     = s->ps.sps->min_cb_width;
35979+    const unsigned int x_cb             = x0 >> log2_min_cb_size;
35980+    const unsigned int y_cb             = y0 >> log2_min_cb_size;
35981+    const unsigned int idx              = log2_cb_size - 2;
35982+    const unsigned int qp_block_mask    = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
35983+    int skip_flag = 0;
35984+
35985+    lc->cu.x                = x0;
35986+    lc->cu.y                = y0;
35987+    lc->cu.x_split          = x0;
35988+    lc->cu.y_split          = y0;
35989+
35990+    lc->cu.pred_mode        = MODE_INTRA;
35991+    lc->cu.part_mode        = PART_2Nx2N;
35992+    lc->cu.intra_split_flag = 0;
35993+    lc->cu.cu_transquant_bypass_flag = 0;
35994+    lc->pu.intra_pred_mode[0] = 1;
35995+    lc->pu.intra_pred_mode[1] = 1;
35996+    lc->pu.intra_pred_mode[2] = 1;
35997+    lc->pu.intra_pred_mode[3] = 1;
35998+
35999+    if (s->ps.pps->transquant_bypass_enable_flag) {
36000+        lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
36001+        if (lc->cu.cu_transquant_bypass_flag)
36002+            set_deblocking_bypass(s, x0, y0, log2_cb_size);
36003+    }
36004+
36005+    if (s->sh.slice_type != HEVC_SLICE_I) {
36006+        lc->cu.pred_mode = MODE_INTER;
36007+        skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
36008+    }
36009+
36010+    if (skip_flag) {
36011+        lc->cu.pred_mode = MODE_SKIP;
36012+
36013+        hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
36014+        intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
36015+
36016+        if (!s->sh.disable_deblocking_filter_flag)
36017+            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
36018+    } else {
36019+        int pcm_flag = 0;
36020+
36021+        if (s->sh.slice_type != HEVC_SLICE_I)
36022+            lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
36023+        if (lc->cu.pred_mode != MODE_INTRA ||
36024+            log2_cb_size == s->ps.sps->log2_min_cb_size) {
36025+            lc->cu.part_mode        = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
36026+            lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
36027+                                      lc->cu.pred_mode == MODE_INTRA;
36028+        }
36029+
36030+        if (lc->cu.pred_mode == MODE_INTRA) {
36031+            if (lc->cu.part_mode == PART_2Nx2N &&
36032+                log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size &&  // 0 if not enabled
36033+                log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
36034+                ff_hevc_rpi_pcm_flag_decode(lc) != 0)
36035+            {
36036+                int ret;
36037+                pcm_flag = 1;
36038+                intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
36039+                if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
36040+                    return ret;
36041+
36042+                if (s->ps.sps->pcm.loop_filter_disable_flag)
36043+                    set_deblocking_bypass(s, x0, y0, log2_cb_size);
36044+            } else {
36045+                intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
36046+            }
36047+        } else {
36048+            intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
36049+            switch (lc->cu.part_mode) {
36050+            case PART_2Nx2N:
36051+                hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
36052+                break;
36053+            case PART_2NxN:
36054+                hls_prediction_unit(s, lc, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0, idx);
36055+                lc->cu.y_split = y0 + cb_size / 2;
36056+                hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
36057+                break;
36058+            case PART_Nx2N:
36059+                hls_prediction_unit(s, lc, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
36060+                lc->cu.x_split = x0 + cb_size / 2;
36061+                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
36062+                break;
36063+            case PART_2NxnU:
36064+                hls_prediction_unit(s, lc, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0, idx);
36065+                lc->cu.y_split = y0 + cb_size / 4;
36066+                hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
36067+                break;
36068+            case PART_2NxnD:
36069+                hls_prediction_unit(s, lc, x0, y0,                   cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
36070+                lc->cu.y_split = y0 + cb_size / 4 * 3;
36071+                hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size     / 4, log2_cb_size, 1, idx);
36072+                break;
36073+            case PART_nLx2N:
36074+                hls_prediction_unit(s, lc, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0, idx - 2);
36075+                lc->cu.x_split = x0 + cb_size / 4;
36076+                hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
36077+                break;
36078+            case PART_nRx2N:
36079+                hls_prediction_unit(s, lc, x0,                   y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
36080+                lc->cu.x_split = x0 + cb_size / 4 * 3;
36081+                hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size     / 4, cb_size, log2_cb_size, 1, idx - 2);
36082+                break;
36083+            case PART_NxN:
36084+                hls_prediction_unit(s, lc, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
36085+                lc->cu.x_split = x0 + cb_size / 2;
36086+                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
36087+                lc->cu.y_split = y0 + cb_size / 2;
36088+                hls_prediction_unit(s, lc, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
36089+                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
36090+                break;
36091+            }
36092+        }
36093+
36094+        if (!pcm_flag) {
36095+            int rqt_root_cbf = 1;
36096+
36097+            if (lc->cu.pred_mode != MODE_INTRA &&
36098+                !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
36099+                rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
36100+            }
36101+            if (rqt_root_cbf) {
36102+                const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
36103+                int ret;
36104+
36105+                lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
36106+                                         s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
36107+                                         s->ps.sps->max_transform_hierarchy_depth_inter;
36108+                // transform_tree does deblock_boundary_strengths
36109+                ret = hls_transform_tree(s, lc, x0, y0,
36110+                                         log2_cb_size, 0, 0, cbf_c);
36111+                if (ret < 0)
36112+                    return ret;
36113+            } else {
36114+                if (!s->sh.disable_deblocking_filter_flag)
36115+                    ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
36116+            }
36117+        }
36118+    }
36119+
36120+    // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
36121+    if (lc->tu.is_cu_qp_delta_wanted)
36122+        ff_hevc_rpi_set_qPy(s, lc, x0, y0);
36123+
36124+    if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
36125+       ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
36126+        lc->qPy_pred = lc->qp_y;
36127+    }
36128+
36129+    set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
36130+
36131+    set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
36132+
36133+    return 0;
36134+}
36135+
36136+// Returns:
36137+//  < 0  Error
36138+//  0    More data wanted
36139+//  1    EoSlice / EoPicture
36140+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
36141+                               const int log2_cb_size, const unsigned int cb_depth)
36142+{
36143+    const int cb_size    = 1 << log2_cb_size;
36144+    int ret;
36145+    int split_cu;
36146+
36147+    lc->ct_depth = cb_depth;
36148+    split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
36149+    if (x0 + cb_size <= s->ps.sps->width  &&
36150+        y0 + cb_size <= s->ps.sps->height &&
36151+        split_cu)
36152+    {
36153+        split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
36154+    }
36155+
36156+    // Qp delta (and offset) need to remain wanted if cb_size < min until
36157+    // a coded block is found so we still set initial state at depth 0 (outside
36158+    // this fn) and only reset here
36159+    if (s->ps.pps->cu_qp_delta_enabled_flag &&
36160+        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
36161+    {
36162+        lc->tu.is_cu_qp_delta_wanted = 1;
36163+        lc->tu.cu_qp_delta          = 0;
36164+    }
36165+    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
36166+        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
36167+    {
36168+        lc->tu.cu_chroma_qp_offset_wanted = 1;
36169+    }
36170+
36171+    lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
36172+    lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
36173+    lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
36174+
36175+    if (split_cu) {
36176+        int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
36177+        const int cb_size_split = cb_size >> 1;
36178+        const int x1 = x0 + cb_size_split;
36179+        const int y1 = y0 + cb_size_split;
36180+
36181+        int more_data = 0;
36182+
36183+        more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
36184+        if (more_data < 0)
36185+            return more_data;
36186+
36187+        if (more_data && x1 < s->ps.sps->width) {
36188+            more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
36189+            if (more_data < 0)
36190+                return more_data;
36191+        }
36192+        if (more_data && y1 < s->ps.sps->height) {
36193+            more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
36194+            if (more_data < 0)
36195+                return more_data;
36196+        }
36197+        if (more_data && x1 < s->ps.sps->width &&
36198+            y1 < s->ps.sps->height) {
36199+            more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
36200+            if (more_data < 0)
36201+                return more_data;
36202+        }
36203+
36204+        if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
36205+            ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
36206+            lc->qPy_pred = lc->qp_y;
36207+
36208+        if (more_data)
36209+            return ((x1 + cb_size_split) < s->ps.sps->width ||
36210+                    (y1 + cb_size_split) < s->ps.sps->height);
36211+        else
36212+            return 0;
36213+    } else {
36214+        ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
36215+        if (ret < 0)
36216+            return ret;
36217+        if ((!((x0 + cb_size) %
36218+               (1 << (s->ps.sps->log2_ctb_size))) ||
36219+             (x0 + cb_size >= s->ps.sps->width)) &&
36220+            (!((y0 + cb_size) %
36221+               (1 << (s->ps.sps->log2_ctb_size))) ||
36222+             (y0 + cb_size >= s->ps.sps->height))) {
36223+            int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
36224+            return !end_of_slice_flag;
36225+        } else {
36226+            return 1;
36227+        }
36228+    }
36229+
36230+    return 0;  // NEVER
36231+}
36232+
36233+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
36234+                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
36235+{
36236+    const unsigned int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
36237+    const unsigned int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
36238+    const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr;  // slice_addr = RS addr of start of slice
36239+    const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
36240+    const unsigned int line_w = s->ps.sps->ctb_width;
36241+
36242+    s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
36243+
36244+    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
36245+    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
36246+
36247+    lc->boundary_flags = 0;
36248+
36249+    if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
36250+        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
36251+    if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
36252+        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
36253+    if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
36254+        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
36255+    if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
36256+        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
36257+
36258+    // Use line width rather than tile width for addr_in_slice test as
36259+    // addr_in_slice is in raster units
36260+
36261+    lc->ctb_avail =
36262+        ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
36263+        ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
36264+        ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
36265+            (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |
36266+        ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
36267+            (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);
36268+    // Down-left never avail at CTB level
36269+}
36270+
36271+
36272+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
36273+{
36274+    int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds,
36275+        (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);
36276+
36277+    // Signal
36278+    if (y > 0) {
36279+        // Cast away const as progress is held in s, but this really shouldn't confuse anything
36280+        ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
36281+    }
36282+
36283+    // Job done now
36284+    // ? Move outside this fn
36285+    job_free(s->jbc, jb);
36286+}
36287+
36288+// I-pred, transform_and_add for all blocks types done here
36289+// All ARM
36290+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
36291+{
36292+    unsigned int i;
36293+    HEVCRpiIntraPredEnv * const iap = &jb->intra;
36294+    const HEVCPredCmd *cmd = iap->cmds;
36295+
36296+#if !RPI_WORKER_WAIT_PASS_0
36297+    rpi_sem_wait(&jb->sem);
36298+    rpi_cache_flush_execute(jb->rfe);  // Invalidate data set up in pass1
36299+#endif
36300+
36301+    for (i = iap->n; i > 0; i--, cmd++)
36302+    {
36303+        switch (cmd->type)
36304+        {
36305+            case RPI_PRED_INTRA:
36306+                s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
36307+                break;
36308+            case RPI_PRED_INTRA_C:
36309+                s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
36310+                break;
36311+            case RPI_PRED_ADD_RESIDUAL:
36312+                s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
36313+                break;
36314+            case RPI_PRED_ADD_DC:
36315+                s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
36316+                break;
36317+            case RPI_PRED_ADD_RESIDUAL_U:
36318+                s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
36319+                break;
36320+            case RPI_PRED_ADD_RESIDUAL_V:
36321+                s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
36322+                break;
36323+            case RPI_PRED_ADD_RESIDUAL_C:
36324+                s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
36325+                break;
36326+            case RPI_PRED_ADD_DC_U:
36327+            case RPI_PRED_ADD_DC_V:
36328+                s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
36329+                break;
36330+
36331+            case RPI_PRED_I_PCM:
36332+                pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
36333+                break;
36334+
36335+            default:
36336+                av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
36337+                abort();
36338+        }
36339+    }
36340+
36341+    // Mark done
36342+    iap->n = 0;
36343+}
36344+
36345+
36346+// Set initial uniform job values & zero ctu_count
36347+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
36348+{
36349+    unsigned int i;
36350+    HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
36351+    HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
36352+    const HEVCRpiSPS * const sps = s->ps.sps;
36353+
36354+    const uint16_t pic_width_y   = sps->width;
36355+    const uint16_t pic_height_y  = sps->height;
36356+
36357+    const uint16_t pic_width_c   = sps->width >> ctx_hshift(s, 1);
36358+    const uint16_t pic_height_c  = sps->height >> ctx_vshift(s, 1);
36359+
36360+    // We expect the pointer to change if we use another sps
36361+    if (sps != jb->sps)
36362+    {
36363+        worker_pic_free_one(jb);
36364+
36365+        set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
36366+        set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
36367+
36368+        {
36369+            const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
36370+            const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
36371+            worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
36372+        }
36373+
36374+        jb->sps = sps;
36375+    }
36376+
36377+    jb->waited = 0;
36378+    jb->ctu_ts_first = ctu_ts_first;
36379+    jb->ctu_ts_last = -1;
36380+
36381+    rpi_inter_pred_reset(cipe);
36382+    for (i = 0; i < cipe->n; i++) {
36383+        HEVCRpiInterPredQ * const cp = cipe->q + i;
36384+        qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
36385+
36386+        u->next_src1.x = 0;
36387+        u->next_src1.y = 0;
36388+        u->next_src1.base = 0;
36389+        u->pic_cw = pic_width_c;
36390+        u->pic_ch = pic_height_c;
36391+        u->stride2 = av_rpi_sand_frame_stride2(s->frame);
36392+        u->stride1 = av_rpi_sand_frame_stride1(s->frame);
36393+        cp->last_l0 = &u->next_src1;
36394+
36395+        u->next_fn = 0;
36396+        u->next_src2.x = 0;
36397+        u->next_src2.y = 0;
36398+        u->next_src2.base = 0;
36399+        cp->last_l1 = &u->next_src2;
36400+
36401+        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
36402+    }
36403+
36404+    rpi_inter_pred_reset(yipe);
36405+    for (i = 0; i < yipe->n; i++) {
36406+        HEVCRpiInterPredQ * const yp = yipe->q + i;
36407+        qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
36408+
36409+        y->next_src1.x = 0;
36410+        y->next_src1.y = 0;
36411+        y->next_src1.base = 0;
36412+        y->next_src2.x = 0;
36413+        y->next_src2.y = 0;
36414+        y->next_src2.base = 0;
36415+        y->pic_h = pic_height_y;
36416+        y->pic_w = pic_width_y;
36417+        y->stride2 = av_rpi_sand_frame_stride2(s->frame);
36418+        y->stride1 = av_rpi_sand_frame_stride1(s->frame);
36419+        y->next_fn = 0;
36420+        yp->last_l0 = &y->next_src1;
36421+        yp->last_l1 = &y->next_src2;
36422+
36423+        yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
36424+    }
36425+
36426+    jb->last_y8_p = NULL;
36427+    jb->last_y8_l1 = NULL;
36428+
36429+    for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
36430+        jb->progress_req[i] = -1;
36431+    }
36432+
36433+    worker_pic_reset(&jb->coeffs);
36434+}
36435+
36436+
36437+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
36438+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
36439+                                     const vpu_qpu_job_h vqj,
36440+                                     rpi_cache_flush_env_t * const rfe,
36441+                                     HEVCRpiInterPredEnv * const ipe)
36442+{
36443+    unsigned int i;
36444+    uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
36445+    unsigned int max_block = 0;
36446+
36447+    if (!ipe->used) {
36448+        return 0;
36449+    }
36450+
36451+    if (ipe->curr != 0) {
36452+        rpi_inter_pred_sync(ipe);
36453+    }
36454+
36455+    // Add final commands to Q
36456+    for(i = 0; i != ipe->n; ++i) {
36457+        HEVCRpiInterPredQ * const yp = ipe->q + i;
36458+        qpu_mc_src_t *const p0 = yp->last_l0;
36459+        qpu_mc_src_t *const p1 = yp->last_l1;
36460+        const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
36461+
36462+        if (block_size > max_block)
36463+            max_block = block_size;
36464+
36465+        qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit);
36466+
36467+        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
36468+        p0->x = MC_DUMMY_X;
36469+        p0->y = MC_DUMMY_Y;
36470+        p0->base = s->qpu_dummy_frame_qpu;
36471+        p1->x = MC_DUMMY_X;
36472+        p1->y = MC_DUMMY_Y;
36473+        p1->base = s->qpu_dummy_frame_qpu;
36474+
36475+        yp->last_l0 = NULL;
36476+        yp->last_l1 = NULL;
36477+
36478+        // Add to mailbox list
36479+        mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
36480+        mail[i][1] = yp->code_setup;
36481+    }
36482+
36483+    // We don't need invalidate here as the uniforms aren't changed by the QPU
36484+    // and leaving them in ARM cache avoids (pointless) pre-reads when writing
36485+    // new values which seems to give us a small performance advantage
36486+    //
36487+    // In most cases we will not have a completely packed set of uniforms and as
36488+    // we have a 2d invalidate we writeback all uniform Qs to the depth of the
36489+    // fullest
36490+    rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
36491+                                  (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
36492+                                  ipe->n, ipe->max_fill + ipe->min_gap);
36493+    vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
36494+
36495+    return 1;
36496+}
36497+#endif
36498+
36499+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
36500+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
36501+                                     const vpu_qpu_job_h vqj,
36502+                                     rpi_cache_flush_env_t * const rfe,
36503+                                     HEVCRpiInterPredEnv * const ipe)
36504+{
36505+    unsigned int i;
36506+    if (!ipe->used) {
36507+        return 0;
36508+    }
36509+
36510+    if (ipe->curr != 0) {
36511+        rpi_inter_pred_sync(ipe);
36512+    }
36513+
36514+    // Add final commands to Q
36515+    for(i = 0; i != ipe->n; ++i) {
36516+        HEVCRpiInterPredQ * const yp = ipe->q + i;
36517+        qpu_mc_src_t *const p0 = yp->last_l0;
36518+        qpu_mc_src_t *const p1 = yp->last_l1;
36519+
36520+        yp->qpu_mc_curr->data[-1] = yp->code_exit;
36521+
36522+        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
36523+        p0->x = MC_DUMMY_X;
36524+        p0->y = MC_DUMMY_Y;
36525+        p0->base = s->qpu_dummy_frame_emu;
36526+        p1->x = MC_DUMMY_X;
36527+        p1->y = MC_DUMMY_Y;
36528+        p1->base = s->qpu_dummy_frame_emu;
36529+
36530+        yp->last_l0 = NULL;
36531+        yp->last_l1 = NULL;
36532+    }
36533+
36534+    return 1;
36535+}
36536+#endif
36537+
36538+
36539+#if RPI_QPU_EMU_Y
36540+#define mc_terminate_add_y mc_terminate_add_emu
36541+#else
36542+#define mc_terminate_add_y mc_terminate_add_qpu
36543+#endif
36544+#if RPI_QPU_EMU_C
36545+#define mc_terminate_add_c mc_terminate_add_emu
36546+#else
36547+#define mc_terminate_add_c mc_terminate_add_qpu
36548+#endif
36549+
36550+
36551+static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
36552+{
36553+    rpi_cache_buf_t cbuf;
36554+    rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
36555+    rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
36556+    rpi_cache_flush_finish(rfe);
36557+}
36558+
36559+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
36560+{
36561+    const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
36562+    const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
36563+    const unsigned int ctb_width = s->ps.sps->ctb_width;
36564+    RpiBlk *const bounds = &jb->bounds;
36565+    av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
36566+    bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size;
36567+    bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size;
36568+    bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size;
36569+    bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size;
36570+
36571+    bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x);
36572+    bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y);
36573+}
36574+
36575+#if RPI_PASSES == 2
36576+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
36577+{
36578+    // Perform intra prediction and residual reconstruction
36579+    rpi_execute_pred_cmds(s, jb);
36580+
36581+    // Perform deblocking for CTBs in this row
36582+    rpi_execute_dblk_cmds(s, jb);
36583+}
36584+#endif
36585+
36586+// Core execution tasks
36587+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
36588+{
36589+    int pred_y, pred_c;
36590+    vpu_qpu_job_env_t qvbuf;
36591+    const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
36592+#if RPI_WORKER_WAIT_PASS_0
36593+    int do_wait;
36594+#endif
36595+
36596+    {
36597+        const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
36598+        if (cf->s[3].n + cf->s[2].n != 0)
36599+        {
36600+            const unsigned int csize = sizeof(cf->s[3].buf[0]);
36601+            const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
36602+            unsigned int n16 = (cf->s[2].n >> 8);
36603+            unsigned int n32 = (cf->s[3].n >> 10);
36604+#if RPI_COMPRESS_COEFFS
36605+            if (cf->s[2].packed) {
36606+                n16 = n16 | (n16<<16);
36607+            } else {
36608+                const unsigned int npack16 = (cf->s[2].packed_n>>8);
36609+                n16 = n16 | (npack16<<16);
36610+            }
36611+            if (cf->s[3].packed) {
36612+                n32 = n32 | (n32<<16);
36613+            } else {
36614+                const unsigned int npack32 = (cf->s[3].packed_n>>10);
36615+                n32 = n32 | (npack32<<16);
36616+            }
36617+#endif
36618+            vpu_qpu_job_add_vpu(vqj,
36619+                vpu_get_fn(s->ps.sps->bit_depth),
36620+                vpu_get_constants(),
36621+                cf->gptr.vc,
36622+                n16,
36623+                cf->gptr.vc + offset32,
36624+                n32,
36625+                0);
36626+
36627+            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
36628+            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
36629+        }
36630+    }
36631+
36632+    pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
36633+
36634+// We could take a sync here and try to locally overlap QPU processing with ARM
36635+// but testing showed a slightly negative benefit with noticable extra complexity
36636+
36637+    pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
36638+
36639+    // Returns 0 if nothing to do, 1 if sync added
36640+#if RPI_WORKER_WAIT_PASS_0
36641+    do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
36642+#else
36643+    if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
36644+        sem_post(&jb->sem);
36645+#endif
36646+
36647+    rpi_cache_flush_execute(jb->rfe);
36648+
36649+    // Await progress as required
36650+    // jb->waited will only be clear if we have already tested the progress values
36651+    // (in worker_submit_job) and found we don't have to wait
36652+    if (jb->waited)
36653+    {
36654+        unsigned int i;
36655+        for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
36656+            if (jb->progress_req[i] >= 0) {
36657+                ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
36658+            }
36659+        }
36660+    }
36661+
36662+    vpu_qpu_job_finish(vqj);
36663+
36664+    // We always work on a rectangular block
36665+    if (pred_y || pred_c)
36666+    {
36667+        rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
36668+                                        jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
36669+                                        ctx_vshift(s, 1), pred_y, pred_c);
36670+    }
36671+
36672+    // If we have emulated VPU ops - do it here
36673+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
36674+    if (av_rpi_is_sand8_frame(s->frame))
36675+    {
36676+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
36677+        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
36678+#elif RPI_QPU_EMU_Y
36679+        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
36680+#else
36681+        ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
36682+#endif
36683+    }
36684+    else
36685+    {
36686+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
36687+        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
36688+#elif RPI_QPU_EMU_Y
36689+        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
36690+#else
36691+        ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
36692+#endif
36693+    }
36694+#endif
36695+
36696+#if RPI_WORKER_WAIT_PASS_0
36697+    if (do_wait)
36698+        rpi_sem_wait(&jb->sem);
36699+    rpi_cache_flush_execute(jb->rfe);
36700+#endif
36701+}
36702+
36703+
36704+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
36705+{
36706+    av_freep(&ipe->q);
36707+    gpu_free(&ipe->gptr);
36708+}
36709+
36710+static HEVCRpiJob * job_new(void)
36711+{
36712+    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
36713+
36714+    if (jb == NULL)
36715+        return NULL;
36716+
36717+    sem_init(&jb->sem, 0, 0);
36718+    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
36719+    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
36720+
36721+    jb->intra.n = 0;
36722+    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
36723+        goto fail1;
36724+
36725+    // * Sizeof the union structure might be overkill but at the moment it
36726+    //   is correct (it certainly isn't going to be too small)
36727+    // Set max fill to slack/2 from the end of the Q
36728+    // If we exceed this in any Q then we will schedule by size (which should
36729+    // mean that we never use that Q again part from syncs)
36730+    // * Given how agressive the overflow resonse is we could maybe put the
36731+    //   threshold even nearer the end, but I don't expect us to ever hit
36732+    //   it on any real stream anyway.
36733+
36734+    if (rpi_inter_pred_alloc(&jb->chroma_ip,
36735+                         QPU_N_MAX, QPU_N_GRP,
36736+                         QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
36737+                         QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0)
36738+        goto fail2;
36739+    if (rpi_inter_pred_alloc(&jb->luma_ip,
36740+                         QPU_N_MAX,  QPU_N_GRP,
36741+                         QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
36742+                         QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0)
36743+        goto fail3;
36744+
36745+    return jb;
36746+
36747+fail3:
36748+    rpi_free_inter_pred(&jb->luma_ip);
36749+fail2:
36750+    av_freep(&jb->intra.cmds);
36751+fail1:
36752+    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
36753+    rpi_cache_flush_finish(jb->rfe);
36754+    sem_destroy(&jb->sem);
36755+    return NULL;
36756+}
36757+
36758+static void job_delete(HEVCRpiJob * const jb)
36759+{
36760+    worker_pic_free_one(jb);
36761+    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
36762+    rpi_free_inter_pred(&jb->chroma_ip);
36763+    rpi_free_inter_pred(&jb->luma_ip);
36764+    av_freep(&jb->intra.cmds);
36765+    rpi_cache_flush_finish(jb->rfe);  // Not really needed - should do nothing
36766+    sem_destroy(&jb->sem);
36767+    av_free(jb);
36768+}
36769+
36770+static void jbg_delete(HEVCRpiJobGlobal * const jbg)
36771+{
36772+    HEVCRpiJob * jb;
36773+
36774+    if (jbg == NULL)
36775+        return;
36776+
36777+    jb = jbg->free1;
36778+    while (jb != NULL)
36779+    {
36780+        HEVCRpiJob * const jb2 = jb;
36781+        jb = jb2->next;
36782+        job_delete(jb2);
36783+    }
36784+
36785+    pthread_mutex_destroy(&jbg->lock);
36786+    av_free(jbg);
36787+}
36788+
36789+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
36790+{
36791+    HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
36792+    if (jbg == NULL)
36793+        return NULL;
36794+
36795+    pthread_mutex_init(&jbg->lock, NULL);
36796+
36797+    while (job_count-- != 0)
36798+    {
36799+        HEVCRpiJob * const jb = job_new();
36800+        if (jb == NULL)
36801+            goto fail;
36802+
36803+        jb->next = jbg->free1;
36804+        jbg->free1 = jb;
36805+    }
36806+
36807+    return jbg;
36808+
36809+fail:
36810+    jbg_delete(jbg);
36811+    return NULL;
36812+}
36813+
36814+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
36815+{
36816+    HEVCRpiJobGlobal * jbg;
36817+
36818+    if (jbc == NULL)
36819+        return;
36820+
36821+    jbg = jbc->jbg;
36822+
36823+    if (jbc->jb1 != NULL)
36824+        job_delete(jbc->jb1);
36825+
36826+    pthread_mutex_destroy(&jbc->in_lock);
36827+    sem_destroy(&jbc->sem_out);
36828+    av_free(jbc);
36829+
36830+    // Deref the global job context
36831+    if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
36832+        jbg_delete(jbg);
36833+}
36834+
36835+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
36836+{
36837+    HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
36838+
36839+    if (jbc == NULL)
36840+        return NULL;
36841+
36842+    jbc->jbg = jbg;
36843+    atomic_fetch_add(&jbg->ref_count, 1);
36844+
36845+    sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
36846+    pthread_mutex_init(&jbc->in_lock, NULL);
36847+
36848+    if ((jbc->jb1 = job_new()) == NULL)
36849+        goto fail;
36850+    jbc->jb1->jbc_local = jbc;
36851+
36852+    return jbc;
36853+
36854+fail:
36855+    rpi_job_ctl_delete(jbc);
36856+    return NULL;
36857+}
36858+
36859+
36860+
36861+static av_cold void hevc_init_worker(HEVCRpiContext * const s)
36862+{
36863+#if RPI_PASSES == 2
36864+    pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
36865+#elif RPI_PASSES == 3
36866+    pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
36867+    pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
36868+#else
36869+#error Passes confused
36870+#endif
36871+    pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
36872+
36873+    pass_queues_start_all(s);
36874+}
36875+
36876+static av_cold void hevc_exit_worker(HEVCRpiContext *s)
36877+{
36878+    pass_queues_term_all(s);
36879+
36880+    pass_queues_kill_all(s);
36881+
36882+    rpi_job_ctl_delete(s->jbc);
36883+    s->jbc = NULL;
36884+}
36885+
36886+
36887+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
36888+{
36889+    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
36890+    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
36891+    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
36892+
36893+    // Check for obvious disasters
36894+    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
36895+        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
36896+        return AVERROR_INVALIDDATA;
36897+    }
36898+
36899+    // If dependant then ctb_addr_ts != 0 from previous check
36900+    if (s->sh.dependent_slice_segment_flag) {
36901+        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
36902+        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
36903+            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
36904+            return AVERROR_INVALIDDATA;
36905+        }
36906+    }
36907+
36908+    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
36909+        tile_id + s->sh.num_entry_point_offsets >= tiles)
36910+    {
36911+        av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
36912+        return AVERROR_INVALIDDATA;
36913+    }
36914+
36915+    // Tiled stuff must start at start of tile if it has multiple entry points
36916+    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
36917+        s->sh.num_entry_point_offsets != 0 &&
36918+        ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
36919+    {
36920+        av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
36921+        return AVERROR_INVALIDDATA;
36922+    }
36923+
36924+    ff_hevc_rpi_cabac_init_decoder(lc);
36925+
36926+    // Setup any required decode vars
36927+    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
36928+
36929+//    printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
36930+    lc->qp_y = s->sh.slice_qp;
36931+
36932+    // General setup
36933+    lc->bt_line_no = 0;
36934+    lc->ts = ctb_addr_ts;
36935+    return 0;
36936+}
36937+
36938+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
36939+{
36940+    const GetBitContext * const gb = &s->HEVClc->gb;
36941+    RpiSliceHeader * const sh = &s->sh;
36942+    int i, j;
36943+
36944+    const unsigned int length = nal->size;
36945+    unsigned int offset = ((gb->index) >> 3) + 1;  // We have a bit & align still to come = +1 byte
36946+    unsigned int cmpt;
36947+    unsigned int startheader;
36948+
36949+    if (sh->num_entry_point_offsets == 0) {
36950+        s->data = NULL;
36951+        return 0;
36952+    }
36953+
36954+    // offset in slice header includes emulation prevention bytes.
36955+    // Unfortunately those have been removed by the time we get here so we
36956+    // have to compensate.  The nal layer keeps a track of where they were.
36957+    for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
36958+        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
36959+            startheader--;
36960+            cmpt++;
36961+        }
36962+    }
36963+
36964+    for (i = 1; i < sh->num_entry_point_offsets; i++) {
36965+        offset += (sh->entry_point_offset[i - 1] - cmpt);
36966+        for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
36967+            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
36968+                startheader--;
36969+                cmpt++;
36970+            }
36971+        }
36972+        if (sh->entry_point_offset[i] <= cmpt) {
36973+            av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
36974+            return AVERROR_INVALIDDATA;
36975+        }
36976+        sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
36977+        sh->offset[i - 1] = offset;
36978+    }
36979+
36980+    offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
36981+    if (length < offset) {
36982+        av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
36983+        return AVERROR_INVALIDDATA;
36984+    }
36985+    sh->size[sh->num_entry_point_offsets - 1] = length - offset;
36986+    sh->offset[sh->num_entry_point_offsets - 1] = offset;
36987+
36988+    // Remember data start pointer as we won't have nal later
36989+    s->data = nal->data;
36990+    return 0;
36991+}
36992+
36993+
36994+// Return
36995+// < 0   Error
36996+// 0     OK
36997+//
36998+// jb->ctu_ts_last < 0       Job still filling
36999+// jb->ctu_ts_last >= 0      Job ready
37000+
37001+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
37002+{
37003+    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
37004+    const unsigned int ctb_size = (1 << log2_ctb_size);
37005+    HEVCRpiJob * const jb = lc->jb0;
37006+    int more_data = 1;
37007+    unsigned int ctb_addr_ts = lc->ts;
37008+    unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
37009+    unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
37010+    const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
37011+
37012+    lc->unit_done = 0;
37013+
37014+    while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
37015+    {
37016+        int q_full;
37017+        const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
37018+
37019+        hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
37020+
37021+        ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
37022+
37023+        hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
37024+
37025+        s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
37026+        s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
37027+        s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
37028+
37029+        // Zap stashes if navail
37030+        if ((lc->ctb_avail & AVAIL_U) == 0)
37031+            zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
37032+        if ((lc->ctb_avail & AVAIL_L) == 0)
37033+        {
37034+            memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
37035+            zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
37036+        }
37037+#if MVF_STASH_WIDTH > 64
37038+        // Restore left mvf stash at start of tile if not at start of line
37039+        if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
37040+        {
37041+            unsigned int i;
37042+            HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
37043+            const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
37044+            for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
37045+            {
37046+                *dst = *src++;
37047+                dst += MVF_STASH_WIDTH_PU;
37048+            }
37049+        }
37050+#endif
37051+
37052+        // Set initial tu states
37053+        lc->tu.cu_qp_delta = 0;
37054+        lc->tu.is_cu_qp_delta_wanted = 0;
37055+        lc->tu.cu_chroma_qp_offset_wanted = 0;
37056+
37057+        // Decode
37058+        more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
37059+
37060+        if (ff_hevc_rpi_cabac_overflow(lc))
37061+        {
37062+            av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
37063+            more_data = AVERROR_INVALIDDATA;
37064+        }
37065+
37066+        if (more_data < 0) {
37067+            s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN;  // Mark slice as broken
37068+            return more_data;
37069+        }
37070+
37071+        if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
37072+             (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
37073+        {
37074+            if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
37075+                ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
37076+            {
37077+                av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
37078+                return -1;
37079+            }
37080+        }
37081+
37082+        // --- Post CTB processing
37083+
37084+        // Stash rpl top/left for deblock that needs to remember such things cross-slice
37085+        s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
37086+        s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
37087+
37088+        if (!s->is_irap)
37089+        {
37090+            // Copy MVF up to up-left & stash to up
37091+            {
37092+                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
37093+                HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
37094+
37095+    //            printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
37096+
37097+                lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
37098+                memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
37099+            }
37100+            // Stash sideways if end of tile line but not end of line (no point)
37101+            // ** Could/should do this @ end of fn
37102+#if MVF_STASH_WIDTH > 64
37103+            if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
37104+#endif
37105+            {
37106+                unsigned int i;
37107+                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
37108+                HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
37109+                for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
37110+                {
37111+                    *dst++ = *src;
37112+                    src += MVF_STASH_WIDTH_PU;
37113+                }
37114+            }
37115+        }
37116+
37117+        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
37118+            ff_hevc_rpi_save_states(s, lc);
37119+
37120+        // Report progress so we can use our MVs in other frames
37121+        if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
37122+            ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
37123+
37124+        // End of line || End of tile line || End of tile
37125+        // (EoL covers end of frame for our purposes here)
37126+        q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
37127+
37128+        // Allocate QPU chunks on fixed size 64 pel boundries rather than
37129+        // whatever ctb_size is today.
37130+        // * We might quite like to continue to 64 pel vertical too but that
37131+        //   currently confuses WPP
37132+        if (((x_ctb + ctb_size) & 63) == 0 || q_full)
37133+        {
37134+            int overflow = 0;
37135+            if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
37136+                overflow = 1;
37137+            if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
37138+                overflow = 1;
37139+            if (overflow)
37140+            {
37141+                // * This is very annoying (and slow) to cope with in WPP so
37142+                //   we treat it as an error there (no known stream triggers this
37143+                //   with the current buffer sizes).  Non-wpp should cope fine.
37144+                av_log(s->avctx, AV_LOG_WARNING,  "%s: Q full before EoL\n", __func__);
37145+                q_full = 1;
37146+            }
37147+        }
37148+
37149+        // Inc TS to next.
37150+        ctb_addr_ts++;
37151+        ctb_addr_rs++;
37152+        x_ctb += ctb_size;
37153+
37154+        if (q_full)
37155+        {
37156+            // Do job
37157+            // Prep for submission
37158+            jb->ctu_ts_last = ctb_addr_ts - 1;  // Was pre-inced
37159+            job_gen_bounds(s, jb);
37160+            break;
37161+        }
37162+
37163+        // If max_blocks started as 0 then this will never be true
37164+        if (--max_blocks == 0)
37165+            break;
37166+    }
37167+
37168+    lc->unit_done = (more_data <= 0);
37169+    lc->ts = ctb_addr_ts;
37170+    return 0;
37171+}
37172+
37173+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
37174+{
37175+    lc->context = s;
37176+    lc->jb0 = NULL;
37177+    lc->lc_n = n;
37178+    lc->bt_terminate = 0;
37179+    lc->bt_psem_out = NULL;
37180+    sem_init(&lc->bt_sem_in, 0, 0);
37181+}
37182+
37183+#define TRACE_WPP 0
37184+#if RPI_EXTRA_BIT_THREADS > 0
37185+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
37186+{
37187+    unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
37188+    return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
37189+}
37190+
37191+// Move local context parameters from an aux bit thread back to the main
37192+// thread at the end of a slice as processing is going to continue there.
37193+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
37194+{
37195+    if (src_lc == dst_lc) {
37196+        return;
37197+    }
37198+
37199+    // Move the job
37200+    // We will still have an active job if the final line terminates early
37201+    // Dest should always be null by now
37202+    av_assert1(dst_lc->jb0 == NULL);
37203+    dst_lc->jb0 = src_lc->jb0;
37204+    src_lc->jb0 = NULL;
37205+
37206+    // Always need to store where we are in the bitstream
37207+    dst_lc->ts = src_lc->ts;
37208+    dst_lc->gb = src_lc->gb;
37209+    // Cabac init request will be built at start of next slice
37210+
37211+    // Need to store context if we might have a dependent seg
37212+    if (is_dep)
37213+    {
37214+        dst_lc->qPy_pred = src_lc->qPy_pred;
37215+        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
37216+        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
37217+        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
37218+    }
37219+}
37220+
37221+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
37222+{
37223+    rpi_sem_wait(&lc->bt_sem_in);
37224+    return lc->bt_terminate;
37225+}
37226+
37227+// Do one WPP line
37228+// Will not work correctly over horizontal tile boundries - vertical should be OK
37229+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
37230+{
37231+    const int is_tile = lc->bt_is_tile;
37232+    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
37233+    const unsigned int line = lc->bt_line_no;
37234+    const unsigned int line_inc = lc->bt_line_inc;
37235+    const int is_last = (line >= lc->bt_last_line);
37236+
37237+    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
37238+    const unsigned int ts_next =
37239+        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
37240+            INT_MAX :
37241+        is_tile ?
37242+            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
37243+            lc->ts + lc->bt_line_width * line_inc;
37244+    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
37245+    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
37246+    unsigned int ts_prev;
37247+    int loop_n = 0;
37248+    int err = 0;
37249+
37250+    av_assert1(line <= s->sh.num_entry_point_offsets);
37251+
37252+#if TRACE_WPP
37253+    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
37254+           lc->lc_n,  is_tile ? "Tile" : "WPP", tile_id,
37255+           line, lc->bt_last_line, s->sh.num_entry_point_offsets,
37256+           lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
37257+#endif
37258+    if (line != 0)
37259+    {
37260+        const uint8_t * const data = s->data + s->sh.offset[line - 1];
37261+        const unsigned int len = s->sh.size[line - 1];
37262+        if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
37263+            return err;
37264+
37265+        ff_init_cabac_decoder(&lc->cc, data, len);
37266+    }
37267+
37268+    // We should never be processing a dependent slice here so reset is good
37269+    // ?? These probably shouldn't be needed (as they should be set by later
37270+    //    logic) but do seem to be required
37271+    lc->qp_y = s->sh.slice_qp;
37272+
37273+    do
37274+    {
37275+        if (!is_last && loop_n > 1) {
37276+#if TRACE_WPP
37277+            printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
37278+#endif
37279+            sem_post(lc->bt_psem_out);
37280+        }
37281+        // The wait for loop_n == 0 has been done in bit_thread
37282+        if (!is_first && loop_n != 0)
37283+        {
37284+#if TRACE_WPP
37285+            printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
37286+#endif
37287+            if (wait_bt_sem_in(lc) != 0)
37288+                return AVERROR_EXIT;
37289+        }
37290+
37291+#if TRACE_WPP
37292+        {
37293+            int n;
37294+            sem_getvalue(&lc->bt_sem_in, &n);
37295+            printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
37296+        }
37297+#endif
37298+
37299+        ts_prev = lc->ts;
37300+
37301+        // If we have had an error - do no further decode but do continue
37302+        // moving signals around so the other threads continue to operate
37303+        // correctly (or at least as correctly as they can with this line missing)
37304+        //
37305+        // Errors in WPP/Tile are less fatal than normal as we have a good idea
37306+        // of how to restart on the next line so there is no need to give up totally
37307+        if (err != 0)
37308+        {
37309+            lc->unit_done = 0;
37310+            lc->ts += partial_size;
37311+        }
37312+        else
37313+        {
37314+            worker_pass0_ready(s, lc);
37315+
37316+            if ((err = fill_job(s, lc, partial_size)) < 0 ||
37317+                (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
37318+            {
37319+                if (err == 0) {
37320+                    av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
37321+                    err = AVERROR_INVALIDDATA;
37322+                }
37323+                worker_free(s, lc);
37324+                lc->ts = ts_prev + partial_size;  // Pretend we did all that
37325+                lc->unit_done = 0;
37326+            }
37327+            else if (is_tile)
37328+            {
37329+                worker_submit_job(s, lc);
37330+            }
37331+        }
37332+
37333+        ++loop_n;
37334+    } while (lc->ts < ts_eol && !lc->unit_done);
37335+
37336+    // If we are on the last line & we didn't get a whole line we must wait for
37337+    // and sink the sem_posts from the line above / tile to the left.
37338+    while ((ts_prev += partial_size) < ts_eol)
37339+    {
37340+#if TRACE_WPP
37341+        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
37342+#endif
37343+        if (wait_bt_sem_in(lc) != 0)
37344+            return AVERROR_EXIT;
37345+    }
37346+
37347+    lc->bt_line_no += line_inc;
37348+
37349+    if (!is_tile && err == 0)
37350+        worker_submit_job(s, lc);
37351+
37352+    if (!is_last) {
37353+        lc->ts = ts_next;
37354+
37355+#if TRACE_WPP
37356+        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
37357+#endif
37358+        sem_post(lc->bt_psem_out);
37359+        if (loop_n > 1) {
37360+#if TRACE_WPP
37361+            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
37362+#endif
37363+            sem_post(lc->bt_psem_out);
37364+        }
37365+    }
37366+    else
37367+    {
37368+        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);  // * & not EoT
37369+#if MVF_STASH_WIDTH > 64
37370+        // Horrid calculations to work out what we want but luckily this should almost never execute
37371+        // **** Move to movlc
37372+        if (!s->is_irap)
37373+        {
37374+            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
37375+            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
37376+            {
37377+                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
37378+                unsigned int i;
37379+                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
37380+                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
37381+
37382+                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
37383+                {
37384+                    *d_mvf = *s_mvf;
37385+                    d_mvf += MVF_STASH_WIDTH_PU;
37386+                    s_mvf += MVF_STASH_WIDTH_PU;
37387+                }
37388+
37389+            }
37390+        }
37391+#endif
37392+        // When all done poke the thread 0 sem_in one final time
37393+#if TRACE_WPP
37394+        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
37395+#endif
37396+        sem_post(&s->HEVClcList[0]->bt_sem_in);
37397+    }
37398+
37399+#if TRACE_WPP
37400+    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
37401+#endif
37402+    return err;
37403+}
37404+
37405+static void wpp_setup_lcs(HEVCRpiContext * const s)
37406+{
37407+    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
37408+    const unsigned int line_width = line_ts_width(s, ts);
37409+
37410+    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
37411+    {
37412+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
37413+        lc->ts = ts;
37414+        lc->bt_is_tile = 0;
37415+        lc->bt_line_no = i;
37416+        lc->bt_line_width = line_width;
37417+        lc->bt_last_line = s->sh.num_entry_point_offsets;
37418+        lc->bt_line_inc = RPI_BIT_THREADS;
37419+        ts += line_width;
37420+    }
37421+}
37422+
37423+
37424+// Can only process tile single row at once
37425+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
37426+{
37427+    const HEVCRpiPPS * const pps = s->ps.pps;
37428+    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
37429+    const unsigned int tile0 = pps->tile_id[ts0];
37430+    const unsigned int col0 = tile0 % pps->num_tile_columns;
37431+
37432+    const unsigned int col = (slice_row == 0) ? col0 : 0;
37433+    unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
37434+    const unsigned int last_line = FFMIN(
37435+        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
37436+
37437+    const unsigned int par =
37438+        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
37439+#if TRACE_WPP
37440+    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
37441+           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
37442+#endif
37443+    for (unsigned int i = 0; i != par; ++i, ++line)
37444+    {
37445+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
37446+        const unsigned int tile = tile0 + line;
37447+
37448+        lc->ts = pps->tile_pos_ts[tile];
37449+        lc->bt_line_no = line;
37450+        lc->bt_is_tile = 1;
37451+        lc->bt_line_width = line_ts_width(s, lc->ts);
37452+        lc->bt_last_line = last_line;
37453+        lc->bt_line_inc = par;
37454+    }
37455+}
37456+
37457+
37458+static void * bit_thread(void * v)
37459+{
37460+    HEVCRpiLocalContext * const lc = v;
37461+    HEVCRpiContext *const s = lc->context;
37462+
37463+    while (wait_bt_sem_in(lc) == 0)
37464+    {
37465+        int err;
37466+
37467+        if ((err = rpi_run_one_line(s, lc, 0)) < 0) {  // Never first tile/wpp
37468+            if (lc->bt_terminate) {
37469+                av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
37470+                break;
37471+            }
37472+            av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
37473+        }
37474+    }
37475+
37476+    return NULL;
37477+}
37478+
37479+static int bit_threads_start(HEVCRpiContext * const s)
37480+{
37481+    if (s->bt_started)
37482+        return 0;
37483+
37484+    for (int i = 1; i < RPI_BIT_THREADS; ++i)
37485+    {
37486+        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
37487+        if (s->HEVClcList[i] == NULL) {
37488+            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
37489+                return -1;
37490+        }
37491+
37492+        bt_lc_init(s, s->HEVClcList[i], i);
37493+        job_lc_init(s->HEVClcList[i]);
37494+    }
37495+
37496+    // Link the sems in a circle
37497+    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
37498+        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
37499+    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
37500+
37501+    // Init all lc before starting any threads
37502+    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
37503+    {
37504+        if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
37505+            return -1;
37506+    }
37507+
37508+    s->bt_started = 1;
37509+    return 0;
37510+}
37511+
37512+static int bit_threads_kill(HEVCRpiContext * const s)
37513+{
37514+    if (!s->bt_started)
37515+        return 0;
37516+    s->bt_started = 0;
37517+
37518+    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
37519+    {
37520+        HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
37521+        if (lc == NULL)
37522+            break;
37523+
37524+        lc->bt_terminate = 1;
37525+        sem_post(&lc->bt_sem_in);
37526+        pthread_join(s->bit_threads[i], NULL);
37527+
37528+        sem_destroy(&lc->bt_sem_in);
37529+        job_lc_kill(lc);
37530+    }
37531+    return 0;
37532+}
37533+#endif
37534+
37535+
37536+// If we are at EoT and the row is shorter than the number of jobs
37537+// we can queue then we have to wait for it to finish, otherwise we risk
37538+// cache/QPU disasters
37539+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
37540+{
37541+    return
37542+        s->ps.pps->tile_wpp_inter_disable >= 2 &&
37543+        s->sh.slice_type != HEVC_SLICE_I &&
37544+        n >= 0 &&
37545+        (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
37546+}
37547+
37548+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
37549+{
37550+    HEVCRpiContext * const s  = avctxt->priv_data;
37551+    HEVCRpiLocalContext * const lc = s->HEVClc;
37552+    int err;
37553+
37554+    // Start of slice
37555+    if ((err = slice_start(s, lc)) != 0)
37556+        return err;
37557+
37558+#if RPI_EXTRA_BIT_THREADS > 0
37559+
37560+    if (s->sh.offload_tiles)
37561+    {
37562+        unsigned int slice_row = 0;
37563+
37564+#if TRACE_WPP
37565+        printf("%s: Do Tiles\n", __func__);
37566+#endif
37567+        // Generate & start extra bit threads if they aren't already running
37568+        bit_threads_start(s);
37569+
37570+        do
37571+        {
37572+            // Reset lc lines etc.
37573+            tile_one_row_setup_lcs(s, slice_row);
37574+
37575+#if TRACE_WPP
37576+            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
37577+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
37578+#endif
37579+
37580+            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
37581+#if TRACE_WPP
37582+            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
37583+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
37584+#endif
37585+
37586+            while (lc->bt_line_no <= lc->bt_last_line) {
37587+                rpi_sem_wait(&lc->bt_sem_in);
37588+                rpi_run_one_line(s, lc, 0);
37589+            }
37590+#if TRACE_WPP
37591+            printf("%s: Done body\n", __func__);
37592+#endif
37593+
37594+            // Wait for everything else to finish
37595+            rpi_sem_wait(&lc->bt_sem_in);
37596+
37597+            ++slice_row;
37598+        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
37599+
37600+
37601+#if TRACE_WPP
37602+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
37603+#endif
37604+    }
37605+    else if (s->sh.offload_wpp)
37606+    {
37607+#if TRACE_WPP
37608+        printf("%s: Do WPP\n", __func__);
37609+#endif
37610+        // Generate & start extra bit threads if they aren't already running
37611+        bit_threads_start(s);
37612+
37613+        // Reset lc lines etc.
37614+        wpp_setup_lcs(s);
37615+
37616+        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
37617+#if TRACE_WPP
37618+        printf("%s: Done 1st\n", __func__);
37619+#endif
37620+
37621+        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
37622+            rpi_sem_wait(&lc->bt_sem_in);
37623+            rpi_run_one_line(s, lc, 0);
37624+        }
37625+#if TRACE_WPP
37626+        printf("%s: Done body\n", __func__);
37627+#endif
37628+
37629+        // Wait for everything else to finish
37630+        rpi_sem_wait(&lc->bt_sem_in);
37631+
37632+#if TRACE_WPP
37633+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
37634+#endif
37635+    }
37636+    else
37637+#endif
37638+    {
37639+#if TRACE_WPP
37640+        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
37641+#endif
37642+        // Single bit thread
37643+        do {
37644+            // Make sure we have space to prepare the next job
37645+            worker_pass0_ready(s, lc);
37646+
37647+            if ((err = fill_job(s, lc, 0)) < 0)
37648+                goto fail;
37649+
37650+            worker_submit_job(s, lc);
37651+
37652+            if (tile_needs_wait(s, lc->ts - 1))
37653+                worker_wait(s, lc);
37654+
37655+        } while (!lc->unit_done);
37656+
37657+#if TRACE_WPP
37658+        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
37659+#endif
37660+    }
37661+
37662+    // If we have reached the end of the frame
37663+    // then wait for the worker to finish all its jobs
37664+    if (lc->ts >= s->ps.sps->ctb_size)
37665+        worker_wait(s, lc);
37666+
37667+#if RPI_TSTATS
37668+    {
37669+        HEVCRpiStats *const ts = &s->tstats;
37670+
37671+        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
37672+               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
37673+               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
37674+               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
37675+               ts->y_pred2_hgt16, ts->y_pred2_hle16);
37676+        memset(ts, 0, sizeof(*ts));
37677+    }
37678+#endif
37679+
37680+    return lc->ts;
37681+
37682+fail:
37683+    // Cleanup
37684+    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
37685+    // Free our job & wait for termination
37686+    worker_free(s, lc);
37687+    worker_wait(s, lc);
37688+    return err;
37689+}
37690+
37691+
37692+static void set_no_backward_pred(HEVCRpiContext * const s)
37693+{
37694+    int i, j;
37695+    const RefPicList *const refPicList = s->refPicList;
37696+
37697+    s->no_backward_pred_flag = 0;
37698+    if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
37699+        return;
37700+
37701+    for (j = 0; j < 2; j++) {
37702+        for (i = 0; i < refPicList[j].nb_refs; i++) {
37703+            if (refPicList[j].list[i] > s->poc) {
37704+                s->no_backward_pred_flag = 1;
37705+                return;
37706+            }
37707+        }
37708+    }
37709+}
37710+
37711+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
37712+{
37713+    int err;
37714+    if ((err = gen_entry_points(s, nal)) < 0)
37715+        return err;
37716+
37717+    set_no_backward_pred(s);
37718+
37719+    return rpi_decode_entry(s->avctx, NULL);
37720+}
37721+
37722+static int set_side_data(HEVCRpiContext *s)
37723+{
37724+    AVFrame *out = s->ref->frame;
37725+
37726+    if (s->sei.frame_packing.present &&
37727+        s->sei.frame_packing.arrangement_type >= 3 &&
37728+        s->sei.frame_packing.arrangement_type <= 5 &&
37729+        s->sei.frame_packing.content_interpretation_type > 0 &&
37730+        s->sei.frame_packing.content_interpretation_type < 3) {
37731+        AVStereo3D *stereo = av_stereo3d_create_side_data(out);
37732+        if (!stereo)
37733+            return AVERROR(ENOMEM);
37734+
37735+        switch (s->sei.frame_packing.arrangement_type) {
37736+        case 3:
37737+            if (s->sei.frame_packing.quincunx_subsampling)
37738+                stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
37739+            else
37740+                stereo->type = AV_STEREO3D_SIDEBYSIDE;
37741+            break;
37742+        case 4:
37743+            stereo->type = AV_STEREO3D_TOPBOTTOM;
37744+            break;
37745+        case 5:
37746+            stereo->type = AV_STEREO3D_FRAMESEQUENCE;
37747+            break;
37748+        }
37749+
37750+        if (s->sei.frame_packing.content_interpretation_type == 2)
37751+            stereo->flags = AV_STEREO3D_FLAG_INVERT;
37752+
37753+        if (s->sei.frame_packing.arrangement_type == 5) {
37754+            if (s->sei.frame_packing.current_frame_is_frame0_flag)
37755+                stereo->view = AV_STEREO3D_VIEW_LEFT;
37756+            else
37757+                stereo->view = AV_STEREO3D_VIEW_RIGHT;
37758+        }
37759+    }
37760+
37761+    if (s->sei.display_orientation.present &&
37762+        (s->sei.display_orientation.anticlockwise_rotation ||
37763+         s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
37764+        double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
37765+        AVFrameSideData *rotation = av_frame_new_side_data(out,
37766+                                                           AV_FRAME_DATA_DISPLAYMATRIX,
37767+                                                           sizeof(int32_t) * 9);
37768+        if (!rotation)
37769+            return AVERROR(ENOMEM);
37770+
37771+        av_display_rotation_set((int32_t *)rotation->data, angle);
37772+        av_display_matrix_flip((int32_t *)rotation->data,
37773+                               s->sei.display_orientation.hflip,
37774+                               s->sei.display_orientation.vflip);
37775+    }
37776+
37777+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
37778+    // so the side data persists for the entire coded video sequence.
37779+    if (s->sei.mastering_display.present > 0 &&
37780+        IS_IRAP(s) && s->no_rasl_output_flag) {
37781+        s->sei.mastering_display.present--;
37782+    }
37783+    if (s->sei.mastering_display.present) {
37784+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
37785+        const int mapping[3] = {2, 0, 1};
37786+        const int chroma_den = 50000;
37787+        const int luma_den = 10000;
37788+        int i;
37789+        AVMasteringDisplayMetadata *metadata =
37790+            av_mastering_display_metadata_create_side_data(out);
37791+        if (!metadata)
37792+            return AVERROR(ENOMEM);
37793+
37794+        for (i = 0; i < 3; i++) {
37795+            const int j = mapping[i];
37796+            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
37797+            metadata->display_primaries[i][0].den = chroma_den;
37798+            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
37799+            metadata->display_primaries[i][1].den = chroma_den;
37800+        }
37801+        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
37802+        metadata->white_point[0].den = chroma_den;
37803+        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
37804+        metadata->white_point[1].den = chroma_den;
37805+
37806+        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
37807+        metadata->max_luminance.den = luma_den;
37808+        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
37809+        metadata->min_luminance.den = luma_den;
37810+        metadata->has_luminance = 1;
37811+        metadata->has_primaries = 1;
37812+
37813+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
37814+        av_log(s->avctx, AV_LOG_DEBUG,
37815+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
37816+               av_q2d(metadata->display_primaries[0][0]),
37817+               av_q2d(metadata->display_primaries[0][1]),
37818+               av_q2d(metadata->display_primaries[1][0]),
37819+               av_q2d(metadata->display_primaries[1][1]),
37820+               av_q2d(metadata->display_primaries[2][0]),
37821+               av_q2d(metadata->display_primaries[2][1]),
37822+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
37823+        av_log(s->avctx, AV_LOG_DEBUG,
37824+               "min_luminance=%f, max_luminance=%f\n",
37825+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
37826+    }
37827+    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
37828+    // so the side data persists for the entire coded video sequence.
37829+    if (s->sei.content_light.present > 0 &&
37830+        IS_IRAP(s) && s->no_rasl_output_flag) {
37831+        s->sei.content_light.present--;
37832+    }
37833+    if (s->sei.content_light.present) {
37834+        AVContentLightMetadata *metadata =
37835+            av_content_light_metadata_create_side_data(out);
37836+        if (!metadata)
37837+            return AVERROR(ENOMEM);
37838+        metadata->MaxCLL  = s->sei.content_light.max_content_light_level;
37839+        metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
37840+
37841+        av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
37842+        av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
37843+               metadata->MaxCLL, metadata->MaxFALL);
37844+    }
37845+
37846+    if (s->sei.a53_caption.a53_caption) {
37847+        AVFrameSideData* sd = av_frame_new_side_data(out,
37848+                                                     AV_FRAME_DATA_A53_CC,
37849+                                                     s->sei.a53_caption.a53_caption_size);
37850+        if (sd)
37851+            memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
37852+        av_freep(&s->sei.a53_caption.a53_caption);
37853+        s->sei.a53_caption.a53_caption_size = 0;
37854+        s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
37855+    }
37856+
37857+    if (s->sei.alternative_transfer.present &&
37858+        av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
37859+        s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
37860+        s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
37861+    }
37862+
37863+    return 0;
37864+}
37865+
37866+static int hevc_frame_start(HEVCRpiContext * const s)
37867+{
37868+    int ret;
37869+
37870+    memset(s->bs_horizontal, 0, s->bs_size * 2);  // Does V too
37871+    memset(s->is_pcm,        0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
37872+    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
37873+
37874+    // Only need to remember intra for CIP
37875+    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
37876+        s->is_intra = NULL;
37877+    else
37878+    {
37879+        s->is_intra = s->is_intra_store;
37880+        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
37881+    }
37882+
37883+    s->is_decoded        = 0;
37884+    s->first_nal_type    = s->nal_unit_type;
37885+
37886+    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
37887+
37888+    if (s->pkt.nb_nals > s->rpl_tab_size)
37889+    {
37890+        // In most cases it will be faster to free & realloc as that doesn't
37891+        // require (an unwanted) copy
37892+        av_freep(&s->rpl_tab);
37893+        s->rpl_tab_size = 0;
37894+        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL)
37895+            goto fail;
37896+        s->rpl_tab_size = s->pkt.nb_nals;
37897+    }
37898+    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
37899+
37900+    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
37901+    if (ret < 0)
37902+        goto fail;
37903+
37904+    // Resize rpl_tab to max that we might want
37905+    ret = ff_hevc_rpi_frame_rps(s);
37906+    if (ret < 0) {
37907+        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
37908+        goto fail;
37909+    }
37910+
37911+    s->ref->frame->key_frame = IS_IRAP(s);
37912+
37913+    ret = set_side_data(s);
37914+    if (ret < 0)
37915+        goto fail;
37916+
37917+    s->frame->pict_type = 3 - s->sh.slice_type;
37918+
37919+    if (!IS_IRAP(s))
37920+        ff_hevc_rpi_bump_frame(s);
37921+
37922+    av_frame_unref(s->output_frame);
37923+    ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
37924+    if (ret < 0)
37925+        goto fail;
37926+
37927+    ff_thread_finish_setup(s->avctx);
37928+
37929+    return 0;
37930+
37931+fail:
37932+    if (s->ref)
37933+        ff_hevc_rpi_unref_frame(s, s->ref, ~0);
37934+    s->ref = NULL;
37935+    return ret;
37936+}
37937+
37938+static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
37939+{
37940+    // From Table 7-1
37941+    return (nal_unit_type & ~0xe) == 0;  // True for 0, 2, 4, 6, 8, 10, 12, 14
37942+}
37943+
37944+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
37945+{
37946+    GetBitContext * const gb    = &s->HEVClc->gb;
37947+    int ctb_addr_ts, ret;
37948+
37949+    *gb              = nal->gb;
37950+    s->nal_unit_type = nal->type;
37951+    s->temporal_id   = nal->temporal_id;
37952+
37953+    switch (s->nal_unit_type) {
37954+    case HEVC_NAL_VPS:
37955+        ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
37956+        if (ret < 0)
37957+            goto fail;
37958+        break;
37959+    case HEVC_NAL_SPS:
37960+        ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
37961+                                     s->apply_defdispwin);
37962+        if (ret < 0)
37963+            goto fail;
37964+        break;
37965+    case HEVC_NAL_PPS:
37966+        ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
37967+        if (ret < 0)
37968+            goto fail;
37969+        break;
37970+    case HEVC_NAL_SEI_PREFIX:
37971+    case HEVC_NAL_SEI_SUFFIX:
37972+        ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
37973+        if (ret < 0)
37974+            goto fail;
37975+        break;
37976+    case HEVC_NAL_TRAIL_R:
37977+    case HEVC_NAL_TRAIL_N:
37978+    case HEVC_NAL_TSA_N:
37979+    case HEVC_NAL_TSA_R:
37980+    case HEVC_NAL_STSA_N:
37981+    case HEVC_NAL_STSA_R:
37982+    case HEVC_NAL_BLA_W_LP:
37983+    case HEVC_NAL_BLA_W_RADL:
37984+    case HEVC_NAL_BLA_N_LP:
37985+    case HEVC_NAL_IDR_W_RADL:
37986+    case HEVC_NAL_IDR_N_LP:
37987+    case HEVC_NAL_CRA_NUT:
37988+    case HEVC_NAL_RADL_N:
37989+    case HEVC_NAL_RADL_R:
37990+    case HEVC_NAL_RASL_N:
37991+    case HEVC_NAL_RASL_R:
37992+        ret = hls_slice_header(s);
37993+        if (ret < 0)
37994+            return ret;
37995+
37996+        // The definition of _N unit types is "non-reference for other frames
37997+        // with the same temporal_id" so they may/will be ref frames for pics
37998+        // with a higher temporal_id.
37999+        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
38000+            !is_non_ref_unit_type(s->nal_unit_type);
38001+        s->offload_recon = s->threads_type != 0 && s->used_for_ref;
38002+        s->is_irap = IS_IRAP(s);
38003+
38004+#if DEBUG_DECODE_N
38005+        {
38006+            static int z = 0;
38007+            if (IS_IDR(s)) {
38008+                z = 1;
38009+            }
38010+            if (z != 0 && z++ > DEBUG_DECODE_N) {
38011+                s->is_decoded = 0;
38012+                break;
38013+            }
38014+        }
38015+#endif
38016+        if (
38017+            (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
38018+            (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
38019+            (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
38020+            (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s)))
38021+        {
38022+            s->is_decoded = 0;
38023+            break;
38024+        }
38025+
38026+        if (s->sh.first_slice_in_pic_flag) {
38027+            if (s->max_ra == INT_MAX) {
38028+                if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
38029+                    s->max_ra = s->poc;
38030+                } else {
38031+                    if (IS_IDR(s))
38032+                        s->max_ra = INT_MIN;
38033+                }
38034+            }
38035+
38036+            if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
38037+                s->poc <= s->max_ra) {
38038+                s->is_decoded = 0;
38039+                break;
38040+            } else {
38041+                if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
38042+                    s->max_ra = INT_MIN;
38043+            }
38044+
38045+            ret = hevc_frame_start(s);
38046+            if (ret < 0)
38047+                return ret;
38048+        } else if (!s->ref) {
38049+            av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
38050+            goto fail;
38051+        }
38052+
38053+        if (s->nal_unit_type != s->first_nal_type) {
38054+            av_log(s->avctx, AV_LOG_ERROR,
38055+                   "Non-matching NAL types of the VCL NALUs: %d %d\n",
38056+                   s->first_nal_type, s->nal_unit_type);
38057+            return AVERROR_INVALIDDATA;
38058+        }
38059+
38060+        if (!s->sh.dependent_slice_segment_flag &&
38061+            s->sh.slice_type != HEVC_SLICE_I) {
38062+            ret = ff_hevc_rpi_slice_rpl(s);
38063+            if (ret < 0) {
38064+                av_log(s->avctx, AV_LOG_WARNING,
38065+                       "Error constructing the reference lists for the current slice.\n");
38066+                goto fail;
38067+            }
38068+        }
38069+
38070+        ctb_addr_ts = hls_slice_data(s, nal);
38071+        if (ctb_addr_ts >= s->ps.sps->ctb_size) {
38072+            s->is_decoded = 1;
38073+        }
38074+
38075+        if (ctb_addr_ts < 0) {
38076+            ret = ctb_addr_ts;
38077+            goto fail;
38078+        }
38079+        break;
38080+    case HEVC_NAL_EOS_NUT:
38081+    case HEVC_NAL_EOB_NUT:
38082+        s->seq_decode = (s->seq_decode + 1) & 0xff;
38083+        s->max_ra     = INT_MAX;
38084+        break;
38085+    case HEVC_NAL_AUD:
38086+    case HEVC_NAL_FD_NUT:
38087+        break;
38088+    default:
38089+        av_log(s->avctx, AV_LOG_INFO,
38090+               "Skipping NAL unit %d\n", s->nal_unit_type);
38091+    }
38092+
38093+    return 0;
38094+fail:
38095+    if (s->avctx->err_recognition & AV_EF_EXPLODE)
38096+        return ret;
38097+    return 0;
38098+}
38099+
38100+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
38101+{
38102+    int i, ret = 0;
38103+    int eos_at_start = 1;
38104+
38105+    s->ref = NULL;
38106+    s->last_eos = s->eos;
38107+    s->eos = 0;
38108+
38109+    /* split the input packet into NAL units, so we know the upper bound on the
38110+     * number of slices in the frame */
38111+    ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
38112+                                s->nal_length_size, s->avctx->codec_id, 0, 0);
38113+    if (ret < 0) {
38114+        av_log(s->avctx, AV_LOG_ERROR,
38115+               "Error splitting the input into NAL units.\n");
38116+        return ret;
38117+    }
38118+
38119+    for (i = 0; i < s->pkt.nb_nals; i++) {
38120+        if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
38121+            s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
38122+            if (eos_at_start) {
38123+                s->last_eos = 1;
38124+            } else {
38125+                s->eos = 1;
38126+            }
38127+        } else {
38128+            eos_at_start = 0;
38129+        }
38130+    }
38131+
38132+    /* decode the NAL units */
38133+    for (i = 0; i < s->pkt.nb_nals; i++) {
38134+        ret = decode_nal_unit(s, &s->pkt.nals[i]);
38135+        if (ret < 0) {
38136+            av_log(s->avctx, AV_LOG_WARNING,
38137+                   "Error parsing NAL unit #%d.\n", i);
38138+            goto fail;
38139+        }
38140+    }
38141+
38142+fail:  // Also success path
38143+    if (s->ref != NULL) {
38144+        if (s->used_for_ref && s->threads_type != 0) {
38145+            ff_hevc_rpi_progress_signal_all_done(s);
38146+        }
38147+        else {
38148+            // Flush frame to real memory as we expect to be able to pass
38149+            // it straight on to mmal
38150+            flush_frame(s, s->frame);
38151+        }
38152+    }
38153+    return ret;
38154+}
38155+
38156+static void print_md5(void *log_ctx, int level, uint8_t md5[16])
38157+{
38158+    int i;
38159+    for (i = 0; i < 16; i++)
38160+        av_log(log_ctx, level, "%02"PRIx8, md5[i]);
38161+}
38162+
38163+static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
38164+{
38165+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
38166+    int pixel_shift;
38167+    int i, j;
38168+
38169+    if (!desc)
38170+        return AVERROR(EINVAL);
38171+
38172+    pixel_shift = desc->comp[0].depth > 8;
38173+
38174+    av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
38175+           s->poc);
38176+
38177+    /* the checksums are LE, so we have to byteswap for >8bpp formats
38178+     * on BE arches */
38179+#if HAVE_BIGENDIAN
38180+    if (pixel_shift && !s->checksum_buf) {
38181+        av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
38182+                       FFMAX3(frame->linesize[0], frame->linesize[1],
38183+                              frame->linesize[2]));
38184+        if (!s->checksum_buf)
38185+            return AVERROR(ENOMEM);
38186+    }
38187+#endif
38188+
38189+    for (i = 0; frame->data[i]; i++) {
38190+        int width  = s->avctx->coded_width;
38191+        int height = s->avctx->coded_height;
38192+        int w = (i == 1 || i == 2) ? (width  >> desc->log2_chroma_w) : width;
38193+        int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
38194+        uint8_t md5[16];
38195+
38196+        av_md5_init(s->md5_ctx);
38197+        for (j = 0; j < h; j++) {
38198+            const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
38199+#if HAVE_BIGENDIAN
38200+            if (pixel_shift) {
38201+                s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
38202+                                    (const uint16_t *) src, w);
38203+                src = s->checksum_buf;
38204+            }
38205+#endif
38206+            av_md5_update(s->md5_ctx, src, w << pixel_shift);
38207+        }
38208+        av_md5_final(s->md5_ctx, md5);
38209+
38210+        if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
38211+            av_log   (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
38212+            print_md5(s->avctx, AV_LOG_DEBUG, md5);
38213+            av_log   (s->avctx, AV_LOG_DEBUG, "; ");
38214+        } else {
38215+            av_log   (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
38216+            print_md5(s->avctx, AV_LOG_ERROR, md5);
38217+            av_log   (s->avctx, AV_LOG_ERROR, " != ");
38218+            print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
38219+            av_log   (s->avctx, AV_LOG_ERROR, "\n");
38220+            return AVERROR_INVALIDDATA;
38221+        }
38222+    }
38223+
38224+    av_log(s->avctx, AV_LOG_DEBUG, "\n");
38225+
38226+    return 0;
38227+}
38228+
38229+static int all_sps_supported(const HEVCRpiContext * const s)
38230+{
38231+    for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
38232+        if (s->ps.sps_list[i] != NULL)
38233+        {
38234+            const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
38235+            if (!is_sps_supported(sps))
38236+                return 0;
38237+        }
38238+    }
38239+    return 1;
38240+}
38241+
38242+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
38243+{
38244+    int ret, i;
38245+
38246+    ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
38247+                                   &s->nal_length_size, s->avctx->err_recognition,
38248+                                   s->apply_defdispwin, s->avctx);
38249+    if (ret < 0)
38250+        return ret;
38251+
38252+    /* export stream parameters from the first SPS */
38253+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
38254+        if (first && s->ps.sps_list[i]) {
38255+            const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
38256+            export_stream_params(s->avctx, &s->ps, sps);
38257+            break;
38258+        }
38259+    }
38260+
38261+    return 0;
38262+}
38263+
38264+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
38265+                             AVPacket *avpkt)
38266+{
38267+    int ret;
38268+    int new_extradata_size;
38269+    uint8_t *new_extradata;
38270+    HEVCRpiContext *s = avctx->priv_data;
38271+
38272+    if (!avpkt->size) {
38273+        ret = ff_hevc_rpi_output_frame(s, data, 1);
38274+        if (ret < 0)
38275+            return ret;
38276+
38277+        *got_output = ret;
38278+        return 0;
38279+    }
38280+
38281+    new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
38282+                                            &new_extradata_size);
38283+    if (new_extradata && new_extradata_size > 0) {
38284+        ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
38285+        if (ret < 0)
38286+            return ret;
38287+    }
38288+
38289+    s->ref = NULL;
38290+    ret    = decode_nal_units(s, avpkt->data, avpkt->size);
38291+    if (ret < 0)
38292+        return ret;
38293+
38294+    /* verify the SEI checksum */
38295+    if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
38296+        s->sei.picture_hash.is_md5) {
38297+        ret = verify_md5(s, s->ref->frame);
38298+        if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
38299+            ff_hevc_rpi_unref_frame(s, s->ref, ~0);
38300+            return ret;
38301+        }
38302+    }
38303+    s->sei.picture_hash.is_md5 = 0;
38304+
38305+    if (s->is_decoded) {
38306+        av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
38307+        s->is_decoded = 0;
38308+    }
38309+
38310+    if (s->output_frame->buf[0]) {
38311+        av_frame_move_ref(data, s->output_frame);
38312+        *got_output = 1;
38313+    }
38314+
38315+    return avpkt->size;
38316+}
38317+
38318+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
38319+{
38320+    int ret;
38321+
38322+    ret = ff_thread_ref_frame(&dst->tf, &src->tf);
38323+    if (ret < 0)
38324+        return ret;
38325+
38326+    if (src->col_mvf_buf != NULL)
38327+    {
38328+        dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
38329+        if (!dst->col_mvf_buf)
38330+            goto fail;
38331+    }
38332+    dst->col_mvf = src->col_mvf;
38333+
38334+    dst->poc        = src->poc;
38335+    dst->flags      = src->flags;
38336+    dst->sequence   = src->sequence;
38337+    return 0;
38338+
38339+fail:
38340+    ff_hevc_rpi_unref_frame(s, dst, ~0);
38341+    return AVERROR(ENOMEM);
38342+}
38343+
38344+
38345+static av_cold int hevc_decode_free(AVCodecContext *avctx)
38346+{
38347+    HEVCRpiContext * const s = avctx->priv_data;
38348+    int i;
38349+
38350+    pic_arrays_free(s);
38351+
38352+    av_freep(&s->md5_ctx);
38353+
38354+    av_freep(&s->cabac_save);
38355+
38356+#if RPI_EXTRA_BIT_THREADS
38357+    bit_threads_kill(s);
38358+#endif
38359+
38360+    hevc_exit_worker(s);
38361+    for (i = 0; i != 2; ++i) {
38362+        ff_hevc_rpi_progress_kill_state(s->progress_states + i);
38363+    }
38364+    job_lc_kill(s->HEVClc);
38365+
38366+    av_freep(&s->sao_pixel_buffer_h[0]);  // [1] & [2] allocated with [0]
38367+    av_freep(&s->sao_pixel_buffer_v[0]);
38368+    av_frame_free(&s->output_frame);
38369+
38370+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
38371+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
38372+        av_frame_free(&s->DPB[i].frame);
38373+    }
38374+
38375+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
38376+        av_buffer_unref(&s->ps.vps_list[i]);
38377+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
38378+        av_buffer_unref(&s->ps.sps_list[i]);
38379+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
38380+        av_buffer_unref(&s->ps.pps_list[i]);
38381+    s->ps.sps = NULL;
38382+    s->ps.pps = NULL;
38383+    s->ps.vps = NULL;
38384+
38385+    // Free separately from sLists as used that way by RPI WPP
38386+    for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
38387+        av_freep(s->HEVClcList + i);
38388+    }
38389+    s->HEVClc = NULL;  // Allocated as part of HEVClcList
38390+
38391+    ff_h2645_packet_uninit(&s->pkt);
38392+
38393+    if (s->qpu_init_ok)
38394+        vpu_qpu_term();
38395+    s->qpu_init_ok = 0;
38396+
38397+    return 0;
38398+}
38399+
38400+
38401+static av_cold int hevc_init_context(AVCodecContext *avctx)
38402+{
38403+    HEVCRpiContext *s = avctx->priv_data;
38404+    int i;
38405+
38406+    s->avctx = avctx;
38407+
38408+    s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
38409+    if (!s->HEVClc)
38410+        goto fail;
38411+    s->HEVClcList[0] = s->HEVClc;
38412+
38413+    if (vpu_qpu_init() != 0)
38414+        goto fail;
38415+    s->qpu_init_ok = 1;
38416+
38417+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
38418+    {
38419+        static const uint32_t dframe[1] = {0x80808080};
38420+        s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
38421+    }
38422+#endif
38423+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
38424+    s->qpu_dummy_frame_qpu = qpu_dummy();
38425+#endif
38426+
38427+    bt_lc_init(s, s->HEVClc, 0);
38428+    job_lc_init(s->HEVClc);
38429+
38430+    for (i = 0; i != 2; ++i) {
38431+        ff_hevc_rpi_progress_init_state(s->progress_states + i);
38432+    }
38433+
38434+    if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
38435+        goto fail;
38436+
38437+     if ((s->output_frame = av_frame_alloc()) == NULL)
38438+        goto fail;
38439+
38440+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
38441+        s->DPB[i].frame = av_frame_alloc();
38442+        if (!s->DPB[i].frame)
38443+            goto fail;
38444+        s->DPB[i].tf.f = s->DPB[i].frame;
38445+        s->DPB[i].dpb_no = i;
38446+    }
38447+
38448+    s->max_ra = INT_MAX;
38449+
38450+    if ((s->md5_ctx = av_md5_alloc()) == NULL)
38451+        goto fail;
38452+
38453+    s->context_initialized = 1;
38454+    s->eos = 0;
38455+
38456+    ff_hevc_rpi_reset_sei(&s->sei);
38457+
38458+    return 0;
38459+
38460+fail:
38461+    av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
38462+    hevc_decode_free(avctx);
38463+    return AVERROR(ENOMEM);
38464+}
38465+
38466+#if HAVE_THREADS
38467+static int hevc_update_thread_context(AVCodecContext *dst,
38468+                                      const AVCodecContext *src)
38469+{
38470+    HEVCRpiContext *s  = dst->priv_data;
38471+    HEVCRpiContext *s0 = src->priv_data;
38472+    int i, ret;
38473+
38474+    av_assert0(s->context_initialized);
38475+
38476+    // dst == src can happen according to the comments and in that case
38477+    // there is nothing to do here
38478+    if (dst == src)
38479+        return 0;
38480+
38481+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
38482+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
38483+        if (s0->DPB[i].frame->buf[0]) {
38484+            ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
38485+            if (ret < 0)
38486+                return ret;
38487+        }
38488+    }
38489+
38490+    if (s->ps.sps != s0->ps.sps)
38491+        s->ps.sps = NULL;
38492+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
38493+        av_buffer_unref(&s->ps.vps_list[i]);
38494+        if (s0->ps.vps_list[i]) {
38495+            s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
38496+            if (!s->ps.vps_list[i])
38497+                return AVERROR(ENOMEM);
38498+        }
38499+    }
38500+
38501+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
38502+        av_buffer_unref(&s->ps.sps_list[i]);
38503+        if (s0->ps.sps_list[i]) {
38504+            s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
38505+            if (!s->ps.sps_list[i])
38506+                return AVERROR(ENOMEM);
38507+        }
38508+    }
38509+
38510+    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
38511+        av_buffer_unref(&s->ps.pps_list[i]);
38512+        if (s0->ps.pps_list[i]) {
38513+            s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
38514+            if (!s->ps.pps_list[i])
38515+                return AVERROR(ENOMEM);
38516+        }
38517+    }
38518+
38519+    if (s->ps.sps != s0->ps.sps)
38520+        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
38521+            return ret;
38522+
38523+    s->seq_decode = s0->seq_decode;
38524+    s->seq_output = s0->seq_output;
38525+    s->pocTid0    = s0->pocTid0;
38526+    s->max_ra     = s0->max_ra;
38527+    s->eos        = s0->eos;
38528+    s->no_rasl_output_flag = s0->no_rasl_output_flag;
38529+
38530+    s->is_nalff        = s0->is_nalff;
38531+    s->nal_length_size = s0->nal_length_size;
38532+
38533+    s->threads_type        = s0->threads_type;
38534+
38535+    if (s0->eos) {
38536+        s->seq_decode = (s->seq_decode + 1) & 0xff;
38537+        s->max_ra = INT_MAX;
38538+    }
38539+
38540+    s->sei.frame_packing        = s0->sei.frame_packing;
38541+    s->sei.display_orientation  = s0->sei.display_orientation;
38542+    s->sei.mastering_display    = s0->sei.mastering_display;
38543+    s->sei.content_light        = s0->sei.content_light;
38544+    s->sei.alternative_transfer = s0->sei.alternative_transfer;
38545+
38546+    // * We do this here as it allows us to easily locate our parent's
38547+    //   global job pool, but there really should be a less nasty way
38548+    if (s->jbc == NULL)
38549+    {
38550+        av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
38551+        hevc_init_worker(s);
38552+    }
38553+
38554+    return 0;
38555+}
38556+#endif
38557+
38558+#include <sys/stat.h>
38559+static int qpu_ok(void)
38560+{
38561+    static int is_pi3 = -1;
38562+    if (is_pi3 == -1)
38563+    {
38564+        struct stat sb;
38565+        is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0);
38566+    }
38567+    return is_pi3;
38568+}
38569+
38570+static av_cold int hevc_decode_init(AVCodecContext *avctx)
38571+{
38572+    HEVCRpiContext *s = avctx->priv_data;
38573+    int ret;
38574+
38575+    if (!qpu_ok())
38576+        return AVERROR_DECODER_NOT_FOUND;
38577+
38578+    if ((ret = hevc_init_context(avctx)) < 0)
38579+        return ret;
38580+
38581+    // If we are a child context then stop now
38582+    // Everything after this point is either 1st decode setup or global alloc
38583+    // that must not be repeated
38584+    // Global info will be copied into children in update_thread_context (we
38585+    // can't do it here as we have no way of finding the parent context)
38586+    if (avctx->internal->is_copy)
38587+        return 0;
38588+
38589+    // Job allocation requires VCSM alloc to work so ensure that we have it
38590+    // initialised by this point
38591+    {
38592+        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
38593+        if (jbg == NULL) {
38594+            av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
38595+            ret = AVERROR(ENOMEM);
38596+            goto fail;
38597+        }
38598+
38599+        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) {
38600+            av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
38601+            ret = AVERROR(ENOMEM);
38602+            goto fail;
38603+        }
38604+    }
38605+
38606+    hevc_init_worker(s);
38607+
38608+    s->eos = 1;
38609+
38610+    if (avctx->extradata_size > 0 && avctx->extradata) {
38611+        if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0)
38612+            goto fail;
38613+
38614+        if (!all_sps_supported(s)) {
38615+            ret = AVERROR_DECODER_NOT_FOUND;
38616+            goto fail;
38617+        }
38618+    }
38619+
38620+    if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
38621+        s->threads_type = FF_THREAD_FRAME;
38622+    else
38623+        s->threads_type = 0;
38624+
38625+    return 0;
38626+
38627+fail:
38628+    hevc_decode_free(avctx);
38629+    return ret;
38630+}
38631+
38632+static void hevc_decode_flush(AVCodecContext *avctx)
38633+{
38634+    HEVCRpiContext *s = avctx->priv_data;
38635+    ff_hevc_rpi_flush_dpb(s);
38636+    s->max_ra = INT_MAX;
38637+    s->eos = 1;
38638+}
38639+
38640+typedef struct  hwaccel_rpi3_qpu_env_s {
38641+    const AVClass *av_class;
38642+    AVZcEnvPtr zc;
38643+} hwaccel_rpi3_qpu_env_t;
38644+
38645+static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
38646+{
38647+    hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data;
38648+    int rv;
38649+
38650+    if (av_rpi_zc_in_use(s))
38651+    {
38652+        rv = s->get_buffer2(s, frame, 0);
38653+    }
38654+    else
38655+    {
38656+        rv = av_rpi_zc_get_buffer(r3->zc, frame);
38657+        if (rv == 0)
38658+            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // actually do the alloc
38659+    }
38660+
38661+    if (rv == 0 &&
38662+        (rv = ff_attach_decode_data(frame)) < 0)
38663+    {
38664+        av_frame_unref(frame);
38665+    }
38666+
38667+    return rv;
38668+}
38669+
38670+static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
38671+{
38672+    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
38673+    av_rpi_zc_int_env_freep(&r3->zc);
38674+    return 0;
38675+}
38676+
38677+static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
38678+{
38679+    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
38680+
38681+    if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL)
38682+        goto fail;
38683+
38684+    return 0;
38685+
38686+fail:
38687+    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
38688+    hwaccel_rpi3_qpu_free(avctx);
38689+    return AVERROR(ENOMEM);
38690+}
38691+
38692+
38693+#define OFFSET(x) offsetof(HEVCRpiContext, x)
38694+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
38695+
38696+
38697+static const AVOption options[] = {
38698+    { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
38699+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
38700+    { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
38701+        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
38702+    { NULL },
38703+};
38704+
38705+static const AVClass hevc_rpi_decoder_class = {
38706+    .class_name = "HEVC RPI decoder",
38707+    .item_name  = av_default_item_name,
38708+    .option     = options,
38709+    .version    = LIBAVUTIL_VERSION_INT,
38710+};
38711+
38712+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
38713+    AV_PIX_FMT_SAND128,
38714+    AV_PIX_FMT_SAND64_10,
38715+    AV_PIX_FMT_NONE
38716+};
38717+
38718+
38719+static const AVHWAccel hwaccel_rpi3_qpu = {
38720+    .name           = "Pi3 QPU Hwaccel",
38721+    .alloc_frame    = hwaccel_alloc_frame,
38722+    .init           = hwaccel_rpi3_qpu_init,
38723+    .uninit         = hwaccel_rpi3_qpu_free,
38724+    .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t),
38725+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
38726+};
38727+
38728+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 =
38729+{
38730+    .public = {
38731+        .pix_fmt = AV_PIX_FMT_SAND128,
38732+        .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
38733+        .device_type = AV_HWDEVICE_TYPE_NONE,
38734+    },
38735+    .hwaccel = &hwaccel_rpi3_qpu
38736+};
38737+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 =
38738+{
38739+    .public = {
38740+        .pix_fmt = AV_PIX_FMT_SAND64_10,
38741+        .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
38742+        .device_type = AV_HWDEVICE_TYPE_NONE,
38743+    },
38744+    .hwaccel = &hwaccel_rpi3_qpu
38745+};
38746+
38747+
38748+static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
38749+    &hevc_rpi_hw_config_sand128,
38750+    &hevc_rpi_hw_config_sand64_10,
38751+    NULL
38752+};
38753+
38754+
38755+AVCodec ff_hevc_rpi_decoder = {
38756+    .name                  = "hevc_rpi",
38757+    .long_name             = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
38758+    .type                  = AVMEDIA_TYPE_VIDEO,
38759+    .id                    = AV_CODEC_ID_HEVC,
38760+    .priv_data_size        = sizeof(HEVCRpiContext),
38761+    .priv_class            = &hevc_rpi_decoder_class,
38762+    .init                  = hevc_decode_init,
38763+    .close                 = hevc_decode_free,
38764+    .decode                = hevc_rpi_decode_frame,
38765+    .flush                 = hevc_decode_flush,
38766+    .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
38767+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
38768+                             AV_CODEC_CAP_HARDWARE |
38769+                             AV_CODEC_CAP_AVOID_PROBING |
38770+#if 0
38771+    // Debugging is often easier without threads getting in the way
38772+                            0,
38773+#warning H265 threading turned off
38774+#else
38775+    // We only have decent optimisation for frame - so only admit to that
38776+                             AV_CODEC_CAP_FRAME_THREADS,
38777+#endif
38778+    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE |
38779+                             FF_CODEC_CAP_EXPORTS_CROPPING |
38780+                             FF_CODEC_CAP_ALLOCATE_PROGRESS,
38781+    .pix_fmts              = hevc_rpi_pix_fmts,
38782+    .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
38783+    .hw_configs            = hevc_rpi_hw_configs,
38784+//    .wrapper_name          = "hevc_rpi",
38785+};
38786+
38787--- /dev/null
38788+++ b/libavcodec/rpi_hevcdec.h
38789@@ -0,0 +1,1091 @@
38790+/*
38791+ * HEVC video decoder
38792+ *
38793+ * Copyright (C) 2012 - 2013 Guillaume Martres
38794+ *
38795+ * This file is part of FFmpeg.
38796+ *
38797+ * FFmpeg is free software; you can redistribute it and/or
38798+ * modify it under the terms of the GNU Lesser General Public
38799+ * License as published by the Free Software Foundation; either
38800+ * version 2.1 of the License, or (at your option) any later version.
38801+ *
38802+ * FFmpeg is distributed in the hope that it will be useful,
38803+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
38804+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
38805+ * Lesser General Public License for more details.
38806+ *
38807+ * You should have received a copy of the GNU Lesser General Public
38808+ * License along with FFmpeg; if not, write to the Free Software
38809+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
38810+ */
38811+
38812+#ifndef AVCODEC_RPI_HEVCDEC_H
38813+#define AVCODEC_RPI_HEVCDEC_H
38814+
38815+#include "config.h"
38816+
38817+#include <stdatomic.h>
38818+
38819+#include "libavutil/buffer.h"
38820+
38821+#include "avcodec.h"
38822+#include "bswapdsp.h"
38823+#include "cabac.h"
38824+#include "get_bits.h"
38825+#include "rpi_hevcpred.h"
38826+#include "h2645_parse.h"
38827+#include "hevc.h"
38828+#include "rpi_hevc_mv.h"
38829+#include "rpi_hevc_ps.h"
38830+#include "rpi_hevc_sei.h"
38831+#include "rpi_hevcdsp.h"
38832+#include "internal.h"
38833+#include "thread.h"
38834+#include "videodsp.h"
38835+
38836+#if ARCH_ARM
38837+#include "arm/rpi_hevc_misc_neon.h"
38838+#endif
38839+
38840+#define MAX_NB_THREADS 16
38841+#define SHIFT_CTB_WPP 2
38842+
38843+//TODO: check if this is really the maximum
38844+#define MAX_TRANSFORM_DEPTH 5
38845+
38846+#define MAX_TB_SIZE 32
38847+#define MAX_QP 51
38848+#define DEFAULT_INTRA_TC_OFFSET 2
38849+
38850+#define HEVC_CONTEXTS 199
38851+
38852+#define MRG_MAX_NUM_CANDS     5
38853+
38854+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE)  // 64
38855+
38856+// Size of DPB array
38857+#define HEVC_DPB_ELS            32
38858+
38859+#define L0 0
38860+#define L1 1
38861+
38862+#define EPEL_EXTRA_BEFORE 1
38863+#define EPEL_EXTRA_AFTER  2
38864+#define EPEL_EXTRA        3
38865+#define QPEL_EXTRA_BEFORE 3
38866+#define QPEL_EXTRA_AFTER  4
38867+#define QPEL_EXTRA        7
38868+
38869+#define EDGE_EMU_BUFFER_STRIDE 80
38870+
38871+#include <semaphore.h>
38872+#include "rpi_qpu.h"
38873+
38874+// Max jobs per frame thread. Actual usage will be limited by the size
38875+// of the global job pool
38876+// ?? Limits
38877+#define RPI_MAX_JOBS            8
38878+
38879+// This is the number of _extra_ bit threads - we will have
38880+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
38881+//
38882+// 0 is legitimate and will disable our WPP processing
38883+//#define RPI_EXTRA_BIT_THREADS 0
38884+#define RPI_EXTRA_BIT_THREADS   2
38885+
38886+// Number of separate threads/passes in worker
38887+// 2 and 3 are the currently valid numbers
38888+// At the moment 3 seems fractionally faster
38889+//#define RPI_PASSES              2
38890+#define RPI_PASSES              3
38891+
38892+// Print out various usage stats
38893+#define RPI_TSTATS              0
38894+
38895+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
38896+#define RPI_COMPRESS_COEFFS     1
38897+
38898+// Wait for VPU/QPU to finish in worker pass 0
38899+// If 0 then the wait is in pass 1
38900+//
38901+// One might expect the better place to wait would be in pass 1 however
38902+// testing shows that pass 0 produces overall faster decode.
38903+// Interestingly it is QPU/VPU limited streams that seem to suffer
38904+// from pass 1 waits, CPU limited ones tend to show a very mild gain.
38905+// This define exists so it is easy to test this.
38906+#define RPI_WORKER_WAIT_PASS_0  1
38907+
38908+// Use ARM emulation of QPU pred
38909+// These are for debug only as the emulation makes only limited
38910+// effort to be fast
38911+#define RPI_QPU_EMU_Y           0
38912+#define RPI_QPU_EMU_C           0
38913+
38914+// Max width & height we are prepared to consider
38915+// Sand frame shape calc becomes confused with large frames
38916+// Some buffer alloc also depends on this
38917+#define HEVC_RPI_MAX_WIDTH      2048
38918+#define HEVC_RPI_MAX_HEIGHT     1088
38919+
38920+
38921+// Min CTB size is 16
38922+#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16)
38923+
38924+/**
38925+ * Value of the luma sample at position (x, y) in the 2D array tab.
38926+ */
38927+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
38928+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
38929+
38930+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
38931+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
38932+                   (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
38933+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
38934+
38935+enum RPSType { // Reference picture set categories
38936+    ST_CURR_BEF = 0,
38937+    ST_CURR_AFT,
38938+    ST_FOLL,
38939+    LT_CURR,
38940+    LT_FOLL,
38941+    NB_RPS_TYPE,
38942+};
38943+
38944+enum SyntaxElement { // CABAC-coded syntax elements
38945+    SAO_MERGE_FLAG = 0,
38946+    SAO_TYPE_IDX,
38947+    SAO_EO_CLASS,
38948+    SAO_BAND_POSITION,
38949+    SAO_OFFSET_ABS,
38950+    SAO_OFFSET_SIGN,
38951+    END_OF_SLICE_FLAG,
38952+    SPLIT_CODING_UNIT_FLAG,
38953+    CU_TRANSQUANT_BYPASS_FLAG,
38954+    SKIP_FLAG,
38955+    CU_QP_DELTA,
38956+    PRED_MODE_FLAG,
38957+    PART_MODE,
38958+    PCM_FLAG,
38959+    PREV_INTRA_LUMA_PRED_FLAG,
38960+    MPM_IDX,
38961+    REM_INTRA_LUMA_PRED_MODE,
38962+    INTRA_CHROMA_PRED_MODE,
38963+    MERGE_FLAG,
38964+    MERGE_IDX,
38965+    INTER_PRED_IDC,
38966+    REF_IDX_L0,
38967+    REF_IDX_L1,
38968+    ABS_MVD_GREATER0_FLAG,
38969+    ABS_MVD_GREATER1_FLAG,
38970+    ABS_MVD_MINUS2,
38971+    MVD_SIGN_FLAG,
38972+    MVP_LX_FLAG,
38973+    NO_RESIDUAL_DATA_FLAG,
38974+    SPLIT_TRANSFORM_FLAG,
38975+    CBF_LUMA,
38976+    CBF_CB_CR,
38977+    TRANSFORM_SKIP_FLAG,
38978+    EXPLICIT_RDPCM_FLAG,
38979+    EXPLICIT_RDPCM_DIR_FLAG,
38980+    LAST_SIGNIFICANT_COEFF_X_PREFIX,
38981+    LAST_SIGNIFICANT_COEFF_Y_PREFIX,
38982+    LAST_SIGNIFICANT_COEFF_X_SUFFIX,
38983+    LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
38984+    SIGNIFICANT_COEFF_GROUP_FLAG,
38985+    SIGNIFICANT_COEFF_FLAG,
38986+    COEFF_ABS_LEVEL_GREATER1_FLAG,
38987+    COEFF_ABS_LEVEL_GREATER2_FLAG,
38988+    COEFF_ABS_LEVEL_REMAINING,
38989+    COEFF_SIGN_FLAG,
38990+    LOG2_RES_SCALE_ABS,
38991+    RES_SCALE_SIGN_FLAG,
38992+    CU_CHROMA_QP_OFFSET_FLAG,
38993+    CU_CHROMA_QP_OFFSET_IDX,
38994+};
38995+
38996+enum PartMode { // PU partitioning of a CU
38997+    PART_2Nx2N = 0,
38998+    PART_2NxN  = 1,
38999+    PART_Nx2N  = 2,
39000+    PART_NxN   = 3,
39001+    PART_2NxnU = 4,
39002+    PART_2NxnD = 5,
39003+    PART_nLx2N = 6,
39004+    PART_nRx2N = 7,
39005+};
39006+
39007+enum PredMode { // CU prediction mode
39008+    MODE_INTER = 0,
39009+    MODE_INTRA,
39010+    MODE_SKIP,
39011+};
39012+
39013+enum InterPredIdc { // Inter prediction direction
39014+    PRED_L0 = 0,
39015+    PRED_L1,
39016+    PRED_BI,
39017+};
39018+
39019+enum PredFlag { // Prediction flags: intra / L0 / L1 / bi
39020+    PF_INTRA = 0,
39021+    PF_L0,
39022+    PF_L1,
39023+    PF_BI,
39024+};
39025+
39026+enum SAOType { // SAO filter type
39027+    SAO_NOT_APPLIED = 0,
39028+    SAO_BAND,
39029+    SAO_EDGE,
39030+    SAO_APPLIED
39031+};
39032+
39033+enum SAOEOClass { // SAO edge-offset direction
39034+    SAO_EO_HORIZ = 0,
39035+    SAO_EO_VERT,
39036+    SAO_EO_135D,
39037+    SAO_EO_45D,
39038+};
39039+
39040+enum ScanType { // Coefficient scan order
39041+    SCAN_DIAG = 0,
39042+    SCAN_HORIZ,
39043+    SCAN_VERT,
39044+};
39045+
39046+typedef struct RefPicList { // A single reference picture list
39047+    struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
39048+    int list[HEVC_MAX_REFS];
39049+    uint8_t isLongTerm[HEVC_MAX_REFS];
39050+    int nb_refs;
39051+} RefPicList;
39052+
39053+typedef struct RefPicListTab { // L0/L1 reference list pair
39054+    RefPicList refPicList[2];
39055+} RefPicListTab;
39056+
39057+typedef struct RpiCodingUnit { // State of the CU currently being decoded
39058+    unsigned int x;             // Passed to deblock
39059+    unsigned int y;
39060+    unsigned int x_split;
39061+    unsigned int y_split;
39062+
39063+    enum PredMode pred_mode;    ///< PredMode
39064+    enum PartMode part_mode;    ///< PartMode
39065+
39066+    // Inferred parameters
39067+    uint8_t intra_split_flag;   ///< IntraSplitFlag
39068+    uint8_t max_trafo_depth;    ///< MaxTrafoDepth
39069+    uint8_t cu_transquant_bypass_flag;
39070+} RpiCodingUnit;
39071+
39072+typedef struct RpiPredictionUnit { // State of the PU currently being decoded
39073+    uint8_t intra_pred_mode[4];
39074+    uint8_t intra_pred_mode_c[4];
39075+    uint8_t chroma_mode_c[4];
39076+    uint8_t merge_flag;
39077+} RpiPredictionUnit;
39078+
39079+typedef struct HEVCRpiTransformUnit { // State of the TU currently being decoded
39080+    int8_t cu_qp_delta;
39081+
39082+    // Inferred parameters;
39083+    uint8_t intra_pred_mode;
39084+    uint8_t intra_pred_mode_c;
39085+    uint8_t chroma_mode_c;
39086+    uint8_t is_cu_qp_delta_wanted;
39087+    uint8_t cu_chroma_qp_offset_wanted;
39088+    const int8_t * qp_divmod6[3];
39089+} HEVCRpiTransformUnit;
39090+
39091+typedef struct DBParams { // Deblocking filter offsets
39092+    int8_t beta_offset; // -12 to +12
39093+    int8_t tc_offset;   // -12 to +12
39094+} DBParams;
39095+
39096+#define HEVC_FRAME_FLAG_OUTPUT    (1 << 0)
39097+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
39098+#define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
39099+#define HEVC_FRAME_FLAG_BUMPING   (1 << 3)
39100+
39101+struct HEVCRpiJob;
39102+
39103+typedef struct HEVCRpiFrame { // One DPB entry
39104+    AVFrame *frame;
39105+    ThreadFrame tf;
39106+    ColMvField *col_mvf;      // Collocated motion vectors (for temporal MV prediction)
39107+    int poc;
39108+    struct HEVCRpiFrame *collocated_ref;
39109+
39110+    AVBufferRef *col_mvf_buf;
39111+
39112+    /**
39113+     * A sequence counter, so that old frames are output first
39114+     * after a POC reset
39115+     */
39116+    uint16_t sequence;
39117+
39118+    /**
39119+     * A combination of HEVC_FRAME_FLAG_*
39120+     */
39121+    uint8_t flags;
39122+
39123+    // Entry no in DPB - can be used as a small unique
39124+    // frame identifier (within the current thread)
39125+    uint8_t dpb_no;
39126+} HEVCRpiFrame;
39127+
39128+typedef struct HEVCRpiLocalContext { // Per-thread (local) decode state
39129+    HEVCRpiTransformUnit tu;
39130+
39131+    CABACContext cc; // CABAC engine state for this local context
39132+
39133+    // Vars that allow us to locate everything from just an lc
39134+    struct HEVCRpiContext * context;  // ??? make const ???
39135+    unsigned int lc_n; // lc list el no
39136+
39137+    // Job wait links
39138+    struct HEVCRpiLocalContext * jw_next;
39139+    struct HEVCRpiLocalContext * jw_prev;
39140+    struct HEVCRpiLocalContext * ljw_next;
39141+    struct HEVCRpiLocalContext * ljw_prev;
39142+    struct HEVCRpiJob * volatile jw_job;
39143+    sem_t jw_sem;
39144+
39145+    // ?? Wrap in structure ??
39146+    sem_t bt_sem_in;
39147+    sem_t * bt_psem_out;
39148+    volatile int bt_terminate;
39149+    unsigned int ts;
39150+    unsigned int bt_last_line;  // Last line in this bit_thread chunk
39151+    unsigned int bt_line_no;
39152+    unsigned int bt_line_width;
39153+    unsigned int bt_line_inc;
39154+
39155+    struct HEVCRpiJob * jb0;
39156+    char unit_done;  // Set once we have dealt with this slice
39157+    char bt_is_tile;
39158+    char last_progress_good;
39159+    char cabac_init_req;
39160+
39161+    uint8_t cabac_state[HEVC_CONTEXTS];
39162+    uint8_t stat_coeff[4];
39163+    GetBitContext gb;
39164+
39165+    uint8_t ct_depth;
39166+    int8_t qp_y;
39167+    int8_t curr_qp_y;
39168+    int8_t qPy_pred;
39169+
39170+// N.B. Used by asm (neon) - do not change
39171+#define AVAIL_S_UR  0
39172+#define AVAIL_S_U   1
39173+#define AVAIL_S_UL  2
39174+#define AVAIL_S_L   3
39175+#define AVAIL_S_DL  4
39176+
39177+#define AVAIL_U     (1 << AVAIL_S_U)
39178+#define AVAIL_L     (1 << AVAIL_S_L)
39179+#define AVAIL_UL    (1 << AVAIL_S_UL)
39180+#define AVAIL_UR    (1 << AVAIL_S_UR)
39181+#define AVAIL_DL    (1 << AVAIL_S_DL)
39182+
39183+// Intra filters - same number space as avail
39184+#define FILTER_LIGHT    0x40
39185+#define FILTER_STRONG   0x80
39186+#define FILTER_EITHER   (FILTER_LIGHT | FILTER_STRONG)
39187+
39188+    uint8_t ctb_avail; // AVAIL_* bitmask for the current CTB
39189+    int     end_of_ctb_x;
39190+    int     end_of_ctb_y;
39191+
39192+    RpiCodingUnit cu;
39193+    RpiPredictionUnit pu;
39194+
39195+#define BOUNDARY_LEFT_SLICE     (1 << 0)
39196+#define BOUNDARY_LEFT_TILE      (1 << 1)
39197+#define BOUNDARY_UPPER_SLICE    (1 << 2)
39198+#define BOUNDARY_UPPER_TILE     (1 << 3)
39199+    /* properties of the boundary of the current CTB for the purposes
39200+     * of the deblocking filter */
39201+    unsigned int boundary_flags;
39202+
39203+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
39204+    uint8_t ipm_left[IPM_TAB_SIZE];
39205+    uint8_t ipm_up[IPM_TAB_SIZE];
39206+
39207+//#define MVF_STASH_WIDTH       128
39208+#define MVF_STASH_WIDTH       64
39209+#define MVF_STASH_HEIGHT      64
39210+#define MVF_STASH_WIDTH_PU    (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
39211+#define MVF_STASH_HEIGHT_PU   (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
39212+    HEVCRpiMvField mvf_ul[1];
39213+    HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
39214+
39215+    /* +7 is for subpixel interpolation, *2 for high bit depths */
39216+//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
39217+    /* The extended size between the new edge emu buffer is abused by SAO */
39218+//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
39219+//    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
39220+
39221+} HEVCRpiLocalContext;
39222+
39223+// Each block can have an intra prediction and an add_residual command
39224+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
39225+
39226+// Sand only has 2 planes (Y/C)
39227+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
39228+
39229+// Command for intra prediction and transform_add of predictions to coefficients
39230+enum rpi_pred_cmd_e
39231+{
39232+    RPI_PRED_ADD_RESIDUAL,
39233+    RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
39234+    RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
39235+    RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
39236+    RPI_PRED_ADD_DC,
39237+    RPI_PRED_ADD_DC_U,       // Both U & V are effectively C
39238+    RPI_PRED_ADD_DC_V,
39239+    RPI_PRED_INTRA,
39240+    RPI_PRED_INTRA_C,
39241+    RPI_PRED_I_PCM,
39242+    RPI_PRED_CMD_MAX
39243+};
39244+
39245+typedef struct HEVCPredCmd { // One queued intra-pred / residual-add command
39246+    uint8_t type;  // rpi_pred_cmd_e
39247+    uint8_t size;  // log2 "size" used by all variants
39248+    uint8_t avail; // i_pred - but left here as they pack well
39249+    uint8_t dummy;
39250+    union {
39251+        struct {  // TRANSFORM_ADD
39252+            uint8_t * dst;
39253+            const int16_t * buf;
39254+            uint16_t stride;  // Should be good enough for all pic fmts we use
39255+            int16_t dc;
39256+        } ta;
39257+        struct {
39258+            uint8_t * dst;
39259+            uint32_t stride;
39260+            int dc;
39261+        } dc;
39262+        struct {  // INTRA
39263+            uint16_t x;
39264+            uint16_t y;
39265+            enum IntraPredMode mode;
39266+        } i_pred;
39267+        struct {  // I_PCM
39268+            uint16_t x;
39269+            uint16_t y;
39270+            const void * src;
39271+            uint32_t src_len;
39272+        } i_pcm;
39273+    };
39274+} HEVCPredCmd;
39275+
39276+union qpu_mc_pred_cmd_u; // Tag fixed to match qpu_mc_base/qpu_mc_curr below (was mis-spelt _s, which is never used)
39277+struct qpu_mc_pred_y_p_s;
39278+struct qpu_mc_src_s;
39279+
39280+typedef struct HEVCRpiInterPredQ // One QPU motion-compensation command queue
39281+{
39282+    union qpu_mc_pred_cmd_u *qpu_mc_base; // Base of the command buffer
39283+    union qpu_mc_pred_cmd_u *qpu_mc_curr; // Current write position in the command buffer
39284+    struct qpu_mc_src_s *last_l0;
39285+    struct qpu_mc_src_s *last_l1;
39286+    unsigned int load;
39287+    uint32_t code_setup;
39288+    uint32_t code_sync;
39289+    uint32_t code_exit;
39290+} HEVCRpiInterPredQ;
39291+
39292+typedef struct HEVCRpiInterPredEnv // Group of inter-pred QPU queues plus their GPU memory
39293+{
39294+    HEVCRpiInterPredQ * q;
39295+    uint8_t n;                  // Number of Qs
39296+    uint8_t n_grp;              // Number of Q in a group
39297+    uint8_t curr;               // Current Q number (0..n-1)
39298+    uint8_t used;               // 0 if nothing in any Q, 1 otherwise
39299+    uint8_t used_grp;           // 0 if nothing in any Q in the current group
39300+    unsigned int max_fill;
39301+    unsigned int min_gap;
39302+    GPU_MEM_PTR_T gptr;
39303+} HEVCRpiInterPredEnv;
39304+
39305+typedef struct HEVCRpiIntraPredEnv { // Queue of HEVCPredCmd commands
39306+    unsigned int n;        // Number of commands
39307+    HEVCPredCmd * cmds;
39308+} HEVCRpiIntraPredEnv;
39309+
39310+typedef struct HEVCRpiCoeffEnv { // One coefficient buffer
39311+    unsigned int n;
39312+#if RPI_COMPRESS_COEFFS
39313+    unsigned int packed; // Equal to 1 if coefficients should be being packed
39314+    unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed).  Only valid if packed==0
39315+#endif
39316+    int16_t * buf;
39317+} HEVCRpiCoeffEnv;
39318+
39319+typedef struct HEVCRpiCoeffsEnv { // Set of coefficient buffers plus their GPU memory
39320+    HEVCRpiCoeffEnv s[4];
39321+    GPU_MEM_PTR_T gptr;
39322+    void * mptr;
39323+} HEVCRpiCoeffsEnv;
39324+
39325+typedef struct HEVCRpiFrameProgressWait { // Waiter on frame decode progress (see ff_hevc_rpi_progress_wait_field)
39326+    int req;
39327+    struct HEVCRpiFrameProgressWait * next;
39328+    sem_t sem;
39329+} HEVCRpiFrameProgressWait;
39330+
39331+typedef struct HEVCRpiFrameProgressState { // Linked list of progress waiters
39332+    struct HEVCRpiFrameProgressWait * first;
39333+    struct HEVCRpiFrameProgressWait * last;
39334+    pthread_mutex_t lock;
39335+} HEVCRpiFrameProgressState;
39336+
39337+typedef struct RpiBlk // Simple rectangle
39338+{
39339+    unsigned int x;
39340+    unsigned int y;
39341+    unsigned int w;
39342+    unsigned int h;
39343+} RpiBlk;
39344+
39345+typedef struct HEVCRpiJob { // One decode job: a CTU range (ctu_ts_first..ctu_ts_last) handed between passes
39346+    struct HEVCRpiJob * next;  // Free chain
39347+    struct HEVCRpiJobCtl * jbc_local;
39348+    const HEVCRpiSPS * sps;       // sps used to set up this job
39349+
39350+    int waited;
39351+    int ctu_ts_first;
39352+    int ctu_ts_last;
39353+    RpiBlk bounds;  // Bounding box of job
39354+
39355+    struct qpu_mc_pred_y_p_s * last_y8_p;
39356+    struct qpu_mc_src_s * last_y8_l1;
39357+    rpi_cache_flush_env_t * rfe;
39358+
39359+    HEVCRpiInterPredEnv chroma_ip;
39360+    HEVCRpiInterPredEnv luma_ip;
39361+    int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
39362+    HEVCRpiIntraPredEnv intra;
39363+    HEVCRpiCoeffsEnv coeffs;
39364+    HEVCRpiFrameProgressWait progress_wait;
39365+    sem_t sem;
39366+    rpi_cache_buf_t flush_buf;
39367+} HEVCRpiJob;
39368+
39369+struct HEVCRpiContext;
39370+
39371+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
39372+
39373+typedef struct HEVCRpiPassQueue // Worker thread + input semaphore for one pipeline pass
39374+{
39375+//    int pending;
39376+    volatile int terminate;
39377+    sem_t sem_in;
39378+    sem_t * psem_out;
39379+    unsigned int job_n;
39380+    struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
39381+    HEVCRpiWorkerFn * worker;
39382+    pthread_t thread;
39383+    uint8_t pass_n;  // Pass number - debug
39384+    uint8_t started;
39385+} HEVCRpiPassQueue;
39386+
39387+
39388+struct HEVCRpiJobGlobal;
39389+
39390+typedef struct HEVCRpiJobCtl // Per-context job allocation & offload control
39391+{
39392+    sem_t sem_out;
39393+
39394+    HEVCRpiJob * volatile jb1;  // The job associated with this frame if unallocated - NULL if allocated
39395+    struct HEVCRpiJobGlobal * jbg;
39396+
39397+    HEVCRpiLocalContext * lcw_head;
39398+    HEVCRpiLocalContext * lcw_tail;
39399+
39400+    pthread_mutex_t in_lock;
39401+    int offload_in;
39402+
39403+    HEVCRpiJob *offloadq[RPI_MAX_JOBS];
39404+} HEVCRpiJobCtl;
39405+
39406+
39407+typedef struct HEVCRpiJobGlobal // Shared (ref-counted) pool of free jobs and waiting lcs
39408+{
39409+    intptr_t ref_count;
39410+    pthread_mutex_t lock;
39411+    HEVCRpiJob * free1;                 // Singly linked list of free jobs
39412+    HEVCRpiLocalContext * wait_head;       // Double linked list of lcs waiting for a job
39413+    HEVCRpiLocalContext * wait_good;  // Last good tail
39414+    HEVCRpiLocalContext * wait_tail;
39415+
39416+} HEVCRpiJobGlobal;
39417+
39418+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
39419+
39420+#if RPI_TSTATS
39421+typedef struct HEVCRpiStats { // Debug counters for luma prediction paths (see RPI_TSTATS)
39422+    int y_pred1_y8_merge;
39423+    int y_pred1_xy;
39424+    int y_pred1_x0;
39425+    int y_pred1_y0;
39426+    int y_pred1_x0y0;
39427+    int y_pred1_wle8;
39428+    int y_pred1_wgt8;
39429+    int y_pred1_hle16;
39430+    int y_pred1_hgt16;
39431+    int y_pred2_xy;
39432+    int y_pred2_x0;
39433+    int y_pred2_y0;
39434+    int y_pred2_x0y0;
39435+    int y_pred2_hle16;
39436+    int y_pred2_hgt16;
39437+} HEVCRpiStats;
39438+#endif
39439+
39440+typedef struct HEVCRpiCabacState // Snapshot of CABAC rice params + context state
39441+{
39442+    uint8_t rice[4];
39443+    uint8_t state[HEVC_CONTEXTS];
39444+} HEVCRpiCabacState;
39445+
39446+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT   6   // 64 pels
39447+#define HEVC_RPI_BS_STRIDE1_PELS        (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
39448+#define HEVC_RPI_BS_STRIDE1_PEL_MASK    (HEVC_RPI_BS_STRIDE1_PELS - 1)
39449+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT  2   // 4 els per byte
39450+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT   2   // 4 pels per el
39451+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
39452+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT  (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
39453+#define HEVC_RPI_BS_STRIDE1_BYTES       (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
39454+#define HEVC_RPI_BS_Y_SHR               3   // 8 vertical pels per row
39455+#define HEVC_RPI_BS_COL_BYTES_SHR       (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
39456+
39457+typedef struct HEVCRpiContext { // Main decoder context
39458+    const AVClass *c;  // needed by private avoptions
39459+    AVCodecContext *avctx;
39460+
39461+    uint8_t             threads_type;
39462+    char qpu_init_ok;
39463+
39464+    /** 1 if the independent slice segment header was successfully parsed */
39465+    uint8_t slice_initialized;
39466+    char used_for_ref;  // rpi
39467+    char is_irap;
39468+    char offload_recon;
39469+    uint8_t eos;       ///< current packet contains an EOS/EOB NAL
39470+    uint8_t last_eos;  ///< last packet contains an EOS/EOB NAL
39471+    uint8_t no_backward_pred_flag;
39472+    uint8_t is_decoded;
39473+    uint8_t no_rasl_output_flag;
39474+
39475+
39476+    /**
39477+     * Sequence counters for decoded and output frames, so that old
39478+     * frames are output first after a POC reset
39479+     */
39480+    uint16_t seq_decode;
39481+    uint16_t seq_output;
39482+
39483+    int                 width;
39484+    int                 height;
39485+
39486+    HEVCRpiJobCtl * jbc;
39487+    // cabac stash
39488+    // b0       skip flag
39489+    // b1+      ct_depth
39490+    uint8_t * cabac_stash_left;
39491+    uint8_t * cabac_stash_up;
39492+
39493+    // Function pointers
39494+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
39495+    const uint8_t * qpu_dummy_frame_emu;
39496+#endif
39497+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
39498+    uint32_t qpu_dummy_frame_qpu;  // Not a frame - just a bit of memory
39499+#endif
39500+    HEVCRpiQpu qpu;
39501+
39502+    HEVCRpiFrameProgressState progress_states[2];
39503+
39504+    HEVCRpiCabacState *cabac_save;
39505+
39506+    AVFrame *frame;
39507+    AVFrame *output_frame;
39508+    uint8_t *sao_pixel_buffer_h[3];
39509+    uint8_t *sao_pixel_buffer_v[3];
39510+
39511+    unsigned int col_mvf_stride;
39512+    AVBufferPool *col_mvf_pool;
39513+
39514+    RpiSAOParams *sao;
39515+    DBParams *deblock;
39516+    enum HEVCNALUnitType nal_unit_type;
39517+    int temporal_id;  ///< temporal_id_plus1 - 1
39518+    HEVCRpiFrame *ref; // Current frame being decoded
39519+    int poc;
39520+    int pocTid0;
39521+    int slice_idx; ///< number of the slice being currently decoded
39522+    int max_ra;
39523+
39524+    int8_t *qp_y_tab;
39525+
39526+    // Deblocking block strength bitmaps
39527+    unsigned int bs_stride2;
39528+    unsigned int bs_size;
39529+    uint8_t *bs_horizontal;
39530+    uint8_t *bs_vertical;
39531+    uint8_t *bsf_stash_up;
39532+    uint8_t *bsf_stash_left;
39533+
39534+#if HEVC_RPI_MAX_CTBS >= 0xffff
39535+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0
39536+    uint32_t *tab_slice_address;
39537+#else
39538+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0
39539+    uint16_t *tab_slice_address;
39540+#endif
39541+
39542+    // Bitfield 1 bit per 8 pels (min pcm size)
39543+    uint8_t *is_pcm;
39544+    // Bitfield 1 bit per 8 pels (min cb size)
39545+    // Only needed for CIP as CIP processing is async to the main thread
39546+    uint8_t *is_intra;
39547+
39548+    // PU
39549+    HEVCRpiMvField *mvf_up;
39550+    HEVCRpiMvField *mvf_left;
39551+
39552+    const RefPicList **rpl_up;
39553+    const RefPicList **rpl_left;
39554+    RefPicList * refPicList;
39555+
39556+    // CTB-level flags affecting loop filter operation
39557+    uint8_t *filter_slice_edges;
39558+
39559+    /** used on BE to byteswap the lines for checksumming */
39560+    uint8_t *checksum_buf;
39561+    int      checksum_buf_size;
39562+
39563+    const uint8_t *data;
39564+
39565+    H2645Packet pkt;
39566+    // type of the first VCL NAL of the current frame
39567+    enum HEVCNALUnitType first_nal_type;
39568+
39569+    uint8_t context_initialized;
39570+    int is_nalff;           ///< this flag is != 0 if bitstream is encapsulated
39571+                            ///< as a format defined in 14496-15
39572+    int apply_defdispwin;
39573+
39574+    int nal_length_size;    ///< Number of bytes used for nal length (1, 2 or 4)
39575+    int nuh_layer_id;
39576+
39577+    struct AVMD5 *md5_ctx;
39578+
39579+    RefPicListTab * rpl_tab;
39580+    unsigned int rpl_tab_size;
39581+
39582+    uint8_t *is_intra_store;
39583+
39584+    RpiSliceHeader sh;
39585+
39586+    HEVCRpiParamSets ps;
39587+
39588+    HEVCRpiLocalContext    *HEVClc;
39589+    HEVCRpiLocalContext    *HEVClcList[MAX_NB_THREADS];
39590+
39591+    HEVCRpiFrame DPB[HEVC_DPB_ELS];
39592+
39593+    ///< candidate references for the current frame
39594+    RefPicList rps[5]; // Indexed by enum RPSType (NB_RPS_TYPE == 5)
39595+
39596+    HEVCRpiPredContext hpc;
39597+    HEVCDSPContext hevcdsp;
39598+
39599+    HEVCSEIContext sei;
39600+
39601+    // Put structures that allocate non-trivial storage at the end
39602+    // These are mostly used indirectly so position in the structure doesn't matter
39603+    HEVCRpiPassQueue passq[RPI_PASSES];
39604+#if RPI_EXTRA_BIT_THREADS > 0
39605+    int bt_started;
39606+    // This simply contains thread descriptors - task setup is held elsewhere
39607+    pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
39608+#endif
39609+#if RPI_TSTATS
39610+    HEVCRpiStats tstats;
39611+#endif
39612+} HEVCRpiContext;
39613+
39614+/**
39615+ * Mark all frames in DPB as unused for reference.
39616+ */
39617+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
39618+
39619+/**
39620+ * Drop all frames currently in DPB.
39621+ */
39622+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
39623+
39624+/**
39625+ * Construct the reference picture sets for the current frame.
39626+ */
39627+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
39628+
39629+/**
39630+ * Construct the reference picture list(s) for the current slice.
39631+ */
39632+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
39633+
39634+
39635+/**
39636+ * Get the number of candidate references for the current frame.
39637+ */
39638+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
39639+
39640+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); // NOTE(review): presumably allocates the new current frame in the DPB - confirm in rpi_hevc_refs
39641+
39642+/**
39643+ * Find next frame in output order and put a reference to it in frame.
39644+ * @return 1 if a frame was output, 0 otherwise
39645+ */
39646+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
39647+
39648+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); // Frame "bumping" - see HEVC_FRAME_FLAG_BUMPING
39649+
39650+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); // flags: HEVC_FRAME_FLAG_* to drop
39651+
39652+unsigned int ff_hevc_rpi_tb_avail_flags(
39653+    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
39654+    const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); // Returns AVAIL_* bitmask
39655+
39656+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
39657+                                int nPbH, int log2_cb_size, int part_idx,
39658+                                int merge_idx, HEVCRpiMvField * const mv);
39659+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
39660+    const unsigned int x0, const unsigned int y0,
39661+    const unsigned int nPbW, const unsigned int nPbH,
39662+    const unsigned int avail,
39663+    HEVCRpiMvField * const mv,
39664+    const unsigned int mvp_lx_flag, const unsigned int LX);
39665+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
39666+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
39667+                                               const unsigned int x0, const unsigned int y0,
39668+                                               const unsigned int log2_trafo_size, const int is_coded_block);
39669+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
39670+
39671+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
39672+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
39673+extern const uint8_t ff_hevc_rpi_qpel_extra[4];
39674+
39675+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
39676+
39677+// arm/hevc_misc_neon.S
39678+// Neon coeff zap fn
39679+#if HAVE_NEON
39680+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
39681+#endif
39682+
39683+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
39684+                                     const HEVCRpiFrame * const ref, const int val, const int field); // field: 0 = recon, 1 = mv (see wrappers below)
39685+
39686+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
39687+
39688+// All of these expect that s->threads_type == FF_THREAD_FRAME
39689+
39690+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
39691+                                     const HEVCRpiFrame * const ref, const int y)
39692+{
39693+    if (s->threads_type != 0)
39694+        ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
39695+}
39696+
39697+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
39698+{
39699+    if (s->used_for_ref && s->threads_type != 0)
39700+        ff_hevc_rpi_progress_signal_field(s, y, 1);
39701+}
39702+
39703+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
39704+                                     const HEVCRpiFrame * const ref, const int y)
39705+{
39706+    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); // NOTE(review): no threads_type guard here, unlike wait_mv - confirm intentional
39707+}
39708+
39709+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
39710+{
39711+    if (s->used_for_ref && s->threads_type != 0)
39712+    {
39713+        ff_hevc_rpi_progress_signal_field(s, y, 0);
39714+    }
39715+}
39716+
39717+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
39718+{
39719+    ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); // INT_MAX releases every waiter on both fields
39720+    ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
39721+}
39722+
39723+
39724+// Set all done - signal nothing (used in missing refs)
39725+// Works for both rpi & non-rpi
39726+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
39727+{
39728+    if (ref->tf.progress != NULL)
39729+    {
39730+        int * const p = (int *)ref->tf.progress->data;
39731+        p[0] = INT_MAX;
39732+        p[1] = INT_MAX;
39733+    }
39734+}
39735+
39736+#define HEVC_RPI_420_ONLY 1
39737+#define HEVC_RPI_SAND128_ONLY 1
39738+
39739+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) // Horizontal chroma subsample shift for plane cidx
39740+{
39741+#if HEVC_RPI_420_ONLY
39742+    return cidx == 0 ? 0 : 1; // 4:2:0 only: chroma is half width
39743+#else
39744+    return s->ps.sps->hshift[cidx];
39745+#endif
39746+}
39747+
39748+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) // Vertical chroma subsample shift for plane cidx
39749+{
39750+#if HEVC_RPI_420_ONLY
39751+    return cidx == 0 ? 0 : 1; // 4:2:0 only: chroma is half height
39752+#else
39753+    return s->ps.sps->vshift[cidx];
39754+#endif
39755+}
39756+
39757+static inline int ctx_cfmt(const HEVCRpiContext * const s) // chroma_format_idc (1 == 4:2:0)
39758+{
39759+#if HEVC_RPI_420_ONLY
39760+    return 1;
39761+#else
39762+    return s->ps.sps->chroma_format_idc;
39763+#endif
39764+}
39765+
39766+static inline int frame_stride1(const AVFrame * const frame, const int c_idx) // Line stride of plane c_idx
39767+{
39768+#if HEVC_RPI_SAND128_ONLY
39769+    return 128; // Sand128 layout: fixed 128-byte stride1
39770+#else
39771+    return frame->linesize[c_idx];
39772+#endif
39773+}
39774+
39775+#if HEVC_RPI_SAND128_ONLY
39776+// Propagate this decision to later zc includes
39777+#define RPI_ZC_SAND128_ONLY 1
39778+#endif
39779+
39780+#ifndef ff_hevc_rpi_copy_vert // May be overridden by an arch-specific macro of the same name
39781+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
39782+                                         int pixel_shift, int height,
39783+                                         ptrdiff_t stride_dst, ptrdiff_t stride_src)
39784+{
39785+    int i;
39786+    // Copy one column of (1 << pixel_shift)-byte pixels, 'height' rows
39787+    switch (pixel_shift)
39788+    {
39789+        case 2:
39790+            for (i = 0; i < height; i++) { // NOTE(review): assumes 4-byte alignment of dst/src - confirm callers
39791+                *(uint32_t *)dst = *(uint32_t *)src;
39792+                dst += stride_dst;
39793+                src += stride_src;
39794+            }
39795+            break;
39796+        case 1:
39797+            for (i = 0; i < height; i++) {
39798+                *(uint16_t *)dst = *(uint16_t *)src;
39799+                dst += stride_dst;
39800+                src += stride_src;
39801+            }
39802+            break;
39803+        default:
39804+            for (i = 0; i < height; i++) {
39805+                *dst = *src;
39806+                dst += stride_dst;
39807+                src += stride_src;
39808+            }
39809+            break;
39810+    }
39811+}
39812+
39813+
39814+#if MVF_STASH_WIDTH == 64 // Stash is exactly one CTB wide: x offset can reuse the CTB mask
39815+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
39816+                               const unsigned int x, const unsigned int y)
39817+{
39818+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
39819+    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE));
39820+}
39821+
39822+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
39823+                               const unsigned int x0, const unsigned int y0,
39824+                               const unsigned int x, const unsigned int y)
39825+{
39826+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
39827+    const unsigned int x0_ctb = x0 & mask_cs_hi;
39828+    const unsigned int y0_ctb = y0 & mask_cs_hi;
39829+
39830+    // Select between up-left, up-row, left-col stores and the local stash
39831+    return (HEVCRpiMvField *)((y < y0_ctb) ?
39832+        (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) :
39833+        (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) :
39834+            lc->mvf_stash +
39835+                ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
39836+                ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)));
39837+}
39838+
39839+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
39840+                               const unsigned int x0,
39841+                               const unsigned int x)
39842+{
39843+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
39844+    const unsigned int x0_ctb = x0 & mask_cs_hi;
39845+    return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU; // 1 = packed left-col array, else stash row stride
39846+}
39847+
39848+#else
39849+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
39850+                               const unsigned int x, const unsigned int y)
39851+{
39852+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
39853+    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));
39854+}
39855+
39856+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
39857+                               const unsigned int x0, const unsigned int y0,
39858+                               const unsigned int x, const unsigned int y)
39859+{
39860+    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
39861+
39862+    const unsigned int x0_ctb = x0 & mask_cs_hi;
39863+    const unsigned int y0_ctb = y0 & mask_cs_hi;
39864+
39865+    // If not in the same CTB for Y assume up
39866+    if (y < y0_ctb) {
39867+        // If not in the same CTB for X too assume up-left
39868+        return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
39869+    }
39870+    return mvf_stash_ptr(s, lc, x, y);
39871+}
39872+
39873+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
39874+                               const unsigned int x0,
39875+                               const unsigned int x)
39876+{
39877+    return MVF_STASH_WIDTH_PU;
39878+}
39879+#endif
39879+
39880+#endif /* AVCODEC_RPI_HEVCDEC_H */
39881--- /dev/null
39882+++ b/libavcodec/rpi_hevcdsp.c
39883@@ -0,0 +1,450 @@
39884+/*
39885+ * HEVC video decoder
39886+ *
39887+ * Copyright (C) 2012 - 2013 Guillaume Martres
39888+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
39889+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
39890+ *
39891+ * This file is part of FFmpeg.
39892+ *
39893+ * FFmpeg is free software; you can redistribute it and/or
39894+ * modify it under the terms of the GNU Lesser General Public
39895+ * License as published by the Free Software Foundation; either
39896+ * version 2.1 of the License, or (at your option) any later version.
39897+ *
39898+ * FFmpeg is distributed in the hope that it will be useful,
39899+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
39900+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
39901+ * Lesser General Public License for more details.
39902+ *
39903+ * You should have received a copy of the GNU Lesser General Public
39904+ * License along with FFmpeg; if not, write to the Free Software
39905+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
39906+ */
39907+
39908+#include "rpi_hevcdsp.h"
39909+#include "rpi_hevc_mv.h"
39910+
39911+static const int8_t transform[32][32] = {
39912+    { 64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
39913+      64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
39914+    { 90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4,
39915+      -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
39916+    { 90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90,
39917+     -90, -87, -80, -70, -57, -43, -25,  -9,   9,  25,  43,  57,  70,  80,  87,  90 },
39918+    { 90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
39919+      13,  38,  61,  78,  88,  90,  85,  73,  54,  31,   4, -22, -46, -67, -82, -90 },
39920+    { 89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89,
39921+      89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89 },
39922+    { 88,  67,  31, -13, -54, -82, -90, -78, -46, -4,   38,  73,  90,  85,  61,  22,
39923+     -22, -61, -85, -90, -73, -38,   4,  46,  78,  90,  82,  54,  13, -31, -67, -88 },
39924+    { 87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87,
39925+     -87, -57,  -9,  43,  80,  90,  70,  25, -25, -70, -90, -80, -43,   9,  57,  87 },
39926+    { 85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31,
39927+      31,  78,  90,  61,   4, -54, -88, -82, -38,  22,  73,  90,  67,  13, -46, -85 },
39928+    { 83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83,
39929+      83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83 },
39930+    { 82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38,
39931+     -38, -88, -73,  -4,  67,  90,  46, -31, -85, -78, -13,  61,  90,  54, -22, -82 },
39932+    { 80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80,
39933+     -80,  -9,  70,  87,  25, -57, -90, -43,  43,  90,  57, -25, -87, -70,   9,  80 },
39934+    { 78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46,
39935+      46,  90,  38, -54, -90, -31,  61,  88,  22, -67, -85, -13,  73,  82,   4, -78 },
39936+    { 75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75,
39937+      75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75 },
39938+    { 73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54,
39939+     -54, -85,   4,  88,  46, -61, -82,  13,  90,  38, -67, -78,  22,  90,  31, -73 },
39940+    { 70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70,
39941+     -70,  43,  87,  -9, -90, -25,  80,  57, -57, -80,  25,  90,   9, -87, -43,  70 },
39942+    { 67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61,
39943+      61,  73, -46, -82,  31,  88, -13, -90,  -4,  90,  22, -85, -38,  78,  54, -67 },
39944+    { 64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,
39945+      64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64 },
39946+    { 61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67,
39947+     -67, -54,  78,  38, -85, -22,  90,   4, -90,  13,  88, -31, -82,  46,  73, -61 },
39948+    { 57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57,
39949+     -57,  80,  25, -90,   9,  87, -43, -70,  70,  43, -87,  -9,  90, -25, -80,  57 },
39950+    { 54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73,
39951+      73,  31, -90,  22,  78, -67, -38,  90, -13, -82,  61,  46, -88,   4,  85, -54 },
39952+    { 50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50,
39953+      50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50 },
39954+    { 46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78,
39955+     -78,  -4,  82, -73, -13,  85, -67, -22,  88, -61, -31,  90, -54, -38,  90, -46 },
39956+    { 43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43,
39957+     -43,  90, -57, -25,  87, -70,  -9,  80, -80,   9,  70, -87,  25,  57, -90,  43 },
39958+    { 38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82,
39959+      82, -22, -54,  90, -61, -13,  78, -85,  31,  46, -90,  67,   4, -73,  88, -38 },
39960+    { 36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36,
39961+      36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36 },
39962+    { 31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85,
39963+     -85,  46,  13, -67,  90, -73,  22,  38, -82,  88, -54,  -4,  61, -90,  78, -31 },
39964+    { 25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25,
39965+     -25,  70, -90,  80, -43,  -9,  57, -87,  87, -57,   9,  43, -80,  90, -70,  25 },
39966+    { 22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88,
39967+      88, -67,  31,  13, -54,  82, -90,  78, -46,   4,  38, -73,  90, -85,  61, -22 },
39968+    { 18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18,
39969+      18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18 },
39970+    { 13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90,
39971+     -90,  82, -67,  46, -22,  -4,  31, -54,  73, -85,  90, -88,  78, -61,  38, -13 },
39972+    {  9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25, -9,
39973+      -9,  25, -43,  57, -70,  80, -87,  90, -90,  87, -80,  70, -57,  43, -25,   9 },
39974+    {  4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90,
39975+      90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
39976+};
39977+
39978+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
39979+    { -2, 58, 10, -2},
39980+    { -4, 54, 16, -2},
39981+    { -6, 46, 28, -4},
39982+    { -4, 36, 36, -4},
39983+    { -4, 28, 46, -6},
39984+    { -2, 16, 54, -4},
39985+    { -2, 10, 58, -2},
39986+};
39987+
39988+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
39989+    { -1,  4,-10, 58, 17, -5,  1,  0, -1,  4,-10, 58, 17, -5,  1,  0},
39990+    { -1,  4,-11, 40, 40,-11,  4, -1, -1,  4,-11, 40, 40,-11,  4, -1},
39991+    {  0,  1, -5, 17, 58,-10,  4, -1,  0,  1, -5, 17, 58,-10,  4, -1}
39992+};
39993+
39994+#define BIT_DEPTH 8
39995+#include "rpi_hevcdsp_template.c"
39996+#undef BIT_DEPTH
39997+
39998+#define BIT_DEPTH 9
39999+#include "rpi_hevcdsp_template.c"
40000+#undef BIT_DEPTH
40001+
40002+#define BIT_DEPTH 10
40003+#include "rpi_hevcdsp_template.c"
40004+#undef BIT_DEPTH
40005+
40006+#define BIT_DEPTH 12
40007+#include "rpi_hevcdsp_template.c"
40008+#undef BIT_DEPTH
40009+
40010+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
40011+                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
40012+                                               int in_inc0, int in_inc1)
40013+{
40014+    int shift = 32;
40015+    uint32_t bs = 0;
40016+    for (; pus > 0; pus--) {
40017+        int strength, out;
40018+        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
40019+        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
40020+        int nr_idx0 = neigh->ref_idx[0];
40021+        int nr_idx1 = neigh->ref_idx[1];
40022+        int neigh_refL0 = neigh_rpl0[nr_idx0];
40023+        int neigh_refL1 = neigh_rpl1[nr_idx1];
40024+
40025+        av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
40026+        av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
40027+
40028+#if 1 // This more directly matches the original implementation
40029+        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
40030+            // same L0 and L1
40031+            if (curr_refL0 == neigh_refL0 &&
40032+                curr_refL0 == curr_refL1 &&
40033+                neigh_refL0 == neigh_refL1) {
40034+                if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
40035+                     FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
40036+                    (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
40037+                     FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
40038+                    strength = 1;
40039+                else
40040+                    strength = 0;
40041+            } else if (neigh_refL0 == curr_refL0 &&
40042+                       neigh_refL1 == curr_refL1) {
40043+                if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
40044+                    FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
40045+                    strength = 1;
40046+                else
40047+                    strength = 0;
40048+            } else if (neigh_refL1 == curr_refL0 &&
40049+                       neigh_refL0 == curr_refL1) {
40050+                if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
40051+                    FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
40052+                    strength = 1;
40053+                else
40054+                    strength = 0;
40055+            } else {
40056+                strength = 1;
40057+            }
40058+        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
40059+            MvXY curr_mv0, neigh_mv0;
40060+
40061+            if (curr->pred_flag & 1) {
40062+                curr_mv0   = curr->xy[0];
40063+            } else {
40064+                curr_mv0   = curr->xy[1];
40065+                curr_refL0 = curr_refL1;
40066+            }
40067+
40068+            if (neigh->pred_flag & 1) {
40069+                neigh_mv0   = neigh->xy[0];
40070+            } else {
40071+                neigh_mv0   = neigh->xy[1];
40072+                neigh_refL0 = neigh_refL1;
40073+            }
40074+
40075+            if (curr_refL0 == neigh_refL0) {
40076+                if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
40077+                    strength = 1;
40078+                else
40079+                    strength = 0;
40080+            } else
40081+                strength = 1;
40082+        } else
40083+            strength = 1;
40084+#else // This has exactly the same effect, but is more suitable for vectorisation
40085+        MvXY curr_mv[2];
40086+        MvXY neigh_mv[2];
40087+        memcpy(curr_mv, curr->xy, sizeof curr_mv);
40088+        memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
40089+
40090+        if (!(curr->pred_flag & 2)) {
40091+            curr_mv[1] = curr_mv[0];
40092+            curr_refL1 = curr_refL0;
40093+        }
40094+        if (!(neigh->pred_flag & 2)) {
40095+            neigh_mv[1] = neigh_mv[0];
40096+            neigh_refL1 = neigh_refL0;
40097+        }
40098+        if (!(curr->pred_flag & 1)) {
40099+            curr_mv[0] = curr_mv[1];
40100+            curr_refL0 = curr_refL1;
40101+        }
40102+        if (!(neigh->pred_flag & 1)) {
40103+            neigh_mv[0] = neigh_mv[1];
40104+            neigh_refL0 = neigh_refL1;
40105+        }
40106+
40107+        strength = 1;
40108+
40109+        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
40110+                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
40111+                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
40112+
40113+        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
40114+                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
40115+                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
40116+
40117+        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
40118+#endif
40119+
40120+        curr += in_inc0 / sizeof (HEVCRpiMvField);
40121+        neigh += in_inc1 / sizeof (HEVCRpiMvField);
40122+
40123+        for (out = dup; out > 0; out--)
40124+        {
40125+            bs = (bs >> 2) | (strength << 30);
40126+            shift -= 2;
40127+        }
40128+    }
40129+    return bs >> shift;
40130+}
40131+
40132+
40133+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
40134+{
40135+    unsigned int i, j;
40136+
40137+    if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
40138+        for (i = 0; i < height; i++) {
40139+            for (j = 0; j < width; j+=8)
40140+                AV_COPY64U(dst+j, src+j);
40141+            dst += stride_dst;
40142+            src += stride_src;
40143+        }
40144+    } else {
40145+        for (i = 0; i < height; i++) {
40146+            for (j = 0; j < width; j+=16)
40147+                AV_COPY128(dst+j, src+j);
40148+            dst += stride_dst;
40149+            src += stride_src;
40150+        }
40151+    }
40152+}
40153+
40154+
40155+
40156+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
40157+{
40158+#undef FUNC
40159+#define FUNC(a, depth) a ## _ ## depth
40160+
40161+#undef PEL_FUNC
40162+#define PEL_FUNC(dst1, idx1, idx2, a, depth)                                   \
40163+    for(i = 0 ; i < 10 ; i++)                                                  \
40164+{                                                                              \
40165+    hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth;                            \
40166+}
40167+
40168+#undef EPEL_FUNCS
40169+#define EPEL_FUNCS(depth)                                                     \
40170+    PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth);                \
40171+    PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth);                    \
40172+    PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth);                    \
40173+    PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
40174+
40175+#undef EPEL_UNI_FUNCS
40176+#define EPEL_UNI_FUNCS(depth)                                                 \
40177+    PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
40178+    PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth);            \
40179+    PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth);            \
40180+    PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth);           \
40181+    PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
40182+    PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth);        \
40183+    PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth);        \
40184+    PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
40185+
40186+#undef EPEL_BI_FUNCS
40187+#define EPEL_BI_FUNCS(depth)                                                \
40188+    PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);        \
40189+    PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth);            \
40190+    PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth);            \
40191+    PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth);           \
40192+    PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);    \
40193+    PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth);        \
40194+    PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth);        \
40195+    PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
40196+
40197+#undef QPEL_FUNCS
40198+#define QPEL_FUNCS(depth)                                                     \
40199+    PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth);                \
40200+    PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth);                    \
40201+    PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth);                    \
40202+    PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
40203+
40204+#undef QPEL_UNI_FUNCS
40205+#define QPEL_UNI_FUNCS(depth)                                                 \
40206+    PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
40207+    PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth);            \
40208+    PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth);            \
40209+    PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth);           \
40210+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
40211+    PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth);        \
40212+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth);        \
40213+    PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
40214+
40215+#undef QPEL_BI_FUNCS
40216+#define QPEL_BI_FUNCS(depth)                                                  \
40217+    PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);          \
40218+    PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth);              \
40219+    PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth);              \
40220+    PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth);             \
40221+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);      \
40222+    PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth);          \
40223+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
40224+    PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
40225+
40226+#define SLICED_ADD_RESIDUAL(depth)\
40227+    hevcdsp->add_residual_u[0]      = FUNC(add_residual4x4_u, depth);         \
40228+    hevcdsp->add_residual_u[1]      = FUNC(add_residual8x8_u, depth);         \
40229+    hevcdsp->add_residual_u[2]      = FUNC(add_residual16x16_u, depth);       \
40230+    hevcdsp->add_residual_u[3]      = FUNC(add_residual32x32_u, depth);       \
40231+    hevcdsp->add_residual_v[0]      = FUNC(add_residual4x4_v, depth);         \
40232+    hevcdsp->add_residual_v[1]      = FUNC(add_residual8x8_v, depth);         \
40233+    hevcdsp->add_residual_v[2]      = FUNC(add_residual16x16_v, depth);       \
40234+    hevcdsp->add_residual_v[3]      = FUNC(add_residual32x32_v, depth);       \
40235+    hevcdsp->add_residual_c[0]      = FUNC(add_residual4x4_c, depth);         \
40236+    hevcdsp->add_residual_c[1]      = FUNC(add_residual8x8_c, depth);         \
40237+    hevcdsp->add_residual_c[2]      = FUNC(add_residual16x16_c, depth);       \
40238+    hevcdsp->add_residual_c[3]      = FUNC(add_residual32x32_c, depth);       \
40239+    hevcdsp->add_residual_dc_c[0]   = FUNC(add_residual4x4_dc_c, depth);         \
40240+    hevcdsp->add_residual_dc_c[1]   = FUNC(add_residual8x8_dc_c, depth);         \
40241+    hevcdsp->add_residual_dc_c[2]   = FUNC(add_residual16x16_dc_c, depth);       \
40242+    hevcdsp->add_residual_dc_c[3]   = FUNC(add_residual32x32_dc_c, depth);       \
40243+    hevcdsp->put_pcm_c              = FUNC(put_pcm_c, depth)
40244+#define SLICED_LOOP_FILTERS(depth)\
40245+    hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
40246+    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
40247+    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);    \
40248+    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
40249+#define SLICED_SAO(depth)\
40250+    for (i = 0; i != SAO_FILTER_N; ++i) {                                     \
40251+        hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth);       \
40252+        hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth);       \
40253+    }                                                                         \
40254+    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth);       \
40255+    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
40256+
40257+#define HEVC_DSP(depth)                                                     \
40258+    hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
40259+    hevcdsp->add_residual[0]        = FUNC(add_residual4x4, depth);         \
40260+    hevcdsp->add_residual[1]        = FUNC(add_residual8x8, depth);         \
40261+    hevcdsp->add_residual[2]        = FUNC(add_residual16x16, depth);       \
40262+    hevcdsp->add_residual[3]        = FUNC(add_residual32x32, depth);       \
40263+    hevcdsp->add_residual_dc[0]     = FUNC(add_residual4x4_dc, depth);         \
40264+    hevcdsp->add_residual_dc[1]     = FUNC(add_residual8x8_dc, depth);         \
40265+    hevcdsp->add_residual_dc[2]     = FUNC(add_residual16x16_dc, depth);       \
40266+    hevcdsp->add_residual_dc[3]     = FUNC(add_residual32x32_dc, depth);       \
40267+    SLICED_ADD_RESIDUAL(depth);                                             \
40268+    hevcdsp->dequant                = FUNC(dequant, depth);                 \
40269+    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
40270+    hevcdsp->transform_4x4_luma     = FUNC(transform_4x4_luma, depth);      \
40271+    hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
40272+    hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
40273+    hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
40274+    hevcdsp->idct[3]                = FUNC(idct_32x32, depth);              \
40275+                                                                            \
40276+    hevcdsp->idct_dc[0]             = FUNC(idct_4x4_dc, depth);             \
40277+    hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
40278+    hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
40279+    hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
40280+                                                                            \
40281+    for (i = 0; i != SAO_FILTER_N; ++i) {                                   \
40282+        hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth);         \
40283+        hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth);         \
40284+    }                                                                       \
40285+    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
40286+    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
40287+    SLICED_SAO(depth);                                                         \
40288+                                                                               \
40289+    QPEL_FUNCS(depth);                                                         \
40290+    QPEL_UNI_FUNCS(depth);                                                     \
40291+    QPEL_BI_FUNCS(depth);                                                      \
40292+    EPEL_FUNCS(depth);                                                         \
40293+    EPEL_UNI_FUNCS(depth);                                                     \
40294+    EPEL_BI_FUNCS(depth);                                                      \
40295+                                                                               \
40296+    SLICED_LOOP_FILTERS(depth);                                                \
40297+    hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
40298+    hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
40299+    hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
40300+    hevcdsp->hevc_v_loop_filter_chroma   = FUNC(hevc_v_loop_filter_chroma, depth); \
40301+    hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth);   \
40302+    hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth);   \
40303+    hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
40304+    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
40305+int i = 0;
40306+
40307+    switch (bit_depth) {
40308+    case 9:
40309+        HEVC_DSP(9);
40310+        break;
40311+    case 10:
40312+        HEVC_DSP(10);
40313+        break;
40314+    case 12:
40315+        HEVC_DSP(12);
40316+        break;
40317+    default:
40318+        HEVC_DSP(8);
40319+        break;
40320+    }
40321+
40322+    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
40323+    hevcdsp->cpy_blk = cpy_blk;
40324+
40325+    if (ARCH_PPC)
40326+        ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
40327+    if (ARCH_X86)
40328+        ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
40329+    if (ARCH_ARM)
40330+        ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
40331+    if (ARCH_MIPS)
40332+        ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
40333+}
40334--- /dev/null
40335+++ b/libavcodec/rpi_hevcdsp.h
40336@@ -0,0 +1,177 @@
40337+/*
40338+ * HEVC video decoder
40339+ *
40340+ * Copyright (C) 2012 - 2013 Guillaume Martres
40341+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
40342+ *
40343+ *
40344+ * This file is part of FFmpeg.
40345+ *
40346+ * FFmpeg is free software; you can redistribute it and/or
40347+ * modify it under the terms of the GNU Lesser General Public
40348+ * License as published by the Free Software Foundation; either
40349+ * version 2.1 of the License, or (at your option) any later version.
40350+ *
40351+ * FFmpeg is distributed in the hope that it will be useful,
40352+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
40353+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
40354+ * Lesser General Public License for more details.
40355+ *
40356+ * You should have received a copy of the GNU Lesser General Public
40357+ * License along with FFmpeg; if not, write to the Free Software
40358+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
40359+ */
40360+
40361+#ifndef AVCODEC_RPI_HEVCDSP_H
40362+#define AVCODEC_RPI_HEVCDSP_H
40363+
40364+#include "hevc.h"
40365+#include "get_bits.h"
40366+
40367+struct HEVCRpiMvField;
40368+
40369+#define MAX_PB_SIZE 64
40370+
40371+#define RPI_HEVC_SAO_BUF_STRIDE 160
40372+
40373+
40374+typedef struct RpiSAOParams {
40375+    uint8_t band_position[3];   ///< sao_band_position (Y,U,V)
40376+    uint8_t eo_class[3];        ///< sao_eo_class      (Y,U=V)
40377+    uint8_t type_idx[3];        ///< sao_type_idx      (Y,U=V)
40378+
40379+    int16_t offset_val[3][5];   ///<SaoOffsetVal       (Y,U,V)
40380+
40381+} RpiSAOParams;
40382+
40383+
40384+// This controls how many sao dsp functions there are
40385+// N=5 has width = 8, 16, 32, 48, 64
40386+// N=6 adds a function for width=24 (in fn array el 5 so existing code should
40387+// still work)
40388+#define SAO_FILTER_N 6
40389+
40390+
40391+typedef struct HEVCDSPContext {
40392+    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
40393+                    struct GetBitContext *gb, int pcm_bit_depth);
40394+
40395+    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
40396+    void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
40397+    void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
40398+    void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
40399+
40400+    void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
40401+    void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
40402+    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
40403+                    struct GetBitContext *gb, int pcm_bit_depth);
40404+
40405+    void (*dequant)(int16_t *coeffs, int16_t log2_size);
40406+
40407+    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
40408+
40409+    void (*transform_4x4_luma)(int16_t *coeffs);
40410+
40411+    void (*idct[4])(int16_t *coeffs, int col_limit);
40412+
40413+    void (*idct_dc[4])(int16_t *coeffs);
40414+
40415+    void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
40416+                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
40417+    void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
40418+                               const int16_t *sao_offset_val_u, int sao_left_class_u,
40419+                               const int16_t *sao_offset_val_v, int sao_left_class_v,
40420+                               int width, int height);
40421+
40422+    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
40423+    void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
40424+                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
40425+    void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
40426+                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
40427+
40428+    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
40429+                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
40430+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
40431+    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
40432+                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
40433+                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
40434+
40435+    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
40436+                                    int height, intptr_t mx, intptr_t my, int width);
40437+    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
40438+                                        int height, intptr_t mx, intptr_t my, int width);
40439+    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
40440+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
40441+
40442+    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
40443+                                       int16_t *src2,
40444+                                       int height, intptr_t mx, intptr_t my, int width);
40445+    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
40446+                                         int16_t *src2,
40447+                                         int height, int denom, int wx0, int wx1,
40448+                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
40449+    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
40450+                                    int height, intptr_t mx, intptr_t my, int width);
40451+
40452+    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
40453+                                        int height, intptr_t mx, intptr_t my, int width);
40454+    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
40455+                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
40456+    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
40457+                                       int16_t *src2,
40458+                                       int height, intptr_t mx, intptr_t my, int width);
40459+    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
40460+                                         int16_t *src2,
40461+                                         int height, int denom, int wx0, int ox0, int wx1,
40462+                                         int ox1, intptr_t mx, intptr_t my, int width);
40463+
40464+    void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
40465+                                    int beta, int32_t *tc,
40466+                                    uint8_t *no_p, uint8_t *no_q);
40467+    void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
40468+                                    int beta, int32_t *tc,
40469+                                    uint8_t *no_p, uint8_t *no_q);
40470+    void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
40471+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
40472+    void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
40473+                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
40474+    void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
40475+                                      int beta, int32_t *tc,
40476+                                      uint8_t *no_p, uint8_t *no_q);
40477+    void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
40478+                                      int beta, int32_t *tc,
40479+                                      uint8_t *no_p, uint8_t *no_q);
40480+    void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
40481+                                        int32_t *tc, uint8_t *no_p,
40482+                                        uint8_t *no_q);
40483+    void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
40484+                                        int32_t *tc, uint8_t *no_p,
40485+                                        uint8_t *no_q);
40486+    void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
40487+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
40488+    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
40489+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
40490+                                 uint8_t * _pix_l);
40491+    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
40492+                                 unsigned int no_f);
40493+    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
40494+                                 uint8_t * src_l,
40495+                                 unsigned int no_f);
40496+
40497+    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
40498+                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
40499+                                               int in_inc0, int inc_inc1);
40500+
40501+    void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
40502+} HEVCDSPContext;
40503+
/* Initialise all function pointers in *hpc for the given bit depth. */
void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);

/* Sub-pel interpolation filter coefficient tables: EPEL has 7 fractional
 * phases of 4 taps, QPEL 3 phases of 16 entries. */
extern const int8_t ff_hevc_rpi_epel_filters[7][4];
extern const int8_t ff_hevc_rpi_qpel_filters[3][16];

/* Per-architecture initialisers — presumably install optimised versions over
 * the C defaults (NOTE(review): confirm against the corresponding init .c). */
void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
#endif /* AVCODEC_RPI_HEVCDSP_H */
40514--- /dev/null
40515+++ b/libavcodec/rpi_hevcdsp_template.c
40516@@ -0,0 +1,2279 @@
40517+/*
40518+ * HEVC video decoder
40519+ *
40520+ * Copyright (C) 2012 - 2013 Guillaume Martres
40521+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
40522+ *
40523+ * This file is part of FFmpeg.
40524+ *
40525+ * FFmpeg is free software; you can redistribute it and/or
40526+ * modify it under the terms of the GNU Lesser General Public
40527+ * License as published by the Free Software Foundation; either
40528+ * version 2.1 of the License, or (at your option) any later version.
40529+ *
40530+ * FFmpeg is distributed in the hope that it will be useful,
40531+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
40532+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
40533+ * Lesser General Public License for more details.
40534+ *
40535+ * You should have received a copy of the GNU Lesser General Public
40536+ * License along with FFmpeg; if not, write to the Free Software
40537+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
40538+ */
40539+
40540+#include "get_bits.h"
40541+#include "rpi_hevcdec.h"
40542+
40543+#include "bit_depth_template.c"
40544+#include "rpi_hevcdsp.h"
40545+
40546+#include "rpi_hevc_shader_template.h"
40547+
40548+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
40549+                          GetBitContext *gb, int pcm_bit_depth)
40550+{
40551+    int x, y;
40552+    pixel *dst = (pixel *)_dst;
40553+
40554+    stride /= sizeof(pixel);
40555+
40556+    for (y = 0; y < height; y++) {
40557+        for (x = 0; x < width; x++)
40558+            dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
40559+        dst += stride;
40560+    }
40561+}
40562+
// Read IPCM samples for an interleaved ("plaited") chroma block: the
// bitstream carries the U samples then the V samples plane-by-plane, but
// they are written out interleaved (U at even pixel offsets, V at odd).
static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
                          GetBitContext *gb, int pcm_bit_depth)
{
    int x, y;
    pixel *dst = (pixel *)_dst;

    stride /= sizeof(pixel);

    // U plane: even-indexed samples, scaled to the frame bit depth
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
        dst += stride;
    }

    // V plane: same layout, shifted one pixel to the odd positions
    dst = (pixel *)_dst + 1;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
        dst += stride;
    }
}
40584+
40585+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
40586+                                                ptrdiff_t stride, int size)
40587+{
40588+    int x, y;
40589+    pixel *dst = (pixel *)_dst;
40590+
40591+    stride /= sizeof(pixel);
40592+
40593+    for (y = 0; y < size; y++) {
40594+        for (x = 0; x < size; x++) {
40595+            dst[x] = av_clip_pixel(dst[x] + *res);
40596+            res++;
40597+        }
40598+        dst += stride;
40599+    }
40600+}
40601+
40602+static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
40603+{
40604+    int x, y;
40605+    pixel *dst = (pixel *)_dst;
40606+
40607+    stride /= sizeof(pixel);
40608+
40609+    for (y = 0; y < size; y++) {
40610+        for (x = 0; x < size; x++) {
40611+            dst[x] = av_clip_pixel(dst[x] + dc);
40612+        }
40613+        dst += stride;
40614+    }
40615+}
40616+
40617+
40618+static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
40619+                                                ptrdiff_t stride, const int dc_v, int size)
40620+{
40621+    int x, y;
40622+    pixel *dst = (pixel *)_dst;
40623+
40624+    stride /= sizeof(pixel);
40625+
40626+    for (y = 0; y < size; y++) {
40627+        for (x = 0; x < size * 2; x += 2) {
40628+            dst[x] = av_clip_pixel(dst[x] + *res);
40629+            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
40630+            res++;
40631+        }
40632+        dst += stride;
40633+    }
40634+}
40635+
40636+static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
40637+                                                ptrdiff_t stride, const int dc_u, int size)
40638+{
40639+    int x, y;
40640+    pixel *dst = (pixel *)_dst;
40641+
40642+    stride /= sizeof(pixel);
40643+
40644+    for (y = 0; y < size; y++) {
40645+        for (x = 0; x < size * 2; x += 2) {
40646+            dst[x] = av_clip_pixel(dst[x] + dc_u);
40647+            dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
40648+            res++;
40649+        }
40650+        dst += stride;
40651+    }
40652+}
40653+
40654+static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
40655+                                                ptrdiff_t stride, unsigned int size)
40656+{
40657+    unsigned int x, y;
40658+    pixel *dst = (pixel *)_dst;
40659+    const int16_t * ru = res;
40660+    const int16_t * rv = res + size * size;
40661+
40662+//    rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
40663+//    rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
40664+//    rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
40665+
40666+    stride /= sizeof(pixel);
40667+
40668+    for (y = 0; y < size; y++) {
40669+        for (x = 0; x < size * 2; x += 2) {
40670+            dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
40671+            dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
40672+        }
40673+        dst += stride;
40674+    }
40675+
40676+//    rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
40677+}
40678+
40679+
40680+static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
40681+{
40682+    int x, y;
40683+    pixel *dst = (pixel *)_dst;
40684+    const int dc_v = dc >> 16;
40685+    const int dc_u = (dc << 16) >> 16;
40686+
40687+    stride /= sizeof(pixel);
40688+
40689+    for (y = 0; y < size; y++) {
40690+        for (x = 0; x < size * 2; x += 2) {
40691+            dst[x] = av_clip_pixel(dst[x] + dc_u);
40692+            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
40693+        }
40694+        dst += stride;
40695+    }
40696+}
40697+
40698+
/* Fixed-size luma wrappers around add_residual()/add_residual_dc() so they
 * can be stored in the HEVCDSPContext function-pointer tables (one entry
 * per transform size: 4, 8, 16, 32). */

static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
                                  ptrdiff_t stride)
{
    FUNC(add_residual)(_dst, res, stride, 4);
}

static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
                                  ptrdiff_t stride)
{
    FUNC(add_residual)(_dst, res, stride, 8);
}

static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
                                    ptrdiff_t stride)
{
    FUNC(add_residual)(_dst, res, stride, 16);
}

static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
                                    ptrdiff_t stride)
{
    FUNC(add_residual)(_dst, res, stride, 32);
}

/* DC-only variants: the whole block shares one residual value. */

static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
{
    FUNC(add_residual_dc)(_dst, stride, dc, 4);
}

static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
{
    FUNC(add_residual_dc)(_dst, stride, dc, 8);
}

static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
{
    FUNC(add_residual_dc)(_dst, stride, dc, 16);
}

static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
{
    FUNC(add_residual_dc)(_dst, stride, dc, 32);
}
40742+
/* Fixed-size chroma wrappers for the interleaved (plaited/sand) layout.
 * The 32x32 variants abort: a 32x32 chroma TU cannot occur in 4:2:0,
 * which is all the sand format supports. */

// -- U -- (plaited)

static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
                                  ptrdiff_t stride, int dc_u)
{
    FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
}

static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
                                  ptrdiff_t stride, int dc_u)
{
    FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
}

static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
                                    ptrdiff_t stride, int dc_u)
{
    FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
}

static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
                                    ptrdiff_t stride, int dc_u)
{
    // Should never occur for 420, which is all that sand supports
    av_assert0(0);
}

// -- V -- (plaited)

static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
                                  ptrdiff_t stride, int dc_v)
{
    FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
}

static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
                                  ptrdiff_t stride, int dc_v)
{
    FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
}

static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
                                    ptrdiff_t stride, int dc_v)
{
    FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
}

static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
                                    ptrdiff_t stride, int dc_v)
{
    // Should never occur for 420, which is all that sand supports
    av_assert0(0);
}

// -- C -- (plaited - both U & V)

static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
                                  ptrdiff_t stride)
{
    FUNC(add_residual_c)(_dst, res, stride, 4);
}

static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
                                  ptrdiff_t stride)
{
    FUNC(add_residual_c)(_dst, res, stride, 8);
}

static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
                                    ptrdiff_t stride)
{
    FUNC(add_residual_c)(_dst, res, stride, 16);
}

static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
                                    ptrdiff_t stride)
{
    // Should never occur for 420, which is all that sand supports
    av_assert0(0);
}

/* DC-only plaited variants: dc packs V in the high and U in the low 16 bits. */

static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
{
    FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
}

static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
{
    FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
}

static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
{
    FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
}

static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
{
    // Should never occur for 420, which is all that sand supports
    av_assert0(0);
}
40844+
40845+
40846+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
40847+{
40848+    int16_t *coeffs = (int16_t *) _coeffs;
40849+    int x, y;
40850+    int size = 1 << log2_size;
40851+
40852+    if (mode) {
40853+        coeffs += size;
40854+        for (y = 0; y < size - 1; y++) {
40855+            for (x = 0; x < size; x++)
40856+                coeffs[x] += coeffs[x - size];
40857+            coeffs += size;
40858+        }
40859+    } else {
40860+        for (y = 0; y < size; y++) {
40861+            for (x = 1; x < size; x++)
40862+                coeffs[x] += coeffs[x - 1];
40863+            coeffs += size;
40864+        }
40865+    }
40866+}
40867+
40868+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
40869+{
40870+    int shift  = 15 - BIT_DEPTH - log2_size;
40871+    int x, y;
40872+    int size = 1 << log2_size;
40873+
40874+    if (shift > 0) {
40875+        int offset = 1 << (shift - 1);
40876+        for (y = 0; y < size; y++) {
40877+            for (x = 0; x < size; x++) {
40878+                *coeffs = (*coeffs + offset) >> shift;
40879+                coeffs++;
40880+            }
40881+        }
40882+    } else {
40883+        for (y = 0; y < size; y++) {
40884+            for (x = 0; x < size; x++) {
40885+                *coeffs = *coeffs << -shift;
40886+                coeffs++;
40887+            }
40888+        }
40889+    }
40890+}
40891+
/* Assignment policies for the transform macros: SET stores raw, SCALE
 * rounds by the enclosing scope's `add`/`shift` and clips to int16. */
#define SET(dst, x)   (dst) = (x)
#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)

/* One pass of the 4x4 inverse DST used for 4x4 intra luma blocks;
 * `step` selects row vs column processing. */
#define TR_4x4_LUMA(dst, src, step, assign)                             \
    do {                                                                \
        int c0 = src[0 * step] + src[2 * step];                         \
        int c1 = src[2 * step] + src[3 * step];                         \
        int c2 = src[0 * step] - src[3 * step];                         \
        int c3 = 74 * src[1 * step];                                    \
                                                                        \
        assign(dst[2 * step], 74 * (src[0 * step] -                     \
                                    src[2 * step] +                     \
                                    src[3 * step]));                    \
        assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
        assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
        assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
    } while (0)

/* In-place 4x4 inverse DST: a column pass (shift 7) followed by a row
 * pass (shift 20 - BIT_DEPTH), per the HEVC spec. */
static void FUNC(transform_4x4_luma)(int16_t *coeffs)
{
    int i;
    int shift    = 7;
    int add      = 1 << (shift - 1);
    int16_t *src = coeffs;

    for (i = 0; i < 4; i++) {
        TR_4x4_LUMA(src, src, 4, SCALE);
        src++;
    }

    shift = 20 - BIT_DEPTH;
    add   = 1 << (shift - 1);
    for (i = 0; i < 4; i++) {
        TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
        coeffs += 4;
    }
}

#undef TR_4x4_LUMA
40931+
/* Butterfly stages of the HEVC partial inverse DCT.  Each TR_N computes an
 * N-point 1-D transform from an N/2-point even part (recursive TR_(N/2))
 * plus an odd part built from the `transform` coefficient table; `end`
 * limits how many odd inputs are read (coefficient zero-out). */
#define TR_4(dst, src, dstep, sstep, assign, end)                 \
    do {                                                          \
        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
                                                                  \
        assign(dst[0 * dstep], e0 + o0);                          \
        assign(dst[1 * dstep], e1 + o1);                          \
        assign(dst[2 * dstep], e1 - o1);                          \
        assign(dst[3 * dstep], e0 - o0);                          \
    } while (0)

#define TR_8(dst, src, dstep, sstep, assign, end)                 \
    do {                                                          \
        int i, j;                                                 \
        int e_8[4];                                               \
        int o_8[4] = { 0 };                                       \
        for (i = 0; i < 4; i++)                                   \
            for (j = 1; j < end; j += 2)                          \
                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                     \
                                                                  \
        for (i = 0; i < 4; i++) {                                 \
            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
        }                                                         \
    } while (0)

#define TR_16(dst, src, dstep, sstep, assign, end)                \
    do {                                                          \
        int i, j;                                                 \
        int e_16[8];                                              \
        int o_16[8] = { 0 };                                      \
        for (i = 0; i < 8; i++)                                   \
            for (j = 1; j < end; j += 2)                          \
                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                    \
                                                                  \
        for (i = 0; i < 8; i++) {                                 \
            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
        }                                                         \
    } while (0)

#define TR_32(dst, src, dstep, sstep, assign, end)                \
    do {                                                          \
        int i, j;                                                 \
        int e_32[16];                                             \
        int o_32[16] = { 0 };                                     \
        for (i = 0; i < 16; i++)                                  \
            for (j = 1; j < end; j += 2)                          \
                o_32[i] += transform[j][i] * src[j * sstep];      \
        TR_16(e_32, src, 1, 2 * sstep, SET, end / 2);             \
                                                                  \
        for (i = 0; i < 16; i++) {                                \
            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
        }                                                         \
    } while (0)
40992+
/* Column limits for the partial IDCT: col_limit bounds the number of
 * non-zero coefficient rows/columns, letting the transform skip work. */
#define IDCT_VAR4(H)                                              \
    int limit2 = FFMIN(col_limit + 4, H)
#define IDCT_VAR8(H)                                              \
    int limit  = FFMIN(col_limit, H);                             \
    int limit2 = FFMIN(col_limit + 4, H)
#define IDCT_VAR16(H)   IDCT_VAR8(H)
#define IDCT_VAR32(H)   IDCT_VAR8(H)

/* Generator for the in-place HxH inverse DCT: a vertical pass (shift 7)
 * then a horizontal pass (shift 20 - BIT_DEPTH). */
#define IDCT(H)                                                   \
static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs,          \
                                        int col_limit)            \
{                                                                 \
    int i;                                                        \
    int      shift = 7;                                           \
    int      add   = 1 << (shift - 1);                            \
    int16_t *src   = coeffs;                                      \
    IDCT_VAR ## H(H);                                             \
                                                                  \
    for (i = 0; i < H; i++) {                                     \
        TR_ ## H(src, src, H, H, SCALE, limit2);                  \
        if (limit2 < H && i%4 == 0 && !!i)                        \
            limit2 -= 4;                                          \
        src++;                                                    \
    }                                                             \
                                                                  \
    shift = 20 - BIT_DEPTH;                                       \
    add   = 1 << (shift - 1);                                     \
    for (i = 0; i < H; i++) {                                     \
        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);             \
        coeffs += H;                                              \
    }                                                             \
}

/* Generator for the DC-only fast path: every output equals the scaled
 * DC coefficient. */
#define IDCT_DC(H)                                                \
static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs)    \
{                                                                 \
    int i, j;                                                     \
    int shift = 14 - BIT_DEPTH;                                   \
    int add   = 1 << (shift - 1);                                 \
    int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift;          \
                                                                  \
    for (j = 0; j < H; j++) {                                     \
        for (i = 0; i < H; i++) {                                 \
            coeffs[i + j * H] = coeff;                            \
        }                                                         \
    }                                                             \
}

/* Instantiate idct_4x4 .. idct_32x32 and their DC-only variants. */
IDCT( 4)
IDCT( 8)
IDCT(16)
IDCT(32)

IDCT_DC( 4)
IDCT_DC( 8)
IDCT_DC(16)
IDCT_DC(32)

#undef TR_4
#undef TR_8
#undef TR_16
#undef TR_32

#undef SET
#undef SCALE
41058+
41059+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
41060+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
41061+                                  int16_t *sao_offset_val, int sao_left_class,
41062+                                  int width, int height)
41063+{
41064+    pixel *dst = (pixel *)_dst;
41065+    pixel *src = (pixel *)_src;
41066+    int offset_table[32] = { 0 };
41067+    int k, y, x;
41068+    int shift  = BIT_DEPTH - 5;
41069+
41070+    stride_dst /= sizeof(pixel);
41071+    stride_src /= sizeof(pixel);
41072+
41073+    for (k = 0; k < 4; k++)
41074+        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
41075+    for (y = 0; y < height; y++) {
41076+        for (x = 0; x < width; x++)
41077+            dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
41078+        dst += stride_dst;
41079+        src += stride_src;
41080+    }
41081+}
41082+
41083+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
41084+
41085+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
41086+                                  int eo, int width, int height) {
41087+
41088+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
41089+    static const int8_t pos[4][2][2] = {
41090+        { { -1,  0 }, {  1, 0 } }, // horizontal
41091+        { {  0, -1 }, {  0, 1 } }, // vertical
41092+        { { -1, -1 }, {  1, 1 } }, // 45 degree
41093+        { {  1, -1 }, { -1, 1 } }, // 135 degree
41094+    };
41095+    pixel *dst = (pixel *)_dst;
41096+    pixel *src = (pixel *)_src;
41097+    int a_stride, b_stride;
41098+    int x, y;
41099+    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
41100+    stride_dst /= sizeof(pixel);
41101+
41102+    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
41103+    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
41104+    for (y = 0; y < height; y++) {
41105+        for (x = 0; x < width; x++) {
41106+            int diff0 = CMP(src[x], src[x + a_stride]);
41107+            int diff1 = CMP(src[x], src[x + b_stride]);
41108+            int offset_val        = edge_idx[2 + diff0 + diff1];
41109+            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
41110+        }
41111+        src += stride_src;
41112+        dst += stride_dst;
41113+    }
41114+}
41115+
41116+
#if BIT_DEPTH == 10
// We need a 32 bit variation for the _c restores so hijack bit depth 10:
// redefine pixel/BIT_DEPTH so the functions below are compiled once with
// 32-bit "pixels" (two plaited 16-bit chroma samples handled as one unit).
#undef pixel
#undef BIT_DEPTH
#define pixel uint32_t
#define BIT_DEPTH 32
// All 16 bit variations are the same: alias every bit depth from 10 to 16
// to the single 9-bit (16-bit storage) instantiation.
#define sao_edge_restore_0_10 sao_edge_restore_0_9
#define sao_edge_restore_1_10 sao_edge_restore_1_9
#define sao_edge_restore_0_11 sao_edge_restore_0_9
#define sao_edge_restore_1_11 sao_edge_restore_1_9
#define sao_edge_restore_0_12 sao_edge_restore_0_9
#define sao_edge_restore_1_12 sao_edge_restore_1_9
#define sao_edge_restore_0_13 sao_edge_restore_0_9
#define sao_edge_restore_1_13 sao_edge_restore_1_9
#define sao_edge_restore_0_14 sao_edge_restore_0_9
#define sao_edge_restore_1_14 sao_edge_restore_1_9
#define sao_edge_restore_0_15 sao_edge_restore_0_9
#define sao_edge_restore_1_15 sao_edge_restore_1_9
#define sao_edge_restore_0_16 sao_edge_restore_0_9
#define sao_edge_restore_1_16 sao_edge_restore_1_9
#endif
41139+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
41140+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
41141+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
41142+                                    int *borders, int _width, int _height,
41143+                                    int c_idx, uint8_t *vert_edge,
41144+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
41145+{
41146+    int x, y;
41147+    pixel *dst = (pixel *)_dst;
41148+    pixel *src = (pixel *)_src;
41149+    int sao_eo_class    = sao->eo_class[c_idx];
41150+    int init_x = 0, width = _width, height = _height;
41151+
41152+    stride_dst /= sizeof(pixel);
41153+    stride_src /= sizeof(pixel);
41154+
41155+    if (sao_eo_class != SAO_EO_VERT) {
41156+        if (borders[0]) {
41157+            for (y = 0; y < height; y++) {
41158+                dst[y * stride_dst] = src[y * stride_src];
41159+            }
41160+            init_x = 1;
41161+        }
41162+        if (borders[2]) {
41163+            int offset     = width - 1;
41164+            for (x = 0; x < height; x++) {
41165+                dst[x * stride_dst + offset] = src[x * stride_src + offset];
41166+            }
41167+            width--;
41168+        }
41169+    }
41170+    if (sao_eo_class != SAO_EO_HORIZ) {
41171+        if (borders[1]) {
41172+            for (x = init_x; x < width; x++)
41173+                dst[x] = src[x];
41174+        }
41175+        if (borders[3]) {
41176+            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
41177+            ptrdiff_t y_stride_src = stride_src * (height - 1);
41178+            for (x = init_x; x < width; x++)
41179+                dst[x + y_stride_dst] = src[x + y_stride_src];
41180+            height--;
41181+        }
41182+    }
41183+}
41184+
41185+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
41186+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
41187+                                    int *borders, int _width, int _height,
41188+                                    int c_idx, uint8_t *vert_edge,
41189+                                    uint8_t *horiz_edge, uint8_t *diag_edge)
41190+{
41191+    int x, y;
41192+    pixel *dst = (pixel *)_dst;
41193+    pixel *src = (pixel *)_src;
41194+    int sao_eo_class    = sao->eo_class[c_idx];
41195+    int init_x = 0, init_y = 0, width = _width, height = _height;
41196+
41197+    stride_dst /= sizeof(pixel);
41198+    stride_src /= sizeof(pixel);
41199+
41200+    if (sao_eo_class != SAO_EO_VERT) {
41201+        if (borders[0]) {
41202+            for (y = 0; y < height; y++) {
41203+                dst[y * stride_dst] = src[y * stride_src];
41204+            }
41205+            init_x = 1;
41206+        }
41207+        if (borders[2]) {
41208+            int offset     = width - 1;
41209+            for (x = 0; x < height; x++) {
41210+                dst[x * stride_dst + offset] = src[x * stride_src + offset];
41211+            }
41212+            width--;
41213+        }
41214+    }
41215+    if (sao_eo_class != SAO_EO_HORIZ) {
41216+        if (borders[1]) {
41217+            for (x = init_x; x < width; x++)
41218+                dst[x] = src[x];
41219+            init_y = 1;
41220+        }
41221+        if (borders[3]) {
41222+            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
41223+            ptrdiff_t y_stride_src = stride_src * (height - 1);
41224+            for (x = init_x; x < width; x++)
41225+                dst[x + y_stride_dst] = src[x + y_stride_src];
41226+            height--;
41227+        }
41228+    }
41229+
41230+    {
41231+        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
41232+        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
41233+        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
41234+        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
41235+
41236+        // Restore pixels that can't be modified
41237+        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
41238+            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
41239+                dst[y*stride_dst] = src[y*stride_src];
41240+        }
41241+        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
41242+            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
41243+                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
41244+        }
41245+
41246+        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
41247+            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
41248+                dst[x] = src[x];
41249+        }
41250+        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
41251+            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
41252+                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
41253+        }
41254+        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
41255+            dst[0] = src[0];
41256+        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
41257+            dst[width-1] = src[width-1];
41258+        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
41259+            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
41260+        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
41261+            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
41262+
41263+    }
41264+}
41265+#endif
41266+#if BIT_DEPTH == 32
41267+#undef BIT_DEPTH
41268+#undef pixel
41269+#define BIT_DEPTH 10
41270+#define pixel uint16_t
41271+#endif
41272+
41273+// --- Plaited chroma versions
41274+
41275+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
41276+                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
41277+                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
41278+                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
41279+                                  int width, int height)
41280+{
41281+    pixel *dst = (pixel *)_dst;
41282+    pixel *src = (pixel *)_src;
41283+    int offset_table_u[32] = { 0 };
41284+    int offset_table_v[32] = { 0 };
41285+    int k, y, x;
41286+    int shift  = BIT_DEPTH - 5;
41287+
41288+    stride_dst /= sizeof(pixel);
41289+    stride_src /= sizeof(pixel);
41290+    width *= 2;
41291+
41292+    for (k = 0; k < 4; k++)
41293+    {
41294+        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
41295+        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
41296+    }
41297+    for (y = 0; y < height; y++) {
41298+        for (x = 0; x < width; x += 2)
41299+        {
41300+//            printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
41301+//            printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
41302+            // *** & 31 shouldn't be wanted but just now we generate broken input that
41303+            // crashes us in 10-bit world
41304+            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
41305+            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
41306+        }
41307+        dst += stride_dst;
41308+        src += stride_src;
41309+    }
41310+}
41311+
41312+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
41313+                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
41314+                                  int eo, int width, int height) {
41315+
41316+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
41317+    static const int8_t pos[4][2][2] = {
41318+        { { -1,  0 }, {  1, 0 } }, // horizontal
41319+        { {  0, -1 }, {  0, 1 } }, // vertical
41320+        { { -1, -1 }, {  1, 1 } }, // 45 degree
41321+        { {  1, -1 }, { -1, 1 } }, // 135 degree
41322+    };
41323+    pixel *dst = (pixel *)_dst;
41324+    pixel *src = (pixel *)_src;
41325+    int a_stride, b_stride;
41326+    int x, y;
41327+    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
41328+
41329+    stride_dst /= sizeof(pixel);
41330+    width *= 2;
41331+
41332+    av_assert0(width <= 64);
41333+
41334+    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
41335+    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
41336+    for (y = 0; y < height; y++) {
41337+        for (x = 0; x < width; x += 2) {
41338+            int diff0u = CMP(src[x], src[x + a_stride]);
41339+            int diff1u = CMP(src[x], src[x + b_stride]);
41340+            int offset_valu        = edge_idx[2 + diff0u + diff1u];
41341+            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
41342+            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
41343+            int offset_valv        = edge_idx[2 + diff0v + diff1v];
41344+            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
41345+            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
41346+        }
41347+        src += stride_src;
41348+        dst += stride_dst;
41349+    }
41350+}
41351+
41352+// Do once
41353+#if BIT_DEPTH == 8
41354+// Any old 2 byte 'normal' restore will work for these
41355+#define sao_edge_restore_c_0_8  sao_edge_restore_0_16
41356+#define sao_edge_restore_c_1_8  sao_edge_restore_1_16
41357+// We need 32 bit for 9 bit+
41358+#define sao_edge_restore_c_0_9  sao_edge_restore_0_32
41359+#define sao_edge_restore_c_1_9  sao_edge_restore_1_32
41360+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
41361+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
41362+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
41363+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
41364+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
41365+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
41366+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
41367+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
41368+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
41369+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
41370+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
41371+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
41372+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
41373+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
41374+#endif
41375+
41376+#undef CMP
41377+
41378+////////////////////////////////////////////////////////////////////////////////
41379+//
41380+////////////////////////////////////////////////////////////////////////////////
41381+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
41382+                                      uint8_t *_src, ptrdiff_t _srcstride,
41383+                                      int height, intptr_t mx, intptr_t my, int width)
41384+{
41385+    int x, y;
41386+    pixel *src          = (pixel *)_src;
41387+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41388+
41389+    for (y = 0; y < height; y++) {
41390+        for (x = 0; x < width; x++)
41391+            dst[x] = src[x] << (14 - BIT_DEPTH);
41392+        src += srcstride;
41393+        dst += MAX_PB_SIZE;
41394+    }
41395+}
41396+
41397+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41398+                                          int height, intptr_t mx, intptr_t my, int width)
41399+{
41400+    int y;
41401+    pixel *src          = (pixel *)_src;
41402+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41403+    pixel *dst          = (pixel *)_dst;
41404+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41405+
41406+    for (y = 0; y < height; y++) {
41407+        memcpy(dst, src, width * sizeof(pixel));
41408+        src += srcstride;
41409+        dst += dststride;
41410+    }
41411+}
41412+
41413+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41414+                                         int16_t *src2,
41415+                                         int height, intptr_t mx, intptr_t my, int width)
41416+{
41417+    int x, y;
41418+    pixel *src          = (pixel *)_src;
41419+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41420+    pixel *dst          = (pixel *)_dst;
41421+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41422+
41423+    int shift = 14  + 1 - BIT_DEPTH;
41424+#if BIT_DEPTH < 14
41425+    int offset = 1 << (shift - 1);
41426+#else
41427+    int offset = 0;
41428+#endif
41429+
41430+    for (y = 0; y < height; y++) {
41431+        for (x = 0; x < width; x++)
41432+            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
41433+        src  += srcstride;
41434+        dst  += dststride;
41435+        src2 += MAX_PB_SIZE;
41436+    }
41437+}
41438+
41439+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41440+                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
41441+{
41442+    int x, y;
41443+    pixel *src          = (pixel *)_src;
41444+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41445+    pixel *dst          = (pixel *)_dst;
41446+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41447+    int shift = denom + 14 - BIT_DEPTH;
41448+#if BIT_DEPTH < 14
41449+    int offset = 1 << (shift - 1);
41450+#else
41451+    int offset = 0;
41452+#endif
41453+
41454+    ox     = ox * (1 << (BIT_DEPTH - 8));
41455+    for (y = 0; y < height; y++) {
41456+        for (x = 0; x < width; x++)
41457+            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
41458+        src += srcstride;
41459+        dst += dststride;
41460+    }
41461+}
41462+
41463+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41464+                                           int16_t *src2,
41465+                                           int height, int denom, int wx0, int wx1,
41466+                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
41467+{
41468+    int x, y;
41469+    pixel *src          = (pixel *)_src;
41470+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41471+    pixel *dst          = (pixel *)_dst;
41472+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41473+
41474+    int shift = 14  + 1 - BIT_DEPTH;
41475+    int log2Wd = denom + shift - 1;
41476+
41477+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
41478+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
41479+    for (y = 0; y < height; y++) {
41480+        for (x = 0; x < width; x++) {
41481+            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
41482+        }
41483+        src  += srcstride;
41484+        dst  += dststride;
41485+        src2 += MAX_PB_SIZE;
41486+    }
41487+}
41488+
41489+////////////////////////////////////////////////////////////////////////////////
41490+//
41491+////////////////////////////////////////////////////////////////////////////////
41492+#define QPEL_FILTER(src, stride)                                               \
41493+    (filter[0] * src[x - 3 * stride] +                                         \
41494+     filter[1] * src[x - 2 * stride] +                                         \
41495+     filter[2] * src[x -     stride] +                                         \
41496+     filter[3] * src[x             ] +                                         \
41497+     filter[4] * src[x +     stride] +                                         \
41498+     filter[5] * src[x + 2 * stride] +                                         \
41499+     filter[6] * src[x + 3 * stride] +                                         \
41500+     filter[7] * src[x + 4 * stride])
41501+
41502+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
41503+                                  uint8_t *_src, ptrdiff_t _srcstride,
41504+                                  int height, intptr_t mx, intptr_t my, int width)
41505+{
41506+    int x, y;
41507+    pixel        *src       = (pixel*)_src;
41508+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41509+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
41510+    for (y = 0; y < height; y++) {
41511+        for (x = 0; x < width; x++)
41512+            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
41513+        src += srcstride;
41514+        dst += MAX_PB_SIZE;
41515+    }
41516+}
41517+
41518+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
41519+                                  uint8_t *_src, ptrdiff_t _srcstride,
41520+                                  int height, intptr_t mx, intptr_t my, int width)
41521+{
41522+    int x, y;
41523+    pixel        *src       = (pixel*)_src;
41524+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41525+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
41526+    for (y = 0; y < height; y++)  {
41527+        for (x = 0; x < width; x++)
41528+            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
41529+        src += srcstride;
41530+        dst += MAX_PB_SIZE;
41531+    }
41532+}
41533+
41534+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
41535+                                   uint8_t *_src,
41536+                                   ptrdiff_t _srcstride,
41537+                                   int height, intptr_t mx,
41538+                                   intptr_t my, int width)
41539+{
41540+    int x, y;
41541+    const int8_t *filter;
41542+    pixel *src = (pixel*)_src;
41543+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41544+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
41545+    int16_t *tmp = tmp_array;
41546+
41547+    src   -= QPEL_EXTRA_BEFORE * srcstride;
41548+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
41549+    for (y = 0; y < height + QPEL_EXTRA; y++) {
41550+        for (x = 0; x < width; x++)
41551+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
41552+        src += srcstride;
41553+        tmp += MAX_PB_SIZE;
41554+    }
41555+
41556+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
41557+    filter = ff_hevc_rpi_qpel_filters[my - 1];
41558+    for (y = 0; y < height; y++) {
41559+        for (x = 0; x < width; x++)
41560+            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
41561+        tmp += MAX_PB_SIZE;
41562+        dst += MAX_PB_SIZE;
41563+    }
41564+}
41565+
41566+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
41567+                                      uint8_t *_src, ptrdiff_t _srcstride,
41568+                                      int height, intptr_t mx, intptr_t my, int width)
41569+{
41570+    int x, y;
41571+    pixel        *src       = (pixel*)_src;
41572+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41573+    pixel *dst          = (pixel *)_dst;
41574+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41575+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
41576+    int shift = 14 - BIT_DEPTH;
41577+
41578+#if BIT_DEPTH < 14
41579+    int offset = 1 << (shift - 1);
41580+#else
41581+    int offset = 0;
41582+#endif
41583+
41584+    for (y = 0; y < height; y++) {
41585+        for (x = 0; x < width; x++)
41586+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
41587+        src += srcstride;
41588+        dst += dststride;
41589+    }
41590+}
41591+
41592+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41593+                                     int16_t *src2,
41594+                                     int height, intptr_t mx, intptr_t my, int width)
41595+{
41596+    int x, y;
41597+    pixel        *src       = (pixel*)_src;
41598+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41599+    pixel *dst          = (pixel *)_dst;
41600+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41601+
41602+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
41603+
41604+    int shift = 14  + 1 - BIT_DEPTH;
41605+#if BIT_DEPTH < 14
41606+    int offset = 1 << (shift - 1);
41607+#else
41608+    int offset = 0;
41609+#endif
41610+
41611+    for (y = 0; y < height; y++) {
41612+        for (x = 0; x < width; x++)
41613+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
41614+        src  += srcstride;
41615+        dst  += dststride;
41616+        src2 += MAX_PB_SIZE;
41617+    }
41618+}
41619+
41620+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
41621+                                     uint8_t *_src, ptrdiff_t _srcstride,
41622+                                     int height, intptr_t mx, intptr_t my, int width)
41623+{
41624+    int x, y;
41625+    pixel        *src       = (pixel*)_src;
41626+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41627+    pixel *dst          = (pixel *)_dst;
41628+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41629+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
41630+    int shift = 14 - BIT_DEPTH;
41631+
41632+#if BIT_DEPTH < 14
41633+    int offset = 1 << (shift - 1);
41634+#else
41635+    int offset = 0;
41636+#endif
41637+
41638+    for (y = 0; y < height; y++) {
41639+        for (x = 0; x < width; x++)
41640+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
41641+        src += srcstride;
41642+        dst += dststride;
41643+    }
41644+}
41645+
41646+
41647+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41648+                                     int16_t *src2,
41649+                                     int height, intptr_t mx, intptr_t my, int width)
41650+{
41651+    int x, y;
41652+    pixel        *src       = (pixel*)_src;
41653+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41654+    pixel *dst          = (pixel *)_dst;
41655+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41656+
41657+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
41658+
41659+    int shift = 14 + 1 - BIT_DEPTH;
41660+#if BIT_DEPTH < 14
41661+    int offset = 1 << (shift - 1);
41662+#else
41663+    int offset = 0;
41664+#endif
41665+
41666+    for (y = 0; y < height; y++) {
41667+        for (x = 0; x < width; x++)
41668+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
41669+        src  += srcstride;
41670+        dst  += dststride;
41671+        src2 += MAX_PB_SIZE;
41672+    }
41673+}
41674+
41675+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
41676+                                       uint8_t *_src, ptrdiff_t _srcstride,
41677+                                       int height, intptr_t mx, intptr_t my, int width)
41678+{
41679+    int x, y;
41680+    const int8_t *filter;
41681+    pixel *src = (pixel*)_src;
41682+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41683+    pixel *dst          = (pixel *)_dst;
41684+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41685+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
41686+    int16_t *tmp = tmp_array;
41687+    int shift =  14 - BIT_DEPTH;
41688+
41689+#if BIT_DEPTH < 14
41690+    int offset = 1 << (shift - 1);
41691+#else
41692+    int offset = 0;
41693+#endif
41694+
41695+    src   -= QPEL_EXTRA_BEFORE * srcstride;
41696+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
41697+    for (y = 0; y < height + QPEL_EXTRA; y++) {
41698+        for (x = 0; x < width; x++)
41699+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
41700+        src += srcstride;
41701+        tmp += MAX_PB_SIZE;
41702+    }
41703+
41704+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
41705+    filter = ff_hevc_rpi_qpel_filters[my - 1];
41706+
41707+    for (y = 0; y < height; y++) {
41708+        for (x = 0; x < width; x++)
41709+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
41710+        tmp += MAX_PB_SIZE;
41711+        dst += dststride;
41712+    }
41713+}
41714+
41715+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41716+                                      int16_t *src2,
41717+                                      int height, intptr_t mx, intptr_t my, int width)
41718+{
41719+    int x, y;
41720+    const int8_t *filter;
41721+    pixel *src = (pixel*)_src;
41722+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
41723+    pixel *dst          = (pixel *)_dst;
41724+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41725+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
41726+    int16_t *tmp = tmp_array;
41727+    int shift = 14 + 1 - BIT_DEPTH;
41728+#if BIT_DEPTH < 14
41729+    int offset = 1 << (shift - 1);
41730+#else
41731+    int offset = 0;
41732+#endif
41733+
41734+    src   -= QPEL_EXTRA_BEFORE * srcstride;
41735+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
41736+    for (y = 0; y < height + QPEL_EXTRA; y++) {
41737+        for (x = 0; x < width; x++)
41738+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
41739+        src += srcstride;
41740+        tmp += MAX_PB_SIZE;
41741+    }
41742+
41743+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
41744+    filter = ff_hevc_rpi_qpel_filters[my - 1];
41745+
41746+    for (y = 0; y < height; y++) {
41747+        for (x = 0; x < width; x++)
41748+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
41749+        tmp  += MAX_PB_SIZE;
41750+        dst  += dststride;
41751+        src2 += MAX_PB_SIZE;
41752+    }
41753+}
41754+
41755+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
41756+                                        uint8_t *_src, ptrdiff_t _srcstride,
41757+                                        int height, int denom, int wx, int ox,
41758+                                        intptr_t mx, intptr_t my, int width)
41759+{
41760+    int x, y;
41761+    pixel        *src       = (pixel*)_src;
41762+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41763+    pixel *dst          = (pixel *)_dst;
41764+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41765+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
41766+    int shift = denom + 14 - BIT_DEPTH;
41767+#if BIT_DEPTH < 14
41768+    int offset = 1 << (shift - 1);
41769+#else
41770+    int offset = 0;
41771+#endif
41772+
41773+    ox = ox * (1 << (BIT_DEPTH - 8));
41774+    for (y = 0; y < height; y++) {
41775+        for (x = 0; x < width; x++)
41776+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
41777+        src += srcstride;
41778+        dst += dststride;
41779+    }
41780+}
41781+
41782+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41783+                                       int16_t *src2,
41784+                                       int height, int denom, int wx0, int wx1,
41785+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
41786+{
41787+    int x, y;
41788+    pixel        *src       = (pixel*)_src;
41789+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41790+    pixel *dst          = (pixel *)_dst;
41791+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41792+
41793+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
41794+
41795+    int shift = 14  + 1 - BIT_DEPTH;
41796+    int log2Wd = denom + shift - 1;
41797+
41798+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
41799+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
41800+    for (y = 0; y < height; y++) {
41801+        for (x = 0; x < width; x++)
41802+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
41803+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
41804+        src  += srcstride;
41805+        dst  += dststride;
41806+        src2 += MAX_PB_SIZE;
41807+    }
41808+}
41809+
41810+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
41811+                                        uint8_t *_src, ptrdiff_t _srcstride,
41812+                                        int height, int denom, int wx, int ox,
41813+                                        intptr_t mx, intptr_t my, int width)
41814+{
41815+    int x, y;
41816+    pixel        *src       = (pixel*)_src;
41817+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41818+    pixel *dst          = (pixel *)_dst;
41819+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41820+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
41821+    int shift = denom + 14 - BIT_DEPTH;
41822+#if BIT_DEPTH < 14
41823+    int offset = 1 << (shift - 1);
41824+#else
41825+    int offset = 0;
41826+#endif
41827+
41828+    ox = ox * (1 << (BIT_DEPTH - 8));
41829+    for (y = 0; y < height; y++) {
41830+        for (x = 0; x < width; x++)
41831+            dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
41832+        src += srcstride;
41833+        dst += dststride;
41834+    }
41835+}
41836+
41837+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
41838+                                       int16_t *src2,
41839+                                       int height, int denom, int wx0, int wx1,
41840+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
41841+{
41842+    int x, y;
41843+    pixel        *src       = (pixel*)_src;
41844+    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
41845+    pixel *dst          = (pixel *)_dst;
41846+    ptrdiff_t dststride = _dststride / sizeof(pixel);
41847+
41848+    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
41849+
41850+    int shift = 14 + 1 - BIT_DEPTH;
41851+    int log2Wd = denom + shift - 1;
41852+
41853+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
41854+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
41855+    for (y = 0; y < height; y++) {
41856+        for (x = 0; x < width; x++)
41857+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
41858+                                    ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
41859+        src  += srcstride;
41860+        dst  += dststride;
41861+        src2 += MAX_PB_SIZE;
41862+    }
41863+}
41864+
+// Weighted uni-prediction luma (QPEL) interpolation, horizontal then
+// vertical: an 8-tap horizontal pass into a 16-bit intermediate buffer,
+// then an 8-tap vertical pass applying weight wx, a rounding offset and an
+// additive offset ox (scaled to the bit depth). denom sets the weighting
+// shift; mx/my select the fractional-sample filter phases.
+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
+                                         uint8_t *_src, ptrdiff_t _srcstride,
+                                         int height, int denom, int wx, int ox,
+                                         intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    // Horizontal pass also produces the QPEL_EXTRA rows the vertical taps
+    // need above and below the block.
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    // Vertical pass over the intermediate, then weight, round and clip.
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+    ox = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
41905+
+// Weighted bi-prediction luma (QPEL) interpolation, horizontal then
+// vertical: 8-tap horizontal pass into a 16-bit intermediate, 8-tap
+// vertical pass, then combine with the other reference (src2, 14-bit
+// intermediate, stride MAX_PB_SIZE) using weights wx0/wx1 and offsets
+// ox0/ox1, and clip to pixel range.
+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    const int8_t *filter;
+    pixel *src = (pixel*)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src   -= QPEL_EXTRA_BEFORE * srcstride;
+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
+    for (y = 0; y < height + QPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_qpel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            // Multiply instead of "<< log2Wd": (ox0 + ox1 + 1) can be
+            // negative and left-shifting a negative value is undefined
+            // behaviour in C; matches put_hevc_epel_bi_w_hv below.
+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
41945+
41946+////////////////////////////////////////////////////////////////////////////////
41947+//
41948+////////////////////////////////////////////////////////////////////////////////
+// 4-tap chroma (EPEL) filter evaluated at column x; relies on `filter` and
+// `x` being in the caller's scope. `stride` is 1 for horizontal filtering,
+// the line stride for vertical.
+#define EPEL_FILTER(src, stride)                                               \
+    (filter[0] * src[x - stride] +                                             \
+     filter[1] * src[x]          +                                             \
+     filter[2] * src[x + stride] +                                             \
+     filter[3] * src[x + 2 * stride])
41954+
+// EPEL horizontal 4-tap filter into the 16-bit intermediate buffer
+// (normalized by >> (BIT_DEPTH - 8)); dst stride is MAX_PB_SIZE.
+static void FUNC(put_hevc_epel_h)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
+}
41970+
+// EPEL vertical 4-tap filter into the 16-bit intermediate buffer; filter
+// phase selected by my, taps step by one source line.
+static void FUNC(put_hevc_epel_v)(int16_t *dst,
+                                  uint8_t *_src, ptrdiff_t _srcstride,
+                                  int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        dst += MAX_PB_SIZE;
+    }
+}
41987+
+// EPEL 2-D (horizontal then vertical) 4-tap filter into the 16-bit
+// intermediate buffer. The horizontal pass covers EPEL_EXTRA extra rows so
+// the vertical taps have context above and below the block.
+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
+                                   uint8_t *_src, ptrdiff_t _srcstride,
+                                   int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    // Horizontal pass into tmp.
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    // Vertical pass over tmp; >> 6 renormalizes after the second filter.
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
+        tmp += MAX_PB_SIZE;
+        dst += MAX_PB_SIZE;
+    }
+}
42018+
+// Uni-prediction EPEL horizontal filter: 4-tap filter, round and clip
+// straight to pixels (no intermediate buffer).
+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
+}
42042+
+// Bi-prediction EPEL horizontal filter: 4-tap filter, average with the
+// other reference (src2, 14-bit intermediate, stride MAX_PB_SIZE), round
+// and clip to pixels.
+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        }
+        dst  += dststride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
+    }
+}
42069+
+// Uni-prediction EPEL vertical filter: 4-tap filter (taps step by one
+// source line), round and clip straight to pixels.
+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
+        src += srcstride;
+        dst += dststride;
+    }
+}
42093+
+// Bi-prediction EPEL vertical filter: 4-tap filter, average with the other
+// reference (src2, 14-bit intermediate), round and clip to pixels.
+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                     int16_t *src2,
+                                     int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
+        dst  += dststride;
+        src  += srcstride;
+        src2 += MAX_PB_SIZE;
+    }
+}
42119+
+// Uni-prediction EPEL 2-D filter: horizontal 4-tap pass into a 16-bit
+// intermediate (with EPEL_EXTRA context rows), vertical 4-tap pass, then
+// round and clip to pixels.
+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    // Horizontal pass into tmp.
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    // Vertical pass over tmp, then round and clip.
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
42157+
+// Bi-prediction EPEL 2-D filter: horizontal then vertical 4-tap pass via a
+// 16-bit intermediate, averaged with the other reference (src2, 14-bit
+// intermediate), rounded and clipped to pixels.
+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                      int16_t *src2,
+                                      int height, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    // Horizontal pass into tmp.
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    // Vertical pass over tmp, combine with src2, round and clip.
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_epel_filters[my - 1];
+
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
42197+
+// Weighted uni-prediction EPEL horizontal filter: 4-tap filter, apply
+// weight wx with rounding (shift = denom + 14 - BIT_DEPTH), add offset ox
+// (scaled to the bit depth) and clip.
+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        }
+        dst += dststride;
+        src += srcstride;
+    }
+}
42223+
+// Weighted bi-prediction EPEL horizontal filter: 4-tap filter, combine with
+// the other reference (src2, 14-bit intermediate) using weights wx0/wx1 and
+// offsets ox0/ox1, round and clip to pixels.
+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            // Multiply instead of "<< log2Wd": (ox0 + ox1 + 1) can be
+            // negative and left-shifting a negative value is undefined
+            // behaviour in C; matches put_hevc_epel_bi_w_hv below.
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
42249+
+// Weighted uni-prediction EPEL vertical filter: 4-tap filter (taps step by
+// one source line), apply weight wx with rounding, add offset ox (scaled to
+// the bit depth) and clip.
+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
+        }
+        dst += dststride;
+        src += srcstride;
+    }
+}
42275+
+// Weighted bi-prediction EPEL vertical filter: 4-tap filter, combine with
+// the other reference (src2, 14-bit intermediate) using weights wx0/wx1 and
+// offsets ox0/ox1, round and clip to pixels.
+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                       int16_t *src2,
+                                       int height, int denom, int wx0, int wx1,
+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            // Multiply instead of "<< log2Wd": (ox0 + ox1 + 1) can be
+            // negative and left-shifting a negative value is undefined
+            // behaviour in C; matches put_hevc_epel_bi_w_hv below.
+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        src  += srcstride;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
42301+
+// Weighted uni-prediction EPEL 2-D filter: horizontal then vertical 4-tap
+// pass via a 16-bit intermediate, then apply weight wx, rounding offset and
+// additive offset ox (scaled to the bit depth) and clip.
+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+    int offset = 1 << (shift - 1);
+#else
+    int offset = 0;
+#endif
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    // Horizontal pass into tmp (with EPEL_EXTRA context rows).
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    // Vertical pass over tmp, then weight, round, offset and clip.
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_epel_filters[my - 1];
+
+    ox     = ox * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
+        tmp += MAX_PB_SIZE;
+        dst += dststride;
+    }
+}
42340+
+// Weighted bi-prediction EPEL 2-D filter: horizontal then vertical 4-tap
+// pass via a 16-bit intermediate, combine with the other reference (src2,
+// 14-bit intermediate) using weights wx0/wx1 and offsets ox0/ox1, round and
+// clip to pixels.
+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+                                        int16_t *src2,
+                                        int height, int denom, int wx0, int wx1,
+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
+{
+    int x, y;
+    pixel *src = (pixel *)_src;
+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+    pixel *dst          = (pixel *)_dst;
+    ptrdiff_t dststride = _dststride / sizeof(pixel);
+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
+    int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
+    int16_t *tmp = tmp_array;
+    int shift = 14 + 1 - BIT_DEPTH;
+    int log2Wd = denom + shift - 1;
+
+    src -= EPEL_EXTRA_BEFORE * srcstride;
+
+    // Horizontal pass into tmp (with EPEL_EXTRA context rows).
+    for (y = 0; y < height + EPEL_EXTRA; y++) {
+        for (x = 0; x < width; x++)
+            tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
+        src += srcstride;
+        tmp += MAX_PB_SIZE;
+    }
+
+    tmp      = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
+    filter = ff_hevc_rpi_epel_filters[my - 1];
+
+    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
+    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++)
+            // "* (1 << log2Wd)" rather than "<<" avoids UB when the offset
+            // sum is negative.
+            dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
+                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
+        tmp  += MAX_PB_SIZE;
+        dst  += dststride;
+        src2 += MAX_PB_SIZE;
+    }
+}
42380+
+// Deblocking pixel accessors. P* are on the p side of the edge, Q* on the
+// q side; xstride steps across the edge, ystride along it.
+// line zero
+#define P3 pix[-4 * xstride]
+#define P2 pix[-3 * xstride]
+#define P1 pix[-2 * xstride]
+#define P0 pix[-1 * xstride]
+#define Q0 pix[0 * xstride]
+#define Q1 pix[1 * xstride]
+#define Q2 pix[2 * xstride]
+#define Q3 pix[3 * xstride]
+
+// line three. used only for deblocking decision
+#define TP3 pix[-4 * xstride + 3 * ystride]
+#define TP2 pix[-3 * xstride + 3 * ystride]
+#define TP1 pix[-2 * xstride + 3 * ystride]
+#define TP0 pix[-1 * xstride + 3 * ystride]
+#define TQ0 pix[0  * xstride + 3 * ystride]
+#define TQ1 pix[1  * xstride + 3 * ystride]
+#define TQ2 pix[2  * xstride + 3 * ystride]
+#define TQ3 pix[3  * xstride + 3 * ystride]
42400+
+// HEVC luma deblocking filter over two 4-line segments of one edge.
+// The on/off and strong/normal decisions use only lines 0 and 3 of each
+// segment (P*/Q* and TP*/TQ* macros); _tc, _no_p and _no_q carry one entry
+// per segment. beta and tc are scaled up from their 8-bit-depth values.
+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
+                                        ptrdiff_t _xstride, ptrdiff_t _ystride,
+                                        int beta, int *_tc,
+                                        uint8_t *_no_p, uint8_t *_no_q)
+{
+    int d, j;
+    pixel *pix        = (pixel *)_pix;
+    ptrdiff_t xstride = _xstride / sizeof(pixel);
+    ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+    beta <<= BIT_DEPTH - 8;
+
+    for (j = 0; j < 2; j++) {
+        // Local activity measures on lines 0 and 3 of this segment.
+        const int dp0  = abs(P2  - 2 * P1  + P0);
+        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
+        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
+        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
+        const int d0   = dp0 + dq0;
+        const int d3   = dp3 + dq3;
+        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
+        const int no_p = _no_p[j];
+        const int no_q = _no_q[j];
+
+        if (d0 + d3 >= beta) {
+            // Too much activity: leave this 4-line segment unfiltered.
+            pix += 4 * ystride;
+            continue;
+        } else {
+            const int beta_3 = beta >> 3;
+            const int beta_2 = beta >> 2;
+            const int tc25   = ((tc * 5 + 1) >> 1);
+
+            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
+                // strong filtering
+                const int tc2 = tc << 1;
+                for (d = 0; d < 4; d++) {
+                    const int p3 = P3;
+                    const int p2 = P2;
+                    const int p1 = P1;
+                    const int p0 = P0;
+                    const int q0 = Q0;
+                    const int q1 = Q1;
+                    const int q2 = Q2;
+                    const int q3 = Q3;
+                    if (!no_p) {
+                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+                    }
+                    if (!no_q) {
+                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+                    }
+                    pix += ystride;
+                }
+            } else { // normal filtering
+                int nd_p = 1;
+                int nd_q = 1;
+                const int tc_2 = tc >> 1;
+                // nd_p/nd_q == 2 enables the additional P1/Q1 correction.
+                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+                    nd_p = 2;
+                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+                    nd_q = 2;
+
+                for (d = 0; d < 4; d++) {
+                    const int p2 = P2;
+                    const int p1 = P1;
+                    const int p0 = P0;
+                    const int q0 = Q0;
+                    const int q1 = Q1;
+                    const int q2 = Q2;
+                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+                    if (abs(delta0) < 10 * tc) {
+                        delta0 = av_clip(delta0, -tc, tc);
+                        if (!no_p)
+                            P0 = av_clip_pixel(p0 + delta0);
+                        if (!no_q)
+                            Q0 = av_clip_pixel(q0 - delta0);
+                        if (!no_p && nd_p > 1) {
+                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+                            P1 = av_clip_pixel(p1 + deltap1);
+                        }
+                        if (!no_q && nd_q > 1) {
+                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+                            Q1 = av_clip_pixel(q1 + deltaq1);
+                        }
+                    }
+                    pix += ystride;
+                }
+            }
+        }
+    }
+}
42496+
+// HEVC chroma deblocking filter over two 4-line segments of one edge:
+// a single clipped delta applied to P0/Q0. tc <= 0 disables a segment;
+// _no_p/_no_q suppress writes on the respective side.
+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
+                                          ptrdiff_t _ystride, int *_tc,
+                                          uint8_t *_no_p, uint8_t *_no_q)
+{
+    int d, j, no_p, no_q;
+    pixel *pix        = (pixel *)_pix;
+    ptrdiff_t xstride = _xstride / sizeof(pixel);
+    ptrdiff_t ystride = _ystride / sizeof(pixel);
+
+    for (j = 0; j < 2; j++) {
+        const int tc = _tc[j] << (BIT_DEPTH - 8);
+        if (tc <= 0) {
+            pix += 4 * ystride;
+            continue;
+        }
+        no_p = _no_p[j];
+        no_q = _no_q[j];
+
+        for (d = 0; d < 4; d++) {
+            int delta0;
+            const int p1 = P1;
+            const int p0 = P0;
+            const int q0 = Q0;
+            const int q1 = Q1;
+            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+            if (!no_p)
+                P0 = av_clip_pixel(p0 + delta0);
+            if (!no_q)
+                Q0 = av_clip_pixel(q0 - delta0);
+            pix += ystride;
+        }
+    }
+}
42530+
+// Horizontal-edge chroma deblock: taps across the edge step by one line
+// (stride), positions along the edge step by one pixel.
+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+                                            int32_t *tc, uint8_t *no_p,
+                                            uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
+}
42537+
+// Vertical-edge chroma deblock: taps across the edge step by one pixel,
+// positions along the edge step by one line (stride).
+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
+                                            int32_t *tc, uint8_t *no_p,
+                                            uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
+}
42544+
+// Horizontal-edge luma deblock: taps across the edge step by one line
+// (stride), positions along the edge step by one pixel.
+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+                                          int beta, int32_t *tc, uint8_t *no_p,
+                                          uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
+                                beta, tc, no_p, no_q);
+}
42552+
+// Vertical-edge luma deblock: taps across the edge step by one pixel,
+// positions along the edge step by one line (stride).
+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+                                          int beta, int32_t *tc, uint8_t *no_p,
+                                          uint8_t *no_q)
+{
+    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
+                                beta, tc, no_p, no_q);
+}
42560+
42561+#undef P3
42562+#undef P2
42563+#undef P1
42564+#undef P0
42565+#undef Q0
42566+#undef Q1
42567+#undef Q2
42568+#undef Q3
42569+
42570+#undef TP3
42571+#undef TP2
42572+#undef TP1
42573+#undef TP0
42574+#undef TQ0
42575+#undef TQ1
42576+#undef TQ2
42577+#undef TQ3
42578+
+// Split-pointer variants of the deblocking accessors: the p side is read
+// through pix_l and the q side through pix_r (used by the *_luma2 filter
+// where the two sides are in separate buffers).
+// line zero
+#define P3 pix_l[0 * xstride]
+#define P2 pix_l[1 * xstride]
+#define P1 pix_l[2 * xstride]
+#define P0 pix_l[3 * xstride]
+#define Q0 pix_r[0 * xstride]
+#define Q1 pix_r[1 * xstride]
+#define Q2 pix_r[2 * xstride]
+#define Q3 pix_r[3 * xstride]
+
+// line three. used only for deblocking decision
+#define TP3 pix_l[0 * xstride + 3 * ystride]
+#define TP2 pix_l[1 * xstride + 3 * ystride]
+#define TP1 pix_l[2 * xstride + 3 * ystride]
+#define TP0 pix_l[3 * xstride + 3 * ystride]
+#define TQ0 pix_r[0 * xstride + 3 * ystride]
+#define TQ1 pix_r[1 * xstride + 3 * ystride]
+#define TQ2 pix_r[2 * xstride + 3 * ystride]
+#define TQ3 pix_r[3 * xstride + 3 * ystride]
42598+
42599+// This is identical to hevc_loop_filter_luma except that the P/Q
42600+// components are on separate pointers
42601+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
42602+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
42603+                                 uint8_t * _pix_l)
42604+{
42605+    int d, j;
42606+    pixel *pix_l        = (pixel *)_pix_l;
42607+    pixel *pix_r        = (pixel *)_pix_r;
42608+    const ptrdiff_t xstride = 1;
42609+    const ptrdiff_t ystride = _stride / sizeof(pixel);
42610+
42611+    beta <<= BIT_DEPTH - 8;
42612+
42613+    for (j = 0; j < 2; j++) {
42614+        const int dp0  = abs(P2  - 2 * P1  + P0);
42615+        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
42616+        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
42617+        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
42618+        const int d0   = dp0 + dq0;
42619+        const int d3   = dp3 + dq3;
42620+        const int tc   = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8);
42621+        const int no_p = no_f & 1;
42622+        const int no_q = no_f & 2;
42623+
42624+        if (d0 + d3 >= beta) {
42625+            pix_l += 4 * ystride;
42626+            pix_r += 4 * ystride;
42627+            continue;
42628+        } else {
42629+            const int beta_3 = beta >> 3;
42630+            const int beta_2 = beta >> 2;
42631+            const int tc25   = ((tc * 5 + 1) >> 1);
42632+
42633+            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
42634+                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
42635+                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
42636+                // strong filtering
42637+                const int tc2 = tc << 1;
42638+                for (d = 0; d < 4; d++) {
42639+                    const int p3 = P3;
42640+                    const int p2 = P2;
42641+                    const int p1 = P1;
42642+                    const int p0 = P0;
42643+                    const int q0 = Q0;
42644+                    const int q1 = Q1;
42645+                    const int q2 = Q2;
42646+                    const int q3 = Q3;
42647+                    if (!no_p) {
42648+                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
42649+                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
42650+                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
42651+                    }
42652+                    if (!no_q) {
42653+                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
42654+                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
42655+                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
42656+                    }
42657+                    pix_l += ystride;
42658+                    pix_r += ystride;
42659+                }
42660+            } else { // normal filtering
42661+                int nd_p = 1;
42662+                int nd_q = 1;
42663+                const int tc_2 = tc >> 1;
42664+                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
42665+                    nd_p = 2;
42666+                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
42667+                    nd_q = 2;
42668+
42669+                for (d = 0; d < 4; d++) {
42670+                    const int p2 = P2;
42671+                    const int p1 = P1;
42672+                    const int p0 = P0;
42673+                    const int q0 = Q0;
42674+                    const int q1 = Q1;
42675+                    const int q2 = Q2;
42676+                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
42677+                    if (abs(delta0) < 10 * tc) {
42678+                        delta0 = av_clip(delta0, -tc, tc);
42679+                        if (!no_p)
42680+                            P0 = av_clip_pixel(p0 + delta0);
42681+                        if (!no_q)
42682+                            Q0 = av_clip_pixel(q0 - delta0);
42683+                        if (!no_p && nd_p > 1) {
42684+                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
42685+                            P1 = av_clip_pixel(p1 + deltap1);
42686+                        }
42687+                        if (!no_q && nd_q > 1) {
42688+                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
42689+                            Q1 = av_clip_pixel(q1 + deltaq1);
42690+                        }
42691+                    }
42692+                    pix_l += ystride;
42693+                    pix_r += ystride;
42694+                }
42695+            }
42696+        }
42697+    }
42698+}
42699+
42700+static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
42701+                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
42702+{
42703+    // Just call the non-2 function having massaged the parameters
42704+    int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16};
42705+    uint8_t no_p[2] = {no_f & 1, no_f & 1};
42706+    uint8_t no_q[2] = {no_f & 2, no_f & 2};
42707+    FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
42708+}
42709+
42710+#undef TP3
42711+#undef TP2
42712+#undef TP1
42713+#undef TP0
42714+#undef TQ0
42715+#undef TQ1
42716+#undef TQ2
42717+#undef TQ3
42718+
42719+#undef P3
42720+#undef P2
42721+#undef P1
42722+#undef P0
42723+#undef Q0
42724+#undef Q1
42725+#undef Q2
42726+#undef Q3
42727+
42728+#define P1 pix_l[0 * xstride]
42729+#define P0 pix_l[1 * xstride]
42730+#define Q0 pix_r[0 * xstride]
42731+#define Q1 pix_r[1 * xstride]
42732+
42733+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
42734+                                          ptrdiff_t _ystride, const int32_t *_tc,
42735+                                          const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
42736+{
42737+    int d, j, no_p, no_q;
42738+    pixel *pix_l        = (pixel *)_pix_l;
42739+    pixel *pix_r        = (pixel *)_pix_r;
42740+    ptrdiff_t xstride = _xstride / sizeof(pixel);
42741+    ptrdiff_t ystride = _ystride / sizeof(pixel);
42742+
42743+    for (j = 0; j < 2; j++) {
42744+        const int tc = _tc[j] << (BIT_DEPTH - 8);
42745+        if (tc <= 0) {
42746+            pix_l += 4 * ystride;
42747+            pix_r += 4 * ystride;
42748+            continue;
42749+        }
42750+        no_p = _no_p[j];
42751+        no_q = _no_q[j];
42752+
42753+        for (d = 0; d < 4; d++) {
42754+            int delta0;
42755+            const int p1 = P1;
42756+            const int p0 = P0;
42757+            const int q0 = Q0;
42758+            const int q1 = Q1;
42759+            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
42760+            if (!no_p)
42761+                P0 = av_clip_pixel(p0 + delta0);
42762+            if (!no_q)
42763+                Q0 = av_clip_pixel(q0 - delta0);
42764+            pix_l += ystride;
42765+            pix_r += ystride;
42766+        }
42767+    }
42768+}
42769+
42770+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
42771+                                 unsigned int no_f)
42772+{
42773+    uint8_t no_p[2] = {no_f & 1, no_f & 2};
42774+    uint8_t no_q[2] = {no_f & 4, no_f & 8};
42775+    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
42776+    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
42777+    FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
42778+}
42779+
42780+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
42781+                                 uint8_t * src_l,
42782+                                 unsigned int no_f)
42783+{
42784+    uint8_t no_p[2] = {no_f & 1, no_f & 2};
42785+    uint8_t no_q[2] = {no_f & 4, no_f & 8};
42786+    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
42787+    FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
42788+    FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
42789+}
42790+
42791+#undef P1
42792+#undef P0
42793+#undef Q0
42794+#undef Q1
42795+
42796--- /dev/null
42797+++ b/libavcodec/rpi_hevcpred.c
42798@@ -0,0 +1,161 @@
42799+/*
42800+ * HEVC video Decoder
42801+ *
42802+ * Copyright (C) 2012 - 2013 Guillaume Martres
42803+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
42804+ *
42805+ * This file is part of FFmpeg.
42806+ *
42807+ * FFmpeg is free software; you can redistribute it and/or
42808+ * modify it under the terms of the GNU Lesser General Public
42809+ * License as published by the Free Software Foundation; either
42810+ * version 2.1 of the License, or (at your option) any later version.
42811+ *
42812+ * FFmpeg is distributed in the hope that it will be useful,
42813+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
42814+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
42815+ * Lesser General Public License for more details.
42816+ *
42817+ * You should have received a copy of the GNU Lesser General Public
42818+ * License along with FFmpeg; if not, write to the Free Software
42819+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
42820+ */
42821+
42822+#include "rpi_hevcdec.h"
42823+
42824+#include "rpi_hevcpred.h"
42825+#if (ARCH_ARM)
42826+#include "arm/rpi_hevcpred_arm.h"
42827+#endif
42828+
42829+#define PRED_C 0
42830+#define BIT_DEPTH 8
42831+#include "rpi_hevcpred_template.c"
42832+#undef BIT_DEPTH
42833+
42834+#define BIT_DEPTH 9
42835+#include "rpi_hevcpred_template.c"
42836+#undef BIT_DEPTH
42837+
42838+#define BIT_DEPTH 10
42839+#include "rpi_hevcpred_template.c"
42840+#undef BIT_DEPTH
42841+
42842+#define BIT_DEPTH 12
42843+#include "rpi_hevcpred_template.c"
42844+#undef BIT_DEPTH
42845+#undef PRED_C
42846+
42847+#define PRED_C 1
42848+#define BIT_DEPTH 8
42849+#include "rpi_hevcpred_template.c"
42850+#undef BIT_DEPTH
42851+
42852+#define BIT_DEPTH 9
42853+#include "rpi_hevcpred_template.c"
42854+#undef BIT_DEPTH
42855+
42856+#define BIT_DEPTH 10
42857+#include "rpi_hevcpred_template.c"
42858+#undef BIT_DEPTH
42859+
42860+#define BIT_DEPTH 12
42861+#include "rpi_hevcpred_template.c"
42862+#undef BIT_DEPTH
42863+#undef PRED_C
42864+
42865+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
42866+{
42867+#undef FUNC
42868+#define FUNC(a, depth) a ## _ ## depth
42869+
42870+#undef FUNCC
42871+#define FUNCC(a, depth) a ## _ ## depth ## _c
42872+
42873+#define HEVC_PRED_Y(depth)                                \
42874+    hpc->intra_pred      = FUNC(intra_pred, depth);     \
42875+    hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
42876+    hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
42877+    hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
42878+    hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
42879+    hpc->pred_planar[0]  = FUNC(pred_planar_0, depth);  \
42880+    hpc->pred_planar[1]  = FUNC(pred_planar_1, depth);  \
42881+    hpc->pred_planar[2]  = FUNC(pred_planar_2, depth);  \
42882+    hpc->pred_planar[3]  = FUNC(pred_planar_3, depth);  \
42883+    hpc->pred_dc[0]      = FUNC(pred_dc_0, depth);      \
42884+    hpc->pred_dc[1]      = FUNC(pred_dc_1, depth);      \
42885+    hpc->pred_dc[2]      = FUNC(pred_dc_2, depth);      \
42886+    hpc->pred_dc[3]      = FUNC(pred_dc_3, depth);      \
42887+    hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
42888+    hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
42889+    hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
42890+    hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
42891+    hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
42892+    hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
42893+    hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
42894+    hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
42895+    hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
42896+    hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
42897+    hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
42898+    hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
42899+    hpc->pred_dc0[0]     = FUNC(pred_dc0_0, depth);     \
42900+    hpc->pred_dc0[1]     = FUNC(pred_dc0_1, depth);     \
42901+    hpc->pred_dc0[2]     = FUNC(pred_dc0_2, depth);     \
42902+    hpc->pred_dc0[3]     = FUNC(pred_dc0_3, depth);
42903+
42904+#define HEVC_PRED_C(depth)                                \
42905+    hpc->intra_pred_c      = FUNCC(intra_pred, depth);     \
42906+	hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
42907+	hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
42908+	hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
42909+	hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
42910+    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);  \
42911+    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);  \
42912+    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);  \
42913+    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);  \
42914+    hpc->pred_dc_c[0]      = FUNCC(pred_dc_0, depth);      \
42915+    hpc->pred_dc_c[1]      = FUNCC(pred_dc_1, depth);      \
42916+    hpc->pred_dc_c[2]      = FUNCC(pred_dc_2, depth);      \
42917+    hpc->pred_dc_c[3]      = FUNCC(pred_dc_3, depth);      \
42918+    hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
42919+    hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
42920+    hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
42921+    hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
42922+    hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
42923+    hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
42924+    hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
42925+    hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
42926+    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
42927+    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
42928+    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
42929+    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
42930+    hpc->pred_dc0_c[0]     = FUNCC(pred_dc0_0, depth);     \
42931+    hpc->pred_dc0_c[1]     = FUNCC(pred_dc0_1, depth);     \
42932+    hpc->pred_dc0_c[2]     = FUNCC(pred_dc0_2, depth);     \
42933+    hpc->pred_dc0_c[3]     = FUNCC(pred_dc0_3, depth);
42934+
42935+#define HEVC_PRED(depth) \
42936+    HEVC_PRED_Y(depth); \
42937+    HEVC_PRED_C(depth);
42938+
42939+    switch (bit_depth) {
42940+    case 9:
42941+        HEVC_PRED(9);
42942+        break;
42943+    case 10:
42944+        HEVC_PRED(10);
42945+        break;
42946+    case 12:
42947+        HEVC_PRED(12);
42948+        break;
42949+    default:
42950+        HEVC_PRED(8);
42951+        break;
42952+    }
42953+
42954+#if (ARCH_ARM)
42955+    ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
42956+#elif (ARCH_MIPS)
42957+    ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
42958+#endif
42959+}
42960--- /dev/null
42961+++ b/libavcodec/rpi_hevcpred.h
42962@@ -0,0 +1,123 @@
42963+/*
42964+ * HEVC video Decoder
42965+ *
42966+ * Copyright (C) 2012 - 2013 Guillaume Martres
42967+ *
42968+ * This file is part of FFmpeg.
42969+ *
42970+ * FFmpeg is free software; you can redistribute it and/or
42971+ * modify it under the terms of the GNU Lesser General Public
42972+ * License as published by the Free Software Foundation; either
42973+ * version 2.1 of the License, or (at your option) any later version.
42974+ *
42975+ * FFmpeg is distributed in the hope that it will be useful,
42976+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
42977+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
42978+ * Lesser General Public License for more details.
42979+ *
42980+ * You should have received a copy of the GNU Lesser General Public
42981+ * License along with FFmpeg; if not, write to the Free Software
42982+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
42983+ */
42984+
42985+#ifndef AVCODEC_RPI_HEVCPRED_H
42986+#define AVCODEC_RPI_HEVCPRED_H
42987+
42988+#include <stddef.h>
42989+#include <stdint.h>
42990+#include "config.h"
42991+
42992+struct HEVCRpiContext;
42993+struct HEVCRpiLocalContext;
42994+
42995+enum IntraPredMode {
42996+    INTRA_PLANAR = 0,
42997+    INTRA_DC,
42998+    INTRA_ANGULAR_2,
42999+    INTRA_ANGULAR_3,
43000+    INTRA_ANGULAR_4,
43001+    INTRA_ANGULAR_5,
43002+    INTRA_ANGULAR_6,
43003+    INTRA_ANGULAR_7,
43004+    INTRA_ANGULAR_8,
43005+    INTRA_ANGULAR_9,
43006+    INTRA_ANGULAR_10,
43007+    INTRA_ANGULAR_11,
43008+    INTRA_ANGULAR_12,
43009+    INTRA_ANGULAR_13,
43010+    INTRA_ANGULAR_14,
43011+    INTRA_ANGULAR_15,
43012+    INTRA_ANGULAR_16,
43013+    INTRA_ANGULAR_17,
43014+    INTRA_ANGULAR_18,
43015+    INTRA_ANGULAR_19,
43016+    INTRA_ANGULAR_20,
43017+    INTRA_ANGULAR_21,
43018+    INTRA_ANGULAR_22,
43019+    INTRA_ANGULAR_23,
43020+    INTRA_ANGULAR_24,
43021+    INTRA_ANGULAR_25,
43022+    INTRA_ANGULAR_26,
43023+    INTRA_ANGULAR_27,
43024+    INTRA_ANGULAR_28,
43025+    INTRA_ANGULAR_29,
43026+    INTRA_ANGULAR_30,
43027+    INTRA_ANGULAR_31,
43028+    INTRA_ANGULAR_32,
43029+    INTRA_ANGULAR_33,
43030+    INTRA_ANGULAR_34,
43031+};
43032+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
43033+#define INTRA_ANGULAR_VERTICAL   INTRA_ANGULAR_26
43034+
43035+typedef void intra_filter_fn_t(
43036+        uint8_t * const left, uint8_t * const top,
43037+        const unsigned int req, const unsigned int avail,
43038+        const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur,
43039+        const unsigned int stride,
43040+        const unsigned int top_right_size, const unsigned int down_left_size);
43041+
43042+typedef struct HEVCRpiPredContext {
43043+    void (*intra_pred)(const struct HEVCRpiContext * const s,
43044+                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
43045+                          const unsigned int avail, const unsigned int log2_size);
43046+
43047+    intra_filter_fn_t *intra_filter[4];
43048+    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
43049+                           const uint8_t *left, ptrdiff_t stride);
43050+    void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
43051+                    ptrdiff_t stride);
43052+    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
43053+                            const uint8_t *left, ptrdiff_t stride,
43054+                            int mode);
43055+    void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
43056+                            const uint8_t *left, ptrdiff_t stride,
43057+                            int mode);
43058+    void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
43059+                            const uint8_t *left, ptrdiff_t stride,
43060+                            int mode);
43061+    void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride);
43062+
43063+    void (*intra_pred_c)(const struct HEVCRpiContext * const s,
43064+                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
43065+                          const unsigned int avail, const unsigned int log2_size);
43066+    intra_filter_fn_t *intra_filter_c[4];
43067+    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
43068+                           const uint8_t *left, ptrdiff_t stride);
43069+    void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
43070+                    ptrdiff_t stride);
43071+    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
43072+                            const uint8_t *left, ptrdiff_t stride,
43073+                            int mode);
43074+    void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
43075+                            const uint8_t *left, ptrdiff_t stride,
43076+                            int mode);
43077+    void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
43078+                            const uint8_t *left, ptrdiff_t stride,
43079+                            int mode);
43080+    void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
43081+} HEVCRpiPredContext;
43082+
43083+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
43084+
43085+#endif /* AVCODEC_RPI_HEVCPRED_H */
43086--- /dev/null
43087+++ b/libavcodec/rpi_hevcpred_template.c
43088@@ -0,0 +1,1407 @@
43089+/*
43090+ * HEVC video decoder
43091+ *
43092+ * Copyright (C) 2012 - 2013 Guillaume Martres
43093+ *
43094+ * This file is part of FFmpeg.
43095+ *
43096+ * FFmpeg is free software; you can redistribute it and/or
43097+ * modify it under the terms of the GNU Lesser General Public
43098+ * License as published by the Free Software Foundation; either
43099+ * version 2.1 of the License, or (at your option) any later version.
43100+ *
43101+ * FFmpeg is distributed in the hope that it will be useful,
43102+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
43103+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
43104+ * Lesser General Public License for more details.
43105+ *
43106+ * You should have received a copy of the GNU Lesser General Public
43107+ * License along with FFmpeg; if not, write to the Free Software
43108+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
43109+ */
43110+
43111+#include "config.h"
43112+#include "libavutil/pixdesc.h"
43113+#include "libavutil/rpi_sand_fns.h"
43114+#include "bit_depth_template.c"
43115+
43116+#include "rpi_hevcdec.h"
43117+#include "rpi_hevcpred.h"
43118+
43119+#define DUMP_PRED 0
43120+
43121+#define POS(x, y) src[(x) + stride * (y)]
43122+
43123+// INCLUDED_ONCE defined at EOF
43124+#ifndef INCLUDED_ONCE
43125+typedef uint8_t (* c8_dst_ptr_t)[2];
43126+typedef const uint8_t (* c8_src_ptr_t)[2];
43127+typedef uint16_t (* c16_dst_ptr_t)[2];
43128+typedef const uint16_t (* c16_src_ptr_t)[2];
43129+
43130+// *** On ARM make these NEON registers
43131+typedef struct pixel4_16 {
43132+    uint16_t x[4];
43133+} pixel4_16;
43134+typedef struct pixel4_32 {
43135+    uint32_t x[4];
43136+} pixel4_32;
43137+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
43138+{
43139+    pixel4_16 t = {{x, x, x, x}};
43140+    return t;
43141+}
43142+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
43143+{
43144+    pixel4_32 t = {{x, x, x, x}};
43145+    return t;
43146+}
43147+#endif
43148+
43149+#if PRED_C
43150+// For chroma we double pixel size so we copy pairs
43151+#undef pixel
43152+#undef pixel2
43153+#undef pixel4
43154+#undef dctcoef
43155+#undef INIT_CLIP
43156+#undef no_rnd_avg_pixel4
43157+#undef rnd_avg_pixel4
43158+#undef AV_RN2P
43159+#undef AV_RN4P
43160+#undef AV_RN4PA
43161+#undef AV_WN2P
43162+#undef AV_WN4P
43163+#undef AV_WN4PA
43164+#undef CLIP
43165+#undef FUNC
43166+#undef FUNCC
43167+#undef av_clip_pixel
43168+#undef PIXEL_SPLAT_X4
43169+
43170+#if BIT_DEPTH == 8
43171+#define pixel uint16_t
43172+#define pixel4 pixel4_16
43173+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
43174+#define cpel uint8_t
43175+#define c_src_ptr_t  c8_src_ptr_t
43176+#define c_dst_ptr_t  c8_dst_ptr_t
43177+#else
43178+#define pixel uint32_t
43179+#define pixel4 pixel4_32
43180+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
43181+#define cpel uint16_t
43182+#define c_src_ptr_t c16_src_ptr_t
43183+#define c_dst_ptr_t c16_dst_ptr_t
43184+#endif
43185+#define AV_RN4P(p) (*(pixel4*)(p))
43186+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
43187+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
43188+#endif
43189+
43190+
43191+// Get PW prior to horrid PRED_C trickery
43192+#if BIT_DEPTH == 8
43193+#define PW 1
43194+#else
43195+#define PW 2
43196+#endif
43197+
43198+
43199+#if DUMP_PRED && !defined(INCLUDED_ONCE)
43200+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
43201+{
43202+    for (unsigned int y = 0; y != size; y++, data += stride * 2) {
43203+        for (unsigned int x = 0; x != size; x++) {
43204+            printf("%4d", data[x * 2]);
43205+        }
43206+        printf("\n");
43207+    }
43208+    printf("\n");
43209+}
43210+#endif
43211+
43212+#ifndef INCLUDED_ONCE
43213+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n)
43214+{
43215+    if ((n >>= 2) != 0) {
43216+        uint32_t v4 = v | (v << 8);
43217+        uint32_t * p = (uint32_t *)ptr;
43218+        v4 = v4 | (v4 << 16);
43219+        do {
43220+            *p++ = v4;
43221+        } while (--n != 0);
43222+    }
43223+}
43224+
43225+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n)
43226+{
43227+    if ((n >>= 2) != 0) {
43228+        uint32_t v2 = v | (v << 16);
43229+        uint32_t * p = (uint32_t *)ptr;
43230+        do {
43231+            *p++ = v2;
43232+            *p++ = v2;
43233+        } while (--n != 0);
43234+    }
43235+}
43236+
43237+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n)
43238+{
43239+    if ((n >>= 2) != 0) {
43240+        uint32_t * p = (uint32_t *)ptr;
43241+        do {
43242+            *p++ = v;
43243+            *p++ = v;
43244+            *p++ = v;
43245+            *p++ = v;
43246+        } while (--n != 0);
43247+    }
43248+}
43249+
43250+// Beware that this inverts the avail ordering
43251+// For CIP it seems easier this way round
43252+static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask,
43253+                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
43254+                              unsigned int s0, unsigned int odd_s)
43255+{
43256+    const unsigned int n = 1 << log2_intra_bits;
43257+    unsigned int fa = 0;
43258+    unsigned int i;
43259+
43260+    size >>= 2;   // Now in 4-pel units
43261+    s0 >>= 2;
43262+
43263+    if ((avail & AVAIL_DL) != 0)
43264+        fa |= ((1 << s0) - 1) << (size - s0);
43265+    if ((avail & AVAIL_L) != 0)
43266+        fa |= ((1 << size) - 1) << size;
43267+    if ((avail & AVAIL_UL) != 0)
43268+        fa |= 1 << (size << 1);
43269+
43270+    if (odd_s) {
43271+        if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
43272+            fa &= ~1;
43273+        is_intra += i_stride;
43274+    }
43275+
43276+    for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) {
43277+        const unsigned int m = ((1 << n) - 1) << i;
43278+        if ((fa & m) != 0 && (*is_intra & i_mask) == 0)
43279+            fa &= ~m;
43280+    }
43281+
43282+    return fa;
43283+}
43284+
43285+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift,
43286+                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
43287+                                unsigned int s1, unsigned int odd_s)
43288+{
43289+    if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
43290+    {
43291+        return 0;
43292+    }
43293+    else
43294+    {
43295+        const unsigned int n = 1 << log2_intra_bits;
43296+        unsigned int fa = 0;
43297+        unsigned int i;
43298+        unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift;
43299+
43300+        size >>= 2;   // Now in 4-pel units
43301+        s1 >>= 2;
43302+
43303+        if ((avail & AVAIL_U) != 0)
43304+            fa |= ((1 << size) - 1);
43305+        if ((avail & AVAIL_UR) != 0)
43306+            fa |= ((1 << s1) - 1) << size;
43307+
43308+        if (odd_s) {
43309+            fa &= im | ~1;
43310+            im >>= 1;
43311+        }
43312+
43313+        for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) {
43314+            const unsigned int m = ((1 << n) - 1) << i;
43315+            if ((im & 1) == 0)
43316+                fa &= ~m;
43317+        }
43318+        return fa;
43319+    }
43320+}
43321+
43322+
43323+
43324+static inline unsigned int rmbd(unsigned int x)
43325+{
43326+#if 1
43327+    return __builtin_ctz(x);
43328+#else
43329+    unsigned int n = 0;
43330+    if ((x & 0xffff) == 0) {
43331+        x >>= 16;
43332+        n += 16;
43333+    }
43334+    if ((x & 0xff) == 0) {
43335+        x >>= 8;
43336+        n += 8;
43337+    }
43338+    if ((x & 0xf) == 0) {
43339+        x >>= 4;
43340+        n += 4;
43341+    }
43342+    if ((x & 0x3) == 0) {
43343+        x >>= 2;
43344+        n += 2;
43345+    }
43346+
43347+    return (x & 1) == 0 ? n + 1 : n;
43348+#endif
43349+}
43350+#endif
43351+
43352+
43353+static void FUNC(cip_fill)(pixel * const left, pixel * const top,
43354+    const unsigned int avail_l, const unsigned int avail_u,
43355+    const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
43356+    const unsigned int stride,
43357+    const unsigned int size)
43358+{
43359+    pixel a;
43360+    unsigned int i;
43361+
43362+    // 1st find DL value
43363+    if ((avail_l & 1) == 0) {
43364+        if (avail_l != 0)
43365+            a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride];
43366+        else
43367+        {
43368+            // (avail_l | avail_u) != 0 so this must be good
43369+            const unsigned int n = rmbd(avail_u)*4;
43370+            a = (n >= size) ? src_ur[n - size] : src_u[n];
43371+        }
43372+    }
43373+
43374+    // L
43375+    {
43376+        pixel * d = left + size * 2 - 1;
43377+        const pixel * s = src_l + (size * 2 - 1) * stride;
43378+        unsigned int x = avail_l;
43379+        for (i = 0; i < size * 2; i += 4, x >>= 1)
43380+        {
43381+            if ((x & 1) != 0) {
43382+                // Avail
43383+                *d-- = *s;
43384+                s -= stride;
43385+                *d-- = *s;
43386+                s -= stride;
43387+                *d-- = *s;
43388+                s -= stride;
43389+                *d-- = a = *s;
43390+                s -= stride;
43391+            }
43392+            else
43393+            {
43394+                *d-- = a;
43395+                *d-- = a;
43396+                *d-- = a;
43397+                *d-- = a;
43398+                s -= stride * 4;
43399+            }
43400+        }
43401+        // UL
43402+        *d = a = (x & 1) != 0 ? *s : a;
43403+    }
43404+
43405+    // U
43406+    {
43407+        pixel * d = top;
43408+        const pixel * s = src_u;
43409+        unsigned int x = avail_u;
43410+
43411+        for (i = 0; i < size; i += 4, x >>= 1)
43412+        {
43413+            if ((x & 1) != 0) {
43414+                // Avail
43415+                *d++ = *s++;
43416+                *d++ = *s++;
43417+                *d++ = *s++;
43418+                *d++ = a = *s++;
43419+            }
43420+            else
43421+            {
43422+                *d++ = a;
43423+                *d++ = a;
43424+                *d++ = a;
43425+                *d++ = a;
43426+                s += 4;
43427+            }
43428+        }
43429+
43430+        // UR
43431+        s = src_ur;
43432+        for (i = 0; i < size; i += 4, x >>= 1)
43433+        {
43434+            if ((x & 1) != 0) {
43435+                // Avail
43436+                *d++ = *s++;
43437+                *d++ = *s++;
43438+                *d++ = *s++;
43439+                *d++ = a = *s++;
43440+            }
43441+            else
43442+            {
43443+                *d++ = a;
43444+                *d++ = a;
43445+                *d++ = a;
43446+                *d++ = a;
43447+                s += 4;
43448+            }
43449+        }
43450+    }
43451+}
43452+
43453+
43454+#if !PRED_C && PW == 1
43455+#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
43456+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1)
43457+#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
43458+#else
43459+#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
43460+#endif
43461+
43462+// Reqs:
43463+//
43464+// Planar:  DL[0], L, ul, U, UR[0]
43465+// DC:         dl, L, ul, U, ur
43466+// A2-9:       DL, L, ul, u, ur
43467+// A10:        dl, L, ul, u, ur
43468+// A11-17      dl, L, UL, U, ur
43469+// A18-25      dl, L, Ul, U, ur
43470+// A26         dl, l, ul, U, ur
43471+// A27-34      dl, l, ul, U, UR
43472+
43473+#ifndef INCLUDED_ONCE
43474+
43475+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
43476+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
43477+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
43478+
// Per-mode neighbour requirements for chroma intra prediction.
// Indexed by intra mode (0..34); the set bits are the AVAIL_* neighbour
// sections the mode reads.  No FILTER_* bits here - the reference smoothing
// path is skipped for chroma (see the PRED_C test in intra_filter).
static const uint8_t req_avail_c[35] =
{
    AVAIL_DL | AVAIL_L | 0         |  AVAIL_U | AVAIL_UR,  // Planar (DL[0] & UR[0] only needed)
               AVAIL_L | 0         |  AVAIL_U,             // DC
    AVAIL_DL | AVAIL_L,                                    // 2
    AVAIL_DL | AVAIL_L,                                    // 3
    AVAIL_DL | AVAIL_L,                                    // 4
    AVAIL_DL | AVAIL_L,                                    // 5
    AVAIL_DL | AVAIL_L,                                    // 6
    AVAIL_DL | AVAIL_L,                                    // 7
    AVAIL_DL | AVAIL_L,                                    // 8
    AVAIL_DL | AVAIL_L,                                    // 9
               AVAIL_L,                                    // 10 (H)
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 11
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 12
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 13
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 14
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 15
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 16
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 17
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 18
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 19
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 20
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 21
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 22
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 23
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 24
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 25
                                    AVAIL_U,               // 26 (V)
                                    AVAIL_U | AVAIL_UR,    // 27
                                    AVAIL_U | AVAIL_UR,    // 28
                                    AVAIL_U | AVAIL_UR,    // 29
                                    AVAIL_U | AVAIL_UR,    // 30
                                    AVAIL_U | AVAIL_UR,    // 31
                                    AVAIL_U | AVAIL_UR,    // 32
                                    AVAIL_U | AVAIL_UR,    // 33
                                    AVAIL_U | AVAIL_UR     // 34
};
43517+
// Per-mode neighbour requirements + filter request flags for luma intra
// prediction.  Indexed [log2_size - 2][mode]: one sub-table per transform
// size (4, 8, 16, 32).  The 4x4 table carries no FILTER_* bits (4x4 is
// never smoothed); larger sizes add FILTER_LIGHT, and 32x32 uses
// FILTER_EITHER so intra_filter can pick the strong filter when the edge
// smoothness test passes.
static const uint8_t req_avail[4][35] = {
{
    AVAIL_DL | AVAIL_L | 0         |  AVAIL_U | AVAIL_UR,  // Planar (DL[0] & UR[0] only needed)
               AVAIL_L | 0         |  AVAIL_U,             // DC
    AVAIL_DL | AVAIL_L,                                    // 2
    AVAIL_DL | AVAIL_L,                                    // 3
    AVAIL_DL | AVAIL_L,                                    // 4
    AVAIL_DL | AVAIL_L,                                    // 5
    AVAIL_DL | AVAIL_L,                                    // 6
    AVAIL_DL | AVAIL_L,                                    // 7
    AVAIL_DL | AVAIL_L,                                    // 8
    AVAIL_DL | AVAIL_L,                                    // 9
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 10 (H)
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 11
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 12
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 13
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 14
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 15
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 16
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 17
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 18
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 19
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 20
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 21
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 22
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 23
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 24
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 25
               AVAIL_L | AVAIL_UL | AVAIL_U,               // 26 (V)
                                    AVAIL_U | AVAIL_UR,    // 27
                                    AVAIL_U | AVAIL_UR,    // 28
                                    AVAIL_U | AVAIL_UR,    // 29
                                    AVAIL_U | AVAIL_UR,    // 30
                                    AVAIL_U | AVAIL_UR,    // 31
                                    AVAIL_U | AVAIL_UR,    // 32
                                    AVAIL_U | AVAIL_UR,    // 33
                                    AVAIL_U | AVAIL_UR     // 34
},
{  // 3
    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // Planar (DL[0] & UR[0] only needed)
               AVAIL_L | 0        | AVAIL_U,                            // DC
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 2
    AVAIL_DL | AVAIL_L                                 | 0,             // 3
    AVAIL_DL | AVAIL_L                                 | 0,             // 4
    AVAIL_DL | AVAIL_L                                 | 0,             // 5
    AVAIL_DL | AVAIL_L                                 | 0,             // 6
    AVAIL_DL | AVAIL_L                                 | 0,             // 7
    AVAIL_DL | AVAIL_L                                 | 0,             // 8
    AVAIL_DL | AVAIL_L                                 | 0,             // 9
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 10 (H)
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 11
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 12
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 13
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 14
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 15
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 16
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 17
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 18
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 19
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 20
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 21
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 22
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 23
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 24
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 25
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 26 (V)
                                    AVAIL_U | AVAIL_UR | 0,             // 27
                                    AVAIL_U | AVAIL_UR | 0,             // 28
                                    AVAIL_U | AVAIL_UR | 0,             // 29
                                    AVAIL_U | AVAIL_UR | 0,             // 30
                                    AVAIL_U | AVAIL_UR | 0,             // 31
                                    AVAIL_U | AVAIL_UR | 0,             // 32
                                    AVAIL_U | AVAIL_UR | 0,             // 33
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT   // 34
},
{  // 4
    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // Planar (DL[0] & UR[0] only needed)
               AVAIL_L | 0        | AVAIL_U,                            // DC
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 2
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 3
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 4
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 5
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 6
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 7
    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 8
    AVAIL_DL | AVAIL_L                                 | 0,             // 9
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 10 (H)
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 11
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 12
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 13
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 14
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 15
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 16
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 17
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 18
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 19
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 20
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 21
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 22
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 23
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 24
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 25
               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 26 (V)
                                    AVAIL_U | AVAIL_UR | 0,             // 27
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 28
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 29
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 30
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 31
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 32
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 33
                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT   // 34
},
{  // 5
    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
               AVAIL_L | 0        | AVAIL_U,                            // DC
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 2
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 3
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 4
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 5
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 6
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 7
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 8
    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 9
               AVAIL_L                                 | 0,             // 10 (H)
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 11
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 12
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 13
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 14
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 15
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 16
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 17
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 18
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 19
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 20
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 21
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 22
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 23
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 24
               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 25
                                    AVAIL_U            | 0,             // 26 (V)
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
                                    AVAIL_U | AVAIL_UR | FILTER_EITHER  // 34
}
};
43668+
43669+
43670+#endif
43671+
43672+#define filter_light1 FUNC(filter_light1)
43673+static inline pixel filter_light1(pixel a, pixel b, pixel c)
43674+{
43675+    return (a + b*2 + c + 2) >> 2;
43676+}
43677+
43678+#define filter_light FUNC(filter_light)
43679+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
43680+{
43681+    pixel p0;
43682+    pixel p2 = *src;
43683+    // Allow for final pel - it is just clearer to to have the call take the actual number of output pels
43684+    unsigned int n_minus_1 = n - 1;
43685+
43686+    do
43687+    {
43688+        src += sstride;
43689+        p0 = p1;
43690+        p1 = p2;
43691+        p2 = *src;
43692+        *dst++ = filter_light1(p0, p1, p2);
43693+    } while (--n_minus_1 != 0);
43694+    *dst = filter_light1(p1, p2, pn);
43695+}
43696+
43697+#define filter_strong FUNC(filter_strong)
43698+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
43699+{
43700+    unsigned int a = 64 * p0 + 32;
43701+    const int v = p1 - p0;
43702+
43703+    do
43704+    {
43705+        *dst++ = (a += v) >> 6;
43706+    } while (--n != 0);
43707+}
43708+
#define intra_filter FUNC(intra_filter)
// Build the 'left' and 'top' intra reference arrays for one transform block:
// gather the available neighbour pels, substitute for the missing sections,
// and apply the light/strong reference smoothing when requested.
// left/top: output pel arrays (left[-1] receives the UL corner pel)
// req:      AVAIL_* sections this mode needs, plus FILTER_* request bits
// avail:    AVAIL_* sections actually present
// src_l/src_u/src_ur: sources for left column, top row, top-right row
// stride:   source stride in pels
// top_right_size/down_left_size: valid pel counts in the UR/DL sections
// log2_size: log2 block size (2..5); constant at each call site from the
//            INTRA_FILTER wrappers so the compiler can specialise
static av_always_inline void intra_filter(
    pixel * const left, pixel * const top,
    const unsigned int req, const unsigned int avail,
    const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
    const unsigned int stride,
    const unsigned int top_right_size, const unsigned int down_left_size,
    const unsigned int log2_size)
{
    const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
    const unsigned int size = 1 << log2_size;

    // a_ is the first pel in a section working round dl -> ur
    // b_ is the last
    // Beware that top & left work out from UL so usage of a_ & b_ may
    // swap between them.  It is a bad naming scheme but I have found no
    // better
    const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
    const pixel * b_dl = src_l + size * stride;
    const pixel * a_l  = src_l + (size - 1) * stride;
    const pixel * b_l  = src_l;
    const pixel * ab_ul = src_l - stride;
    const pixel * a_u = src_u;
    const pixel * b_u = src_u + size - 1;
    const pixel * a_ur = src_ur;
    const pixel * b_ur = src_ur + top_right_size - 1;

    const unsigned int want = req & ~avail;
    const unsigned int have = req & avail;
    unsigned int i;

    // Substitution for missing sections: point each absent section at the
    // nearest available pel.  DL prefers L, then UL, then U, then UR (the
    // if-chain below overwrites in reverse preference order); the later
    // sections then chain off the section preceding them.
    if ((avail & AVAIL_DL) == 0)
    {
        a_dl = a_ur;
        if ((avail & AVAIL_U) != 0)
            a_dl = a_u;
        if ((avail & AVAIL_UL) != 0)
            a_dl = ab_ul;
        if ((avail & AVAIL_L) != 0)
            a_dl = a_l;
        b_dl = a_dl;
    }

    if ((avail & AVAIL_L) == 0)
    {
        a_l = b_dl;
        b_l = b_dl;
    }
    if ((avail & AVAIL_UL) == 0)
    {
        ab_ul = b_l;
    }
    if ((avail & AVAIL_U) == 0)
    {
        a_u = ab_ul;
        b_u = ab_ul;
    }
    if ((avail & AVAIL_UR) == 0)
    {
        a_ur = b_u;
        b_ur = b_u;
    }

    // Path 1: no smoothing requested, chroma, or 4x4 - plain copy/extend.
    if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2)  // PRED_C, log2_size compiler opt hints
    {
        if ((req & AVAIL_UL) != 0)
            left[-1] = *ab_ul;

        // Missing ("want") sections are flat-filled from the substitute pel.
        if ((want & AVAIL_L) != 0)
            EXTEND(left, *a_l, size);
        if ((want & AVAIL_DL) != 0)
            EXTEND(left + size, *a_dl, size);
        if ((want & AVAIL_U) != 0)
            EXTEND(top, *a_u, size);
        if ((want & AVAIL_UR) != 0)
            EXTEND(top + size, *a_ur, size);

        if ((have & AVAIL_U) != 0)
            // Always good - even with sand
            memcpy(top, a_u, size * sizeof(pixel));
        if ((have & AVAIL_UR) != 0)
        {
            memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
            EXTEND(top + size + top_right_size, *b_ur,
                   size - top_right_size);
        }
        if ((have & AVAIL_L) != 0)
        {
            for (i = 0; i < size; i++)
                left[i] = b_l[stride * i];
        }
        if ((have & AVAIL_DL) != 0)
        {
            for (i = 0; i < down_left_size; i++)
                left[i + size] = b_dl[stride * i];
            EXTEND(left + size + down_left_size, *a_dl,
                   size - down_left_size);
        }
    }
    // Path 2: strong filter - 32x32 only, taken when both edges are smooth
    // enough; linearly interpolates each whole edge from its corner pels.
    else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
            FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
            FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
    {
        if ((req & (AVAIL_U | AVAIL_UR)) != 0)
            filter_strong(top, *ab_ul, *b_ur, size * 2);
        left[-1] = *ab_ul;
        if ((req & (AVAIL_L | AVAIL_DL)) != 0)
            filter_strong(left, *ab_ul, *a_dl, size*2);
    }
    // Path 3: light [1 2 1]/4 filter.
    else
    {
        // Same code for both have & want for UL
        if ((req & AVAIL_UL) != 0)
        {
            left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
        }

        if ((want & AVAIL_L) != 0)
        {
            EXTEND(left, *a_l, size);
            // Flat section still gets edge rounding against the UL pel.
            left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
        }
        if ((want & AVAIL_DL) != 0)
        {
            // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding
            EXTEND(left + size, *a_l, size);
        }
        if ((want & AVAIL_U) != 0)
        {
            EXTEND(top, *a_u, size);
            top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
        }
        if ((want & AVAIL_UR) != 0)
        {
            // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding
            EXTEND(top + size, *a_ur, size);
        }

        if ((have & AVAIL_U) != 0)
        {
            filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
        }
        if ((have & AVAIL_UR) != 0) {
            filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
            // Last pel of the edge is kept unfiltered.
            top[size*2 - 1] = *b_ur;
            EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
        }
        if ((have & AVAIL_L) != 0)
        {
            filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
        }
        if ((have & AVAIL_DL) != 0)
        {
            filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
            left[size*2 - 1] = *a_dl;
            EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
        }
    }
}
43868+
// Instantiate per-size uint8_t* wrappers around intra_filter() - these are
// the functions installed in the s->hpc.intra_filter[] pointer tables.
// (Comments cannot go inside the macro body: '//' would swallow the '\'.)
#define INTRA_FILTER(log2_size) \
static void FUNC(intra_filter_ ## log2_size)( \
     uint8_t * const left, uint8_t * const top, \
     const unsigned int req, const unsigned int avail, \
     const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
     const unsigned int stride, \
     const unsigned int top_right_size, const unsigned int down_left_size) \
{ \
    intra_filter((pixel *)left, (pixel *)top, req, avail, \
        (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
}

INTRA_FILTER(2)
INTRA_FILTER(3)
INTRA_FILTER(4)
INTRA_FILTER(5)

#undef intra_filter
#undef INTRA_FILTER
43888+
// Intra-predict one transform block at luma coords (x0, y0): build the
// left/top reference arrays (constrained-intra-pred path via cip_fill,
// otherwise via the hpc.intra_filter fns), then dispatch to the
// mode-specific prediction function in s->hpc.
static void FUNC(intra_pred)(const HEVCRpiContext * const s,
                                              const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
                                              const unsigned int log2_size)
{
    // c_idx will always be 1 for _c versions and 0 for y
    const unsigned int c_idx = PRED_C;
    const unsigned int hshift = ctx_hshift(s, c_idx);
    const unsigned int vshift = ctx_vshift(s, c_idx);
    const unsigned int size = (1 << log2_size);
    const unsigned int x = x0 >> hshift;
    const unsigned int y = y0 >> vshift;

    const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
    pixel *const src = c_idx == 0 ?
        (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
        (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);

    // Align so we can do multiple loads in the asm
    // Padded to 16 byte boundary so as not to confuse anything
    DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]);
    DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);

    // left[-1] (the UL corner pel) lives in the padding before left_array.
    pixel  * const left  = left_array  + 16 / sizeof(pixel);
    const pixel * top_pred = top;

    const pixel * src_l = src - 1;
    const pixel * src_u = src - stride;
    const pixel * src_ur = src_u + size;
#if !PRED_C
    const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable;
#else
    const unsigned int req = req_avail_c[mode];
#endif

    // If we have nothing to pred from then fill with grey
    // This isn't a common case but dealing with it here means we don't have to
    // test for it later
    if (avail == 0)
    {
dc_only:
#if !PRED_C
        s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
#else
        s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
#endif
        return;
    }

    // Sand frame layout: left / up-right pointers that fall on a stripe
    // boundary must be stepped to the adjacent stripe.
    {
        // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
        const AVFrame * const frame = s->frame;
        const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
        const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
        if ((x & mask) == 0)
            src_l -= stripe_adj;
        if (((x + size) & mask) == 0)
            src_ur += stripe_adj;
    }

    // Can deal with I-slices in 'normal' code even if CIP
    // This also means that we don't need to generate (elsewhere) is_intra
    // for IRAP frames
    if (s->ps.pps->constrained_intra_pred_flag == 1 &&
        s->sh.slice_type != HEVC_SLICE_I)
    {
        // * If we ever actually care about CIP performance then we should
        //   special case out size 4 stuff (can be done by 'normal') and
        //   have 8-pel avail masks
        unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
                                           -(int)(s->ps.sps->pcm_width),
                                           1 << (((x - 1) >> (3 - hshift)) & 7),
                                           1 - hshift,
                                           avail,
                                           size,
                                           FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
                                           vshift != 0 ? 0 : (y >> 2) & 1);

        unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
                                           (x >> (3 - hshift)) & 7,
                                           1 - hshift,
                                           avail,
                                           size,
                                           FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
                                           hshift != 0 ? 0 : (x >> 2) & 1);

        // Anything left?
        if ((avail_l | avail_u) == 0)
            goto dc_only;

        FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);

#if !PRED_C
        // Reference smoothing on the CIP-filled arrays.
        // NOTE(review): the 63/31/64 constants assume size == 32 here, i.e.
        // that FILTER_STRONG is only ever requested for log2_size 5 (as the
        // req_avail tables imply) - confirm before reusing elsewhere.
        if ((req & FILTER_LIGHT) != 0)
        {
            const unsigned threshold = 1 << (BIT_DEPTH - 5);
            if ((req & FILTER_STRONG) != 0 &&
                (int)(FFABS(left[-1]  + top[63] - 2 * top[31]))  < threshold &&
                (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
            {
                filter_strong(top, left[-1], top[63], 64);
                filter_strong(left, left[-1], left[63], 64);
            } else
            {
                // LHS writes UL too so copy for top
                const pixel p_ul = left[-1];
                filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
                filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
            }
        }
#endif
    }
    else
    {
        const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
        // If everything we need is available unfiltered and contiguous then
        // predict straight from the frame row instead of copying into 'top'.
        if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&
            ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
        {
            top_pred = src_u;
        }
        else
        {
#if !PRED_C
            s->hpc.intra_filter[log2_size - 2]
#else
            s->hpc.intra_filter_c[log2_size - 2]
#endif
                ((uint8_t *)left, (uint8_t *)top, req, avail,
                 (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
                              ur_size,
                              FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
        }
    }


    // Dispatch to the mode-specific predictor.
#if !PRED_C
    switch (mode) {
    case INTRA_PLANAR:
        s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                          (uint8_t *)left, stride);
        break;
    case INTRA_DC:
        s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                       (uint8_t *)left, stride);
        break;
    case INTRA_ANGULAR_HORIZONTAL:
        s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                           (uint8_t *)left, stride,
                                           mode);
        break;
    case INTRA_ANGULAR_VERTICAL:
        s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                           (uint8_t *)left, stride,
                                           mode);
        break;
    default:
        s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                           (uint8_t *)left, stride,
                                           mode);
        break;
    }
#else
    switch (mode) {
    case INTRA_PLANAR:
        s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                          (uint8_t *)left, stride);
        break;
    case INTRA_DC:
        s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                       (uint8_t *)left, stride);
        break;
    case INTRA_ANGULAR_HORIZONTAL:
        s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                           (uint8_t *)left, stride,
                                           mode);
        break;
    case INTRA_ANGULAR_VERTICAL:
        s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                           (uint8_t *)left, stride,
                                           mode);
        break;
    default:
        s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
                                           (uint8_t *)left, stride,
                                           mode);
        break;
    }

#if DUMP_PRED
    printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
    dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
    printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
    dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
#endif
#endif
}
44084+
#if !PRED_C
// Planar intra prediction: bilinear blend of left/top references with the
// far corner pels top[size] and left[size]; trafo_size is log2 block size.
static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
                                  const uint8_t *_left, ptrdiff_t stride,
                                  int trafo_size)
{
    int x, y;
    pixel *src        = (pixel *)_src;
    const pixel *top  = (const pixel *)_top;
    const pixel *left = (const pixel *)_left;
    int size = 1 << trafo_size;
    for (y = 0; y < size; y++)
        for (x = 0; x < size; x++)
            POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size]  +
                         (size - 1 - y) * top[x]  + (y + 1) * left[size] + size) >> (trafo_size + 1);
}
#else
// Chroma variant: same bilinear blend applied to both components of the
// interleaved pixel pairs (c_dst_ptr_t/c_src_ptr_t index whole pairs).
static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
                                  const uint8_t * _left, ptrdiff_t stride,
                                  int trafo_size)
{
    int x, y;
    int size = 1 << trafo_size;
    c_dst_ptr_t src = (c_dst_ptr_t)_src;
    const c_src_ptr_t top = (c_src_ptr_t)_top;
    const c_src_ptr_t left = (c_src_ptr_t)_left;

    for (y = 0; y < size; y++, src += stride)
    {
        for (x = 0; x < size; x++)
        {
            src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0]  +
                         (size - 1 - y) * top[x][0]  + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
            src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1]  +
                         (size - 1 - y) * top[x][1]  + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
        }
    }
}
#endif
44123+
// Instantiate pred_planar_0..3 (4x4 .. 32x32) wrappers for the hpc tables;
// size + 2 converts the table index to trafo_size (log2 of the block size).
#define PRED_PLANAR(size)\
static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
                                       const uint8_t *left, ptrdiff_t stride)   \
{                                                                               \
    FUNC(pred_planar)(src, top, left, stride, size + 2);                        \
}

PRED_PLANAR(0)
PRED_PLANAR(1)
PRED_PLANAR(2)
PRED_PLANAR(3)

#undef PRED_PLANAR
44137+
44138+#if !PRED_C
44139+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
44140+                          const uint8_t *_left,
44141+                          ptrdiff_t stride, int log2_size)
44142+{
44143+    int i, j, x, y;
44144+    int size          = (1 << log2_size);
44145+    pixel *src        = (pixel *)_src;
44146+    const pixel *top  = (const pixel *)_top;
44147+    const pixel *left = (const pixel *)_left;
44148+    int dc            = size;
44149+    pixel4 a;
44150+    for (i = 0; i < size; i++)
44151+        dc += left[i] + top[i];
44152+
44153+    dc >>= log2_size + 1;
44154+
44155+    a = PIXEL_SPLAT_X4(dc);
44156+
44157+    for (i = 0; i < size; i++)
44158+        for (j = 0; j < size; j+=4)
44159+            AV_WN4P(&POS(j, i), a);
44160+
44161+//    if (c_idx == 0 && size < 32)
44162+// As we now have separate fns for y & c - no need to test that
44163+    if (size < 32)
44164+    {
44165+        POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
44166+        for (x = 1; x < size; x++)
44167+            POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
44168+        for (y = 1; y < size; y++)
44169+            POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
44170+    }
44171+}
44172+#else
44173+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
44174+                          const uint8_t *_left,
44175+                          ptrdiff_t stride, int log2_size)
44176+{
44177+    unsigned int i, j;
44178+    const unsigned int size = (1 << log2_size);
44179+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
44180+    const c_src_ptr_t top = (c_src_ptr_t)_top;
44181+    const c_src_ptr_t left = (c_src_ptr_t)_left;
44182+    unsigned int dc0 = size;
44183+    unsigned int dc1 = size;
44184+
44185+    for (i = 0; i < size; i++)
44186+    {
44187+        dc0 += left[i][0] + top[i][0];
44188+        dc1 += left[i][1] + top[i][1];
44189+    }
44190+
44191+    dc0 >>= log2_size + 1;
44192+    dc1 >>= log2_size + 1;
44193+
44194+    for (i = 0; i < size; i++, src += stride)
44195+    {
44196+        for (j = 0; j < size; ++j)
44197+        {
44198+            src[j][0] = dc0;
44199+            src[j][1] = dc1;
44200+
44201+        }
44202+    }
44203+}
44204+#endif
44205+
44206+#define PRED_DC(size)\
44207+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top,        \
44208+                                       const uint8_t *left, ptrdiff_t stride)   \
44209+{                                                                               \
44210+    FUNC(pred_dc)(src, top, left, stride, size + 2);                        \
44211+}
44212+
44213+PRED_DC(0)
44214+PRED_DC(1)
44215+PRED_DC(2)
44216+PRED_DC(3)
44217+
44218+#undef PRED_DC
44219+
44220+
44221+
44222+
44223+#if !PRED_C
44224+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
44225+{
44226+    int i, j;
44227+    int size          = (1 << log2_size);
44228+    pixel *src        = (pixel *)_src;
44229+    pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
44230+
44231+    for (i = 0; i < size; i++)
44232+        for (j = 0; j < size; j+=4)
44233+            AV_WN4P(&POS(j, i), a);
44234+}
44235+#else
44236+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
44237+{
44238+    unsigned int i, j;
44239+    const unsigned int size = (1 << log2_size);
44240+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
44241+    const pixel a = (1 << (BIT_DEPTH - 1));
44242+
44243+    for (i = 0; i < size; i++, src += stride)
44244+    {
44245+        for (j = 0; j < size; ++j)
44246+        {
44247+            src[j][0] = a;
44248+            src[j][1] = a;
44249+        }
44250+    }
44251+}
44252+#endif
44253+
44254+#define PRED_DC0(size)\
44255+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride)   \
44256+{                                                                               \
44257+    FUNC(pred_dc0)(src, stride, size + 2);                        \
44258+}
44259+
44260+PRED_DC0(0)
44261+PRED_DC0(1)
44262+PRED_DC0(2)
44263+PRED_DC0(3)
44264+
44265+#undef PRED_DC0
44266+
44267+
44268+
44269+
44270+#ifndef ANGLE_CONSTS
44271+#define ANGLE_CONSTS
44272+static const int intra_pred_angle[] = {
44273+     32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
44274+    -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
44275+};
44276+static const int inv_angle[] = {
44277+    -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
44278+    -630, -910, -1638, -4096
44279+};
44280+#endif
44281+
44282+#if !PRED_C
44283+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
44284+                                                const uint8_t *_top,
44285+                                                const uint8_t *_left,
44286+                                                ptrdiff_t stride,
44287+                                                int mode, int size)
44288+{
44289+    int x, y;
44290+    pixel *src        = (pixel *)_src;
44291+    const pixel *top  = (const pixel *)_top;
44292+    const pixel *left = (const pixel *)_left;
44293+
44294+    int angle = intra_pred_angle[mode - 2];
44295+    pixel ref_array[3 * MAX_TB_SIZE + 4];
44296+    pixel *ref_tmp = ref_array + size;
44297+    const pixel *ref;
44298+    int last = (size * angle) >> 5;
44299+
44300+    if (mode >= 18) {
44301+        ref = top - 1;
44302+
44303+        if (angle < 0)
44304+        {
44305+            memcpy(ref_tmp + 1, top, size * PW);
44306+            ref_tmp[0] = left[-1];
44307+
44308+            for (x = last; x <= -1; x++)
44309+                ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
44310+            ref = ref_tmp;
44311+        }
44312+
44313+        for (y = 0; y < size; y++) {
44314+            int idx  = ((y + 1) * angle) >> 5;
44315+            int fact = ((y + 1) * angle) & 31;
44316+            if (fact) {
44317+                for (x = 0; x < size; x += 4) {
44318+                    POS(x    , y) = ((32 - fact) * ref[x + idx + 1] +
44319+                                           fact  * ref[x + idx + 2] + 16) >> 5;
44320+                    POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
44321+                                           fact  * ref[x + 1 + idx + 2] + 16) >> 5;
44322+                    POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
44323+                                           fact  * ref[x + 2 + idx + 2] + 16) >> 5;
44324+                    POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
44325+                                           fact  * ref[x + 3 + idx + 2] + 16) >> 5;
44326+                }
44327+            } else {
44328+                for (x = 0; x < size; x += 4)
44329+                    AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
44330+            }
44331+        }
44332+        if (mode == 26 && size < 32) {
44333+            for (y = 0; y < size; y++)
44334+                POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
44335+        }
44336+
44337+    } else {
44338+        ref = left - 1;
44339+        if (angle < 0 && last < -1) {
44340+            for (x = 0; x <= size; x += 4)
44341+                AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
44342+            // Inv angle <= -256 so top offset >= 0
44343+            for (x = last; x <= -1; x++)
44344+                ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
44345+            ref = ref_tmp;
44346+        }
44347+
44348+        for (x = 0; x < size; x++) {
44349+            int idx  = ((x + 1) * angle) >> 5;
44350+            int fact = ((x + 1) * angle) & 31;
44351+            if (fact) {
44352+                for (y = 0; y < size; y++) {
44353+                    POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
44354+                                       fact  * ref[y + idx + 2] + 16) >> 5;
44355+                }
44356+            } else {
44357+                for (y = 0; y < size; y++)
44358+                    POS(x, y) = ref[y + idx + 1];
44359+            }
44360+        }
44361+        if (mode == 10 && size < 32) {
44362+            for (x = 0; x < size; x += 4) {
44363+                POS(x,     0) = av_clip_pixel(left[0] + ((top[x    ] - left[-1]) >> 1));
44364+                POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1));
44365+                POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1));
44366+                POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1));
44367+            }
44368+        }
44369+    }
44370+}
44371+#else
44372+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
44373+                                                const uint8_t *_top,
44374+                                                const uint8_t *_left,
44375+                                                ptrdiff_t stride,
44376+                                                int mode, int size)
44377+{
44378+    int x, y;
44379+    c_dst_ptr_t src  = (c_dst_ptr_t)_src;
44380+    c_src_ptr_t top  = (c_src_ptr_t)_top;
44381+    c_src_ptr_t left = (c_src_ptr_t)_left;
44382+
44383+    const int angle = intra_pred_angle[mode - 2];
44384+    cpel ref_array[3 * MAX_TB_SIZE + 4][2];
44385+    c_dst_ptr_t ref_tmp = ref_array + size;
44386+    c_src_ptr_t ref;
44387+    const int last = (size * angle) >> 5;
44388+
44389+    if (mode >= 18) {
44390+        ref = top - 1;
44391+        if (angle < 0) {
44392+            memcpy(ref_tmp + 1, top, size * 2 * PW);
44393+            ref_tmp[0][0] = left[-1][0];
44394+            ref_tmp[0][1] = left[-1][1];
44395+            for (x = last; x <= -1; x++)
44396+            {
44397+                ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
44398+                ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
44399+            }
44400+            ref = (c_src_ptr_t)ref_tmp;
44401+        }
44402+
44403+        for (y = 0; y < size; y++, src += stride) {
44404+            const int idx  = ((y + 1) * angle) >> 5;
44405+            const int fact = ((y + 1) * angle) & 31;
44406+            if (fact) {
44407+                for (x = 0; x < size; ++x) {
44408+                    src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
44409+                                       fact  * ref[x + idx + 2][0] + 16) >> 5;
44410+                    src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
44411+                                       fact  * ref[x + idx + 2][1] + 16) >> 5;
44412+                }
44413+            } else {
44414+                memcpy(src, ref + idx + 1, size * 2 * PW);
44415+            }
44416+        }
44417+    } else {
44418+        ref = left - 1;
44419+        if (angle < 0 && last < -1) {
44420+            memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
44421+            for (x = last; x <= -1; x++)
44422+            {
44423+                ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
44424+                ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
44425+            }
44426+            ref = (c_src_ptr_t)ref_tmp;
44427+        }
44428+
44429+        for (x = 0; x < size; x++, src++) {
44430+            const int idx  = ((x + 1) * angle) >> 5;
44431+            const int fact = ((x + 1) * angle) & 31;
44432+            if (fact) {
44433+                for (y = 0; y < size; y++) {
44434+                    src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
44435+                                       fact  * ref[y + idx + 2][0] + 16) >> 5;
44436+                    src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
44437+                                       fact  * ref[y + idx + 2][1] + 16) >> 5;
44438+                }
44439+            } else {
44440+                for (y = 0; y < size; y++)
44441+                {
44442+                    src[y * stride][0] = ref[y + idx + 1][0];
44443+                    src[y * stride][1] = ref[y + idx + 1][1];
44444+                }
44445+            }
44446+        }
44447+    }
44448+}
44449+#endif
44450+
44451+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
44452+                                 const uint8_t *left,
44453+                                 ptrdiff_t stride, int mode)
44454+{
44455+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
44456+}
44457+
44458+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
44459+                                 const uint8_t *left,
44460+                                 ptrdiff_t stride, int mode)
44461+{
44462+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
44463+}
44464+
44465+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
44466+                                 const uint8_t *left,
44467+                                 ptrdiff_t stride, int mode)
44468+{
44469+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
44470+}
44471+
44472+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
44473+                                 const uint8_t *left,
44474+                                 ptrdiff_t stride, int mode)
44475+{
44476+    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
44477+}
44478+
44479+#undef cpel
44480+#undef c_src_ptr_t
44481+#undef c_dst_ptr_t
44482+
44483+#undef EXTEND
44484+#undef POS
44485+#undef PW
44486+
44487+#undef filter_light1
44488+#undef filter_light
44489+#undef filter_strong
44490+#undef ref_gen
44491+
44492+#ifndef INCLUDED_ONCE
44493+#define INCLUDED_ONCE
44494+#endif
44495+
44496--- /dev/null
44497+++ b/libavcodec/rpi_mailbox.c
44498@@ -0,0 +1,155 @@
44499+/*
44500+Copyright (c) 2012, Broadcom Europe Ltd.
44501+All rights reserved.
44502+
44503+Redistribution and use in source and binary forms, with or without
44504+modification, are permitted provided that the following conditions are met:
44505+    * Redistributions of source code must retain the above copyright
44506+      notice, this list of conditions and the following disclaimer.
44507+    * Redistributions in binary form must reproduce the above copyright
44508+      notice, this list of conditions and the following disclaimer in the
44509+      documentation and/or other materials provided with the distribution.
44510+    * Neither the name of the copyright holder nor the
44511+      names of its contributors may be used to endorse or promote products
44512+      derived from this software without specific prior written permission.
44513+
44514+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
44515+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
44516+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
44517+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
44518+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
44519+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44520+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44521+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44522+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
44523+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44524+*/
44525+
44526+#include <stdio.h>
44527+#include <string.h>
44528+#include <stdlib.h>
44529+#include <fcntl.h>
44530+#include <unistd.h>
44531+#include <assert.h>
44532+#include <stdint.h>
44533+#include <sys/ioctl.h>
44534+
44535+#include <linux/ioctl.h>
44536+
44537+#define MAJOR_NUM 100
44538+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
44539+#define DEVICE_FILE_NAME "/dev/vcio"
44540+
44541+#include "rpi_mailbox.h"
44542+//#include <interface/vctypes/vc_image_structs.h>
44543+
44544+/*
44545+ * use ioctl to send mbox property message
44546+ */
44547+
44548+static int mbox_property(int file_desc, void *buf)
44549+{
44550+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
44551+
44552+   if (ret_val < 0) {
44553+      printf("ioctl_set_msg failed:%d\n", ret_val);
44554+   }
44555+
44556+#ifdef DEBUG
44557+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
44558+   for (i=0; i<size/4; i++)
44559+      printf("%04zx: 0x%08x\n", i*sizeof *p, p[i]);
44560+#endif
44561+   return ret_val;
44562+}
44563+
44564+#define GET_VCIMAGE_PARAMS 0x30044
44565+
44566+int mbox_get_image_params(int fd, VC_IMAGE_T * img)
44567+{
44568+    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
44569+    uint32_t * p = buf;
44570+    void * rimg;
44571+    int rv;
44572+
44573+    *p++ = 0; // size
44574+    *p++ = 0; // process request
44575+    *p++ = GET_VCIMAGE_PARAMS;
44576+    *p++ = sizeof(*img);
44577+    *p++ = sizeof(*img);
44578+    rimg = p;
44579+    memcpy(p, img, sizeof(*img));
44580+    p += sizeof(*img) / sizeof(*p);
44581+    *p++ = 0;  // End tag
44582+    buf[0] = (p - buf) * sizeof(*p);
44583+
44584+    rv = mbox_property(fd, buf);
44585+    memcpy(img, rimg, sizeof(*img));
44586+
44587+    return rv;
44588+}
44589+
44590+
44591+#define SET_CLOCK_RATE 0x00038002
44592+#define GET_MAX_CLOCK 0x00030004
44593+#define CLOCK_HEVC 11
44594+
44595+static int mbox_property_generic(int fd, unsigned command, unsigned *word0, unsigned *word1)
44596+{
44597+    uint32_t buf[32];
44598+    uint32_t * p = buf;
44599+    int rv;
44600+
44601+    *p++ = 0; // size
44602+    *p++ = 0; // process request
44603+    *p++ = command;
44604+    *p++ = 8;
44605+    *p++ = 8;
44606+    *p++ = *word0;
44607+    *p++ = *word1;
44608+    *p++ = 0;  // End tag
44609+    buf[0] = (p - buf) * sizeof(*p);
44610+
44611+    rv = mbox_property(fd, buf);
44612+    *word0 = buf[6];
44613+    *word1 = buf[7];
44614+    return rv;
44615+}
44616+
44617+int mbox_open(void) {
44618+   int file_desc;
44619+
44620+   // open a char device file used for communicating with kernel mbox driver
44621+   file_desc = open(DEVICE_FILE_NAME, 0);
44622+   if (file_desc < 0) {
44623+      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
44624+      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
44625+   }
44626+   return file_desc;
44627+}
44628+
44629+void mbox_close(int file_desc) {
44630+  close(file_desc);
44631+}
44632+
44633+int mbox_request_clock(int fd) {
44634+   int rv;
44635+   unsigned word0, word1 = 0;
44636+   word0 = CLOCK_HEVC;
44637+   rv = mbox_property_generic(fd, GET_MAX_CLOCK, &word0, &word1);
44638+   if (rv != 0)
44639+      return rv;
44640+   word1 = word0;
44641+   word0 = CLOCK_HEVC;
44642+   rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
44643+   return rv;
44644+}
44645+
44646+int mbox_release_clock(int fd) {
44647+  int rv;
44648+  unsigned word0, word1 = 0;
44649+  word0 = CLOCK_HEVC;
44650+  word1 = 0;
44651+  rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
44652+  return rv;
44653+}
44654--- /dev/null
44655+++ b/libavcodec/rpi_mailbox.h
44656@@ -0,0 +1,58 @@
44657+#ifndef RPI_MAILBOX_H
44658+#define RPI_MAILBOX_H
44659+
44660+/* The image structure. */
44661+typedef struct vc_image_extra_uv_s {
44662+  void *u, *v;
44663+  int vpitch;
44664+} VC_IMAGE_EXTRA_UV_T;
44665+
44666+typedef union {
44667+    VC_IMAGE_EXTRA_UV_T uv;
44668+//  VC_IMAGE_EXTRA_RGBA_T rgba;
44669+//  VC_IMAGE_EXTRA_PAL_T pal;
44670+//  VC_IMAGE_EXTRA_TF_T tf;
44671+//  VC_IMAGE_EXTRA_BAYER_T bayer;
44672+//  VC_IMAGE_EXTRA_MSBAYER_T msbayer;
44673+//  VC_IMAGE_EXTRA_CODEC_T codec;
44674+//  VC_IMAGE_EXTRA_OPENGL_T opengl;
44675+} VC_IMAGE_EXTRA_T;
44676+
44677+
44678+typedef struct VC_IMAGE_T {
44679+  unsigned short                  type;           /* should restrict to 16 bits */
44680+  unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
44681+  unsigned short                  width;          /* width in pixels */
44682+  unsigned short                  height;         /* height in pixels */
44683+  int                             pitch;          /* pitch of image_data array in bytes */
44684+  int                             size;           /* number of bytes available in image_data array */
44685+  void                           *image_data;     /* pixel data */
44686+  VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
44687+  void                           *metadata;       /* metadata header for the image */
44688+  void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
44689+  int                             mem_handle;     /* the mem handle for relocatable memory storage */
44690+  int                             metadata_size;  /* size of metadata of each channel in bytes */
44691+  int                             channel_offset; /* offset of consecutive channels in bytes */
44692+  uint32_t                        video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
44693+  uint8_t                         num_channels;   /* number of channels (2 for stereo) */
44694+  uint8_t                         current_channel;/* the channel this header is currently pointing to */
44695+  uint8_t                         linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
44696+  uint8_t                         is_channel_linked;     /* Track if the above structure is been used to link the header
44697+                                                            into a linked-multichannel image */
44698+  uint8_t                         channel_index;         /* index of the channel this header represents while
44699+                                                            it is being linked. */
44700+  uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
44701+} VC_IMAGE_T;
44702+
44703+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
44704+
44705+
44706+extern int mbox_open(void);
44707+extern void mbox_close(int file_desc);
44708+
44709+int mbox_get_image_params(int fd, VC_IMAGE_T * img);
44710+
44711+int mbox_request_clock(int fd);
44712+int mbox_release_clock(int fd);
44713+
44714+#endif
44715--- /dev/null
44716+++ b/libavcodec/rpi_mem.c
44717@@ -0,0 +1,326 @@
44718+/*
44719+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
44720+All rights reserved.
44721+
44722+Redistribution and use in source and binary forms, with or without
44723+modification, are permitted provided that the following conditions are met:
44724+    * Redistributions of source code must retain the above copyright
44725+      notice, this list of conditions and the following disclaimer.
44726+    * Redistributions in binary form must reproduce the above copyright
44727+      notice, this list of conditions and the following disclaimer in the
44728+      documentation and/or other materials provided with the distribution.
44729+    * Neither the name of the copyright holder nor the
44730+      names of its contributors may be used to endorse or promote products
44731+      derived from this software without specific prior written permission.
44732+
44733+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
44734+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
44735+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
44736+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
44737+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
44738+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44739+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44740+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44741+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
44742+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44743+
44744+Authors: John Cox
44745+*/
44746+
44747+
44748+#include <stdlib.h>
44749+#include <string.h>
44750+#include <stddef.h>
44751+#include <stdint.h>
44752+
44753+#include "config.h"
44754+
44755+#include "libavutil/avassert.h"
44756+#include "libavutil/rpi_sand_fns.h"
44757+
44758+#pragma GCC diagnostic push
44759+// Many many redundant decls in the header files
44760+#pragma GCC diagnostic ignored "-Wredundant-decls"
44761+#include <bcm_host.h>
44762+#include <interface/vctypes/vc_image_types.h>
44763+#include <interface/vcsm/user-vcsm.h>
44764+#pragma GCC diagnostic pop
44765+
44766+#include "rpi_mem.h"
44767+#include "rpi_zc_frames.h"
44768+
44769+
44770+#define OPT_PREFER_CMA 0
44771+
44772+struct rpi_cache_flush_env_s {
44773+  struct vcsm_user_clean_invalid2_s v;
44774+};
44775+
44776+
44777+// GPU memory alloc fns (internal)
44778+
44779+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
44780+{
44781+    if (p->arm != NULL)
44782+        vcsm_unlock_ptr(p->arm);
44783+    if (p->vcsm_handle != 0)
44784+        vcsm_free(p->vcsm_handle);
44785+    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
44786+}
44787+
44788+
44789+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
44790+    const int numbytes, const unsigned int cache_type, const char * const name)
44791+{
44792+    memset(p, 0, sizeof(*p));
44793+    p->numbytes = (numbytes + 255) & ~255;  // Round up
44794+
44795+    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0)
44796+    {
44797+        av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name);
44798+        goto fail;
44799+    }
44800+    if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0)
44801+    {
44802+        av_log(NULL, AV_LOG_ERROR, "Unable to get VC handle from VCSM for %s\n", name);
44803+        goto fail;
44804+    }
44805+    if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL)
44806+    {
44807+        av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name);
44808+        goto fail;
44809+    }
44810+    if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
44811+    {
44812+        av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name);
44813+        goto fail;
44814+    }
44815+
44816+    return 0;
44817+
44818+fail:
44819+    gpu_free_internal(p);
44820+    return AVERROR(ENOMEM);
44821+}
44822+
44823+// Public gpu fns
44824+
44825+// Allocate memory on GPU
44826+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
44827+// Returns 0 on success.
44828+// This allocates memory that will not be cached in ARM's data cache.
44829+// Therefore safe to use without data cache flushing.
44830+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
44831+{
44832+    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
44833+}
44834+
44835+// This allocates data that will be
44836+//    Cached in ARM L2
44837+//    Uncached in VPU L2
44838+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
44839+{
44840+    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
44841+}
44842+
44843+void gpu_free(GPU_MEM_PTR_T * const p) {
44844+    gpu_free_internal(p);
44845+}
44846+
44847+void rpi_mem_gpu_uninit(void)
44848+{
44849+    vcsm_exit();
44850+    bcm_host_deinit();
44851+}
44852+
44853+int rpi_mem_gpu_init(const unsigned int flags)
44854+{
44855+    const int wants_cma = bcm_host_is_fkms_active();
44856+    int use_cma;
44857+
44858+    (void)flags;
44859+
44860+    if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0)
44861+        use_cma = 1;
44862+    else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0)
44863+        use_cma = 0;
44864+    else
44865+        return AVERROR(EINVAL);
44866+
44867+    bcm_host_init();
44868+
44869+    return use_cma + 1;
44870+}
44871+
44872+// ----------------------------------------------------------------------------
44873+//
44874+// Cache flush functions
44875+
44876+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
44877+
44878+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
44879+{
44880+  rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
44881+  *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}};
44882+  return rfe;
44883+}
44884+
44885+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
44886+{
44887+  // Nothing needed
44888+}
44889+
44890+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
44891+{
44892+    int rc = 0;
44893+    if (rfe->v.op_count != 0) {
44894+        if (vcsm_clean_invalid2(&rfe->v) != 0)
44895+        {
44896+          const int err = errno;
44897+          av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err);
44898+          rc = AVERROR(err);
44899+        }
44900+        rfe->v.op_count = 0;
44901+    }
44902+    return rc;
44903+}
44904+
44905+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
44906+{
44907+  int rc = rpi_cache_flush_execute(rfe);
44908+
44909+  return rc;
44910+}
44911+
44912+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
44913+  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
44914+{
44915+  struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
44916+
44917+  av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
44918+
44919+  b->invalidate_mode = mode;
44920+  b->block_count = blocks;
44921+  b->start_address = gm->arm + offset0;
44922+  b->block_size = block_size;
44923+  b->inter_block_stride = block_stride;
44924+}
44925+
44926+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
44927+  const unsigned int offset, const unsigned int size)
44928+{
44929+  // Deal with empty pointer trivially
44930+  if (gm == NULL || size == 0)
44931+    return;
44932+
44933+  av_assert1(offset <= gm->numbytes);
44934+  av_assert1(size <= gm->numbytes);
44935+  av_assert1(offset + size <= gm->numbytes);
44936+
44937+  rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
44938+}
44939+
44940+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
44941+{
44942+  rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
44943+}
44944+
44945+
44946+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
44947+{
44948+#if !RPI_ONE_BUF
44949+#error Fixme! (NIF)
44950+#endif
44951+  if (gpu_is_buf1(frame)) {
44952+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
44953+  }
44954+  else
44955+  {
44956+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
44957+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
44958+    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
44959+  }
44960+}
44961+
44962+// Flush an area of a frame
44963+// Width, height, x0, y0 in luma pels
44964+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
44965+  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
44966+  const unsigned int uv_shift, const int do_luma, const int do_chroma)
44967+{
44968+  const unsigned int y_offset = frame->linesize[0] * y0;
44969+  const unsigned int y_size = frame->linesize[0] * height;
44970+  // Round UV up/down to get everything
44971+  const unsigned int uv_rnd = (1U << uv_shift) >> 1;
44972+  const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
44973+  const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
44974+
44975+#if 0
44976+  // *** frame->height is cropped height so not good
44977+  // As all unsigned they will also reject -ve
44978+  // Test individually as well as added to reject overflow
44979+  av_assert0(start_line <= (unsigned int)frame->height);  // ***** frame height cropped
44980+  av_assert0(n <= (unsigned int)frame->height);
44981+  av_assert0(start_line + n <= (unsigned int)frame->height);
44982+#endif
44983+
44984+  if (!gpu_is_buf1(frame))
44985+  {
44986+    if (do_luma) {
44987+      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
44988+    }
44989+    if (do_chroma) {
44990+      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
44991+      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
44992+    }
44993+  }
44994+  else if (!av_rpi_is_sand_frame(frame))
44995+  {
44996+    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
44997+    if (do_luma) {
44998+      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
44999+    }
45000+    if (do_chroma) {
45001+      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
45002+      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
45003+    }
45004+  }
45005+  else
45006+  {
45007+    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
45008+    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
45009+    const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
45010+    const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
45011+    const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1;  // Same for Y & C
45012+    av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
45013+
45014+    if (do_chroma)
45015+    {
45016+      struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
45017+      b->invalidate_mode = mode;
45018+      b->block_count = block_count;
45019+      b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
45020+      b->block_size = uv_size;
45021+      b->inter_block_stride = stride1 * stride2;
45022+    }
45023+    if (do_luma)
45024+    {
45025+      struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
45026+      b->invalidate_mode = mode;
45027+      b->block_count = block_count;
45028+      b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
45029+      b->block_size = y_size;
45030+      b->inter_block_stride = stride1 * stride2;
45031+    }
45032+  }
45033+}
45034+
45035+// Call this to clean and invalidate a region of memory
45036+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
45037+{
45038+  rpi_cache_buf_t cbuf;
45039+  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
45040+  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
45041+  rpi_cache_flush_finish(rfe);
45042+}
45043+
45044--- /dev/null
45045+++ b/libavcodec/rpi_mem.h
45046@@ -0,0 +1,88 @@
45047+/*
45048+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
45049+All rights reserved.
45050+
45051+Redistribution and use in source and binary forms, with or without
45052+modification, are permitted provided that the following conditions are met:
45053+    * Redistributions of source code must retain the above copyright
45054+      notice, this list of conditions and the following disclaimer.
45055+    * Redistributions in binary form must reproduce the above copyright
45056+      notice, this list of conditions and the following disclaimer in the
45057+      documentation and/or other materials provided with the distribution.
45058+    * Neither the name of the copyright holder nor the
45059+      names of its contributors may be used to endorse or promote products
45060+      derived from this software without specific prior written permission.
45061+
45062+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
45063+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45064+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45065+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
45066+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
45067+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45068+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45069+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
45070+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45071+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45072+
45073+Authors: John Cox, Ben Avison
45074+*/
45075+
45076+#ifndef RPI_MEM_H
45077+#define RPI_MEM_H
45078+
45079+typedef struct gpu_mem_ptr_s {
45080+  unsigned char *arm; // Pointer to memory mapped on ARM side
45081+  int vc_handle;   // Videocore handle of relocatable memory
45082+  int vcsm_handle; // Handle for use by VCSM
45083+  int vc;       // Address for use in GPU code
45084+  int numbytes; // Size of memory block
45085+} GPU_MEM_PTR_T;
45086+
45087+// General GPU functions
45088+
45089+#define GPU_INIT_GPU 1
45090+#define GPU_INIT_CMA 2
45091+
45092+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
45093+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
45094+extern void gpu_free(GPU_MEM_PTR_T * const p);
45095+int rpi_mem_gpu_init(const unsigned int flags);
45096+void rpi_mem_gpu_uninit(void);
45097+
45098+// Cache flush stuff
45099+
45100+struct rpi_cache_flush_env_s;
45101+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
45102+
45103+typedef struct {uint32_t t[33];} rpi_cache_buf_t;
45104+
45105+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
45106+// Free env without flushing
45107+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
45108+// Do the accumulated flush & clear but do not free the env
45109+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
45110+// Do the accumulated flush & free the env
45111+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
45112+
45113+typedef enum
45114+{
45115+    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
45116+    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
45117+    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3
45118+} rpi_cache_flush_mode_t;
45119+
45120+struct AVFrame;
45121+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
45122+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
45123+  const unsigned int offset, const unsigned int size);
45124+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
45125+  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
45126+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
45127+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
45128+  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
45129+  const unsigned int uv_shift, const int do_luma, const int do_chroma);
45130+
45131+// init, add, finish for one gm ptr
45132+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
45133+
45134+#endif
45135--- /dev/null
45136+++ b/libavcodec/rpi_qpu.c
45137@@ -0,0 +1,776 @@
45138+/*
45139+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
45140+All rights reserved.
45141+
45142+Redistribution and use in source and binary forms, with or without
45143+modification, are permitted provided that the following conditions are met:
45144+    * Redistributions of source code must retain the above copyright
45145+      notice, this list of conditions and the following disclaimer.
45146+    * Redistributions in binary form must reproduce the above copyright
45147+      notice, this list of conditions and the following disclaimer in the
45148+      documentation and/or other materials provided with the distribution.
45149+    * Neither the name of the copyright holder nor the
45150+      names of its contributors may be used to endorse or promote products
45151+      derived from this software without specific prior written permission.
45152+
45153+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
45154+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45155+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45156+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
45157+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
45158+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45159+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45160+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
45161+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45162+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45163+
45164+Authors: John Cox
45165+*/
45166+
45167+
45168+#include <stdio.h>
45169+#include <stdlib.h>
45170+#include <string.h>
45171+#include <stddef.h>
45172+#include <stdint.h>
45173+#include "libavutil/avassert.h"
45174+
45175+#include "config.h"
45176+
45177+#include <pthread.h>
45178+#include <time.h>
45179+
45180+#include <interface/vcsm/user-vcsm.h>
45181+
45182+#include "rpi_mailbox.h"
45183+#include "rpi_mem.h"
45184+#include "rpi_qpu.h"
45185+#include "rpi_hevc_shader.h"
45186+#include "rpi_hevc_transform8.h"
45187+#include "rpi_hevc_transform10.h"
45188+#include "libavutil/rpi_sand_fns.h"
45189+
45190+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
45191+#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
45192+
45193+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
45194+// Beware this is expensive and will probably throw off all other timing by >10%
45195+#define RPI_TRACE_QPU_PROFILE_ALL       0
45196+
45197+// QPU "noflush" flags
45198+// a mixture of flushing & profiling
45199+
45200+#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
45201+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
45202+#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
45203+#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
45204+#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
45205+
45206+#define vcos_verify_ge0(x) ((x)>=0)
45207+
45208+// Size in 32bit words
45209+#define QPU_CODE_SIZE 4098
45210+#define VPU_CODE_SIZE 16384
45211+
45212+static const short rpi_transMatrix2even[32][16] = { // Even rows first
45213+{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
45214+{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
45215+{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
45216+{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
45217+{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
45218+{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
45219+{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
45220+{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
45221+{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
45222+{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
45223+{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
45224+{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
45225+{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
45226+{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
45227+{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
45228+{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
45229+// Odd rows
45230+{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
45231+{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
45232+{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
45233+{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
45234+{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
45235+{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
45236+{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
45237+{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
45238+{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
45239+{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
45240+{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
45241+{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
45242+{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
45243+{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
45244+{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
45245+{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
45246+};
45247+
45248+// Code/constants on GPU
45249+struct GPU
45250+{
45251+//  unsigned int qpu_code[QPU_CODE_SIZE];
45252+    unsigned int vpu_code8[VPU_CODE_SIZE];
45253+    unsigned int vpu_code10[VPU_CODE_SIZE];
45254+    short transMatrix2even[16*16*2];
45255+};
45256+
45257+#define WAIT_COUNT_MAX 16
45258+
45259+typedef struct trace_time_one_s
45260+{
45261+    int count;
45262+    int64_t start[WAIT_COUNT_MAX];
45263+    int64_t total[WAIT_COUNT_MAX];
45264+} trace_time_one_t;
45265+
45266+typedef struct trace_time_wait_s
45267+{
45268+    unsigned int jcount;
45269+    int64_t start0;
45270+    int64_t last_update;
45271+    trace_time_one_t active;
45272+    trace_time_one_t wait;
45273+} trace_time_wait_t;
45274+
45275+typedef struct vq_wait_s
45276+{
45277+    sem_t sem;
45278+    struct vq_wait_s * next;
45279+} vq_wait_t;
45280+
45281+#define VQ_WAIT_POOL_SIZE 16
45282+typedef struct vq_wait_pool_s
45283+{
45284+    vq_wait_t * head;
45285+    vq_wait_t pool[VQ_WAIT_POOL_SIZE];
45286+} vq_wait_pool_t;
45287+
45288+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
45289+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
45290+
45291+typedef struct gpu_env_s
45292+{
45293+    int open_count;
45294+    int init_count;
45295+    int vpu_i_cache_flushed;
45296+    GPU_MEM_PTR_T qpu_code_gm_ptr;
45297+    GPU_MEM_PTR_T code_gm_ptr;
45298+    GPU_MEM_PTR_T dummy_gm_ptr;
45299+    vq_wait_pool_t wait_pool;
45300+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45301+    trace_time_wait_t ttw;
45302+#endif
45303+} gpu_env_t;
45304+
45305+// Stop more than one thread trying to allocate memory or use the processing resources at once
45306+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
45307+static gpu_env_t * gpu = NULL;
45308+
45309+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45310+
45311+static int64_t ns_time(void)
45312+{
45313+    struct timespec ts;
45314+    clock_gettime(CLOCK_MONOTONIC, &ts);
45315+    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
45316+}
45317+
45318+
45319+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
45320+
45321+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
45322+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
45323+#define T_ARG(t) T_SEC(t), T_MS(t)
45324+#define T_FMT "%u.%03u"
45325+
45326+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
45327+{
45328+    // Update totals for levels that are still pending
45329+    for (int i = 0; i < tto->count; ++i) {
45330+        tto->total[i] += now - tto->start[i];
45331+        tto->start[i] = now;
45332+    }
45333+
45334+    printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
45335+         prefix,
45336+         T_ARG(now - start0 - tto->total[0]),
45337+         T_ARG(tto->total[0]),
45338+         T_ARG(tto->total[1]),
45339+         T_ARG(tto->total[2]),
45340+         T_ARG(tto->total[3]));
45341+}
45342+
45343+
45344+static void tto_start(trace_time_one_t * const tto, const int64_t now)
45345+{
45346+    av_assert0(tto->count < WAIT_COUNT_MAX);
45347+    tto->start[tto->count++] = now;
45348+}
45349+
45350+static void tto_end(trace_time_one_t * const tto, const int64_t now)
45351+{
45352+    const int n = --tto->count;
45353+    av_assert0(n >= 0);
45354+    tto->total[n] += now - tto->start[n];
45355+}
45356+
45357+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
45358+{
45359+    printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
45360+    tto_print(&ttw->active, now, ttw->start0, "Active");
45361+    tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
45362+}
45363+
45364+#endif
45365+
45366+// GPU memory alloc fns (internal)
45367+
45368+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
45369+{
45370+    if (p->arm != NULL)
45371+        vcsm_unlock_ptr(p->arm);
45372+    if (p->vcsm_handle != 0)
45373+        vcsm_free(p->vcsm_handle);
45374+    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
45375+}
45376+
45377+
45378+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
45379+    const int numbytes, const unsigned int cache_type, const char * const name)
45380+{
45381+    memset(p, 0, sizeof(*p));
45382+    p->numbytes = (numbytes + 255) & ~255;  // Round up
45383+
45384+    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
45385+        (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
45386+        (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
45387+        (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
45388+    {
45389+        gpu_free_internal(p);
45390+        return AVERROR(ENOMEM);
45391+    }
45392+    return 0;
45393+}
45394+
45395+
45396+// GPU init, free, lock, unlock
45397+
45398+static void gpu_term(void)
45399+{
45400+    gpu_env_t * const ge = gpu;
45401+
45402+    // We have to hope that everything has terminated...
45403+    gpu = NULL;
45404+
45405+    vc_gpuserv_deinit();
45406+
45407+    gpu_free_internal(&ge->code_gm_ptr);
45408+    gpu_free_internal(&ge->qpu_code_gm_ptr);
45409+    gpu_free_internal(&ge->dummy_gm_ptr);
45410+
45411+    vcsm_exit();
45412+
45413+    vq_wait_pool_deinit(&ge->wait_pool);
45414+
45415+    free(ge);
45416+}
45417+
45418+
45419+// Connect to QPU, returns 0 on success.
45420+static int gpu_init(gpu_env_t ** const gpu) {
45421+    volatile struct GPU* ptr;
45422+    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
45423+    int rv;
45424+    *gpu = NULL;
45425+
45426+    if (ge == NULL)
45427+        return -1;
45428+
45429+    vq_wait_pool_init(&ge->wait_pool);
45430+
45431+    vcsm_init();
45432+
45433+    // Now copy over the QPU code into GPU memory
45434+    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
45435+      return rv;
45436+
45437+    {
45438+        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
45439+        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
45440+        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
45441+        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
45442+    }
45443+
45444+    // And the VPU code
45445+    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
45446+        return rv;
45447+    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
45448+
45449+    // Zero everything so we have zeros between the code bits
45450+    memset((void *)ptr, 0, sizeof(*ptr));
45451+    {
45452+        int num_bytes = sizeof(rpi_hevc_transform8);
45453+        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
45454+        memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
45455+    }
45456+    {
45457+        int num_bytes = sizeof(rpi_hevc_transform10);
45458+        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
45459+        memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
45460+    }
45461+    // And the transform coefficients
45462+    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
45463+
45464+    // Generate a dummy "frame" & fill with 0x80
45465+    // * Could reset to 1 <<bit_depth?
45466+    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
45467+        return rv;
45468+    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
45469+
45470+    *gpu = ge;
45471+    return 0;
45472+}
45473+
45474+
45475+
45476+static void gpu_unlock(void) {
45477+    pthread_mutex_unlock(&gpu_mutex);
45478+}
45479+
45480+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
45481+static gpu_env_t * gpu_lock(void) {
45482+    pthread_mutex_lock(&gpu_mutex);
45483+
45484+    av_assert1(gpu != NULL);
45485+    return gpu;
45486+}
45487+
45488+static gpu_env_t * gpu_lock_ref(void)
45489+{
45490+    pthread_mutex_lock(&gpu_mutex);
45491+
45492+    if (gpu == NULL) {
45493+        int rv = gpu_init(&gpu);
45494+        if (rv != 0) {
45495+            gpu_unlock();
45496+            return NULL;
45497+        }
45498+    }
45499+
45500+    ++gpu->open_count;
45501+    return gpu;
45502+}
45503+
45504+static void gpu_unlock_unref(gpu_env_t * const ge)
45505+{
45506+    if (--ge->open_count == 0)
45507+        gpu_term();
45508+
45509+    gpu_unlock();
45510+}
45511+
45512+static inline gpu_env_t * gpu_ptr(void)
45513+{
45514+    av_assert1(gpu != NULL);
45515+    return gpu;
45516+}
45517+
45518+unsigned int vpu_get_fn(const unsigned int bit_depth) {
45519+  uint32_t a = 0;
45520+
45521+  // Make sure that the gpu is initialized
45522+  av_assert1(gpu != NULL);
45523+  switch (bit_depth){
45524+    case 8:
45525+      a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
45526+      break;
45527+    case 10:
45528+      a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
45529+      break;
45530+    default:
45531+      av_assert0(0);
45532+  }
45533+  return a;
45534+}
45535+
45536+unsigned int vpu_get_constants(void) {
45537+  av_assert1(gpu != NULL);
45538+  return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
45539+}
45540+
45541+void gpu_ref(void)
45542+{
45543+  gpu_lock_ref();
45544+  gpu_unlock();
45545+}
45546+
45547+void gpu_unref(void)
45548+{
45549+  gpu_env_t * const ge = gpu_lock();
45550+  gpu_unlock_unref(ge);
45551+}
45552+
45553+// ----------------------------------------------------------------------------
45554+
45555+
45556+// Wait abstractions - mostly so we can easily add profile code
45557+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
45558+{
45559+  unsigned int i;
45560+  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
45561+    sem_init(&wp->pool[i].sem, 0, 0);
45562+    wp->pool[i].next = wp->pool + i + 1;
45563+  }
45564+  wp->head = wp->pool + 0;
45565+  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
45566+}
45567+
45568+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
45569+{
45570+  unsigned int i;
45571+  wp->head = NULL;
45572+  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
45573+    sem_destroy(&wp->pool[i].sem);
45574+    wp->pool[i].next = NULL;
45575+  }
45576+}
45577+
45578+
45579+// If sem_init actually takes time then maybe we want a pool...
45580+static vq_wait_t * vq_wait_new(void)
45581+{
45582+  gpu_env_t * const ge = gpu_lock_ref();
45583+  vq_wait_t * const wait = ge->wait_pool.head;
45584+  ge->wait_pool.head = wait->next;
45585+  wait->next = NULL;
45586+
45587+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45588+  tto_start(&ge->ttw.active, ns_time());
45589+#endif
45590+
45591+  gpu_unlock();
45592+  return wait;
45593+}
45594+
45595+static void vq_wait_delete(vq_wait_t * const wait)
45596+{
45597+  gpu_env_t * const ge = gpu_lock();
45598+  wait->next = ge->wait_pool.head;
45599+  ge->wait_pool.head = wait;
45600+
45601+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45602+  {
45603+    trace_time_wait_t * const ttw = &ge->ttw;
45604+    const int64_t now = ns_time();
45605+    ++ttw->jcount;
45606+    tto_end(&ttw->wait, now);
45607+
45608+    if (ttw->start0 == 0)
45609+    {
45610+      ttw->start0 = ttw->active.start[0];
45611+      ttw->last_update = ttw->start0;
45612+    }
45613+    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
45614+    {
45615+      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
45616+      ttw_print(ttw, now);
45617+    }
45618+  }
45619+#endif
45620+  gpu_unlock_unref(ge);
45621+}
45622+
45623+static void vq_wait_wait(vq_wait_t * const wait)
45624+{
45625+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45626+  {
45627+      const int64_t now = ns_time();
45628+      gpu_env_t * const ge = gpu_lock();
45629+      tto_start(&ge->ttw.wait, now);
45630+      gpu_unlock();
45631+  }
45632+#endif
45633+
45634+  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
45635+    /* loop */;
45636+}
45637+
45638+static void vq_wait_post(vq_wait_t * const wait)
45639+{
45640+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45641+  {
45642+    gpu_env_t *const ge = gpu_lock();
45643+    tto_end(&ge->ttw.active, ns_time());
45644+    gpu_unlock();
45645+  }
45646+#endif
45647+
45648+  sem_post(&wait->sem);
45649+}
45650+
45651+
45652+
45653+// Header comments were wrong for these two
45654+#define VPU_QPU_MASK_QPU  1
45655+#define VPU_QPU_MASK_VPU  2
45656+
45657+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
45658+
45659+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
45660+{
45661+//  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
45662+  vpu_qpu_job_env_t * vqj = buf;
45663+//  memset(vqj, 0, sizeof(*vqj));
45664+  vqj->n = 0;
45665+  vqj->mask = 0;
45666+  return vqj;
45667+}
45668+
45669+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
45670+{
45671+//  memset(vqj, 0, sizeof(*vqj));
45672+//  free(vqj);
45673+}
45674+
45675+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
45676+{
45677+  struct gpu_job_s * const j = vqj->j + vqj->n++;
45678+  av_assert1(vqj->n <= VPU_QPU_JOB_MAX);
45679+  return j;
45680+}
45681+
45682+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
45683+  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
45684+{
45685+  if (vpu_code != 0) {
45686+    struct gpu_job_s *const j = new_job(vqj);
45687+    vqj->mask |= VPU_QPU_MASK_VPU;
45688+
45689+    j->command = EXECUTE_VPU;
45690+    j->callback.func = 0;
45691+    j->callback.cookie = NULL;
45692+    // The bottom two bits of the execute address contain no-flush flags
45693+    // b0 will flush the VPU I-cache if unset so we nearly always want that set
45694+    // as we never reload code
45695+    j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
45696+    j->u.v.q[1] = r0;
45697+    j->u.v.q[2] = r1;
45698+    j->u.v.q[3] = r2;
45699+    j->u.v.q[4] = r3;
45700+    j->u.v.q[5] = r4;
45701+    j->u.v.q[6] = r5;
45702+    gpu->vpu_i_cache_flushed = 1;
45703+  }
45704+}
45705+
45706+// flags are QPU_FLAGS_xxx
45707+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
45708+{
45709+  if (n != 0) {
45710+    struct gpu_job_s *const j = new_job(vqj);
45711+    vqj->mask |= VPU_QPU_MASK_QPU;
45712+
45713+    j->command = EXECUTE_QPU;
45714+    j->callback.func = 0;
45715+    j->callback.cookie = NULL;
45716+
45717+    j->u.q.jobs = n;
45718+#if RPI_TRACE_QPU_PROFILE_ALL
45719+    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
45720+#else
45721+    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
45722+#endif
45723+    j->u.q.timeout = 5000;
45724+    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
45725+  }
45726+}
45727+
45728+// Convert callback to sem post
45729+static void vpu_qpu_job_callback_wait(void * v)
45730+{
45731+  vq_wait_post(v);
45732+}
45733+
45734+// Poke a user-supplied sem
45735+static void vpu_qpu_job_callback_sem(void * v)
45736+{
45737+  sem_post((sem_t *)v);
45738+}
45739+
45740+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
45741+{
45742+  vq_wait_t * wait;
45743+
45744+  if (vqj->mask == 0) {
45745+    *wait_h = NULL;
45746+    return;
45747+  }
45748+
45749+  // We are going to want a sync object
45750+  wait = vq_wait_new();
45751+
45752+  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
45753+  // If we only posted one thing or only QPU jobs
45754+  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
45755+  {
45756+    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
45757+    av_assert1(j->callback.func == 0);
45758+
45759+    j->callback.func = vpu_qpu_job_callback_wait;
45760+    j->callback.cookie = wait;
45761+  }
45762+  else
45763+  {
45764+    struct gpu_job_s *const j = new_job(vqj);
45765+
45766+    j->command = EXECUTE_SYNC;
45767+    j->u.s.mask = vqj->mask;
45768+    j->callback.func = vpu_qpu_job_callback_wait;
45769+    j->callback.cookie = wait;
45770+  }
45771+
45772+  vqj->mask = 0;
45773+  *wait_h = wait;
45774+}
45775+
45776+// Returns 0 if no sync added ('cos Q empty), 1 if sync added
45777+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
45778+{
45779+  // If nothing on q then just return
45780+  if (vqj->mask == 0)
45781+    return 0;
45782+
45783+  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
45784+  // If we only posted one thing or only QPU jobs
45785+  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
45786+  {
45787+    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
45788+    av_assert1(j->callback.func == 0);
45789+
45790+    j->callback.func = vpu_qpu_job_callback_sem;
45791+    j->callback.cookie = sem;
45792+  }
45793+  else
45794+  {
45795+    struct gpu_job_s *const j = new_job(vqj);
45796+
45797+    j->command = EXECUTE_SYNC;
45798+    j->u.s.mask = vqj->mask;
45799+    j->callback.func = vpu_qpu_job_callback_sem;
45800+    j->callback.cookie = sem;
45801+  }
45802+
45803+  vqj->mask = 0;
45804+  return 1;
45805+}
45806+
45807+
45808+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
45809+{
45810+  if (vqj->n == 0)
45811+    return 0;
45812+
45813+  return vc_gpuserv_execute_code(vqj->n, vqj->j);
45814+}
45815+
45816+// Simple wrapper of start + delete
45817+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
45818+{
45819+  int rv;
45820+  rv = vpu_qpu_job_start(vqj);
45821+  vpu_qpu_job_delete(vqj);
45822+  return rv;
45823+}
45824+
45825+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
45826+{
45827+  if (wait_h != NULL)
45828+  {
45829+    vq_wait_t * const wait = *wait_h;
45830+    if (wait != NULL) {
45831+      *wait_h = NULL;
45832+      vq_wait_wait(wait);
45833+      vq_wait_delete(wait);
45834+    }
45835+  }
45836+}
45837+
45838+int vpu_qpu_init(void)
45839+{
45840+  gpu_env_t * const ge = gpu_lock_ref();
45841+  if (ge == NULL)
45842+    return -1;
45843+
45844+  if (ge->init_count++ == 0)
45845+  {
45846+    vc_gpuserv_init();
45847+  }
45848+
45849+  gpu_unlock();
45850+  return 0;
45851+}
45852+
45853+void vpu_qpu_term(void)
45854+{
45855+  gpu_env_t * const ge = gpu_lock();
45856+
45857+  if (--ge->init_count == 0) {
45858+    vc_gpuserv_deinit();
45859+
45860+#if RPI_TRACE_TIME_VPU_QPU_WAIT
45861+    ttw_print(&ge->ttw, ns_time());
45862+#endif
45863+  }
45864+
45865+  gpu_unlock_unref(ge);
45866+}
45867+
45868+uint32_t qpu_fn(const int * const mc_fn)
45869+{
45870+  return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
45871+}
45872+
45873+uint32_t qpu_dummy(void)
45874+{
45875+  return gpu->dummy_gm_ptr.vc;
45876+}
45877+
45878+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
45879+{
45880+  // Dummy values we can catch with emulation
45881+  qf->y_pxx = ~1U;
45882+  qf->y_bxx = ~2U;
45883+  qf->y_p00 = ~3U;
45884+  qf->y_b00 = ~4U;
45885+  qf->c_pxx = ~5U;
45886+  qf->c_bxx = ~6U;
45887+
45888+  switch (bit_depth) {
45889+    case 8:
45890+      qf->y_pxx = qpu_fn(mc_filter_y_pxx);
45891+      // (removed an accidental duplicate of the y_pxx assignment above)
45892+      qf->y_bxx = qpu_fn(mc_filter_y_bxx);
45893+      qf->y_p00 = qpu_fn(mc_filter_y_p00);
45894+      qf->y_b00 = qpu_fn(mc_filter_y_b00);
45895+      qf->c_pxx = qpu_fn(mc_filter_c_p);
45896+      qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
45897+      qf->c_bxx = qpu_fn(mc_filter_c_b);
45898+      break;
45899+    case 10:
45900+      qf->c_pxx = qpu_fn(mc_filter_c10_p);
45901+      qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
45902+      qf->c_bxx = qpu_fn(mc_filter_c10_b);
45903+      qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
45904+      qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
45905+      qf->y_p00 = qpu_fn(mc_filter_y10_p00);
45906+      qf->y_b00 = qpu_fn(mc_filter_y10_b00);
45907+      break;
45908+    default:
45909+      return -1;
45910+  }
45911+  return 0;
45912+}
45913+
45914--- /dev/null
45915+++ b/libavcodec/rpi_qpu.h
45916@@ -0,0 +1,103 @@
45917+/*
45918+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
45919+All rights reserved.
45920+
45921+Redistribution and use in source and binary forms, with or without
45922+modification, are permitted provided that the following conditions are met:
45923+    * Redistributions of source code must retain the above copyright
45924+      notice, this list of conditions and the following disclaimer.
45925+    * Redistributions in binary form must reproduce the above copyright
45926+      notice, this list of conditions and the following disclaimer in the
45927+      documentation and/or other materials provided with the distribution.
45928+    * Neither the name of the copyright holder nor the
45929+      names of its contributors may be used to endorse or promote products
45930+      derived from this software without specific prior written permission.
45931+
45932+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
45933+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45934+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
45935+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
45936+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
45937+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45938+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45939+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
45940+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45941+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45942+
45943+Authors: John Cox, Ben Avison
45944+*/
45945+
45946+#ifndef RPI_QPU_H
45947+#define RPI_QPU_H
45948+
45949+#include "rpi_mem.h"
45950+#include "rpi_zc_frames.h"
45951+
45952+#pragma GCC diagnostic push
45953+// Many many redundant decls in the header files
45954+#pragma GCC diagnostic ignored "-Wredundant-decls"
45955+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
45956+#include "interface/vmcs_host/vc_vchi_gpuserv.h"  // for gpu_job_s
45957+#pragma GCC diagnostic pop
45958+
45959+// QPU specific functions
45960+
45961+typedef struct HEVCRpiQpu {
45962+    uint32_t c_pxx;
45963+    uint32_t c_pxx_l1;
45964+    uint32_t c_bxx;
45965+    uint32_t y_pxx;
45966+    uint32_t y_bxx;
45967+    uint32_t y_p00;
45968+    uint32_t y_b00;
45969+} HEVCRpiQpu;
45970+
45971+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
45972+
45973+uint32_t qpu_fn(const int * const mc_fn);
45974+uint32_t qpu_dummy(void);
45975+
45976+#define QPU_N_GRP    4
45977+#define QPU_N_MAX    12
45978+
45979+#define QPU_MAIL_EL_VALS  2
45980+
45981+struct vpu_qpu_wait_s;
45982+typedef struct vq_wait_s * vpu_qpu_wait_h;
45983+
45984+// VPU specific functions
45985+
45986+struct vpu_qpu_job_env_s;
45987+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
45988+
45989+#define VPU_QPU_JOB_MAX 4
45990+struct vpu_qpu_job_env_s
45991+{
45992+  unsigned int n;
45993+  unsigned int mask;
45994+  struct gpu_job_s j[VPU_QPU_JOB_MAX];
45995+};
45996+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
45997+
45998+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
45999+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
46000+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
46001+  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
46002+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
46003+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
46004+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
46005+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
46006+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
46007+
46008+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
46009+extern unsigned int vpu_get_constants(void);
46010+
46011+// Waits for previous post_code to complete and will null out *wait_h after use
46012+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
46013+int vpu_qpu_init(void);
46014+void vpu_qpu_term(void);
46015+
46016+void gpu_ref(void);
46017+void gpu_unref(void);
46018+
46019+#endif
46020--- /dev/null
46021+++ b/libavcodec/rpi_zc.c
46022@@ -0,0 +1,1227 @@
46023+#include "config.h"
46024+
46025+#include "libavcodec/avcodec.h"
46026+#include "rpi_mem.h"
46027+#include "rpi_mailbox.h"
46028+#include "rpi_zc.h"
46029+#include "libavutil/avassert.h"
46030+#include <pthread.h>
46031+
46032+#include "libavutil/buffer_internal.h"
46033+
46034+#pragma GCC diagnostic push
46035+// Many many redundant decls in the header files
46036+#pragma GCC diagnostic ignored "-Wredundant-decls"
46037+#include <interface/vctypes/vc_image_types.h>
46038+#include <interface/vcsm/user-vcsm.h>
46039+#pragma GCC diagnostic pop
46040+
46041+#define TRACE_ALLOC 0
46042+#define DEBUG_ALWAYS_KEEP_LOCKED 0
46043+
46044+struct ZcPoolEnt;
46045+
46046+typedef struct ZcPool
46047+{
46048+    size_t numbytes;
46049+    struct ZcPoolEnt * head;
46050+    pthread_mutex_t lock;
46051+} ZcPool;
46052+
46053+typedef struct ZcPoolEnt
46054+{
46055+    size_t numbytes;
46056+
46057+    unsigned int vcsm_handle;
46058+    unsigned int vc_handle;
46059+    void * map_arm;
46060+    unsigned int map_vc;
46061+
46062+    struct ZcPoolEnt * next;
46063+    struct ZcPool * pool;
46064+} ZcPoolEnt;
46065+
46066+typedef struct ZcOldCtxVals
46067+{
46068+    int thread_safe_callbacks;
46069+    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
46070+    void * opaque;
46071+} ZcOldCtxVals;
46072+
46073+typedef struct AVZcEnv
46074+{
46075+    unsigned int refcount;
46076+    ZcOldCtxVals old;
46077+
46078+    void * pool_env;
46079+    av_rpi_zc_alloc_buf_fn_t * alloc_buf;
46080+    av_rpi_zc_free_pool_fn_t * free_pool;
46081+
46082+    unsigned int pool_size;
46083+} ZcEnv;
46084+
46085+typedef struct ZcUserBufEnv {
46086+    void * v;
46087+    const av_rpi_zc_buf_fn_tab_t * fn;
46088+    size_t numbytes;
46089+    int offset;
46090+} ZcUserBufEnv;
46091+
46092+#define ZC_BUF_INVALID  0
46093+#define ZC_BUF_VALID    1
46094+#define ZC_BUF_NEVER    2
46095+
46096+typedef struct ZcBufEnv {
46097+    GPU_MEM_PTR_T gmem;
46098+    AVZcEnvPtr zc;
46099+    int is_valid;
46100+    AVBufferRef * user;
46101+    AVRpiZcFrameGeometry geo;
46102+    size_t size_y;
46103+    size_t size_c;
46104+    size_t size_pic;
46105+    ssize_t offset;
46106+    pthread_mutex_t lock;
46107+    pthread_cond_t cond;
46108+} ZcBufEnv;
46109+
46110+
46111+
46112+
46113+
46114+
46115+#define ALLOC_PAD       0
46116+#define ALLOC_ROUND     0x1000
46117+#define STRIDE_ROUND    64
46118+#define STRIDE_OR       0
46119+
46120+#define DEBUG_ZAP0_BUFFERS 0
46121+
46122+static inline int av_rpi_is_sand_format(const int format)
46123+{
46124+    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) ||
46125+        (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10);
46126+}
46127+
46128+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
46129+{
46130+    return av_rpi_is_sand_format(frame->format);
46131+}
46132+
46133+//----------------------------------------------------------------------------
46134+//
46135+// Internal pool stuff
46136+
46137+// Pool entry functions
46138+
46139+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
46140+{
46141+    ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
46142+
46143+    // Round up to 4k & add 4k
46144+    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
46145+
46146+    if (zp == NULL) {
46147+        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
46148+        goto fail0;
46149+    }
46150+
46151+    // The 0x80 here maps all pages here rather than waiting for lazy mapping
46152+    // BEWARE that in GPU land a later unlock/lock pair will put us back into
46153+    // lazy mode - which will also break cache invalidate calls.
46154+    if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
46155+    {
46156+        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%u) failed\n", alloc_size);
46157+        goto fail1;
46158+    }
46159+
46160+#if TRACE_ALLOC
46161+    printf("%s: Alloc %#x bytes @ h=%u\n", __func__, alloc_size, zp->vcsm_handle);
46162+#endif
46163+
46164+    zp->numbytes = alloc_size;
46165+    zp->pool = pool;
46166+    return zp;
46167+
46168+fail1:
46169+    av_free(zp);
46170+fail0:
46171+    return NULL;
46172+}
46173+
46174+static void zc_pool_ent_free(ZcPoolEnt * const zp)
46175+{
46176+#if TRACE_ALLOC
46177+    printf("%s: Free %#zx bytes @ h=%u\n", __func__, zp->numbytes, zp->vcsm_handle);
46178+#endif
46179+
46180+    if (zp->vcsm_handle != 0)
46181+    {
46182+        // VC addr & handle need no dealloc
46183+        if (zp->map_arm != NULL)
46184+            vcsm_unlock_hdl(zp->vcsm_handle);
46185+        vcsm_free(zp->vcsm_handle);
46186+    }
46187+    av_free(zp);
46188+}
46189+
46190+//----------------------------------------------------------------------------
46191+//
46192+// Pool functions
46193+
46194+static void zc_pool_free_ent_list(ZcPoolEnt * p)
46195+{
46196+    while (p != NULL)
46197+    {
46198+        ZcPoolEnt * const zp = p;
46199+        p = p->next;
46200+        zc_pool_ent_free(zp);
46201+    }
46202+}
46203+
46204+static void zc_pool_flush(ZcPool * const pool)
46205+{
46206+    ZcPoolEnt * p = pool->head;
46207+    pool->head = NULL;
46208+    pool->numbytes = ~0U;
46209+    zc_pool_free_ent_list(p);
46210+}
46211+
46212+static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
46213+{
46214+    ZcPoolEnt * zp = NULL;
46215+    ZcPoolEnt * flush_list = NULL;
46216+    size_t numbytes;
46217+
46218+    pthread_mutex_lock(&pool->lock);
46219+
46220+    numbytes = pool->numbytes;
46221+
46222+    // If size isn't close then dump the pool
46223+    // Close in this context means within 128k
46224+    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
46225+    {
46226+        flush_list = pool->head;
46227+        pool->head = NULL;
46228+        pool->numbytes = numbytes = req_bytes;
46229+    }
46230+    else if (pool->head != NULL)
46231+    {
46232+        zp = pool->head;
46233+        pool->head = zp->next;
46234+    }
46235+
46236+    pthread_mutex_unlock(&pool->lock);
46237+
46238+    zc_pool_free_ent_list(flush_list);
46239+
46240+    if (zp == NULL)
46241+        zp = zc_pool_ent_alloc(pool, numbytes);
46242+
46243+    return zp;
46244+}
46245+
46246+static void zc_pool_put_ent(ZcPoolEnt * const zp)
46247+{
46248+    ZcPool * const pool = zp == NULL ? NULL : zp->pool;
46249+    if (zp != NULL)
46250+    {
46251+        pthread_mutex_lock(&pool->lock);
46252+#if TRACE_ALLOC
46253+        printf("%s: Recycle %#zx, %#zx\n", __func__, pool->numbytes, zp->numbytes);
46254+#endif
46255+
46256+        if (pool->numbytes == zp->numbytes)
46257+        {
46258+            zp->next = pool->head;
46259+            pool->head = zp;
46260+            pthread_mutex_unlock(&pool->lock);
46261+        }
46262+        else
46263+        {
46264+            pthread_mutex_unlock(&pool->lock);
46265+            zc_pool_ent_free(zp);
46266+        }
46267+    }
46268+}
46269+
46270+static ZcPool *
46271+zc_pool_new(void)
46272+{
46273+    ZcPool * const pool = av_mallocz(sizeof(*pool));
46274+    if (pool == NULL)
46275+        return NULL;
46276+
46277+    pool->numbytes = -1;
46278+    pool->head = NULL;
46279+    pthread_mutex_init(&pool->lock, NULL);
46280+    return pool;
46281+}
46282+
46283+static void
46284+zc_pool_delete(ZcPool * const pool)
46285+{
46286+    if (pool != NULL)
46287+    {
46288+        pool->numbytes = -1;
46289+        zc_pool_flush(pool);
46290+        pthread_mutex_destroy(&pool->lock);
46291+        av_free(pool);
46292+    }
46293+}
46294+
46295+//============================================================================
46296+//
46297+// ZC implementation using above pool implementation
46298+//
46299+// Fn table fns...
46300+
46301+static void zc_pool_free_v(void * v)
46302+{
46303+    zc_pool_put_ent(v);
46304+}
46305+
46306+static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
46307+{
46308+    ZcPoolEnt * zp = v;
46309+    return zp->vcsm_handle;
46310+}
46311+
46312+static unsigned int zc_pool_ent_vc_handle_v(void * v)
46313+{
46314+    ZcPoolEnt * zp = v;
46315+    if (zp->vc_handle == 0)
46316+    {
46317+        if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0)
46318+            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n",
46319+                   __func__, zp->vcsm_handle);
46320+    }
46321+    return zp->vc_handle;
46322+}
46323+
46324+static void * zc_pool_ent_map_arm_v(void * v)
46325+{
46326+    ZcPoolEnt * zp = v;
46327+    if (zp->map_arm == NULL)
46328+    {
46329+        if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL)
46330+            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n",
46331+                   __func__, zp->vcsm_handle);
46332+    }
46333+    return zp->map_arm;
46334+}
46335+
46336+static unsigned int zc_pool_ent_map_vc_v(void * v)
46337+{
46338+    ZcPoolEnt * zp = v;
46339+    if (zp->map_vc == 0)
46340+    {
46341+        if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0)
46342+            av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n",
46343+                   __func__, zp->vcsm_handle);
46344+    }
46345+    return zp->map_vc;
46346+}
46347+
46348+static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {
46349+    .free        = zc_pool_free_v,
46350+    .vcsm_handle = zc_pool_ent_vcsm_handle_v,
46351+    .vc_handle   = zc_pool_ent_vc_handle_v,
46352+    .map_arm     = zc_pool_ent_map_arm_v,
46353+    .map_vc      = zc_pool_ent_map_vc_v,
46354+};
46355+
46356+// ZC Env fns
46357+
46358+// Delete pool
46359+// All buffers guaranteed freed by now
46360+static void
46361+zc_pool_delete_v(void * v)
46362+{
46363+    zc_pool_delete((ZcPool *)v);
46364+    rpi_mem_gpu_uninit();
46365+}
46366+
46367+// Allocate a new ZC buffer
46368+static AVBufferRef *
46369+zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
46370+{
46371+    ZcPool * const pool = v;
46372+    ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
46373+    AVBufferRef * buf;
46374+
46375+    (void)geo;  // geo ignored here
46376+
46377+    if (zp == NULL) {
46378+        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%zu) failed\n", size);
46379+        goto fail0;
46380+    }
46381+
46382+    if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
46383+    {
46384+        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
46385+        goto fail2;
46386+    }
46387+
46388+    return buf;
46389+
46390+fail2:
46391+    zc_pool_put_ent(zp);
46392+fail0:
46393+    return NULL;
46394+}
46395+
46396+// Init wrappers - the public fns
46397+
46398+AVZcEnvPtr
46399+av_rpi_zc_int_env_alloc(void * logctx)
46400+{
46401+    ZcEnv * zc;
46402+    ZcPool * pool_env;
46403+
46404+    if (rpi_mem_gpu_init(0) < 0)
46405+        return NULL;
46406+
46407+    if ((pool_env = zc_pool_new()) == NULL)
46408+        goto fail1;
46409+
46410+    if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL)
46411+        goto fail2;
46412+
46413+    return zc;
46414+
46415+fail2:
46416+    zc_pool_delete(pool_env);
46417+fail1:
46418+    rpi_mem_gpu_uninit();
46419+    return NULL;
46420+}
46421+
46422+void
46423+av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)
46424+{
46425+    const AVZcEnvPtr zc = *zcp;
46426+    *zcp = NULL;
46427+    if (zc != NULL)
46428+        av_rpi_zc_env_release(zc);
46429+}
46430+
46431+//============================================================================
46432+//
46433+// Geometry
46434+//
46435+// This is a separate chunk to the rest
46436+
46437+// Get mailbox fd - should be in a lock when called
46438+// Rely on process close to close it
46439+static int mbox_fd(void)
46440+{
46441+    static int fd = -1;
46442+    if (fd != -1)
46443+        return fd;
46444+    return (fd = mbox_open());
46445+}
46446+
46447+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
46448+    const int format, const unsigned int video_width, const unsigned int video_height)
46449+{
46450+    static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
46451+
46452+    AVRpiZcFrameGeometry geo = {
46453+        .format       = format,
46454+        .video_width  = video_width,
46455+        .video_height = video_height
46456+    };
46457+
46458+    switch (format)
46459+    {
46460+        case AV_PIX_FMT_YUV420P:
46461+            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
46462+            geo.stride_c = geo.stride_y / 2;
46463+            geo.height_y = (video_height + 32 + 31) & ~31;
46464+            geo.height_c = geo.height_y / 2;
46465+            geo.planes_c = 2;
46466+            geo.stripes = 1;
46467+            geo.bytes_per_pel = 1;
46468+            geo.stripe_is_yc = 1;
46469+            break;
46470+
46471+        case AV_PIX_FMT_YUV420P10:
46472+            geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
46473+            geo.stride_c = geo.stride_y / 2;
46474+            geo.height_y = (video_height + 32 + 31) & ~31;
46475+            geo.height_c = geo.height_y / 2;
46476+            geo.planes_c = 2;
46477+            geo.stripes = 1;
46478+            geo.bytes_per_pel = 2;
46479+            geo.stripe_is_yc = 1;
46480+            break;
46481+
46482+        case AV_PIX_FMT_SAND128:
46483+        case AV_PIX_FMT_RPI4_8:
46484+        {
46485+            const unsigned int stripe_w = 128;
46486+
46487+            static VC_IMAGE_T img = {0};
46488+
46489+            // Given the overhead of calling the mailbox keep a stashed
46490+            // copy as we will almost certainly just want the same numbers again
46491+            // but that means we need a lock
46492+            pthread_mutex_lock(&sand_lock);
46493+
46494+            if (img.width != video_width || img.height != video_height)
46495+            {
46496+                VC_IMAGE_T new_img = {
46497+                    .type = VC_IMAGE_YUV_UV,
46498+                    .width = video_width,
46499+                    .height = video_height
46500+                };
46501+
46502+                mbox_get_image_params(mbox_fd(), &new_img);
46503+                img = new_img;
46504+            }
46505+
46506+            geo.stride_y = stripe_w;
46507+            geo.stride_c = stripe_w;
46508+            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
46509+            geo.height_c = img.pitch / stripe_w - geo.height_y;
46510+            geo.stripe_is_yc = 1;
46511+            if (geo.height_y * stripe_w > img.pitch)
46512+            {
46513+                // "tall" sand - all C blocks now follow Y
46514+                geo.height_y = img.pitch / stripe_w;
46515+                geo.height_c = geo.height_y;
46516+                geo.stripe_is_yc = 0;
46517+            }
46518+            geo.planes_c = 1;
46519+            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
46520+            geo.bytes_per_pel = 1;
46521+
46522+            pthread_mutex_unlock(&sand_lock);
46523+#if 0
46524+            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
46525+                   video_width, video_height,
46526+                   geo.stride_y, geo.stride_c,
46527+                   geo.height_y, geo.height_c,
46528+                   geo.stripes, img.pitch);
46529+#endif
46530+            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
46531+            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
46532+            break;
46533+        }
46534+
46535+        case AV_PIX_FMT_RPI4_10:
46536+        {
46537+            const unsigned int stripe_w = 128;  // bytes
46538+
46539+            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
46540+            static VC_IMAGE_T img = {0};
46541+
46542+            // Given the overhead of calling the mailbox keep a stashed
46543+            // copy as we will almost certainly just want the same numbers again
46544+            // but that means we need a lock
46545+            pthread_mutex_lock(&sand_lock);
46546+
46547+            if (img.width != video_width || img.height != video_height)
46548+            {
46549+                VC_IMAGE_T new_img = {
46550+                    .type = VC_IMAGE_YUV10COL,
46551+                    .width = video_width,
46552+                    .height = video_height
46553+                };
46554+
46555+                mbox_get_image_params(mbox_fd(), &new_img);
46556+                img = new_img;
46557+            }
46558+
46559+            geo.stride_y = stripe_w;
46560+            geo.stride_c = stripe_w;
46561+            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
46562+            geo.height_c = img.pitch / stripe_w - geo.height_y;
46563+            geo.planes_c = 1;
46564+            geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
46565+            geo.bytes_per_pel = 1;
46566+            geo.stripe_is_yc = 1;
46567+
46568+            pthread_mutex_unlock(&sand_lock);
46569+
46570+#if 0
46571+            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
46572+                   video_width, video_height,
46573+                   geo.stride_y, geo.stride_c,
46574+                   geo.height_y, geo.height_c,
46575+                   geo.stripes, img.pitch);
46576+#endif
46577+            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
46578+            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
46579+            break;
46580+        }
46581+
46582+        case AV_PIX_FMT_SAND64_16:
46583+        case AV_PIX_FMT_SAND64_10:
46584+        {
46585+            const unsigned int stripe_w = 128;  // bytes
46586+
46587+            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
46588+            static VC_IMAGE_T img = {0};
46589+
46590+            // Given the overhead of calling the mailbox keep a stashed
46591+            // copy as we will almost certainly just want the same numbers again
46592+            // but that means we need a lock
46593+            pthread_mutex_lock(&sand_lock);
46594+
46595+             if (img.width != video_width || img.height != video_height)
46596+            {
46597+                VC_IMAGE_T new_img = {
46598+                    .type = VC_IMAGE_YUV_UV_16,
46599+                    .width = video_width,
46600+                    .height = video_height
46601+                };
46602+
46603+                mbox_get_image_params(mbox_fd(), &new_img);
46604+                img = new_img;
46605+            }
46606+
46607+            geo.stride_y = stripe_w;
46608+            geo.stride_c = stripe_w;
46609+            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
46610+            geo.height_c = img.pitch / stripe_w - geo.height_y;
46611+            geo.planes_c = 1;
46612+            geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
46613+            geo.bytes_per_pel = 2;
46614+            geo.stripe_is_yc = 1;
46615+
46616+            pthread_mutex_unlock(&sand_lock);
46617+            break;
46618+        }
46619+
46620+        default:
46621+            break;
46622+    }
46623+    return geo;
46624+}
46625+
46626+//============================================================================
46627+//
46628+// ZC Env fns
46629+//
46630+// Frame copy fns
46631+
46632+static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
46633+    const AVFrame * const src)
46634+{
46635+    AVFrame dest_frame;
46636+    AVFrame * const dest = &dest_frame;
46637+    unsigned int i;
46638+    uint8_t * psrc, * pdest;
46639+
46640+    dest->format = src->format;
46641+    dest->width = src->width;
46642+    dest->height = src->height;
46643+
46644+    if (av_rpi_zc_get_buffer(zc, dest) != 0 ||
46645+        av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0)
46646+    {
46647+        return NULL;
46648+    }
46649+
46650+    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
46651+         i != dest->height;
46652+         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
46653+    {
46654+        memcpy(pdest, psrc, dest->width);
46655+    }
46656+    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
46657+         i != dest->height / 2;
46658+         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
46659+    {
46660+        memcpy(pdest, psrc, dest->width / 2);
46661+    }
46662+    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
46663+         i != dest->height / 2;
46664+         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
46665+    {
46666+        memcpy(pdest, psrc, dest->width / 2);
46667+    }
46668+
46669+    return dest->buf[0];
46670+}
46671+
46672+
46673+static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
46674+    const AVFrame * const src)
46675+{
46676+    assert(0);
46677+    return NULL;
46678+}
46679+
46680+
46681+static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
46682+    const AVFrame * const src, const unsigned int src_bits)
46683+{
46684+    assert(0);
46685+    return NULL;
46686+}
46687+
46688+//----------------------------------------------------------------------------
46689+//
46690+// Public info extraction calls
46691+
46692+static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
46693+
46694+static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
46695+{
46696+    // Kludge where we check the free fn to check this is really
46697+    // one of our buffers - can't think of a better way
46698+    return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL :
46699+        av_buffer_get_opaque(buf);
46700+}
46701+
46702+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
46703+{
46704+    // As gmem is the first el NULL should be preserved
46705+    return &pic_zbe_ptr(buf)->gmem;
46706+}
46707+
46708+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
46709+{
46710+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
46711+    return p == NULL ? 0 : p->vcsm_handle;
46712+}
46713+
46714+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
46715+{
46716+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
46717+    return p == NULL ? -1 : p->vc_handle;
46718+}
46719+
46720+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
46721+{
46722+    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
46723+    return zbe == NULL ? 0 : zbe->offset;
46724+}
46725+
46726+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
46727+{
46728+    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
46729+    return zbe == NULL ? 0 : zbe->size_pic;
46730+}
46731+
46732+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
46733+{
46734+    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
46735+    return p == NULL ? 0 : p->numbytes;
46736+}
46737+
46738+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
46739+{
46740+    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
46741+    return zbe == NULL ? NULL : &zbe->geo;
46742+}
46743+
46744+AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
46745+    const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
46746+{
46747+    av_assert0(!maycopy || zc != NULL);
46748+
46749+    if (frame->format != AV_PIX_FMT_YUV420P &&
46750+        frame->format != AV_PIX_FMT_YUV420P10 &&
46751+        !av_rpi_is_sand_frame(frame))
46752+    {
46753+        av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
46754+        return NULL;
46755+    }
46756+
46757+    if (frame->buf[1] != NULL || frame->format != expected_format)
46758+    {
46759+#if RPI_ZC_SAND_8_IN_10_BUF
46760+        if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
46761+        {
46762+//            av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
46763+            return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
46764+        }
46765+#endif
46766+
46767+        if (maycopy)
46768+        {
46769+            if (frame->buf[1] != NULL)
46770+                av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
46771+            else
46772+                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
46773+
46774+            switch (frame->format)
46775+            {
46776+                case AV_PIX_FMT_YUV420P10:
46777+                    return zc_420p10_to_sand128(zc, frame);
46778+
46779+                case AV_PIX_FMT_SAND64_10:
46780+                    return zc_sand64_16_to_sand128(zc, frame, 10);
46781+
46782+                default:
46783+                    return zc_copy(zc, frame);
46784+            }
46785+        }
46786+        else
46787+        {
46788+            if (frame->buf[1] != NULL)
46789+                av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
46790+            else
46791+                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
46792+            return NULL;
46793+        }
46794+    }
46795+
46796+    if (pic_gm_ptr(frame->buf[0]) == NULL)
46797+    {
46798+        if (maycopy)
46799+        {
46800+            av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
46801+            return zc_copy(zc, frame);
46802+        }
46803+        else
46804+        {
46805+            av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
46806+            return NULL;
46807+        }
46808+    }
46809+
46810+    return av_buffer_ref(frame->buf[0]);
46811+}
46812+
46813+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
46814+{
46815+    if (fr_ref != NULL)
46816+    {
46817+        av_buffer_unref(&fr_ref);
46818+    }
46819+}
46820+
46821+//----------------------------------------------------------------------------
46822+
46823+// Extract user environment from an AVBufferRef
46824+void * av_rpi_zc_buf_v(AVBufferRef * const buf)
46825+{
46826+    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
46827+    if (zbe != NULL && zbe->user != NULL)
46828+    {
46829+        const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data;
46830+        return zub == NULL ? NULL : zub->v;
46831+    }
46832+    return NULL;
46833+}
46834+
46835+// AV buffer pre-free callback
46836+static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
46837+{
46838+    if (opaque != NULL)
46839+    {
46840+        ZcUserBufEnv * const zub = opaque;
46841+
46842+        if (zub->fn->free)
46843+            zub->fn->free(zub->v);
46844+
46845+        av_free(zub);
46846+    }
46847+}
46848+
46849+static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
46850+{
46851+    if (opaque != NULL)
46852+    {
46853+        ZcBufEnv * const zbe = opaque;
46854+
46855+        av_buffer_unref(&zbe->user);
46856+
46857+        if (zbe->zc != NULL)
46858+            av_rpi_zc_env_release(zbe->zc);
46859+
46860+        pthread_cond_destroy(&zbe->cond);
46861+        pthread_mutex_destroy(&zbe->lock);
46862+        av_free(zbe);
46863+    }
46864+}
46865+
46866+
46867+// Wrap the various ZC bits in an AV Buffer and resolve those things we want
46868+// resolved now.
46869+// Currently we resolve everything, but in future we might not
46870+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
46871+{
46872+    AVBufferRef *buf;
46873+    ZcUserBufEnv * zub;
46874+
46875+    if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL)
46876+        return NULL;
46877+
46878+    zub->fn = fn_tab;
46879+    zub->v = v;
46880+    zub->numbytes = numbytes;
46881+    zub->offset = addr_offset;
46882+
46883+    if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL)
46884+    {
46885+        av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
46886+        av_free(zub);
46887+        return NULL;
46888+    }
46889+
46890+    return buf;
46891+}
46892+
46893+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
46894+{
46895+    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
46896+
46897+    if (zbe == NULL)
46898+        return AVERROR(EINVAL);
46899+
46900+    if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
46901+        return AVERROR(EAGAIN);
46902+
46903+    if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
46904+    {
46905+        pthread_mutex_lock(&zbe->lock);
46906+        while (!zbe->is_valid)
46907+            pthread_cond_wait(&zbe->cond, &zbe->lock);
46908+        pthread_mutex_unlock(&zbe->lock);
46909+    }
46910+
46911+    if (zbe->is_valid == ZC_BUF_NEVER)
46912+        return AVERROR(EINVAL);
46913+
46914+    // Do alloc if we need it (allocation is deferred until first resolve)
46915+    if (zbe->user == NULL)
46916+    {
46917+        ZcEnv * const zc = zbe->zc;
46918+        const ZcUserBufEnv * zub;
46919+
46920+        av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
46921+
46922+        if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
46923+        {
46924+            av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
46925+            goto fail;
46926+        }
46927+        zub = (const ZcUserBufEnv *)zbe->user->data;
46928+
46929+        // Track
46930+
46931+        zbe->offset = zub->offset;
46932+        zbe->gmem.numbytes = zub->numbytes;
46933+        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)  // get handle first so failure logs below can report it
46934+        {
46935+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
46936+            goto fail;
46937+        }
46938+
46939+        if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL)
46940+        {
46941+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle);
46942+            goto fail;
46943+        }
46944+
46945+        if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
46946+        {
46947+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
46948+            goto fail;
46949+        }
46950+        if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
46951+        {
46952+            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
46953+            goto fail;
46954+        }
46955+
46956+        buf->buffer->data = zbe->gmem.arm + zbe->offset;
46957+        buf->buffer->size = zbe->size_pic;
46958+
46959+        // In this mode we shouldn't have anyone waiting for us
46960+        // so no need to signal
46961+        if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
46962+            zbe->is_valid = 1;
46963+    }
46964+
46965+    // Just overwrite - no point in testing
46966+    buf->data = zbe->gmem.arm + zbe->offset;
46967+    buf->size = zbe->size_pic;
46968+    return 0;
46969+
46970+fail:
46971+    av_buffer_unref(&zbe->user);
46972+    return AVERROR(ENOMEM);
46973+}
46974+
46975+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
46976+{
46977+    int rv;
46978+
46979+    // Do alloc if we need it
46980+    if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0)
46981+        return rv;
46982+
46983+    // If we are a framebuf copy then the alloc can be done but we haven't
46984+    // imported its results yet
46985+    if (frame->data[0] == NULL)
46986+    {
46987+        const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
46988+
46989+        frame->linesize[0] = zbe->geo.stride_y;
46990+        frame->linesize[1] = zbe->geo.stride_c;
46991+        frame->linesize[2] = zbe->geo.stride_c;
46992+        // abuse: linesize[3] = "stripe stride"
46993+        // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
46994+        // In a general case this makes the calculation an xor and multiply rather
46995+        // than a divide and multiply
46996+        if (zbe->geo.stripes > 1)
46997+            frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y;
46998+
46999+        frame->data[0] = frame->buf[0]->data;
47000+        frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes);
47001+        if (zbe->geo.planes_c > 1)
47002+            frame->data[2] = frame->data[1] + zbe->size_c;
47003+
47004+        frame->extended_data = frame->data;
47005+        // Leave extended buf alone
47006+    }
47007+
47008+    return 0;
47009+}
47010+
47011+int av_rpi_zc_set_valid_frame(AVFrame * const frame)
47012+{
47013+    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
47014+
47015+    if (zbe == NULL)
47016+        return AVERROR(EINVAL);
47017+
47018+    pthread_mutex_lock(&zbe->lock); zbe->is_valid = ZC_BUF_VALID;  // predicate change under the waiter's mutex avoids a lost wakeup
47019+    pthread_cond_broadcast(&zbe->cond);
47020+    pthread_mutex_unlock(&zbe->lock);
47021+    return 0;
47022+}
47023+
47024+int av_rpi_zc_set_broken_frame(AVFrame * const frame)
47025+{
47026+    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
47027+
47028+    if (zbe == NULL)
47029+        return AVERROR(EINVAL);
47030+
47031+    pthread_mutex_lock(&zbe->lock); zbe->is_valid = ZC_BUF_NEVER;  // predicate change under the waiter's mutex avoids a lost wakeup
47032+    pthread_cond_broadcast(&zbe->cond);
47033+    pthread_mutex_unlock(&zbe->lock);
47034+    return 0;
47035+}
47036+
47037+void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
47038+{
47039+    zc->pool_size = pool_size;
47040+}
47041+
47042+unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
47043+{
47044+    return zc->pool_size;
47045+}
47046+
47047+int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
47048+{
47049+#if 1
47050+    ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
47051+
47052+    for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
47053+        frame->buf[i] = NULL;
47054+        frame->data[i] = NULL;
47055+        frame->linesize[i] = 0;
47056+    }
47057+
47058+    if (zbe == NULL)
47059+        return AVERROR(ENOMEM);
47060+
47061+    if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
47062+    {
47063+        av_free(zbe);
47064+        return AVERROR(ENOMEM);
47065+    }
47066+
47067+    pthread_mutex_init(&zbe->lock, NULL);
47068+    pthread_cond_init(&zbe->cond, NULL);
47069+    zbe->zc = zc;
47070+    atomic_fetch_add(&zc->refcount, 1);
47071+
47072+    zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);  // Note geometry for later use
47073+    zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
47074+    zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
47075+    zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
47076+
47077+#else
47078+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
47079+    const unsigned int size_y = geo.stride_y * geo.height_y;
47080+    const unsigned int size_c = geo.stride_c * geo.height_c;
47081+    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
47082+    AVBufferRef * buf;
47083+    unsigned int i;
47084+
47085+//    printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
47086+
47087+    if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
47088+    {
47089+        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
47090+        return AVERROR(ENOMEM);
47091+    }
47092+
47093+    // Track
47094+    atomic_fetch_add(&zc->refcount, 1);
47095+    pic_zbe_ptr(buf)->zc = zc;
47096+
47097+    for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
47098+        frame->buf[i] = NULL;
47099+        frame->data[i] = NULL;
47100+        frame->linesize[i] = 0;
47101+    }
47102+
47103+    frame->buf[0] = buf;
47104+
47105+    frame->linesize[0] = geo.stride_y;
47106+    frame->linesize[1] = geo.stride_c;
47107+    frame->linesize[2] = geo.stride_c;
47108+    // abuse: linesize[3] = "stripe stride"
47109+    // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
47110+    // In a general case this makes the calculation an xor and multiply rather
47111+    // than a divide and multiply
47112+    if (geo.stripes > 1)
47113+        frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
47114+
47115+    frame->data[0] = buf->data;
47116+    frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
47117+    if (geo.planes_c > 1)
47118+        frame->data[2] = frame->data[1] + size_c;
47119+
47120+    frame->extended_data = frame->data;
47121+    // Leave extended buf alone
47122+
47123+#if RPI_ZC_SAND_8_IN_10_BUF != 0
47124+    // *** If we intend to use this for real we will want a 2nd buffer pool
47125+    frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic);  // *** 2 * wanted size - kludge
47126+#endif
47127+#endif
47128+
47129+    return 0;
47130+}
47131+
47132+void av_rpi_zc_env_release(const AVZcEnvPtr zc)
47133+{
47134+    const int n = atomic_fetch_add(&zc->refcount, -1);
47135+    if (n == 1)  // was 1, now 0
47136+    {
47137+        zc->free_pool(zc->pool_env);
47138+        av_free(zc);
47139+    }
47140+}
47141+
47142+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
47143+                    void * pool_env,
47144+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
47145+                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
47146+{
47147+    ZcEnv * zc;
47148+
47149+    if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL)
47150+    {
47151+        av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
47152+        return NULL;
47153+    }
47154+
47155+    *zc = (ZcEnv){
47156+        .refcount = ATOMIC_VAR_INIT(1),
47157+        .pool_env = pool_env,
47158+        .alloc_buf = alloc_buf_fn,
47159+        .free_pool = free_pool_fn,
47160+        .pool_size = 0
47161+    };
47162+
47163+    return zc;
47164+}
47165+
47166+//============================================================================
47167+//
47168+// External ZC initialisation
47169+
47170+#define RPI_GET_BUFFER2 1
47171+
47172+
47173+static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
47174+{
47175+#if !RPI_GET_BUFFER2
47176+    return avcodec_default_get_buffer2(s, frame, flags);
47177+#else
47178+    int rv;
47179+
47180+    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
47181+    {
47182+//        printf("Do default alloc: format=%#x\n", frame->format);
47183+        rv = avcodec_default_get_buffer2(s, frame, flags);
47184+    }
47185+    else if (frame->format == AV_PIX_FMT_YUV420P ||
47186+             av_rpi_is_sand_frame(frame))
47187+    {
47188+        if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
47189+            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
47190+    }
47191+    else
47192+    {
47193+        rv = avcodec_default_get_buffer2(s, frame, flags);
47194+    }
47195+
47196+#if 0
47197+    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
47198+        frame->format, frame->width, frame->height,
47199+        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
47200+        frame->data[0], frame->data[1], frame->data[2],
47201+        frame->buf[0], frame->buf[1], frame->buf[2],
47202+        av_buffer_get_opaque(frame->buf[0]));
47203+#endif
47204+    return rv;
47205+#endif
47206+}
47207+
47208+int av_rpi_zc_in_use(const struct AVCodecContext * const s)
47209+{
47210+    return s->get_buffer2 == zc_get_buffer2;
47211+}
47212+
47213+int av_rpi_zc_init2(struct AVCodecContext * const s,
47214+                    void * pool_env,
47215+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
47216+                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
47217+{
47218+    ZcEnv * zc;
47219+
47220+    av_assert0(!av_rpi_zc_in_use(s));
47221+
47222+    if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
47223+        return AVERROR(ENOMEM);
47224+
47225+    zc->old = (ZcOldCtxVals){
47226+        .opaque = s->opaque,
47227+        .get_buffer2 = s->get_buffer2,
47228+        .thread_safe_callbacks = s->thread_safe_callbacks
47229+    };
47230+
47231+    s->opaque = zc;
47232+    s->get_buffer2 = zc_get_buffer2;
47233+    s->thread_safe_callbacks = 1;
47234+    return 0;
47235+}
47236+
47237+void av_rpi_zc_uninit2(struct AVCodecContext * const s)
47238+{
47239+    ZcEnv * const zc = s->opaque;
47240+
47241+    av_assert0(av_rpi_zc_in_use(s));
47242+
47243+    s->get_buffer2 = zc->old.get_buffer2;
47244+    s->opaque = zc->old.opaque;
47245+    s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
47246+
47247+    av_rpi_zc_env_release(zc);
47248+}
47249+
47250--- /dev/null
47251+++ b/libavcodec/rpi_zc.h
47252@@ -0,0 +1,228 @@
47253+/*
47254+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
47255+All rights reserved.
47256+
47257+Redistribution and use in source and binary forms, with or without
47258+modification, are permitted provided that the following conditions are met:
47259+    * Redistributions of source code must retain the above copyright
47260+      notice, this list of conditions and the following disclaimer.
47261+    * Redistributions in binary form must reproduce the above copyright
47262+      notice, this list of conditions and the following disclaimer in the
47263+      documentation and/or other materials provided with the distribution.
47264+    * Neither the name of the copyright holder nor the
47265+      names of its contributors may be used to endorse or promote products
47266+      derived from this software without specific prior written permission.
47267+
47268+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
47269+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
47270+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
47271+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
47272+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
47273+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
47274+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47275+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47276+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
47277+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47278+
47279+Authors: John Cox
47280+*/
47281+
47282+#ifndef LIBAVCODEC_RPI_ZC_H
47283+#define LIBAVCODEC_RPI_ZC_H
47284+
47285+// Zero-Copy frame code for RPi
47286+// RPi needs Y/U/V planes to be contiguous for display.  By default
47287+// ffmpeg will allocate separated planes so a memcpy is needed before
47288+// display.  This code provides a method of making ffmpeg allocate a single
47289+// bit of memory for the frame which can then be reference counted until
47290+// display has finished with it.
47291+
47292+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
47293+// 0 disables
47294+// *** This option still in development
47295+//     Only works if SAO active
47296+//     Allocates buffers that are twice the required size
47297+#define RPI_ZC_SAND_8_IN_10_BUF  0
47298+
47299+struct AVBufferRef;
47300+struct AVFrame;
47301+struct AVCodecContext;
47302+enum AVPixelFormat;
47303+
47304+// "Opaque" pointer to whatever we are using as a buffer reference
47305+typedef struct AVBufferRef * AVRpiZcRefPtr;
47306+
47307+struct AVZcEnv;
47308+typedef struct AVZcEnv * AVZcEnvPtr;
47309+
47310+typedef struct AVRpiZcFrameGeometry
47311+{
47312+    unsigned int stride_y;  // Luma stride (bytes)
47313+    unsigned int height_y;  // Luma height (lines)
47314+    unsigned int stride_c;  // Chroma stride (bytes)
47315+    unsigned int height_c;  // Chroma height (lines)
47316+    unsigned int planes_c;  // Chroma plane count (U, V = 2, interleaved = 1)
47317+    unsigned int stripes;   // Number of stripes (sand)
47318+    unsigned int bytes_per_pel;
47319+    int stripe_is_yc;       // A single stripe is Y then C (false for tall sand)
47320+
47321+    int format;                 // Requested format
47322+    unsigned int video_width;   // Requested width
47323+    unsigned int video_height;  // Requested height
47324+} AVRpiZcFrameGeometry;
47325+
47326+// Get expected MMAL geometry for a given format, width & height
47327+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
47328+    const int format,
47329+    const unsigned int video_width, const unsigned int video_height);
47330+
47331+//----------------------------------------------------------------------------
47332+//
47333+// Calls that extract info from a ZC frame whether internally or externally
47334+// allocated
47335+
47336+// Generate a ZC reference to the buffer(s) in this frame
47337+// If the buffer doesn't appear to be one allocated by ZC
47338+// then the behaviour depends on maycopy:
47339+//   If maycopy=0 then return NULL
47340+//   If maycopy=1 && the src frame is in a form where we can easily copy
47341+//     the data, then allocate a new buffer and copy the data into it
47342+//   Otherwise return NULL
47343+// If maycopy == 0 then ZC may be NULL
47344+AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
47345+    const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
47346+
47347+// Unreference the buffer refed/allocated by _zc_ref
47348+// If fr_ref is NULL then this will NOP
47349+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
47350+
47351+// Get the vc_handle from the frame ref
47352+// Returns -1 if ref doesn't look valid
47353+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
47354+// Get the vcsm_handle from the frame ref
47355+// Returns 0 if ref doesn't look valid
47356+unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
47357+// Get offset from the start of the memory referenced
47358+// by the vc_handle to valid data
47359+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
47360+// Length of buffer data
47361+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
47362+// Get the number of bytes allocated from the frame ref
47363+// Returns 0 if ref doesn't look valid
47364+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
47365+// Geometry this frame was allocated with
47366+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
47367+
47368+//----------------------------------------------------------------------------
47369+//
47370+// Calls for external frame allocation
47371+
47372+// Callbacks registered in av_rpi_zc_init2
47373+
47374+// Callback to allocate a buf for a frame
47375+// The frame itself is generated in the calling code
47376+//
47377+// Parameters:
47378+//   pool_env  value passed to av_rpi_zc_init2
47379+//   size      size wanted
47380+//   geo       geometry of the frame to be allocated
47381+// Returns:
47382+//   NULL      Alloc failed
47383+//   ptr       AVBufferRef* of allocated buffer
47384+//             In most cases av_rpi_zc_buf will be called by this function
47385+//             and this will be the buf returned by that.
47386+typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
47387+                                               const AVRpiZcFrameGeometry * geo);
47388+
47389+// Callback once ffmpeg is completely done with this pool
47390+// Called once all allocated buffers have been derefed and ffmpegs ref to this
47391+// pool has been dropped
47392+typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
47393+
47394+// Init ZC into a context
47395+// Sets opaque, get_buffer2, thread_safe_callbacks
47396+// Use if you want to allocate your own pools and/or create ZC buffers for
47397+// all decoders
47398+// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
47399+// apart by av_rpi_zc_xxx calls without this
47400+int av_rpi_zc_init2(struct AVCodecContext * const s,
47401+                    void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
47402+                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
47403+
47404+// Free ZC from a context
47405+void av_rpi_zc_uninit2(struct AVCodecContext * const s);
47406+
47407+// Get minimum pool size in frames - valid by the time the first alloc request
47408+// occurs.  Takes into account thread requests and DPB sizes derived from SPS
47409+// rather than just adding a worst case DPB size.
47410+unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
47411+
47412+typedef struct av_rpi_zc_buf_fn_tab_s {
47413+    // This AVBuffer is being freed by ffmpeg - return memory
47414+    // to external pool. Memory may be, but need not be, unmapped.
47415+    // v is the ptr passed in av_rpi_zc_buf
47416+    void (* free)(void * v);
47417+
47418+    // Return appropriate handles / mappings
47419+    // v is the ptr passed in av_rpi_zc_buf
47420+    unsigned int (* vcsm_handle)(void * v);
47421+    unsigned int (* vc_handle)(void * v);
47422+    void * (* map_arm)(void * v);
47423+    unsigned int (* map_vc)(void * v);
47424+} av_rpi_zc_buf_fn_tab_t;
47425+
47426+// Allocate a ZC AVBufferRef and set its callback table
47427+// Doesn't take a buffer address directly - relies on callbacks to return
47428+// addresses as they are required.  Mappings need not be generated until
47429+// the map callbacks are called but they should persist from then until
47430+// the buffer is freed.
47431+//
47432+// Parameters:
47433+//   numbytes    Size of the buffer
47434+//   addr_offset Offset to first usable byte of buffer (for alignment)
47435+//               normally 0
47436+//   v           Pointer passed to callbacks
47437+//   fn_tab      Function table
47438+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
47439+
47440+// Get v ptr set in av_rpi_zc_buf
47441+void * av_rpi_zc_buf_v(AVBufferRef * const buf);
47442+
47443+//----------------------------------------------------------------------------
47444+//
47445+// Mostly internal calls but might possibly be wanted by outside code
47446+
47447+void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
47448+AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
47449+void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
47450+
47451+// Test to see if the context is using zc (checks get_buffer2)
47452+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
47453+
47454+// Get buffer generates placeholders for later alloc
47455+int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
47456+// Resolve actually does the alloc (noop if already alloced)
47457+// Set data pointers on a buffer/frame that was copied before the alloc
47458+// occurred
47459+#define ZC_RESOLVE_FAIL         0  // return error on invalid
47460+#define ZC_RESOLVE_ALLOC        1  // alloc as invalid
47461+#define ZC_RESOLVE_WAIT_VALID   2  // wait for valid
47462+#define ZC_RESOLVE_ALLOC_VALID  3  // alloc as valid
47463+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
47464+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
47465+
47466+int av_rpi_zc_set_valid_frame(AVFrame * const frame);
47467+int av_rpi_zc_set_broken_frame(AVFrame * const frame);
47468+
47469+
47470+
47471+
47472+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
47473+                    void * pool_env,
47474+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
47475+                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
47476+void av_rpi_zc_env_release(const AVZcEnvPtr zc);
47477+
47478+
47479+#endif
47480+
47481--- /dev/null
47482+++ b/libavcodec/rpi_zc_frames.h
47483@@ -0,0 +1,142 @@
47484+/*
47485+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
47486+All rights reserved.
47487+
47488+Redistribution and use in source and binary forms, with or without
47489+modification, are permitted provided that the following conditions are met:
47490+    * Redistributions of source code must retain the above copyright
47491+      notice, this list of conditions and the following disclaimer.
47492+    * Redistributions in binary form must reproduce the above copyright
47493+      notice, this list of conditions and the following disclaimer in the
47494+      documentation and/or other materials provided with the distribution.
47495+    * Neither the name of the copyright holder nor the
47496+      names of its contributors may be used to endorse or promote products
47497+      derived from this software without specific prior written permission.
47498+
47499+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
47500+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
47501+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
47502+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
47503+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
47504+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
47505+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47506+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47507+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
47508+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47509+
47510+Authors: John Cox, Ben Avison
47511+*/
47512+
47513+#ifndef RPI_ZC_FRAMES_H
47514+#define RPI_ZC_FRAMES_H
47515+
47516+#define RPI_ONE_BUF 1
47517+
47518+#include "rpi_mem.h"  // for GPU_MEM_PTR_T
47519+#include "libavutil/frame.h"
47520+
47521+#if !RPI_ONE_BUF
47522+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
47523+    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]);
47524+    return p->vc;
47525+}
47526+
47527+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
47528+    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]);
47529+    return p->vc;
47530+}
47531+
47532+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
47533+    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]);
47534+    return p->vc;
47535+}
47536+
47537+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
47538+    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]);
47539+}
47540+
47541+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
47542+    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]);
47543+}
47544+
47545+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
47546+    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]);
47547+}
47548+
47549+#else
47550+
47551+static inline int gpu_is_buf1(const AVFrame * const frame)
47552+{
47553+    return frame->buf[1] == NULL;
47554+}
47555+
47556+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
47557+{
47558+    return av_buffer_get_opaque(frame->buf[0]);
47559+}
47560+
47561+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
47562+{
47563+    return av_buffer_pool_buffer_get_opaque(frame->buf[n]);
47564+}
47565+
47566+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
47567+{
47568+    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
47569+    return gm->vc + (frame->data[n] - gm->arm);
47570+}
47571+
47572+
47573+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
47574+    return get_vc_address3(frame, 0);
47575+}
47576+
47577+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
47578+    return get_vc_address3(frame, 1);
47579+}
47580+
47581+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
47582+    return get_vc_address3(frame, 2);
47583+}
47584+
47585+#if 0
47586+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
47587+    if (gpu_is_buf1(frame))
47588+    {
47589+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
47590+        g.numbytes = frame->data[1] - frame->data[0];
47591+        return g;
47592+    }
47593+    else
47594+        return *gpu_buf3_gmem(frame, 0);
47595+}
47596+
47597+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
47598+    if (gpu_is_buf1(frame))
47599+    {
47600+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
47601+        g.arm += frame->data[1] - frame->data[0];
47602+        g.vc += frame->data[1] - frame->data[0];
47603+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
47604+        return g;
47605+    }
47606+    else
47607+        return *gpu_buf3_gmem(frame, 1);
47608+}
47609+
47610+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
47611+    if (gpu_is_buf1(frame))
47612+    {
47613+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
47614+        g.arm += frame->data[2] - frame->data[0];
47615+        g.vc += frame->data[2] - frame->data[0];
47616+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
47617+        return g;
47618+    }
47619+    else
47620+        return *gpu_buf3_gmem(frame, 2);
47621+}
47622+#endif
47623+#endif
47624+
47625+#endif
47626--- /dev/null
47627+++ b/libavcodec/rpivid_hevc.c
47628@@ -0,0 +1,2128 @@
47629+// FFMPEG HEVC decoder hardware accelerator
47630+// Andrew Holme, Argon Design Ltd
47631+// Copyright (c) June 2017 Raspberry Pi Ltd
47632+
47633+#include <stdio.h>
47634+#include <fcntl.h>
47635+#include <pthread.h>
47636+#include <semaphore.h>
47637+#include <unistd.h>
47638+#include <sys/mman.h>
47639+
47640+#include "fftools/ffmpeg.h"
47641+#include "libavutil/avassert.h"
47642+#include "libavutil/imgutils.h"
47643+#include "avcodec.h"
47644+#include "hwconfig.h"
47645+#include "decode.h"
47646+
47647+#include "hevc.h"
47648+#include "hevcdec.h"
47649+#include "rpi_zc.h"
47650+#include "rpi_mem.h"
47651+#include "rpi_zc_frames.h"
47652+#include "rpi_mailbox.h"
47653+
47654+
47655+#define OPT_PHASE_TIMING 0      // Generate stats for phase usage
47656+
47657+#define OPT_EMU 0
47658+
47659+#define TRACE_DEV 0
47660+#define TRACE_ENTRY 0
47661+
47662+#define NUM_SCALING_FACTORS 4064
47663+
47664+#define AXI_BASE64 0
47665+
47666+#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
47667+#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
47668+
47669+#define RPIVID_COL_PICS 17                 // 16 ref & current
47670+
47671+#define RPIVID_BITBUFS          2          // Bit + Cmd bufs (phase 0 & 1)
47672+#define RPIVID_BITBUF_SIZE      (4 << 20)  // Bit + Cmd buf size
47673+
47674+#define RPIVID_COEFFBUFS        3          // PU + Coeff bufs (phase 1 & 2)
47675+#define RPIVID_COEFFBUF_SIZE    (16 << 20) // PU + Coeff buf size
47676+
47677+//////////////////////////////////////////////////////////////////////////////
47678+//
47679+// Register offsets
47680+
47681+#define RPI_SPS0         0
47682+#define RPI_SPS1         4
47683+#define RPI_PPS          8
47684+#define RPI_SLICE        12
47685+#define RPI_TILESTART    16
47686+#define RPI_TILEEND      20
47687+#define RPI_SLICESTART   24
47688+#define RPI_MODE         28
47689+#define RPI_LEFT0        32
47690+#define RPI_LEFT1        36
47691+#define RPI_LEFT2        40
47692+#define RPI_LEFT3        44
47693+#define RPI_QP           48
47694+#define RPI_CONTROL      52
47695+#define RPI_STATUS       56
47696+#define RPI_VERSION      60
47697+#define RPI_BFBASE       64
47698+#define RPI_BFNUM        68
47699+#define RPI_BFCONTROL    72
47700+#define RPI_BFSTATUS     76
47701+#define RPI_PUWBASE      80
47702+#define RPI_PUWSTRIDE    84
47703+#define RPI_COEFFWBASE   88
47704+#define RPI_COEFFWSTRIDE 92
47705+#define RPI_SLICECMDS    96
47706+#define RPI_BEGINTILEEND 100
47707+#define RPI_TRANSFER     104
47708+#define RPI_CFBASE       108
47709+#define RPI_CFNUM        112
47710+#define RPI_CFSTATUS     116
47711+
47712+#define RPI_PURBASE       0x8000
47713+#define RPI_PURSTRIDE     0x8004
47714+#define RPI_COEFFRBASE    0x8008
47715+#define RPI_COEFFRSTRIDE  0x800C
47716+#define RPI_NUMROWS       0x8010
47717+#define RPI_CONFIG2       0x8014
47718+#define RPI_OUTYBASE      0x8018
47719+#define RPI_OUTYSTRIDE    0x801C
47720+#define RPI_OUTCBASE      0x8020
47721+#define RPI_OUTCSTRIDE    0x8024
47722+#define RPI_STATUS2       0x8028
47723+#define RPI_FRAMESIZE     0x802C
47724+#define RPI_MVBASE        0x8030
47725+#define RPI_MVSTRIDE      0x8034
47726+#define RPI_COLBASE       0x8038
47727+#define RPI_COLSTRIDE     0x803C
47728+#define RPI_CURRPOC       0x8040
47729+
47730+//////////////////////////////////////////////////////////////////////////////
47731+
47732+// Unused but left here to illustrate the differences between FFmpeg's prob
47733+// structure and the rpivid one
47734+
47735+struct FFM_PROB {
47736+    uint8_t  sao_merge_flag                   [ 1];
47737+    uint8_t  sao_type_idx                     [ 1];
47738+    uint8_t  split_coding_unit_flag           [ 3];
47739+    uint8_t  cu_transquant_bypass_flag        [ 1];
47740+    uint8_t  skip_flag                        [ 3];
47741+    uint8_t  cu_qp_delta                      [ 3];
47742+    uint8_t  pred_mode_flag                   [ 1];
47743+    uint8_t  part_mode                        [ 4];
47744+    uint8_t  prev_intra_luma_pred_flag        [ 1];
47745+    uint8_t  intra_chroma_pred_mode           [ 2];
47746+    uint8_t  merge_flag                       [ 1];
47747+    uint8_t  merge_idx                        [ 1];
47748+    uint8_t  inter_pred_idc                   [ 5];
47749+    uint8_t  ref_idx_l0                       [ 2];
47750+    uint8_t  ref_idx_l1                       [ 2];
47751+    uint8_t  abs_mvd_greater0_flag            [ 2];
47752+    uint8_t  abs_mvd_greater1_flag            [ 2];
47753+    uint8_t  mvp_lx_flag                      [ 1];
47754+    uint8_t  no_residual_data_flag            [ 1];
47755+    uint8_t  split_transform_flag             [ 3];
47756+    uint8_t  cbf_luma                         [ 2];
47757+    uint8_t  cbf_cb_cr                        [ 4];
47758+    uint8_t  transform_skip_flag/*[][]*/      [ 2];
47759+    uint8_t  explicit_rdpcm_flag/*[][]*/      [ 2];
47760+    uint8_t  explicit_rdpcm_dir_flag/*[][]*/  [ 2];
47761+    uint8_t  last_significant_coeff_x_prefix  [18];
47762+    uint8_t  last_significant_coeff_y_prefix  [18];
47763+    uint8_t  significant_coeff_group_flag     [ 4];
47764+    uint8_t  significant_coeff_flag           [44];
47765+    uint8_t  coeff_abs_level_greater1_flag    [24];
47766+    uint8_t  coeff_abs_level_greater2_flag    [ 6];
47767+    uint8_t  log2_res_scale_abs               [ 8];
47768+    uint8_t  res_scale_sign_flag              [ 2];
47769+    uint8_t  cu_chroma_qp_offset_flag         [ 1];
47770+    uint8_t  cu_chroma_qp_offset_idx          [ 1];
47771+} __attribute__((packed));
47772+
47773+//////////////////////////////////////////////////////////////////////////////
47774+
47775+struct RPI_PROB {
47776+    uint8_t  SAO_MERGE_FLAG             [ 1];
47777+    uint8_t  SAO_TYPE_IDX               [ 1];
47778+    uint8_t  SPLIT_FLAG                 [ 3];
47779+    uint8_t  CU_SKIP_FLAG               [ 3];
47780+    uint8_t  CU_TRANSQUANT_BYPASS_FLAG  [ 1];
47781+    uint8_t  PRED_MODE                  [ 1];
47782+    uint8_t  PART_SIZE                  [ 4];
47783+    uint8_t  INTRA_PRED_MODE            [ 1];
47784+    uint8_t  CHROMA_PRED_MODE           [ 1];
47785+    uint8_t  MERGE_FLAG_EXT             [ 1];
47786+    uint8_t  MERGE_IDX_EXT              [ 1];
47787+    uint8_t  INTER_DIR                  [ 5];
47788+    uint8_t  REF_PIC                    [ 2];
47789+    uint8_t  MVP_IDX                    [ 1];
47790+    uint8_t  MVD                        [ 2];
47791+    uint8_t  QT_ROOT_CBF                [ 1];
47792+    uint8_t  TRANS_SUBDIV_FLAG          [ 3];
47793+    uint8_t  QT_CBF                     [ 6];
47794+    uint8_t  DQP                        [ 2];
47795+    uint8_t  ONE_FLAG                   [24];
47796+    uint8_t  LASTX                      [18];
47797+    uint8_t  LASTY                      [18];
47798+    uint8_t  SIG_CG_FLAG                [ 4];
47799+    uint8_t  ABS_FLAG                   [ 6];
47800+    uint8_t  TRANSFORMSKIP_FLAG         [ 2];
47801+    uint8_t  SIG_FLAG                   [42];
47802+    uint8_t  SIG_FLAG_unused            [ 2];
47803+} __attribute__((packed));
47804+
47805+//////////////////////////////////////////////////////////////////////////////
47806+
47807+struct RPI_CMD {
47808+    uint32_t addr;
47809+    uint32_t data;
47810+} __attribute__((packed));
47811+
47812+struct RPI_BIT {
47813+    int         cmd;
47814+    const void *ptr;
47815+    int         len;
47816+};
47817+
47818+//////////////////////////////////////////////////////////////////////////////
47819+
47820+struct RPI_T;
47821+
47822+// Actual addressability is 38bits but we can only alloc in the bottom 32
47823+// currently - when passed to rpivid h/w the address is always >> 6 so will
47824+// fit in 32 bit there
47825+// At some point we may want to make this uint64_t
47826+typedef uint32_t vid_vc_addr_t;
47827+
47828+typedef enum rpivid_decode_state_e {
47829+    RPIVID_DECODE_NEW = 0,
47830+    RPIVID_DECODE_START,
47831+    RPIVID_DECODE_SLICE,
47832+    RPIVID_DECODE_END,
47833+} rpivid_decode_state_t;
47834+
47835+#define RPI_PROB_VALS 154U
47836+#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
47837+
47838+typedef struct dec_env_s {
47839+    const AVCodecContext * avctx;
47840+
47841+    rpivid_decode_state_t state;
47842+    unsigned int    decode_order;
47843+
47844+    int             phase_no;           // Current phase (i.e. the last one we waited for)
47845+    struct dec_env_s * phase_wait_q_next;
47846+    sem_t           phase_wait;
47847+
47848+    struct RPI_BIT *bit_fifo;
47849+    struct RPI_CMD *cmd_fifo;
47850+    unsigned int    bit_len, bit_max;
47851+    unsigned int    cmd_len, cmd_max;
47852+    unsigned int    num_slice_msgs;
47853+    unsigned int    PicWidthInCtbsY;
47854+    unsigned int    PicHeightInCtbsY;
47855+    unsigned int    dpbno_col;
47856+    uint32_t        reg_slicestart;
47857+    unsigned int    wpp_entry_x;
47858+    unsigned int    wpp_entry_y;
47859+
47860+    const uint8_t * nal_buffer;
47861+    size_t          nal_size;
47862+
47863+    uint16_t        slice_msgs[2*HEVC_MAX_REFS*8+3];
47864+    uint8_t         scaling_factors[NUM_SCALING_FACTORS];
47865+//    unsigned int    RefPicList[2][HEVC_MAX_REFS];
47866+} dec_env_t;
47867+
47868+#define RPIVID_PHASES 3
47869+#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order
47870+#define RPIVID_PHASE_START (-1)          // Phase after we have inced decode_order
47871+
47872+#if OPT_PHASE_TIMING
47873+static const unsigned int time_thresholds[8] = {
47874+    10, 15, 20, 30, 45, 60, 75, 90
47875+};
47876+#endif
47877+
47878+typedef struct phase_wait_env_s {
47879+    unsigned int    last_order;
47880+    dec_env_t *     q;
47881+#if OPT_PHASE_TIMING
47882+    uint64_t phase_time;
47883+    uint64_t max_phase_time;
47884+    uint64_t time_in_phase;
47885+    uint64_t time_out_phase;
47886+    unsigned int max_time_decode_order;
47887+    unsigned int time_bins[9];
47888+    unsigned int time_bins3[9];
47889+    unsigned int time_bins5[9];
47890+    uint64_t time_stash[16];
47891+    unsigned int i3;
47892+#endif
47893+} phase_wait_env_t;                      // Single linked list of threads waiting for this phase
47894+
47895+typedef struct RPI_T {
47896+    atomic_int      ref_count;
47897+    sem_t           ref_zero;
47898+
47899+    dec_env_t **    dec_envs;
47900+    AVZcEnvPtr      zc;
47901+
47902+    pthread_mutex_t phase_lock;
47903+    phase_wait_env_t phase_reqs[RPIVID_PHASES];
47904+
47905+    volatile uint32_t * regs;
47906+    volatile uint32_t * ints;
47907+
47908+    GPU_MEM_PTR_T   gcolbuf;
47909+    unsigned int    col_stride;
47910+    size_t          col_picsize;
47911+
47912+    unsigned int    bitbuf_no;
47913+    sem_t           bitbuf_sem;
47914+    GPU_MEM_PTR_T   gbitbufs[RPIVID_BITBUFS];
47915+
47916+    unsigned int    max_pu_msgs;
47917+    unsigned int    coeffbuf_no;
47918+    sem_t           coeffbuf_sem;
47919+    GPU_MEM_PTR_T   gcoeffbufs[RPIVID_COEFFBUFS];
47920+
47921+    unsigned int    decode_order;
47922+    int             mbox_fd;
47923+    int             gpu_init_type;
47924+} RPI_T;
47925+
47926+#if OPT_PHASE_TIMING
47927+static uint64_t tus64(void)
47928+{
47929+    struct timespec ts;
47930+    clock_gettime(CLOCK_MONOTONIC, &ts);
47931+    return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
47932+}
47933+#endif
47934+
47935+static inline unsigned int rnd64(unsigned int x)
47936+{
47937+    return (x + 63) & ~63;
47938+}
47939+
47940+static inline int rpi_sem_wait(sem_t * const sem)
47941+{
47942+    int rv;
47943+    while ((rv = sem_wait(sem)) != 0 && errno == EINTR)
47944+        /* Loop */;
47945+    return rv;
47946+}
47947+
47948+//============================================================================
47949+
47950+#define REGS_NAME "/dev/rpivid-hevcmem"
47951+#define REGS_SIZE 0x10000
47952+#define INTS_NAME "/dev/rpivid-intcmem"
47953+#define INTS_SIZE 0x10000  // 4 is probably enough but we are going to alloc a page anyway
47954+
47955+static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
47956+{
47957+    void *gpio_map;
47958+    int  mem_fd;
47959+
47960+    /* open /dev/mem */
47961+    if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) {
47962+        av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
47963+        return NULL;
47964+    }
47965+
47966+    // Now map it
47967+    gpio_map = mmap(
47968+       NULL,
47969+       size,
47970+       PROT_READ|PROT_WRITE,
47971+       MAP_SHARED,
47972+       mem_fd,
47973+       0
47974+    );
47975+
47976+    close(mem_fd);  // No longer need the FD
47977+
47978+    if (gpio_map == MAP_FAILED) {
47979+        av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed");
47980+        return NULL;
47981+    }
47982+
47983+    return (volatile uint32_t *)gpio_map;
47984+}
47985+
47986+static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
47987+{
47988+    volatile uint32_t * const gpio_map = *p_gpio_map;
47989+    if (gpio_map != NULL) {
47990+        *p_gpio_map = NULL;
47991+        munmap((void *)gpio_map, size);
47992+    }
47993+}
47994+
47995+#define MANGLE(x) ((x) &~0xc0000000)          // ** If x is ever a 64 bit thing this will need fixing!
47996+#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
47997+
47998+static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
47999+{
48000+#if TRACE_DEV
48001+    printf("W %x %08x\n", addr, MANGLE64(data));
48002+#endif
48003+
48004+    rpi->regs[addr >> 2] = MANGLE64(data);
48005+}
48006+
48007+static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
48008+{
48009+#if TRACE_DEV
48010+    printf("W %x %08x\n", addr, data >> 6);
48011+#endif
48012+
48013+    rpi->regs[addr >> 2] = data >> 6;  // ?? rnd64 - but not currently needed
48014+}
48015+
48016+static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
48017+{
48018+#if TRACE_DEV
48019+    printf("W %x %08x\n", addr, data);
48020+#endif
48021+
48022+    rpi->regs[addr >> 2] = data;
48023+}
48024+
48025+static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
48026+{
48027+    const uint32_t v = rpi->regs[addr >> 2];
48028+#if TRACE_DEV
48029+    printf("R %x (=%x)\n", addr, v);
48030+#endif
48031+    return v;
48032+}
48033+
48034+#define ARG_IC_ICTRL_ACTIVE1_INT_SET                   0x00000001
48035+#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET                  0x00000002
48036+#define ARG_IC_ICTRL_ACTIVE1_EN_SET                    0x00000004
48037+#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET                0x00000008
48038+#define ARG_IC_ICTRL_ACTIVE2_INT_SET                   0x00000010
48039+#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET                  0x00000020
48040+#define ARG_IC_ICTRL_ACTIVE2_EN_SET                    0x00000040
48041+#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET                0x00000080
48042+
48043+static inline void int_wait(const RPI_T * const rpi, const unsigned int phase)
48044+{
48045+    const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET;
48046+    const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;
48047+    uint32_t ival;
48048+    while (((ival = rpi->ints[0]) & mask_done) == 0) {
48049+        usleep(1000);
48050+    }
48051+    rpi->ints[0] = ival & mask_reset;
48052+}
48053+
48054+#if TRACE_DEV && 0
48055+static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
48056+    int i;
48057+
48058+    for (i=0; i<num; i++)
48059+    {
48060+        if ((i%4)==0)
48061+          printf("%08x: ", 0x7eb00000 + addr + 4*i);
48062+
48063+        printf("%08x", rpi->regs[(addr>>2)+i]);
48064+
48065+        if ((i%4)==3 || i+1 == num)
48066+            printf("\n");
48067+        else
48068+            printf(" ");
48069+    }
48070+}
48071+
48072+static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
48073+    int i;
48074+
48075+    for (i=0; i<size>>2; i++)
48076+    {
48077+        if ((i%4)==0)
48078+            printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
48079+
48080+        printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
48081+
48082+        if ((i%4)==3 || i+1 == size>>2)
48083+            printf("\n");
48084+        else
48085+            printf(" ");
48086+    }
48087+}
48088+#endif
48089+
48090+//////////////////////////////////////////////////////////////////////////////
48091+
48092+static inline size_t round_up_size(const size_t x)
48093+{
48094+    /* Admit no size < 256 */
48095+    const unsigned int n = x < 256 ? 8 : av_log2(x) - 1;
48096+
48097+    return x >= (3 << n) ? 4 << n : (3 << n);
48098+}
48099+
48100+//////////////////////////////////////////////////////////////////////////////
48101+// Scaling factors
48102+
48103+static void expand_scaling_list(
48104+    const unsigned int sizeID,
48105+    const unsigned int matrixID,
48106+    uint8_t * const dst0,
48107+    const uint8_t * const src0,
48108+    uint8_t dc)
48109+{
48110+    switch (sizeID) {
48111+        case 0:
48112+            memcpy(dst0, src0, 16);
48113+            break;
48114+        case 1:
48115+            memcpy(dst0, src0, 64);
48116+            break;
48117+        case 2:
48118+        {
48119+            uint8_t * d = dst0;
48120+            for (unsigned int y=0; y != 16; y++) {
48121+                const uint8_t * s = src0 + (y >> 1) * 8;
48122+                for (unsigned int x = 0; x != 8; ++x) {
48123+                    *d++ = *s;
48124+                    *d++ = *s++;
48125+                }
48126+            }
48127+            dst0[0] = dc;
48128+            break;
48129+        }
48130+        default:
48131+        {
48132+            uint8_t * d = dst0;
48133+            for (unsigned int y=0; y != 32; y++) {
48134+                const uint8_t * s = src0 + (y >> 2) * 8;
48135+                for (unsigned int x = 0; x != 8; ++x) {
48136+                    *d++ = *s;
48137+                    *d++ = *s;
48138+                    *d++ = *s;
48139+                    *d++ = *s++;
48140+                }
48141+            }
48142+            dst0[0] = dc;
48143+            break;
48144+        }
48145+    }
48146+}
48147+
48148+static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
48149+    // Array of constants for scaling factors
48150+    static const uint32_t scaling_factor_offsets[4][6] = {
48151+        // MID0    MID1    MID2    MID3    MID4    MID5
48152+        {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050},   // SID0 (4x4)
48153+        {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0},   // SID1 (8x8)
48154+        {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0},   // SID2 (16x16)
48155+        {0x07E0,      0,      0, 0x0BE0,      0,      0}};  // SID3 (32x32)
48156+
48157+    // ffmpeg places SID3,MID1 where matrixID 3 normally is
48158+    const ScalingList * const sl =
48159+        s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list
48160+                                                  : &s->ps.sps->scaling_list;
48161+    unsigned int mid;
48162+
48163+    for (mid=0; mid<6; mid++)
48164+        expand_scaling_list(0, mid,
48165+            de->scaling_factors + scaling_factor_offsets[0][mid],
48166+            sl->sl[0][mid], 0);
48167+    for (mid=0; mid<6; mid++)
48168+        expand_scaling_list(1, mid,
48169+            de->scaling_factors + scaling_factor_offsets[1][mid],
48170+            sl->sl[1][mid], 0);
48171+    for (mid=0; mid<6; mid++)
48172+        expand_scaling_list(2, mid,
48173+            de->scaling_factors + scaling_factor_offsets[2][mid],
48174+            sl->sl[2][mid],
48175+            sl->sl_dc[0][mid]);
48176+    // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg
48177+    for (mid=0; mid<6; mid += 3)
48178+        expand_scaling_list(3, mid,
48179+            de->scaling_factors + scaling_factor_offsets[3][mid],
48180+            sl->sl[3][mid],
48181+            sl->sl_dc[1][mid]);
48182+}
48183+
48184+//////////////////////////////////////////////////////////////////////////////
48185+// Probabilities
48186+
48187+static const uint8_t prob_init[3][156] = {
48188+	{
48189+		 153, 200, 139, 141, 157, 154, 154, 154,
48190+		 154, 154, 184, 154, 154, 154, 184,  63,
48191+		 154, 154, 154, 154, 154, 154, 154, 154,
48192+		 154, 154, 154, 154, 154, 153, 138, 138,
48193+		 111, 141,  94, 138, 182, 154, 154, 154,
48194+		 140,  92, 137, 138, 140, 152, 138, 139,
48195+		 153,  74, 149,  92, 139, 107, 122, 152,
48196+		 140, 179, 166, 182, 140, 227, 122, 197,
48197+		 110, 110, 124, 125, 140, 153, 125, 127,
48198+		 140, 109, 111, 143, 127, 111,  79, 108,
48199+		 123,  63, 110, 110, 124, 125, 140, 153,
48200+		 125, 127, 140, 109, 111, 143, 127, 111,
48201+		  79, 108, 123,  63,  91, 171, 134, 141,
48202+		 138, 153, 136, 167, 152, 152, 139, 139,
48203+		 111, 111, 125, 110, 110,  94, 124, 108,
48204+		 124, 107, 125, 141, 179, 153, 125, 107,
48205+		 125, 141, 179, 153, 125, 107, 125, 141,
48206+		 179, 153, 125, 140, 139, 182, 182, 152,
48207+		 136, 152, 136, 153, 136, 139, 111, 136,
48208+		 139, 111,   0,   0,	},
48209+	{
48210+		 153, 185, 107, 139, 126, 197, 185, 201,
48211+		 154, 149, 154, 139, 154, 154, 154, 152,
48212+		 110, 122,  95,  79,  63,  31,  31, 153,
48213+		 153, 168, 140, 198,  79, 124, 138,  94,
48214+		 153, 111, 149, 107, 167, 154, 154, 154,
48215+		 154, 196, 196, 167, 154, 152, 167, 182,
48216+		 182, 134, 149, 136, 153, 121, 136, 137,
48217+		 169, 194, 166, 167, 154, 167, 137, 182,
48218+		 125, 110,  94, 110,  95,  79, 125, 111,
48219+		 110,  78, 110, 111, 111,  95,  94, 108,
48220+		 123, 108, 125, 110,  94, 110,  95,  79,
48221+		 125, 111, 110,  78, 110, 111, 111,  95,
48222+		  94, 108, 123, 108, 121, 140,  61, 154,
48223+		 107, 167,  91, 122, 107, 167, 139, 139,
48224+		 155, 154, 139, 153, 139, 123, 123,  63,
48225+		 153, 166, 183, 140, 136, 153, 154, 166,
48226+		 183, 140, 136, 153, 154, 166, 183, 140,
48227+		 136, 153, 154, 170, 153, 123, 123, 107,
48228+		 121, 107, 121, 167, 151, 183, 140, 151,
48229+		 183, 140,   0,   0,	},
48230+	{
48231+		 153, 160, 107, 139, 126, 197, 185, 201,
48232+		 154, 134, 154, 139, 154, 154, 183, 152,
48233+		 154, 137,  95,  79,  63,  31,  31, 153,
48234+		 153, 168, 169, 198,  79, 224, 167, 122,
48235+		 153, 111, 149,  92, 167, 154, 154, 154,
48236+		 154, 196, 167, 167, 154, 152, 167, 182,
48237+		 182, 134, 149, 136, 153, 121, 136, 122,
48238+		 169, 208, 166, 167, 154, 152, 167, 182,
48239+		 125, 110, 124, 110,  95,  94, 125, 111,
48240+		 111,  79, 125, 126, 111, 111,  79, 108,
48241+		 123,  93, 125, 110, 124, 110,  95,  94,
48242+		 125, 111, 111,  79, 125, 126, 111, 111,
48243+		  79, 108, 123,  93, 121, 140,  61, 154,
48244+		 107, 167,  91, 107, 107, 167, 139, 139,
48245+		 170, 154, 139, 153, 139, 123, 123,  63,
48246+		 124, 166, 183, 140, 136, 153, 154, 166,
48247+		 183, 140, 136, 153, 154, 166, 183, 140,
48248+		 136, 153, 154, 170, 153, 138, 138, 122,
48249+		 121, 122, 121, 167, 151, 183, 140, 151,
48250+		 183, 140,   0,   0,	},
48251+};
48252+
48253+
48254+//////////////////////////////////////////////////////////////////////////////
48255+// Phase 1 command and bit FIFOs
48256+
48257+// ???? uint16_t addr - put in uint32_t
48258+static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
48259+    if (de->cmd_len==de->cmd_max)
48260+        av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
48261+
48262+#if TRACE_DEV
48263+    printf("[%02x] %x %x\n", de->cmd_len, addr, data);
48264+#endif
48265+
48266+    de->cmd_fifo[de->cmd_len].addr = addr;
48267+    de->cmd_fifo[de->cmd_len].data = data;
48268+    return de->cmd_len++;
48269+}
48270+
48271+static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
48272+    if (de->bit_len==de->bit_max)
48273+        av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
48274+    de->bit_fifo[de->bit_len].cmd = cmd_idx;
48275+    de->bit_fifo[de->bit_len].ptr = ptr;
48276+    de->bit_fifo[de->bit_len].len = len;
48277+    de->bit_len++;
48278+}
48279+
48280+//////////////////////////////////////////////////////////////////////////////
48281+// Write probability and scaling factor memories
48282+
48283+#if 0
48284+static void WriteProb(dec_env_t * const de) {
48285+    int i;
48286+    const uint8_t *p = (uint8_t *) &de->probabilities;
48287+    for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
48288+        p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
48289+}
48290+#endif
48291+
48292+static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
48293+    uint8_t dst[RPI_PROB_ARRAY_SIZE];
48294+
48295+    const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
48296+        s->sh.slice_type + 1 : 2 - s->sh.slice_type;
48297+    const uint8_t * p = prob_init[init_type];
48298+    const int q = av_clip(s->sh.slice_qp, 0, 51);
48299+    unsigned int i;
48300+
48301+    for (i = 0; i < RPI_PROB_VALS; i++) {
48302+        int init_value = p[i];
48303+        int m = (init_value >> 4) * 5 - 45;
48304+        int n = ((init_value & 15) << 3) - 16;
48305+        int pre = 2 * (((m * q) >> 4) + n) - 127;
48306+
48307+        pre ^= pre >> 31;
48308+        if (pre > 124)
48309+            pre = 124 + (pre & 1);
48310+        dst[i] = pre;
48311+    }
48312+    for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
48313+        dst[i] = 0;
48314+    }
48315+
48316+    for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
48317+        p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
48318+
48319+}
48320+
48321+
48322+static void WriteScalingFactors(dec_env_t * const de) {
48323+    int i;
48324+    const uint8_t *p = (uint8_t *) de->scaling_factors;
48325+    for (i=0; i<NUM_SCALING_FACTORS; i+=4, p+=4)
48326+        p1_apb_write(de, 0x2000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
48327+}
48328+
48329+//////////////////////////////////////////////////////////////////////////////
48330+
48331+static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
48332+    int i;
48333+    for (i=1; ctb >= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c
48334+    return i-1;
48335+}
48336+
48337+static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
48338+    if (ctb < bd[num-1]) return ctb_size;
48339+    else if (width % ctb_size) return width % ctb_size;
48340+    else return ctb_size;
48341+}
48342+
48343+//////////////////////////////////////////////////////////////////////////////
48344+// Handle PU and COEFF stream overflow
48345+
48346+
48347+// Returns:
48348+// -2 Other error
48349+// -1 Out of coeff space
48350+//  0  OK
48351+//  1  Out of PU space
48352+
48353+static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
48354+    uint32_t status;
48355+
48356+    // this is the definition of successful completion of phase 1
48357+    // it assures that status register is zero and all blocks in each tile have completed
48358+    if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM))
48359+        return 0;
48360+
48361+    status = apb_read(rpi, RPI_STATUS);
48362+
48363+    if ((status & 8) != 0)
48364+        return -1;
48365+
48366+    if ((status & 0x10) != 0)
48367+        return 1;
48368+
48369+    return -2;
48370+}
48371+
48372+//////////////////////////////////////////////////////////////////////////////
48373+// Write STATUS register with expected end CTU address of previous slice
48374+
48375+static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
48376+    const HEVCPPS * const pps = s->ps.pps;
48377+    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
48378+    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
48379+    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
48380+}
48381+
48382+static void wpp_pause(dec_env_t * const de, int ctb_row) {
48383+    p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
48384+    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
48385+    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000);
48386+    p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
48387+}
48388+
48389+static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
48390+    const HEVCPPS *pps = s->ps.pps;
48391+    int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
48392+    int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
48393+    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
48394+    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
48395+    if (de->wpp_entry_x<2 && (de->wpp_entry_y<new_y || new_x>2) && de->PicWidthInCtbsY>2)
48396+        wpp_pause(de, last_y);
48397+    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
48398+    if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_y<new_y)
48399+        p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
48400+}
48401+
48402+//////////////////////////////////////////////////////////////////////////////
48403+
48404+static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s) // program SPS/PPS-derived registers at the start of a new slice segment
48405+{
48406+    const HEVCSPS *sps = s->ps.sps;
48407+    const HEVCPPS *pps = s->ps.pps;
48408+
48409+    p1_apb_write(de, RPI_SPS0,
48410+        (sps->log2_min_cb_size                    <<  0) +
48411+        (sps->log2_ctb_size                       <<  4) +
48412+        (sps->log2_min_tb_size                    <<  8) +
48413+        (sps->log2_max_trafo_size                 << 12) +
48414+        (sps->bit_depth                           << 16) + // BitDepthY (cf. RPI_CONFIG2 packing elsewhere in this file)
48415+        (sps->bit_depth                           << 20) + // BitDepthC - same SPS field used for both
48416+        (sps->max_transform_hierarchy_depth_intra << 24) +
48417+        (sps->max_transform_hierarchy_depth_inter << 28));
48418+
48419+    p1_apb_write(de, RPI_SPS1,
48420+        (sps->pcm.bit_depth                                        <<  0) +
48421+        (sps->pcm.bit_depth_chroma                                 <<  4) +
48422+        (sps->pcm.log2_min_pcm_cb_size                             <<  8) +
48423+        (sps->pcm.log2_max_pcm_cb_size                             << 12) +
48424+        (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) + // << binds tighter than ?: so this is flag ? 0 : (idc<<16)
48425+        (sps->amp_enabled_flag                                     << 18) +
48426+        (sps->pcm_enabled_flag                                     << 19) +
48427+        (sps->scaling_list_enable_flag                             << 20) +
48428+        (sps->sps_strong_intra_smoothing_enable_flag               << 21));
48429+
48430+    p1_apb_write(de, RPI_PPS,
48431+        (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth   <<  0) + // '-' binds tighter than '<<': (a - b) << 0
48432+        (pps->cu_qp_delta_enabled_flag                      <<  4) +
48433+        (pps->transquant_bypass_enable_flag                 <<  5) +
48434+        (pps->transform_skip_enabled_flag                   <<  6) +
48435+        (pps->sign_data_hiding_flag                         <<  7) +
48436+      (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) <<  8) +
48437+      (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
48438+        (pps->constrained_intra_pred_flag                   << 24));
48439+
48440+    if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
48441+
48442+    if (!s->sh.dependent_slice_segment_flag) { // dependent slices reuse the previous segment's start position
48443+        int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
48444+        int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
48445+        de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
48446+    }
48447+
48448+    p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
48449+}
48450+
48451+//////////////////////////////////////////////////////////////////////////////
48452+
48453+static void write_slice(dec_env_t * const de, const HEVCContext * const s,
48454+                        const unsigned int slice_w, const unsigned int slice_h) { // pack slice type/SAO flags/geometry into the RPI_SLICE register
48455+    uint32_t u32 =
48456+          (s->sh.slice_type                           << 12)
48457+        + (s->sh.slice_sample_adaptive_offset_flag[0] << 14) // SAO luma
48458+        + (s->sh.slice_sample_adaptive_offset_flag[1] << 15) // SAO chroma
48459+        + (slice_w                                    << 17)
48460+        + (slice_h                                    << 24);
48461+
48462+    if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |= // inter slices also carry merge-cand and ref counts
48463+          (s->sh.max_num_merge_cand << 0)
48464+        + (s->sh.nb_refs[L0]        << 4)
48465+        + (s->sh.nb_refs[L1]        << 8);
48466+
48467+    if (s->sh.slice_type==HEVC_SLICE_B)
48468+        u32 |= s->sh.mvd_l1_zero_flag<<16;
48469+    p1_apb_write(de, RPI_SLICE, u32);
48470+}
48471+
48472+//////////////////////////////////////////////////////////////////////////////
48473+// Wavefront mode
48474+
48475+static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
48476+                            const int do_bte, const int resetQPY, const int ctb_addr_ts) { // program one entry point in wavefront (WPP) mode
48477+    const HEVCSPS * const sps = s->ps.sps;
48478+    const HEVCPPS * const pps = s->ps.pps;
48479+
48480+    int ctb_size = 1<<sps->log2_ctb_size;
48481+    int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
48482+
48483+    int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
48484+    int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
48485+
48486+    int endx = de->PicWidthInCtbsY-1; // WPP rows span the full picture width
48487+    int endy = ctb_row;
48488+
48489+    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width,  pps->col_bd, pps->num_tile_columns); // NOTE(review): uint8_t may truncate if the helper can return >255 - confirm range
48490+    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
48491+
48492+    p1_apb_write(de, RPI_TILESTART, 0);
48493+    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
48494+
48495+    if (do_bte)
48496+        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
48497+
48498+    write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size); // last CTB row may be shorter than a full CTB
48499+
48500+    if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
48501+
48502+    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);
48503+    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
48504+}
48505+
48506+//////////////////////////////////////////////////////////////////////////////
48507+// Tiles mode
48508+
48509+static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
48510+                            const int do_bte, const int resetQPY, const int ctb_addr_ts) { // program one entry point in tiles mode
48511+    const HEVCSPS * const sps = s->ps.sps;
48512+    const HEVCPPS * const pps = s->ps.pps;
48513+
48514+    int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
48515+    int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
48516+
48517+    int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
48518+    int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
48519+
48520+    int endx = pps->col_bd[tile_x+1] - 1; // last CTB column of this tile
48521+    int endy = pps->row_bd[tile_y+1] - 1; // last CTB row of this tile
48522+
48523+    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width,  pps->col_bd, pps->num_tile_columns); // NOTE(review): uint8_t may truncate - confirm range
48524+    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
48525+
48526+    p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
48527+    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
48528+
48529+    if (do_bte)
48530+        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
48531+
48532+    write_slice(de, s, slice_w, slice_h);
48533+
48534+    if (resetQPY)
48535+        p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
48536+
48537+    p1_apb_write(de, RPI_MODE, (0xFFFF                            <<  0)
48538+                              + (0x0                               << 16)
48539+                              + ((tile_x==pps->num_tile_columns-1) << 17) // last tile column flag
48540+                              + ((tile_y==pps->num_tile_rows-1)    << 18)); // last tile row flag
48541+
48542+    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
48543+}
48544+
48545+//////////////////////////////////////////////////////////////////////////////
48546+
48547+// Doesn't attempt to remove from context as we should only do this at the end
48548+// of time or on create error
48549+static void
48550+dec_env_delete(dec_env_t * const de)
48551+{
48552+//    gpu_free(&de->gbuf);
48553+
48554+    av_freep(&de->cmd_fifo); // av_freep must pair with the allocator used in dec_env_new()
48555+    av_freep(&de->bit_fifo);
48556+
48557+    sem_destroy(&de->phase_wait);
48558+    av_free(de);
48559+}
48560+
48561+static dec_env_t *
48562+dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
48563+{
48564+    dec_env_t * const de = av_mallocz(sizeof(*de));
48565+    int i;
48566+
48567+    if (de == NULL)
48568+        return NULL;
48569+
48570+    de->avctx = avctx;
48571+    de->phase_no = RPIVID_PHASE_NEW;
48572+
48573+    sem_init(&de->phase_wait, 0, 0);
48574+
48575+    if ((de->cmd_fifo = av_malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL) // av_malloc (not malloc): freed with av_freep() in dec_env_delete()
48576+        goto fail;
48577+
48578+    if ((de->bit_fifo = av_malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL) // ditto - FFmpeg requires matching av_* allocator/free pairs
48579+        goto fail;
48580+
48581+    pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
48582+    for (i = 0; i != avctx->thread_count; ++i) {
48583+        if (rpi->dec_envs[i] == NULL)
48584+        {
48585+            rpi->dec_envs[i] = de;
48586+            break;
48587+        }
48588+    }
48589+    pthread_mutex_unlock(&rpi->phase_lock);
48590+
48591+    if (i == avctx->thread_count) {
48592+        av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
48593+        goto fail;
48594+    }
48595+
48596+    return de;
48597+
48598+fail:
48599+    dec_env_delete(de);
48600+    return NULL;
48601+}
48602+
48603+
48604+static dec_env_t *
48605+dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
48606+{
48607+    dec_env_t * de = NULL;
48608+    const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
48609+
48610+    if (ref_count <= 0) {
48611+        // Already dead
48612+        av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n"); // (stray ';;' removed)
48613+        return NULL;
48614+    }
48615+
48616+    for (int i = 0; i != avctx->thread_count; ++i) {
48617+        if (rpi->dec_envs[i] == NULL)
48618+        {
48619+            de = dec_env_new(avctx, rpi); // first free slot: create a context for this thread
48620+            break;
48621+        }
48622+        if (rpi->dec_envs[i]->avctx == avctx)
48623+        {
48624+            de = rpi->dec_envs[i]; // existing context for this avctx
48625+            break;
48626+        }
48627+    }
48628+    return de; // NOTE(review): ref_count stays incremented when returning NULL and callers skip dec_env_release() - confirm teardown still reaches ref_zero
48629+}
48630+
48631+// Call at end of fn
48632+// Used to ensure we aren't in a worker thead when killed
48633+static void
48634+dec_env_release(RPI_T * const rpi, dec_env_t * const de)
48635+{
48636+    const int n = atomic_fetch_sub(&rpi->ref_count, 1);
48637+    if (n == 1) {
48638+        sem_post(&rpi->ref_zero); // we dropped the last reference - wake whoever is waiting for all work to drain
48639+    }
48640+}
48641+
48642+//----------------------------------------------------------------------------
48643+
48644+// Wait for a slot in the given phase
48645+// Any error return is probably fatal
48646+static int
48647+wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
48648+{
48649+    int needs_wait = 0;
48650+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
48651+
48652+    pthread_mutex_lock(&rpi->phase_lock);
48653+    if (p->last_order + 1 != de->decode_order) { // not our turn yet - push onto the (unordered) wait queue
48654+        de->phase_wait_q_next = p->q;
48655+        p->q = de;
48656+        needs_wait = 1;
48657+    }
48658+    pthread_mutex_unlock(&rpi->phase_lock);
48659+
48659+
48660+    if (needs_wait) {
48661+        while (sem_wait(&de->phase_wait) == -1) // retry on EINTR; any other errno is fatal
48662+        {
48663+            int err;
48664+            if ((err = errno) != EINTR)
48665+                return AVERROR(err);
48666+        }
48667+    }
48668+
48669+    de->phase_no = phase_no;
48670+    return 0;
48671+}
48672+
48673+static void
48674+post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
48675+{
48676+    dec_env_t * next_de = NULL;
48677+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
48678+    dec_env_t ** q = &p->q;
48679+
48680+    pthread_mutex_lock(&rpi->phase_lock);
48681+
48682+    p->last_order = de->decode_order;
48683+    while (*q != NULL) { // scan the wait queue for the frame next in decode order
48684+        dec_env_t * const t_de = *q;
48685+
48686+        if (t_de->decode_order == p->last_order + 1) {
48687+            // This is us - remove from Q
48688+            *q = t_de->phase_wait_q_next;
48689+            t_de->phase_wait_q_next = NULL; // Tidy
48690+            next_de = t_de;
48691+            break;
48692+        }
48693+        q = &t_de->phase_wait_q_next;
48694+    }
48695+
48696+    pthread_mutex_unlock(&rpi->phase_lock);
48697+
48698+    if (next_de != NULL)
48699+        sem_post(&next_de->phase_wait); // wake the successor outside the lock
48700+}
48701+
48702+// Wait & signal stuff s.t. threads in other phases can continue
48703+static void
48704+abort_phases(RPI_T * const rpi, dec_env_t * const de)
48705+{
48706+    for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) { // pass through each remaining phase so later frames aren't blocked behind us
48707+        wait_phase(rpi, de, i);
48708+        post_phase(rpi, de, i);
48709+    }
48710+    de->phase_no = RPIVID_PHASE_NEW;
48711+}
48712+
48713+// Start timing for phase
48714+// Stats only - no actual effect
48715+static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
48716+{
48717+#if OPT_PHASE_TIMING
48718+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
48719+    const int64_t now = tus64();
48720+    if (p->phase_time != 0)
48721+        p->time_out_phase += now - p->phase_time; // accumulate time spent outside this phase since we last left it
48722+    p->phase_time = now;
48723+#endif
48724+}
48725+
48726+#if OPT_PHASE_TIMING
48727+static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
48728+{
48729+    uint64_t tsum = 0;
48730+    unsigned int i;
48731+    for (i = 0; i != avg_n; ++i)
48732+        tsum += p->time_stash[(p->i3 - i) & 15]; // sum the last avg_n samples from the 16-entry ring
48733+    for (i = 0; i != 9; ++i) {
48734+        if (time_thresholds[i] * 1000 * avg_n > tsum)
48735+            break;
48736+    }
48737+    return i; // histogram bin index (0..9); 9 means above every threshold
48738+}
48739+#endif
48740+
48741+// End timing for phase
48742+// Stats only - no actual effect
48743+static inline void tend_phase(RPI_T * const rpi, const int phase_no)
48744+{
48745+#if OPT_PHASE_TIMING
48746+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
48747+    const uint64_t now = tus64();
48748+    const uint64_t in_time = now - p->phase_time;
48749+
48750+    p->time_in_phase += in_time;
48751+    p->phase_time = now;
48752+    p->time_stash[p->i3] = in_time; // 16-entry ring consumed by tavg_bin_phase()
48753+    if (in_time > p->max_phase_time) {
48754+        p->max_phase_time = in_time;
48755+        p->max_time_decode_order = p->last_order;
48756+    }
48757+    ++p->time_bins[tavg_bin_phase(p, 1)]; // histograms over 1-, 3- and 5-sample averages
48758+    ++p->time_bins3[tavg_bin_phase(p, 3)];
48759+    ++p->time_bins5[tavg_bin_phase(p, 5)];
48760+
48761+    p->i3 = (p->i3 + 1) & 15;
48762+#endif
48763+}
48764+
48765+//////////////////////////////////////////////////////////////////////////////
48766+// Start frame
48767+
48768+static int rpi_hevc_start_frame(
48769+    AVCodecContext * avctx,
48770+    const uint8_t *buffer,
48771+    uint32_t size) {
48772+
48773+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
48774+    dec_env_t * const de = dec_env_get(avctx, rpi);
48775+    const HEVCContext * const s = avctx->priv_data;
48776+    const HEVCSPS * const sps = s->ps.sps;
48777+    const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
48778+
48779+#if TRACE_ENTRY
48780+    printf("<<< %s[%p]\n", __func__, de);
48781+#endif
48782+
48783+    if (de == NULL) {
48784+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
48785+        return -1;
48786+    }
48787+
48788+    de->phase_no = RPIVID_PHASE_START;
48789+    de->decode_order = ++rpi->decode_order;  // *** atomic?
48790+
48791+    ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
48792+
48793+    if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) { // only NEW/END may transition to START
48794+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state);
48795+        return -1; // NOTE(review): returns without dec_env_release() - possible ref_count leak, confirm
48796+    }
48797+    de->state = RPIVID_DECODE_START;
48798+
48799+    de->PicWidthInCtbsY  = (sps->width + CtbSizeY - 1) / CtbSizeY;  //7-15
48800+    de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY;  //7-17
48801+    de->bit_len = 0;
48802+    de->cmd_len = 0;
48803+
48804+#if TRACE_ENTRY
48805+    printf(">>> %s[%p]\n", __func__, de);
48806+#endif
48807+
48808+    dec_env_release(rpi, de);
48809+    return 0;
48810+}
48811+
48812+//////////////////////////////////////////////////////////////////////////////
48813+// Slice messages
48814+
48815+static void msg_slice(dec_env_t * const de, const uint16_t msg) { // append one 16-bit word to the slice message list; NOTE(review): no bounds check - confirm slice_msgs capacity
48816+    de->slice_msgs[de->num_slice_msgs++] = msg;
48817+}
48818+
48819+static void program_slicecmds(dec_env_t * const de, const int sliceid) { // flush the queued slice messages to the accelerator's command area
48820+    int i;
48821+    p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
48822+    for(i=0; i < de->num_slice_msgs; i++) {
48823+        p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff);
48824+    }
48825+}
48826+
48827+static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { // build the per-slice message list consumed by program_slicecmds()
48828+    const HEVCSPS * const sps = s->ps.sps;
48829+    const HEVCPPS * const pps = s->ps.pps;
48830+    const SliceHeader *sh = &s->sh;
48831+
48832+    int weightedPredFlag, i, rIdx;
48833+    uint16_t cmd_slice;
48834+    unsigned int collocated_from_l0_flag;
48835+
48836+    de->num_slice_msgs=0;
48837+    de->dpbno_col = 0;
48838+    cmd_slice = 0;
48839+    if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
48840+    if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
48841+    if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
48842+
48843+    if (sh->slice_type!=HEVC_SLICE_I) {
48844+        cmd_slice += sh->nb_refs[L0]<<2;
48845+        cmd_slice += sh->nb_refs[L1]<<6;
48846+    }
48847+
48848+    if (sh->slice_type==HEVC_SLICE_P ||  sh->slice_type==HEVC_SLICE_B)
48849+        cmd_slice |= sh->max_num_merge_cand<<11;
48850+
48851+    collocated_from_l0_flag = // 0 without temporal MVP; for B slices taken from collocated_list, for P always 1
48852+        !sh->slice_temporal_mvp_enabled_flag ?
48853+            0 :
48854+        sh->slice_type == HEVC_SLICE_B ?
48855+            (sh->collocated_list == L0) :
48856+            (sh->slice_type==HEVC_SLICE_P);
48857+    cmd_slice |= collocated_from_l0_flag<<14;
48858+
48859+    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
48860+
48861+        int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
48862+        for(i=L0; i<=L1; i++) {
48863+            for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
48864+                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
48865+                HEVCFrame *c = s->ref; // CurrentPicture
48866+                if (c->poc < f->poc) NoBackwardPredFlag = 0;
48867+            }
48868+        }
48869+
48870+        if (sps->sps_temporal_mvp_enabled_flag)
48871+        {
48872+            const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
48873+                s->ref->refPicList + 0 :
48874+                s->ref->refPicList + 1;
48875+            de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; // DPB index of the collocated picture
48876+        }
48877+
48878+        cmd_slice += NoBackwardPredFlag<<10;
48879+        msg_slice(de, cmd_slice);
48880+
48881+        // Write reference picture descriptions
48882+        weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag;
48883+
48884+        for(i=L0; i<=L1; i++)
48885+            for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
48886+                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
48887+                HEVCFrame *c = s->ref; // CurrentPicture
48888+                int pic = f - s->DPB;
48889+                // Make sure pictures are in range 0 to 15
48890+                int adjusted_pic = f<c? pic : pic-1; // compensate for the current picture's own DPB slot
48891+                int lt = s->ref->refPicList[i].isLongTerm[rIdx];
48892+                msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6));
48893+                msg_slice(de, f->poc);
48894+                if (weightedPredFlag) { // weight/offset pairs: luma, then both chroma planes
48895+                    msg_slice(de,   s->sh.luma_log2_weight_denom+(((i?s->  sh.luma_weight_l1:  s->sh.luma_weight_l0)[rIdx]   &0x1ff)<<3));
48896+                    msg_slice(de,                                  (i?s->  sh.luma_offset_l1:  s->sh.luma_offset_l0)[rIdx]   & 0xff);
48897+                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
48898+                    msg_slice(de,                                  (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
48899+                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
48900+                    msg_slice(de,                                  (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
48901+                }
48902+            }
48903+    }
48904+    else
48905+        msg_slice(de, cmd_slice);
48906+
48907+    msg_slice(de, ((sh->beta_offset/2)&15)
48908+        + (((sh->tc_offset/2)&15)                           <<  4)
48909+        + (sh->disable_deblocking_filter_flag               <<  8)
48910+        + (sh->slice_loop_filter_across_slices_enabled_flag <<  9)
48911+        + (pps->loop_filter_across_tiles_enabled_flag       << 10)); // CMD_DEBLOCK
48912+
48913+    msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
48914+}
48915+
48916+
48917+//////////////////////////////////////////////////////////////////////////////
48918+
48919+static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { // abandon the current frame, releasing any phases this context holds
48920+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
48921+    dec_env_t * const de = dec_env_get(avctx,  rpi);
48922+
48923+#if TRACE_ENTRY
48924+    printf("<<< %s[%p]\n", __func__, de);
48925+#endif
48926+
48927+    if (de == NULL) {
48928+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
48929+        return;
48930+    }
48931+
48932+    switch (de->state) {
48933+        case RPIVID_DECODE_NEW:
48934+        case RPIVID_DECODE_END:
48935+            // Expected transition
48936+            break;
48937+
48938+        case RPIVID_DECODE_SLICE:
48939+            // Error transition
48940+            av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
48941+            break;
48942+
48943+        case RPIVID_DECODE_START:
48944+        default:
48945+            av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state);
48946+            break;
48947+    }
48948+
48949+    abort_phases(rpi, de); // pass through remaining phases so later frames aren't blocked
48950+    de->state = RPIVID_DECODE_NEW;
48951+
48952+    dec_env_release(rpi, de);
48953+}
48954+
48955+//////////////////////////////////////////////////////////////////////////////
48956+// End frame
48957+
48958+static int rpi_hevc_end_frame(AVCodecContext * const avctx) {
48959+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
48960+    const HEVCContext * const s = avctx->priv_data;
48961+    const HEVCPPS * const pps = s->ps.pps;
48962+    const HEVCSPS * const sps = s->ps.sps;
48963+    dec_env_t * const de = dec_env_get(avctx,  rpi);
48964+    AVFrame * const f = s->ref->frame;
48965+    const unsigned int dpbno_cur = s->ref - s->DPB;
48966+    vid_vc_addr_t cmds_vc;
48967+    vid_vc_addr_t pu_base_vc;
48968+    unsigned int pu_stride;
48969+    vid_vc_addr_t coeff_base_vc;
48970+    unsigned int coeff_stride;
48971+    unsigned int i;
48972+    int rv = 0;
48973+    int status = 0;
48974+    int coeffbuf_sem_claimed = 0;
48975+
48976+#if TRACE_ENTRY
48977+    fprintf("<<< %s[%p]\n", __func__, de);
48978+#endif
48979+
48980+    if (de == NULL) {
48981+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
48982+        return AVERROR_BUG;  // Should never happen
48983+    }
48984+
48985+    if (de->state != RPIVID_DECODE_SLICE) {
48986+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
48987+        rv = AVERROR_UNKNOWN;
48988+        goto fail;
48989+    }
48990+    de->state = RPIVID_DECODE_END;
48991+
48992+    // End of command compilation
48993+    {
48994+        const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1;
48995+        const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1;
48996+        if (pps->entropy_coding_sync_enabled_flag) {
48997+            if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2)
48998+                wpp_pause(de, last_y);
48999+        }
49000+        p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
49001+    }
49002+
49003+    // Phase 0 ---------------------------------------------------------------
49004+
49005+    wait_phase(rpi, de, 0);
49006+    rpi_sem_wait(&rpi->bitbuf_sem);
49007+    tstart_phase(rpi, 0);
49008+
49009+    // Copy cmds & bits into gpu side buffer
49010+    // Layout: CMDS, BITS
49011+    {
49012+        uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm;
49013+        vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc;
49014+        unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD);
49015+
49016+        uint8_t * p = armbase + rnd64(cmd_bytes);
49017+        uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes;
49018+
49019+        cmds_vc = vcbase;
49020+
49021+        // Copy all the bits & update bitstream cmds to point at the right bits
49022+        for (i = 0; i < de->bit_len; ++i)
49023+        {
49024+            const unsigned int seg_len = de->bit_fifo[i].len;
49025+
49026+            if (p + seg_len > eobits) {
49027+                status = -1;
49028+                break;
49029+            }
49030+
49031+            memcpy(p, de->bit_fifo[i].ptr, seg_len);
49032+            de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase);
49033+
49034+            p += rnd64(seg_len);
49035+        }
49036+
49037+        memcpy(armbase, de->cmd_fifo, cmd_bytes);
49038+    }
49039+
49040+    if (status == 0)
49041+    {
49042+        if (++rpi->bitbuf_no >= RPIVID_BITBUFS)
49043+            rpi->bitbuf_no = 0;
49044+    }
49045+    else
49046+    {
49047+        sem_post(&rpi->bitbuf_sem);
49048+        av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n");
49049+        rv = AVERROR_BUFFER_TOO_SMALL;
49050+    }
49051+
49052+    tend_phase(rpi, 0);
49053+    post_phase(rpi, de, 0);
49054+
49055+    if (status < 0)
49056+        goto fail;
49057+
49058+    // Phase 1 ---------------------------------------------------------------
49059+
49060+    wait_phase(rpi, de, 1);
49061+    rpi_sem_wait(&rpi->coeffbuf_sem);
49062+    coeffbuf_sem_claimed = 1;
49063+    tstart_phase(rpi, 1);
49064+
49065+    status = 0;
49066+    for (;;)
49067+    {
49068+        // (Re-)allocate PU/COEFF stream space
49069+        const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes;
49070+        unsigned int pu_size;
49071+
49072+        pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc;
49073+        pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY);
49074+        pu_size = pu_stride * de->PicHeightInCtbsY;
49075+
49076+        if (pu_size >= total_size || status == -1) {
49077+            GPU_MEM_PTR_T newbuf;
49078+
49079+            if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0)
49080+            {
49081+                av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n");
49082+                status = -1;
49083+                break;
49084+            }
49085+            gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no);
49086+            rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf;
49087+            status = 0;
49088+            continue;
49089+        }
49090+
49091+        // Allocate all remaining space to coeff
49092+        coeff_base_vc = pu_base_vc + pu_size;
49093+        coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63;  // Round down to multiple of 64
49094+
49095+        apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc);
49096+        apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride);
49097+        apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc);
49098+        apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride);
49099+
49100+        // Trigger command FIFO
49101+        apb_write(rpi, RPI_CFNUM, de->cmd_len);
49102+#if TRACE_DEV && 0
49103+        apb_dump_regs(rpi, 0x0, 32);
49104+        apb_dump_regs(rpi, 0x8000, 24);
49105+        axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD));
49106+#endif
49107+        apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc);
49108+
49109+        int_wait(rpi, 1);
49110+
49111+        status = check_status(rpi, de);
49112+
49113+        if (status == -1)
49114+            continue;
49115+        else if (status != 1)
49116+            break;
49117+
49118+        // Status 1 means out of PU space so try again with more
49119+        // If we ran out of Coeff space then we are out of memory - we could possibly realloc?
49120+        rpi->max_pu_msgs += rpi->max_pu_msgs / 2;
49121+    }
49122+
49123+    // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we
49124+    // may reuse a live buffer when we kick the coeff sem
49125+    if (status == 0)
49126+    {
49127+        if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS)
49128+            rpi->coeffbuf_no = 0;
49129+    }
49130+    else
49131+    {
49132+        if (status == -1)
49133+        {
49134+            av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs);
49135+            rv = AVERROR_BUFFER_TOO_SMALL;
49136+        }
49137+        else
49138+        {
49139+            av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n");
49140+            rv = AVERROR_INVALIDDATA;
49141+        }
49142+    }
49143+
49144+    tend_phase(rpi, 1);
49145+    sem_post(&rpi->bitbuf_sem);
49146+    post_phase(rpi, de, 1);
49147+
49148+    if (status != 0)
49149+        goto fail;
49150+
49151+    // Phase 2 ---------------------------------------------------------------
49152+
49153+    wait_phase(rpi, de, 2);
49154+
49155+    if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0)
49156+    {
49157+        // As we are in phase 2 already here we don't need to worry about
49158+        // ceoffbuf_no despite the early exit
49159+        post_phase(rpi, de, 2);
49160+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n");
49161+        goto fail;
49162+    }
49163+
49164+    tstart_phase(rpi, 2);
49165+
49166+    apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc);
49167+    apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride);
49168+    apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc);
49169+    apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride);
49170+
49171+    apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f));
49172+    apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f));
49173+    apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128);
49174+    apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128);
49175+
49176+    // Keep the last thing we resolved as fallback for any ref we fail to
49177+    // resolve.  As a final fallback use our current frame.  The pels might
49178+    // not be there yet but at least the memory is valid.
49179+    //
49180+    // Attempt to resolve the entire DPB - we could note what we have used
49181+    // in ref lists but probably simpler and more reliable to set the whole thing
49182+    {
49183+        AVFrame * fallback_frame = f;
49184+        for (i = 0; i != 16; ++i) {
49185+            // Avoid current frame
49186+            const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i;
49187+            AVFrame * fr = hevc_fr->frame;
49188+
49189+            if (fr != NULL &&
49190+                av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0)
49191+            {
49192+                fallback_frame = fr;
49193+            }
49194+            else
49195+            {
49196+                fr = fallback_frame;
49197+            }
49198+
49199+            apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr));
49200+            apb_write(rpi, 0x9004+16*i, 0);
49201+            apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr));
49202+            apb_write(rpi, 0x900C+16*i, 0);
49203+        }
49204+    }
49205+
49206+    apb_write(rpi, RPI_CONFIG2,
49207+          (sps->bit_depth                             << 0) // BitDepthY
49208+        + (sps->bit_depth                             << 4) // BitDepthC
49209+       + ((sps->bit_depth>8)                          << 8) // BitDepthY
49210+       + ((sps->bit_depth>8)                          << 9) // BitDepthC
49211+        + (sps->log2_ctb_size                         <<10)
49212+        + (pps->constrained_intra_pred_flag           <<13)
49213+        + (sps->sps_strong_intra_smoothing_enable_flag<<14)
49214+        + (sps->sps_temporal_mvp_enabled_flag         <<15)
49215+        + (pps->log2_parallel_merge_level             <<16)
49216+        + (s->sh.slice_temporal_mvp_enabled_flag      <<19)
49217+        + (sps->pcm.loop_filter_disable_flag          <<20)
49218+       + ((pps->cb_qp_offset&31)                      <<21)
49219+       + ((pps->cr_qp_offset&31)                      <<26));
49220+
49221+    apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width);
49222+    apb_write(rpi, RPI_CURRPOC, s->poc);
49223+
49224+    // collocated reads/writes
49225+    if (sps->sps_temporal_mvp_enabled_flag) {
49226+        av_assert0(de->dpbno_col < RPIVID_COL_PICS);
49227+        av_assert0(dpbno_cur < RPIVID_COL_PICS);
49228+
49229+        apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride);
49230+        apb_write_vc_len(rpi, RPI_MVSTRIDE,  rpi->col_stride);
49231+        apb_write_vc_addr(rpi, RPI_MVBASE,  rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize);
49232+        apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize);
49233+    }
49234+
49235+#if TRACE_DEV && 0
49236+    apb_dump_regs(rpi, 0x0, 32);
49237+    apb_dump_regs(rpi, 0x8000, 24);
49238+#endif
49239+
49240+    apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY);
49241+    apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block
49242+
49243+    int_wait(rpi, 2);
49244+
49245+    tend_phase(rpi, 2);
49246+    coeffbuf_sem_claimed = 0;
49247+    sem_post(&rpi->coeffbuf_sem);
49248+    // Set valid here to avoid race in resolving in any pending phase 2
49249+    av_rpi_zc_set_valid_frame(f);
49250+
49251+    post_phase(rpi, de, 2);
49252+
49253+    // Flush frame for CPU access
49254+    // Arguably the best place would be at the start of phase 2 but here
49255+    // will overlap with the wait
49256+    //
49257+    // * Even better would be to have better lock/unlock control in ZC for external access
49258+    if (rpi->gpu_init_type == GPU_INIT_GPU)  // * CMA is currently always uncached
49259+    {
49260+        rpi_cache_buf_t cbuf;
49261+        rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf);
49262+        rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE);
49263+        rpi_cache_flush_finish(fe);
49264+    }
49265+
49266+#if TRACE_ENTRY
49267+    printf(">>> %s[%p] OK\n", __func__, de);
49268+#endif
49269+
49270+    dec_env_release(rpi, de);
49271+    return 0;
49272+
49273+fail:
49274+    av_rpi_zc_set_broken_frame(f);
49275+    if (coeffbuf_sem_claimed)
49276+        sem_post(&rpi->coeffbuf_sem);
49277+    abort_phases(rpi, de);  // Dummy any unresolved phases
49278+
49279+#if TRACE_ENTRY
49280+    printf(">>> %s[%p] FAIL\n", __func__, de);
49281+#endif
49282+
49283+    dec_env_release(rpi, de);
49284+    return rv;
49285+}
49286+
49287+//////////////////////////////////////////////////////////////////////////////
49288+
49289+
49290+#if TRACE_DEV
49291+static void dump_data(const uint8_t * p, size_t len)
49292+{
49293+    size_t i;
49294+    for (i = 0; i < len; i += 16) {
49295+        size_t j;
49296+        printf("%04x", i);
49297+        for (j = 0; j != 16; ++j) {
49298+            printf("%c%02x", i == 8 ? '-' : ' ', p[i+j]);
49299+        }
49300+        printf("\n");
49301+    }
49302+}
49303+#endif
49304+
49305+#if OPT_EMU
49306+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
49307+{
49308+    unsigned int z = 0;
49309+    while (idx--) {
49310+        if (*b++ == 0) {
49311+            ++z;
49312+            if (z >= 2 && *b == 3) {
49313+                ++b;
49314+                z = 0;
49315+            }
49316+        }
49317+        else {
49318+            z = 0;
49319+        }
49320+    }
49321+    return b;
49322+}
49323+#endif
49324+
49325+static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) {
49326+    const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes
49327+    const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware
49328+    const GetBitContext *gb = &s->HEVClc->gb;
49329+
49330+#if OPT_EMU
49331+    const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1);
49332+    const int len = de->nal_size - (ptr - de->nal_buffer);
49333+#else
49334+    const int len = 1 + gb->size_in_bits/8 - gb->index/8;
49335+    const void *ptr = &gb->buffer[gb->index/8];
49336+#endif
49337+
49338+#if TRACE_DEV
49339+    printf("Index=%d, /8=%#x\n", gb->index, gb->index/8);
49340+    dump_data(de->nal_buffer, 128);
49341+#endif
49342+
49343+    p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later
49344+    p1_apb_write(de, RPI_BFNUM, len);
49345+    p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop
49346+    p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6));
49347+}
49348+
49349+//////////////////////////////////////////////////////////////////////////////
49350+// Wavefront mode
49351+
49352+static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts)
49353+{
49354+    const HEVCPPS * const pps = s->ps.pps;
49355+
49356+    int i, resetQPY=1;
49357+    int indep = !s->sh.dependent_slice_segment_flag;
49358+    int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
49359+
49360+    if (ctb_addr_ts)
49361+        wpp_end_previous_slice(de, s, ctb_addr_ts);
49362+    pre_slice_decode(de, s);
49363+    WriteBitstream(de, s);
49364+    if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1)
49365+        WriteProb(de, s);
49366+    else if (ctb_col==0)
49367+        p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
49368+    else
49369+        resetQPY=0;
49370+    program_slicecmds(de, s->slice_idx);
49371+    new_slice_segment(de, s);
49372+    wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts);
49373+    for (i=0; i<s->sh.num_entry_point_offsets; i++) {
49374+        int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
49375+        int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
49376+        int last_x = de->PicWidthInCtbsY-1;
49377+        if (de->PicWidthInCtbsY>2)
49378+            wpp_pause(de, ctb_row);
49379+        p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2);
49380+        if (de->PicWidthInCtbsY==2)
49381+            p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
49382+        if (de->PicWidthInCtbsY==1)
49383+            WriteProb(de, s);
49384+        else
49385+            p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
49386+        ctb_addr_ts += pps->column_width[0];
49387+        wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
49388+    }
49389+}
49390+
49391+//////////////////////////////////////////////////////////////////////////////
49392+// Tiles mode
49393+
49394+static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
49395+    const HEVCPPS * const pps = s->ps.pps;
49396+    int i, resetQPY;
49397+
49398+    if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts);
49399+    pre_slice_decode(de, s);
49400+    WriteBitstream(de, s);
49401+    resetQPY = ctb_addr_ts==0
49402+            || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1]
49403+            || !s->sh.dependent_slice_segment_flag;
49404+    if (resetQPY) WriteProb(de, s);
49405+    program_slicecmds(de, s->slice_idx);
49406+    new_slice_segment(de, s);
49407+    new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts);
49408+    for (i=0; i<s->sh.num_entry_point_offsets; i++) {
49409+        int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
49410+        int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY;
49411+        int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
49412+        int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
49413+        int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
49414+        int last_x = pps->col_bd[tile_x+1]-1;
49415+        int last_y = pps->row_bd[tile_y+1]-1;
49416+        p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18));
49417+        WriteProb(de, s);
49418+        ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y];
49419+        new_entry_point(de, s, 0, 1, ctb_addr_ts);
49420+    }
49421+}
49422+
49423+//////////////////////////////////////////////////////////////////////////////
49424+
49425+static int cabac_start_align(HEVCContext *s)
49426+{
49427+    GetBitContext *gb = &s->HEVClc->gb;
49428+    skip_bits(gb, 1);
49429+    align_get_bits(gb);
49430+    // Should look at getting rid of this
49431+    return ff_init_cabac_decoder(&s->HEVClc->cc,
49432+                          gb->buffer + get_bits_count(gb) / 8,
49433+                          (get_bits_left(gb) + 7) / 8);
49434+}
49435+
49436+static int rpi_hevc_decode_slice(
49437+    AVCodecContext *avctx,
49438+    const uint8_t *buffer,
49439+    uint32_t size)
49440+{
49441+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
49442+    HEVCContext * const s = avctx->priv_data;
49443+    dec_env_t * const de = dec_env_get(avctx, rpi);
49444+    const HEVCPPS *pps = s->ps.pps;
49445+    int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
49446+
49447+#if TRACE_ENTRY
49448+    printf("<<< %s[%p]\n", __func__, de);
49449+#endif
49450+    if (de == NULL) {
49451+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
49452+        return -1;
49453+    }
49454+
49455+    if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) {
49456+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
49457+        return -1;
49458+    }
49459+    de->state = RPIVID_DECODE_SLICE;
49460+
49461+    de->nal_buffer = buffer;
49462+    de->nal_size   = size;
49463+
49464+#if !OPT_EMU
49465+//    ff_hevc_cabac_init(s, ctb_addr_ts);
49466+    cabac_start_align(s);
49467+#endif
49468+    if (s->ps.sps->scaling_list_enable_flag)
49469+        populate_scaling_factors(de, s);
49470+    pps->entropy_coding_sync_enabled_flag? wpp_decode_slice(de, s, ctb_addr_ts)
49471+                                             : decode_slice(de, s, ctb_addr_ts);
49472+#if TRACE_ENTRY
49473+    printf(">>> %s[%p]\n", __func__, de);
49474+#endif
49475+    dec_env_release(rpi, de);
49476+    return 0;
49477+}
49478+
49479+//////////////////////////////////////////////////////////////////////////////
49480+
49481+static int rpivid_retrieve_data(void *logctx, AVFrame *frame)
49482+{
49483+    int rv;
49484+    if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0)
49485+        av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n");
49486+    return rv;
49487+}
49488+
49489+static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
49490+{
49491+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
49492+    HEVCContext * const s = avctx->priv_data;
49493+    // Frame buffering + 1 output.  Would need thread_count extra but we now
49494+    // alloc at the start of phase 2 so that is the only thread we need the
49495+    // extra buffer for.
49496+    const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1;
49497+    int rv;
49498+
49499+    if (av_rpi_zc_in_use(avctx))
49500+    {
49501+        const AVZcEnvPtr zc = avctx->opaque;
49502+        av_rpi_zc_set_decoder_pool_size(zc, pool_req);
49503+        rv = av_rpi_zc_get_buffer(zc, frame);   // get_buffer2 would alloc
49504+    }
49505+    else
49506+    {
49507+        if (rpi->zc == NULL) {
49508+            pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
49509+            // Alloc inside lock to make sure we only ever alloc one
49510+            if (rpi->zc == NULL) {
49511+                rpi->zc = av_rpi_zc_int_env_alloc(s);
49512+            }
49513+            pthread_mutex_unlock(&rpi->phase_lock);
49514+        }
49515+        av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-)
49516+        rv = (rpi->zc == NULL) ? AVERROR(ENOMEM) :
49517+            av_rpi_zc_get_buffer(rpi->zc, frame);
49518+    }
49519+
49520+    if (rv == 0 &&
49521+        (rv = ff_attach_decode_data(frame)) < 0)
49522+    {
49523+        av_frame_unref(frame);
49524+    }
49525+
49526+    if (rv == 0)
49527+    {
49528+        FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
49529+        fdd->post_process = rpivid_retrieve_data;
49530+    }
49531+
49532+    return rv;
49533+}
49534+
49535+#if OPT_PHASE_TIMING
49536+static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins)
49537+{
49538+    av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n",
49539+           bins[0],  bins[1], bins[2], bins[3],
49540+           bins[4],  bins[5], bins[6], bins[7], bins[8]);
49541+}
49542+#endif
49543+
49544+//////////////////////////////////////////////////////////////////////////////
49545+
49546+static int rpi_hevc_free(AVCodecContext *avctx) {
49547+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
49548+
49549+#if TRACE_ENTRY
49550+    printf("<<< %s\n", __func__);
49551+#endif
49552+
49553+    dec_env_release(rpi, NULL);
49554+
49555+    // Wait for everything else to stop
49556+    {
49557+        struct timespec tt;
49558+        clock_gettime(CLOCK_REALTIME, &tt);
49559+        tt.tv_sec += 2;
49560+        while (sem_timedwait(&rpi->ref_zero, &tt) == -1) {
49561+            const int err = errno;
49562+            if (err == ETIMEDOUT) {
49563+                av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n");
49564+                return -1;
49565+            }
49566+            if (err != EINTR) {
49567+                av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err);
49568+                break;
49569+            }
49570+        }
49571+    }
49572+
49573+#if OPT_PHASE_TIMING
49574+    {
49575+        unsigned int i;
49576+        for (i = 0; i != RPIVID_PHASES; ++i) {
49577+            const phase_wait_env_t * const p = rpi->phase_reqs + i;
49578+            av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i,
49579+                   (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000),
49580+                   (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000));
49581+            av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d        >\n",
49582+                   time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3],
49583+                   time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]);
49584+            log_bin_phase(avctx, p->time_bins);
49585+            log_bin_phase(avctx, p->time_bins3);
49586+            log_bin_phase(avctx, p->time_bins5);
49587+            av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n",
49588+                   (unsigned int)(p->max_phase_time / 1000),
49589+                   p->max_time_decode_order);
49590+        }
49591+        av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs);
49592+    }
49593+#endif
49594+
49595+    if (rpi->dec_envs != NULL)
49596+    {
49597+        for (int i; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) {
49598+            dec_env_delete(rpi->dec_envs[i]);
49599+        }
49600+        av_freep(&rpi->dec_envs);
49601+    }
49602+
49603+    av_rpi_zc_int_env_freep(&rpi->zc);
49604+
49605+    gpu_free(&rpi->gcolbuf);
49606+
49607+    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
49608+        gpu_free(rpi->gbitbufs + i);
49609+    }
49610+    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
49611+        gpu_free(rpi->gcoeffbufs + i);
49612+    }
49613+
49614+    unmap_devp(&rpi->regs, REGS_SIZE);
49615+    unmap_devp(&rpi->ints, INTS_SIZE);
49616+
49617+    if (rpi->gpu_init_type > 0)
49618+        rpi_mem_gpu_uninit();
49619+
49620+    if (rpi->mbox_fd >= 0) {
49621+        mbox_release_clock(rpi->mbox_fd);
49622+        mbox_close(rpi->mbox_fd);
49623+    }
49624+
49625+    sem_destroy(&rpi->ref_zero);
49626+    sem_destroy(&rpi->coeffbuf_sem);
49627+    sem_destroy(&rpi->bitbuf_sem);
49628+
49629+#if TRACE_ENTRY
49630+    printf(">>> %s\n", __func__);
49631+#endif
49632+    return 0;
49633+}
49634+
49635+//////////////////////////////////////////////////////////////////////////////
49636+
49637+static int rpi_hevc_init(AVCodecContext *avctx) {
49638+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
49639+//    const char *err;
49640+
49641+#if TRACE_ENTRY
49642+    printf("<<< %s\n", __func__);
49643+#endif
49644+
49645+    if (avctx->width>4096 || avctx->height>4096) {
49646+        av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height);
49647+        return AVERROR(ENOTSUP);
49648+    }
49649+
49650+    memset(rpi, 0, sizeof(*rpi));
49651+
49652+    rpi->mbox_fd = -1;
49653+    rpi->decode_order = 0;
49654+
49655+    // Initial PU/COEFF stream buffer split chosen as worst case seen so far
49656+    rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU
49657+
49658+
49659+    atomic_store(&rpi->ref_count, 1);
49660+    sem_init(&rpi->ref_zero, 0, 0);
49661+
49662+    sem_init(&rpi->bitbuf_sem,   0, RPIVID_BITBUFS);
49663+    sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS);
49664+
49665+    pthread_mutex_init(&rpi->phase_lock, NULL);
49666+
49667+    if ((rpi->mbox_fd = mbox_open()) < 0)
49668+    {
49669+        av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n");
49670+        goto fail;
49671+    }
49672+    mbox_request_clock(rpi->mbox_fd);
49673+
49674+    if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL ||
49675+        (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) {
49676+        av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n");
49677+        goto fail;
49678+    }
49679+
49680+    if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) {
49681+        av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n");
49682+        goto fail;
49683+    }
49684+
49685+    if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) {
49686+        av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count);
49687+        goto fail;
49688+    }
49689+
49690+    rpi->col_stride = rnd64(avctx->width);
49691+    rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4);
49692+    if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0)
49693+    {
49694+        av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n");
49695+        goto fail;
49696+    }
49697+
49698+    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
49699+        if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0)
49700+        {
49701+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i);
49702+            goto fail;
49703+        }
49704+    }
49705+
49706+    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
49707+        if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0)
49708+        {
49709+            av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i);
49710+            goto fail;
49711+        }
49712+    }
49713+
49714+    av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n");
49715+
49716+    return 0;
49717+
49718+fail:
49719+    rpi_hevc_free(avctx);
49720+    return AVERROR_EXTERNAL;
49721+}
49722+
49723+//////////////////////////////////////////////////////////////////////////////
49724+
49725+const AVHWAccel ff_hevc_rpi4_8_hwaccel = {
49726+    .name           = "hevc_rpi4_8",
49727+    .type           = AVMEDIA_TYPE_VIDEO,
49728+    .id             = AV_CODEC_ID_HEVC,
49729+    .pix_fmt        = AV_PIX_FMT_RPI4_8,
49730+    .alloc_frame    = rpivid_hevc_alloc_frame,
49731+    .start_frame    = rpi_hevc_start_frame,
49732+    .end_frame      = rpi_hevc_end_frame,
49733+    .abort_frame    = rpi_hevc_abort_frame,
49734+    .decode_slice   = rpi_hevc_decode_slice,
49735+    .init           = rpi_hevc_init,
49736+    .uninit         = rpi_hevc_free,
49737+    .priv_data_size = sizeof(RPI_T),
49738+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
49739+};
49740+
49741+const AVHWAccel ff_hevc_rpi4_10_hwaccel = {
49742+    .name           = "hevc_rpi4_10",
49743+    .type           = AVMEDIA_TYPE_VIDEO,
49744+    .id             = AV_CODEC_ID_HEVC,
49745+    .pix_fmt        = AV_PIX_FMT_RPI4_10,
49746+    .alloc_frame    = rpivid_hevc_alloc_frame,
49747+    .start_frame    = rpi_hevc_start_frame,
49748+    .end_frame      = rpi_hevc_end_frame,
49749+    .abort_frame    = rpi_hevc_abort_frame,
49750+    .decode_slice   = rpi_hevc_decode_slice,
49751+    .init           = rpi_hevc_init,
49752+    .uninit         = rpi_hevc_free,
49753+    .priv_data_size = sizeof(RPI_T),
49754+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
49755+};
49756+
49757--- a/libavcodec/v4l2_buffers.c
49758+++ b/libavcodec/v4l2_buffers.c
49759@@ -21,6 +21,7 @@
49760  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49761  */
49762
49763+#include <drm_fourcc.h>
49764 #include <linux/videodev2.h>
49765 #include <sys/ioctl.h>
49766 #include <sys/mman.h>
49767@@ -29,57 +30,82 @@
49768 #include <poll.h>
49769 #include "libavcodec/avcodec.h"
49770 #include "libavcodec/internal.h"
49771+#include "libavutil/avassert.h"
49772 #include "libavutil/pixdesc.h"
49773+#include "libavutil/hwcontext.h"
49774 #include "v4l2_context.h"
49775 #include "v4l2_buffers.h"
49776 #include "v4l2_m2m.h"
49777+#include "weak_link.h"
49778
49779 #define USEC_PER_SEC 1000000
49780-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
49781+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
49782
49783-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
49784+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
49785 {
49786     return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
49787         container_of(buf->context, V4L2m2mContext, output) :
49788         container_of(buf->context, V4L2m2mContext, capture);
49789 }
49790
49791-static inline AVCodecContext *logger(V4L2Buffer *buf)
49792+static inline AVCodecContext *logger(const V4L2Buffer * const buf)
49793 {
49794     return buf_to_m2mctx(buf)->avctx;
49795 }
49796
49797-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
49798+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
49799 {
49800-    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
49801-
49802-    if (s->avctx->pkt_timebase.num)
49803-        return s->avctx->pkt_timebase;
49804-    return s->avctx->time_base;
49805+    const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
49806+    const AVRational tb = s->avctx->pkt_timebase.num ?
49807+        s->avctx->pkt_timebase :
49808+        s->avctx->time_base;
49809+    return tb.num && tb.den ? tb : v4l2_timebase;
49810 }
49811
49812-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
49813+static inline struct timeval tv_from_int(const int64_t t)
49814 {
49815-    int64_t v4l2_pts;
49816+    return (struct timeval){
49817+        .tv_usec = t % USEC_PER_SEC,
49818+        .tv_sec  = t / USEC_PER_SEC
49819+    };
49820+}
49821
49822-    if (pts == AV_NOPTS_VALUE)
49823-        pts = 0;
49824+static inline int64_t int_from_tv(const struct timeval t)
49825+{
49826+    return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
49827+}
49828
49829+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
49830+{
49831     /* convert pts to v4l2 timebase */
49832-    v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
49833-    out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
49834-    out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
49835+    const int64_t v4l2_pts =
49836+        pts == AV_NOPTS_VALUE ? 0 :
49837+            av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
49838+    out->buf.timestamp = tv_from_int(v4l2_pts);
49839 }
49840
49841-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
49842+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
49843 {
49844-    int64_t v4l2_pts;
49845-
49846+    const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
49847+    return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
49848+#if 0
49849     /* convert pts back to encoder timebase */
49850-    v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
49851-                        avbuf->buf.timestamp.tv_usec;
49852+    return
49853+        avbuf->context->no_pts_rescale ? v4l2_pts :
49854+        v4l2_pts == 0 ? AV_NOPTS_VALUE :
49855+            av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
49856+#endif
49857+}
49858
49859-    return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
49860+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
49861+{
49862+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
49863+        out->planes[plane].bytesused = bytesused;
49864+        out->planes[plane].length = length;
49865+    } else {
49866+        out->buf.bytesused = bytesused;
49867+        out->buf.length = length;
49868+    }
49869 }
49870
49871 static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
49872@@ -116,49 +142,176 @@ static enum AVColorPrimaries v4l2_get_co
49873     return AVCOL_PRI_UNSPECIFIED;
49874 }
49875
49876-static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
49877-{
49878-    enum v4l2_quantization qt;
49879+static void v4l2_set_color(V4L2Buffer *buf,
49880+                           const enum AVColorPrimaries avcp,
49881+                           const enum AVColorSpace avcs,
49882+                           const enum AVColorTransferCharacteristic avxc)
49883+{
49884+    enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
49885+    enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
49886+    enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
49887+
49888+    switch (avcp) {
49889+    case AVCOL_PRI_BT709:
49890+        cs = V4L2_COLORSPACE_REC709;
49891+        ycbcr = V4L2_YCBCR_ENC_709;
49892+        break;
49893+    case AVCOL_PRI_BT470M:
49894+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
49895+        ycbcr = V4L2_YCBCR_ENC_601;
49896+        break;
49897+    case AVCOL_PRI_BT470BG:
49898+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
49899+        break;
49900+    case AVCOL_PRI_SMPTE170M:
49901+        cs = V4L2_COLORSPACE_SMPTE170M;
49902+        break;
49903+    case AVCOL_PRI_SMPTE240M:
49904+        cs = V4L2_COLORSPACE_SMPTE240M;
49905+        break;
49906+    case AVCOL_PRI_BT2020:
49907+        cs = V4L2_COLORSPACE_BT2020;
49908+        break;
49909+    case AVCOL_PRI_SMPTE428:
49910+    case AVCOL_PRI_SMPTE431:
49911+    case AVCOL_PRI_SMPTE432:
49912+    case AVCOL_PRI_EBU3213:
49913+    case AVCOL_PRI_RESERVED:
49914+    case AVCOL_PRI_FILM:
49915+    case AVCOL_PRI_UNSPECIFIED:
49916+    default:
49917+        break;
49918+    }
49919
49920-    qt = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
49921-        buf->context->format.fmt.pix_mp.quantization :
49922-        buf->context->format.fmt.pix.quantization;
49923+    switch (avcs) {
49924+    case AVCOL_SPC_RGB:
49925+        cs = V4L2_COLORSPACE_SRGB;
49926+        break;
49927+    case AVCOL_SPC_BT709:
49928+        cs = V4L2_COLORSPACE_REC709;
49929+        break;
49930+    case AVCOL_SPC_FCC:
49931+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
49932+        break;
49933+    case AVCOL_SPC_BT470BG:
49934+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
49935+        break;
49936+    case AVCOL_SPC_SMPTE170M:
49937+        cs = V4L2_COLORSPACE_SMPTE170M;
49938+        break;
49939+    case AVCOL_SPC_SMPTE240M:
49940+        cs = V4L2_COLORSPACE_SMPTE240M;
49941+        break;
49942+    case AVCOL_SPC_BT2020_CL:
49943+        cs = V4L2_COLORSPACE_BT2020;
49944+        ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
49945+        break;
49946+    case AVCOL_SPC_BT2020_NCL:
49947+        cs = V4L2_COLORSPACE_BT2020;
49948+        break;
49949+    default:
49950+        break;
49951+    }
49952
49953-    switch (qt) {
49954-    case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
49955-    case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
49956+    switch (xfer) {
49957+    case AVCOL_TRC_BT709:
49958+        xfer = V4L2_XFER_FUNC_709;
49959+        break;
49960+    case AVCOL_TRC_IEC61966_2_1:
49961+        xfer = V4L2_XFER_FUNC_SRGB;
49962+        break;
49963+    case AVCOL_TRC_SMPTE240M:
49964+        xfer = V4L2_XFER_FUNC_SMPTE240M;
49965+        break;
49966+    case AVCOL_TRC_SMPTE2084:
49967+        xfer = V4L2_XFER_FUNC_SMPTE2084;
49968+        break;
49969     default:
49970         break;
49971     }
49972
49973-     return AVCOL_RANGE_UNSPECIFIED;
49974+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
49975+        buf->context->format.fmt.pix_mp.colorspace = cs;
49976+        buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
49977+        buf->context->format.fmt.pix_mp.xfer_func = xfer;
49978+    } else {
49979+        buf->context->format.fmt.pix.colorspace = cs;
49980+        buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
49981+        buf->context->format.fmt.pix.xfer_func = xfer;
49982+    }
49983 }
49984
49985-static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
49986+static inline enum v4l2_quantization
49987+buf_quantization(const V4L2Buffer * const buf)
49988 {
49989-    enum v4l2_ycbcr_encoding ycbcr;
49990-    enum v4l2_colorspace cs;
49991+    return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
49992+        buf->context->format.fmt.pix_mp.quantization :
49993+        buf->context->format.fmt.pix.quantization;
49994+}
49995
49996-    cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
49997+static inline enum v4l2_colorspace
49998+buf_colorspace(const V4L2Buffer * const buf)
49999+{
50000+    return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
50001         buf->context->format.fmt.pix_mp.colorspace :
50002         buf->context->format.fmt.pix.colorspace;
50003+}
50004
50005-    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
50006+static inline enum v4l2_ycbcr_encoding
50007+buf_ycbcr_enc(const V4L2Buffer * const buf)
50008+{
50009+    return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
50010         buf->context->format.fmt.pix_mp.ycbcr_enc:
50011         buf->context->format.fmt.pix.ycbcr_enc;
50012+}
50013
50014-    switch(cs) {
50015-    case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
50016+static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
50017+{
50018+    switch (buf_quantization(buf)) {
50019+    case V4L2_QUANTIZATION_LIM_RANGE:
50020+        return AVCOL_RANGE_MPEG;
50021+    case V4L2_QUANTIZATION_FULL_RANGE:
50022+        return AVCOL_RANGE_JPEG;
50023+    case V4L2_QUANTIZATION_DEFAULT:
50024+        // If YUV (which we assume for all video decode) then, from the header
50025+        // comments, range is limited unless CS is JPEG
50026+        return buf_colorspace(buf) == V4L2_COLORSPACE_JPEG ?
50027+            AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
50028+    default:
50029+        break;
50030+    }
50031+
50032+     return AVCOL_RANGE_UNSPECIFIED;
50033+}
50034+
50035+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
50036+{
50037+    const enum v4l2_quantization q =
50038+        avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
50039+        avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
50040+            V4L2_QUANTIZATION_DEFAULT;
50041+
50042+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
50043+        buf->context->format.fmt.pix_mp.quantization = q;
50044+    } else {
50045+        buf->context->format.fmt.pix.quantization = q;
50046+    }
50047+}
50048+
50049+static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
50050+{
50051+    switch (buf_colorspace(buf)) {
50052+    case V4L2_COLORSPACE_JPEG:  // JPEG -> SRGB
50053+    case V4L2_COLORSPACE_SRGB:
50054+        return AVCOL_SPC_RGB;
50055     case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
50056     case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
50057     case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
50058     case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
50059     case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
50060     case V4L2_COLORSPACE_BT2020:
50061-        if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
50062-            return AVCOL_SPC_BT2020_CL;
50063-        else
50064-             return AVCOL_SPC_BT2020_NCL;
50065+        return buf_ycbcr_enc(buf) == V4L2_YCBCR_ENC_BT2020_CONST_LUM ?
50066+            AVCOL_SPC_BT2020_CL : AVCOL_SPC_BT2020_NCL;
50067     default:
50068         break;
50069     }
50070@@ -168,17 +321,9 @@ static enum AVColorSpace v4l2_get_color_
50071
50072 static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
50073 {
50074-    enum v4l2_ycbcr_encoding ycbcr;
50075+    const enum v4l2_ycbcr_encoding ycbcr = buf_ycbcr_enc(buf);
50076     enum v4l2_xfer_func xfer;
50077-    enum v4l2_colorspace cs;
50078-
50079-    cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
50080-        buf->context->format.fmt.pix_mp.colorspace :
50081-        buf->context->format.fmt.pix.colorspace;
50082-
50083-    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
50084-        buf->context->format.fmt.pix_mp.ycbcr_enc:
50085-        buf->context->format.fmt.pix.ycbcr_enc;
50086+    const enum v4l2_colorspace cs = buf_colorspace(buf);
50087
50088     xfer = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
50089         buf->context->format.fmt.pix_mp.xfer_func:
50090@@ -210,73 +355,165 @@ static enum AVColorTransferCharacteristi
50091     return AVCOL_TRC_UNSPECIFIED;
50092 }
50093
50094-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
50095+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
50096 {
50097-    V4L2Buffer* avbuf = opaque;
50098-    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
50099-
50100-    if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
50101-        atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
50102+    return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
50103+}
50104
50105-        if (s->reinit) {
50106-            if (!atomic_load(&s->refcount))
50107-                sem_post(&s->refsync);
50108-        } else {
50109-            if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
50110-                /* no need to queue more buffers to the driver */
50111-                avbuf->status = V4L2BUF_AVAILABLE;
50112-            }
50113-            else if (avbuf->context->streamon)
50114-                ff_v4l2_buffer_enqueue(avbuf);
50115-        }
50116+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
50117+{
50118+    return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
50119+}
50120
50121-        av_buffer_unref(&avbuf->context_ref);
50122-    }
50123+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
50124+{
50125+    buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE :
50126+        is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
50127 }
50128
50129-static int v4l2_buf_increase_ref(V4L2Buffer *in)
50130+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
50131 {
50132-    V4L2m2mContext *s = buf_to_m2mctx(in);
50133+    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
50134+    AVDRMLayerDescriptor *layer;
50135
50136-    if (in->context_ref)
50137-        atomic_fetch_add(&in->context_refcount, 1);
50138-    else {
50139-        in->context_ref = av_buffer_ref(s->self_ref);
50140-        if (!in->context_ref)
50141-            return AVERROR(ENOMEM);
50142+    /* fill the DRM frame descriptor */
50143+    drm_desc->nb_objects = avbuf->num_planes;
50144+    drm_desc->nb_layers = 1;
50145
50146-        in->context_refcount = 1;
50147+    layer = &drm_desc->layers[0];
50148+    layer->nb_planes = avbuf->num_planes;
50149+
50150+    for (int i = 0; i < avbuf->num_planes; i++) {
50151+        layer->planes[i].object_index = i;
50152+        layer->planes[i].offset = 0;
50153+        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
50154     }
50155
50156-    in->status = V4L2BUF_RET_USER;
50157-    atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
50158+    switch (avbuf->context->av_pix_fmt) {
50159+    case AV_PIX_FMT_YUYV422:
50160+
50161+        layer->format = DRM_FORMAT_YUYV;
50162+        layer->nb_planes = 1;
50163
50164-    return 0;
50165+        break;
50166+
50167+    case AV_PIX_FMT_NV12:
50168+    case AV_PIX_FMT_NV21:
50169+
50170+        layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
50171+            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
50172+
50173+        if (avbuf->num_planes > 1)
50174+            break;
50175+
50176+        layer->nb_planes = 2;
50177+
50178+        layer->planes[1].object_index = 0;
50179+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
50180+            avbuf->context->format.fmt.pix.height;
50181+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
50182+        break;
50183+
50184+    case AV_PIX_FMT_YUV420P:
50185+
50186+        layer->format = DRM_FORMAT_YUV420;
50187+
50188+        if (avbuf->num_planes > 1)
50189+            break;
50190+
50191+        layer->nb_planes = 3;
50192+
50193+        layer->planes[1].object_index = 0;
50194+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
50195+            avbuf->context->format.fmt.pix.height;
50196+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
50197+
50198+        layer->planes[2].object_index = 0;
50199+        layer->planes[2].offset = layer->planes[1].offset +
50200+            ((avbuf->plane_info[0].bytesperline *
50201+              avbuf->context->format.fmt.pix.height) >> 2);
50202+        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
50203+        break;
50204+
50205+    default:
50206+        drm_desc->nb_layers = 0;
50207+        break;
50208+    }
50209+
50210+    return (uint8_t *) drm_desc;
50211 }
50212
50213-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
50214+static void v4l2_free_bufref(void *opaque, uint8_t *data)
50215 {
50216-    int ret;
50217+    AVBufferRef * bufref = (AVBufferRef *)data;
50218+    V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
50219+    struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
50220
50221-    if (plane >= in->num_planes)
50222-        return AVERROR(EINVAL);
50223+    if (ctx != NULL) {
50224+        // Buffer still attached to context
50225+        V4L2m2mContext *s = buf_to_m2mctx(avbuf);
50226
50227-    /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
50228-    *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
50229-                            in->plane_info[plane].length, v4l2_free_buffer, in, 0);
50230-    if (!*buf)
50231-        return AVERROR(ENOMEM);
50232+        ff_mutex_lock(&ctx->lock);
50233
50234-    ret = v4l2_buf_increase_ref(in);
50235-    if (ret)
50236-        av_buffer_unref(buf);
50237+        ff_v4l2_buffer_set_avail(avbuf);
50238
50239-    return ret;
50240+        if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
50241+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
50242+            /* no need to queue more buffers to the driver */
50243+        }
50244+        else if (ctx->streamon) {
50245+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
50246+            avbuf->buf.timestamp.tv_sec = 0;
50247+            avbuf->buf.timestamp.tv_usec = 0;
50248+            ff_v4l2_buffer_enqueue(avbuf);  // will set to IN_DRIVER
50249+        }
50250+        else {
50251+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
50252+        }
50253+
50254+        ff_mutex_unlock(&ctx->lock);
50255+    }
50256+
50257+    ff_weak_link_unlock(avbuf->context_wl);
50258+    av_buffer_unref(&bufref);
50259 }
50260
50261-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref)
50262+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
50263+{
50264+    struct v4l2_exportbuffer expbuf;
50265+    int i, ret;
50266+
50267+    for (i = 0; i < avbuf->num_planes; i++) {
50268+        memset(&expbuf, 0, sizeof(expbuf));
50269+
50270+        expbuf.index = avbuf->buf.index;
50271+        expbuf.type = avbuf->buf.type;
50272+        expbuf.plane = i;
50273+
50274+        ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
50275+        if (ret < 0)
50276+            return AVERROR(errno);
50277+
50278+        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
50279+            /* drm frame */
50280+            avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
50281+            avbuf->drm_frame.objects[i].fd = expbuf.fd;
50282+            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
50283+        } else {
50284+            /* drm frame */
50285+            avbuf->drm_frame.objects[0].size = avbuf->buf.length;
50286+            avbuf->drm_frame.objects[0].fd = expbuf.fd;
50287+            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
50288+        }
50289+    }
50290+
50291+    return 0;
50292+}
50293+
50294+static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
50295 {
50296     unsigned int bytesused, length;
50297+    int rv = 0;
50298
50299     if (plane >= out->num_planes)
50300         return AVERROR(EINVAL);
50301@@ -284,32 +521,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer
50302     length = out->plane_info[plane].length;
50303     bytesused = FFMIN(size+offset, length);
50304
50305-    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
50306-
50307-    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
50308-        out->planes[plane].bytesused = bytesused;
50309-        out->planes[plane].length = length;
50310-    } else {
50311-        out->buf.bytesused = bytesused;
50312-        out->buf.length = length;
50313+    if (size > length - offset) {
50314+        size = length - offset;
50315+        rv = AVERROR(ENOMEM);
50316     }
50317
50318-    return 0;
50319+    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
50320+
50321+    set_buf_length(out, plane, bytesused, length);
50322+
50323+    return rv;
50324+}
50325+
50326+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
50327+{
50328+    AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
50329+    AVBufferRef * newbuf;
50330+
50331+    if (!bufref)
50332+        return NULL;
50333+
50334+    newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
50335+    if (newbuf == NULL)
50336+        av_buffer_unref(&bufref);
50337+
50338+    avbuf->status = V4L2BUF_RET_USER;
50339+    return newbuf;
50340 }
50341
50342 static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
50343 {
50344-    int i, ret;
50345+    int i;
50346
50347     frame->format = avbuf->context->av_pix_fmt;
50348
50349-    for (i = 0; i < avbuf->num_planes; i++) {
50350-        ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
50351-        if (ret)
50352-            return ret;
50353+    frame->buf[0] = wrap_avbuf(avbuf);
50354+    if (frame->buf[0] == NULL)
50355+        return AVERROR(ENOMEM);
50356+
50357+    if (buf_to_m2mctx(avbuf)->output_drm) {
50358+        /* 1. get references to the actual data */
50359+        frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
50360+        frame->format = AV_PIX_FMT_DRM_PRIME;
50361+        frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
50362+        return 0;
50363+    }
50364
50365+
50366+    /* 1. get references to the actual data */
50367+    for (i = 0; i < avbuf->num_planes; i++) {
50368+        frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
50369         frame->linesize[i] = avbuf->plane_info[i].bytesperline;
50370-        frame->data[i] = frame->buf[i]->data;
50371     }
50372
50373     /* fixup special cases */
50374@@ -318,17 +580,17 @@ static int v4l2_buffer_buf_to_swframe(AV
50375     case AV_PIX_FMT_NV21:
50376         if (avbuf->num_planes > 1)
50377             break;
50378-        frame->linesize[1] = avbuf->plane_info[0].bytesperline;
50379-        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
50380+        frame->linesize[1] = frame->linesize[0];
50381+        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
50382         break;
50383
50384     case AV_PIX_FMT_YUV420P:
50385         if (avbuf->num_planes > 1)
50386             break;
50387-        frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
50388-        frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
50389-        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
50390-        frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
50391+        frame->linesize[1] = frame->linesize[0] / 2;
50392+        frame->linesize[2] = frame->linesize[1];
50393+        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
50394+        frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
50395         break;
50396
50397     default:
50398@@ -338,68 +600,127 @@ static int v4l2_buffer_buf_to_swframe(AV
50399     return 0;
50400 }
50401
50402+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
50403+{
50404+    if (dst_stride == src_stride && w + 32 >= dst_stride) {
50405+        memcpy(dst, src, dst_stride * h);
50406+    }
50407+    else {
50408+        while (--h >= 0) {
50409+            memcpy(dst, src, w);
50410+            dst += dst_stride;
50411+            src += src_stride;
50412+        }
50413+    }
50414+}
50415+
50416+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
50417+{
50418+    return i != 0  && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
50419+}
50420+
50421+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
50422+{
50423+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
50424+
50425+    if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
50426+        return AVERROR(EINVAL);
50427+
50428+    av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
50429+
50430+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
50431+        // Only currently cope with single buffer types
50432+        if (out->buf.length != 1)
50433+            return AVERROR_PATCHWELCOME;
50434+        if (src->nb_objects != 1)
50435+            return AVERROR(EINVAL);
50436+
50437+        out->planes[0].m.fd = src->objects[0].fd;
50438+    }
50439+    else {
50440+        if (src->nb_objects != 1)
50441+            return AVERROR(EINVAL);
50442+
50443+        out->buf.m.fd      = src->objects[0].fd;
50444+    }
50445+
+    // Deliberately do not copy the source AVDRMFrameDescriptor: a copy could
+    // cause the dma-buf fds to be closed twice on free; hold a ref instead.
50448+    out->ref_buf = av_buffer_ref(frame->buf[0]);
50449+
50450+    return 0;
50451+}
50452+
50453 static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
50454 {
50455-    int i, ret;
50456-    struct v4l2_format fmt = out->context->format;
50457-    int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
50458-                       fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
50459-    int height       = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
50460-                       fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
50461-    int is_planar_format = 0;
50462-
50463-    switch (pixel_format) {
50464-    case V4L2_PIX_FMT_YUV420M:
50465-    case V4L2_PIX_FMT_YVU420M:
50466-#ifdef V4L2_PIX_FMT_YUV422M
50467-    case V4L2_PIX_FMT_YUV422M:
50468-#endif
50469-#ifdef V4L2_PIX_FMT_YVU422M
50470-    case V4L2_PIX_FMT_YVU422M:
50471-#endif
50472-#ifdef V4L2_PIX_FMT_YUV444M
50473-    case V4L2_PIX_FMT_YUV444M:
50474-#endif
50475-#ifdef V4L2_PIX_FMT_YVU444M
50476-    case V4L2_PIX_FMT_YVU444M:
50477-#endif
50478-    case V4L2_PIX_FMT_NV12M:
50479-    case V4L2_PIX_FMT_NV21M:
50480-    case V4L2_PIX_FMT_NV12MT_16X16:
50481-    case V4L2_PIX_FMT_NV12MT:
50482-    case V4L2_PIX_FMT_NV16M:
50483-    case V4L2_PIX_FMT_NV61M:
50484-        is_planar_format = 1;
50485-    }
50486-
50487-    if (!is_planar_format) {
50488-        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
50489-        int planes_nb = 0;
50490-        int offset = 0;
50491-
50492-        for (i = 0; i < desc->nb_components; i++)
50493-            planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
50494-
50495-        for (i = 0; i < planes_nb; i++) {
50496-            int size, h = height;
50497-            if (i == 1 || i == 2) {
50498+    int i;
50499+    int num_planes = 0;
50500+    int pel_strides[4] = {0};
50501+
50502+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
50503+
50504+    if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
50505+        av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
50506+        return -1;
50507+    }
50508+
50509+    for (i = 0; i != desc->nb_components; ++i) {
50510+        if (desc->comp[i].plane >= num_planes)
50511+            num_planes = desc->comp[i].plane + 1;
50512+        pel_strides[desc->comp[i].plane] = desc->comp[i].step;
50513+    }
50514+
50515+    if (out->num_planes > 1) {
50516+        if (num_planes != out->num_planes) {
50517+            av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
50518+            return -1;
50519+        }
50520+        for (i = 0; i != num_planes; ++i) {
50521+            int w = frame->width;
50522+            int h = frame->height;
50523+            if (is_chroma(desc, i, num_planes)) {
50524+                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
50525                 h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
50526             }
50527-            size = frame->linesize[i] * h;
50528-            ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset, frame->buf[i]);
50529-            if (ret)
50530-                return ret;
50531-            offset += size;
50532+
50533+            cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
50534+                   frame->data[i], frame->linesize[i],
50535+                   w * pel_strides[i], h);
50536+            set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
50537         }
50538-        return 0;
50539     }
50540+    else
50541+    {
50542+        unsigned int offset = 0;
50543+
50544+        for (i = 0; i != num_planes; ++i) {
50545+            int w = frame->width;
50546+            int h = frame->height;
50547+            int dst_stride = out->plane_info[0].bytesperline;
50548+            uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
50549+
50550+            if (is_chroma(desc, i, num_planes)) {
50551+                // Is chroma
50552+                dst_stride >>= desc->log2_chroma_w;
50553+                offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
50554+                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
50555+                h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
50556+            }
50557+            else {
50558+                // Is luma or alpha
50559+                offset += dst_stride * out->context->height;
50560+            }
50561+            if (offset > out->plane_info[0].length) {
50562+                av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
50563+                return -1;
50564+            }
50565
50566-    for (i = 0; i < out->num_planes; i++) {
50567-        ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]);
50568-        if (ret)
50569-            return ret;
50570+            cpy_2d(dst, dst_stride,
50571+                   frame->data[i], frame->linesize[i],
50572+                   w * pel_strides[i], h);
50573+        }
50574+        set_buf_length(out, 0, offset, out->plane_info[0].length);
50575     }
50576-
50577     return 0;
50578 }
50579
50580@@ -409,16 +730,31 @@ static int v4l2_buffer_swframe_to_buf(co
50581  *
50582  ******************************************************************************/
50583
50584-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
50585+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
50586 {
50587-    v4l2_set_pts(out, frame->pts);
50588-
50589-    return v4l2_buffer_swframe_to_buf(frame, out);
50590+    out->buf.flags = frame->key_frame ?
50591+        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
50592+        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+    // Note: colour info is stored in the context format, not in the v4l2
+    // buffer struct itself, so setting it per-buffer has limited effect.
50595+    v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
50596+    v4l2_set_color_range(out, frame->color_range);
50597+    // PTS & interlace are buffer vars
50598+    if (track_ts)
50599+        out->buf.timestamp = tv_from_int(track_ts);
50600+    else
50601+        v4l2_set_pts(out, frame->pts);
50602+    v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
50603+
50604+    return frame->format == AV_PIX_FMT_DRM_PRIME ?
50605+        v4l2_buffer_primeframe_to_buf(frame, out) :
50606+        v4l2_buffer_swframe_to_buf(frame, out);
50607 }
50608
50609 int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
50610 {
50611     int ret;
50612+    V4L2Context * const ctx = avbuf->context;
50613
50614     av_frame_unref(frame);
50615
50616@@ -429,17 +765,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
50617
50618     /* 2. get frame information */
50619     frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
50620+    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
50621+        (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
50622+        (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
50623+            AV_PICTURE_TYPE_NONE;
50624     frame->color_primaries = v4l2_get_color_primaries(avbuf);
50625     frame->colorspace = v4l2_get_color_space(avbuf);
50626     frame->color_range = v4l2_get_color_range(avbuf);
50627     frame->color_trc = v4l2_get_color_trc(avbuf);
50628     frame->pts = v4l2_get_pts(avbuf);
50629     frame->pkt_dts = AV_NOPTS_VALUE;
50630+    frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
50631+    frame->top_field_first = v4l2_buf_is_top_first(avbuf);
50632
50633     /* these values are updated also during re-init in v4l2_process_driver_event */
50634-    frame->height = avbuf->context->height;
50635-    frame->width = avbuf->context->width;
50636-    frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
50637+    frame->height = ctx->height;
50638+    frame->width = ctx->width;
50639+    frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
50640+
50641+    if (ctx->selection.height && ctx->selection.width) {
50642+        frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
50643+        frame->crop_top  = ctx->selection.top < frame->height ? ctx->selection.top  : 0;
50644+        frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
50645+            frame->width - (ctx->selection.left + ctx->selection.width) : 0;
50646+        frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
50647+            frame->height - (ctx->selection.top + ctx->selection.height) : 0;
50648+    }
50649
50650     /* 3. report errors upstream */
50651     if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
50652@@ -452,15 +803,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
50653
50654 int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
50655 {
50656-    int ret;
50657-
50658     av_packet_unref(pkt);
50659-    ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
50660-    if (ret)
50661-        return ret;
50662+
50663+    pkt->buf = wrap_avbuf(avbuf);
50664+    if (pkt->buf == NULL)
50665+        return AVERROR(ENOMEM);
50666
50667     pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
50668-    pkt->data = pkt->buf->data;
50669+    pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
50670+    pkt->flags = 0;
50671
50672     if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
50673         pkt->flags |= AV_PKT_FLAG_KEY;
50674@@ -475,31 +826,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
50675     return 0;
50676 }
50677
50678-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
50679+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
50680+                                    const void *extdata, size_t extlen,
50681+                                    const int64_t timestamp)
50682 {
50683     int ret;
50684
50685-    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf);
50686-    if (ret)
50687+    if (extlen) {
50688+        ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
50689+        if (ret)
50690+            return ret;
50691+    }
50692+
50693+    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
50694+    if (ret && ret != AVERROR(ENOMEM))
50695         return ret;
50696
50697-    v4l2_set_pts(out, pkt->pts);
50698+    if (timestamp)
50699+        out->buf.timestamp = tv_from_int(timestamp);
50700+    else
50701+        v4l2_set_pts(out, pkt->pts);
50702+
50703+    out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
50704+        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
50705+        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
50706
50707-    if (pkt->flags & AV_PKT_FLAG_KEY)
50708-        out->flags = V4L2_BUF_FLAG_KEYFRAME;
50709+    return ret;
50710+}
50711
50712-    return 0;
50713+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
50714+{
50715+    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
50716+}
50717+
50718+
50719+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
50720+{
50721+    V4L2Buffer * const avbuf = (V4L2Buffer *)data;
50722+    int i;
50723+
50724+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
50725+        struct V4L2Plane_info *p = avbuf->plane_info + i;
50726+        if (p->mm_addr != NULL)
50727+            munmap(p->mm_addr, p->length);
50728+    }
50729+
50730+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
50731+        if (avbuf->drm_frame.objects[i].fd != -1)
50732+            close(avbuf->drm_frame.objects[i].fd);
50733+    }
50734+
50735+    av_buffer_unref(&avbuf->ref_buf);
50736+
50737+    ff_weak_link_unref(&avbuf->context_wl);
50738+
50739+    av_free(avbuf);
50740 }
50741
50742-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
50743+
50744+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
50745 {
50746-    V4L2Context *ctx = avbuf->context;
50747     int ret, i;
50748+    V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
50749+    AVBufferRef * bufref;
50750
50751-    avbuf->buf.memory = V4L2_MEMORY_MMAP;
50752+    *pbufref = NULL;
50753+    if (avbuf == NULL)
50754+        return AVERROR(ENOMEM);
50755+
50756+    bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
50757+    if (bufref == NULL) {
50758+        av_free(avbuf);
50759+        return AVERROR(ENOMEM);
50760+    }
50761+
50762+    avbuf->context = ctx;
50763+    avbuf->buf.memory = mem;
50764     avbuf->buf.type = ctx->type;
50765     avbuf->buf.index = index;
50766
50767+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
50768+        avbuf->drm_frame.objects[i].fd = -1;
50769+    }
50770+
50771+    avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
50772+
50773     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
50774         avbuf->buf.length = VIDEO_MAX_PLANES;
50775         avbuf->buf.m.planes = avbuf->planes;
50776@@ -507,7 +918,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
50777
50778     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
50779     if (ret < 0)
50780-        return AVERROR(errno);
50781+        goto fail;
50782
50783     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
50784         avbuf->num_planes = 0;
50785@@ -520,6 +931,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
50786         avbuf->num_planes = 1;
50787
50788     for (i = 0; i < avbuf->num_planes; i++) {
50789+        const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
50790+            (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
50791
50792         avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
50793             ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
50794@@ -527,25 +940,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
50795
50796         if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
50797             avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
50798-            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
50799-                                           PROT_READ | PROT_WRITE, MAP_SHARED,
50800-                                           buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
50801+
50802+            if (want_mmap)
50803+                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
50804+                                               PROT_READ | PROT_WRITE, MAP_SHARED,
50805+                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
50806         } else {
50807             avbuf->plane_info[i].length = avbuf->buf.length;
50808-            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
50809-                                          PROT_READ | PROT_WRITE, MAP_SHARED,
50810-                                          buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
50811+
50812+            if (want_mmap)
50813+                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
50814+                                               PROT_READ | PROT_WRITE, MAP_SHARED,
50815+                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
50816         }
50817
50818-        if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
50819-            return AVERROR(ENOMEM);
50820+        if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
50821+            avbuf->plane_info[i].mm_addr = NULL;
50822+            ret = AVERROR(ENOMEM);
50823+            goto fail;
50824+        }
50825     }
50826
50827     avbuf->status = V4L2BUF_AVAILABLE;
50828
50829-    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
50830-        return 0;
50831-
50832     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
50833         avbuf->buf.m.planes = avbuf->planes;
50834         avbuf->buf.length   = avbuf->num_planes;
50835@@ -555,20 +972,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
50836         avbuf->buf.length    = avbuf->planes[0].length;
50837     }
50838
50839-    return ff_v4l2_buffer_enqueue(avbuf);
50840+    if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
50841+        if (buf_to_m2mctx(avbuf)->output_drm) {
50842+            ret = v4l2_buffer_export_drm(avbuf);
50843+            if (ret)
50844+                    goto fail;
50845+        }
50846+    }
50847+
50848+    *pbufref = bufref;
50849+    return 0;
50850+
50851+fail:
50852+    av_buffer_unref(&bufref);
50853+    return ret;
50854 }
50855
50856 int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
50857 {
50858     int ret;
50859+    int qc;
50860
50861-    avbuf->buf.flags = avbuf->flags;
50862+    if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
50863+        av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
50864+               avbuf->context->name, avbuf->buf.index,
50865+               avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
50866+               avbuf->context->q_count);
50867+    }
50868
50869     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
50870-    if (ret < 0)
50871-        return AVERROR(errno);
50872+    if (ret < 0) {
50873+        int err = errno;
50874+        av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
50875+               avbuf->context->name, avbuf->buf.index,
50876+               err, strerror(err));
50877+        return AVERROR(err);
50878+    }
50879
50880+    // Lock not wanted - if called from buffer free then lock already obtained
50881+    qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
50882     avbuf->status = V4L2BUF_IN_DRIVER;
50883+    pthread_cond_broadcast(&avbuf->context->cond);
50884+
50885+    av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
50886+           avbuf->context->name, avbuf->buf.index,
50887+           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
50888
50889     return 0;
50890 }
50891--- a/libavcodec/v4l2_buffers.h
50892+++ b/libavcodec/v4l2_buffers.h
50893@@ -27,25 +27,38 @@
50894 #include <stdatomic.h>
50895 #include <linux/videodev2.h>
50896
50897+#include "libavutil/hwcontext_drm.h"
50898 #include "avcodec.h"
50899
50900 enum V4L2Buffer_status {
50901     V4L2BUF_AVAILABLE,
50902     V4L2BUF_IN_DRIVER,
50903+    V4L2BUF_IN_USE,
50904     V4L2BUF_RET_USER,
50905 };
50906
50907 /**
50908  * V4L2Buffer (wrapper for v4l2_buffer management)
50909  */
50910+struct V4L2Context;
50911+struct ff_weak_link_client;
50912+
50913 typedef struct V4L2Buffer {
50914-    /* each buffer needs to have a reference to its context */
50915+    /* each buffer needs to have a reference to its context
50916+     * The pointer is good enough for most operation but once the buffer has
50917+     * been passed to the user the buffer may become orphaned so for free ops
50918+     * the weak link must be used to ensure that the context is actually
50919+     * there
50920+     */
50921     struct V4L2Context *context;
50922+    struct ff_weak_link_client *context_wl;
50923
50924-    /* This object is refcounted per-plane, so we need to keep track
50925-     * of how many context-refs we are holding. */
50926-    AVBufferRef *context_ref;
50927-    atomic_uint context_refcount;
50928+    /* DRM descriptor */
50929+    AVDRMFrameDescriptor drm_frame;
50930+    /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
50931+     * are done
50932+     */
50933+    AVBufferRef * ref_buf;
50934
50935     /* keep track of the mmap address and mmap length */
50936     struct V4L2Plane_info {
50937@@ -60,7 +73,6 @@ typedef struct V4L2Buffer {
50938     struct v4l2_buffer buf;
50939     struct v4l2_plane planes[VIDEO_MAX_PLANES];
50940
50941-    int flags;
50942     enum V4L2Buffer_status status;
50943
50944 } V4L2Buffer;
50945@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
50946  */
50947 int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
50948
50949+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
50950+                                    const void *extdata, size_t extlen,
50951+                                    const int64_t timestamp);
50952+
50953 /**
50954  * Extracts the data from an AVFrame to a V4L2Buffer
50955  *
50956@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV
50957  *
50958  * @returns 0 in case of success, a negative AVERROR code otherwise
50959  */
50960-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
50961+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
50962
50963 /**
50964  * Initializes a V4L2Buffer
50965@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const
50966  *
50967  * @returns 0 in case of success, a negative AVERROR code otherwise
50968  */
50969-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
50970+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
50971
50972 /**
50973  * Enqueues a V4L2Buffer
50974@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
50975  */
50976 int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
50977
50978+static inline void
50979+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
50980+{
50981+    avbuf->status = V4L2BUF_AVAILABLE;
50982+    av_buffer_unref(&avbuf->ref_buf);
50983+}
50984+
50985
50986 #endif // AVCODEC_V4L2_BUFFERS_H
50987--- a/libavcodec/v4l2_context.c
50988+++ b/libavcodec/v4l2_context.c
50989@@ -27,11 +27,13 @@
50990 #include <unistd.h>
50991 #include <fcntl.h>
50992 #include <poll.h>
50993+#include "libavutil/avassert.h"
50994 #include "libavcodec/avcodec.h"
50995 #include "libavcodec/internal.h"
50996 #include "v4l2_buffers.h"
50997 #include "v4l2_fmt.h"
50998 #include "v4l2_m2m.h"
50999+#include "weak_link.h"
51000
51001 struct v4l2_format_update {
51002     uint32_t v4l2_fmt;
51003@@ -41,26 +43,168 @@ struct v4l2_format_update {
51004     int update_avfmt;
51005 };
51006
51007-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
51008+
51009+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
51010 {
51011-    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
51012-        container_of(ctx, V4L2m2mContext, output) :
51013-        container_of(ctx, V4L2m2mContext, capture);
51014+    return (int64_t)n;
51015 }
51016
51017-static inline AVCodecContext *logger(V4L2Context *ctx)
51018+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
51019 {
51020-    return ctx_to_m2mctx(ctx)->avctx;
51021+    return (unsigned int)pts;
51022 }
51023
51024-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
51025+// FFmpeg requires us to propagate a number of vars from the coded pkt into
51026+// the decoded frame. The only thing that tracks like that in V4L2 stateful
51027+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
51028+// guarantees about PTS being unique or specified for every frame so replace
51029+// the supplied PTS with a simple incrementing number and keep a circular
51030+// buffer of all the things we want preserved (including the original PTS)
51031+// indexed by the tracking no.
51032+static int64_t
51033+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
51034 {
51035-    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
51036+    int64_t track_pts;
51037+
51038+    // Avoid 0
51039+    if (++x->track_no == 0)
51040+        x->track_no = 1;
51041+
51042+    track_pts = track_to_pts(avctx, x->track_no);
51043+
51044+    av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
51045+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
51046+        .discard          = 0,
51047+        .pending          = 1,
51048+        .pkt_size         = avpkt->size,
51049+        .pts              = avpkt->pts,
51050+        .dts              = avpkt->dts,
51051+        .reordered_opaque = avctx->reordered_opaque,
51052+        .pkt_pos          = avpkt->pos,
51053+        .pkt_duration     = avpkt->duration,
51054+        .track_pts        = track_pts
51055+    };
51056+    return track_pts;
51057 }
51058
51059-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
51060+static int64_t
51061+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
51062 {
51063-    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
51064+    int64_t track_pts;
51065+
51066+    // Avoid 0
51067+    if (++x->track_no == 0)
51068+        x->track_no = 1;
51069+
51070+    track_pts = track_to_pts(avctx, x->track_no);
51071+
51072+    av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
51073+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
51074+        .discard          = 0,
51075+        .pending          = 1,
51076+        .pkt_size         = 0,
51077+        .pts              = frame->pts,
51078+        .dts              = AV_NOPTS_VALUE,
51079+        .reordered_opaque = frame->reordered_opaque,
51080+        .pkt_pos          = frame->pkt_pos,
51081+        .pkt_duration     = frame->pkt_duration,
51082+        .track_pts        = track_pts
51083+    };
51084+    return track_pts;
51085+}
51086+
51087+
51088+// Returns -1 if we should discard the frame
51089+static int
51090+xlat_pts_frame_out(AVCodecContext *const avctx,
51091+             xlat_track_t * const x,
51092+             AVFrame *const frame)
51093+{
51094+    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
51095+    V4L2m2mTrackEl *const t = x->track_els + n;
51096+    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
51097+    {
51098+        av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
51099+               "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
51100+        frame->pts              = AV_NOPTS_VALUE;
51101+        frame->pkt_dts          = AV_NOPTS_VALUE;
51102+        frame->reordered_opaque = x->last_opaque;
51103+        frame->pkt_pos          = -1;
51104+        frame->pkt_duration     = 0;
51105+        frame->pkt_size         = -1;
51106+    }
51107+    else if (!t->discard)
51108+    {
51109+        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
51110+        frame->pkt_dts          = t->dts;
51111+        frame->reordered_opaque = t->reordered_opaque;
51112+        frame->pkt_pos          = t->pkt_pos;
51113+        frame->pkt_duration     = t->pkt_duration;
51114+        frame->pkt_size         = t->pkt_size;
51115+
51116+        x->last_opaque = x->track_els[n].reordered_opaque;
51117+        if (frame->pts != AV_NOPTS_VALUE)
51118+            x->last_pts = frame->pts;
51119+        t->pending = 0;
51120+    }
51121+    else
51122+    {
51123+        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
51124+        return -1;
51125+    }
51126+
51127+    av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
51128+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
51129+    return 0;
51130+}
51131+
51132+// Returns -1 if we should discard the frame
51133+static int
51134+xlat_pts_pkt_out(AVCodecContext *const avctx,
51135+             xlat_track_t * const x,
51136+             AVPacket *const pkt)
51137+{
51138+    unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
51139+    V4L2m2mTrackEl *const t = x->track_els + n;
51140+    if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
51141+    {
51142+        av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
51143+               "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
51144+        pkt->pts                = AV_NOPTS_VALUE;
51145+    }
51146+    else if (!t->discard)
51147+    {
51148+        pkt->pts                = t->pending ? t->pts : AV_NOPTS_VALUE;
51149+
51150+        x->last_opaque = x->track_els[n].reordered_opaque;
51151+        if (pkt->pts != AV_NOPTS_VALUE)
51152+            x->last_pts = pkt->pts;
51153+        t->pending = 0;
51154+    }
51155+    else
51156+    {
51157+        av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
51158+        return -1;
51159+    }
51160+
51161+    // * Would like something much better than this...xlat(offset + out_count)?
51162+    pkt->dts = pkt->pts;
51163+    av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
51164+           pkt->pts, t->track_pts, n);
51165+    return 0;
51166+}
51167+
51168+
51169+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
51170+{
51171+    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
51172+        container_of(ctx, V4L2m2mContext, output) :
51173+        container_of(ctx, V4L2m2mContext, capture);
51174+}
51175+
51176+static inline AVCodecContext *logger(const V4L2Context *ctx)
51177+{
51178+    return ctx_to_m2mctx(ctx)->avctx;
51179 }
51180
51181 static AVRational v4l2_get_sar(V4L2Context *ctx)
51182@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte
51183     return sar;
51184 }
51185
51186-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
51187+static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
51188+{
51189+    return ctx->bufrefs != NULL;
51190+}
51191+
51192+// Width/Height changed or we don't have an alloc in the first place?
51193+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
51194 {
51195-    struct v4l2_format *fmt1 = &ctx->format;
51196-    int ret =  V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
51197-        fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
51198-        fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
51199-        :
51200-        fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
51201-        fmt1->fmt.pix.height != fmt2->fmt.pix.height;
51202+    const struct v4l2_format *fmt1 = &ctx->format;
51203+    int ret = !ctx_buffers_alloced(ctx) ||
51204+        (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
51205+            fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
51206+            fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
51207+            :
51208+            fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
51209+            fmt1->fmt.pix.height != fmt2->fmt.pix.height);
51210
51211     if (ret)
51212-        av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
51213+        av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
51214             ctx->name,
51215-            v4l2_get_width(fmt1), v4l2_get_height(fmt1),
51216-            v4l2_get_width(fmt2), v4l2_get_height(fmt2));
51217+            ctx_buffers_alloced(ctx),
51218+            ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
51219+            ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
51220
51221     return ret;
51222 }
51223@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(
51224     }
51225 }
51226
51227-/**
51228- * handle resolution change event and end of stream event
51229- * returns 1 if reinit was successful, negative if it failed
51230- * returns 0 if reinit was not executed
51231- */
51232-static int v4l2_handle_event(V4L2Context *ctx)
51233+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
51234 {
51235-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
51236-    struct v4l2_format cap_fmt = s->capture.format;
51237-    struct v4l2_format out_fmt = s->output.format;
51238-    struct v4l2_event evt = { 0 };
51239-    int full_reinit, reinit, ret;
51240+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
51241+    struct v4l2_selection selection = {
51242+        .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
51243+        .target = V4L2_SEL_TGT_COMPOSE
51244+    };
51245
51246-    ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
51247-    if (ret < 0) {
51248-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
51249-        return 0;
51250-    }
51251+    memset(r, 0, sizeof(*r));
51252+    if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
51253+        return AVERROR(errno);
51254
51255-    if (evt.type == V4L2_EVENT_EOS) {
51256-        ctx->done = 1;
51257-        return 0;
51258-    }
51259+    *r = selection.r;
51260+    return 0;
51261+}
51262
51263-    if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
51264-        return 0;
51265+static int do_source_change(V4L2m2mContext * const s)
51266+{
51267+    AVCodecContext *const avctx = s->avctx;
51268
51269-    ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
51270-    if (ret) {
51271-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
51272-        return 0;
51273-    }
51274+    int ret;
51275+    int reinit;
51276+    struct v4l2_format cap_fmt = s->capture.format;
51277+
51278+    s->capture.done = 0;
51279
51280     ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
51281     if (ret) {
51282-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
51283+        av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
51284         return 0;
51285     }
51286
51287-    full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
51288-    if (full_reinit) {
51289-        s->output.height = v4l2_get_height(&out_fmt);
51290-        s->output.width = v4l2_get_width(&out_fmt);
51291-        s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
51292-    }
51293+    get_default_selection(&s->capture, &s->capture.selection);
51294+
51295+    reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
51296+    if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
51297+        reinit = 1;
51298
51299-    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
51300+    s->capture.format = cap_fmt;
51301     if (reinit) {
51302-        s->capture.height = v4l2_get_height(&cap_fmt);
51303-        s->capture.width = v4l2_get_width(&cap_fmt);
51304-        s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
51305+        s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
51306+        s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
51307     }
51308
51309-    if (full_reinit || reinit)
51310-        s->reinit = 1;
51311-
51312-    if (full_reinit) {
51313-        ret = ff_v4l2_m2m_codec_full_reinit(s);
51314-        if (ret) {
51315-            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
51316-            return AVERROR(EINVAL);
51317-        }
51318-        goto reinit_run;
51319+    // If we don't support selection (or it is bust) and we obviously have HD then kludge
51320+    if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
51321+        (s->capture.height == 1088 && s->capture.width == 1920)) {
51322+        s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
51323     }
51324
51325+    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
51326+
51327+    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
51328+           s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
51329+           s->capture.width, s->capture.height,
51330+           s->capture.selection.width, s->capture.selection.height,
51331+           s->capture.selection.left, s->capture.selection.top, reinit);
51332+
51333     if (reinit) {
51334-        if (s->avctx)
51335-            ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
51336+        if (avctx)
51337+            ret = ff_set_dimensions(s->avctx,
51338+                                    s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
51339+                                    s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
51340         if (ret < 0)
51341-            av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
51342+            av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
51343
51344         ret = ff_v4l2_m2m_codec_reinit(s);
51345         if (ret) {
51346-            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
51347+            av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
51348             return AVERROR(EINVAL);
51349         }
51350+
51351+        if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
51352+            s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
51353+            av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
51354+                   s->capture.width, s->capture.height,
51355+                   ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
51356+            return AVERROR(EINVAL);
51357+        }
51358+
51359+        // Update pixel format - should only actually do something on initial change
51360+        s->capture.av_pix_fmt =
51361+            ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
51362+        if (s->output_drm) {
51363+            avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
51364+            avctx->sw_pix_fmt = s->capture.av_pix_fmt;
51365+        }
51366+        else
51367+            avctx->pix_fmt = s->capture.av_pix_fmt;
51368+
51369         goto reinit_run;
51370     }
51371
51372-    /* dummy event received */
51373-    return 0;
51374+    /* Buffers are OK so just stream off to ack */
51375+    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
51376+
51377+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
51378+    if (ret)
51379+        av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
51380+    s->draining = 0;
51381
51382     /* reinit executed */
51383 reinit_run:
51384+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
51385     return 1;
51386 }
51387
51388@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context
51389     return 0;
51390 }
51391
51392-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
51393-{
51394-    struct v4l2_plane planes[VIDEO_MAX_PLANES];
51395-    struct v4l2_buffer buf = { 0 };
51396-    V4L2Buffer *avbuf;
51397-    struct pollfd pfd = {
51398-        .events =  POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
51399-        .fd = ctx_to_m2mctx(ctx)->fd,
51400+// DQ a buffer
51401+// Amalgamates all the various ways there are of signalling EOS/Event to
51402+// generate a consistent EPIPE.
51403+//
51404+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
51405+//
51406+// Returns:
51407+//  0               Success
51408+//  AVERROR(EPIPE)  Nothing more to read
51409+//  AVERROR(ENOSPC) No buffers in Q to put result in
51410+//  *               AVERROR(..)
51411+
51412+static int
51413+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
51414+{
51415+    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
51416+    AVCodecContext * const avctx = m->avctx;
51417+    V4L2Buffer * avbuf;
51418+    const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
51419+
51420+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
51421+
51422+    struct v4l2_buffer buf = {
51423+        .type = ctx->type,
51424+        .memory = V4L2_MEMORY_MMAP,
51425     };
51426-    int i, ret;
51427
51428-    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
51429-        for (i = 0; i < ctx->num_buffers; i++) {
51430-            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
51431-                break;
51432-        }
51433-        if (i == ctx->num_buffers)
51434-            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
51435-                                                "userspace. Increase num_capture_buffers "
51436-                                                "to prevent device deadlock or dropped "
51437-                                                "packets/frames.\n");
51438-    }
51439-
51440-    /* if we are draining and there are no more capture buffers queued in the driver we are done */
51441-    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
51442-        for (i = 0; i < ctx->num_buffers; i++) {
51443-            /* capture buffer initialization happens during decode hence
51444-             * detection happens at runtime
51445-             */
51446-            if (!ctx->buffers)
51447-                break;
51448-
51449-            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
51450-                goto start;
51451-        }
51452-        ctx->done = 1;
51453-        return NULL;
51454-    }
51455-
51456-start:
51457-    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
51458-        pfd.events =  POLLOUT | POLLWRNORM;
51459-    else {
51460-        /* no need to listen to requests for more input while draining */
51461-        if (ctx_to_m2mctx(ctx)->draining)
51462-            pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
51463+    *ppavbuf = NULL;
51464+
51465+    if (ctx->flag_last)
51466+        return AVERROR(EPIPE);
51467+
51468+    if (is_mp) {
51469+        buf.length = VIDEO_MAX_PLANES;
51470+        buf.m.planes = planes;
51471     }
51472
51473-    for (;;) {
51474-        ret = poll(&pfd, 1, timeout);
51475-        if (ret > 0)
51476-            break;
51477-        if (errno == EINTR)
51478-            continue;
51479-        return NULL;
51480+    while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
51481+        const int err = errno;
51482+        av_assert0(AVERROR(err) < 0);
51483+        if (err != EINTR) {
51484+            av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
51485+                ctx->name, av_err2str(AVERROR(err)));
51486+
51487+            if (err == EPIPE)
51488+                ctx->flag_last = 1;
51489+
51490+            return AVERROR(err);
51491+        }
51492     }
51493+    atomic_fetch_sub(&ctx->q_count, 1);
51494
51495-    /* 0. handle errors */
51496-    if (pfd.revents & POLLERR) {
51497-        /* if we are trying to get free buffers but none have been queued yet
51498-           no need to raise a warning */
51499-        if (timeout == 0) {
51500-            for (i = 0; i < ctx->num_buffers; i++) {
51501-                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
51502-                    av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
51503-            }
51504+    avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
51505+    ff_v4l2_buffer_set_avail(avbuf);
51506+    avbuf->buf = buf;
51507+    if (is_mp) {
51508+        memcpy(avbuf->planes, planes, sizeof(planes));
51509+        avbuf->buf.m.planes = avbuf->planes;
51510+    }
51511+    // Done with any attached buffer
51512+    av_buffer_unref(&avbuf->ref_buf);
51513+
51514+    if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
51515+        // Zero length cap buffer return == EOS
51516+        if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
51517+            av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
51518+
51519+            // Must reQ so we don't leak
51520+            // May not matter if the next thing we do is release all the
51521+            // buffers but better to be tidy.
51522+            ff_v4l2_buffer_enqueue(avbuf);
51523+
51524+            ctx->flag_last = 1;
51525+            return AVERROR(EPIPE);
51526         }
51527-        else
51528-            av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
51529
51530-        return NULL;
51531+#ifdef V4L2_BUF_FLAG_LAST
51532+        // If flag_last set then this contains data but is the last frame
51533+        // so remember that but return OK
51534+        if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
51535+            ctx->flag_last = 1;
51536+#endif
51537     }
51538
51539-    /* 1. handle resolution changes */
51540-    if (pfd.revents & POLLPRI) {
51541-        ret = v4l2_handle_event(ctx);
51542-        if (ret < 0) {
51543-            /* if re-init failed, abort */
51544-            ctx->done = 1;
51545-            return NULL;
51546-        }
51547-        if (ret) {
51548-            /* if re-init was successful drop the buffer (if there was one)
51549-             * since we had to reconfigure capture (unmap all buffers)
51550-             */
51551-            return NULL;
51552+    *ppavbuf = avbuf;
51553+    return 0;
51554+}
51555+
51556+/**
51557+ * handle resolution change event and end of stream event
51558+ * Expects to be called after the stream has stopped
51559+ *
51560+ * returns 1 if reinit was successful, negative if it failed
51561+ * returns 0 if reinit was not executed
51562+ */
51563+static int
51564+get_event(V4L2m2mContext * const m)
51565+{
51566+    AVCodecContext * const avctx = m->avctx;
51567+    struct v4l2_event evt = { 0 };
51568+
51569+    while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
51570+        const int rv = AVERROR(errno);
51571+        if (rv == AVERROR(EINTR))
51572+            continue;
51573+        if (rv == AVERROR(EAGAIN)) {
51574+            av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
51575+            return AVERROR_EOF;
51576         }
51577+        av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
51578+        return rv;
51579     }
51580
51581-    /* 2. dequeue the buffer */
51582-    if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
51583+    av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
51584
51585-        if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
51586-            /* there is a capture buffer ready */
51587-            if (pfd.revents & (POLLIN | POLLRDNORM))
51588-                goto dequeue;
51589+    if (evt.type == V4L2_EVENT_EOS) {
51590+        av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
51591+        return AVERROR_EOF;
51592+    }
51593+
51594+    if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
51595+        return do_source_change(m);
51596+
51597+    return 0;
51598+}
51599+
51600+
51601+// Get a buffer
51602+// If output then just gets the buffer in the expected way
51603+// If capture then runs the capture state m/c to deal with res change etc.
51604+// If return value == 0 then *ppavbuf != NULL
51605+
51606+static int
51607+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
51608+{
51609+    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
51610+    AVCodecContext * const avctx = m->avctx;
51611+    const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
51612+
51613+    const unsigned int poll_cap = (POLLIN | POLLRDNORM);
51614+    const unsigned int poll_out = (POLLOUT | POLLWRNORM);
51615+    const unsigned int poll_event = POLLPRI;
51616+
51617+    *ppavbuf = NULL;
51618
51619-            /* the driver is ready to accept more input; instead of waiting for the capture
51620-             * buffer to complete we return NULL so input can proceed (we are single threaded)
51621-             */
51622-            if (pfd.revents & (POLLOUT | POLLWRNORM))
51623-                return NULL;
51624+    for (;;) {
51625+        struct pollfd pfd = {
51626+            .fd = m->fd,
51627+            // If capture && stream not started then assume we are waiting for the initial event
51628+            .events = !is_cap ? poll_out :
51629+                !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
51630+                    poll_event,
51631+        };
51632+        int ret;
51633+
51634+        if (ctx->done) {
51635+            av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
51636+            return AVERROR_EOF;
51637         }
51638
51639-dequeue:
51640-        memset(&buf, 0, sizeof(buf));
51641-        buf.memory = V4L2_MEMORY_MMAP;
51642-        buf.type = ctx->type;
51643-        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
51644-            memset(planes, 0, sizeof(planes));
51645-            buf.length = VIDEO_MAX_PLANES;
51646-            buf.m.planes = planes;
51647+        // If capture && timeout == -1 then also wait for rx buffer free
51648+        if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
51649+            pfd.events |= poll_out;
51650+
51651+        // If nothing Qed all we will get is POLLERR - avoid that
51652+        if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
51653+            (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
51654+            (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
51655+            av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
51656+            return AVERROR(ENOSPC);
51657         }
51658
51659-        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
51660-        if (ret) {
51661-            if (errno != EAGAIN) {
51662-                ctx->done = 1;
51663-                if (errno != EPIPE)
51664-                    av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
51665-                        ctx->name, av_err2str(AVERROR(errno)));
51666+        // Timeout kludged s.t. "forever" eventually gives up & produces logging
51667+        // If waiting for an event when we have seen a last_frame then we expect
51668+        //   it to be ready already so force a short timeout
51669+        ret = poll(&pfd, 1,
51670+                   ff_v4l2_ctx_eos(ctx) ? 10 :
51671+                   timeout == -1 ? 3000 : timeout);
51672+        if (ret < 0) {
51673+            ret = AVERROR(errno);  // Remember errno before logging etc.
51674+            av_assert0(ret < 0);
51675+        }
51676+
51677+        av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
51678+               ctx->name, ret, timeout, pfd.events, pfd.revents);
51679+
51680+        if (ret < 0) {
51681+            if (ret == AVERROR(EINTR))
51682+                continue;
51683+            av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
51684+            return ret;
51685+        }
51686+
51687+        if (ret == 0) {
51688+            if (timeout == -1)
51689+                av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
51690+            if (ff_v4l2_ctx_eos(ctx)) {
51691+                av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
51692+                ret = get_event(m);
51693+                if (ret < 0) {
51694+                    ctx->done = 1;
51695+                    return ret;
51696+                }
51697             }
51698-            return NULL;
51699+            return AVERROR(EAGAIN);
51700         }
51701
51702-        if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
51703-            int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
51704-                            buf.m.planes[0].bytesused : buf.bytesused;
51705-            if (bytesused == 0) {
51706+        if ((pfd.revents & POLLERR) != 0) {
51707+            av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
51708+            return AVERROR_UNKNOWN;
51709+        }
51710+
51711+        if ((pfd.revents & poll_event) != 0) {
51712+            ret = get_event(m);
51713+            if (ret < 0) {
51714                 ctx->done = 1;
51715-                return NULL;
51716+                return ret;
51717             }
51718-#ifdef V4L2_BUF_FLAG_LAST
51719-            if (buf.flags & V4L2_BUF_FLAG_LAST)
51720-                ctx->done = 1;
51721-#endif
51722+            continue;
51723+        }
51724+
51725+        if ((pfd.revents & poll_cap) != 0) {
51726+            ret = dq_buf(ctx, ppavbuf);
51727+            if (ret == AVERROR(EPIPE))
51728+                continue;
51729+            return ret;
51730         }
51731
51732-        avbuf = &ctx->buffers[buf.index];
51733-        avbuf->status = V4L2BUF_AVAILABLE;
51734-        avbuf->buf = buf;
51735-        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
51736-            memcpy(avbuf->planes, planes, sizeof(planes));
51737-            avbuf->buf.m.planes = avbuf->planes;
51738+        if ((pfd.revents & poll_out) != 0) {
51739+            if (is_cap)
51740+                return AVERROR(EAGAIN);
51741+            return dq_buf(ctx, ppavbuf);
51742         }
51743-        return avbuf;
51744+
51745+        av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
51746+        return AVERROR_UNKNOWN;
51747     }
51748+}
51749
51750-    return NULL;
51751+// Clear out flags and timestamps that should be set by the user
51752+// Returns the passed avbuf
51753+static V4L2Buffer *
51754+clean_v4l2_buffer(V4L2Buffer * const avbuf)
51755+{
51756+    struct v4l2_buffer *const buf = &avbuf->buf;
51757+
51758+    buf->flags = 0;
51759+    buf->field = V4L2_FIELD_ANY;
51760+    buf->timestamp = (struct timeval){0};
51761+    buf->timecode = (struct v4l2_timecode){0};
51762+    buf->sequence = 0;
51763+
51764+    return avbuf;
51765 }
51766
51767 static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
51768 {
51769-    int timeout = 0; /* return when no more buffers to dequeue */
51770     int i;
51771
51772     /* get back as many output buffers as possible */
51773     if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
51774-          do {
51775-          } while (v4l2_dequeue_v4l2buf(ctx, timeout));
51776+        V4L2Buffer * avbuf;
51777+        do {
51778+            get_qbuf(ctx, &avbuf, 0);
51779+        } while (avbuf);
51780     }
51781
51782     for (i = 0; i < ctx->num_buffers; i++) {
51783-        if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
51784-            return &ctx->buffers[i];
51785+        V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
51786+        if (avbuf->status == V4L2BUF_AVAILABLE)
51787+            return clean_v4l2_buffer(avbuf);
51788     }
51789
51790     return NULL;
51791@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
51792
51793 static int v4l2_release_buffers(V4L2Context* ctx)
51794 {
51795-    struct v4l2_requestbuffers req = {
51796-        .memory = V4L2_MEMORY_MMAP,
51797-        .type = ctx->type,
51798-        .count = 0, /* 0 -> unmaps buffers from the driver */
51799-    };
51800-    int i, j;
51801+    int i;
51802+    int ret = 0;
51803+    const int fd = ctx_to_m2mctx(ctx)->fd;
51804
51805-    for (i = 0; i < ctx->num_buffers; i++) {
51806-        V4L2Buffer *buffer = &ctx->buffers[i];
51807+    // Orphan any buffers in the wild
51808+    ff_weak_link_break(&ctx->wl_master);
51809
51810-        for (j = 0; j < buffer->num_planes; j++) {
51811-            struct V4L2Plane_info *p = &buffer->plane_info[j];
51812-            if (p->mm_addr && p->length)
51813-                if (munmap(p->mm_addr, p->length) < 0)
51814-                    av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
51815+    if (ctx->bufrefs) {
51816+        for (i = 0; i < ctx->num_buffers; i++)
51817+            av_buffer_unref(ctx->bufrefs + i);
51818+    }
51819+
51820+    if (fd != -1) {
51821+        struct v4l2_requestbuffers req = {
51822+            .memory = V4L2_MEMORY_MMAP,
51823+            .type = ctx->type,
51824+            .count = 0, /* 0 -> unmap all buffers from the driver */
51825+        };
51826+
51827+        while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
51828+            if (errno == EINTR)
51829+                continue;
51830+
51831+            ret = AVERROR(errno);
51832+
51833+            av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
51834+                ctx->name, av_err2str(AVERROR(errno)));
51835+
51836+            if (ctx_to_m2mctx(ctx)->output_drm)
51837+                av_log(logger(ctx), AV_LOG_ERROR,
51838+                    "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
51839+                    "for all buffers: \n"
51840+                    "  1. drmModeRmFB(..)\n"
51841+                    "  2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
51842         }
51843     }
51844+    atomic_store(&ctx->q_count, 0);
51845
51846-    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
51847+    return ret;
51848 }
51849
51850 static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
51851@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4
51852
51853 static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
51854 {
51855+    V4L2m2mContext* s = ctx_to_m2mctx(ctx);
51856+    V4L2m2mPriv *priv = s->avctx->priv_data;
51857     enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
51858     struct v4l2_fmtdesc fdesc;
51859     int ret;
51860@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte
51861         if (ret)
51862             return AVERROR(EINVAL);
51863
51864+        if (priv->pix_fmt != AV_PIX_FMT_NONE) {
51865+            if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
51866+                fdesc.index++;
51867+                continue;
51868+            }
51869+        }
51870+
51871         pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
51872         ret = v4l2_try_raw_format(ctx, pixfmt);
51873         if (ret){
51874@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con
51875   *
51876   *****************************************************************************/
51877
51878+
51879+static void flush_all_buffers_status(V4L2Context* const ctx)
51880+{
51881+    int i;
51882+
51883+    if (!ctx->bufrefs)
51884+        return;
51885+
51886+    for (i = 0; i < ctx->num_buffers; ++i) {
51887+        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
51888+        if (buf->status == V4L2BUF_IN_DRIVER)
51889+            ff_v4l2_buffer_set_avail(buf);
51890+    }
51891+    atomic_store(&ctx->q_count, 0);
51892+}
51893+
51894+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
51895+{
51896+    int i;
51897+    int rv;
51898+
51899+    if (!ctx->bufrefs) {
51900+        rv = ff_v4l2_context_init(ctx);
51901+        if (rv) {
51902+            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
51903+            return rv;
51904+        }
51905+    }
51906+
51907+    for (i = 0; i < ctx->num_buffers; ++i) {
51908+        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
51909+        if (buf->status == V4L2BUF_AVAILABLE) {
51910+            rv = ff_v4l2_buffer_enqueue(buf);
51911+            if (rv < 0)
51912+                return rv;
51913+        }
51914+    }
51915+    return 0;
51916+}
51917+
51918 int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
51919 {
51920     int type = ctx->type;
51921-    int ret;
51922+    int ret = 0;
51923+    AVCodecContext * const avctx = logger(ctx);
51924
51925-    ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
51926-    if (ret < 0)
51927-        return AVERROR(errno);
51928+    // Avoid doing anything if there is nothing we can do
51929+    if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
51930+        return 0;
51931
51932-    ctx->streamon = (cmd == VIDIOC_STREAMON);
51933+    ff_mutex_lock(&ctx->lock);
51934
51935-    return 0;
51936+    if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
51937+        stuff_all_buffers(avctx, ctx);
51938+
51939+    if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
51940+        const int err = errno;
51941+        av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
51942+               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
51943+        ret = AVERROR(err);
51944+    }
51945+    else
51946+    {
51947+        if (cmd == VIDIOC_STREAMOFF)
51948+            flush_all_buffers_status(ctx);
51949+        else
51950+            ctx->first_buf = 1;
51951+
51952+        ctx->streamon = (cmd == VIDIOC_STREAMON);
51953+        av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
51954+               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
51955+    }
51956+
51957+    // Both stream off & on effectively clear flag_last
51958+    ctx->flag_last = 0;
51959+
51960+    ff_mutex_unlock(&ctx->lock);
51961+
51962+    return ret;
51963 }
51964
51965 int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
51966 {
51967-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
51968+    V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
51969+    AVCodecContext *const avctx = s->avctx;
51970+    int64_t track_ts;
51971     V4L2Buffer* avbuf;
51972     int ret;
51973
51974     if (!frame) {
51975         ret = v4l2_stop_encode(ctx);
51976         if (ret)
51977-            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
51978+            av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
51979         s->draining= 1;
51980         return 0;
51981     }
51982@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co
51983     if (!avbuf)
51984         return AVERROR(ENOMEM);
51985
51986-    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
51987+    track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
51988+
51989+    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
51990     if (ret)
51991         return ret;
51992
51993     return ff_v4l2_buffer_enqueue(avbuf);
51994 }
51995
51996-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
51997+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
51998+                                   const void * extdata, size_t extlen)
51999 {
52000     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
52001+    AVCodecContext *const avctx = s->avctx;
52002     V4L2Buffer* avbuf;
52003     int ret;
52004+    int64_t track_ts;
52005
52006     if (!pkt->size) {
52007         ret = v4l2_stop_decode(ctx);
52008+        // Log but otherwise ignore stop failure
52009         if (ret)
52010-            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
52011+            av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
52012         s->draining = 1;
52013         return 0;
52014     }
52015@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C
52016     if (!avbuf)
52017         return AVERROR(EAGAIN);
52018
52019-    ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
52020-    if (ret)
52021+    track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
52022+
52023+    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
52024+    if (ret == AVERROR(ENOMEM))
52025+        av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
52026+               __func__, pkt->size, avbuf->planes[0].length);
52027+    else if (ret)
52028         return ret;
52029
52030     return ff_v4l2_buffer_enqueue(avbuf);
52031@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C
52032
52033 int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
52034 {
52035+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
52036+    AVCodecContext *const avctx = s->avctx;
52037     V4L2Buffer *avbuf;
52038+    int rv;
52039
52040-    /*
52041-     * timeout=-1 blocks until:
52042-     *  1. decoded frame available
52043-     *  2. an input buffer is ready to be dequeued
52044-     */
52045-    avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
52046-    if (!avbuf) {
52047-        if (ctx->done)
52048-            return AVERROR_EOF;
52049-
52050-        return AVERROR(EAGAIN);
52051-    }
52052+    do {
52053+        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
52054+            return rv;
52055+        if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
52056+            return rv;
52057+    } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
52058
52059-    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
52060+   return 0;
52061 }
52062
52063 int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
52064 {
52065+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
52066+    AVCodecContext *const avctx = s->avctx;
52067     V4L2Buffer *avbuf;
52068+    int rv;
52069
52070-    /*
52071-     * blocks until:
52072-     *  1. encoded packet available
52073-     *  2. an input buffer ready to be dequeued
52074-     */
52075-    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
52076-    if (!avbuf) {
52077-        if (ctx->done)
52078-            return AVERROR_EOF;
52079+    do {
52080+        if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
52081+            return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
52082+        if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
52083+            return rv;
52084+    } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
52085
52086-        return AVERROR(EAGAIN);
52087-    }
52088-
52089-    return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
52090+    return 0;
52091 }
52092
52093 int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
52094@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte
52095
52096 int ff_v4l2_context_set_format(V4L2Context* ctx)
52097 {
52098-    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
52099+    int ret;
52100+
52101+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
52102+    if (ret != 0)
52103+        return ret;
52104+
52105+    // Check returned size against min size and if smaller have another go
52106+    // Only worry about plane[0] as this is meant to enforce limits for
52107+    // encoded streams where we might know a bit more about the shape
52108+    // than the driver
52109+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
52110+        if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
52111+            return 0;
52112+        ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
52113+    }
52114+    else {
52115+        if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
52116+            return 0;
52117+        ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
52118+    }
52119+
52120+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
52121+    return ret;
52122 }
52123
52124 void ff_v4l2_context_release(V4L2Context* ctx)
52125 {
52126     int ret;
52127
52128-    if (!ctx->buffers)
52129+    if (!ctx->bufrefs)
52130         return;
52131
52132     ret = v4l2_release_buffers(ctx);
52133     if (ret)
52134         av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
52135
52136-    av_freep(&ctx->buffers);
52137+    av_freep(&ctx->bufrefs);
52138+    av_buffer_unref(&ctx->frames_ref);
52139+
52140+    ff_mutex_destroy(&ctx->lock);
52141+    pthread_cond_destroy(&ctx->cond);
52142 }
52143
52144-int ff_v4l2_context_init(V4L2Context* ctx)
52145+
52146+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
52147 {
52148-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
52149+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
52150     struct v4l2_requestbuffers req;
52151-    int ret, i;
52152-
52153-    if (!v4l2_type_supported(ctx)) {
52154-        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
52155-        return AVERROR_PATCHWELCOME;
52156-    }
52157+    int ret;
52158+    int i;
52159
52160-    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
52161-    if (ret)
52162-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
52163+    av_assert0(ctx->bufrefs == NULL);
52164
52165     memset(&req, 0, sizeof(req));
52166-    req.count = ctx->num_buffers;
52167-    req.memory = V4L2_MEMORY_MMAP;
52168+    req.count = req_buffers;
52169+    req.memory = mem;
52170     req.type = ctx->type;
52171-    ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
52172-    if (ret < 0) {
52173-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
52174-        return AVERROR(errno);
52175+    while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
52176+        if (errno != EINTR) {
52177+            ret = AVERROR(errno);
52178+            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
52179+            return ret;
52180+        }
52181     }
52182
52183     ctx->num_buffers = req.count;
52184-    ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
52185-    if (!ctx->buffers) {
52186+    ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
52187+    if (!ctx->bufrefs) {
52188         av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
52189-        return AVERROR(ENOMEM);
52190+        goto fail_release;
52191     }
52192
52193-    for (i = 0; i < req.count; i++) {
52194-        ctx->buffers[i].context = ctx;
52195-        ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
52196-        if (ret < 0) {
52197+    ctx->wl_master = ff_weak_link_new(ctx);
52198+    if (!ctx->wl_master) {
52199+        ret = AVERROR(ENOMEM);
52200+        goto fail_release;
52201+    }
52202+
52203+    for (i = 0; i < ctx->num_buffers; i++) {
52204+        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
52205+        if (ret) {
52206             av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
52207-            goto error;
52208+            goto fail_release;
52209         }
52210     }
52211
52212     av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
52213         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
52214         req.count,
52215-        v4l2_get_width(&ctx->format),
52216-        v4l2_get_height(&ctx->format),
52217+        ff_v4l2_get_format_width(&ctx->format),
52218+        ff_v4l2_get_format_height(&ctx->format),
52219         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
52220         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
52221
52222     return 0;
52223
52224-error:
52225+fail_release:
52226     v4l2_release_buffers(ctx);
52227+    av_freep(&ctx->bufrefs);
52228+    return ret;
52229+}
52230+
52231+int ff_v4l2_context_init(V4L2Context* ctx)
52232+{
52233+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
52234+    int ret;
52235+
52236+    // It is not valid to reinit a context without a previous release
52237+    av_assert0(ctx->bufrefs == NULL);
52238
52239-    av_freep(&ctx->buffers);
52240+    if (!v4l2_type_supported(ctx)) {
52241+        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
52242+        return AVERROR_PATCHWELCOME;
52243+    }
52244+
52245+    ff_mutex_init(&ctx->lock, NULL);
52246+    pthread_cond_init(&ctx->cond, NULL);
52247+    atomic_init(&ctx->q_count, 0);
52248+
52249+    if (s->output_drm) {
52250+        AVHWFramesContext *hwframes;
52251+
52252+        ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
52253+        if (!ctx->frames_ref) {
52254+            ret = AVERROR(ENOMEM);
52255+            goto fail_unlock;
52256+        }
52257+
52258+        hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
52259+        hwframes->format = AV_PIX_FMT_DRM_PRIME;
52260+        hwframes->sw_format = ctx->av_pix_fmt;
52261+        hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
52262+        hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
52263+        ret = av_hwframe_ctx_init(ctx->frames_ref);
52264+        if (ret < 0)
52265+            goto fail_unref_hwframes;
52266+    }
52267+
52268+    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
52269+    if (ret) {
52270+        ret = AVERROR(errno);
52271+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
52272+        goto fail_unref_hwframes;
52273+    }
52274+
52275+    ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
52276+    if (ret < 0)
52277+        goto fail_unref_hwframes;
52278+
52279+    return 0;
52280
52281+fail_unref_hwframes:
52282+    av_buffer_unref(&ctx->frames_ref);
52283+fail_unlock:
52284+    ff_mutex_destroy(&ctx->lock);
52285     return ret;
52286 }
52287--- a/libavcodec/v4l2_context.h
52288+++ b/libavcodec/v4l2_context.h
52289@@ -31,6 +31,7 @@
52290 #include "libavutil/pixfmt.h"
52291 #include "libavutil/frame.h"
52292 #include "libavutil/buffer.h"
52293+#include "libavutil/thread.h"
52294 #include "v4l2_buffers.h"
52295
52296 typedef struct V4L2Context {
52297@@ -70,11 +71,18 @@ typedef struct V4L2Context {
52298      */
52299     int width, height;
52300     AVRational sample_aspect_ratio;
52301+    struct v4l2_rect selection;
52302
52303     /**
52304-     * Indexed array of V4L2Buffers
52305+     * If the default size of buffer is less than this then try to
52306+     * set to this.
52307      */
52308-    V4L2Buffer *buffers;
52309+    uint32_t min_buf_size;
52310+
52311+    /**
52312+     * Indexed array of pointers to V4L2Buffers
52313+     */
52314+    AVBufferRef **bufrefs;
52315
52316     /**
52317      * Readonly after init.
52318@@ -82,16 +90,38 @@ typedef struct V4L2Context {
52319     int num_buffers;
52320
52321     /**
52322+     * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
52323+     */
52324+    enum v4l2_memory buf_mem;
52325+
52326+    /**
52327      * Whether the stream has been started (VIDIOC_STREAMON has been sent).
52328      */
52329     int streamon;
52330
52331+    /* 1st buffer after stream on */
52332+    int first_buf;
52333+
52334     /**
52335      *  Either no more buffers available or an unrecoverable error was notified
52336      *  by the V4L2 kernel driver: once set the context has to be exited.
52337      */
52338     int done;
52339
52340+    int flag_last;
52341+
52342+    /**
52343+     * If NZ then when Qing frame/pkt use this rather than the
52344+     * "real" PTS
52345+     */
52346+    uint64_t track_ts;
52347+
52348+    AVBufferRef *frames_ref;
52349+    atomic_int q_count;
52350+    struct ff_weak_link_master *wl_master;
52351+
52352+    AVMutex lock;
52353+    pthread_cond_t cond;
52354 } V4L2Context;
52355
52356 /**
52357@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C
52358  * @param[in] ctx The V4L2Context to dequeue from.
52359  * @param[inout] f The AVFrame to dequeue to.
52360  * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
52361+ *
52362  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
52363+ *                AVERROR(ENOSPC) if no buffer available to put
52364+ *                the frame in
52365  */
52366 int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
52367
52368@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
52369  * @param[in] pkt A pointer to an AVPacket.
52370  * @return 0 in case of success, a negative error otherwise.
52371  */
52372-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
52373+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
52374
52375 /**
52376  * Enqueues a buffer to a V4L2Context from an AVFrame
52377--- a/libavcodec/v4l2_m2m.c
52378+++ b/libavcodec/v4l2_m2m.c
52379@@ -36,6 +36,14 @@
52380 #include "v4l2_fmt.h"
52381 #include "v4l2_m2m.h"
52382
52383+static void
52384+xlat_init(xlat_track_t * const x)
52385+{
52386+    memset(x, 0, sizeof(*x));
52387+    x->last_pts = AV_NOPTS_VALUE;
52388+}
52389+
52390+
52391 static inline int v4l2_splane_video(struct v4l2_capability *cap)
52392 {
52393     if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
52394@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m
52395
52396     s->capture.done = s->output.done = 0;
52397     s->capture.name = "capture";
52398+    s->capture.buf_mem = V4L2_MEMORY_MMAP;
52399     s->output.name = "output";
52400+    s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
52401     atomic_init(&s->refcount, 0);
52402     sem_init(&s->refsync, 0, 0);
52403
52404@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
52405         av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
52406
52407     /* 2. unmap the capture buffers (v4l2 and ffmpeg):
52408-     *    we must wait for all references to be released before being allowed
52409-     *    to queue new buffers.
52410      */
52411-    av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
52412-    if (atomic_load(&s->refcount))
52413-        while(sem_wait(&s->refsync) == -1 && errno == EINTR);
52414-
52415     ff_v4l2_context_release(&s->capture);
52416
52417     /* 3. get the new capture format */
52418@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
52419
52420     /* 5. complete reinit */
52421     s->draining = 0;
52422-    s->reinit = 0;
52423
52424     return 0;
52425 }
52426@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2
52427
52428     /* start again now that we know the stream dimensions */
52429     s->draining = 0;
52430-    s->reinit = 0;
52431
52432     ret = ff_v4l2_context_get_format(&s->output, 0);
52433     if (ret) {
52434@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi
52435     ff_v4l2_context_release(&s->capture);
52436     sem_destroy(&s->refsync);
52437
52438-    close(s->fd);
52439+    if (s->fd != -1)
52440+        close(s->fd);
52441+
52442+    av_packet_unref(&s->buf_pkt);
52443+    av_freep(&s->extdata_data);
52444+
52445+    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
52446
52447     av_free(s);
52448 }
52449@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p
52450     V4L2m2mContext *s = priv->context;
52451     int ret;
52452
52453-    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
52454-    if (ret)
52455-        av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
52456+    if (!s)
52457+        return 0;
52458
52459-    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
52460-    if (ret)
52461-        av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
52462+    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
52463+
52464+    if (av_codec_is_decoder(s->avctx->codec))
52465+        av_packet_unref(&s->buf_pkt);
52466+
52467+    if (s->fd >= 0) {
52468+        ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
52469+        if (ret)
52470+            av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
52471+
52472+        ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
52473+        if (ret)
52474+            av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
52475+    }
52476
52477     ff_v4l2_context_release(&s->output);
52478
52479+    close(s->fd);
52480+    s->fd = -1;
52481+
52482     s->self_ref = NULL;
52483+    // This is only called on avctx close so after this point we don't have that
52484+    // Crash sooner if we find we are using it (can still log with avctx = NULL)
52485+    s->avctx = NULL;
52486+    priv->context = NULL;
52487     av_buffer_unref(&priv->context_ref);
52488
52489     return 0;
52490@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *
52491     return v4l2_configure_contexts(s);
52492 }
52493
52494-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
52495+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
52496 {
52497-    *s = av_mallocz(sizeof(V4L2m2mContext));
52498-    if (!*s)
52499+    V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
52500+
52501+    *pps = NULL;
52502+    if (!s)
52503         return AVERROR(ENOMEM);
52504
52505-    priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
52506+    priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
52507                                          &v4l2_m2m_destroy_context, NULL, 0);
52508     if (!priv->context_ref) {
52509-        av_freep(s);
52510+        av_free(s);
52511         return AVERROR(ENOMEM);
52512     }
52513
52514     /* assign the context */
52515-    priv->context = *s;
52516-    (*s)->priv = priv;
52517+    priv->context = s;
52518+    s->priv = priv;
52519
52520     /* populate it */
52521-    priv->context->capture.num_buffers = priv->num_capture_buffers;
52522-    priv->context->output.num_buffers  = priv->num_output_buffers;
52523-    priv->context->self_ref = priv->context_ref;
52524-    priv->context->fd = -1;
52525+    s->capture.num_buffers = priv->num_capture_buffers;
52526+    s->output.num_buffers  = priv->num_output_buffers;
52527+    s->self_ref = priv->context_ref;
52528+    s->fd = -1;
52529+
52530+    xlat_init(&s->xlat);
52531
52532+    *pps = s;
52533     return 0;
52534 }
52535--- a/libavcodec/v4l2_m2m.h
52536+++ b/libavcodec/v4l2_m2m.h
52537@@ -30,6 +30,7 @@
52538 #include <linux/videodev2.h>
52539
52540 #include "libavcodec/avcodec.h"
52541+#include "libavutil/pixfmt.h"
52542 #include "v4l2_context.h"
52543
52544 #define container_of(ptr, type, member) ({ \
52545@@ -38,7 +39,37 @@
52546
52547 #define V4L_M2M_DEFAULT_OPTS \
52548     { "num_output_buffers", "Number of buffers in the output context",\
52549-        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
52550+        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
52551+
52552+#define FF_V4L2_M2M_TRACK_SIZE 128
52553+typedef struct V4L2m2mTrackEl {
52554+    int     discard;   // If we see this buffer it's been flushed, so discard
52555+    int     pending;
52556+    int     pkt_size;
52557+    int64_t pts;
52558+    int64_t dts;
52559+    int64_t reordered_opaque;
52560+    int64_t pkt_pos;
52561+    int64_t pkt_duration;
52562+    int64_t track_pts;
52563+} V4L2m2mTrackEl;
52564+
52565+typedef struct pts_stats_s
52566+{
52567+    void * logctx;
52568+    const char * name;  // For debug
52569+    unsigned int last_count;
52570+    unsigned int last_interval;
52571+    int64_t last_pts;
52572+    int64_t guess;
52573+} pts_stats_t;
52574+
52575+typedef struct xlat_track_s {
52576+    unsigned int track_no;
52577+    int64_t last_pts;
52578+    int64_t last_opaque;
52579+    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
52580+} xlat_track_t;
52581
52582 typedef struct V4L2m2mContext {
52583     char devname[PATH_MAX];
52584@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext {
52585     AVCodecContext *avctx;
52586     sem_t refsync;
52587     atomic_uint refcount;
52588-    int reinit;
52589
52590     /* null frame/packet received */
52591     int draining;
52592@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext {
52593
52594     /* reference back to V4L2m2mPriv */
52595     void *priv;
52596+
52597+    AVBufferRef *device_ref;
52598+
52599+    /* generate DRM frames */
52600+    int output_drm;
52601+
52602+    /* input frames are drmprime */
52603+    int input_drm;
52604+
52605+    /* Frame tracking */
52606+    xlat_track_t xlat;
52607+    int pending_hw;
52608+    int pending_n;
52609+
52610+    pts_stats_t pts_stat;
52611+
52612+    /* req pkt */
52613+    int req_pkt;
52614+
52615+    /* Ext data sent */
52616+    int extdata_sent;
52617+    /* Ext data sent in packet - overrides ctx */
52618+    uint8_t * extdata_data;
52619+    size_t extdata_size;
52620+
52621+#define FF_V4L2_QUIRK_REINIT_ALWAYS             1
52622+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN    2
52623+    /* Quirks */
52624+    unsigned int quirks;
52625+
52626 } V4L2m2mContext;
52627
52628 typedef struct V4L2m2mPriv {
52629@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv {
52630
52631     int num_output_buffers;
52632     int num_capture_buffers;
52633+    enum AVPixelFormat pix_fmt;
52634 } V4L2m2mPriv;
52635
52636 /**
52637@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
52638  */
52639 int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
52640
52641+
52642+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
52643+{
52644+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
52645+}
52646+
52647+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
52648+{
52649+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
52650+}
52651+
52652+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
52653+{
52654+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
52655+}
52656+
52657+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
52658+{
52659+    return ctx->flag_last;
52660+}
52661+
52662+
52663 #endif /* AVCODEC_V4L2_M2M_H */
52664--- a/libavcodec/v4l2_m2m_dec.c
52665+++ b/libavcodec/v4l2_m2m_dec.c
52666@@ -23,6 +23,10 @@
52667
52668 #include <linux/videodev2.h>
52669 #include <sys/ioctl.h>
52670+
52671+#include "libavutil/avassert.h"
52672+#include "libavutil/hwcontext.h"
52673+#include "libavutil/hwcontext_drm.h"
52674 #include "libavutil/pixfmt.h"
52675 #include "libavutil/pixdesc.h"
52676 #include "libavutil/opt.h"
52677@@ -30,75 +34,111 @@
52678 #include "libavcodec/decode.h"
52679 #include "libavcodec/internal.h"
52680
52681+#include "libavcodec/hwaccels.h"
52682+#include "libavcodec/internal.h"
52683+#include "libavcodec/hwconfig.h"
52684+
52685 #include "v4l2_context.h"
52686 #include "v4l2_m2m.h"
52687 #include "v4l2_fmt.h"
52688
52689-static int v4l2_try_start(AVCodecContext *avctx)
52690+// Pick 64 for max last count - that is >1sec at 60fps
52691+#define STATS_LAST_COUNT_MAX 64
52692+#define STATS_INTERVAL_MAX (1 << 30)
52693+
52694+#ifndef FF_API_BUFFER_SIZE_T
52695+#define FF_API_BUFFER_SIZE_T 1
52696+#endif
52697+
52698+static int64_t pts_stats_guess(const pts_stats_t * const stats)
52699 {
52700-    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
52701-    V4L2Context *const capture = &s->capture;
52702-    V4L2Context *const output = &s->output;
52703-    struct v4l2_selection selection = { 0 };
52704-    int ret;
52705+    if (stats->last_pts == AV_NOPTS_VALUE ||
52706+            stats->last_interval == 0 ||
52707+            stats->last_count >= STATS_LAST_COUNT_MAX)
52708+        return AV_NOPTS_VALUE;
52709+    return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
52710+}
52711
52712-    /* 1. start the output process */
52713-    if (!output->streamon) {
52714-        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
52715-        if (ret < 0) {
52716-            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
52717-            return ret;
52718+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
52719+{
52720+    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
52721+        if (stats->last_count < STATS_LAST_COUNT_MAX)
52722+            ++stats->last_count;
52723+        return;
52724+    }
52725+
52726+    if (stats->last_pts != AV_NOPTS_VALUE) {
52727+        const int64_t interval = pts - stats->last_pts;
52728+
52729+        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
52730+            stats->last_count >= STATS_LAST_COUNT_MAX) {
52731+            if (stats->last_interval != 0)
52732+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
52733+                       __func__, stats->name, interval, stats->last_count);
52734+            stats->last_interval = 0;
52735+        }
52736+        else {
52737+            const int64_t frame_time = interval / (int64_t)stats->last_count;
52738+
52739+            if (frame_time != stats->last_interval)
52740+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
52741+                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
52742+            stats->last_interval = frame_time;
52743         }
52744     }
52745
52746-    if (capture->streamon)
52747+    stats->last_pts = pts;
52748+    stats->last_count = 1;
52749+}
52750+
52751+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
52752+{
52753+    *stats = (pts_stats_t){
52754+        .logctx = logctx,
52755+        .name = name,
52756+        .last_count = 1,
52757+        .last_interval = 0,
52758+        .last_pts = AV_NOPTS_VALUE
52759+    };
52760+}
52761+
52762+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
52763+{
52764+    int ret;
52765+    struct v4l2_decoder_cmd cmd = {
52766+        .cmd = V4L2_DEC_CMD_START,
52767+        .flags = 0,
52768+    };
52769+
52770+    if (s->output.streamon)
52771         return 0;
52772
52773-    /* 2. get the capture format */
52774-    capture->format.type = capture->type;
52775-    ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
52776-    if (ret) {
52777-        av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
52778+    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
52779+    if (ret != 0) {
52780+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
52781         return ret;
52782     }
52783
52784-    /* 2.1 update the AVCodecContext */
52785-    avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
52786-    capture->av_pix_fmt = avctx->pix_fmt;
52787-
52788-    /* 3. set the crop parameters */
52789-    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
52790-    selection.r.height = avctx->coded_height;
52791-    selection.r.width = avctx->coded_width;
52792-    ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
52793-    if (!ret) {
52794-        ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
52795-        if (ret) {
52796-            av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
52797-        } else {
52798-            av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
52799-            /* update the size of the resulting frame */
52800-            capture->height = selection.r.height;
52801-            capture->width  = selection.r.width;
52802-        }
52803+    // STREAMON should do implicit START so this just for those that don't.
52804+    // It is optional so don't worry if it fails
52805+    if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
52806+        ret = AVERROR(errno);
52807+        av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
52808     }
52809-
52810-    /* 4. init the capture context now that we have the capture format */
52811-    if (!capture->buffers) {
52812-        ret = ff_v4l2_context_init(capture);
52813-        if (ret) {
52814-            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
52815-            return AVERROR(ENOMEM);
52816-        }
52817+    else {
52818+        av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
52819     }
52820+    return 0;
52821+}
52822
52823-    /* 5. start the capture process */
52824-    ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
52825-    if (ret) {
52826-        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
52827-        return ret;
52828-    }
52829+static int v4l2_try_start(AVCodecContext *avctx)
52830+{
52831+    V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
52832+    int ret;
52833
52834+    /* 1. start the output process */
52835+    if ((ret = check_output_streamon(avctx, s)) != 0)
52836+        return ret;
52837     return 0;
52838 }
52839
52840@@ -133,52 +173,525 @@ static int v4l2_prepare_decoder(V4L2m2mC
52841     return 0;
52842 }
52843
52844-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
52845+static void
52846+set_best_effort_pts(AVCodecContext *const avctx,
52847+             pts_stats_t * const ps,
52848+             AVFrame *const frame)
52849+{
52850+    pts_stats_add(ps, frame->pts);
52851+
52852+#if FF_API_PKT_PTS
52853+FF_DISABLE_DEPRECATION_WARNINGS
52854+    frame->pkt_pts = frame->pts;
52855+FF_ENABLE_DEPRECATION_WARNINGS
52856+#endif
52857+    frame->best_effort_timestamp = pts_stats_guess(ps);
52858+    // If we can't guess from just PTS - try DTS
52859+    if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
52860+        frame->best_effort_timestamp = frame->pkt_dts;
52861+
52862+    // We can't emulate what s/w does in a useful manner and using the
52863+    // "correct" answer seems to just confuse things.
52864+    frame->pkt_dts               = frame->pts;
52865+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
52866+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
52867+}
52868+
52869+static void
52870+xlat_flush(xlat_track_t * const x)
52871+{
52872+    unsigned int i;
52873+    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
52874+        x->track_els[i].pending = 0;
52875+        x->track_els[i].discard = 1;
52876+    }
52877+    x->last_pts = AV_NOPTS_VALUE;
52878+}
52879+
52880+static int
52881+xlat_pending(const xlat_track_t * const x)
52882+{
52883+    unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
52884+    unsigned int i;
52885+    int r = 0;
52886+    int64_t now = AV_NOPTS_VALUE;
52887+
52888+    for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
52889+        const V4L2m2mTrackEl * const t = x->track_els + n;
52890+
52891+        if (!t->pending)
52892+            continue;
52893+
52894+        if (now == AV_NOPTS_VALUE)
52895+            now = t->dts;
52896+
52897+        if (t->pts == AV_NOPTS_VALUE ||
52898+            ((now == AV_NOPTS_VALUE || t->pts <= now) &&
52899+             (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
52900+            ++r;
52901+    }
52902+
52903+    // If we never get any ideas about PTS vs DTS allow a lot more buffer
52904+    if (now == AV_NOPTS_VALUE)
52905+        r -= 16;
52906+
52907+    return r;
52908+}
52909+
52910+static inline int stream_started(const V4L2m2mContext * const s) {
52911+    return s->output.streamon;
52912+}
52913+
52914+#define NQ_OK        0
52915+#define NQ_Q_FULL    1
52916+#define NQ_SRC_EMPTY 2
52917+#define NQ_NONE      3
52918+#define NQ_DRAINING  4
52919+#define NQ_DEAD      5
52920+
52921+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
52922+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
52923+
52924+// do_not_get      If true then no new packet will be got but status will
52925+//                  be set appropriately
52926+
52927+// AVERROR_EOF     Flushing an already flushed stream
52928+// -ve             Error (all errors except EOF are unexpected)
52929+// NQ_OK (0)       OK
52930+// NQ_Q_FULL       Dst full (retry if we think V4L2 Q has space now)
52931+// NQ_SRC_EMPTY    Src empty (do not retry)
52932+// NQ_NONE         Enqueue not attempted
52933+// NQ_DRAINING     At EOS, dQ dest until EOS there too
52934+// NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
52935+
52936+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
52937 {
52938-    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
52939-    V4L2Context *const capture = &s->capture;
52940-    V4L2Context *const output = &s->output;
52941-    AVPacket avpkt = {0};
52942     int ret;
52943
52944-    if (s->buf_pkt.size) {
52945-        avpkt = s->buf_pkt;
52946-        memset(&s->buf_pkt, 0, sizeof(AVPacket));
52947-    } else {
52948-        ret = ff_decode_get_packet(avctx, &avpkt);
52949-        if (ret < 0 && ret != AVERROR_EOF)
52950+    // If we don't already have a coded packet - get a new one
52951+    // We will already have a coded pkt if the output Q was full last time we
52952+    // tried to Q it
52953+    if (!s->buf_pkt.size && !do_not_get) {
52954+        unsigned int i;
52955+
52956+        for (i = 0; i < 256; ++i) {
52957+            uint8_t * side_data;
52958+#if FF_API_BUFFER_SIZE_T
52959+            int side_size;
52960+#else
52961+            size_t side_size;
52962+#endif
52963+            ret = ff_decode_get_packet(avctx, &s->buf_pkt);
52964+            if (ret != 0)
52965+                break;
52966+
52967+            // New extradata is the only side-data we understand
52968+            side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
52969+            if (side_data) {
52970+                av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
52971+                av_freep(&s->extdata_data);
52972+                if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
52973+                    av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d bytes of extra data\n", (int)side_size);
52974+                    return AVERROR(ENOMEM);
52975+                }
52976+                memcpy(s->extdata_data, side_data, side_size);
52977+                s->extdata_size = side_size;
52978+                s->extdata_sent = 0;
52979+            }
52980+
52981+            if (s->buf_pkt.size != 0)
52982+                break;
52983+
52984+            if (s->buf_pkt.side_data_elems == 0) {
52985+                av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
52986+                ret = AVERROR_EOF;
52987+                break;
52988+            }
52989+
52990+            // Retry a side-data only pkt
52991+        }
52992+        // If i >= 256 something has gone wrong
52993+        if (i >= 256) {
52994+            av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
52995+            return AVERROR(EIO);
52996+        }
52997+
52998+        if (ret == AVERROR(EAGAIN)) {
52999+            if (!stream_started(s)) {
53000+                av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
53001+                return NQ_DEAD;
53002+            }
53003+            return NQ_SRC_EMPTY;
53004+        }
53005+
53006+        if (ret == AVERROR_EOF) {
53007+            // EOF - enter drain mode
53008+            av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
53009+                   ret, s->buf_pkt.size, stream_started(s), s->draining);
53010+            if (!stream_started(s)) {
53011+                av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
53012+                s->draining = 1;
53013+                s->capture.done = 1;
53014+                return AVERROR_EOF;
53015+            }
53016+
53017+            if (!s->draining) {
53018+                // Calling enqueue with an empty pkt starts drain
53019+                av_assert0(s->buf_pkt.size == 0);
53020+                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
53021+                if (ret) {
53022+                    av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
53023+                    return ret;
53024+                }
53025+            }
53026+            return NQ_DRAINING;
53027+        }
53028+
53029+        if (ret < 0) {
53030+            av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
53031             return ret;
53032+        }
53033     }
53034
53035-    if (s->draining)
53036-        goto dequeue;
53037+    if (s->draining) {
53038+        if (s->buf_pkt.size) {
53039+            av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
53040+            av_packet_unref(&s->buf_pkt);
53041+        }
53042+        return NQ_DRAINING;
53043+    }
53044
53045-    ret = ff_v4l2_context_enqueue_packet(output, &avpkt);
53046-    if (ret < 0) {
53047-        if (ret != AVERROR(EAGAIN))
53048-           return ret;
53049+    if (!s->buf_pkt.size)
53050+        return NQ_NONE;
53051
53052-        s->buf_pkt = avpkt;
53053-        /* no input buffers available, continue dequeing */
53054-    }
53055+    if ((ret = check_output_streamon(avctx, s)) != 0)
53056+        return ret;
53057+
53058+    if (s->extdata_sent)
53059+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
53060+    else if (s->extdata_data)
53061+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
53062+    else
53063+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
53064+
53065+    if (ret == AVERROR(EAGAIN)) {
53066+        // Out of input buffers - keep packet
53067+        ret = NQ_Q_FULL;
53068+    }
53069+    else {
53070+        // In all other cases we are done with this packet
53071+        av_packet_unref(&s->buf_pkt);
53072+        s->extdata_sent = 1;
53073
53074-    if (avpkt.size) {
53075-        ret = v4l2_try_start(avctx);
53076         if (ret) {
53077-            av_packet_unref(&avpkt);
53078+            av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
53079+            return ret;
53080+        }
53081+    }
53082
53083-            /* cant recover */
53084-            if (ret == AVERROR(ENOMEM))
53085-                return ret;
53086+    // Start if we haven't
53087+    {
53088+        const int ret2 = v4l2_try_start(avctx);
53089+        if (ret2) {
53090+            av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
53091+            ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
53092+        }
53093+    }
53094+
53095+    return ret;
53096+}
53097+
53098+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
53099+{
53100+    int rv = 0;
53101
53102-            return 0;
53103+    ff_mutex_lock(&ctx->lock);
53104+
53105+    while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
53106+        if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) {
53107+            rv = AVERROR(errno);
53108+            av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
53109+            break;
53110         }
53111     }
53112
53113-dequeue:
53114-    if (!s->buf_pkt.size)
53115-        av_packet_unref(&avpkt);
53116-    return ff_v4l2_context_dequeue_frame(capture, frame, -1);
53117+    ff_mutex_unlock(&ctx->lock);
53118+    return rv;
53119+}
53120+
53121+// Number of frames over what xlat_pending returns that we keep *16
53122+// This is a min value - if it appears to be too small the threshold should
53123+// adjust dynamically.
53124+#define PENDING_HW_MIN      (3 * 16)
53125+// Offset to use when setting dynamically
53126+// Set to %16 == 15 to avoid the threshold changing immediately as we relax
53127+#define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
53128+// Number of consecutive times we've failed to get a frame when we prefer it
53129+// before we increase the prefer threshold (5ms * N = max expected decode
53130+// time)
53131+#define PENDING_N_THRESHOLD 6
53132+
53133+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
53134+{
53135+    V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
53136+    int src_rv = NQ_OK;
53137+    int dst_rv = 1;  // Non-zero (done), non-negative (error) number
53138+    unsigned int i = 0;
53139+
53140+    do {
53141+        const int pending = xlat_pending(&s->xlat);
53142+        const int prefer_dq = (pending > s->pending_hw / 16);
53143+        const int last_src_rv = src_rv;
53144+
53145+        // Enqueue another pkt for decode if
53146+        // (a) We don't have a lot of stuff in the buffer already OR
53147+        // (b) ... we (think we) do but we've failed to get a frame already OR
53148+        // (c) We've dequeued a lot of frames without asking for input
53149+        src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
53150+
53151+        // If we got a frame last time or we've already tried to get a frame and
53152+        // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
53153+        // indicating that we want more input.
53154+        // This should mean that once decode starts we enter a stable state where
53155+        // we alternately ask for input and produce output
53156+        if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
53157+            break;
53158+
53159+        if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
53160+            av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
53161+            break;
53162+        }
53163+
53164+        // Try to get a new frame if
53165+        // (a) we haven't already got one AND
53166+        // (b) enqueue returned a status indicating that decode should be attempted
53167+        if (dst_rv != 0 && TRY_DQ(src_rv)) {
53168+            // Pick a timeout depending on state
53169+            const int t =
53170+                src_rv == NQ_DRAINING ? 300 :
53171+                prefer_dq ? 5 :
53172+                src_rv == NQ_Q_FULL ? -1 : 0;
53173+
53174+            // Dequeue frame will unref any previous contents of frame
53175+            // if it returns success so we don't need an explicit unref
53176+            // when discarding
53177+            // This returns AVERROR(EAGAIN) on timeout or if
53178+            // there is room in the input Q and timeout == -1
53179+            dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
53180+
53181+            // Failure due to no buffer in Q?
53182+            if (dst_rv == AVERROR(ENOSPC)) {
53183+                // Wait & retry
53184+                if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
53185+                    dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
53186+                }
53187+            }
53188+
53189+            // Adjust dynamic pending threshold
53190+            if (dst_rv == 0) {
53191+                if (--s->pending_hw < PENDING_HW_MIN)
53192+                    s->pending_hw = PENDING_HW_MIN;
53193+                s->pending_n = 0;
53194+
53195+                set_best_effort_pts(avctx, &s->pts_stat, frame);
53196+            }
53197+            else if (dst_rv == AVERROR(EAGAIN)) {
53198+                if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
53199+                    s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
53200+                    s->pending_n = 0;
53201+                }
53202+            }
53203+
53204+            if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
53205+                av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
53206+                dst_rv = AVERROR_EOF;
53207+                s->capture.done = 1;
53208+            }
53209+            else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
53210+                av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
53211+                       s->draining, s->capture.done);
53212+            else if (dst_rv && dst_rv != AVERROR(EAGAIN))
53213+                av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
53214+                       s->draining, s->capture.done, dst_rv);
53215+        }
53216+
53217+        ++i;
53218+        if (i >= 256) {
53219+            av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
53220+            src_rv = AVERROR(EIO);
53221+        }
53222+
53223+        // Continue trying to enqueue packets if either
53224+        // (a) we succeeded last time OR
53225+        // (b) we didn't get a frame and we can retry the input
53226+    } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
53227+
53228+    // Ensure that the frame contains nothing if we aren't returning a frame
53229+    // (might happen when discarding)
53230+    if (dst_rv)
53231+        av_frame_unref(frame);
53232+
53233+    // If we got a frame this time ask for a pkt next time
53234+    s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
53235+
53236+#if 0
53237+    if (dst_rv == 0)
53238+    {
53239+        static int z = 0;
53240+        if (++z > 50) {
53241+            av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
53242+            ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
53243+            return -1;
53244+        }
53245+    }
53246+#endif
53247+
53248+    return dst_rv == 0 ? 0 :
53249+        src_rv < 0 ? src_rv :
53250+        dst_rv < 0 ? dst_rv :
53251+            AVERROR(EAGAIN);
53252+}
53253+
53254+#if 0
53255+#include <time.h>
53256+static int64_t us_time(void)
53257+{
53258+    struct timespec ts;
53259+    clock_gettime(CLOCK_MONOTONIC, &ts);
53260+    return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
53261+}
53262+
53263+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
53264+{
53265+    int ret;
53266+    const int64_t now = us_time();
53267+    int64_t done;
53268+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
53269+    ret = v4l2_receive_frame2(avctx, frame);
53270+    done = us_time();
53271+    av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
53272+    return ret;
53273+}
53274+#endif
53275+
53276+static int
53277+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
53278+{
53279+    unsigned int i;
53280+    const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
53281+    const uint32_t w = avctx->coded_width;
53282+    const uint32_t h = avctx->coded_height;
53283+
53284+    if (w == 0 || h == 0 || fcc == 0) {
53285+        av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
53286+        return 0;
53287+    }
53288+    if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
53289+        av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
53290+        return 0;
53291+    }
53292+
53293+    for (i = 0;; ++i) {
53294+        struct v4l2_frmsizeenum fs = {
53295+            .index = i,
53296+            .pixel_format = fcc,
53297+        };
53298+
53299+        while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
53300+            const int err = AVERROR(errno);
53301+            if (err == AVERROR(EINTR))
53302+                continue;
53303+            if (i == 0 && err == AVERROR(ENOTTY)) {
53304+                av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
53305+                return 0;
53306+            }
53307+            if (err != AVERROR(EINVAL)) {
53308+                av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
53309+                return err;
53310+            }
53311+            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
53312+                   w, h, av_fourcc2str(fcc), i);
53313+            return err;
53314+        }
53315+
53316+        switch (fs.type) {
53317+            case V4L2_FRMSIZE_TYPE_DISCRETE:
53318+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
53319+                       fs.discrete.width,fs.discrete.height);
53320+                if (w == fs.discrete.width && h == fs.discrete.height)
53321+                    return 0;
53322+                break;
53323+            case V4L2_FRMSIZE_TYPE_STEPWISE:
53324+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
53325+                       fs.stepwise.min_width, fs.stepwise.min_height,
53326+                       fs.stepwise.max_width, fs.stepwise.max_height,
53327+                       fs.stepwise.step_width,fs.stepwise.step_height);
53328+                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
53329+                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
53330+                    (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 &&
53331+                    (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0)
53332+                    return 0;
53333+                break;
53334+            case V4L2_FRMSIZE_TYPE_CONTINUOUS:
53335+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
53336+                       fs.stepwise.min_width, fs.stepwise.min_height,
53337+                       fs.stepwise.max_width, fs.stepwise.max_height,
53338+                       fs.stepwise.step_width,fs.stepwise.step_height);
53339+                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
53340+                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
53341+                    return 0;
53342+                break;
53343+            default:
53344+                av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type);
53345+                return AVERROR(EINVAL);
53346+        }
53347+    }
53348+}
53349+
53350+static int
53351+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
53352+{
53353+    struct v4l2_capability cap;
53354+
53355+    memset(&cap, 0, sizeof(cap));
53356+    while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
53357+        int err = errno;
53358+        if (err == EINTR)
53359+            continue;
53360+        av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
53361+        return AVERROR(err);
53362+    }
53363+
53364+    // Could be made table driven if we have a few more but right now there
53365+    // seems no point
53366+
53367+    // Meson (amlogic) always gives a resolution changed event after output
53368+    // streamon and userspace must (re)allocate capture buffers and streamon
53369+    // capture to clear the event even if the capture buffers were the right
53370+    // size in the first place.
53371+    if (strcmp(cap.driver, "meson-vdec") == 0)
53372+        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
53373+
53374+    av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
53375+    return 0;
53376+}
53377+
53378+// This heuristic is for H264 but use for everything
53379+static uint32_t max_coded_size(const AVCodecContext * const avctx)
53380+{
53381+    uint32_t wxh = avctx->coded_width * avctx->coded_height;
53382+    uint32_t size;
53383+
53384+    size = wxh * 3 / 2;
53385+    // H.264 Annex A table A-1 gives minCR which is either 2 or 4
53386+    // unfortunately that doesn't yield an actually useful limit
53387+    // and it should be noted that frame 0 is special cased to allow
53388+    // a bigger number which really isn't helpful for us. So just pick
53389+    // frame_size / 2
53390+    size /= 2;
53391+    // Add 64k to allow for any overheads and/or encoder hopefulness
53392+    // with small WxH
53393+    return size + (1 << 16);
53394 }
53395
53396 static av_cold int v4l2_decode_init(AVCodecContext *avctx)
53397@@ -186,12 +699,29 @@ static av_cold int v4l2_decode_init(AVCo
53398     V4L2Context *capture, *output;
53399     V4L2m2mContext *s;
53400     V4L2m2mPriv *priv = avctx->priv_data;
53401+    int gf_pix_fmt;
53402     int ret;
53403
53404+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
53405+
53406+    if (avctx->codec_id == AV_CODEC_ID_H264) {
53407+        if (avctx->ticks_per_frame == 1) {
53408+            if(avctx->time_base.den < INT_MAX/2) {
53409+                avctx->time_base.den *= 2;
53410+            } else
53411+                avctx->time_base.num /= 2;
53412+        }
53413+        avctx->ticks_per_frame = 2;
53414+    }
53415+
53416+    av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
53417     ret = ff_v4l2_m2m_create_context(priv, &s);
53418     if (ret < 0)
53419         return ret;
53420
53421+    pts_stats_init(&s->pts_stat, avctx, "decoder");
53422+    s->pending_hw = PENDING_HW_MIN;
53423+
53424     capture = &s->capture;
53425     output = &s->output;
53426
53427@@ -199,34 +729,127 @@ static av_cold int v4l2_decode_init(AVCo
53428      * by the v4l2 driver; this event will trigger a full pipeline reconfig and
53429      * the proper values will be retrieved from the kernel driver.
53430      */
53431-    output->height = capture->height = avctx->coded_height;
53432-    output->width = capture->width = avctx->coded_width;
53433+//    output->height = capture->height = avctx->coded_height;
53434+//    output->width = capture->width = avctx->coded_width;
53435+    output->height = capture->height = 0;
53436+    output->width = capture->width = 0;
53437
53438     output->av_codec_id = avctx->codec_id;
53439     output->av_pix_fmt  = AV_PIX_FMT_NONE;
53440+    output->min_buf_size = max_coded_size(avctx);
53441
53442     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
53443     capture->av_pix_fmt = avctx->pix_fmt;
53444+    capture->min_buf_size = 0;
53445+
53446+    /* the client requests the codec to generate DRM frames:
53447+     *   - data[0] will therefore point to the returned AVDRMFrameDescriptor
53448+     *       check the ff_v4l2_buffer_to_avframe conversion function.
53449+     *   - the DRM frame format is passed in the DRM frame descriptor layer.
53450+     *       check the v4l2_get_drm_frame function.
53451+     */
53452+
53453+    avctx->sw_pix_fmt = avctx->pix_fmt;
53454+    gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
53455+    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
53456+           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
53457+           avctx->coded_width, avctx->coded_height,
53458+           gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
53459+
53460+    if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
53461+        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
53462+        s->output_drm = 1;
53463+    }
53464+    else {
53465+        capture->av_pix_fmt = gf_pix_fmt;
53466+        s->output_drm = 0;
53467+    }
53468+
53469+    s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
53470+    if (!s->device_ref) {
53471+        ret = AVERROR(ENOMEM);
53472+        return ret;
53473+    }
53474+
53475+    ret = av_hwdevice_ctx_init(s->device_ref);
53476+    if (ret < 0)
53477+        return ret;
53478
53479     s->avctx = avctx;
53480     ret = ff_v4l2_m2m_codec_init(priv);
53481     if (ret) {
53482         av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n");
53483-        s->self_ref = NULL;
53484-        av_buffer_unref(&priv->context_ref);
53485-
53486         return ret;
53487     }
53488
53489-    return v4l2_prepare_decoder(s);
53490+    if ((ret = v4l2_prepare_decoder(s)) < 0)
53491+        return ret;
53492+
53493+    if ((ret = get_quirks(avctx, s)) != 0)
53494+        return ret;
53495+
53496+    if ((ret = check_size(avctx, s)) != 0)
53497+        return ret;
53498+
53499+    return 0;
53500 }
53501
53502 static av_cold int v4l2_decode_close(AVCodecContext *avctx)
53503 {
53504-    V4L2m2mPriv *priv = avctx->priv_data;
53505-    V4L2m2mContext *s = priv->context;
53506+    int rv;
53507+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
53508+    rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
53509+    av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
53510+    return rv;
53511+}
53512+
53513+static void v4l2_decode_flush(AVCodecContext *avctx)
53514+{
53515+    // An alternative and more drastic form of flush is to simply do this:
53516+    //    v4l2_decode_close(avctx);
53517+    //    v4l2_decode_init(avctx);
53518+    // The downside is that this keeps a decoder open until all the frames
53519+    // associated with it have been returned.  This is a bit wasteful on
53520+    // possibly limited h/w resources and fails on a Pi for this reason unless
53521+    // more GPU mem is allocated than is the default.
53522+
53523+    V4L2m2mPriv * const priv = avctx->priv_data;
53524+    V4L2m2mContext * const s = priv->context;
53525+    V4L2Context * const output = &s->output;
53526+    V4L2Context * const capture = &s->capture;
53527+
53528+    av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
53529+
53530+    // Reflushing everything is benign, quick and avoids having to worry about
53531+    // states like EOS processing so don't try to optimize out (having got it
53532+    // wrong once)
53533+
53534+    ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
53535+
53536+    // Clear any buffered input packet
53537     av_packet_unref(&s->buf_pkt);
53538-    return ff_v4l2_m2m_codec_end(priv);
53539+
53540+    // Clear a pending EOS
53541+    if (ff_v4l2_ctx_eos(capture)) {
53542+        // Arguably we could delay this but this is easy and doesn't require
53543+        // thought or extra vars
53544+        ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
53545+        ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
53546+    }
53547+
53548+    // V4L2 makes no guarantees about whether decoded frames are flushed or not
53549+    // so mark all frames we are tracking to be discarded if they appear
53550+    xlat_flush(&s->xlat);
53551+
53552+    // resend extradata
53553+    s->extdata_sent = 0;
53554+    // clear EOS status vars
53555+    s->draining = 0;
53556+    output->done = 0;
53557+    capture->done = 0;
53558+
53559+    // Stream on will occur when we actually submit a new frame
53560+    av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
53561 }
53562
53563 #define OFFSET(x) offsetof(V4L2m2mPriv, x)
53564@@ -235,10 +858,16 @@ static av_cold int v4l2_decode_close(AVC
53565 static const AVOption options[] = {
53566     V4L_M2M_DEFAULT_OPTS,
53567     { "num_capture_buffers", "Number of buffers in the capture context",
53568-        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
53569+        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
53570+    { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
53571     { NULL},
53572 };
53573
53574+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
53575+    HW_CONFIG_INTERNAL(DRM_PRIME),
53576+    NULL
53577+};
53578+
53579 #define M2MDEC_CLASS(NAME) \
53580     static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
53581         .class_name = #NAME "_v4l2m2m_decoder", \
53582@@ -259,9 +888,15 @@ static const AVOption options[] = {
53583         .init           = v4l2_decode_init, \
53584         .receive_frame  = v4l2_receive_frame, \
53585         .close          = v4l2_decode_close, \
53586+        .flush          = v4l2_decode_flush, \
53587         .bsfs           = bsf_name, \
53588         .capabilities   = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
53589-        .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS, \
53590+        .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
53591+        .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
53592+                                                         AV_PIX_FMT_NV12, \
53593+                                                         AV_PIX_FMT_YUV420P, \
53594+                                                         AV_PIX_FMT_NONE}, \
53595+        .hw_configs     = v4l2_m2m_hw_configs, \
53596         .wrapper_name   = "v4l2m2m", \
53597     }
53598
53599--- a/libavcodec/v4l2_m2m_enc.c
53600+++ b/libavcodec/v4l2_m2m_enc.c
53601@@ -24,6 +24,8 @@
53602 #include <linux/videodev2.h>
53603 #include <sys/ioctl.h>
53604 #include <search.h>
53605+#include <drm_fourcc.h>
53606+
53607 #include "libavcodec/avcodec.h"
53608 #include "libavcodec/internal.h"
53609 #include "libavutil/pixdesc.h"
53610@@ -37,6 +39,34 @@
53611 #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
53612 #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
53613
53614+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
53615+// in the future but until then...
53616+#ifndef DRM_FORMAT_P030
53617+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
53618+#endif
53619+
53620+#ifndef DRM_FORMAT_NV15
53621+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
53622+#endif
53623+
53624+#ifndef DRM_FORMAT_NV20
53625+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
53626+#endif
53627+
53628+#ifndef V4L2_CID_CODEC_BASE
53629+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
53630+#endif
53631+
53632+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
53633+// in videodev2.h and hopefully will be sometime in the future but until then...
53634+#ifndef V4L2_PIX_FMT_NV12_10_COL128
53635+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
53636+#endif
53637+
53638+#ifndef V4L2_PIX_FMT_NV12_COL128
53639+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
53640+#endif
53641+
53642 static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
53643 {
53644     struct v4l2_streamparm parm = { 0 };
53645@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro
53646 static int v4l2_check_b_frame_support(V4L2m2mContext *s)
53647 {
53648     if (s->avctx->max_b_frames)
53649-        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
53650+        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
53651
53652-    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
53653+    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
53654     v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
53655     if (s->avctx->max_b_frames == 0)
53656         return 0;
53657
53658     avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
53659-
53660     return AVERROR_PATCHWELCOME;
53661 }
53662
53663@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC
53664     return 0;
53665 }
53666
53667+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
53668+{
53669+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
53670+
53671+    const uint32_t drm_fmt = src->layers[0].format;
53672+    // Treat INVALID as LINEAR
53673+    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
53674+        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
53675+    uint32_t pix_fmt = 0;
53676+    uint32_t w = 0;
53677+    uint32_t h = 0;
53678+    uint32_t bpl = src->layers[0].planes[0].pitch;
53679+
53680+    // We really don't expect multiple layers
53681+    // All formats that we currently cope with are single object
53682+
53683+    if (src->nb_layers != 1 || src->nb_objects != 1)
53684+        return AVERROR(EINVAL);
53685+
53686+    switch (drm_fmt) {
53687+        case DRM_FORMAT_YUV420:
53688+            if (mod == DRM_FORMAT_MOD_LINEAR) {
53689+                if (src->layers[0].nb_planes != 3)
53690+                    break;
53691+                pix_fmt = V4L2_PIX_FMT_YUV420;
53692+                h = src->layers[0].planes[1].offset / bpl;
53693+                w = bpl;
53694+            }
53695+            break;
53696+
53697+        case DRM_FORMAT_NV12:
53698+            if (mod == DRM_FORMAT_MOD_LINEAR) {
53699+                if (src->layers[0].nb_planes != 2)
53700+                    break;
53701+                pix_fmt = V4L2_PIX_FMT_NV12;
53702+                h = src->layers[0].planes[1].offset / bpl;
53703+                w = bpl;
53704+            }
53705+            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
53706+                if (src->layers[0].nb_planes != 2)
53707+                    break;
53708+                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
53709+                w = bpl;
53710+                h = src->layers[0].planes[1].offset / 128;
53711+                bpl = fourcc_mod_broadcom_param(mod);
53712+            }
53713+            break;
53714+
53715+        case DRM_FORMAT_P030:
53716+            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
53717+                if (src->layers[0].nb_planes != 2)
53718+                    break;
53719+                pix_fmt =  V4L2_PIX_FMT_NV12_10_COL128;
53720+                w = bpl / 2;  // Matching lie to how we construct this
53721+                h = src->layers[0].planes[1].offset / 128;
53722+                bpl = fourcc_mod_broadcom_param(mod);
53723+            }
53724+            break;
53725+
53726+        default:
53727+            break;
53728+    }
53729+
53730+    if (!pix_fmt)
53731+        return AVERROR(EINVAL);
53732+
53733+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
53734+        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
53735+
53736+        pix->width = w;
53737+        pix->height = h;
53738+        pix->pixelformat = pix_fmt;
53739+        pix->plane_fmt[0].bytesperline = bpl;
53740+        pix->num_planes = 1;
53741+    }
53742+    else {
53743+        struct v4l2_pix_format *const pix = &format->fmt.pix;
53744+
53745+        pix->width = w;
53746+        pix->height = h;
53747+        pix->pixelformat = pix_fmt;
53748+        pix->bytesperline = bpl;
53749+    }
53750+
53751+    return 0;
53752+}
53753+
53754+// Do we have similar enough formats to be usable?
53755+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
53756+{
53757+    if (a->type != b->type)
53758+        return 0;
53759+
53760+    if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
53761+        const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
53762+        const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
53763+        unsigned int i;
53764+        if (pa->pixelformat != pb->pixelformat ||
53765+            pa->num_planes != pb->num_planes)
53766+            return 0;
53767+        for (i = 0; i != pa->num_planes; ++i) {
53768+            if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
53769+                return 0;
53770+        }
53771+    }
53772+    else {
53773+        const struct v4l2_pix_format *const pa = &a->fmt.pix;
53774+        const struct v4l2_pix_format *const pb = &b->fmt.pix;
53775+        if (pa->pixelformat != pb->pixelformat ||
53776+            pa->bytesperline != pb->bytesperline)
53777+            return 0;
53778+    }
53779+    return 1;
53780+}
53781+
53782+
53783 static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
53784 {
53785     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
53786     V4L2Context *const output = &s->output;
53787
53788+    // Signal EOF if needed
53789+    if (!frame) {
53790+        return ff_v4l2_context_enqueue_frame(output, frame);
53791+    }
53792+
53793+    if (s->input_drm && !output->streamon) {
53794+        int rv;
53795+        struct v4l2_format req_format = {.type = output->format.type};
53796+
53797+        // Set format when we first get a buffer
53798+        if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
53799+            av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
53800+            return rv;
53801+        }
53802+
53803+        ff_v4l2_context_release(output);
53804+
53805+        output->format = req_format;
53806+
53807+        if ((rv = ff_v4l2_context_set_format(output)) != 0) {
53808+            av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
53809+            return rv;
53810+        }
53811+
53812+        if (!fmt_eq(&req_format, &output->format)) {
53813+            av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
53814+            return AVERROR(EINVAL);
53815+        }
53816+
53817+        output->selection.top = frame->crop_top;
53818+        output->selection.left = frame->crop_left;
53819+        output->selection.width = av_frame_cropped_width(frame);
53820+        output->selection.height = av_frame_cropped_height(frame);
53821+
53822+        if ((rv = ff_v4l2_context_init(output)) != 0) {
53823+            av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
53824+            return rv;
53825+        }
53826+
53827+        {
53828+            struct v4l2_selection selection = {
53829+                .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
53830+                .target = V4L2_SEL_TGT_CROP,
53831+                .r = output->selection
53832+            };
53833+            if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
53834+                av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
53835+                       selection.r.width, selection.r.height, selection.r.left, selection.r.top,
53836+                       av_err2str(AVERROR(errno)));
53837+            }
53838+            av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
53839+                   selection.r.width, selection.r.height, selection.r.left, selection.r.top);
53840+        }
53841+    }
53842+
53843 #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
53844-    if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
53845+    if (frame->pict_type == AV_PICTURE_TYPE_I)
53846         v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
53847 #endif
53848
53849@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo
53850     }
53851
53852 dequeue:
53853-    return ff_v4l2_context_dequeue_packet(capture, avpkt);
53854+    if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
53855+        return ret;
53856+
53857+    if (capture->first_buf == 1) {
53858+        uint8_t * data;
53859+        const int len = avpkt->size;
53860+
53861+        // 1st buffer after streamon should be SPS/PPS
53862+        capture->first_buf = 2;
53863+
53864+        // Clear both possible stores so there is no chance of confusion
53865+        av_freep(&s->extdata_data);
53866+        s->extdata_size = 0;
53867+        av_freep(&avctx->extradata);
53868+        avctx->extradata_size = 0;
53869+
53870+        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
53871+            memcpy(data, avpkt->data, len);
53872+
53873+        av_packet_unref(avpkt);
53874+
53875+        if (data == NULL)
53876+            return AVERROR(ENOMEM);
53877+
53878+        // We need to copy the header, but keep local if not global
53879+        if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
53880+            avctx->extradata = data;
53881+            avctx->extradata_size = len;
53882+        }
53883+        else {
53884+            s->extdata_data = data;
53885+            s->extdata_size = len;
53886+        }
53887+
53888+        if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
53889+            return ret;
53890+    }
53891+
53892+    // First frame must be key so mark as such even if encoder forgot
53893+    if (capture->first_buf == 2)
53894+        avpkt->flags |= AV_PKT_FLAG_KEY;
53895+
53896+    // Add SPS/PPS to the start of every key frame if non-global headers
53897+    if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
53898+        const size_t newlen = s->extdata_size + avpkt->size;
53899+        AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
53900+
53901+        if (buf == NULL) {
53902+            av_packet_unref(avpkt);
53903+            return AVERROR(ENOMEM);
53904+        }
53905+
53906+        memcpy(buf->data, s->extdata_data, s->extdata_size);
53907+        memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
53908+
53909+        av_buffer_unref(&avpkt->buf);
53910+        avpkt->buf = buf;
53911+        avpkt->data = buf->data;
53912+        avpkt->size = newlen;
53913+    }
53914+
53915+//    av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
53916+    capture->first_buf = 0;
53917+    return 0;
53918 }
53919
53920 static av_cold int v4l2_encode_init(AVCodecContext *avctx)
53921@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo
53922     uint32_t v4l2_fmt_output;
53923     int ret;
53924
53925+    av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
53926+
53927     ret = ff_v4l2_m2m_create_context(priv, &s);
53928     if (ret < 0)
53929         return ret;
53930@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo
53931     capture = &s->capture;
53932     output  = &s->output;
53933
53934+    s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
53935+
53936     /* common settings output/capture */
53937     output->height = capture->height = avctx->height;
53938     output->width = capture->width = avctx->width;
53939
53940     /* output context */
53941     output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
53942-    output->av_pix_fmt = avctx->pix_fmt;
53943+    output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
53944+            avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
53945+            AV_PIX_FMT_YUV420P;
53946
53947     /* capture context */
53948     capture->av_codec_id = avctx->codec_id;
53949@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo
53950         v4l2_fmt_output = output->format.fmt.pix.pixelformat;
53951
53952     pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
53953-    if (pix_fmt_output != avctx->pix_fmt) {
53954+    if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
53955         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
53956         av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
53957         return AVERROR(EINVAL);
53958--- /dev/null
53959+++ b/libavcodec/v4l2_req_decode_q.c
53960@@ -0,0 +1,84 @@
53961+#include <memory.h>
53962+#include <semaphore.h>
53963+#include <pthread.h>
53964+
53965+#include "v4l2_req_decode_q.h"
53966+
53967+int decode_q_in_q(const req_decode_ent * const d)
53968+{
53969+    return d->in_q;
53970+}
53971+
53972+void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
53973+{
53974+    pthread_mutex_lock(&q->q_lock);
53975+    if (!q->head) {
53976+        q->head = d;
53977+        q->tail = d;
53978+        d->prev = NULL;
53979+    }
53980+    else {
53981+        q->tail->next = d;
53982+        d->prev = q->tail;
53983+        q->tail = d;
53984+    }
53985+    d->next = NULL;
53986+    d->in_q = 1;
53987+    pthread_mutex_unlock(&q->q_lock);
53988+}
53989+
53990+// Remove entry from Q - if head, wake up anything that was waiting
53991+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
53992+{
53993+    int try_signal = 0;
53994+
53995+    if (!d->in_q)
53996+        return;
53997+
53998+    pthread_mutex_lock(&q->q_lock);
53999+    if (d->prev)
54000+        d->prev->next = d->next;
54001+    else {
54002+        try_signal = 1;  // Only need to signal if we were head
54003+        q->head = d->next;
54004+    }
54005+
54006+    if (d->next)
54007+        d->next->prev = d->prev;
54008+    else
54009+        q->tail = d->prev;
54010+
54011+    // Not strictly needed but makes debug easier
54012+    d->next = NULL;
54013+    d->prev = NULL;
54014+    d->in_q = 0;
54015+    pthread_mutex_unlock(&q->q_lock);
54016+
54017+    if (try_signal)
54018+        pthread_cond_broadcast(&q->q_cond);
54019+}
54020+
54021+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
54022+{
54023+    pthread_mutex_lock(&q->q_lock);
54024+
54025+    while (q->head != d)
54026+        pthread_cond_wait(&q->q_cond, &q->q_lock);
54027+
54028+    pthread_mutex_unlock(&q->q_lock);
54029+}
54030+
54031+void decode_q_uninit(req_decode_q * const q)
54032+{
54033+    pthread_mutex_destroy(&q->q_lock);
54034+    pthread_cond_destroy(&q->q_cond);
54035+}
54036+
54037+void decode_q_init(req_decode_q * const q)
54038+{
54039+    memset(q, 0, sizeof(*q));
54040+    pthread_mutex_init(&q->q_lock, NULL);
54041+    pthread_cond_init(&q->q_cond, NULL);
54042+}
54043+
54044+
54045--- /dev/null
54046+++ b/libavcodec/v4l2_req_decode_q.h
54047@@ -0,0 +1,25 @@
54048+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
54049+#define AVCODEC_V4L2_REQ_DECODE_Q_H
54050+
54051+typedef struct req_decode_ent {
54052+    struct req_decode_ent * next;
54053+    struct req_decode_ent * prev;
54054+    int in_q;
54055+} req_decode_ent;
54056+
54057+typedef struct req_decode_q {
54058+    pthread_mutex_t q_lock;
54059+    pthread_cond_t q_cond;
54060+    req_decode_ent * head;
54061+    req_decode_ent * tail;
54062+} req_decode_q;
54063+
54064+int decode_q_in_q(const req_decode_ent * const d);
54065+void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
54066+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
54067+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
54068+void decode_q_uninit(req_decode_q * const q);
54069+void decode_q_init(req_decode_q * const q);
54070+
54071+#endif
54072+
54073--- /dev/null
54074+++ b/libavcodec/v4l2_req_devscan.c
54075@@ -0,0 +1,449 @@
54076+#include <errno.h>
54077+#include <fcntl.h>
54078+#include <libudev.h>
54079+#include <stdlib.h>
54080+#include <string.h>
54081+#include <unistd.h>
54082+
54083+#include <sys/ioctl.h>
54084+#include <sys/sysmacros.h>
54085+
54086+#include <linux/media.h>
54087+#include <linux/videodev2.h>
54088+
54089+#include "v4l2_req_devscan.h"
54090+#include "v4l2_req_utils.h"
54091+
54092+struct decdev {
54093+    enum v4l2_buf_type src_type;
54094+    uint32_t src_fmt_v4l2;
54095+    const char * vname;
54096+    const char * mname;
54097+};
54098+
54099+struct devscan {
54100+    struct decdev env;
54101+    unsigned int dev_size;
54102+    unsigned int dev_count;
54103+    struct decdev *devs;
54104+};
54105+
54106+static int video_src_pixfmt_supported(uint32_t fmt)
54107+{
54108+    return 1;
54109+}
54110+
54111+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
54112+                  unsigned int width, unsigned int height,
54113+                  unsigned int pixelformat)
54114+{
54115+    unsigned int sizeimage;
54116+
54117+    memset(format, 0, sizeof(*format));
54118+    format->type = type;
54119+
54120+    sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
54121+
54122+    if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
54123+        format->fmt.pix_mp.width = width;
54124+        format->fmt.pix_mp.height = height;
54125+        format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
54126+        format->fmt.pix_mp.pixelformat = pixelformat;
54127+    } else {
54128+        format->fmt.pix.width = width;
54129+        format->fmt.pix.height = height;
54130+        format->fmt.pix.sizeimage = sizeimage;
54131+        format->fmt.pix.pixelformat = pixelformat;
54132+    }
54133+}
54134+
54135+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
54136+            unsigned int width, unsigned int height)
54137+{
54138+    struct v4l2_format format;
54139+
54140+    v4l2_setup_format(&format, type, width, height, pixelformat);
54141+
54142+    return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
54143+}
54144+
54145+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
54146+{
54147+    struct v4l2_capability capability = { 0 };
54148+    int rc;
54149+
54150+    rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
54151+    if (rc < 0)
54152+        return -errno;
54153+
54154+    if (capabilities != NULL) {
54155+        if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
54156+            *capabilities = capability.device_caps;
54157+        else
54158+            *capabilities = capability.capabilities;
54159+    }
54160+
54161+    return 0;
54162+}
54163+
54164+static int devscan_add(struct devscan *const scan,
54165+                       enum v4l2_buf_type src_type,
54166+                       uint32_t src_fmt_v4l2,
54167+                       const char * vname,
54168+                       const char * mname)
54169+{
54170+    struct decdev *d;
54171+
54172+    if (scan->dev_size <= scan->dev_count) {
54173+        unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2;
54174+        d = realloc(scan->devs, n * sizeof(*d));
54175+        if (!d)
54176+            return -ENOMEM;
54177+        scan->devs = d;
54178+        scan->dev_size = n;
54179+    }
54180+
54181+    d = scan->devs + scan->dev_count;
54182+    d->src_type = src_type;
54183+    d->src_fmt_v4l2 = src_fmt_v4l2;
54184+    d->vname = strdup(vname);
54185+    if (!d->vname)
54186+        return -ENOMEM;
54187+    d->mname = strdup(mname);
54188+    if (!d->mname) {
54189+        free((char *)d->vname);
54190+        return -ENOMEM;
54191+    }
54192+    ++scan->dev_count;
54193+    return 0;
54194+}
54195+
54196+void devscan_delete(struct devscan **const pScan)
54197+{
54198+    unsigned int i;
54199+    struct devscan * const scan = *pScan;
54200+
54201+    if (!scan)
54202+        return;
54203+    *pScan = NULL;
54204+
54205+    for (i = 0; i < scan->dev_count; ++i) {
54206+        free((char*)scan->devs[i].mname);
54207+        free((char*)scan->devs[i].vname);
54208+    }
54209+    free(scan->devs);
54210+    free(scan);
54211+}
54212+
54213+#define REQ_BUF_CAPS (\
54214+    V4L2_BUF_CAP_SUPPORTS_DMABUF |\
54215+    V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
54216+    V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
54217+
54218+static void probe_formats(void * const dc,
54219+              struct devscan *const scan,
54220+              const int fd,
54221+              const unsigned int type_v4l2,
54222+              const char *const mpath,
54223+              const char *const vpath)
54224+{
54225+    unsigned int i;
54226+    for (i = 0;; ++i) {
54227+        struct v4l2_fmtdesc fmtdesc = {
54228+            .index = i,
54229+            .type = type_v4l2
54230+        };
54231+        struct v4l2_requestbuffers rbufs = {
54232+            .count = 0,
54233+            .type = type_v4l2,
54234+            .memory = V4L2_MEMORY_MMAP
54235+        };
54236+        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
54237+            if (errno == EINTR)
54238+                continue;
54239+            if (errno != EINVAL)
54240+                request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
54241+            return;
54242+        }
54243+        if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
54244+            continue;
54245+
54246+        if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
54247+            request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
54248+            continue;
54249+        }
54250+
54251+        while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {
54252+            if (errno == EINTR)
54253+                continue;
54254+            request_debug(dc, "%s: Reqbufs failed\n", vpath);
54255+            break; /* rbufs.capabilities stays 0 so caps check below skips fmt */
54256+        }
54257+
54258+        if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
54259+            request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
54260+            continue;
54261+        }
54262+
54263+        request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
54264+                 mpath, vpath, fmtdesc.pixelformat, type_v4l2);
54265+        devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);
54266+    }
54267+}
54268+
54269+
54270+static int probe_video_device(void * const dc,
54271+                   struct udev_device *const device,
54272+                   struct devscan *const scan,
54273+                   const char *const mpath)
54274+{
54275+    int ret;
54276+    unsigned int capabilities = 0;
54277+    int video_fd = -1;
54278+
54279+    const char *path = udev_device_get_devnode(device);
54280+    if (!path) {
54281+        request_err(dc, "%s: get video device devnode failed\n", __func__);
54282+        ret = -EINVAL;
54283+        goto fail;
54284+    }
54285+
54286+    video_fd = open(path, O_RDWR, 0);
54287+    if (video_fd == -1) {
54288+        ret = -errno;
54289+        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
54290+        goto fail;
54291+    }
54292+
54293+    ret = v4l2_query_capabilities(video_fd, &capabilities);
54294+    if (ret < 0) {
54295+        request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
54296+        goto fail;
54297+    }
54298+
54299+    request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
54300+
54301+    if (!(capabilities & V4L2_CAP_STREAMING)) {
54302+        request_debug(dc, "%s: missing required streaming capability\n", __func__);
54303+        ret = -EINVAL;
54304+        goto fail;
54305+    }
54306+
54307+    if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
54308+        request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
54309+        ret = -EINVAL;
54310+        goto fail;
54311+    }
54312+
54313+    /* Should check capture formats too... */
54314+    if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
54315+        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
54316+    if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
54317+        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
54318+
54319+    close(video_fd);
54320+    return 0;
54321+
54322+fail:
54323+    if (video_fd >= 0)
54324+        close(video_fd);
54325+    return ret;
54326+}
54327+
54328+static int probe_media_device(void * const dc,
54329+                   struct udev_device *const device,
54330+                   struct devscan *const scan)
54331+{
54332+    int ret = 0; /* default OK: interface loop may match no V4L video node */
54333+    int rv;
54334+    struct media_device_info device_info = { 0 };
54335+    struct media_v2_topology topology = { 0 };
54336+    struct media_v2_interface *interfaces = NULL;
54337+    struct udev *udev = udev_device_get_udev(device);
54338+    struct udev_device *video_device;
54339+    dev_t devnum;
54340+    int media_fd = -1;
54341+
54342+    const char *path = udev_device_get_devnode(device);
54343+    if (!path) {
54344+        request_err(dc, "%s: get media device devnode failed\n", __func__);
54345+        ret = -EINVAL;
54346+        goto fail;
54347+    }
54348+
54349+    media_fd = open(path, O_RDWR, 0);
54350+    if (media_fd < 0) {
54351+        ret = -errno;
54352+        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
54353+        goto fail;
54354+    }
54355+
54356+    rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
54357+    if (rv < 0) {
54358+        ret = -errno;
54359+        request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
54360+        goto fail;
54361+    }
54362+
54363+    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
54364+    if (rv < 0) {
54365+        ret = -errno;
54366+        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
54367+        goto fail;
54368+    }
54369+
54370+    if (topology.num_interfaces <= 0) {
54371+        request_err(dc, "%s: media device has no interfaces\n", __func__);
54372+        ret = -EINVAL;
54373+        goto fail;
54374+    }
54375+
54376+    interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
54377+    if (!interfaces) {
54378+        request_err(dc, "%s: allocating media interface struct failed\n", __func__);
54379+        ret = -ENOMEM;
54380+        goto fail;
54381+    }
54382+
54383+    topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
54384+    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
54385+    if (rv < 0) {
54386+        ret = -errno;
54387+        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
54388+        goto fail;
54389+    }
54390+
54391+    for (int i = 0; i < topology.num_interfaces; i++) {
54392+        if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
54393+            continue;
54394+
54395+        devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
54396+        video_device = udev_device_new_from_devnum(udev, 'c', devnum);
54397+        if (!video_device) {
54398+            ret = -errno;
54399+            request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
54400+            continue;
54401+        }
54402+
54403+        ret = probe_video_device(dc, video_device, scan, path);
54404+        udev_device_unref(video_device);
54405+
54406+        if (ret != 0)
54407+            goto fail;
54408+    }
54409+
54410+fail:
54411+    free(interfaces);
54412+    if (media_fd != -1)
54413+        close(media_fd);
54414+    return ret;
54415+}
54416+
54417+const char *decdev_media_path(const struct decdev *const dev)
54418+{
54419+    return !dev ? NULL : dev->mname;
54420+}
54421+
54422+const char *decdev_video_path(const struct decdev *const dev)
54423+{
54424+    return !dev ? NULL : dev->vname;
54425+}
54426+
54427+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
54428+{
54429+    return !dev ? 0 : dev->src_type;
54430+}
54431+
54432+uint32_t decdev_src_pixelformat(const struct decdev *const dev)
54433+{
54434+    return !dev ? 0 : dev->src_fmt_v4l2;
54435+}
54436+
54437+
54438+const struct decdev *devscan_find(struct devscan *const scan,
54439+                  const uint32_t src_fmt_v4l2)
54440+{
54441+    unsigned int i;
54442+
54443+    if (scan->env.mname && scan->env.vname)
54444+        return &scan->env;
54445+
54446+    if (!src_fmt_v4l2)
54447+        return scan->dev_count ? scan->devs + 0 : NULL;
54448+
54449+    for (i = 0; i != scan->dev_count; ++i) {
54450+        if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
54451+            return scan->devs + i;
54452+    }
54453+    return NULL;
54454+}
54455+
54456+int devscan_build(void * const dc, struct devscan **pscan)
54457+{
54458+    int ret;
54459+    struct udev *udev = NULL; /* NULL so fail: before udev_new() is safe */
54460+    struct udev_enumerate *enumerate;
54461+    struct udev_list_entry *devices;
54462+    struct udev_list_entry *entry;
54463+    struct udev_device *device;
54464+    struct devscan * scan;
54465+
54466+    *pscan = NULL;
54467+
54468+    scan = calloc(1, sizeof(*scan));
54469+    if (!scan) {
54470+        ret = -ENOMEM;
54471+        goto fail;
54472+    }
54473+
54474+    scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
54475+    scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
54476+    if (scan->env.mname && scan->env.vname) {
54477+        request_info(dc, "Media/video device env overrides found: %s,%s\n",
54478+                 scan->env.mname, scan->env.vname);
54479+        *pscan = scan;
54480+        return 0;
54481+    }
54482+
54483+    udev = udev_new();
54484+    if (!udev) {
54485+        request_err(dc, "%s: allocating udev context failed\n", __func__);
54486+        ret = -ENOMEM;
54487+        goto fail;
54488+    }
54489+
54490+    enumerate = udev_enumerate_new(udev);
54491+    if (!enumerate) {
54492+        request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
54493+        ret = -ENOMEM;
54494+        goto fail;
54495+    }
54496+
54497+    udev_enumerate_add_match_subsystem(enumerate, "media");
54498+    udev_enumerate_scan_devices(enumerate);
54499+
54500+    devices = udev_enumerate_get_list_entry(enumerate);
54501+    udev_list_entry_foreach(entry, devices) {
54502+        const char *path = udev_list_entry_get_name(entry);
54503+        if (!path)
54504+            continue;
54505+
54506+        device = udev_device_new_from_syspath(udev, path);
54507+        if (!device)
54508+            continue;
54509+
54510+        probe_media_device(dc, device, scan);
54511+        udev_device_unref(device);
54512+    }
54513+
54514+    udev_enumerate_unref(enumerate);
54515+
54516+    *pscan = scan;
54517+    return 0;
54518+
54519+fail:
54520+    udev_unref(udev);
54521+    devscan_delete(&scan);
54522+    return ret;
54523+}
54524+
54525--- /dev/null
54526+++ b/libavcodec/v4l2_req_devscan.h
54527@@ -0,0 +1,21 @@
54528+#ifndef _DEVSCAN_H_
54529+#define _DEVSCAN_H_
54530+
54531+struct devscan;
54532+struct decdev;
54533+enum v4l2_buf_type;
54534+
54535+/* These return pointers to data in the devscan structure and so are valid
54536+ * for the lifetime of that
54537+ */
54538+const char *decdev_media_path(const struct decdev *const dev);
54539+const char *decdev_video_path(const struct decdev *const dev);
54540+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
54541+uint32_t decdev_src_pixelformat(const struct decdev *const dev);
54542+
54543+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
54544+
54545+int devscan_build(void * const dc, struct devscan **pscan);
54546+void devscan_delete(struct devscan **const pScan);
54547+
54548+#endif
54549--- /dev/null
54550+++ b/libavcodec/v4l2_req_dmabufs.c
54551@@ -0,0 +1,266 @@
54552+#include <stdio.h>
54553+#include <stdlib.h>
54554+#include <unistd.h>
54555+#include <inttypes.h>
54556+#include <fcntl.h>
54557+#include <errno.h>
54558+#include <string.h>
54559+#include <sys/ioctl.h>
54560+#include <sys/mman.h>
54561+#include <linux/mman.h>
54562+#include <linux/dma-buf.h>
54563+#include <linux/dma-heap.h>
54564+
54565+#include "v4l2_req_dmabufs.h"
54566+#include "v4l2_req_utils.h"
54567+
54568+#define DMABUF_NAME1  "/dev/dma_heap/linux,cma"
54569+#define DMABUF_NAME2  "/dev/dma_heap/reserved"
54570+
54571+#define TRACE_ALLOC 0
54572+
54573+struct dmabufs_ctl {
54574+    int fd;
54575+    size_t page_size;
54576+};
54577+
54578+struct dmabuf_h {
54579+    int fd;
54580+    size_t size;
54581+    size_t len;
54582+    void * mapptr;
54583+};
54584+
54585+#if TRACE_ALLOC
54586+static unsigned int total_bufs = 0;
54587+static size_t total_size = 0;
54588+#endif
54589+
54590+struct dmabuf_h * dmabuf_import(int fd, size_t size)
54591+{
54592+    struct dmabuf_h *dh;
54593+
54594+    fd = size == 0 ? -1 : dup(fd); /* don't dup (and leak) on size==0 */
54595+    if (fd < 0)
54596+        return NULL;
54597+
54598+    dh = malloc(sizeof(*dh));
54599+    if (!dh) {
54600+        close(fd);
54601+        return NULL;
54602+    }
54603+
54604+    *dh = (struct dmabuf_h) {
54605+        .fd = fd,
54606+        .size = size,
54607+        .mapptr = MAP_FAILED
54608+    };
54609+
54610+#if TRACE_ALLOC
54611+    ++total_bufs;
54612+    total_size += dh->size;
54613+    request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
54614+#endif
54615+
54616+    return dh;
54617+}
54618+
54619+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
54620+{
54621+    struct dmabuf_h * dh;
54622+    struct dma_heap_allocation_data data = {
54623+        .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
54624+        .fd = 0,
54625+        .fd_flags = O_RDWR,
54626+        .heap_flags = 0
54627+    };
54628+
54629+    if (old != NULL) {
54630+        if (old->size == data.len) {
54631+            return old;
54632+        }
54633+        dmabuf_free(old);
54634+    }
54635+
54636+    if (size == 0 ||
54637+        (dh = malloc(sizeof(*dh))) == NULL)
54638+        return NULL;
54639+
54640+    while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
54641+        int err = errno;
54642+        request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
54643+                (uint64_t)data.len,
54644+                dbsc->fd,
54645+                err,
54646+                strerror(err));
54647+        if (err == EINTR)
54648+            continue;
54649+        goto fail;
54650+    }
54651+
54652+    *dh = (struct dmabuf_h){
54653+        .fd = data.fd,
54654+        .size = (size_t)data.len,
54655+        .mapptr = MAP_FAILED
54656+    };
54657+
54658+#if TRACE_ALLOC
54659+    ++total_bufs;
54660+    total_size += dh->size;
54661+    request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
54662+#endif
54663+
54664+    return dh;
54665+
54666+fail:
54667+    free(dh);
54668+    return NULL;
54669+}
54670+
54671+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
54672+{
54673+    struct dma_buf_sync sync = {
54674+        .flags = flags
54675+    };
54676+    while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
54677+        const int err = errno;
54678+        if (errno == EINTR)
54679+            continue;
54680+        request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
54681+        return -err;
54682+    }
54683+    return 0;
54684+}
54685+
54686+int dmabuf_write_start(struct dmabuf_h * const dh)
54687+{
54688+    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
54689+}
54690+
54691+int dmabuf_write_end(struct dmabuf_h * const dh)
54692+{
54693+    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
54694+}
54695+
54696+int dmabuf_read_start(struct dmabuf_h * const dh)
54697+{
54698+    if (!dmabuf_map(dh))
54699+        return -1;
54700+    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
54701+}
54702+
54703+int dmabuf_read_end(struct dmabuf_h * const dh)
54704+{
54705+    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
54706+}
54707+
54708+
54709+void * dmabuf_map(struct dmabuf_h * const dh)
54710+{
54711+    if (!dh)
54712+        return NULL;
54713+    if (dh->mapptr != MAP_FAILED)
54714+        return dh->mapptr;
54715+    dh->mapptr = mmap(NULL, dh->size,
54716+              PROT_READ | PROT_WRITE,
54717+              MAP_SHARED | MAP_POPULATE,
54718+              dh->fd, 0);
54719+    if (dh->mapptr == MAP_FAILED) {
54720+        request_log("%s: Map failed\n", __func__);
54721+        return NULL;
54722+    }
54723+    return dh->mapptr;
54724+}
54725+
54726+int dmabuf_fd(const struct dmabuf_h * const dh)
54727+{
54728+    if (!dh)
54729+        return -1;
54730+    return dh->fd;
54731+}
54732+
54733+size_t dmabuf_size(const struct dmabuf_h * const dh)
54734+{
54735+    if (!dh)
54736+        return 0;
54737+    return dh->size;
54738+}
54739+
54740+size_t dmabuf_len(const struct dmabuf_h * const dh)
54741+{
54742+    if (!dh)
54743+        return 0;
54744+    return dh->len;
54745+}
54746+
54747+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
54748+{
54749+    dh->len = len;
54750+}
54751+
54752+
54753+
54754+void dmabuf_free(struct dmabuf_h * dh)
54755+{
54756+    if (!dh)
54757+        return;
54758+
54759+#if TRACE_ALLOC
54760+    --total_bufs;
54761+    total_size -= dh->size;
54762+    request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
54763+#endif
54764+
54765+    if (dh->mapptr != MAP_FAILED)
54766+        munmap(dh->mapptr, dh->size);
54767+    while (close(dh->fd) == -1 && errno == EINTR)
54768+        /* loop */;
54769+    free(dh);
54770+}
54771+
54772+struct dmabufs_ctl * dmabufs_ctl_new(void)
54773+{
54774+    struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
54775+
54776+    if (!dbsc)
54777+        return NULL;
54778+
54779+    while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
54780+           errno == EINTR)
54781+        /* Loop */;
54782+
54783+    if (dbsc->fd == -1) {
54784+        while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
54785+               errno == EINTR)
54786+            /* Loop */;
54787+        if (dbsc->fd == -1) {
54788+            request_log("Unable to open either %s or %s\n",
54789+                    DMABUF_NAME1, DMABUF_NAME2);
54790+            goto fail;
54791+        }
54792+    }
54793+
54794+    dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
54795+
54796+    return dbsc;
54797+
54798+fail:
54799+    free(dbsc);
54800+    return NULL;
54801+}
54802+
54803+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
54804+{
54805+    struct dmabufs_ctl * const dbsc = *pDbsc;
54806+
54807+    if (!dbsc)
54808+        return;
54809+    *pDbsc = NULL;
54810+
54811+    while (close(dbsc->fd) == -1 && errno == EINTR)
54812+        /* loop */;
54813+
54814+    free(dbsc);
54815+}
54816+
54817+
54818--- /dev/null
54819+++ b/libavcodec/v4l2_req_dmabufs.h
54820@@ -0,0 +1,38 @@
54821+#ifndef DMABUFS_H
54822+#define DMABUFS_H
54823+
54824+struct dmabufs_ctl;
54825+struct dmabuf_h;
54826+
54827+struct dmabufs_ctl * dmabufs_ctl_new(void);
54828+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
54829+
54830+// Need not preserve old contents
54831+// On NULL return old buffer is freed
54832+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
54833+
54834+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
54835+    return dmabuf_realloc(dbsc, NULL, size);
54836+}
54837+/* Create from existing fd - dup()s the fd */
54838+struct dmabuf_h * dmabuf_import(int fd, size_t size);
54839+void * dmabuf_map(struct dmabuf_h * const dh);
54840+
54841+/* flags from linux/dma-buf.h DMA_BUF_SYNC_xxx */
54842+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
54843+
54844+int dmabuf_write_start(struct dmabuf_h * const dh);
54845+int dmabuf_write_end(struct dmabuf_h * const dh);
54846+int dmabuf_read_start(struct dmabuf_h * const dh);
54847+int dmabuf_read_end(struct dmabuf_h * const dh);
54848+
54849+int dmabuf_fd(const struct dmabuf_h * const dh);
54850+/* Allocated size */
54851+size_t dmabuf_size(const struct dmabuf_h * const dh);
54852+/* Bytes in use */
54853+size_t dmabuf_len(const struct dmabuf_h * const dh);
54854+/* Set bytes in use */
54855+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
54856+void dmabuf_free(struct dmabuf_h * dh);
54857+
54858+#endif
54859--- /dev/null
54860+++ b/libavcodec/v4l2_req_hevc_v1.c
54861@@ -0,0 +1,3 @@
54862+#define HEVC_CTRLS_VERSION 1
54863+#include "v4l2_req_hevc_vx.c"
54864+
54865--- /dev/null
54866+++ b/libavcodec/v4l2_req_hevc_v2.c
54867@@ -0,0 +1,3 @@
54868+#define HEVC_CTRLS_VERSION 2
54869+#include "v4l2_req_hevc_vx.c"
54870+
54871--- /dev/null
54872+++ b/libavcodec/v4l2_req_hevc_v3.c
54873@@ -0,0 +1,3 @@
54874+#define HEVC_CTRLS_VERSION 3
54875+#include "v4l2_req_hevc_vx.c"
54876+
54877--- /dev/null
54878+++ b/libavcodec/v4l2_req_hevc_v4.c
54879@@ -0,0 +1,3 @@
54880+#define HEVC_CTRLS_VERSION 4
54881+#include "v4l2_req_hevc_vx.c"
54882+
54883--- /dev/null
54884+++ b/libavcodec/v4l2_req_hevc_vx.c
54885@@ -0,0 +1,1365 @@
54886+// File included by v4l2_req_hevc_v* - not compiled on its own
54887+
54888+#include "decode.h"
54889+#include "hevcdec.h"
54890+#include "hwconfig.h"
54891+
54892+#if HEVC_CTRLS_VERSION == 1
54893+#include "hevc-ctrls-v1.h"
54894+
54895+// Fixup renamed entries
54896+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
54897+
54898+#elif HEVC_CTRLS_VERSION == 2
54899+#include "hevc-ctrls-v2.h"
54900+#elif HEVC_CTRLS_VERSION == 3
54901+#include "hevc-ctrls-v3.h"
54902+#elif HEVC_CTRLS_VERSION == 4
54903+#include <linux/v4l2-controls.h>
54904+#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
54905+#include "hevc-ctrls-v4.h"
54906+#endif
54907+#else
54908+#error Unknown HEVC_CTRLS_VERSION
54909+#endif
54910+
54911+#ifndef V4L2_CID_STATELESS_HEVC_SPS
54912+#define V4L2_CID_STATELESS_HEVC_SPS                     V4L2_CID_MPEG_VIDEO_HEVC_SPS
54913+#define V4L2_CID_STATELESS_HEVC_PPS                     V4L2_CID_MPEG_VIDEO_HEVC_PPS
54914+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS            V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
54915+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX          V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
54916+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS           V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
54917+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE             V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
54918+#define V4L2_CID_STATELESS_HEVC_START_CODE              V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
54919+
54920+#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED     V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
54921+#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED     V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
54922+#define V4L2_STATELESS_HEVC_START_CODE_NONE             V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
54923+#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B          V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
54924+#endif
54925+
54926+// Should be in videodev2 but we might not have a good enough one
54927+#ifndef V4L2_PIX_FMT_HEVC_SLICE
54928+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
54929+#endif
54930+
54931+#include "v4l2_request_hevc.h"
54932+
54933+#include "libavutil/hwcontext_drm.h"
54934+
54935+#include <semaphore.h>
54936+#include <pthread.h>
54937+
54938+#include "v4l2_req_devscan.h"
54939+#include "v4l2_req_dmabufs.h"
54940+#include "v4l2_req_pollqueue.h"
54941+#include "v4l2_req_media.h"
54942+#include "v4l2_req_utils.h"
54943+
54944+// Attached to buf[0] in frame
54945+// Pooled in hwcontext so generally create once - 1/frame
54946+typedef struct V4L2MediaReqDescriptor {
54947+    AVDRMFrameDescriptor drm;
54948+
54949+    // Media
54950+    uint64_t timestamp;
54951+    struct qent_dst * qe_dst;
54952+
54953+    // Decode only - should be NULL by the time we emit the frame
54954+    struct req_decode_ent decode_ent;
54955+
54956+    struct media_request *req;
54957+    struct qent_src *qe_src;
54958+
54959+#if HEVC_CTRLS_VERSION >= 2
54960+    struct v4l2_ctrl_hevc_decode_params dec;
54961+#endif
54962+
54963+    size_t num_slices;
54964+    size_t alloced_slices;
54965+    struct v4l2_ctrl_hevc_slice_params * slice_params;
54966+    struct slice_info * slices;
54967+
54968+    size_t num_offsets;
54969+    size_t alloced_offsets;
54970+    uint32_t *offsets;
54971+
54972+} V4L2MediaReqDescriptor;
54973+
54974+struct slice_info {
54975+    const uint8_t * ptr;
54976+    size_t len; // bytes
54977+    size_t n_offsets;
54978+};
54979+
54980+// Handy container for accumulating controls before setting
54981+struct req_controls {
54982+    int has_scaling;
54983+    struct timeval tv;
54984+    struct v4l2_ctrl_hevc_sps sps;
54985+    struct v4l2_ctrl_hevc_pps pps;
54986+    struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
54987+};
54988+
54989+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
54990+
54991+
54992+// Get an FFmpeg format from the v4l2 format
54993+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
54994+{
54995+    switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
54996+            format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) {
54997+    case V4L2_PIX_FMT_YUV420:
54998+        return AV_PIX_FMT_YUV420P;
54999+    case V4L2_PIX_FMT_NV12:
55000+        return AV_PIX_FMT_NV12;
55001+#if CONFIG_SAND
55002+    case V4L2_PIX_FMT_NV12_COL128:
55003+        return AV_PIX_FMT_RPI4_8;
55004+    case V4L2_PIX_FMT_NV12_10_COL128:
55005+        return AV_PIX_FMT_RPI4_10;
55006+#endif
55007+    default:
55008+        break;
55009+    }
55010+    return AV_PIX_FMT_NONE;
55011+}
55012+
55013+static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
55014+{
55015+    const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
55016+    return rd->timestamp;
55017+}
55018+
55019+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
55020+{
55021+    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
55022+    rd->timestamp = dpb_stamp;
55023+}
55024+
55025+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
55026+{
55027+    int32_t luma_weight_denom, chroma_weight_denom;
55028+    const SliceHeader *sh = &h->sh;
55029+
55030+    if (sh->slice_type == HEVC_SLICE_I ||
55031+        (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
55032+        (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
55033+        return;
55034+
55035+    table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
55036+
55037+    if (h->ps.sps->chroma_format_idc)
55038+        table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
55039+
55040+    luma_weight_denom = (1 << sh->luma_log2_weight_denom);
55041+    chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
55042+
55043+    for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
55044+        table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
55045+        table->luma_offset_l0[i] = sh->luma_offset_l0[i];
55046+        table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
55047+        table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
55048+        table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
55049+        table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
55050+    }
55051+
55052+    if (sh->slice_type != HEVC_SLICE_B)
55053+        return;
55054+
55055+    for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
55056+        table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
55057+        table->luma_offset_l1[i] = sh->luma_offset_l1[i];
55058+        table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
55059+        table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
55060+        table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
55061+        table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
55062+    }
55063+}
55064+
55065+#if HEVC_CTRLS_VERSION <= 2
55066+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
55067+{
55068+    const HEVCFrame *frame;
55069+    int i;
55070+
55071+    for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
55072+        frame = h->rps[ST_CURR_BEF].ref[i];
55073+        if (frame && timestamp == frame_capture_dpb(frame->frame))
55074+            return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE;
55075+    }
55076+
55077+    for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
55078+        frame = h->rps[ST_CURR_AFT].ref[i];
55079+        if (frame && timestamp == frame_capture_dpb(frame->frame))
55080+            return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER;
55081+    }
55082+
55083+    for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) {
55084+        frame = h->rps[LT_CURR].ref[i];
55085+        if (frame && timestamp == frame_capture_dpb(frame->frame))
55086+            return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
55087+    }
55088+
55089+    return 0;
55090+}
55091+#endif
55092+
55093+static unsigned int
55094+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
55095+                  const struct v4l2_hevc_dpb_entry * const entries,
55096+                  const unsigned int num_entries)
55097+{
55098+    uint64_t timestamp;
55099+
55100+    if (!frame)
55101+        return 0;
55102+
55103+    timestamp = frame_capture_dpb(frame->frame);
55104+
55105+    for (unsigned int i = 0; i < num_entries; i++) {
55106+        if (entries[i].timestamp == timestamp)
55107+            return i;
55108+    }
55109+
55110+    return 0;
55111+}
55112+
55113+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
55114+{
55115+    unsigned int z = 0;
55116+    while (idx--) {
55117+        if (*b++ == 0) {
55118+            ++z;
55119+            if (z >= 2 && *b == 3) {
55120+                ++b;
55121+                z = 0;
55122+            }
55123+        }
55124+        else {
55125+            z = 0;
55126+        }
55127+    }
55128+    return b;
55129+}
55130+
55131+static int slice_add(V4L2MediaReqDescriptor * const rd)
55132+{
55133+    if (rd->num_slices >= rd->alloced_slices) {
55134+        struct v4l2_ctrl_hevc_slice_params * p2;
55135+        struct slice_info * s2;
55136+        size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2;
55137+
55138+        p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
55139+        if (p2 == NULL)
55140+            return AVERROR(ENOMEM);
55141+        rd->slice_params = p2;
55142+
55143+        s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
55144+        if (s2 == NULL)
55145+            return AVERROR(ENOMEM);
55146+        rd->slices = s2;
55147+
55148+        rd->alloced_slices = n2;
55149+    }
55150+    ++rd->num_slices;
55151+    return 0;
55152+}
55153+
55154+static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
55155+{
55156+    if (rd->num_offsets + n > rd->alloced_offsets) {
55157+        size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2;
55158+        void * p2;
55159+        while (rd->num_offsets + n > n2)
55160+            n2 *= 2;
55161+        if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
55162+            return AVERROR(ENOMEM);
55163+        rd->offsets = p2;
55164+        rd->alloced_offsets = n2;
55165+    }
55166+    for (size_t i = 0; i != n; ++i)
55167+        rd->offsets[rd->num_offsets++] = offsets[i] - 1;
55168+    return 0;
55169+}
55170+
55171+static unsigned int
55172+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
55173+{
55174+    unsigned int i;
55175+    unsigned int n = 0;
55176+    const HEVCFrame * const pic = h->ref;
55177+
55178+    for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
55179+        const HEVCFrame * const frame = &h->DPB[i];
55180+        if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
55181+            struct v4l2_hevc_dpb_entry * const entry = entries + n++;
55182+
55183+            entry->timestamp = frame_capture_dpb(frame->frame);
55184+#if HEVC_CTRLS_VERSION <= 2
55185+            entry->rps = find_frame_rps_type(h, entry->timestamp);
55186+#else
55187+            entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
55188+                V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
55189+#endif
55190+            entry->field_pic = frame->frame->interlaced_frame;
55191+
55192+#if HEVC_CTRLS_VERSION <= 3
55193+            /* TODO: Interleaved: Get the POC for each field. */
55194+            entry->pic_order_cnt[0] = frame->poc;
55195+            entry->pic_order_cnt[1] = frame->poc;
55196+#else
55197+            entry->pic_order_cnt_val = frame->poc;
55198+#endif
55199+        }
55200+    }
55201+    return n;
55202+}
55203+
55204+static void fill_slice_params(const HEVCContext * const h,
55205+#if HEVC_CTRLS_VERSION >= 2
55206+                              const struct v4l2_ctrl_hevc_decode_params * const dec,
55207+#endif
55208+                              struct v4l2_ctrl_hevc_slice_params *slice_params,
55209+                              uint32_t bit_size, uint32_t bit_offset)
55210+{
55211+    const SliceHeader * const sh = &h->sh;
55212+#if HEVC_CTRLS_VERSION >= 2
55213+    const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
55214+    const unsigned int dpb_n = dec->num_active_dpb_entries;
55215+#else
55216+    struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
55217+    unsigned int dpb_n;
55218+#endif
55219+    unsigned int i;
55220+    RefPicList *rpl;
55221+
55222+    *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
55223+        .bit_size = bit_size,
55224+#if HEVC_CTRLS_VERSION <= 3
55225+        .data_bit_offset = bit_offset,
55226+#else
55227+        .data_byte_offset = bit_offset / 8 + 1,
55228+#endif
55229+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
55230+        .slice_segment_addr = sh->slice_segment_addr,
55231+
55232+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
55233+        .nal_unit_type = h->nal_unit_type,
55234+        .nuh_temporal_id_plus1 = h->temporal_id + 1,
55235+
55236+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
55237+        .slice_type = sh->slice_type,
55238+        .colour_plane_id = sh->colour_plane_id,
55239+        .slice_pic_order_cnt = h->ref->poc,
55240+        .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
55241+        .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
55242+        .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
55243+        .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
55244+        .slice_qp_delta = sh->slice_qp_delta,
55245+        .slice_cb_qp_offset = sh->slice_cb_qp_offset,
55246+        .slice_cr_qp_offset = sh->slice_cr_qp_offset,
55247+        .slice_act_y_qp_offset = 0,
55248+        .slice_act_cb_qp_offset = 0,
55249+        .slice_act_cr_qp_offset = 0,
55250+        .slice_beta_offset_div2 = sh->beta_offset / 2,
55251+        .slice_tc_offset_div2 = sh->tc_offset / 2,
55252+
55253+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
55254+        .pic_struct = h->sei.picture_timing.picture_struct,
55255+
55256+#if HEVC_CTRLS_VERSION < 2
55257+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
55258+        .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
55259+        .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
55260+        .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
55261+#endif
55262+    };
55263+
55264+    if (sh->slice_sample_adaptive_offset_flag[0])
55265+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
55266+
55267+    if (sh->slice_sample_adaptive_offset_flag[1])
55268+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
55269+
55270+    if (sh->slice_temporal_mvp_enabled_flag)
55271+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
55272+
55273+    if (sh->mvd_l1_zero_flag)
55274+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
55275+
55276+    if (sh->cabac_init_flag)
55277+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
55278+
55279+    if (sh->collocated_list == L0)
55280+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
55281+
55282+    if (sh->disable_deblocking_filter_flag)
55283+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
55284+
55285+    if (sh->slice_loop_filter_across_slices_enabled_flag)
55286+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
55287+
55288+    if (sh->dependent_slice_segment_flag)
55289+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
55290+
55291+#if HEVC_CTRLS_VERSION < 2
55292+    dpb_n = fill_dpb_entries(h, dpb);
55293+    slice_params->num_active_dpb_entries = dpb_n;
55294+#endif
55295+
55296+    if (sh->slice_type != HEVC_SLICE_I) {
55297+        rpl = &h->ref->refPicList[0];
55298+        for (i = 0; i < rpl->nb_refs; i++)
55299+            slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
55300+    }
55301+
55302+    if (sh->slice_type == HEVC_SLICE_B) {
55303+        rpl = &h->ref->refPicList[1];
55304+        for (i = 0; i < rpl->nb_refs; i++)
55305+            slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
55306+    }
55307+
55308+    fill_pred_table(h, &slice_params->pred_weight_table);
55309+
55310+    slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
55311+#if HEVC_CTRLS_VERSION <= 3
55312+    if (slice_params->num_entry_point_offsets > 256) {
55313+        slice_params->num_entry_point_offsets = 256;
55314+        av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
55315+    }
55316+
55317+    for (i = 0; i < slice_params->num_entry_point_offsets; i++)
55318+        slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
55319+#endif
55320+}
55321+
55322+#if HEVC_CTRLS_VERSION >= 2
55323+static void
55324+fill_decode_params(const HEVCContext * const h,
55325+                   struct v4l2_ctrl_hevc_decode_params * const dec)
55326+{
55327+    unsigned int i;
55328+
55329+    *dec = (struct v4l2_ctrl_hevc_decode_params){
55330+        .pic_order_cnt_val = h->poc,
55331+        .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
55332+        .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
55333+        .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
55334+    };
55335+
55336+    dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
55337+
55338+    // The documentation does seem to ask that we fit our 32-bit signed POC into
55339+    // a U8 so... (To be fair 16 bits would be enough)
55340+    // Luckily we (Pi) don't use these fields
55341+    for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
55342+        dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
55343+    for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
55344+        dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
55345+    for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
55346+        dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
55347+
55348+    if (IS_IRAP(h))
55349+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
55350+    if (IS_IDR(h))
55351+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
55352+    if (h->sh.no_output_of_prior_pics_flag)
55353+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
55354+
55355+}
55356+#endif
55357+
55358+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
55359+{
55360+    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
55361+    *ctrl = (struct v4l2_ctrl_hevc_sps) {
55362+        .chroma_format_idc = sps->chroma_format_idc,
55363+        .pic_width_in_luma_samples = sps->width,
55364+        .pic_height_in_luma_samples = sps->height,
55365+        .bit_depth_luma_minus8 = sps->bit_depth - 8,
55366+        .bit_depth_chroma_minus8 = sps->bit_depth - 8,
55367+        .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
55368+        .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
55369+        .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
55370+        .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
55371+        .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
55372+        .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
55373+        .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
55374+        .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
55375+        .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
55376+        .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
55377+        .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
55378+        .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
55379+        .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
55380+        .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
55381+        .num_short_term_ref_pic_sets = sps->nb_st_rps,
55382+        .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
55383+        .chroma_format_idc = sps->chroma_format_idc,
55384+        .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
55385+    };
55386+
55387+    if (sps->separate_colour_plane_flag)
55388+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
55389+
55390+    if (sps->scaling_list_enable_flag)
55391+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
55392+
55393+    if (sps->amp_enabled_flag)
55394+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
55395+
55396+    if (sps->sao_enabled)
55397+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
55398+
55399+    if (sps->pcm_enabled_flag)
55400+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
55401+
55402+    if (sps->pcm.loop_filter_disable_flag)
55403+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
55404+
55405+    if (sps->long_term_ref_pics_present_flag)
55406+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
55407+
55408+    if (sps->sps_temporal_mvp_enabled_flag)
55409+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
55410+
55411+    if (sps->sps_strong_intra_smoothing_enable_flag)
55412+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
55413+}
55414+
55415+static void fill_scaling_matrix(const ScalingList * const sl,
55416+                                struct v4l2_ctrl_hevc_scaling_matrix * const sm)
55417+{
55418+    unsigned int i;
55419+
55420+    for (i = 0; i < 6; i++) {
55421+        unsigned int j;
55422+
55423+        for (j = 0; j < 16; j++)
55424+            sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
55425+        for (j = 0; j < 64; j++) {
55426+            sm->scaling_list_8x8[i][j]   = sl->sl[1][i][j];
55427+            sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
55428+            if (i < 2)
55429+                sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
55430+        }
55431+        sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
55432+        if (i < 2)
55433+            sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
55434+    }
55435+}
55436+
55437+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
55438+{
55439+    uint64_t flags = 0;
55440+
55441+    if (pps->dependent_slice_segments_enabled_flag)
55442+        flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
55443+
55444+    if (pps->output_flag_present_flag)
55445+        flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
55446+
55447+    if (pps->sign_data_hiding_flag)
55448+        flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
55449+
55450+    if (pps->cabac_init_present_flag)
55451+        flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
55452+
55453+    if (pps->constrained_intra_pred_flag)
55454+        flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
55455+
55456+    if (pps->transform_skip_enabled_flag)
55457+        flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
55458+
55459+    if (pps->cu_qp_delta_enabled_flag)
55460+        flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
55461+
55462+    if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
55463+        flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
55464+
55465+    if (pps->weighted_pred_flag)
55466+        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
55467+
55468+    if (pps->weighted_bipred_flag)
55469+        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
55470+
55471+    if (pps->transquant_bypass_enable_flag)
55472+        flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
55473+
55474+    if (pps->tiles_enabled_flag)
55475+        flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
55476+
55477+    if (pps->entropy_coding_sync_enabled_flag)
55478+        flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
55479+
55480+    if (pps->loop_filter_across_tiles_enabled_flag)
55481+        flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
55482+
55483+    if (pps->seq_loop_filter_across_slices_enabled_flag)
55484+        flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
55485+
55486+    if (pps->deblocking_filter_override_enabled_flag)
55487+        flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
55488+
55489+    if (pps->disable_dbf)
55490+        flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
55491+
55492+    if (pps->lists_modification_present_flag)
55493+        flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
55494+
55495+    if (pps->slice_header_extension_present_flag)
55496+        flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
55497+
55498+    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
55499+    *ctrl = (struct v4l2_ctrl_hevc_pps) {
55500+        .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
55501+        .init_qp_minus26 = pps->pic_init_qp_minus26,
55502+        .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
55503+        .pps_cb_qp_offset = pps->cb_qp_offset,
55504+        .pps_cr_qp_offset = pps->cr_qp_offset,
55505+        .pps_beta_offset_div2 = pps->beta_offset / 2,
55506+        .pps_tc_offset_div2 = pps->tc_offset / 2,
55507+        .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
55508+        .flags = flags
55509+    };
55510+
55511+
55512+    if (pps->tiles_enabled_flag) {
55513+        ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
55514+        ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
55515+
55516+        for (int i = 0; i < pps->num_tile_columns; i++)
55517+            ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
55518+
55519+        for (int i = 0; i < pps->num_tile_rows; i++)
55520+            ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
55521+    }
55522+}
55523+
55524+// Called before finally returning the frame to the user
55525+// Set corrupt flag here as this is actually the frame structure that
55526+// is going to the user (in MT land each thread has its own pool)
55527+static int frame_post_process(void *logctx, AVFrame *frame)
55528+{
55529+    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0];
55530+
55531+//    av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
55532+    frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
55533+    if (rd->qe_dst) {
55534+        MediaBufsStatus stat = qent_dst_wait(rd->qe_dst);
55535+        if (stat != MEDIABUFS_STATUS_SUCCESS) {
55536+            av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
55537+            frame->flags |= AV_FRAME_FLAG_CORRUPT;
55538+        }
55539+    }
55540+
55541+    return 0;
55542+}
55543+
55544+static inline struct timeval cvt_dpb_to_tv(uint64_t t)
55545+{
55546+    t /= 1000;
55547+    return (struct timeval){
55548+        .tv_usec = t % 1000000,
55549+        .tv_sec = t / 1000000
55550+    };
55551+}
55552+
55553+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
55554+{
55555+    return (uint64_t)t * 1000;
55556+}
55557+
55558+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
55559+                                         av_unused const uint8_t *buffer,
55560+                                         av_unused uint32_t size)
55561+{
55562+    const HEVCContext *h = avctx->priv_data;
55563+    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
55564+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
55565+
55566+//    av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
55567+    decode_q_add(&ctx->decode_q, &rd->decode_ent);
55568+
55569+    rd->num_slices = 0;
55570+    ctx->timestamp++;
55571+    rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
55572+
55573+    {
55574+        FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
55575+        fdd->post_process = frame_post_process;
55576+    }
55577+
55578+    // qe_dst needs to be bound to the data buffer and only returned when that is released (NOTE: original comment was truncated — confirm intended lifetime)
55579+    if (!rd->qe_dst)
55580+    {
55581+        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
55582+            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
55583+            return AVERROR(ENOMEM);
55584+        }
55585+    }
55586+
55587+    ff_thread_finish_setup(avctx); // Allow the next thread to enter v4l2_request_hevc_start_frame
55588+
55589+    return 0;
55590+}
55591+
55592+// Object fd & size will be zapped by this & need setting later
55593+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
55594+{
55595+    AVDRMLayerDescriptor *layer = &desc->layers[0];
55596+    unsigned int width;
55597+    unsigned int height;
55598+    unsigned int bpl;
55599+    uint32_t pixelformat;
55600+
55601+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
55602+        width       = format->fmt.pix_mp.width;
55603+        height      = format->fmt.pix_mp.height;
55604+        pixelformat = format->fmt.pix_mp.pixelformat;
55605+        bpl         = format->fmt.pix_mp.plane_fmt[0].bytesperline;
55606+    }
55607+    else {
55608+        width       = format->fmt.pix.width;
55609+        height      = format->fmt.pix.height;
55610+        pixelformat = format->fmt.pix.pixelformat;
55611+        bpl         = format->fmt.pix.bytesperline;
55612+    }
55613+
55614+    switch (pixelformat) {
55615+    case V4L2_PIX_FMT_NV12:
55616+        layer->format = DRM_FORMAT_NV12;
55617+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
55618+        break;
55619+#if CONFIG_SAND
55620+    case V4L2_PIX_FMT_NV12_COL128:
55621+        layer->format = DRM_FORMAT_NV12;
55622+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
55623+        break;
55624+    case V4L2_PIX_FMT_NV12_10_COL128:
55625+        layer->format = DRM_FORMAT_P030;
55626+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
55627+        break;
55628+#endif
55629+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
55630+    case V4L2_PIX_FMT_SUNXI_TILED_NV12:
55631+        layer->format = DRM_FORMAT_NV12;
55632+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
55633+        break;
55634+#endif
55635+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
55636+    case V4L2_PIX_FMT_NV15:
55637+        layer->format = DRM_FORMAT_NV15;
55638+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
55639+        break;
55640+#endif
55641+    case V4L2_PIX_FMT_NV16:
55642+        layer->format = DRM_FORMAT_NV16;
55643+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
55644+        break;
55645+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
55646+    case V4L2_PIX_FMT_NV20:
55647+        layer->format = DRM_FORMAT_NV20;
55648+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
55649+        break;
55650+#endif
55651+    default:
55652+        return -1;
55653+    }
55654+
55655+    desc->nb_objects = 1;
55656+    desc->objects[0].fd = -1;
55657+    desc->objects[0].size = 0;
55658+
55659+    desc->nb_layers = 1;
55660+    layer->nb_planes = 2;
55661+
55662+    layer->planes[0].object_index = 0;
55663+    layer->planes[0].offset = 0;
55664+    layer->planes[0].pitch = bpl;
55665+#if CONFIG_SAND
55666+    if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
55667+        layer->planes[1].object_index = 0;
55668+        layer->planes[1].offset = height * 128;
55669+        layer->planes[0].pitch = width;
55670+        layer->planes[1].pitch = width;
55671+    }
55672+    else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
55673+        layer->planes[1].object_index = 0;
55674+        layer->planes[1].offset = height * 128;
55675+        layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
55676+        layer->planes[1].pitch = width * 2;
55677+    }
55678+    else
55679+#endif
55680+    {
55681+        layer->planes[1].object_index = 0;
55682+        layer->planes[1].offset = layer->planes[0].pitch * height;
55683+        layer->planes[1].pitch = layer->planes[0].pitch;
55684+    }
55685+
55686+    return 0;
55687+}
55688+
55689+static int
55690+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
55691+    struct req_controls *const controls,
55692+#if HEVC_CTRLS_VERSION >= 2
55693+    struct v4l2_ctrl_hevc_decode_params * const dec,
55694+#endif
55695+    struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
55696+    void * const offsets, const size_t offset_count)
55697+{
55698+    int rv;
55699+#if HEVC_CTRLS_VERSION >= 2
55700+    unsigned int n = 3;
55701+#else
55702+    unsigned int n = 2;
55703+#endif
55704+
55705+    struct v4l2_ext_control control[6] = {
55706+        {
55707+            .id = V4L2_CID_STATELESS_HEVC_SPS,
55708+            .ptr = &controls->sps,
55709+            .size = sizeof(controls->sps),
55710+        },
55711+        {
55712+            .id = V4L2_CID_STATELESS_HEVC_PPS,
55713+            .ptr = &controls->pps,
55714+            .size = sizeof(controls->pps),
55715+        },
55716+#if HEVC_CTRLS_VERSION >= 2
55717+        {
55718+            .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
55719+            .ptr = dec,
55720+            .size = sizeof(*dec),
55721+        },
55722+#endif
55723+    };
55724+
55725+    if (slices)
55726+        control[n++] = (struct v4l2_ext_control) {
55727+            .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
55728+            .ptr = slices,
55729+            .size = sizeof(*slices) * slice_count,
55730+        };
55731+
55732+    if (controls->has_scaling)
55733+        control[n++] = (struct v4l2_ext_control) {
55734+            .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
55735+            .ptr = &controls->scaling_matrix,
55736+            .size = sizeof(controls->scaling_matrix),
55737+        };
55738+
55739+#if HEVC_CTRLS_VERSION >= 4
55740+    if (offsets)
55741+        control[n++] = (struct v4l2_ext_control) {
55742+            .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
55743+            .ptr = offsets,
55744+            .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
55745+        };
55746+#endif
55747+
55748+    rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
55749+
55750+    return rv;
55751+}
55752+
55753+// This only works because we started out from a single coded frame buffer
55754+// that will remain intact until after end_frame
55755+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
55756+{
55757+    const HEVCContext * const h = avctx->priv_data;
55758+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
55759+    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
55760+    int bcount = get_bits_count(&h->HEVClc->gb);
55761+    uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
55762+
55763+    const unsigned int n = rd->num_slices;
55764+    const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices;
55765+
55766+    int rv;
55767+    struct slice_info * si;
55768+
55769+    // This looks dodgy but we know that FFmpeg has parsed this from a buffer
55770+    // that contains the entire frame including the start code
55771+    if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
55772+        buffer -= 3;
55773+        size += 3;
55774+        boff += 24;
55775+        if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
55776+            av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
55777+                   buffer[0], buffer[1], buffer[2]);
55778+        }
55779+    }
55780+
55781+    if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
55782+        if (rd->slices == NULL) {
55783+            if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
55784+                return AVERROR(ENOMEM);
55785+            rd->slices->ptr = buffer;
55786+            rd->num_slices = 1;
55787+        }
55788+        rd->slices->len = buffer - rd->slices->ptr + size;
55789+        return 0;
55790+    }
55791+
55792+    if ((rv = slice_add(rd)) != 0)
55793+        return rv;
55794+
55795+    si = rd->slices + n;
55796+    si->ptr = buffer;
55797+    si->len = size;
55798+    si->n_offsets = rd->num_offsets;
55799+
55800+    if (n != block_start) {
55801+        struct slice_info *const si0 = rd->slices + block_start;
55802+        const size_t offset = (buffer - si0->ptr);
55803+        boff += offset * 8;
55804+        size += offset;
55805+        si0->len = si->len + offset;
55806+    }
55807+
55808+#if HEVC_CTRLS_VERSION >= 2
55809+    if (n == 0)
55810+        fill_decode_params(h, &rd->dec);
55811+    fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
55812+#else
55813+    fill_slice_params(h, rd->slice_params + n, size * 8, boff);
55814+#endif
55815+    if (ctx->max_offsets != 0 &&
55816+        (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
55817+        return rv;
55818+
55819+    return 0;
55820+}
55821+
55822+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
55823+{
55824+    const HEVCContext * const h = avctx->priv_data;
55825+    if (h->ref != NULL) {
55826+        V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
55827+        V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
55828+
55829+        media_request_abort(&rd->req);
55830+        mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
55831+
55832+        decode_q_remove(&ctx->decode_q, &rd->decode_ent);
55833+    }
55834+}
55835+
55836+static int send_slice(AVCodecContext * const avctx,
55837+                      V4L2MediaReqDescriptor * const rd,
55838+                      struct req_controls *const controls,
55839+                      const unsigned int i, const unsigned int j)
55840+{
55841+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
55842+
55843+    const int is_last = (j == rd->num_slices);
55844+    struct slice_info *const si = rd->slices + i;
55845+    struct media_request * req = NULL;
55846+    struct qent_src * src = NULL;
55847+    MediaBufsStatus stat;
55848+    void * offsets = rd->offsets + rd->slices[i].n_offsets;
55849+    size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets;
55850+
55851+    if ((req = media_request_get(ctx->mpool)) == NULL) {
55852+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
55853+        return AVERROR(ENOMEM);
55854+    }
55855+
55856+    if (set_req_ctls(ctx, req,
55857+                     controls,
55858+#if HEVC_CTRLS_VERSION >= 2
55859+                     &rd->dec,
55860+#endif
55861+                     rd->slice_params + i, j - i,
55862+                     offsets, n_offsets)) {
55863+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
55864+        goto fail1;
55865+    }
55866+
55867+    if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
55868+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
55869+        goto fail1;
55870+    }
55871+
55872+    if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
55873+        av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
55874+        goto fail2;
55875+    }
55876+
55877+    if (qent_src_params_set(src, &controls->tv)) {
55878+        av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
55879+        goto fail2;
55880+    }
55881+
55882+    stat = mediabufs_start_request(ctx->mbufs, &req, &src,
55883+                                   i == 0 ? rd->qe_dst : NULL,
55884+                                   is_last);
55885+
55886+    if (stat != MEDIABUFS_STATUS_SUCCESS) {
55887+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
55888+        return AVERROR_UNKNOWN;
55889+    }
55890+    return 0;
55891+
55892+fail2:
55893+    mediabufs_src_qent_abort(ctx->mbufs, &src);
55894+fail1:
55895+    media_request_abort(&req);
55896+    return AVERROR_UNKNOWN;
55897+}
55898+
55899+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
55900+{
55901+    const HEVCContext * const h = avctx->priv_data;
55902+    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
55903+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
55904+    struct req_controls rc;
55905+    unsigned int i;
55906+    int rv;
55907+
55908+    // It is possible, though maybe a bug, to get an end_frame without
55909+    // a previous start_frame.  If we do then give up.
55910+    if (!decode_q_in_q(&rd->decode_ent)) {
55911+        av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
55912+        return AVERROR_INVALIDDATA;
55913+    }
55914+
55915+    {
55916+        const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
55917+                                    &h->ps.pps->scaling_list :
55918+                                h->ps.sps->scaling_list_enable_flag ?
55919+                                    &h->ps.sps->scaling_list : NULL;
55920+
55921+
55922+        memset(&rc, 0, sizeof(rc));
55923+        rc.tv = cvt_dpb_to_tv(rd->timestamp);
55924+        fill_sps(&rc.sps, h->ps.sps);
55925+        fill_pps(&rc.pps, h->ps.pps);
55926+        if (sl) {
55927+            rc.has_scaling = 1;
55928+            fill_scaling_matrix(sl, &rc.scaling_matrix);
55929+        }
55930+    }
55931+
55932+    decode_q_wait(&ctx->decode_q, &rd->decode_ent);
55933+
55934+    // qe_dst needs to be bound to the data buffer and only returned when that is
55935+    // Alloc almost certainly wants to be serialised if there is any chance of blocking
55936+    // so we get the next frame to be free in the thread that needs it for decode first.
55937+    //
55938+    // In our current world this probably isn't a concern but put it here anyway
55939+    if (!rd->qe_dst)
55940+    {
55941+        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
55942+            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
55943+            rv = AVERROR(ENOMEM);
55944+            goto fail;
55945+        }
55946+    }
55947+
55948+    // Send as slices
55949+    for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
55950+        const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
55951+        if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
55952+            goto fail;
55953+    }
55954+
55955+    // Set the drm_prime descriptor
55956+    drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
55957+    rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
55958+    rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
55959+
55960+    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
55961+    return 0;
55962+
55963+fail:
55964+    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
55965+    return rv;
55966+}
55967+
55968+static inline int
55969+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
55970+{
55971+    return v >= c->minimum && v <= c->maximum;
55972+}
55973+
55974+// Initial check & init
55975+static int
55976+probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
55977+{
55978+    const HEVCContext *h = avctx->priv_data;
55979+    const HEVCSPS * const sps = h->ps.sps;
55980+    struct v4l2_ctrl_hevc_sps ctrl_sps;
55981+    unsigned int i;
55982+
55983+    // Check for var slice array
55984+    struct v4l2_query_ext_ctrl qc[] = {
55985+        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
55986+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
55987+        { .id = V4L2_CID_STATELESS_HEVC_SPS },
55988+        { .id = V4L2_CID_STATELESS_HEVC_PPS },
55989+        { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
55990+#if HEVC_CTRLS_VERSION >= 2
55991+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
55992+#endif
55993+    };
55994+    // Order & size must match!
55995+    static const size_t ctrl_sizes[] = {
55996+        sizeof(struct v4l2_ctrl_hevc_slice_params),
55997+        sizeof(int32_t),
55998+        sizeof(struct v4l2_ctrl_hevc_sps),
55999+        sizeof(struct v4l2_ctrl_hevc_pps),
56000+        sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
56001+#if HEVC_CTRLS_VERSION >= 2
56002+        sizeof(struct v4l2_ctrl_hevc_decode_params),
56003+#endif
56004+    };
56005+    const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
56006+
56007+#if HEVC_CTRLS_VERSION == 2
56008+    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
56009+        return AVERROR(EINVAL);
56010+#elif HEVC_CTRLS_VERSION == 3
56011+    if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
56012+        return AVERROR(EINVAL);
56013+#endif
56014+
56015+    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
56016+    i = 0;
56017+#if HEVC_CTRLS_VERSION >= 4
56018+    // Skip slice check if no slice mode
56019+    if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
56020+        i = 1;
56021+#else
56022+    // Fail frame mode silently for anything prior to V4
56023+    if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
56024+        return AVERROR(EINVAL);
56025+#endif
56026+    for (; i != noof_ctrls; ++i) {
56027+        if (qc[i].type == 0) {
56028+            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
56029+            return AVERROR(EINVAL);
56030+        }
56031+        if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
56032+            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
56033+                   HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
56034+            return AVERROR(EINVAL);
56035+        }
56036+    }
56037+
56038+    fill_sps(&ctrl_sps, sps);
56039+
56040+    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
56041+        av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
56042+        return AVERROR(EINVAL);
56043+    }
56044+
56045+    return 0;
56046+}
56047+
56048+// Final init
56049+static int
56050+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
56051+{
56052+    int ret;
56053+
56054+    struct v4l2_query_ext_ctrl querys[] = {
56055+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
56056+        { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
56057+        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
56058+#if HEVC_CTRLS_VERSION >= 4
56059+        { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
56060+#endif
56061+    };
56062+
56063+    struct v4l2_ext_control ctrls[] = {
56064+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
56065+        { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
56066+    };
56067+
56068+    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
56069+
56070+    ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
56071+                       querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
56072+        1 : querys[2].dims[0];
56073+    av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
56074+
56075+#if HEVC_CTRLS_VERSION >= 4
56076+    ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
56077+        0 : querys[3].dims[0];
56078+    av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
56079+#else
56080+    ctx->max_offsets = 0;
56081+#endif
56082+
56083+    if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED ||
56084+        querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
56085+        ctx->decode_mode = querys[0].default_value;
56086+    else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
56087+        ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
56088+    else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
56089+        ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
56090+    else {
56091+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
56092+        return AVERROR(EINVAL);
56093+    }
56094+
56095+    if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE ||
56096+        querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
56097+        ctx->start_code = querys[1].default_value;
56098+    else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
56099+        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
56100+    else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
56101+        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
56102+    else {
56103+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
56104+        return AVERROR(EINVAL);
56105+    }
56106+
56107+    // If we are in slice mode & START_CODE_NONE supported then pick that
56108+    // as it doesn't require the slightly dodgy look backwards in our raw buffer
56109+    if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
56110+        ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
56111+        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
56112+
56113+    ctrls[0].value = ctx->decode_mode;
56114+    ctrls[1].value = ctx->start_code;
56115+
56116+    ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
56117+    return !ret ? 0 : AVERROR(-ret);
56118+}
56119+
56120+static void v4l2_req_frame_free(void *opaque, uint8_t *data)
56121+{
56122+    AVCodecContext *avctx = opaque;
56123+    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
56124+
56125+    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
56126+
56127+    qent_dst_unref(&rd->qe_dst);
56128+
56129+    // We don't expect req or qe_src to be set
56130+    if (rd->req || rd->qe_src)
56131+        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src);
56132+
56133+    av_freep(&rd->slices);
56134+    av_freep(&rd->slice_params);
56135+    av_freep(&rd->offsets);
56136+
56137+    av_free(rd);
56138+}
56139+
56140+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
56141+{
56142+    AVCodecContext *avctx = opaque;
56143+//    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
56144+//    V4L2MediaReqDescriptor *req;
56145+    AVBufferRef *ref;
56146+    uint8_t *data;
56147+//    int ret;
56148+
56149+    data = av_mallocz(size);
56150+    if (!data)
56151+        return NULL;
56152+
56153+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
56154+    ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
56155+    if (!ref) {
56156+        av_freep(&data);
56157+        return NULL;
56158+    }
56159+    return ref;
56160+}
56161+
56162+#if 0
56163+static void v4l2_req_pool_free(void *opaque)
56164+{
56165+    av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
56166+}
56167+
56168+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
56169+{
56170+    av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
56171+
56172+    av_buffer_pool_uninit(&hwfc->pool);
56173+}
56174+#endif
56175+
56176+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
56177+{
56178+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
56179+    AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
56180+    const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
56181+
56182+    hwfc->format = AV_PIX_FMT_DRM_PRIME;
56183+    hwfc->sw_format = pixel_format_from_format(vfmt);
56184+    if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
56185+        hwfc->width = vfmt->fmt.pix_mp.width;
56186+        hwfc->height = vfmt->fmt.pix_mp.height;
56187+    } else {
56188+        hwfc->width = vfmt->fmt.pix.width;
56189+        hwfc->height = vfmt->fmt.pix.height;
56190+    }
56191+#if 0
56192+    hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
56193+    if (!hwfc->pool)
56194+        return AVERROR(ENOMEM);
56195+
56196+    hwfc->free = v4l2_req_hwframe_ctx_free;
56197+
56198+    hwfc->initial_pool_size = 1;
56199+
56200+    switch (avctx->codec_id) {
56201+    case AV_CODEC_ID_VP9:
56202+        hwfc->initial_pool_size += 8;
56203+        break;
56204+    case AV_CODEC_ID_VP8:
56205+        hwfc->initial_pool_size += 3;
56206+        break;
56207+    default:
56208+        hwfc->initial_pool_size += 2;
56209+    }
56210+#endif
56211+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
56212+
56213+    return 0;
56214+}
56215+
56216+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
56217+{
56218+    int rv;
56219+
56220+    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
56221+    if (!frame->buf[0])
56222+        return AVERROR(ENOMEM);
56223+
56224+    frame->data[0] = frame->buf[0]->data;
56225+
56226+    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
56227+
56228+    if ((rv = ff_attach_decode_data(frame)) != 0) {
56229+        av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
56230+        av_frame_unref(frame);
56231+        return rv;
56232+    }
56233+
56234+    return 0;
56235+}
56236+
56237+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {
56238+    .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
56239+    .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
56240+    .probe = probe,
56241+    .set_controls = set_controls,
56242+
56243+    .start_frame    = v4l2_request_hevc_start_frame,
56244+    .decode_slice   = v4l2_request_hevc_decode_slice,
56245+    .end_frame      = v4l2_request_hevc_end_frame,
56246+    .abort_frame    = v4l2_request_hevc_abort_frame,
56247+    .frame_params   = frame_params,
56248+    .alloc_frame    = alloc_frame,
56249+};
56250+
56251--- /dev/null
56252+++ b/libavcodec/v4l2_req_media.c
56253@@ -0,0 +1,1601 @@
56254+/*
56255+ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
56256+ *
56257+ * Permission is hereby granted, free of charge, to any person obtaining a
56258+ * copy of this software and associated documentation files (the
56259+ * "Software"), to deal in the Software without restriction, including
56260+ * without limitation the rights to use, copy, modify, merge, publish,
56261+ * distribute, sub license, and/or sell copies of the Software, and to
56262+ * permit persons to whom the Software is furnished to do so, subject to
56263+ * the following conditions:
56264+ *
56265+ * The above copyright notice and this permission notice (including the
56266+ * next paragraph) shall be included in all copies or substantial portions
56267+ * of the Software.
56268+ *
56269+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
56270+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
56271+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
56272+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
56273+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
56274+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
56275+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
56276+ */
56277+
56278+#include <errno.h>
56279+#include <fcntl.h>
56280+#include <poll.h>
56281+#include <pthread.h>
56282+#include <semaphore.h>
56283+#include <stdatomic.h>
56284+#include <stdbool.h>
56285+#include <stdlib.h>
56286+#include <string.h>
56287+#include <unistd.h>
56288+#include <linux/media.h>
56289+#include <sys/ioctl.h>
56290+#include <sys/select.h>
56291+#include <sys/ioctl.h>
56292+
56293+#include <linux/videodev2.h>
56294+
56295+#include "v4l2_req_dmabufs.h"
56296+#include "v4l2_req_media.h"
56297+#include "v4l2_req_pollqueue.h"
56298+#include "v4l2_req_utils.h"
56299+#include "weak_link.h"
56300+
56301+
56302+/* floor(log2(x)) */
56303+static unsigned int log2_size(size_t x)
56304+{
56305+    unsigned int n = 0;
56306+
56307+    if (x & ~0xffff) {
56308+        n += 16;
56309+        x >>= 16;
56310+    }
56311+    if (x & ~0xff) {
56312+        n += 8;
56313+        x >>= 8;
56314+    }
56315+    if (x & ~0xf) {
56316+        n += 4;
56317+        x >>= 4;
56318+    }
56319+    if (x & ~3) {
56320+        n += 2;
56321+        x >>= 2;
56322+    }
56323+    return (x & ~1) ? n + 1 : n;
56324+}
56325+
56326+static size_t round_up_size(const size_t x)
56327+{
56328+    /* Admit no size < 256 */
56329+    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
56330+
56331+    return x >= (3 << n) ? 4 << n : (3 << n);
56332+}
56333+
56334+struct media_request;
56335+
56336+struct media_pool {
56337+    int fd;
56338+    sem_t sem;
56339+    pthread_mutex_t lock;
56340+    struct media_request * free_reqs;
56341+    struct pollqueue * pq;
56342+};
56343+
56344+struct media_request {
56345+    struct media_request * next;
56346+    struct media_pool * mp;
56347+    int fd;
56348+    struct polltask * pt;
56349+};
56350+
56351+
56352+static inline int do_trywait(sem_t *const sem)
56353+{
56354+    while (sem_trywait(sem)) {
56355+        if (errno != EINTR)
56356+            return -errno;
56357+    }
56358+    return 0;
56359+}
56360+
56361+static inline int do_wait(sem_t *const sem)
56362+{
56363+    while (sem_wait(sem)) {
56364+        if (errno != EINTR)
56365+            return -errno;
56366+    }
56367+    return 0;
56368+}
56369+
56370+static int request_buffers(int video_fd, unsigned int type,
56371+                           enum v4l2_memory memory, unsigned int buffers_count)
56372+{
56373+    struct v4l2_requestbuffers buffers;
56374+    int rc;
56375+
56376+    memset(&buffers, 0, sizeof(buffers));
56377+    buffers.type = type;
56378+    buffers.memory = memory;
56379+    buffers.count = buffers_count;
56380+
56381+    rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
56382+    if (rc < 0) {
56383+        rc = -errno;
56384+        request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc));
56385+        return rc;
56386+    }
56387+
56388+    return 0;
56389+}
56390+
56391+
56392+static int set_stream(int video_fd, unsigned int type, bool enable)
56393+{
56394+    enum v4l2_buf_type buf_type = type;
56395+    int rc;
56396+
56397+    rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF,
56398+           &buf_type);
56399+    if (rc < 0) {
56400+        rc = -errno;
56401+        request_log("Unable to %sable stream: %s\n",
56402+                enable ? "en" : "dis", strerror(-rc));
56403+        return rc;
56404+    }
56405+
56406+    return 0;
56407+}
56408+
56409+
56410+
56411+struct media_request * media_request_get(struct media_pool * const mp)
56412+{
56413+    struct media_request *req = NULL;
56414+
56415+    /* Timeout handled by poll code */
56416+    if (do_wait(&mp->sem))
56417+        return NULL;
56418+
56419+    pthread_mutex_lock(&mp->lock);
56420+    req = mp->free_reqs;
56421+    if (req) {
56422+        mp->free_reqs = req->next;
56423+        req->next = NULL;
56424+    }
56425+    pthread_mutex_unlock(&mp->lock);
56426+    return req;
56427+}
56428+
56429+int media_request_fd(const struct media_request * const req)
56430+{
56431+    return req->fd;
56432+}
56433+
56434+int media_request_start(struct media_request * const req)
56435+{
56436+    while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1)
56437+    {
56438+        const int err = errno;
56439+        if (err == EINTR)
56440+            continue;
56441+        request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
56442+        return -err;
56443+    }
56444+
56445+    pollqueue_add_task(req->pt, 2000);
56446+    return 0;
56447+}
56448+
56449+static void media_request_done(void *v, short revents)
56450+{
56451+    struct media_request *const req = v;
56452+    struct media_pool *const mp = req->mp;
56453+
56454+    /* ** Not sure what to do about timeout */
56455+
56456+    if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
56457+        request_log("Unable to reinit media request: %s\n",
56458+                strerror(errno));
56459+
56460+    pthread_mutex_lock(&mp->lock);
56461+    req->next = mp->free_reqs;
56462+    mp->free_reqs = req;
56463+    pthread_mutex_unlock(&mp->lock);
56464+    sem_post(&mp->sem);
56465+}
56466+
56467+int media_request_abort(struct media_request ** const preq)
56468+{
56469+    struct media_request * const req = *preq;
56470+
56471+    if (req == NULL)
56472+        return 0;
56473+    *preq = NULL;
56474+
56475+    media_request_done(req, 0);
56476+    return 0;
56477+}
56478+
56479+static void delete_req_chain(struct media_request * const chain)
56480+{
56481+    struct media_request * next = chain;
56482+    while (next) {
56483+        struct media_request * const req = next;
56484+        next = req->next;
56485+        if (req->pt)
56486+            polltask_delete(&req->pt);
56487+        if (req->fd != -1)
56488+            close(req->fd);
56489+        free(req);
56490+    }
56491+}
56492+
56493+struct media_pool * media_pool_new(const char * const media_path,
56494+                   struct pollqueue * const pq,
56495+                   const unsigned int n)
56496+{
56497+    struct media_pool * const mp = calloc(1, sizeof(*mp));
56498+    unsigned int i;
56499+
56500+    if (!mp)
56501+        goto fail0;
56502+
56503+    mp->pq = pq;
56504+    pthread_mutex_init(&mp->lock, NULL);
56505+    mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
56506+    if (mp->fd == -1) {
56507+        request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
56508+        goto fail1;
56509+    }
56510+
56511+    for (i = 0; i != n; ++i) {
56512+        struct media_request * req = malloc(sizeof(*req));
56513+        if (!req)
56514+            goto fail4;
56515+
56516+        *req = (struct media_request){
56517+            .next = mp->free_reqs,
56518+            .mp = mp,
56519+            .fd = -1
56520+        };
56521+        mp->free_reqs = req;
56522+
56523+        if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
56524+            request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
56525+            goto fail4;
56526+        }
56527+
56528+        req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
56529+        if (!req->pt)
56530+            goto fail4;
56531+    }
56532+
56533+    sem_init(&mp->sem, 0, n);
56534+
56535+    return mp;
56536+
56537+fail4:
56538+    delete_req_chain(mp->free_reqs);
56539+    close(mp->fd);
56540+    pthread_mutex_destroy(&mp->lock);
56541+fail1:
56542+    free(mp);
56543+fail0:
56544+    return NULL;
56545+}
56546+
56547+void media_pool_delete(struct media_pool ** pMp)
56548+{
56549+    struct media_pool * const mp = *pMp;
56550+
56551+    if (!mp)
56552+        return;
56553+    *pMp = NULL;
56554+
56555+    delete_req_chain(mp->free_reqs);
56556+    close(mp->fd);
56557+    sem_destroy(&mp->sem);
56558+    pthread_mutex_destroy(&mp->lock);
56559+    free(mp);
56560+}
56561+
56562+
56563+#define INDEX_UNSET (~(uint32_t)0)
56564+
56565+enum qent_status {
56566+    QENT_NEW = 0,       // Initial state - shouldn't last
56567+    QENT_FREE,          // On free chain
56568+    QENT_PENDING,       // User has ent
56569+    QENT_WAITING,       // On inuse
56570+    QENT_DONE,          // Frame rx
56571+    QENT_ERROR,         // Error
56572+    QENT_IMPORT
56573+};
56574+
56575+struct qent_base {
56576+    atomic_int ref_count;
56577+    struct qent_base *next;
56578+    struct qent_base *prev;
56579+    enum qent_status status;
56580+    uint32_t index;
56581+    struct dmabuf_h *dh[VIDEO_MAX_PLANES];
56582+    struct timeval timestamp;
56583+};
56584+
56585+struct qent_src {
56586+    struct qent_base base;
56587+    int fixed_size;
56588+};
56589+
56590+struct qent_dst {
56591+    struct qent_base base;
56592+    bool waiting;
56593+    pthread_mutex_t lock;
56594+    pthread_cond_t cond;
56595+    struct ff_weak_link_client * mbc_wl;
56596+};
56597+
56598+struct qe_list_head {
56599+    struct qent_base *head;
56600+    struct qent_base *tail;
56601+};
56602+
56603+struct buf_pool {
56604+    pthread_mutex_t lock;
56605+    sem_t free_sem;
56606+    enum v4l2_buf_type buf_type;
56607+    struct qe_list_head free;
56608+    struct qe_list_head inuse;
56609+};
56610+
56611+
56612+static inline struct qent_dst *base_to_dst(struct qent_base *be)
56613+{
56614+    return (struct qent_dst *)be;
56615+}
56616+
56617+static inline struct qent_src *base_to_src(struct qent_base *be)
56618+{
56619+    return (struct qent_src *)be;
56620+}
56621+
56622+
56623+#define QENT_BASE_INITIALIZER {\
56624+    .ref_count = ATOMIC_VAR_INIT(0),\
56625+    .status = QENT_NEW,\
56626+    .index  = INDEX_UNSET\
56627+}
56628+
56629+static void qe_base_uninit(struct qent_base *const be)
56630+{
56631+    unsigned int i;
56632+    for (i = 0; i != VIDEO_MAX_PLANES; ++i) {
56633+        dmabuf_free(be->dh[i]);
56634+        be->dh[i] = NULL;
56635+    }
56636+}
56637+
56638+static void qe_src_free(struct qent_src *const be_src)
56639+{
56640+    if (!be_src)
56641+        return;
56642+    qe_base_uninit(&be_src->base);
56643+    free(be_src);
56644+}
56645+
56646+static struct qent_src * qe_src_new(void)
56647+{
56648+    struct qent_src *const be_src = malloc(sizeof(*be_src));
56649+    if (!be_src)
56650+        return NULL;
56651+    *be_src = (struct qent_src){
56652+        .base = QENT_BASE_INITIALIZER
56653+    };
56654+    return be_src;
56655+}
56656+
56657+static void qe_dst_free(struct qent_dst *const be_dst)
56658+{
56659+    if (!be_dst)
56660+        return;
56661+
56662+    ff_weak_link_unref(&be_dst->mbc_wl);
56663+    pthread_cond_destroy(&be_dst->cond);
56664+    pthread_mutex_destroy(&be_dst->lock);
56665+    qe_base_uninit(&be_dst->base);
56666+    free(be_dst);
56667+}
56668+
56669+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
56670+{
56671+    struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
56672+    if (!be_dst)
56673+        return NULL;
56674+    *be_dst = (struct qent_dst){
56675+        .base = QENT_BASE_INITIALIZER,
56676+        .lock = PTHREAD_MUTEX_INITIALIZER,
56677+        .cond = PTHREAD_COND_INITIALIZER,
56678+        .mbc_wl = ff_weak_link_ref(wl)
56679+    };
56680+    return be_dst;
56681+}
56682+
56683+static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
56684+{
56685+    if (ql->tail)
56686+        ql->tail->next = be;
56687+    else
56688+        ql->head = be;
56689+    be->prev = ql->tail;
56690+    be->next = NULL;
56691+    ql->tail = be;
56692+}
56693+
56694+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
56695+{
56696+    if (!be)
56697+        return NULL;
56698+
56699+    if (be->next)
56700+        be->next->prev = be->prev;
56701+    else
56702+        ql->tail = be->prev;
56703+    if (be->prev)
56704+        be->prev->next = be->next;
56705+    else
56706+        ql->head = be->next;
56707+    be->next = NULL;
56708+    be->prev = NULL;
56709+    return be;
56710+}
56711+
56712+
56713+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
56714+{
56715+    ql_add_tail(&bp->free, be);
56716+}
56717+
56718+static struct qent_base * bq_get_free(struct buf_pool *const bp)
56719+{
56720+    return ql_extract(&bp->free, bp->free.head);
56721+}
56722+
56723+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
56724+{
56725+    return ql_extract(&bp->inuse, be);
56726+}
56727+
56728+static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
56729+{
56730+    return ql_extract(&bp->inuse, bp->inuse.head);
56731+}
56732+
56733+static void bq_free_all_free_src(struct buf_pool *const bp)
56734+{
56735+    struct qent_base *be;
56736+    while ((be = bq_get_free(bp)) != NULL)
56737+        qe_src_free(base_to_src(be));
56738+}
56739+
56740+static void bq_free_all_inuse_src(struct buf_pool *const bp)
56741+{
56742+    struct qent_base *be;
56743+    while ((be = bq_get_inuse(bp)) != NULL)
56744+        qe_src_free(base_to_src(be));
56745+}
56746+
56747+static void bq_free_all_free_dst(struct buf_pool *const bp)
56748+{
56749+    struct qent_base *be;
56750+    while ((be = bq_get_free(bp)) != NULL)
56751+        qe_dst_free(base_to_dst(be));
56752+}
56753+
56754+static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
56755+{
56756+    unsigned int i;
56757+
56758+    pthread_mutex_lock(&bp->lock);
56759+    /* Clear out state vars */
56760+    be->timestamp.tv_sec = 0;
56761+    be->timestamp.tv_usec = 0;
56762+    be->status = QENT_FREE;
56763+    for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i)
56764+        dmabuf_len_set(be->dh[i], 0);
56765+    bq_put_free(bp, be);
56766+    pthread_mutex_unlock(&bp->lock);
56767+    sem_post(&bp->free_sem);
56768+}
56769+
56770+static bool queue_is_inuse(const struct buf_pool *const bp)
56771+{
56772+    return bp->inuse.tail != NULL;
56773+}
56774+
56775+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
56776+{
56777+    if (!be)
56778+        return;
56779+    pthread_mutex_lock(&bp->lock);
56780+    ql_add_tail(&bp->inuse, be);
56781+    be->status = QENT_WAITING;
56782+    pthread_mutex_unlock(&bp->lock);
56783+}
56784+
+// Blocking get of a free qent: waits on free_sem, then pops the free list.
+// Returns NULL if the wait fails (e.g. interrupted) — callers must check.
56785+static struct qent_base *queue_get_free(struct buf_pool *const bp)
56786+{
56787+    struct qent_base *buf;
56788+
56789+    if (do_wait(&bp->free_sem))
56790+        return NULL;
56791+    pthread_mutex_lock(&bp->lock);
56792+    buf = bq_get_free(bp);
56793+    pthread_mutex_unlock(&bp->lock);
56794+    return buf;
56795+}
56796+
+// Non-blocking variant of queue_get_free(): NULL if no free qent available.
56797+static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
56798+{
56799+    struct qent_base *buf;
56800+
56801+    if (do_trywait(&bp->free_sem))
56802+        return NULL;
56803+    pthread_mutex_lock(&bp->lock);
56804+    buf = bq_get_free(bp);
56805+    pthread_mutex_unlock(&bp->lock);
56806+    return buf;
56807+}
56808+
+// Find the in-use qent whose plane-0 dmabuf has the given fd, unlink it
+// from the in-use list and return it.  Returns NULL if no match (loop
+// terminates with be == NULL).
56809+static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
56810+{
56811+    struct qent_base *be;
56812+
56813+    pthread_mutex_lock(&bp->lock);
56814+    /* Expect 1st in Q, but allow anywhere */
56815+    for (be = bp->inuse.head; be; be = be->next) {
56816+        if (dmabuf_fd(be->dh[0]) == fd) {
56817+            bq_extract_inuse(bp, be);
56818+            break;
56819+        }
56820+    }
56821+    pthread_mutex_unlock(&bp->lock);
56822+
56823+    return be;
56824+}
56825+
+// Destroy a buf_pool created by queue_new().  The queues are expected to
+// have been emptied by the caller first.
56826+static void queue_delete(struct buf_pool *const bp)
56827+{
56828+    sem_destroy(&bp->free_sem);
56829+    pthread_mutex_destroy(&bp->lock);
56830+    free(bp);
56831+}
56832+
+// Allocate an empty buf_pool (free_sem starts at 0: no free buffers yet).
+// NOTE(review): vfd is currently unused, and sem_init's return value is
+// not checked — both look intentional but worth confirming upstream.
56833+static struct buf_pool* queue_new(const int vfd)
56834+{
56835+    struct buf_pool *bp = calloc(1, sizeof(*bp));
56836+    if (!bp)
56837+        return NULL;
56838+    pthread_mutex_init(&bp->lock, NULL);
56839+    sem_init(&bp->free_sem, 0, 0);
56840+    return bp;
56841+}
56842+
56843+
+// Per-device context tying together the V4L2 fd, the negotiated src/dst
+// formats and the two buffer pools, plus the pollqueue task used to reap
+// completed buffers.
56844+struct mediabufs_ctl {
56845+    atomic_int ref_count;  /* 0 is single ref for easier atomics */
56846+    void * dc;
56847+    int vfd;
56848+    bool stream_on;
56849+    bool polling;
56850+    bool dst_fixed;             // Dst Q is fixed size
56851+    pthread_mutex_t lock;
56852+    struct buf_pool * src;
56853+    struct buf_pool * dst;
56854+    struct polltask * pt;
56855+    struct pollqueue * pq;
56856+    struct ff_weak_link_master * this_wlm;
56857+
56858+    struct v4l2_format src_fmt;
56859+    struct v4l2_format dst_fmt;
56860+    struct v4l2_capability capability;
56861+};
56862+
+// Queue a qent's dmabuf(s) to the driver with VIDIOC_QBUF.
+// For src buffers an optional media request fd is attached (with
+// M2M_HOLD_CAPTURE_BUF when hold_flag is set); dst buffers have their
+// recorded lengths and timestamp zeroed before queueing.
+// Retries on EINTR; returns 0 or -errno.
56863+static int qe_v4l2_queue(struct qent_base *const be,
56864+               const int vfd, struct media_request *const mreq,
56865+               const struct v4l2_format *const fmt,
56866+               const bool is_dst, const bool hold_flag)
56867+{
56868+    struct v4l2_buffer buffer = {
56869+        .type = fmt->type,
56870+        .memory = V4L2_MEMORY_DMABUF,
56871+        .index = be->index
56872+    };
56873+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
56874+
56875+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
56876+        unsigned int i;
56877+        for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
56878+            if (is_dst)
56879+                dmabuf_len_set(be->dh[i], 0);
56880+
56881+            /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
56882+            planes[i].length = dmabuf_size(be->dh[i]);
56883+            planes[i].bytesused = dmabuf_len(be->dh[i]);
56884+            planes[i].m.fd = dmabuf_fd(be->dh[i]);
56885+        }
56886+        buffer.m.planes = planes;
56887+        buffer.length = i;
56888+    }
56889+    else {
56890+        if (is_dst)
56891+            dmabuf_len_set(be->dh[0], 0);
56892+
56893+        buffer.bytesused = dmabuf_len(be->dh[0]);
56894+        buffer.length = dmabuf_size(be->dh[0]);
56895+        buffer.m.fd = dmabuf_fd(be->dh[0]);
56896+    }
56897+
56898+    if (!is_dst && mreq) {
56899+        buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
56900+        buffer.request_fd = media_request_fd(mreq);
56901+        if (hold_flag)
56902+            buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
56903+    }
56904+
56905+    if (is_dst)
56906+        be->timestamp = (struct timeval){0,0};
56907+
56908+    buffer.timestamp = be->timestamp;
56909+
56910+    while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
56911+        const int err = errno;
56912+        if (err != EINTR) {
56913+            request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
56914+            return -err;
56915+        }
56916+    }
56917+    return 0;
56918+}
56919+
+// Dequeue one completed buffer with VIDIOC_DQBUF (retrying EINTR), match
+// it back to its qent by plane-0 dmabuf fd, copy the driver timestamp and
+// derive the final status from V4L2_BUF_FLAG_ERROR.
+// Returns NULL on ioctl failure or if the fd is not found in the pool.
56920+static struct qent_base * qe_dequeue(struct buf_pool *const bp,
56921+                     const int vfd,
56922+                     const struct v4l2_format * const f)
56923+{
56924+    int fd;
56925+    struct qent_base *be;
56926+    int rc;
56927+    const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
56928+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
56929+    struct v4l2_buffer buffer = {
56930+        .type =  f->type,
56931+        .memory = V4L2_MEMORY_DMABUF
56932+    };
56933+    if (mp) {
56934+        buffer.length = f->fmt.pix_mp.num_planes;
56935+        buffer.m.planes = planes;
56936+    }
56937+
56938+    while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
56939+           errno == EINTR)
56940+        /* Loop */;
56941+    if (rc) {
56942+        request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
56943+        return NULL;
56944+    }
56945+
56946+    fd = mp ? planes[0].m.fd : buffer.m.fd;
56947+    be = queue_find_extract_fd(bp, fd);
56948+    if (!be) {
56949+        request_log("Failed to find fd %d in Q\n", fd);
56950+        return NULL;
56951+    }
56952+
56953+    be->timestamp = buffer.timestamp;
56954+    be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
56955+    return be;
56956+}
56957+
+// Mark a dst qent complete: wake anyone blocked in qent_dst_wait(), then
+// drop the reference taken when the buffer was queued.
56958+static void qe_dst_done(struct qent_dst * dst_be)
56959+{
56960+    pthread_mutex_lock(&dst_be->lock);
56961+    dst_be->waiting = false;
56962+    pthread_cond_broadcast(&dst_be->cond);
56963+    pthread_mutex_unlock(&dst_be->lock);
56964+
56965+    qent_dst_unref(&dst_be);
56966+}
56967+
+// Atomically test-and-set the "waiting" flag; returns the previous value
+// so a caller can detect (and refuse) double-queueing of the same dst.
56968+static bool qe_dst_waiting(struct qent_dst *const dst_be)
56969+{
56970+    bool waiting;
56971+    pthread_mutex_lock(&dst_be->lock);
56972+    waiting = dst_be->waiting;
56973+    dst_be->waiting = true;
56974+    pthread_mutex_unlock(&dst_be->lock);
56975+    return waiting;
56976+}
56977+
56978+
+// Polling is required while either queue has buffers owned by the driver.
56979+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
56980+{
56981+    return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
56982+}
56983+
+// pollqueue callback: reap at most one completed src (POLLOUT) and one
+// completed dst (POLLIN) buffer, re-arm the poll while anything is still
+// queued, then recycle/complete the reaped buffers *outside* mbc->lock
+// (qe_dst_done may wake waiters and drop the last dst ref).
56984+static void mediabufs_poll_cb(void * v, short revents)
56985+{
56986+    struct mediabufs_ctl *mbc = v;
56987+    struct qent_src *src_be = NULL;
56988+    struct qent_dst *dst_be = NULL;
56989+
56990+    if (!revents)
56991+        request_err(mbc->dc, "%s: Timeout\n", __func__);
56992+
56993+    pthread_mutex_lock(&mbc->lock);
56994+    mbc->polling = false;
56995+
56996+    if ((revents & POLLOUT) != 0)
56997+        src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
56998+    if ((revents & POLLIN) != 0)
56999+        dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
57000+
57001+    /* Reschedule */
57002+    if (mediabufs_wants_poll(mbc)) {
57003+        mbc->polling = true;
57004+        pollqueue_add_task(mbc->pt, 2000);
57005+    }
57006+    pthread_mutex_unlock(&mbc->lock);
57007+
57008+    if (src_be)
57009+        queue_put_free(mbc->src, &src_be->base);
57010+    if (dst_be)
57011+        qe_dst_done(dst_be);
57012+}
57013+
+// Stamp a src qent with the presentation timestamp to be carried through
+// the V4L2 queue/dequeue round trip.  Always succeeds (returns 0).
57014+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
57015+{
57016+    struct qent_base *const be = &be_src->base;
57017+
57018+    be->timestamp = *timestamp;
57019+    return 0;
57020+}
57021+
+// Timestamp reported by the driver for a dequeued dst buffer.
57022+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
57023+{
57024+    return be_dst->base.timestamp;
57025+}
57026+
+// Ensure plane 0 of the qent has at least len bytes, growing it via
+// dmabuf_realloc (size rounded up by round_up_size) when needed.
+// dbsc may be NULL to forbid (re)allocation.  Returns 0 or -ENOMEM.
+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
+{
+    if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
+        size_t newsize = round_up_size(len);
+        // %zu is the correct conversion for size_t (was %zd); also avoid
+        // querying the size of a dmabuf handle we do not have.
+        request_log("%s: Overrun %zu > %zu; trying %zu\n", __func__, len,
+                    be->dh[0] ? dmabuf_size(be->dh[0]) : (size_t)0, newsize);
+        if (!dbsc) {
+            // Fixed typo in the diagnostic ("dmbabuf_ctrl")
+            request_log("%s: No dmabufs_ctl for realloc\n", __func__);
+            return -ENOMEM;
+        }
+        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
+            request_log("%s: Realloc %zu failed\n", __func__, newsize);
+            return -ENOMEM;
+        }
+    }
+    return 0;
+}
57043+
+// Public wrapper: make sure a src qent's buffer can hold len bytes.
+// Returns 0 or -ENOMEM (see qent_base_realloc).
57044+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
57045+{
57046+    struct qent_base *const be = &be_src->base;
57047+    return qent_base_realloc(be, len, dbsc);
57048+}
57049+
57050+
+// Copy len bytes of bitstream into the src qent at the given offset,
+// growing the buffer first when allowed (fixed-size buffers and non-zero
+// offsets must never realloc, as dmabuf_realloc does not preserve data).
+// Returns 0, -1 on map failure, or -ENOMEM from the realloc.
+// NOTE(review): the recorded length is set to len, not offset + len —
+// presumably callers pass cumulative lengths when appending; confirm.
57051+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
57052+{
57053+    void * dst;
57054+    struct qent_base *const be = &be_src->base;
57055+    int rv;
57056+
57057+    // Realloc doesn't copy so don't alloc if offset != 0
57058+    if ((rv = qent_base_realloc(be, offset + len,
57059+                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
57060+        return rv;
57061+
57062+    dmabuf_write_start(be->dh[0]);
57063+    dst = dmabuf_map(be->dh[0]);
57064+    if (!dst)
57065+        return -1;
57066+    memcpy((char*)dst + offset, src, len);
57067+    dmabuf_len_set(be->dh[0], len);
57068+    dmabuf_write_end(be->dh[0]);
57069+    return 0;
57070+}
57071+
+// Dmabuf handle for the given plane, or NULL if the plane index is out of
+// range of the dh[] array.
57072+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
57073+{
57074+    const struct qent_base *const be = &be_dst->base;
57075+
57076+    return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane];
57077+}
57078+
+// dup() of the plane's dmabuf fd; caller owns (and must close) the new fd.
+// NOTE(review): no guard for an out-of-range plane — qent_dst_dmabuf would
+// return NULL and dmabuf_fd(NULL) is then called; confirm that is safe.
57079+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
57080+{
57081+    return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
57082+}
57083+
+// Queue a src buffer (attached to the media request) and optionally a dst
+// buffer, kick the poll task if needed, then start the request.
+// The request and src qent are consumed in all cases (set to NULL here);
+// on failure the src is recycled and any dst is completed with QENT_ERROR.
57084+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
57085+                struct media_request **const pmreq,
57086+                struct qent_src **const psrc_be,
57087+                struct qent_dst *const dst_be,
57088+                const bool is_final)
57089+{
57090+    struct media_request * mreq = *pmreq;
57091+    struct qent_src *const src_be = *psrc_be;
57092+
57093+    // Req & src are always both "consumed"
57094+    *pmreq = NULL;
57095+    *psrc_be = NULL;
57096+
57097+    pthread_mutex_lock(&mbc->lock);
57098+
57099+    if (!src_be)
57100+        goto fail1;
57101+
57102+    if (dst_be) {
57103+        if (qe_dst_waiting(dst_be)) {
57104+            request_info(mbc->dc, "Request buffer already waiting on start\n");
57105+            goto fail1;
57106+        }
57107+        dst_be->base.timestamp = (struct timeval){0,0};
57108+        if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
57109+            goto fail1;
57110+
+        // Ref held for the driver; released by qe_dst_done() on dequeue.
57111+        qent_dst_ref(dst_be);
57112+        queue_put_inuse(mbc->dst, &dst_be->base);
57113+    }
57114+
+    // !is_final keeps M2M_HOLD_CAPTURE_BUF set so multi-slice frames
+    // accumulate into the same capture buffer.
57115+    if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
57116+        goto fail1;
57117+    queue_put_inuse(mbc->src, &src_be->base);
57118+
57119+    if (!mbc->polling && mediabufs_wants_poll(mbc)) {
57120+        mbc->polling = true;
57121+        pollqueue_add_task(mbc->pt, 2000);
57122+    }
57123+    pthread_mutex_unlock(&mbc->lock);
57124+
57125+    if (media_request_start(mreq))
57126+        return MEDIABUFS_ERROR_OPERATION_FAILED;
57127+
57128+    return MEDIABUFS_STATUS_SUCCESS;
57129+
57130+fail1:
57131+    media_request_abort(&mreq);
57132+    if (src_be)
57133+        queue_put_free(mbc->src, &src_be->base);
57134+
57135+// *** TODO: If src Q fails this doesn't unwind properly - separate dst Q from src Q
57136+    if (dst_be) {
57137+        dst_be->base.status = QENT_ERROR;
57138+        qe_dst_done(dst_be);
57139+    }
57140+    pthread_mutex_unlock(&mbc->lock);
57141+    return MEDIABUFS_ERROR_OPERATION_FAILED;
57142+}
57143+
57144+
+// Size a qent's dmabuf(s) from a negotiated v4l2_format: one buffer per
+// plane for multiplanar types, a single buffer otherwise.  On multiplanar
+// failure all planes allocated so far are freed.  Returns 0 or -1.
57145+static int qe_alloc_from_fmt(struct qent_base *const be,
57146+                   struct dmabufs_ctl *const dbsc,
57147+                   const struct v4l2_format *const fmt)
57148+{
57149+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
57150+        unsigned int i;
57151+        for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
57152+            be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
57153+                fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
57154+            /* On failure tidy up and die */
57155+            if (!be->dh[i]) {
57156+                while (i--) {
57157+                    dmabuf_free(be->dh[i]);
57158+                    be->dh[i] = NULL;
57159+                }
57160+                return -1;
57161+            }
57162+        }
57163+    }
57164+    else {
57165+//      be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
57166+        size_t size = fmt->fmt.pix.sizeimage;
57167+        be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
57168+        if (!be->dh[0])
57169+            return -1;
57170+    }
57171+    return 0;
57172+}
57173+
+// S_FMT the given buffer type with the requested pixfmt/size (retrying
+// EINTR).  The driver may adjust the format; anything smaller than asked
+// for, or a different pixfmt, is treated as unsupported.
57174+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
57175+            const enum v4l2_buf_type buftype,
57176+            uint32_t pixfmt,
57177+            const unsigned int width, const unsigned int height,
57178+                               const size_t bufsize)
57179+{
57180+    *fmt = (struct v4l2_format){.type = buftype};
57181+
57182+    if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
57183+        fmt->fmt.pix_mp.width = width;
57184+        fmt->fmt.pix_mp.height = height;
57185+        fmt->fmt.pix_mp.pixelformat = pixfmt;
57186+        if (bufsize) {
57187+            fmt->fmt.pix_mp.num_planes = 1;
57188+            fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
57189+        }
57190+    }
57191+    else {
57192+        fmt->fmt.pix.width = width;
57193+        fmt->fmt.pix.height = height;
57194+        fmt->fmt.pix.pixelformat = pixfmt;
57195+        fmt->fmt.pix.sizeimage = bufsize;
57196+    }
57197+
57198+    while (ioctl(fd, VIDIOC_S_FMT, fmt))
57199+        if (errno != EINTR)
57200+            return MEDIABUFS_ERROR_OPERATION_FAILED;
57201+
57202+    // Treat anything where we don't get at least what we asked for as a fail
57203+    if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
57204+        if (fmt->fmt.pix_mp.width < width ||
57205+            fmt->fmt.pix_mp.height < height ||
57206+            fmt->fmt.pix_mp.pixelformat != pixfmt) {
57207+            return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
57208+        }
57209+    }
57210+    else {
57211+        if (fmt->fmt.pix.width < width ||
57212+            fmt->fmt.pix.height < height ||
57213+            fmt->fmt.pix.pixelformat != pixfmt) {
57214+            return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
57215+        }
57216+    }
57217+
57218+    return MEDIABUFS_STATUS_SUCCESS;
57219+}
57220+
+// Enumerate the device's formats (VIDIOC_ENUM_FMT) and try to S_FMT the
+// first one that matches the flag constraints and the caller's accept_fn.
+// Exhausting the enumeration surfaces as the driver returning an error,
+// which yields MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE.
57221+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
57222+                   const int fd,
57223+                   const unsigned int type_v4l2,
57224+                   const uint32_t flags_must,
57225+                   const uint32_t flags_not,
57226+                   const unsigned int width,
57227+                   const unsigned int height,
57228+                   mediabufs_dst_fmt_accept_fn *const accept_fn,
57229+                   void *const accept_v)
57230+{
57231+    unsigned int i;
57232+
57233+    for (i = 0;; ++i) {
57234+        struct v4l2_fmtdesc fmtdesc = {
57235+            .index = i,
57236+            .type = type_v4l2
57237+        };
57238+        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
57239+            if (errno != EINTR)
57240+                return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
57241+        }
57242+        if ((fmtdesc.flags & flags_must) != flags_must ||
57243+            (fmtdesc.flags & flags_not))
57244+            continue;
57245+        if (!accept_fn(accept_v, &fmtdesc))
57246+            continue;
57247+
57248+        if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
57249+                width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
57250+            return MEDIABUFS_STATUS_SUCCESS;
57251+    }
+    // Unreachable: the loop above only exits via return.
57252+    return 0;
57253+}
57254+
57255+
57256+/* Wait for qent done */
57257+
+// Block until the dst qent's "waiting" flag is cleared by qe_dst_done(),
+// then map its final status to a MediaBufsStatus.
57258+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
57259+{
57260+    struct qent_base *const be = &be_dst->base;
57261+    enum qent_status estat;
57262+
57263+    pthread_mutex_lock(&be_dst->lock);
57264+    while (be_dst->waiting &&
57265+           !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
57266+        /* Loop */;
57267+    estat = be->status;
57268+    pthread_mutex_unlock(&be_dst->lock);
57269+
57270+    return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
57271+        estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR :
57272+            MEDIABUFS_ERROR_OPERATION_FAILED;
57273+}
57274+
+// CPU-visible mapping of the given plane's dmabuf.
57275+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
57276+{
57277+    struct qent_base *const be = &be_dst->base;
57278+    return dmabuf_map(be->dh[buf_no]);
57279+}
57280+
+// Begin a CPU read of every plane (cache sync); on failure unwind the
+// planes already started.
57281+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
57282+{
57283+    struct qent_base *const be = &be_dst->base;
57284+    unsigned int i;
57285+    for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
57286+        if (dmabuf_read_start(be->dh[i])) {
57287+            while (i--)
57288+                dmabuf_read_end(be->dh[i]);
57289+            return MEDIABUFS_ERROR_ALLOCATION_FAILED;
57290+        }
57291+    }
57292+    return MEDIABUFS_STATUS_SUCCESS;
57293+}
57294+
+// End the CPU read on every plane, continuing past failures but
+// reporting them collectively.
57295+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
57296+{
57297+    struct qent_base *const be = &be_dst->base;
57298+    unsigned int i;
57299+    MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
57300+
57301+    for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
57302+        if (dmabuf_read_end(be->dh[i]))
57303+            status = MEDIABUFS_ERROR_OPERATION_FAILED;
57304+    }
57305+    return status;
57306+}
57307+
+// Take an extra reference (NULL-safe).  Note ref_count of 0 means one
+// reference (see the field comment on mediabufs_ctl.ref_count convention).
57308+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
57309+{
57310+    if (be_dst)
57311+        atomic_fetch_add(&be_dst->base.ref_count, 1);
57312+    return be_dst;
57313+}
57314+
+// Drop a reference (consuming and NULLing the caller's pointer).  On the
+// last ref the qent goes back to the pool's free list if the owning
+// mediabufs_ctl is still alive (weak link), otherwise it is freed.
57315+void qent_dst_unref(struct qent_dst ** const pbe_dst)
57316+{
57317+    struct qent_dst * const be_dst = *pbe_dst;
57318+    struct mediabufs_ctl * mbc;
57319+    if (!be_dst)
57320+        return;
57321+    *pbe_dst = NULL;
57322+
+    // fetch_sub returns the previous count; non-zero means refs remain.
57323+    if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
57324+        return;
57325+
57326+    if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
57327+        queue_put_free(mbc->dst, &be_dst->base);
57328+        ff_weak_link_unlock(be_dst->mbc_wl);
57329+    }
57330+    else {
57331+        qe_dst_free(be_dst);
57332+    }
57333+}
57334+
+// Attach an externally-allocated dmabuf fd to a plane of an "import mode"
+// dst qent (one created with mediabufs_dst_qent_alloc(NULL, ...)).
+// Fails if the qent is not in QENT_IMPORT state or the plane is occupied.
57335+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
57336+                unsigned int plane,
57337+                int fd, size_t size)
57338+{
57339+    struct qent_base *const be = &be_dst->base;
57340+    struct dmabuf_h * dh;
57341+
57342+    if (be->status != QENT_IMPORT || be->dh[plane])
57343+        return MEDIABUFS_ERROR_OPERATION_FAILED;
57344+
57345+    dh = dmabuf_import(fd, size);
57346+    if (!dh)
57347+        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
57348+
57349+    be->dh[plane] = dh;
57350+    return MEDIABUFS_STATUS_SUCCESS;
57351+}
57352+
+// Create n V4L2 dst buffers (VIDIOC_CREATE_BUFS) and record the allotted
+// buffer indices in qes[].  Retries on EINTR.
+// Returns the number of buffers actually created, or -ve errno on error.
+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
+{
+    unsigned int i;
+
+    struct v4l2_create_buffers cbuf = {
+        .count = n,
+        .memory = V4L2_MEMORY_DMABUF,
+        .format = mbc->dst_fmt,
+    };
+
+    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
+        // Keep errno positive for the comparison: the previous code negated
+        // it first, so "err != EINTR" was always true (EINTR never retried)
+        // and "-err" returned a *positive* errno, defeating callers' < 0
+        // error checks (e.g. mediabufs_dst_slots_create).
+        const int err = errno;
+        if (err != EINTR) {
+            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
+            return -err;
+        }
+    }
+
+    if (cbuf.count != n)
+        request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
+
+    // cbuf.index is the index of the first newly created buffer.
+    for (i = 0; i != cbuf.count; ++i)
+        qes[i]->base.index = cbuf.index + i;
+
+    return cbuf.count;
+}
57380+
+// Obtain a dst qent and size its dmabufs to the negotiated dst format.
+// mbc == NULL gives an "import mode" qent to be filled via
+// qent_dst_import_fd().  With a fixed dst queue we block for a free slot;
+// otherwise we take a free one or create a new V4L2 buffer on demand.
+// NOTE(review): relies on base_to_dst(NULL) yielding NULL (base presumably
+// the first member) — confirm against the qent_dst definition.
57381+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
57382+{
57383+    struct qent_dst * be_dst;
57384+
57385+    if (mbc == NULL) {
57386+        be_dst = qe_dst_new(NULL);
57387+        if (be_dst)
57388+            be_dst->base.status = QENT_IMPORT;
57389+        return be_dst;
57390+    }
57391+
57392+    if (mbc->dst_fixed) {
57393+        be_dst = base_to_dst(queue_get_free(mbc->dst));
57394+        if (!be_dst)
57395+            return NULL;
57396+    }
57397+    else {
57398+        be_dst = base_to_dst(queue_tryget_free(mbc->dst));
57399+        if (!be_dst) {
57400+            be_dst = qe_dst_new(mbc->this_wlm);
57401+            if (!be_dst)
57402+                return NULL;
57403+
57404+            if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
57405+                qe_dst_free(be_dst);
57406+                return NULL;
57407+            }
57408+        }
57409+    }
57410+
57411+    if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
57412+        /* Given  how create buf works we can't uncreate it on alloc failure
57413+         * all we can do is put it on the free Q
57414+        */
57415+        queue_put_free(mbc->dst, &be_dst->base);
57416+        return NULL;
57417+    }
57418+
57419+    be_dst->base.status = QENT_PENDING;
57420+    atomic_store(&be_dst->base.ref_count, 0);
57421+    return be_dst;
57422+}
57423+
+// Accessor for the currently negotiated dst (capture) format.
57424+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
57425+{
57426+    return &mbc->dst_fmt;
57427+}
57428+
+// Negotiate the dst format: first try only native (non-emulated) formats,
+// then fall back to emulated ones, using the caller's accept_fn to pick.
+// Any error other than "unsupported" aborts the search immediately.
57429+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
57430+               const unsigned int width,
57431+               const unsigned int height,
57432+               mediabufs_dst_fmt_accept_fn *const accept_fn,
57433+               void *const accept_v)
57434+{
57435+    MediaBufsStatus status;
57436+    unsigned int i;
57437+    const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
57438+    static const struct {
57439+        unsigned int flags_must;
57440+        unsigned int flags_not;
57441+    } trys[] = {
57442+        {0, V4L2_FMT_FLAG_EMULATED},
57443+        {V4L2_FMT_FLAG_EMULATED, 0},
57444+    };
57445+    for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
57446+        status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
57447+                                buf_type,
57448+                                trys[i].flags_must,
57449+                                trys[i].flags_not,
57450+                                width, height, accept_fn, accept_v);
57451+        if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)
57452+            return status;
57453+    }
57454+
+    // Reached only when both tries were "unsupported"; the two returns
+    // below are equivalent and kept as in the original.
57455+    if (status != MEDIABUFS_STATUS_SUCCESS)
57456+        return status;
57457+
57458+    /* Try to create a buffer - don't alloc */
57459+    return status;
57460+}
57461+
57462+// ** This is a mess if we get partial alloc but without any way to remove
57463+//    individual V4L2 Q members we are somewhat stuffed
+// Pre-create n dst qents and their V4L2 buffers (max 32), putting each on
+// the free queue.  "fixed" makes the dst queue fixed-size (blocking gets).
+// On failure any qents not yet handed to the free queue are freed; the
+// V4L2 buffers themselves cannot be uncreated (see note above).
57464+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
57465+{
57466+    unsigned int i;
57467+    int a = 0;
57468+    unsigned int qc;
57469+    struct qent_dst * qes[32];
57470+
57471+    if (n > 32)
57472+        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
57473+
57474+    // Create qents first as it is hard to get rid of the V4L2 buffers on error
57475+    for (qc = 0; qc != n; ++qc)
57476+    {
57477+        if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
57478+            goto fail;
57479+    }
57480+
57481+    if ((a = create_dst_bufs(mbc, n, qes)) < 0)
57482+        goto fail;
57483+
57484+    for (i = 0; i != a; ++i)
57485+        queue_put_free(mbc->dst, &qes[i]->base);
57486+
57487+    if (a != n)
57488+        goto fail;
57489+
57490+    mbc->dst_fixed = fixed;
57491+    return MEDIABUFS_STATUS_SUCCESS;
57492+
57493+fail:
+    // Free only the qents in [a, qc): the first a are owned by the free Q.
57494+    for (i = (a < 0 ? 0 : a); i != qc; ++i)
57495+        qe_dst_free(qes[i]);
57496+
57497+    return MEDIABUFS_ERROR_ALLOCATION_FAILED;
57498+}
57499+
+// Get a free src qent, blocking until one is available.
+// Returns NULL if the wait fails or is interrupted — previously the NULL
+// from queue_get_free() was dereferenced unconditionally, crashing the
+// process; callers (e.g. mediabufs_start_request) already handle NULL.
+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
+{
+    struct qent_base * buf = queue_get_free(mbc->src);
+    if (!buf)
+        return NULL;
+    buf->status = QENT_PENDING;
+    return base_to_src(buf);
+}
57506+
+// Return an unqueued src qent to the free pool, consuming and NULLing
+// the caller's pointer (NULL-safe).
57507+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
57508+{
57509+    struct qent_src *const qe_src = *pqe_src;
57510+    if (!qe_src)
57511+        return;
57512+    *pqe_src = NULL;
57513+    queue_put_free(mbc->src, &qe_src->base);
57514+}
57515+
57516+/* src format must have been set up before this */
+// Request n src buffer slots from the driver (REQBUFS), then allocate a
+// dmabuf-backed qent for each (sized from src_fmt) and put it on the free
+// queue.  On failure the free qents are destroyed and the driver slots
+// released with a count-0 REQBUFS.
57517+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
57518+                  struct dmabufs_ctl * const dbsc,
57519+                  unsigned int n)
57520+{
57521+    unsigned int i;
57522+    struct v4l2_requestbuffers req = {
57523+        .count = n,
57524+        .type = mbc->src_fmt.type,
57525+        .memory = V4L2_MEMORY_DMABUF
57526+    };
57527+
57528+    bq_free_all_free_src(mbc->src);
57529+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
57530+        if (errno != EINTR) {
57531+            request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
57532+            return MEDIABUFS_ERROR_OPERATION_FAILED;
57533+        }
57534+    }
57535+
+    // The driver may grant fewer buffers than asked for.
57536+    if (n > req.count) {
57537+        request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
57538+        n = req.count;
57539+    }
57540+
57541+    for (i = 0; i != n; ++i) {
57542+        struct qent_src *const be_src = qe_src_new();
57543+        if (!be_src) {
57544+            request_err(mbc->dc, "Failed to create src be %d\n", i);
57545+            goto fail;
57546+        }
57547+        if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
57548+            qe_src_free(be_src);
57549+            goto fail;
57550+        }
57551+        be_src->base.index = i;
57552+        be_src->fixed_size = !mediabufs_src_resizable(mbc);
57553+
57554+        queue_put_free(mbc->src, &be_src->base);
57555+    }
57556+
57557+    return MEDIABUFS_STATUS_SUCCESS;
57558+
57559+fail:
57560+    bq_free_all_free_src(mbc->src);
57561+    req.count = 0;
57562+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
57563+           errno == EINTR)
57564+        /* Loop */;
57565+
57566+    return MEDIABUFS_ERROR_OPERATION_FAILED;
57567+}
57568+
57569+
57570+
57571+/*
57572+ * Set stuff order:
57573+ *  Set src fmt
57574+ *  Set parameters (sps) on vfd
57575+ *  Negotiate dst format (dst_fmt_set)
57576+ *  Create src buffers
57577+ *  Alloc a dst buffer or Create dst slots
57578+*/
+// STREAMON both queues (src first); idempotent.  If the dst side fails
+// the src side is rolled back so the device is left fully off.
57579+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
57580+{
57581+    if (mbc->stream_on)
57582+        return MEDIABUFS_STATUS_SUCCESS;
57583+
57584+    if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
57585+        request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
57586+        return MEDIABUFS_ERROR_OPERATION_FAILED;
57587+    }
57588+
57589+    if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
57590+        request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
57591+        set_stream(mbc->vfd, mbc->src_fmt.type, false);
57592+        return MEDIABUFS_ERROR_OPERATION_FAILED;
57593+    }
57594+
57595+    mbc->stream_on = true;
57596+    return MEDIABUFS_STATUS_SUCCESS;
57597+}
57598+
+// STREAMOFF both queues (dst first); idempotent.  Both sides are always
+// attempted; any failure is reported after both tries.
57599+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
57600+{
57601+    MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
57602+
57603+    if (!mbc->stream_on)
57604+        return MEDIABUFS_STATUS_SUCCESS;
57605+
57606+    if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
57607+        request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
57608+        status = MEDIABUFS_ERROR_OPERATION_FAILED;
57609+    }
57610+
57611+    if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
57612+        request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
57613+        status = MEDIABUFS_ERROR_OPERATION_FAILED;
57614+    }
57615+
57616+    mbc->stream_on = false;
57617+    return status;
57618+}
57619+
+// Apply an array of extended controls (VIDIOC_S_EXT_CTRLS), optionally
+// bound to a media request.  Retries EINTR; returns 0 or -errno.
57620+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
57621+{
57622+    struct v4l2_ext_controls controls = {
57623+        .controls = control_array,
57624+        .count = n
57625+    };
57626+
57627+    if (mreq) {
57628+        controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
57629+        controls.request_fd = media_request_fd(mreq);
57630+    }
57631+
57632+    while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))
57633+    {
57634+        const int err = errno;
57635+        if (err != EINTR) {
57636+            request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
57637+            return -err;
57638+        }
57639+    }
57640+
57641+    return 0;
57642+}
57643+
+// Convenience wrapper: set a single pointer-style extended control and
+// fold the errno result into a MediaBufsStatus.
57644+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
57645+                struct media_request * const mreq,
57646+                unsigned int id, void *data,
57647+                unsigned int size)
57648+{
57649+    struct v4l2_ext_control control = {
57650+        .id = id,
57651+        .ptr = data,
57652+        .size = size
57653+    };
57654+
57655+    int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
57656+    return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;
57657+}
57658+
+// Negotiate the src (output) format via fmt_set(), logging on failure.
57659+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
57660+                                      enum v4l2_buf_type buf_type,
57661+                   const uint32_t pixfmt,
57662+                   const uint32_t width, const uint32_t height,
57663+                                      const size_t bufsize)
57664+{
57665+    MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
57666+    if (rv != MEDIABUFS_STATUS_SUCCESS)
57667+        request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
57668+
57669+    return rv;
57670+}
57671+
+// Query each of n extended control descriptors in turn (retrying EINTR).
+// A failed query marks that entry's type 0 (invalid) and records -errno,
+// but the remaining entries are still processed; returns the last error.
57672+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
57673+{
57674+    int rv = 0;
57675+    while (n--) {
57676+        while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {
57677+            const int err = errno;
57678+            if (err != EINTR) {
57679+                // Often used for probing - errors are to be expected
57680+                request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
57681+                ctrls->type = 0; // 0 is invalid
57682+                rv = -err;
57683+                break;
57684+            }
57685+        }
57686+        ++ctrls;
57687+    }
57688+    return rv;
57689+}
57690+
+// Whether src buffers may be larger than the negotiated size.
57691+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
57692+{
57693+    // Single planar OUTPUT can only take exact size buffers
57694+    // Multiplanar will take larger than negotiated
57695+    return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
57696+}
57697+
+// Tear down a mediabufs_ctl (last-ref path of mediabufs_ctl_unref):
+// break the weak link so qent_dst_unref stops requeueing, stop polling,
+// stream off, release driver buffers, free pooled qents, and complete any
+// dst qents still outstanding with QENT_ERROR before freeing the queues.
57698+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
57699+{
57700+    if (!mbc)
57701+        return;
57702+
57703+    // Break the weak link first
57704+    ff_weak_link_break(&mbc->this_wlm);
57705+
57706+    polltask_delete(&mbc->pt);
57707+
57708+    mediabufs_stream_off(mbc);
57709+
57710+    // Empty v4l2 buffer stash
57711+    request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
57712+    request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
57713+
57714+    bq_free_all_free_src(mbc->src);
57715+    bq_free_all_inuse_src(mbc->src);
57716+    bq_free_all_free_dst(mbc->dst);
57717+
57718+    {
+        // In-use dst qents may still be referenced elsewhere: signal error
+        // and let qe_dst_done drop the queue's reference.
57719+        struct qent_dst *dst_be;
57720+        while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
57721+            dst_be->base.timestamp = (struct timeval){0};
57722+            dst_be->base.status = QENT_ERROR;
57723+            qe_dst_done(dst_be);
57724+        }
57725+    }
57726+
57727+    queue_delete(mbc->dst);
57728+    queue_delete(mbc->src);
57729+    close(mbc->vfd);
57730+    pthread_mutex_destroy(&mbc->lock);
57731+
57732+    free(mbc);
57733+}
57734+
57735+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
57736+{
57737+    atomic_fetch_add(&mbc->ref_count, 1);
57738+    return mbc;
57739+}
57740+
57741+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
57742+{
57743+    struct mediabufs_ctl *const mbc = *pmbc;
57744+    int n;
57745+
57746+    if (!mbc)
57747+        return;
57748+    *pmbc = NULL;
57749+    n = atomic_fetch_sub(&mbc->ref_count, 1);
57750+    if (n)
57751+        return;
57752+    mediabufs_ctl_delete(mbc);
57753+}
57754+
57755+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
57756+{
57757+    return mbc->capability.version;
57758+}
57759+
57760+static int set_capabilities(struct mediabufs_ctl *const mbc)
57761+{
57762+    uint32_t caps;
57763+
57764+    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
57765+        int err = errno;
57766+        request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
57767+        return -err;
57768+    }
57769+
57770+    caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
57771+            mbc->capability.device_caps :
57772+            mbc->capability.capabilities;
57773+
57774+    if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
57775+        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
57776+        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
57777+    }
57778+    else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
57779+        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
57780+        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
57781+    }
57782+    else {
57783+        request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
57784+        return -EINVAL;
57785+    }
57786+
57787+    return 0;
57788+}
57789+
57790+/* One of these per context */
57791+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
57792+{
57793+    struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
57794+
57795+    if (!mbc)
57796+        return NULL;
57797+
57798+    mbc->dc = dc;
57799+    // Default mono planar
57800+    mbc->pq = pq;
57801+    pthread_mutex_init(&mbc->lock, NULL);
57802+
57803+    /* Pick a default  - could we scan for this? */
57804+    if (vpath == NULL)
57805+        vpath = "/dev/media0";
57806+
57807+    while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
57808+    {
57809+        const int err = errno;
57810+        if (err != EINTR) {
57811+            request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
57812+            goto fail0;
57813+        }
57814+    }
57815+
57816+    if (set_capabilities(mbc)) {
57817+        request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
57818+        goto fail1;
57819+    }
57820+
57821+    mbc->src = queue_new(mbc->vfd);
57822+    if (!mbc->src)
57823+        goto fail1;
57824+    mbc->dst = queue_new(mbc->vfd);
57825+    if (!mbc->dst)
57826+        goto fail2;
57827+    mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
57828+    if (!mbc->pt)
57829+        goto fail3;
57830+    mbc->this_wlm = ff_weak_link_new(mbc);
57831+    if (!mbc->this_wlm)
57832+        goto fail4;
57833+
57834+    /* Cannot add polltask now - polling with nothing pending
57835+     * generates infinite error polls
57836+    */
57837+    return mbc;
57838+
57839+fail4:
57840+    polltask_delete(&mbc->pt);
57841+fail3:
57842+    queue_delete(mbc->dst);
57843+fail2:
57844+    queue_delete(mbc->src);
57845+fail1:
57846+    close(mbc->vfd);
57847+fail0:
57848+    free(mbc);
57849+    request_info(dc, "%s: FAILED\n", __func__);
57850+    return NULL;
57851+}
57852+
57853+
57854+
57855--- /dev/null
57856+++ b/libavcodec/v4l2_req_media.h
57857@@ -0,0 +1,154 @@
57858+/*
57859+ * v4l2_req_media.h
57860+ *
57861+ * Permission is hereby granted, free of charge, to any person obtaining a
57862+ * copy of this software and associated documentation files (the
57863+ * "Software"), to deal in the Software without restriction, including
57864+ * without limitation the rights to use, copy, modify, merge, publish,
57865+ * distribute, sub license, and/or sell copies of the Software, and to
57866+ * permit persons to whom the Software is furnished to do so, subject to
57867+ * the following conditions:
57868+ *
57869+ * The above copyright notice and this permission notice (including the
57870+ * next paragraph) shall be included in all copies or substantial portions
57871+ * of the Software.
57872+ *
57873+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
57874+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
57875+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
57876+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
57877+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
57878+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
57879+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
57880+ */
57881+
57882+#ifndef _MEDIA_H_
57883+#define _MEDIA_H_
57884+
57885+#include <stdbool.h>
57886+#include <stdint.h>
57887+
57888+struct v4l2_format;
57889+struct v4l2_fmtdesc;
57890+struct v4l2_query_ext_ctrl;
57891+
57892+struct pollqueue;
57893+struct media_request;
57894+struct media_pool;
57895+
57896+typedef enum media_buf_status {
57897+    MEDIABUFS_STATUS_SUCCESS = 0,
57898+    MEDIABUFS_ERROR_OPERATION_FAILED,
57899+    MEDIABUFS_ERROR_DECODING_ERROR,
57900+    MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
57901+    MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
57902+    MEDIABUFS_ERROR_ALLOCATION_FAILED,
57903+} MediaBufsStatus;
57904+
57905+struct media_pool * media_pool_new(const char * const media_path,
57906+                   struct pollqueue * const pq,
57907+                   const unsigned int n);
57908+void media_pool_delete(struct media_pool ** pmp);
57909+
57910+// Obtain a media request
57911+// Will block if none available - has a 2sec timeout
57912+struct media_request * media_request_get(struct media_pool * const mp);
57913+int media_request_fd(const struct media_request * const req);
57914+
57915+// Start this request
57916+// Request structure is returned to pool once done
57917+int media_request_start(struct media_request * const req);
57918+
57919+// Return an *unstarted* media_request to the pool
57920+// May later be upgraded to allow for aborting a started req
57921+int media_request_abort(struct media_request ** const preq);
57922+
57923+
57924+struct mediabufs_ctl;
57925+struct qent_src;
57926+struct qent_dst;
57927+struct dmabuf_h;
57928+struct dmabufs_ctl;
57929+
57930+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
57931+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
57932+
57933+// prealloc
57934+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
57935+// dbsc may be NULL if realloc not required
57936+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
57937+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
57938+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
57939+MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
57940+void qent_dst_delete(struct qent_dst *const be);
57941+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
57942+void qent_dst_unref(struct qent_dst ** const pbe_dst);
57943+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
57944+
57945+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
57946+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
57947+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
57948+/* Import an fd unattached to any mediabuf */
57949+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
57950+                unsigned int plane,
57951+                int fd, size_t size);
57952+
57953+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
57954+                struct media_request **const pmreq,
57955+                struct qent_src **const psrc_be,
57956+                struct qent_dst *const dst_be,
57957+                const bool is_final);
57958+// Get / alloc a dst buffer & associate with a slot
57959+// If the dst pool is empty then behaviour depends on the fixed flag passed to
57960+// dst_slots_create.  Default is !fixed = unlimited alloc
57961+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
57962+                           struct dmabufs_ctl *const dbsc);
57963+// Create dst slots without alloc
57964+// If fixed true then qent_alloc will only get slots from this pool and will
57965+// block until a qent has been unrefed
57966+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
57967+
57968+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
57969+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
57970+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
57971+
57972+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
57973+
57974+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
57975+               const unsigned int width,
57976+               const unsigned int height,
57977+               mediabufs_dst_fmt_accept_fn *const accept_fn,
57978+               void *const accept_v);
57979+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
57980+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
57981+
57982+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
57983+                                struct v4l2_ext_control control_array[], unsigned int n);
57984+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
57985+                struct media_request * const mreq,
57986+                unsigned int id, void *data,
57987+                unsigned int size);
57988+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
57989+
57990+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
57991+
57992+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
57993+                                      enum v4l2_buf_type buf_type,
57994+                                      const uint32_t pixfmt,
57995+                                      const uint32_t width, const uint32_t height,
57996+                                      const size_t bufsize);
57997+
57998+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
57999+                  struct dmabufs_ctl * const dbsc,
58000+                  unsigned int n);
58001+
58002+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
58003+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
58004+
58005+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
58006+                     const char *vpath, struct pollqueue *const pq);
58007+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
58008+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
58009+
58010+
58011+#endif
58012--- /dev/null
58013+++ b/libavcodec/v4l2_req_pollqueue.c
58014@@ -0,0 +1,361 @@
58015+#include <errno.h>
58016+#include <limits.h>
58017+#include <poll.h>
58018+#include <pthread.h>
58019+#include <semaphore.h>
58020+#include <stdatomic.h>
58021+#include <stdbool.h>
58022+#include <stdlib.h>
58023+#include <stdint.h>
58024+#include <stdio.h>
58025+#include <string.h>
58026+#include <unistd.h>
58027+#include <sys/eventfd.h>
58028+
58029+#include "v4l2_req_pollqueue.h"
58030+#include "v4l2_req_utils.h"
58031+
58032+
58033+struct pollqueue;
58034+
58035+enum polltask_state {
58036+    POLLTASK_UNQUEUED = 0,
58037+    POLLTASK_QUEUED,
58038+    POLLTASK_RUNNING,
58039+    POLLTASK_Q_KILL,
58040+    POLLTASK_RUN_KILL,
58041+};
58042+
58043+struct polltask {
58044+    struct polltask *next;
58045+    struct polltask *prev;
58046+    struct pollqueue *q;
58047+    enum polltask_state state;
58048+
58049+    int fd;
58050+    short events;
58051+
58052+    void (*fn)(void *v, short revents);
58053+    void * v;
58054+
58055+    uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
58056+    sem_t kill_sem;
58057+};
58058+
58059+struct pollqueue {
58060+    atomic_int ref_count;
58061+    pthread_mutex_t lock;
58062+
58063+    struct polltask *head;
58064+    struct polltask *tail;
58065+
58066+    bool kill;
58067+    bool no_prod;
58068+    int prod_fd;
58069+    struct polltask *prod_pt;
58070+    pthread_t worker;
58071+};
58072+
58073+struct polltask *polltask_new(struct pollqueue *const pq,
58074+                              const int fd, const short events,
58075+                  void (*const fn)(void *v, short revents),
58076+                  void *const v)
58077+{
58078+    struct polltask *pt;
58079+
58080+    if (!events)
58081+        return NULL;
58082+
58083+    pt = malloc(sizeof(*pt));
58084+    if (!pt)
58085+        return NULL;
58086+
58087+    *pt = (struct polltask){
58088+        .next = NULL,
58089+        .prev = NULL,
58090+        .q = pollqueue_ref(pq),
58091+        .fd = fd,
58092+        .events = events,
58093+        .fn = fn,
58094+        .v = v
58095+    };
58096+
58097+    sem_init(&pt->kill_sem, 0, 0);
58098+
58099+    return pt;
58100+}
58101+
58102+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
58103+{
58104+    if (pt->prev)
58105+        pt->prev->next = pt->next;
58106+    else
58107+        pq->head = pt->next;
58108+    if (pt->next)
58109+        pt->next->prev = pt->prev;
58110+    else
58111+        pq->tail = pt->prev;
58112+    pt->next = NULL;
58113+    pt->prev = NULL;
58114+}
58115+
58116+static void polltask_free(struct polltask * const pt)
58117+{
58118+    sem_destroy(&pt->kill_sem);
58119+    free(pt);
58120+}
58121+
58122+static int pollqueue_prod(const struct pollqueue *const pq)
58123+{
58124+    static const uint64_t one = 1;
58125+    return write(pq->prod_fd, &one, sizeof(one));
58126+}
58127+
58128+void polltask_delete(struct polltask **const ppt)
58129+{
58130+    struct polltask *const pt = *ppt;
58131+    struct pollqueue * pq;
58132+    enum polltask_state state;
58133+    bool prodme;
58134+
58135+    if (!pt)
58136+        return;
58137+
58138+    pq = pt->q;
58139+    pthread_mutex_lock(&pq->lock);
58140+    state = pt->state;
58141+    pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
58142+    prodme = !pq->no_prod;
58143+    pthread_mutex_unlock(&pq->lock);
58144+
58145+    if (state != POLLTASK_UNQUEUED) {
58146+        if (prodme)
58147+            pollqueue_prod(pq);
58148+        while (sem_wait(&pt->kill_sem) && errno == EINTR)
58149+            /* loop */;
58150+    }
58151+
58152+    // Leave zapping the ref until we have DQed the PT as might well be
58153+    // legitimately used in it
58154+    *ppt = NULL;
58155+    polltask_free(pt);
58156+    pollqueue_unref(&pq);
58157+}
58158+
58159+static uint64_t pollqueue_now(int timeout)
58160+{
58161+    struct timespec now;
58162+    uint64_t now_ms;
58163+
58164+    if (clock_gettime(CLOCK_MONOTONIC, &now))
58165+        return 0;
58166+    now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
58167+    return now_ms ? now_ms : (uint64_t)1;
58168+}
58169+
58170+void pollqueue_add_task(struct polltask *const pt, const int timeout)
58171+{
58172+    bool prodme = false;
58173+    struct pollqueue * const pq = pt->q;
58174+
58175+    pthread_mutex_lock(&pq->lock);
58176+    if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
58177+        if (pq->tail)
58178+            pq->tail->next = pt;
58179+        else
58180+            pq->head = pt;
58181+        pt->prev = pq->tail;
58182+        pt->next = NULL;
58183+        pt->state = POLLTASK_QUEUED;
58184+        pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
58185+        pq->tail = pt;
58186+        prodme = !pq->no_prod;
58187+    }
58188+    pthread_mutex_unlock(&pq->lock);
58189+    if (prodme)
58190+        pollqueue_prod(pq);
58191+}
58192+
58193+static void *poll_thread(void *v)
58194+{
58195+    struct pollqueue *const pq = v;
58196+    struct pollfd *a = NULL;
58197+    size_t asize = 0;
58198+
58199+    pthread_mutex_lock(&pq->lock);
58200+    do {
58201+        unsigned int i;
58202+        unsigned int n = 0;
58203+        struct polltask *pt;
58204+        struct polltask *pt_next;
58205+        uint64_t now = pollqueue_now(0);
58206+        int timeout = -1;
58207+        int rv;
58208+
58209+        for (pt = pq->head; pt; pt = pt_next) {
58210+            int64_t t;
58211+
58212+            pt_next = pt->next;
58213+
58214+            if (pt->state == POLLTASK_Q_KILL) {
58215+                pollqueue_rem_task(pq, pt);
58216+                sem_post(&pt->kill_sem);
58217+                continue;
58218+            }
58219+
58220+            if (n >= asize) {
58221+                asize = asize ? asize * 2 : 4;
58222+                a = realloc(a, asize * sizeof(*a));
58223+                if (!a) {
58224+                    request_log("Failed to realloc poll array to %zd\n", asize);
58225+                    goto fail_locked;
58226+                }
58227+            }
58228+
58229+            a[n++] = (struct pollfd){
58230+                .fd = pt->fd,
58231+                .events = pt->events
58232+            };
58233+
58234+            t = (int64_t)(pt->timeout - now);
58235+            if (pt->timeout && t < INT_MAX &&
58236+                (timeout < 0 || (int)t < timeout))
58237+                timeout = (t < 0) ? 0 : (int)t;
58238+        }
58239+        pthread_mutex_unlock(&pq->lock);
58240+
58241+        if ((rv = poll(a, n, timeout)) == -1) {
58242+            if (errno != EINTR) {
58243+                request_log("Poll error: %s\n", strerror(errno));
58244+                goto fail_unlocked;
58245+            }
58246+        }
58247+
58248+        pthread_mutex_lock(&pq->lock);
58249+        now = pollqueue_now(0);
58250+
58251+        /* Prodding in this loop is pointless and might lead to
58252+         * infinite looping
58253+        */
58254+        pq->no_prod = true;
58255+        for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
58256+            pt_next = pt->next;
58257+
58258+            /* Pending? */
58259+            if (a[i].revents ||
58260+                (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
58261+                pollqueue_rem_task(pq, pt);
58262+                if (pt->state == POLLTASK_QUEUED)
58263+                    pt->state = POLLTASK_RUNNING;
58264+                if (pt->state == POLLTASK_Q_KILL)
58265+                    pt->state = POLLTASK_RUN_KILL;
58266+                pthread_mutex_unlock(&pq->lock);
58267+
58268+                /* This can add new entries to the Q but as
58269+                 * those are added to the tail our existing
58270+                 * chain remains intact
58271+                */
58272+                pt->fn(pt->v, a[i].revents);
58273+
58274+                pthread_mutex_lock(&pq->lock);
58275+                if (pt->state == POLLTASK_RUNNING)
58276+                    pt->state = POLLTASK_UNQUEUED;
58277+                if (pt->state == POLLTASK_RUN_KILL)
58278+                    sem_post(&pt->kill_sem);
58279+            }
58280+        }
58281+        pq->no_prod = false;
58282+
58283+    } while (!pq->kill);
58284+
58285+fail_locked:
58286+    pthread_mutex_unlock(&pq->lock);
58287+fail_unlocked:
58288+    free(a);
58289+    return NULL;
58290+}
58291+
58292+static void prod_fn(void *v, short revents)
58293+{
58294+    struct pollqueue *const pq = v;
58295+    char buf[8];
58296+    if (revents)
58297+        read(pq->prod_fd, buf, 8);
58298+    if (!pq->kill)
58299+        pollqueue_add_task(pq->prod_pt, -1);
58300+}
58301+
58302+struct pollqueue * pollqueue_new(void)
58303+{
58304+    struct pollqueue *pq = malloc(sizeof(*pq));
58305+    if (!pq)
58306+        return NULL;
58307+    *pq = (struct pollqueue){
58308+        .ref_count = ATOMIC_VAR_INIT(0),
58309+        .lock = PTHREAD_MUTEX_INITIALIZER,
58310+        .head = NULL,
58311+        .tail = NULL,
58312+        .kill = false,
58313+        .prod_fd = -1
58314+    };
58315+
58316+    pq->prod_fd = eventfd(0, EFD_NONBLOCK);
58317+    if (pq->prod_fd == -1)
58318+        goto fail1;
58319+    pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
58320+    if (!pq->prod_pt)
58321+        goto fail2;
58322+    pollqueue_add_task(pq->prod_pt, -1);
58323+    if (pthread_create(&pq->worker, NULL, poll_thread, pq))
58324+        goto fail3;
58325+    // Reset ref count which will have been incremented by polltask_new
58326+    atomic_store(&pq->ref_count, 0);
58327+    return pq;
58328+
58329+fail3:
58330+    polltask_free(pq->prod_pt);
58331+fail2:
58332+    close(pq->prod_fd);
58333+fail1:
58334+    free(pq);
58335+    return NULL;
58336+}
58337+
58338+static void pollqueue_free(struct pollqueue *const pq)
58339+{
58340+    void *rv;
58341+
58342+    pthread_mutex_lock(&pq->lock);
58343+    pq->kill = true;
58344+    pollqueue_prod(pq);
58345+    pthread_mutex_unlock(&pq->lock);
58346+
58347+    pthread_join(pq->worker, &rv);
58348+    polltask_free(pq->prod_pt);
58349+    pthread_mutex_destroy(&pq->lock);
58350+    close(pq->prod_fd);
58351+    free(pq);
58352+}
58353+
58354+struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
58355+{
58356+    atomic_fetch_add(&pq->ref_count, 1);
58357+    return pq;
58358+}
58359+
58360+void pollqueue_unref(struct pollqueue **const ppq)
58361+{
58362+    struct pollqueue * const pq = *ppq;
58363+
58364+    if (!pq)
58365+        return;
58366+    *ppq = NULL;
58367+
58368+    if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
58369+        return;
58370+
58371+    pollqueue_free(pq);
58372+}
58373+
58374+
58375+
58376--- /dev/null
58377+++ b/libavcodec/v4l2_req_pollqueue.h
58378@@ -0,0 +1,18 @@
58379+#ifndef POLLQUEUE_H_
58380+#define POLLQUEUE_H_
58381+
58382+struct polltask;
58383+struct pollqueue;
58384+
58385+struct polltask *polltask_new(struct pollqueue *const pq,
58386+			      const int fd, const short events,
58387+			      void (*const fn)(void *v, short revents),
58388+			      void *const v);
58389+void polltask_delete(struct polltask **const ppt);
58390+
58391+void pollqueue_add_task(struct polltask *const pt, const int timeout);
58392+struct pollqueue * pollqueue_new(void);
58393+void pollqueue_unref(struct pollqueue **const ppq);
58394+struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
58395+
58396+#endif /* POLLQUEUE_H_ */
58397--- /dev/null
58398+++ b/libavcodec/v4l2_req_utils.h
58399@@ -0,0 +1,22 @@
58400+#include "libavutil/log.h"
58401+
58402+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
58403+
58404+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
58405+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
58406+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
58407+#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
58408+
58409+static inline char safechar(char c) {
58410+    return c > 0x20 && c < 0x7f ? c : '.';
58411+}
58412+
58413+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
58414+    tbuf[0] = safechar((fcc >>  0) & 0xff);
58415+    tbuf[1] = safechar((fcc >>  8) & 0xff);
58416+    tbuf[2] = safechar((fcc >> 16) & 0xff);
58417+    tbuf[3] = safechar((fcc >> 24) & 0xff);
58418+    tbuf[4] = '\0';
58419+    return tbuf;
58420+}
58421+
58422--- /dev/null
58423+++ b/libavcodec/v4l2_request_hevc.c
58424@@ -0,0 +1,315 @@
58425+/*
58426+ * This file is part of FFmpeg.
58427+ *
58428+ * FFmpeg is free software; you can redistribute it and/or
58429+ * modify it under the terms of the GNU Lesser General Public
58430+ * License as published by the Free Software Foundation; either
58431+ * version 2.1 of the License, or (at your option) any later version.
58432+ *
58433+ * FFmpeg is distributed in the hope that it will be useful,
58434+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
58435+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
58436+ * Lesser General Public License for more details.
58437+ *
58438+ * You should have received a copy of the GNU Lesser General Public
58439+ * License along with FFmpeg; if not, write to the Free Software
58440+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
58441+ */
58442+
58443+
58444+
58445+#include "decode.h"
58446+#include "hevcdec.h"
58447+#include "hwconfig.h"
58448+
58449+#include "v4l2_request_hevc.h"
58450+
58451+#include "libavutil/hwcontext_drm.h"
58452+
58453+#include "v4l2_req_devscan.h"
58454+#include "v4l2_req_dmabufs.h"
58455+#include "v4l2_req_pollqueue.h"
58456+#include "v4l2_req_media.h"
58457+#include "v4l2_req_utils.h"
58458+
58459+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
58460+{
58461+    const size_t wxh = (size_t)w * h;
58462+    size_t bits_alloc;
58463+
58464+    /* Annex A gives a min compression of 2 @ lvl 3.1
58465+     * (wxh <= 983040) and min 4 thereafter but avoid
58466+     * the oddity of 983041 having a lower limit than
58467+     * 983040.
58468+     * Multiply by 3/2 for 4:2:0
58469+     */
58470+    bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
58471+        wxh < 983040 * 2 ? 983040 * 3 / 4 :
58472+        wxh * 3 / 8;
58473+    /* Allow for bit depth */
58474+    bits_alloc += (bits_alloc * bits_minus8) / 8;
58475+    /* Add a few bytes (16k) for overhead */
58476+    bits_alloc += 0x4000;
58477+    return bits_alloc;
58478+}
58479+
58480+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
58481+                                     av_unused const uint8_t *buffer,
58482+                                     av_unused uint32_t size)
58483+{
58484+    const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58485+    return ctx->fns->start_frame(avctx, buffer, size);
58486+}
58487+
58488+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
58489+{
58490+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58491+    return ctx->fns->decode_slice(avctx, buffer, size);
58492+}
58493+
58494+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
58495+{
58496+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
58497+    return ctx->fns->end_frame(avctx);
58498+}
58499+
58500+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
58501+{
58502+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58503+    ctx->fns->abort_frame(avctx);
58504+}
58505+
58506+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
58507+{
58508+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58509+    return ctx->fns->frame_params(avctx, hw_frames_ctx);
58510+}
58511+
58512+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
58513+{
58514+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58515+    return ctx->fns->alloc_frame(avctx, frame);
58516+}
58517+
58518+
58519+static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
58520+{
58521+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58522+
58523+    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
58524+
58525+    decode_q_wait(&ctx->decode_q, NULL);  // Wait for all other threads to be out of decode
58526+
58527+    mediabufs_ctl_unref(&ctx->mbufs);
58528+    media_pool_delete(&ctx->mpool);
58529+    pollqueue_unref(&ctx->pq);
58530+    dmabufs_ctl_delete(&ctx->dbufs);
58531+    devscan_delete(&ctx->devscan);
58532+
58533+    decode_q_uninit(&ctx->decode_q);
58534+
58535+//    if (avctx->hw_frames_ctx) {
58536+//        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
58537+//        av_buffer_pool_flush(hwfc->pool);
58538+//    }
58539+    return 0;
58540+}
58541+
58542+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
58543+{
58544+    AVCodecContext *const avctx = v;
58545+    const HEVCContext *const h = avctx->priv_data;
58546+
58547+    if (h->ps.sps->bit_depth == 8) {
58548+        if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
58549+            fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) {
58550+            return 1;
58551+        }
58552+    }
58553+    else if (h->ps.sps->bit_depth == 10) {
58554+        if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
58555+            return 1;
58556+        }
58557+    }
58558+    return 0;
58559+}
58560+
58561+static int v4l2_request_hevc_init(AVCodecContext *avctx)
58562+{
58563+    const HEVCContext *h = avctx->priv_data;
58564+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
58565+    const HEVCSPS * const sps = h->ps.sps;
58566+    int ret;
58567+    const struct decdev * decdev;
58568+    const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
58569+    size_t src_size;
58570+
58571+    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
58572+
58573+    // Give up immediately if this is something that we have no code to deal with
58574+    if (h->ps.sps->chroma_format_idc != 1) {
58575+        av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
58576+        return AVERROR_PATCHWELCOME;
58577+    }
58578+    if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
58579+        h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
58580+        av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
58581+        return AVERROR_PATCHWELCOME;
58582+    }
58583+
58584+    if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
58585+        av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
58586+        return (AVERROR(-ret));
58587+    }
58588+    ret = AVERROR(ENOMEM);  // Assume mem fail by default for these
58589+
58590+    if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
58591+    {
58592+        av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
58593+        ret = AVERROR(ENODEV);
58594+        goto fail0;
58595+    }
58596+    av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
58597+           decdev_media_path(decdev), decdev_video_path(decdev));
58598+
58599+    if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
58600+        av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
58601+        goto fail0;
58602+    }
58603+
58604+    if ((ctx->pq = pollqueue_new()) == NULL) {
58605+        av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
58606+        goto fail1;
58607+    }
58608+
58609+    if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
58610+        av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
58611+        goto fail2;
58612+    }
58613+
58614+    if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
58615+        av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
58616+        goto fail3;
58617+    }
58618+
58619+    // Ask for an initial bitbuf size of max size / 4
58620+    // We will realloc if we need more
58621+    // Must use sps->h/w as avctx contains cropped size
58622+    src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
58623+    if (mediabufs_src_resizable(ctx->mbufs))
58624+        src_size /= 4;
58625+    // Kludge for conformance tests which break Annex A limits
58626+    else if (src_size < 0x40000)
58627+        src_size = 0x40000;
58628+
58629+    if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
58630+                              sps->width, sps->height, src_size)) {
58631+        char tbuf1[5];
58632+        av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
58633+        goto fail4;
58634+    }
58635+
58636+    if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
58637+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
58638+        ctx->fns = &V2(ff_v4l2_req_hevc, 4);
58639+    }
58640+    else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
58641+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
58642+        ctx->fns = &V2(ff_v4l2_req_hevc, 3);
58643+    }
58644+    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
58645+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
58646+        ctx->fns = &V2(ff_v4l2_req_hevc, 2);
58647+    }
58648+    else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
58649+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
58650+        ctx->fns = &V2(ff_v4l2_req_hevc, 1);
58651+    }
58652+    else {
58653+        av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
58654+        ret = AVERROR(EINVAL);
58655+        goto fail4;
58656+    }
58657+
58658+    if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
58659+        char tbuf1[5];
58660+        av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
58661+        goto fail4;
58662+    }
58663+
58664+    if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
58665+        av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
58666+        goto fail4;
58667+    }
58668+
58669+    {
58670+        unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
58671+            avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
58672+        av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
58673+               sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
58674+               avctx->thread_count, avctx->extra_hw_frames);
58675+
58676+        // extra_hw_frames is -1 if unset
58677+        if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
58678+            av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
58679+            goto fail4;
58680+        }
58681+    }
58682+
58683+    if (mediabufs_stream_on(ctx->mbufs)) {
58684+        av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
58685+        goto fail4;
58686+    }
58687+
58688+    if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
58689+        av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
58690+        goto fail4;
58691+    }
58692+
58693+    if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
58694+        av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
58695+        goto fail5;
58696+    }
58697+
58698+    decode_q_init(&ctx->decode_q);
58699+
58700+    // Set our s/w format
58701+    avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
58702+
58703+    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
58704+           ctx->fns->name,
58705+           decdev_media_path(decdev), decdev_video_path(decdev));
58706+
58707+    return 0;
58708+
58709+fail5:
58710+    av_buffer_unref(&avctx->hw_frames_ctx);
58711+fail4:
58712+    mediabufs_ctl_unref(&ctx->mbufs);
58713+fail3:
58714+    media_pool_delete(&ctx->mpool);
58715+fail2:
58716+    pollqueue_unref(&ctx->pq);
58717+fail1:
58718+    dmabufs_ctl_delete(&ctx->dbufs);
58719+fail0:
58720+    devscan_delete(&ctx->devscan);
58721+    return ret;
58722+}
58723+
58724+const AVHWAccel ff_hevc_v4l2request_hwaccel = {
58725+    .name           = "hevc_v4l2request",
58726+    .type           = AVMEDIA_TYPE_VIDEO,
58727+    .id             = AV_CODEC_ID_HEVC,
58728+    .pix_fmt        = AV_PIX_FMT_DRM_PRIME,
58729+    .alloc_frame    = v4l2_req_hevc_alloc_frame,
58730+    .start_frame    = v4l2_req_hevc_start_frame,
58731+    .decode_slice   = v4l2_req_hevc_decode_slice,
58732+    .end_frame      = v4l2_req_hevc_end_frame,
58733+    .abort_frame    = v4l2_req_hevc_abort_frame,
58734+    .init           = v4l2_request_hevc_init,
58735+    .uninit         = v4l2_request_hevc_uninit,
58736+    .priv_data_size = sizeof(V4L2RequestContextHEVC),
58737+    .frame_params   = v4l2_req_hevc_frame_params,
58738+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
58739+};
58740--- /dev/null
58741+++ b/libavcodec/v4l2_request_hevc.h
58742@@ -0,0 +1,93 @@
58743+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
58744+#define AVCODEC_V4L2_REQUEST_HEVC_H
58745+
58746+#include <drm_fourcc.h>
58747+#include "v4l2_req_decode_q.h"
58748+
58749+#ifndef DRM_FORMAT_NV15
58750+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
58751+#endif
58752+
58753+#ifndef DRM_FORMAT_NV20
58754+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
58755+#endif
58756+
58757+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
58758+// in the future but until then...
58759+#ifndef DRM_FORMAT_P030
58760+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
58761+#endif
58762+
58771+#include <linux/videodev2.h>
58772+#ifndef V4L2_CID_CODEC_BASE
58773+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
58774+#endif
58775+
58776+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
58777+// in videodev2.h and hopefully will be sometime in the future but until then...
58778+#ifndef V4L2_PIX_FMT_NV12_10_COL128
58779+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
58780+#endif
58781+
58782+#ifndef V4L2_PIX_FMT_NV12_COL128
58783+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
58784+#endif
58785+
58786+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
58787+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY	0x0800
58788+#endif
58789+
58790+#define VCAT(name, version) name##_v##version
58791+#define V2(n,v) VCAT(n, v)
58792+#define V(n) V2(n, HEVC_CTRLS_VERSION)
58793+
58794+#define S2(x) #x
58795+#define STR(x) S2(x)
58796+
58797+// 1 per decoder
58798+struct v4l2_req_decode_fns;
58799+
58800+typedef struct V4L2RequestContextHEVC {
58801+//    V4L2RequestContext base;
58802+    const struct v4l2_req_decode_fns * fns;
58803+
58804+    unsigned int timestamp;  // ?? maybe uint64_t
58805+
58806+    int decode_mode;
58807+    int start_code;
58808+    unsigned int max_slices;    // 0 => not wanted (frame mode)
58809+    unsigned int max_offsets;   // 0 => not wanted
58810+
58811+    req_decode_q decode_q;
58812+
58813+    struct devscan *devscan;
58814+    struct dmabufs_ctl *dbufs;
58815+    struct pollqueue *pq;
58816+    struct media_pool * mpool;
58817+    struct mediabufs_ctl *mbufs;
58818+} V4L2RequestContextHEVC;
58819+
58820+typedef struct v4l2_req_decode_fns {
58821+    int src_pix_fmt_v4l2;
58822+    const char * name;
58823+
58824+    // Init setup
58825+    int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
58826+    int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
58827+
58828+    // Passthrough of hwaccel fns
58829+    int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
58830+    int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
58831+    int (*end_frame)(AVCodecContext *avctx);
58832+    void (*abort_frame)(AVCodecContext *avctx);
58833+    int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
58834+    int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
58835+} v4l2_req_decode_fns;
58836+
58837+
58838+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
58839+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
58840+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
58841+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
58842+
58843+#endif
58844--- a/libavcodec/vc1dec.c
58845+++ b/libavcodec/vc1dec.c
58846@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod
58847             size = next - start - 4;
58848             if (size <= 0)
58849                 continue;
58850-            buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
58851+            buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
58852             init_get_bits(&gb, buf2, buf2_size * 8);
58853             switch (AV_RB32(start)) {
58854             case VC1_CODE_SEQHDR:
58855@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte
58856                 case VC1_CODE_FRAME:
58857                     if (avctx->hwaccel)
58858                         buf_start = start;
58859-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
58860+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
58861                     break;
58862                 case VC1_CODE_FIELD: {
58863                     int buf_size3;
58864@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte
58865                         ret = AVERROR(ENOMEM);
58866                         goto err;
58867                     }
58868-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
58869-                                                    slices[n_slices].buf);
58870+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
58871+                                                              slices[n_slices].buf);
58872                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
58873                                   buf_size3 << 3);
58874                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
58875@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte
58876                     break;
58877                 }
58878                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
58879-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
58880+                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
58881                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
58882                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
58883                     break;
58884@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte
58885                         ret = AVERROR(ENOMEM);
58886                         goto err;
58887                     }
58888-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
58889-                                                    slices[n_slices].buf);
58890+                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
58891+                                                              slices[n_slices].buf);
58892                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
58893                                   buf_size3 << 3);
58894                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
58895@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte
58896                     ret = AVERROR(ENOMEM);
58897                     goto err;
58898                 }
58899-                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
58900+                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
58901                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
58902                               buf_size3 << 3);
58903                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
58904@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte
58905                 n_slices1 = n_slices - 1;
58906                 n_slices++;
58907             }
58908-            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
58909+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
58910         } else {
58911-            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
58912+            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
58913         }
58914         init_get_bits(&s->gb, buf2, buf_size2*8);
58915     } else
58916--- a/libavcodec/vc1dsp.c
58917+++ b/libavcodec/vc1dsp.c
58918@@ -32,6 +32,7 @@
58919 #include "rnd_avg.h"
58920 #include "vc1dsp.h"
58921 #include "startcode.h"
58922+#include "vc1_common.h"
58923
58924 /* Apply overlap transform to horizontal edge */
58925 static void vc1_v_overlap_c(uint8_t *src, int stride)
58926@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex
58927 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
58928
58929     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
58930+    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;
58931
58932     if (ARCH_AARCH64)
58933         ff_vc1dsp_init_aarch64(dsp);
58934--- a/libavcodec/vc1dsp.h
58935+++ b/libavcodec/vc1dsp.h
58936@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
58937      * one or more further zero bytes and a one byte.
58938      */
58939     int (*startcode_find_candidate)(const uint8_t *buf, int size);
58940+
58941+    /* Copy a buffer, removing startcode emulation escape bytes as we go */
58942+    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
58943 } VC1DSPContext;
58944
58945 void ff_vc1dsp_init(VC1DSPContext* c);
58946--- /dev/null
58947+++ b/libavcodec/weak_link.c
58948@@ -0,0 +1,102 @@
58949+#include <stdlib.h>
58950+#include <pthread.h>
58951+#include <stdatomic.h>
58952+#include "weak_link.h"
58953+
58954+struct ff_weak_link_master {
58955+    atomic_int ref_count;    /* 0 is single ref for easier atomics */
58956+    pthread_rwlock_t lock;
58957+    void * ptr;
58958+};
58959+
58960+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
58961+{
58962+    return (struct ff_weak_link_master *)c;
58963+}
58964+
58965+struct ff_weak_link_master * ff_weak_link_new(void * p)
58966+{
58967+    struct ff_weak_link_master * w = malloc(sizeof(*w));
58968+    if (!w)
58969+        return NULL;
58970+    w->ptr = p; atomic_init(&w->ref_count, 0); /* 0 == single ref; was left uninitialized after malloc */
58971+    if (pthread_rwlock_init(&w->lock, NULL)) {
58972+        free(w);
58973+        return NULL;
58974+    }
58975+    return w;
58976+}
58977+
58978+static void weak_link_do_unref(struct ff_weak_link_master * const w)
58979+{
58980+    int n = atomic_fetch_sub(&w->ref_count, 1);
58981+    if (n)
58982+        return;
58983+
58984+    pthread_rwlock_destroy(&w->lock);
58985+    free(w);
58986+}
58987+
58988+// Unref & break link
58989+void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
58990+{
58991+    struct ff_weak_link_master * const w = *ppLink;
58992+    if (!w)
58993+        return;
58994+
58995+    *ppLink = NULL;
58996+    pthread_rwlock_wrlock(&w->lock);
58997+    w->ptr = NULL;
58998+    pthread_rwlock_unlock(&w->lock);
58999+
59000+    weak_link_do_unref(w);
59001+}
59002+
59003+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
59004+{
59005+    if (!w)
59006+        return NULL;
59007+    atomic_fetch_add(&w->ref_count, 1);
59008+    return (struct ff_weak_link_client*)w;
59009+}
59010+
59011+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
59012+{
59013+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
59014+    if (!w)
59015+        return;
59016+
59017+    *ppLink = NULL;
59018+    weak_link_do_unref(w);
59019+}
59020+
59021+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
59022+{
59023+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
59024+
59025+    if (!w)
59026+        return NULL;
59027+
59028+    if (pthread_rwlock_rdlock(&w->lock))
59029+        goto broken;
59030+
59031+    if (w->ptr)
59032+        return w->ptr;
59033+
59034+    pthread_rwlock_unlock(&w->lock);
59035+
59036+broken:
59037+    *ppLink = NULL;
59038+    weak_link_do_unref(w);
59039+    return NULL;
59040+}
59041+
59042+// Ignores a NULL c (so can be on the return path of both broken & live links)
59043+void ff_weak_link_unlock(struct ff_weak_link_client * c)
59044+{
59045+    struct ff_weak_link_master * const w = weak_link_x(c);
59046+    if (w)
59047+        pthread_rwlock_unlock(&w->lock);
59048+}
59049+
59050+
59051--- /dev/null
59052+++ b/libavcodec/weak_link.h
59053@@ -0,0 +1,23 @@
59054+struct ff_weak_link_master;
59055+struct ff_weak_link_client;
59056+
59057+struct ff_weak_link_master * ff_weak_link_new(void * p);
59058+void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
59059+
59060+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
59061+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
59062+
59063+// Returns NULL if link broken - in this case it will also zap
59064+//   *ppLink and unref the weak_link.
59065+// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
59066+//
59067+// The above does mean that there is a race if this is called simultaneously
59068+// by two threads using the same weak_link_client (so don't do that)
59069+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
59070+void ff_weak_link_unlock(struct ff_weak_link_client * c);
59071+
59072+
59073+
59074+
59075+
59076+
59077--- a/libavdevice/Makefile
59078+++ b/libavdevice/Makefile
59079@@ -46,6 +46,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV)
59080 OBJS-$(CONFIG_V4L2_INDEV)                += v4l2.o v4l2-common.o timefilter.o
59081 OBJS-$(CONFIG_V4L2_OUTDEV)               += v4l2enc.o v4l2-common.o
59082 OBJS-$(CONFIG_VFWCAP_INDEV)              += vfwcap.o
59083+OBJS-$(CONFIG_VOUT_DRM_OUTDEV)           += drm_vout.o
59084+OBJS-$(CONFIG_VOUT_EGL_OUTDEV)           += egl_vout.o
59085+OBJS-$(CONFIG_VOUT_RPI_OUTDEV)           += rpi_vout.o
59086 OBJS-$(CONFIG_XCBGRAB_INDEV)             += xcbgrab.o
59087 OBJS-$(CONFIG_XV_OUTDEV)                 += xv.o
59088
59089--- a/libavdevice/alldevices.c
59090+++ b/libavdevice/alldevices.c
59091@@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer;
59092 extern AVInputFormat  ff_v4l2_demuxer;
59093 extern AVOutputFormat ff_v4l2_muxer;
59094 extern AVInputFormat  ff_vfwcap_demuxer;
59095+extern AVOutputFormat ff_vout_drm_muxer;
59096+extern AVOutputFormat ff_vout_egl_muxer;
59097+extern AVOutputFormat ff_vout_rpi_muxer;
59098 extern AVInputFormat  ff_xcbgrab_demuxer;
59099 extern AVOutputFormat ff_xv_muxer;
59100
59101--- /dev/null
59102+++ b/libavdevice/drm_vout.c
59103@@ -0,0 +1,643 @@
59104+/*
59105+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
59106+ *
59107+ * This file is part of FFmpeg.
59108+ *
59109+ * FFmpeg is free software; you can redistribute it and/or
59110+ * modify it under the terms of the GNU Lesser General Public
59111+ * License as published by the Free Software Foundation; either
59112+ * version 2.1 of the License, or (at your option) any later version.
59113+ *
59114+ * FFmpeg is distributed in the hope that it will be useful,
59115+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
59116+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
59117+ * Lesser General Public License for more details.
59118+ *
59119+ * You should have received a copy of the GNU Lesser General Public
59120+ * License along with FFmpeg; if not, write to the Free Software
59121+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
59122+ */
59123+
59124+
59125+// *** This module is a work in progress and its utility is strictly
59126+//     limited to testing.
59127+
59128+#include "libavutil/opt.h"
59129+#include "libavutil/pixdesc.h"
59130+#include "libavutil/hwcontext_drm.h"
59131+#include "libavformat/internal.h"
59132+#include "avdevice.h"
59133+
59134+#include "pthread.h"
59135+#include <semaphore.h>
59136+#include <unistd.h>
59137+
59138+#include <xf86drm.h>
59139+#include <xf86drmMode.h>
59140+
59141+#define TRACE_ALL 0
59142+
59143+#define DRM_MODULE "vc4"
59144+
59145+#define ERRSTR strerror(errno)
59146+
59147+struct drm_setup {
59148+   int conId;
59149+   uint32_t crtcId;
59150+   int crtcIdx;
59151+   uint32_t planeId;
59152+   unsigned int out_fourcc;
59153+   struct {
59154+       int x, y, width, height;
59155+   } compose;
59156+};
59157+
59158+typedef struct drm_aux_s {
59159+    unsigned int fb_handle;
59160+    uint32_t bo_handles[AV_DRM_MAX_PLANES];
59161+    AVFrame * frame;
59162+} drm_aux_t;
59163+
59164+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
59165+// we get initial flicker probably due to dodgy drm timing
59166+#define AUX_SIZE 3
59167+typedef struct drm_display_env_s
59168+{
59169+    AVClass *class;
59170+
59171+    int drm_fd;
59172+    uint32_t con_id;
59173+    struct drm_setup setup;
59174+    enum AVPixelFormat avfmt;
59175+    int show_all;
59176+
59177+    unsigned int ano;
59178+    drm_aux_t aux[AUX_SIZE];
59179+
59180+    pthread_t q_thread;
59181+    sem_t q_sem_in;
59182+    sem_t q_sem_out;
59183+    int q_terminate;
59184+    AVFrame * q_next;
59185+
59186+} drm_display_env_t;
59187+
59188+
59189+static int drm_vout_write_trailer(AVFormatContext *s)
59190+{
59191+#if TRACE_ALL
59192+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
59193+#endif
59194+
59195+    return 0;
59196+}
59197+
59198+static int drm_vout_write_header(AVFormatContext *s)
59199+{
59200+    const AVCodecParameters * const par = s->streams[0]->codecpar;
59201+
59202+#if TRACE_ALL
59203+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
59204+#endif
59205+    if (   s->nb_streams > 1
59206+        || par->codec_type != AVMEDIA_TYPE_VIDEO
59207+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
59208+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
59209+        return AVERROR(EINVAL);
59210+    }
59211+
59212+    return 0;
59213+}
59214+
59215+static int find_plane(struct AVFormatContext * const avctx,
59216+                      const int drmfd, const int crtcidx, const uint32_t format,
59217+                      uint32_t * const pplane_id)
59218+{
59219+   drmModePlaneResPtr planes;
59220+   drmModePlanePtr plane;
59221+   unsigned int i;
59222+   unsigned int j;
59223+   int ret = 0;
59224+
59225+   planes = drmModeGetPlaneResources(drmfd);
59226+   if (!planes)
59227+   {
59228+       av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
59229+       return -1;
59230+   }
59231+
59232+   for (i = 0; i < planes->count_planes; ++i) {
59233+      plane = drmModeGetPlane(drmfd, planes->planes[i]);
59234+      if (!plane)  // was "!planes": wrong variable; NULL plane was dereferenced below
59235+      {
59236+          av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
59237+          ret = -1; break;  // report failure; otherwise *pplane_id is returned unset with ret == 0
59238+      }
59239+
59240+      if (!(plane->possible_crtcs & (1 << crtcidx))) {
59241+         drmModeFreePlane(plane);
59242+         continue;
59243+      }
59244+
59245+      for (j = 0; j < plane->count_formats; ++j) {
59246+         if (plane->formats[j] == format)
59247+            break;
59248+      }
59249+
59250+      if (j == plane->count_formats) {
59251+         drmModeFreePlane(plane);
59252+         continue;
59253+      }
59254+
59255+      *pplane_id = plane->plane_id;
59256+      drmModeFreePlane(plane);
59257+      break;
59258+   }
59259+
59260+   if (i == planes->count_planes)
59261+      ret = -1;
59262+
59263+   drmModeFreePlaneResources(planes);
59264+   return ret;
59265+}
59266+
59267+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
59268+{
59269+    if (da->fb_handle != 0) {
59270+        drmModeRmFB(de->drm_fd, da->fb_handle);
59271+        da->fb_handle = 0;
59272+    }
59273+
59274+    for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
59275+        if (da->bo_handles[i]) {
59276+            struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
59277+            drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
59278+            da->bo_handles[i] = 0;
59279+        }
59280+    }
59281+    av_frame_free(&da->frame);
59282+}
59283+
59284+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
59285+{
59286+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
59287+    drm_aux_t * da = de->aux + de->ano;
59288+    const uint32_t format = desc->layers[0].format;
59289+    int ret = 0;
59290+
59291+#if TRACE_ALL
59292+    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
59293+#endif
59294+
59295+    if (de->setup.out_fourcc != format) {
59296+        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
59297+            av_frame_free(&frame);
59298+            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
59299+            return -1;
59300+        }
59301+        de->setup.out_fourcc = format;
59302+    }
59303+
59304+    {
59305+        drmVBlank vbl = {
59306+            .request = {
59307+                .type = DRM_VBLANK_RELATIVE,
59308+                .sequence = 0
59309+            }
59310+        };
59311+
59312+        while (drmWaitVBlank(de->drm_fd, &vbl)) {
59313+            if (errno != EINTR) {
59314+//                av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
59315+                break;
59316+            }
59317+        }
59318+    }
59319+
59320+    da_uninit(de, da);
59321+
59322+    {
59323+        uint32_t pitches[4] = {0};
59324+        uint32_t offsets[4] = {0};
59325+        uint64_t modifiers[4] = {0};
59326+        uint32_t bo_handles[4] = {0};
59327+        int i, j, n;
59328+
59329+        da->frame = frame;
59330+
59331+        for (i = 0; i < desc->nb_objects; ++i) {
59332+            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
59333+                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
59334+                return -1;
59335+            }
59336+        }
59337+
59338+        n = 0;
59339+        for (i = 0; i < desc->nb_layers; ++i) {
59340+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
59341+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
59342+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
59343+                pitches[n] = p->pitch;
59344+                offsets[n] = p->offset;
59345+                modifiers[n] = obj->format_modifier;
59346+                bo_handles[n] = da->bo_handles[p->object_index];
59347+                ++n;
59348+            }
59349+        }
59350+
59351+#if 1 && TRACE_ALL
59352+        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
59353+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
59354+               av_frame_cropped_width(frame),
59355+               av_frame_cropped_height(frame),
59356+               desc->layers[0].format,
59357+               bo_handles[0],
59358+               bo_handles[1],
59359+               bo_handles[2],
59360+               bo_handles[3],
59361+               pitches[0],
59362+               pitches[1],
59363+               pitches[2],
59364+               pitches[3],
59365+               offsets[0],
59366+               offsets[1],
59367+               offsets[2],
59368+               offsets[3],
59369+               (long long)modifiers[0],
59370+               (long long)modifiers[1],
59371+               (long long)modifiers[2],
59372+               (long long)modifiers[3]
59373+               );
59374+#endif
59375+
59376+        if (drmModeAddFB2WithModifiers(de->drm_fd,
59377+                                         av_frame_cropped_width(frame),
59378+                                         av_frame_cropped_height(frame),
59379+                                         desc->layers[0].format, bo_handles,
59380+                                         pitches, offsets, modifiers,
59381+                                         &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
59382+            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
59383+            return -1;
59384+        }
59385+    }
59386+
59387+    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
59388+                              da->fb_handle, 0,
59389+                de->setup.compose.x, de->setup.compose.y,
59390+                de->setup.compose.width,
59391+                de->setup.compose.height,
59392+                0, 0,
59393+                av_frame_cropped_width(frame) << 16,
59394+                av_frame_cropped_height(frame) << 16);
59395+
59396+    if (ret != 0) {
59397+        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
59398+    }
59399+
59400+    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
59401+
59402+    return ret;
59403+}
59404+
59405+static int do_sem_wait(sem_t * const sem, const int nowait)
59406+{
59407+    while (nowait ? sem_trywait(sem) : sem_wait(sem)) {
59408+        if (errno != EINTR)
59409+            return -errno;
59410+    }
59411+    return 0;
59412+}
59413+
59414+static void * display_thread(void * v)
59415+{
59416+    AVFormatContext * const s = v;
59417+    drm_display_env_t * const de = s->priv_data;
59418+    int i;
59419+
59420+#if TRACE_ALL
59421+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
59422+#endif
59423+
59424+    sem_post(&de->q_sem_out);
59425+
59426+    for (;;) {
59427+        AVFrame * frame;
59428+
59429+        do_sem_wait(&de->q_sem_in, 0);
59430+
59431+        if (de->q_terminate)
59432+            break;
59433+
59434+        frame = de->q_next;
59435+        de->q_next = NULL;
59436+        sem_post(&de->q_sem_out);
59437+
59438+        do_display(s, de, frame);
59439+    }
59440+
59441+#if TRACE_ALL
59442+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
59443+#endif
59444+
59445+    for (i = 0; i != AUX_SIZE; ++i)
59446+        da_uninit(de, de->aux + i);
59447+
59448+    av_frame_free(&de->q_next);
59449+
59450+    return NULL;
59451+}
59452+
+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
+    AVFrame * frame;
+    drm_display_env_t * const de = s->priv_data;
+    int ret;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
+#endif
+
+    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
+        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
+        return 0;
+    }
+
+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+        frame = av_frame_alloc();
+        av_frame_ref(frame, src_frame);
+    }
+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+        frame = av_frame_alloc();
+        frame->format = AV_PIX_FMT_DRM_PRIME;
+        if (av_hwframe_map(frame, src_frame, 0) != 0)
+        {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRIME\n", src_frame->format);
+            av_frame_free(&frame);
+            return AVERROR(EINVAL);
+        }
+    }
+    else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRIME\n", src_frame->format);
+        return AVERROR(EINVAL);
+    }
+
+    ret = do_sem_wait(&de->q_sem_out, !de->show_all);
+    if (ret) {
+        av_frame_free(&frame);
+    }
+    else {
+        de->q_next = frame;
+        sem_post(&de->q_sem_in);
+    }
+
+    return 0;
+}
59499+
59500+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
59501+                          unsigned flags)
59502+{
59503+#if TRACE_ALL
59504+    av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
59505+#endif
59506+
59507+    /* drm_vout_write_header() should have accepted only supported formats */
59508+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
59509+        return 0;
59510+
59511+    return 0;
59512+}
59513+
59514+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
59515+{
59516+#if TRACE_ALL
59517+    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
59518+#endif
59519+    switch(type) {
59520+    case AV_APP_TO_DEV_WINDOW_REPAINT:
59521+        return 0;
59522+    default:
59523+        break;
59524+    }
59525+    return AVERROR(ENOSYS);
59526+}
59527+
+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
+{
+   int ret = -1;
+   int i;
+   drmModeRes *res = drmModeGetResources(drmfd);
+   drmModeConnector *c;
+
+   if(!res)
+   {
+      av_log(avctx, AV_LOG_ERROR, "drmModeGetResources failed: %s\n", ERRSTR);
+      return -1;
+   }
+
+   if (res->count_crtcs <= 0)
+   {
+      av_log(avctx, AV_LOG_ERROR, "drm: no crtcs\n");
+      goto fail_res;
+   }
+
+   if (!s->conId) {
+      av_log(avctx, AV_LOG_INFO,
+         "No connector ID specified.  Choosing default from list:\n");
+
+      for (i = 0; i < res->count_connectors; i++) {
+         drmModeConnector *con =
+            drmModeGetConnector(drmfd, res->connectors[i]);
+         drmModeEncoder *enc = NULL;
+         drmModeCrtc *crtc = NULL;
+         if (con->encoder_id) {
+            enc = drmModeGetEncoder(drmfd, con->encoder_id);
+            if (enc && enc->crtc_id)
+               crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
+         }
+
+         if (!s->conId && crtc) {
+            s->conId = con->connector_id;
+            s->crtcId = crtc->crtc_id;
+         }
+
+         av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
+                con->connector_id,
+                crtc ? crtc->crtc_id : 0,
+                con->connector_type,
+                crtc ? crtc->width : 0,
+                crtc ? crtc->height : 0,
+                (s->conId == (int)con->connector_id ?
+            " (chosen)" : ""));
+         drmModeFreeCrtc(crtc);  drmModeFreeEncoder(enc);  /* NULL-safe frees */
+         drmModeFreeConnector(con);
+      }
+
+      if (!s->conId) {
+         av_log(avctx, AV_LOG_ERROR,
+            "No suitable enabled connector found.\n");
+         goto fail_res;
+      }
+   }
+
+   s->crtcIdx = -1;
+
+   for (i = 0; i < res->count_crtcs; ++i) {
+      if (s->crtcId == res->crtcs[i]) {
+         s->crtcIdx = i;
+         break;
+      }
+   }
+
+   if (s->crtcIdx == -1)
+   {
+       av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
+       goto fail_res;
+   }
+
+   if (res->count_connectors <= 0)
+   {
+       av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
+       goto fail_res;
+   }
+
+   c = drmModeGetConnector(drmfd, s->conId);
+   if (!c)
+   {
+       av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
+       goto fail_res;
+   }
+
+   if (!c->count_modes)
+   {
+       av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
+       goto fail_conn;
+   }
+
+   {
+      drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
+      if (crtc) {  /* lookup can fail; compose stays zeroed (caller zero-inits setup) */
+         s->compose.x = crtc->x;          s->compose.y = crtc->y;
+         s->compose.width = crtc->width;  s->compose.height = crtc->height;
+         drmModeFreeCrtc(crtc);
+      }
+   }
+
+   if (pConId)
+      *pConId = c->connector_id;
+   ret = 0;
+
+fail_conn:
+   drmModeFreeConnector(c);
+
+fail_res:
+   drmModeFreeResources(res);
+
+   return ret;
+}
59641+
+// deinit is called if init fails so no need to clean up explicitly here
+static int drm_vout_init(struct AVFormatContext * s)
+{
+    drm_display_env_t * const de = s->priv_data;
+    int rv;
+    const char * drm_module = DRM_MODULE;
+
+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
+
+    de->drm_fd = -1;
+    de->con_id = 0;
+    de->setup = (struct drm_setup){0};
+    de->q_terminate = 0;
+
+    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
+    {
+        rv = AVERROR(errno);
+        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
+        return rv;
+    }
+
+    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
+    {
+        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
+        rv = AVERROR(EINVAL);
+        goto fail_close;
+    }
+
+    sem_init(&de->q_sem_in, 0, 0);
+    sem_init(&de->q_sem_out, 0, 0);
+    if ((rv = pthread_create(&de->q_thread, NULL, display_thread, s)) != 0) {
+        rv = AVERROR(rv);  /* pthread_create returns the error code; it does not set errno */
+        av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
+        goto fail_close;
+    }
+
+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
+
+    return 0;
+
+fail_close:
+    close(de->drm_fd);
+    de->drm_fd = -1;
+    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
+
+    return rv;
+}
59689+
59690+static void drm_vout_deinit(struct AVFormatContext * s)
59691+{
59692+    drm_display_env_t * const de = s->priv_data;
59693+
59694+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
59695+
59696+    de->q_terminate = 1;
59697+    sem_post(&de->q_sem_in);
59698+    pthread_join(de->q_thread, NULL);
59699+    sem_destroy(&de->q_sem_in);
59700+    sem_destroy(&de->q_sem_out);
59701+
59702+    for (unsigned int i = 0; i != AUX_SIZE; ++i)
59703+        da_uninit(de, de->aux + i);
59704+
59705+    av_frame_free(&de->q_next);
59706+
59707+    if (de->drm_fd >= 0) {
59708+        close(de->drm_fd);
59709+        de->drm_fd = -1;
59710+    }
59711+
59712+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
59713+}
59714+
59715+
59716+#define OFFSET(x) offsetof(drm_display_env_t, x)
59717+static const AVOption options[] = {
59718+    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
59719+    { NULL }
59720+};
59721+
59722+static const AVClass drm_vout_class = {
59723+    .class_name = "drm vid outdev",
59724+    .item_name  = av_default_item_name,
59725+    .option     = options,
59726+    .version    = LIBAVUTIL_VERSION_INT,
59727+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
59728+};
59729+
59730+AVOutputFormat ff_vout_drm_muxer = {
59731+    .name           = "vout_drm",
59732+    .long_name      = NULL_IF_CONFIG_SMALL("Drm video output device"),
59733+    .priv_data_size = sizeof(drm_display_env_t),
59734+    .audio_codec    = AV_CODEC_ID_NONE,
59735+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
59736+    .write_header   = drm_vout_write_header,
59737+    .write_packet   = drm_vout_write_packet,
59738+    .write_uncoded_frame = drm_vout_write_frame,
59739+    .write_trailer  = drm_vout_write_trailer,
59740+    .control_message = drm_vout_control_message,
59741+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
59742+    .priv_class     = &drm_vout_class,
59743+    .init           = drm_vout_init,
59744+    .deinit         = drm_vout_deinit,
59745+};
59746+
59747--- /dev/null
59748+++ b/libavdevice/egl_vout.c
59749@@ -0,0 +1,816 @@
59750+/*
59751+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
59752+ *
59753+ * This file is part of FFmpeg.
59754+ *
59755+ * FFmpeg is free software; you can redistribute it and/or
59756+ * modify it under the terms of the GNU Lesser General Public
59757+ * License as published by the Free Software Foundation; either
59758+ * version 2.1 of the License, or (at your option) any later version.
59759+ *
59760+ * FFmpeg is distributed in the hope that it will be useful,
59761+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
59762+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
59763+ * Lesser General Public License for more details.
59764+ *
59765+ * You should have received a copy of the GNU Lesser General Public
59766+ * License along with FFmpeg; if not, write to the Free Software
59767+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
59768+ */
59769+
59770+
59771+// *** This module is a work in progress and its utility is strictly
59772+//     limited to testing.
59773+//     Amongst other issues it doesn't wait for the pic to be displayed before
+//     returning the buffer so flickering does occur.
59775+
59776+#include <epoxy/gl.h>
59777+#include <epoxy/egl.h>
59778+
59779+#include "libavutil/opt.h"
59780+#include "libavutil/avassert.h"
59781+#include "libavutil/pixdesc.h"
59782+#include "libavutil/imgutils.h"
59783+#include "libavutil/hwcontext_drm.h"
59784+#include "libavformat/internal.h"
59785+#include "avdevice.h"
59786+
59787+#include "pthread.h"
59788+#include <semaphore.h>
59789+#include <stdatomic.h>
59790+#include <unistd.h>
59791+
59792+#include <X11/Xlib.h>
59793+#include <X11/Xutil.h>
59794+
59795+#include "libavutil/rpi_sand_fns.h"
59796+
59797+#define TRACE_ALL 0
59798+
59799+struct egl_setup {
59800+   int conId;
59801+
59802+   Display *dpy;
59803+   EGLDisplay egl_dpy;
59804+   EGLContext ctx;
59805+   EGLSurface surf;
59806+   Window win;
59807+
59808+   uint32_t crtcId;
59809+   int crtcIdx;
59810+   uint32_t planeId;
59811+   struct {
59812+       int x, y, width, height;
59813+   } compose;
59814+};
59815+
59816+typedef struct egl_aux_s {
59817+    int fd;
59818+    GLuint texture;
59819+
59820+} egl_aux_t;
59821+
59822+typedef struct egl_display_env_s
59823+{
59824+    AVClass *class;
59825+
59826+    struct egl_setup setup;
59827+    enum AVPixelFormat avfmt;
59828+
59829+    int show_all;
59830+    int window_width, window_height;
59831+    int window_x, window_y;
59832+    int fullscreen;
59833+
59834+    egl_aux_t aux[32];
59835+
59836+    pthread_t q_thread;
59837+    pthread_mutex_t q_lock;
59838+    sem_t display_start_sem;
59839+    sem_t q_sem;
59840+    int q_terminate;
59841+    AVFrame * q_this;
59842+    AVFrame * q_next;
59843+
59844+} egl_display_env_t;
59845+
59846+
59847+/**
59848+ * Remove window border/decorations.
59849+ */
59850+static void
59851+no_border( Display *dpy, Window w)
59852+{
59853+   static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
59854+   static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
59855+
59856+   typedef struct
59857+   {
59858+      unsigned long       flags;
59859+      unsigned long       functions;
59860+      unsigned long       decorations;
59861+      long                inputMode;
59862+      unsigned long       status;
59863+   } PropMotifWmHints;
59864+
59865+   PropMotifWmHints motif_hints;
59866+   Atom prop, proptype;
59867+   unsigned long flags = 0;
59868+
59869+   /* setup the property */
59870+   motif_hints.flags = MWM_HINTS_DECORATIONS;
59871+   motif_hints.decorations = flags;
59872+
59873+   /* get the atom for the property */
59874+   prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
59875+   if (!prop) {
59876+      /* something went wrong! */
59877+      return;
59878+   }
59879+
59880+   /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
59881+   proptype = prop;
59882+
59883+   XChangeProperty( dpy, w,                         /* display, window */
59884+                    prop, proptype,                 /* property, type */
59885+                    32,                             /* format: 32-bit datums */
59886+                    PropModeReplace,                /* mode */
59887+                    (unsigned char *) &motif_hints, /* data */
59888+                    PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
59889+                  );
59890+}
59891+
59892+
59893+/*
59894+ * Create an RGB, double-buffered window.
59895+ * Return the window and context handles.
59896+ */
59897+static int
59898+make_window(struct AVFormatContext * const s,
59899+            egl_display_env_t * const de,
59900+            Display *dpy, EGLDisplay egl_dpy, const char *name,
59901+            Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
59902+{
59903+   int scrnum = DefaultScreen( dpy );
59904+   XSetWindowAttributes attr;
59905+   unsigned long mask;
59906+   Window root = RootWindow( dpy, scrnum );
59907+   Window win;
59908+   EGLContext ctx;
59909+   const int fullscreen = de->fullscreen;
59910+   EGLConfig config;
59911+   int x = de->window_x;
59912+   int y = de->window_y;
59913+   int width = de->window_width ? de->window_width : 1280;
59914+   int height = de->window_height ? de->window_height : 720;
59915+
59916+
59917+   if (fullscreen) {
59918+      int scrnum = DefaultScreen(dpy);
59919+
59920+      x = 0; y = 0;
59921+      width = DisplayWidth(dpy, scrnum);
59922+      height = DisplayHeight(dpy, scrnum);
59923+   }
59924+
59925+   {
59926+      EGLint num_configs;
59927+      static const EGLint attribs[] = {
59928+         EGL_RED_SIZE, 1,
59929+         EGL_GREEN_SIZE, 1,
59930+         EGL_BLUE_SIZE, 1,
59931+         EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
59932+         EGL_NONE
59933+      };
59934+
59935+      if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
59936+         av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
59937+         return -1;
59938+      }
59939+   }
59940+
59941+   {
59942+      EGLint vid;
59943+      if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
59944+         av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
59945+         return -1;
59946+      }
59947+
59948+      {
59949+         XVisualInfo visTemplate = {
59950+            .visualid = vid,
59951+         };
59952+         int num_visuals;
59953+         XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
59954+                                               &visTemplate, &num_visuals);
59955+
59956+         /* window attributes */
59957+         attr.background_pixel = 0;
59958+         attr.border_pixel = 0;
59959+         attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
59960+         attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
59961+         /* XXX this is a bad way to get a borderless window! */
59962+         mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
59963+
59964+         win = XCreateWindow( dpy, root, x, y, width, height,
59965+                              0, visinfo->depth, InputOutput,
59966+                              visinfo->visual, mask, &attr );
59967+         XFree(visinfo);
59968+      }
59969+   }
59970+
59971+   if (fullscreen)
59972+      no_border(dpy, win);
59973+
59974+   /* set hints and properties */
59975+   {
59976+      XSizeHints sizehints;
59977+      sizehints.x = x;
59978+      sizehints.y = y;
59979+      sizehints.width  = width;
59980+      sizehints.height = height;
59981+      sizehints.flags = USSize | USPosition;
59982+      XSetNormalHints(dpy, win, &sizehints);
59983+      XSetStandardProperties(dpy, win, name, name,
59984+                              None, (char **)NULL, 0, &sizehints);
59985+   }
59986+
59987+   eglBindAPI(EGL_OPENGL_ES_API);
59988+
59989+   {
59990+      static const EGLint ctx_attribs[] = {
59991+         EGL_CONTEXT_CLIENT_VERSION, 2,
59992+         EGL_NONE
59993+      };
59994+      ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
59995+      if (!ctx) {
59996+         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
59997+         return -1;
59998+      }
59999+   }
60000+
60001+
60002+   XMapWindow(dpy, win);
60003+
60004+   {
60005+      EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
60006+      if (!surf) {
60007+         av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
60008+         return -1;
60009+      }
60010+
60011+      if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
60012+         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
60013+         return -1;
60014+      }
60015+
60016+      *winRet = win;
60017+      *ctxRet = ctx;
60018+      *surfRet = surf;
60019+   }
60020+
60021+   return 0;
60022+}
60023+
60024+static GLint
60025+compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
60026+{
60027+   GLuint s = glCreateShader(target);
60028+
60029+   if (s == 0) {
60030+      av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
60031+      return 0;
60032+   }
60033+
60034+   glShaderSource(s, 1, (const GLchar **) &source, NULL);
60035+   glCompileShader(s);
60036+
60037+   {
60038+      GLint ok;
60039+      glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
60040+
60041+      if (!ok) {
60042+         GLchar *info;
60043+         GLint size;
60044+
60045+         glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
60046+         info = malloc(size);
60047+
60048+         glGetShaderInfoLog(s, size, NULL, info);
60049+         av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
60050+
60051+         return 0;
60052+      }
60053+   }
60054+
60055+   return s;
60056+}
60057+
60058+static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
60059+{
60060+   GLuint prog = glCreateProgram();
60061+
60062+   if (prog == 0) {
60063+      av_log(s, AV_LOG_ERROR, "Failed to create program\n");
60064+      return 0;
60065+   }
60066+
60067+   glAttachShader(prog, vs);
60068+   glAttachShader(prog, fs);
60069+   glLinkProgram(prog);
60070+
60071+   {
60072+      GLint ok;
60073+      glGetProgramiv(prog, GL_LINK_STATUS, &ok);
60074+      if (!ok) {
60075+         /* Some drivers return a size of 1 for an empty log.  This is the size
60076+          * of a log that contains only a terminating NUL character.
60077+          */
60078+         GLint size;
60079+         GLchar *info = NULL;
60080+         glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
60081+         if (size > 1) {
60082+            info = malloc(size);
60083+            glGetProgramInfoLog(prog, size, NULL, info);
60084+         }
60085+
60086+         av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
60087+                 (info != NULL) ? info : "<empty log>");
60088+         return 0;
60089+      }
60090+   }
60091+
60092+   return prog;
60093+}
60094+
60095+static int
60096+gl_setup(struct AVFormatContext * const s)
60097+{
60098+   const char *vs =
60099+      "attribute vec4 pos;\n"
60100+      "varying vec2 texcoord;\n"
60101+      "\n"
60102+      "void main() {\n"
60103+      "  gl_Position = pos;\n"
60104+      "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
60105+      "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
60106+      "}\n";
60107+   const char *fs =
60108+      "#extension GL_OES_EGL_image_external : enable\n"
60109+      "precision mediump float;\n"
60110+      "uniform samplerExternalOES s;\n"
60111+      "varying vec2 texcoord;\n"
60112+      "void main() {\n"
60113+      "  gl_FragColor = texture2D(s, texcoord);\n"
60114+      "}\n";
60115+
60116+   GLuint vs_s;
60117+   GLuint fs_s;
60118+   GLuint prog;
60119+
60120+   if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
60121+       !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
60122+       !(prog = link_program(s, vs_s, fs_s)))
60123+      return -1;
60124+
60125+   glUseProgram(prog);
60126+
60127+   {
60128+      static const float verts[] = {
60129+         -1, -1,
60130+         1, -1,
60131+         1, 1,
60132+         -1, 1,
60133+      };
60134+      glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
60135+   }
60136+
60137+   glEnableVertexAttribArray(0);
60138+   return 0;
60139+}
60140+
60141+static int egl_vout_write_trailer(AVFormatContext *s)
60142+{
60143+#if TRACE_ALL
60144+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
60145+#endif
60146+
60147+    return 0;
60148+}
60149+
60150+static int egl_vout_write_header(AVFormatContext *s)
60151+{
60152+    const AVCodecParameters * const par = s->streams[0]->codecpar;
60153+
60154+#if TRACE_ALL
60155+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
60156+#endif
60157+    if (   s->nb_streams > 1
60158+        || par->codec_type != AVMEDIA_TYPE_VIDEO
60159+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
60160+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
60161+        return AVERROR(EINVAL);
60162+    }
60163+
60164+    return 0;
60165+}
60166+
60167+
60168+static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
60169+{
60170+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
60171+    egl_aux_t * da = NULL;
60172+    unsigned int i;
60173+
60174+#if TRACE_ALL
60175+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
60176+#endif
60177+
60178+    for (i = 0; i != 32; ++i) {
60179+        if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
60180+            da = de->aux + i;
60181+            break;
60182+        }
60183+    }
60184+
60185+    if (da == NULL) {
60186+        av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
60187+        return AVERROR(EINVAL);
60188+    }
60189+
60190+    if (da->texture == 0) {
60191+        EGLint attribs[50];
60192+        EGLint * a = attribs;
60193+        int i, j;
60194+        static const EGLint anames[] = {
60195+           EGL_DMA_BUF_PLANE0_FD_EXT,
60196+           EGL_DMA_BUF_PLANE0_OFFSET_EXT,
60197+           EGL_DMA_BUF_PLANE0_PITCH_EXT,
60198+           EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
60199+           EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
60200+           EGL_DMA_BUF_PLANE1_FD_EXT,
60201+           EGL_DMA_BUF_PLANE1_OFFSET_EXT,
60202+           EGL_DMA_BUF_PLANE1_PITCH_EXT,
60203+           EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
60204+           EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
60205+           EGL_DMA_BUF_PLANE2_FD_EXT,
60206+           EGL_DMA_BUF_PLANE2_OFFSET_EXT,
60207+           EGL_DMA_BUF_PLANE2_PITCH_EXT,
60208+           EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
60209+           EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
60210+        };
60211+        const EGLint * b = anames;
60212+
60213+        *a++ = EGL_WIDTH;
60214+        *a++ = av_frame_cropped_width(frame);
60215+        *a++ = EGL_HEIGHT;
60216+        *a++ = av_frame_cropped_height(frame);
60217+        *a++ = EGL_LINUX_DRM_FOURCC_EXT;
60218+        *a++ = desc->layers[0].format;
60219+
60220+        for (i = 0; i < desc->nb_layers; ++i) {
60221+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
60222+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
60223+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
60224+                *a++ = *b++;
60225+                *a++ = obj->fd;
60226+                *a++ = *b++;
60227+                *a++ = p->offset;
60228+                *a++ = *b++;
60229+                *a++ = p->pitch;
60230+                if (obj->format_modifier == 0) {
60231+                   b += 2;
60232+                }
60233+                else {
60234+                   *a++ = *b++;
60235+                   *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
60236+                   *a++ = *b++;
60237+                   *a++ = (EGLint)(obj->format_modifier >> 32);
60238+                }
60239+            }
60240+        }
60241+
60242+        *a = EGL_NONE;
60243+
60244+#if TRACE_ALL
60245+        for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
60246+           av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
60247+        }
60248+#endif
60249+        {
60250+           const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
60251+                                              EGL_NO_CONTEXT,
60252+                                              EGL_LINUX_DMA_BUF_EXT,
60253+                                              NULL, attribs);
60254+           if (!image) {
60255+              av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
60256+              return -1;
60257+           }
60258+
60259+           glGenTextures(1, &da->texture);
60260+           glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
60261+           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
60262+           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
60263+           glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
60264+
60265+           eglDestroyImageKHR(de->setup.egl_dpy, image);
60266+        }
60267+
60268+        da->fd = desc->objects[0].fd;
60269+
60270+#if 0
60271+        av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
60272+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
60273+               av_frame_cropped_width(frame),
60274+               av_frame_cropped_height(frame),
60275+               desc->layers[0].format,
60276+               bo_plane_handles[0],
60277+               bo_plane_handles[1],
60278+               bo_plane_handles[2],
60279+               bo_plane_handles[3],
60280+               pitches[0],
60281+               pitches[1],
60282+               pitches[2],
60283+               pitches[3],
60284+               offsets[0],
60285+               offsets[1],
60286+               offsets[2],
60287+               offsets[3],
60288+               (long long)modifiers[0],
60289+               (long long)modifiers[1],
60290+               (long long)modifiers[2],
60291+               (long long)modifiers[3]
60292+               );
60293+#endif
60294+    }
60295+
60296+    glClearColor(0.5, 0.5, 0.5, 0.5);
60297+    glClear(GL_COLOR_BUFFER_BIT);
60298+
60299+    glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
60300+    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
60301+    eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
60302+
60303+    glDeleteTextures(1, &da->texture);
60304+    da->texture = 0;
60305+    da->fd = -1;
60306+
60307+    return 0;
60308+}
60309+
60310+static void * display_thread(void * v)
60311+{
60312+    AVFormatContext * const s = v;
60313+    egl_display_env_t * const de = s->priv_data;
60314+
60315+#if TRACE_ALL
60316+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
60317+#endif
60318+    {
60319+       EGLint egl_major, egl_minor;
60320+
60321+       de->setup.dpy = XOpenDisplay(NULL);
60322+       if (!de->setup.dpy) {
60323+          av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
60324+          goto fail;
60325+       }
60326+
60327+       de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
60328+       if (!de->setup.egl_dpy) {
60329+          av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
60330+          goto fail;
60331+       }
60332+
60333+       if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
60334+           av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
60335+           goto fail;
60336+       }
60337+
60338+       av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
60339+
60340+       if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
60341+          av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
60342+          goto fail;
60343+       }
60344+    }
60345+
60346+    if (!de->window_width || !de->window_height) {
60347+       de->window_width = 1280;
60348+       de->window_height = 720;
60349+    }
60350+    if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
60351+                    &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
60352+       av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
60353+       goto fail;
60354+    }
60355+
60356+    if (gl_setup(s)) {
60357+       av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
60358+       goto fail;
60359+    }
60360+
60361+#if TRACE_ALL
60362+    av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
60363+#endif
60364+    sem_post(&de->display_start_sem);
60365+
60366+    for (;;) {
60367+        AVFrame * frame;
60368+
60369+        while (sem_wait(&de->q_sem) != 0) {
60370+            av_assert0(errno == EINTR);
60371+        }
60372+
60373+        if (de->q_terminate)
60374+            break;
60375+
60376+        pthread_mutex_lock(&de->q_lock);
60377+        frame = de->q_next;
60378+        de->q_next = NULL;
60379+        pthread_mutex_unlock(&de->q_lock);
60380+
60381+        do_display(s, de, frame);
60382+
60383+        av_frame_free(&de->q_this);
60384+        de->q_this = frame;
60385+    }
60386+
60387+#if TRACE_ALL
60388+    av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
60389+#endif
60390+
60391+    return NULL;
60392+
60393+fail:
60394+#if TRACE_ALL
60395+    av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
60396+#endif
60397+    de->q_terminate = 1;
60398+    sem_post(&de->display_start_sem);
60399+
60400+    return NULL;
60401+}
60402+
+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
+{
+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
+    AVFrame * frame;
+    egl_display_env_t * const de = s->priv_data;
+
+#if TRACE_ALL
+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
+#endif
+
+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
+        frame = av_frame_alloc();
+        av_frame_ref(frame, src_frame);
+    }
+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
+        frame = av_frame_alloc();
+        frame->format = AV_PIX_FMT_DRM_PRIME;
+        if (av_hwframe_map(frame, src_frame, 0) != 0)
+        {
+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRIME\n", src_frame->format);
+            av_frame_free(&frame);
+            return AVERROR(EINVAL);
+        }
+    }
+    else {
+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRIME\n", src_frame->format);
+        return AVERROR(EINVAL);
+    }
+
+    // Really hacky sync
+    while (de->show_all && de->q_next) {
+       usleep(3000);
+    }
+
+    pthread_mutex_lock(&de->q_lock);
+    {
+        AVFrame * const t = de->q_next;
+        de->q_next = frame;
+        frame = t;
+    }
+    pthread_mutex_unlock(&de->q_lock);
+
+    if (frame == NULL)
+        sem_post(&de->q_sem);
+    else
+        av_frame_free(&frame);
+
+    return 0;
+}
60452+
60453+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
60454+                          unsigned flags)
60455+{
60456+#if TRACE_ALL
60457+    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
60458+#endif
60459+
60460+    /* egl_vout_write_header() should have accepted only supported formats */
60461+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
60462+        return 0;
60463+
60464+    return 0;
60465+}
60466+
60467+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
60468+{
60469+#if TRACE_ALL
60470+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
60471+#endif
60472+    switch(type) {
60473+    case AV_APP_TO_DEV_WINDOW_REPAINT:
60474+        return 0;
60475+    default:
60476+        break;
60477+    }
60478+    return AVERROR(ENOSYS);
60479+}
60480+
60481+// deinit is called if init fails so no need to clean up explicitly here
60482+static int egl_vout_init(struct AVFormatContext * s)
60483+{
60484+    egl_display_env_t * const de = s->priv_data;
60485+    unsigned int i;
60486+
60487+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
60488+
60489+    de->setup = (struct egl_setup){0};
60490+
60491+    for (i = 0; i != 32; ++i) {
60492+        de->aux[i].fd = -1;
60493+    }
60494+
60495+    de->q_terminate = 0;
60496+    pthread_mutex_init(&de->q_lock, NULL);
60497+    sem_init(&de->q_sem, 0, 0);
60498+    sem_init(&de->display_start_sem, 0, 0);
60499+    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
60500+
60501+    sem_wait(&de->display_start_sem);
60502+    if (de->q_terminate) {
60503+       av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
60504+       return -1;
60505+    }
60506+
60507+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
60508+
60509+    return 0;
60510+}
60511+
60512+static void egl_vout_deinit(struct AVFormatContext * s)
60513+{
60514+    egl_display_env_t * const de = s->priv_data;
60515+
60516+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
60517+
60518+    de->q_terminate = 1;
60519+    sem_post(&de->q_sem);
60520+    pthread_join(de->q_thread, NULL);
60521+    sem_destroy(&de->q_sem);
60522+    pthread_mutex_destroy(&de->q_lock);
60523+
60524+    av_frame_free(&de->q_next);
60525+    av_frame_free(&de->q_this);
60526+
60527+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
60528+}
60529+
60530+#define OFFSET(x) offsetof(egl_display_env_t, x)
60531+static const AVOption options[] = {
60532+   { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
60533+   { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
60534+   { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
60535+   { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
60536+   { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
60537+    { NULL }
60538+
60539+};
60540+
60541+static const AVClass egl_vout_class = {
60542+    .class_name = "egl vid outdev",
60543+    .item_name  = av_default_item_name,
60544+    .option     = options,
60545+    .version    = LIBAVUTIL_VERSION_INT,
60546+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
60547+};
60548+
60549+AVOutputFormat ff_vout_egl_muxer = {
60550+    .name           = "vout_egl",
60551+    .long_name      = NULL_IF_CONFIG_SMALL("Egl video output device"),
60552+    .priv_data_size = sizeof(egl_display_env_t),
60553+    .audio_codec    = AV_CODEC_ID_NONE,
60554+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
60555+    .write_header   = egl_vout_write_header,
60556+    .write_packet   = egl_vout_write_packet,
60557+    .write_uncoded_frame = egl_vout_write_frame,
60558+    .write_trailer  = egl_vout_write_trailer,
60559+    .control_message = egl_vout_control_message,
60560+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
60561+    .priv_class     = &egl_vout_class,
60562+    .init           = egl_vout_init,
60563+    .deinit         = egl_vout_deinit,
60564+};
60565+
60566--- /dev/null
60567+++ b/libavdevice/rpi_vout.c
60568@@ -0,0 +1,534 @@
60569+/*
60570+ * Copyright (c) 2013 Jeff Moguillansky
60571+ *
60572+ * This file is part of FFmpeg.
60573+ *
60574+ * FFmpeg is free software; you can redistribute it and/or
60575+ * modify it under the terms of the GNU Lesser General Public
60576+ * License as published by the Free Software Foundation; either
60577+ * version 2.1 of the License, or (at your option) any later version.
60578+ *
60579+ * FFmpeg is distributed in the hope that it will be useful,
60580+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
60581+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
60582+ * Lesser General Public License for more details.
60583+ *
60584+ * You should have received a copy of the GNU Lesser General Public
60585+ * License along with FFmpeg; if not, write to the Free Software
60586+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
60587+ */
60588+
60589+/**
60590+ * @file
60591+ * Raspberry Pi (MMAL) video output device
60592+ *
60593+ * TODO:
60594+ * - add support to more formats
60595+ */
60596+
60597+#include "libavutil/opt.h"
60598+#include "libavutil/avassert.h"
60599+#include "libavutil/pixdesc.h"
60600+#include "libavutil/imgutils.h"
60601+#include "libavformat/internal.h"
60602+#include "avdevice.h"
60603+
60604+#include <stdatomic.h>
60605+#include <unistd.h>
60606+
60607+#pragma GCC diagnostic push
60608+// Many many redundant decls in the header files
60609+#pragma GCC diagnostic ignored "-Wredundant-decls"
60610+#include <bcm_host.h>
60611+#include <interface/mmal/mmal.h>
60612+#include <interface/mmal/mmal_parameters_camera.h>
60613+#include <interface/mmal/mmal_buffer.h>
60614+#include <interface/mmal/mmal_port.h>
60615+#include <interface/mmal/util/mmal_util.h>
60616+#include <interface/mmal/util/mmal_default_components.h>
60617+#include <interface/mmal/util/mmal_connection.h>
60618+#include <interface/mmal/util/mmal_util_params.h>
60619+#pragma GCC diagnostic pop
60620+#include "libavutil/rpi_sand_fns.h"
60621+#include "libavcodec/rpi_zc.h"
60622+
60623+#define TRACE_ALL 0
60624+
60625+#define DISPLAY_PORT_DEPTH 4
60626+
60627+typedef struct rpi_display_env_s
60628+{
60629+    AVClass *class;
60630+
60631+    MMAL_COMPONENT_T* display;
60632+    MMAL_COMPONENT_T* isp;
60633+    MMAL_PORT_T * port_in;  // Input port of either isp or display depending on pipe setup
60634+    MMAL_CONNECTION_T * conn;
60635+
60636+    MMAL_POOL_T *rpi_pool;
60637+    volatile int rpi_display_count;
60638+
60639+    MMAL_FOURCC_T req_fmt;
60640+    MMAL_VIDEO_FORMAT_T req_vfmt;
60641+
60642+    AVZcEnvPtr zc;
60643+
60644+    int window_width, window_height;
60645+    int window_x, window_y;
60646+    int layer, fullscreen;
60647+    int show_all;
60648+} rpi_display_env_t;
60649+
60650+
60651+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
60652+    mmal_buffer_header_release(buffer);
60653+}
60654+
60655+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
60656+    mmal_buffer_header_release(buffer);
60657+}
60658+
60659+
60660+static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
60661+{
60662+    switch (fmt) {
60663+    case AV_PIX_FMT_SAND128:
60664+    case AV_PIX_FMT_RPI4_8:
60665+        return MMAL_ENCODING_YUVUV128;
60666+    case AV_PIX_FMT_RPI4_10:
60667+        return MMAL_ENCODING_YUV10_COL;
60668+    case AV_PIX_FMT_SAND64_10:
60669+        return MMAL_ENCODING_YUVUV64_10;
60670+    case AV_PIX_FMT_SAND64_16:
60671+        return MMAL_ENCODING_YUVUV64_16;
60672+    case AV_PIX_FMT_YUV420P:
60673+        return MMAL_ENCODING_I420;
60674+
60675+    default:
60676+        break;
60677+    }
60678+    return 0;
60679+}
60680+
60681+
60682+static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
60683+                                       const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
60684+{
60685+    MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video;
60686+    const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref);
60687+    if (av_rpi_is_sand_format(geo->format)) {
60688+        // Sand formats are a bit "special"
60689+        // stride1 implicit in format
60690+        // width = stride2
60691+        vfmt->width = geo->stripe_is_yc ?
60692+            geo->height_y + geo->height_c : geo->height_y;
60693+//        es->height = geo->video_height;  //*** When we get the FLAG this will change
60694+        vfmt->height = geo->height_y;
60695+        es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE;
60696+    }
60697+    else {
60698+        vfmt->width = geo->stride_y / geo->bytes_per_pel;
60699+        vfmt->height = geo->height_y;
60700+        es_fmt->flags = 0;
60701+    }
60702+
60703+    es_fmt->type = MMAL_ES_TYPE_VIDEO;
60704+    es_fmt->encoding = mmfmt_from_avfmt(geo->format);
60705+    es_fmt->encoding_variant = 0;
60706+    es_fmt->bitrate = 0;
60707+
60708+    vfmt->crop.x = frame->crop_left;
60709+    vfmt->crop.y = frame->crop_top;
60710+    vfmt->crop.width = av_frame_cropped_width(frame);
60711+    vfmt->crop.height = av_frame_cropped_height(frame);
60712+
60713+    vfmt->frame_rate.den = 0;  // Don't think I know it here
60714+    vfmt->frame_rate.num = 0;
60715+
60716+    vfmt->par.den = frame->sample_aspect_ratio.den;
60717+    vfmt->par.num = frame->sample_aspect_ratio.num;
60718+
60719+    vfmt->color_space = 0;  // Unknown currently
60720+}
60721+
60722+static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
60723+{
60724+    rpi_display_env_t * const de = userdata;
60725+    if (buf->user_data != NULL) {
60726+        av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data);
60727+        buf->user_data = NULL;
60728+    }
60729+    atomic_fetch_add(&de->rpi_display_count, -1);
60730+    return MMAL_FALSE;
60731+}
60732+
60733+static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt)
60734+{
60735+    return avfmt == AV_PIX_FMT_SAND64_10;
60736+}
60737+
60738+static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de)
60739+{
60740+    if (de->isp != NULL)
60741+    {
60742+        if (de->isp->input[0]->is_enabled)
60743+            mmal_port_disable(de->isp->input[0]);
60744+        if (de->isp->control->is_enabled)
60745+            mmal_port_disable(de->isp->control);
60746+    }
60747+    if (de->conn != NULL) {
60748+        mmal_connection_destroy(de->conn);
60749+        de->conn = NULL;
60750+    }
60751+    if (de->isp != NULL) {
60752+        mmal_component_destroy(de->isp);
60753+        de->isp = NULL;
60754+    }
60755+}
60756+
60757+static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
60758+{
60759+    MMAL_BUFFER_HEADER_T* buf = NULL;
60760+    AVRpiZcRefPtr fr_buf = NULL;
60761+
60762+    if (de == NULL)
60763+        return;
60764+
60765+    if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
60766+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
60767+        return;
60768+    }
60769+
60770+    if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) {
60771+        return;
60772+    }
60773+
60774+    buf = mmal_queue_get(de->rpi_pool->queue);
60775+    if (!buf) {
60776+        // Running too fast so drop the frame (unexpected)
60777+        goto fail;
60778+    }
60779+
60780+    buf->cmd = 0;
60781+    buf->offset = 0;
60782+    buf->flags = 0;
60783+    mmal_buffer_header_reset(buf);
60784+
60785+    atomic_fetch_add(&de->rpi_display_count, 1);  // Deced on release
60786+    mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de);
60787+
60788+    buf->user_data = fr_buf;
60789+    buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf);  // Cast our handle to a pointer for mmal
60790+    buf->offset = av_rpi_zc_offset(fr_buf);
60791+    buf->length = av_rpi_zc_length(fr_buf);
60792+    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
60793+
60794+    while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
60795+        usleep(5000);
60796+    }
60797+
60798+    {
60799+        MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}};
60800+        MMAL_ES_FORMAT_T new_es = {.es = &new_ess};
60801+		MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video;
60802+
60803+        video_format_from_zc_frame(&new_es, fr, fr_buf);
60804+        if (de->req_fmt != new_es.encoding ||
60805+            de->req_vfmt.width       != new_vfmt->width ||
60806+            de->req_vfmt.height      != new_vfmt->height ||
60807+            de->req_vfmt.crop.x      != new_vfmt->crop.x ||
60808+            de->req_vfmt.crop.y      != new_vfmt->crop.y ||
60809+            de->req_vfmt.crop.width  != new_vfmt->crop.width ||
60810+            de->req_vfmt.crop.height != new_vfmt->crop.height) {
60811+            // Something has changed
60812+
60813+            // If we have an ISP tear it down
60814+            isp_remove(s, de);
60815+            de->port_in = de->display->input[0];
60816+
60817+            // If we still need an ISP create it now
60818+            if (avfmt_needs_isp(fr->format))
60819+            {
60820+                if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS)
60821+                {
60822+                    av_log(s, AV_LOG_ERROR, "ISP creation failed\n");
60823+                    goto fail;
60824+                }
60825+                de->port_in = de->isp->input[0];
60826+            }
60827+
60828+            mmal_format_copy(de->port_in->format, &new_es);
60829+
60830+            if (mmal_port_format_commit(de->port_in)) {
60831+                av_log(s, AV_LOG_ERROR, "Failed to commit input format\n");
60832+                goto fail;
60833+            }
60834+
60835+            // If we have an ISP then we must want to use it
60836+            if (de->isp != NULL) {
60837+                MMAL_PORT_T * const port_out = de->isp->output[0];
60838+                MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video;
60839+                MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video;
60840+
60841+                port_out->format->type = MMAL_ES_TYPE_VIDEO;
60842+                port_out->format->encoding  = MMAL_ENCODING_YUVUV128;
60843+                port_out->format->encoding_variant = 0;
60844+                port_out->format->bitrate = 0;
60845+                port_out->format->flags = 0;
60846+                port_out->format->extradata = NULL;
60847+                port_out->format->extradata_size = 0;
60848+
60849+                vfmt_out->width       = (vfmt_in->crop.width + 31) & ~31;
60850+                vfmt_out->height      = (vfmt_in->crop.height + 15) & ~15;
60851+                vfmt_out->crop.x      = 0;
60852+                vfmt_out->crop.y      = 0;
60853+                vfmt_out->crop.width  = vfmt_in->crop.width;
60854+                vfmt_out->crop.height = vfmt_in->crop.height;
60855+                vfmt_out->frame_rate  = vfmt_in->frame_rate;
60856+                vfmt_out->par         = vfmt_in->par;
60857+                vfmt_out->color_space = vfmt_in->color_space;
60858+
60859+                if (mmal_port_format_commit(port_out)) {
60860+                    av_log(s, AV_LOG_ERROR, "Failed to commit output format\n");
60861+                    goto fail;
60862+                }
60863+
60864+                if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) {
60865+                    av_log(s, AV_LOG_ERROR, "Failed to create connection\n");
60866+                    goto fail;
60867+                }
60868+                if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) {
60869+                    av_log(s, AV_LOG_ERROR, "Failed to enable connection\n");
60870+                    goto fail;
60871+                }
60872+                mmal_port_enable(de->isp->control,display_cb_control);
60873+                mmal_component_enable(de->isp);
60874+            }
60875+
60876+            // Number of slots in my port Q
60877+            de->port_in->buffer_num = DISPLAY_PORT_DEPTH;
60878+            // Size to keep it happy - isn't used for anything other than error checking
60879+            de->port_in->buffer_size = buf->alloc_size;
60880+            if (!de->port_in->is_enabled)
60881+            {
60882+                mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle?  Would have expected a vc_image?
60883+                if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) {
60884+                    av_log(s, AV_LOG_ERROR, "Failed to enable input port\n");
60885+                    goto fail;
60886+                }
60887+            }
60888+
60889+            de->req_fmt  = new_es.encoding;
60890+            de->req_vfmt = *new_vfmt;
60891+        }
60892+    }
60893+
60894+    if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
60895+    {
60896+        av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
60897+        goto fail;
60898+    }
60899+    return;
60900+
60901+fail:
60902+    // If we have a buf then fr_buf is held by that
60903+    if (buf != NULL)
60904+        mmal_buffer_header_release(buf);
60905+    else if (fr_buf != NULL)
60906+        av_rpi_zc_unref(fr_buf);
60907+}
60908+
60909+
60910+static int xv_write_trailer(AVFormatContext *s)
60911+{
60912+    rpi_display_env_t * const de = s->priv_data;
60913+#if TRACE_ALL
60914+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
60915+#endif
60916+    if (de->port_in != NULL && de->port_in->is_enabled) {
60917+        mmal_port_disable(de->port_in);
60918+    }
60919+
60920+    // The above disable should kick out all buffers - check that
60921+    if (atomic_load(&de->rpi_display_count) != 0) {
60922+        av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
60923+    }
60924+
60925+    isp_remove(s, de);
60926+    if (de->rpi_pool != NULL) {
60927+        mmal_pool_destroy(de->rpi_pool);
60928+        de->rpi_pool = NULL;
60929+    }
60930+    if (de->display != NULL) {
60931+        mmal_component_destroy(de->display);
60932+        de->display = NULL;
60933+    }
60934+
60935+    return 0;
60936+}
60937+
60938+static int xv_write_header(AVFormatContext *s)
60939+{
60940+    rpi_display_env_t * const de = s->priv_data;
60941+    const AVCodecParameters * const par = s->streams[0]->codecpar;
60942+    const unsigned int w = de->window_width ? de->window_width : par->width;
60943+    const unsigned int h = de->window_height ? de->window_height : par->height;
60944+    const unsigned int x = de->window_x;
60945+    const unsigned int y = de->window_y;
60946+    const int layer = de->layer ? de->layer : 2;
60947+    const MMAL_BOOL_T fullscreen = de->fullscreen;
60948+
60949+#if TRACE_ALL
60950+    av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h);
60951+#endif
60952+    if (   s->nb_streams > 1
60953+        || par->codec_type != AVMEDIA_TYPE_VIDEO
60954+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
60955+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
60956+        return AVERROR(EINVAL);
60957+    }
60958+
60959+    {
60960+        MMAL_DISPLAYREGION_T region =
60961+        {
60962+            .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
60963+            .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN |
60964+                MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA,
60965+            .layer = layer,
60966+            .fullscreen = fullscreen,
60967+            .dest_rect = {x, y, w, h},
60968+            .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS,
60969+        };
60970+
60971+        bcm_host_init();  // Needs to be done by someone...
60972+
60973+        if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS)
60974+        {
60975+            av_log(s, AV_LOG_ERROR, "Failed to create display component\n");
60976+            goto fail;
60977+        }
60978+        de->port_in = de->display->input[0];
60979+
60980+        mmal_port_parameter_set(de->display->input[0], &region.hdr);
60981+
60982+        if (mmal_component_enable(de->display) != MMAL_SUCCESS)
60983+        {
60984+            av_log(s, AV_LOG_ERROR, "Failed to enable display component\n");
60985+            goto fail;
60986+        }
60987+        if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS)
60988+        {
60989+            av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n");
60990+            goto fail;
60991+        }
60992+
60993+        if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL)
60994+        {
60995+            av_log(s, AV_LOG_ERROR, "Failed to create pool\n");
60996+            goto fail;
60997+        }
60998+    }
60999+
61000+    return 0;
61001+
61002+fail:
61003+    xv_write_trailer(s);
61004+    return AVERROR_UNKNOWN;
61005+}
61006+
61007+static int xv_write_packet(AVFormatContext *s, AVPacket *pkt)
61008+{
61009+    AVFrame * const frame = (AVFrame *)pkt->data;
61010+#if TRACE_ALL
61011+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
61012+#endif
61013+    display_frame(s, s->priv_data, frame);
61014+    return 0;
61015+}
61016+
61017+static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
61018+                          unsigned flags)
61019+{
61020+#if TRACE_ALL
61021+    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
61022+#endif
61023+
61024+    /* xv_write_header() should have accepted only supported formats */
61025+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
61026+        return 0;
61027+//    return write_picture(s, (*frame)->data, (*frame)->linesize);
61028+
61029+    display_frame(s, s->priv_data, *ppframe);
61030+    return 0;
61031+}
61032+
61033+static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
61034+{
61035+#if TRACE_ALL
61036+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
61037+#endif
61038+    switch(type) {
61039+    case AV_APP_TO_DEV_WINDOW_REPAINT:
61040+        return 0;
61041+    default:
61042+        break;
61043+    }
61044+    return AVERROR(ENOSYS);
61045+}
61046+
61047+// deinit is called if init fails so no need to clean up explicitly here
61048+static int rpi_vout_init(struct AVFormatContext * s)
61049+{
61050+    rpi_display_env_t * const de = s->priv_data;
61051+
61052+    // Get a ZC context in case we need one - has little overhead if unused
61053+    if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL)
61054+        return 1;
61055+
61056+    return 0;
61057+}
61058+
61059+static void rpi_vout_deinit(struct AVFormatContext * s)
61060+{
61061+    rpi_display_env_t * const de = s->priv_data;
61062+
61063+    av_rpi_zc_int_env_freep(&de->zc);
61064+}
61065+
61066+
61067+#define OFFSET(x) offsetof(rpi_display_env_t, x)
61068+static const AVOption options[] = {
61069+    { "show_all",     "show all frames",        OFFSET(show_all),     AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
61070+    { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
61071+    { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
61072+    { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
61073+    { "display_layer","set display layer",      OFFSET(layer),        AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
61074+    { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
61075+    { NULL }
61076+
61077+};
61078+
61079+static const AVClass xv_class = {
61080+    .class_name = "rpi vid outdev",
61081+    .item_name  = av_default_item_name,
61082+    .option     = options,
61083+    .version    = LIBAVUTIL_VERSION_INT,
61084+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
61085+};
61086+
61087+AVOutputFormat ff_vout_rpi_muxer = {
61088+    .name           = "vout_rpi",
61089+    .long_name      = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"),
61090+    .priv_data_size = sizeof(rpi_display_env_t),
61091+    .audio_codec    = AV_CODEC_ID_NONE,
61092+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
61093+    .write_header   = xv_write_header,
61094+    .write_packet   = xv_write_packet,
61095+    .write_uncoded_frame = xv_write_frame,
61096+    .write_trailer  = xv_write_trailer,
61097+    .control_message = xv_control_message,
61098+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
61099+    .priv_class     = &xv_class,
61100+    .init           = rpi_vout_init,
61101+    .deinit         = rpi_vout_deinit,
61102+};
61103--- a/libavfilter/Makefile
61104+++ b/libavfilter/Makefile
61105@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER)
61106 OBJS-$(CONFIG_DEFLICKER_FILTER)              += vf_deflicker.o
61107 OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER)        += vf_deinterlace_qsv.o
61108 OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER)      += vf_deinterlace_vaapi.o vaapi_vpp.o
61109+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER)    += vf_deinterlace_v4l2m2m.o
61110 OBJS-$(CONFIG_DEJUDDER_FILTER)               += vf_dejudder.o
61111 OBJS-$(CONFIG_DELOGO_FILTER)                 += vf_delogo.o
61112 OBJS-$(CONFIG_DENOISE_VAAPI_FILTER)          += vf_misc_vaapi.o vaapi_vpp.o
61113@@ -434,6 +435,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER)
61114 OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER)        += vf_transpose_vaapi.o vaapi_vpp.o
61115 OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
61116 OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
61117+OBJS-$(CONFIG_UNSAND_FILTER)                 += vf_unsand.o
61118 OBJS-$(CONFIG_UNSHARP_FILTER)                += vf_unsharp.o
61119 OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER)         += vf_unsharp_opencl.o opencl.o \
61120                                                 opencl/unsharp.o
61121--- a/libavfilter/allfilters.c
61122+++ b/libavfilter/allfilters.c
61123@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot;
61124 extern AVFilter ff_vf_deflate;
61125 extern AVFilter ff_vf_deflicker;
61126 extern AVFilter ff_vf_deinterlace_qsv;
61127+extern AVFilter ff_vf_deinterlace_v4l2m2m;
61128 extern AVFilter ff_vf_deinterlace_vaapi;
61129 extern AVFilter ff_vf_dejudder;
61130 extern AVFilter ff_vf_delogo;
61131@@ -414,6 +415,7 @@ extern AVFilter ff_vf_transpose_opencl;
61132 extern AVFilter ff_vf_transpose_vaapi;
61133 extern AVFilter ff_vf_trim;
61134 extern AVFilter ff_vf_unpremultiply;
61135+extern AVFilter ff_vf_unsand;
61136 extern AVFilter ff_vf_unsharp;
61137 extern AVFilter ff_vf_unsharp_opencl;
61138 extern AVFilter ff_vf_untile;
61139--- a/libavfilter/avfiltergraph.c
61140+++ b/libavfilter/avfiltergraph.c
61141@@ -32,6 +32,9 @@
61142 #include "libavutil/internal.h"
61143 #include "libavutil/opt.h"
61144 #include "libavutil/pixdesc.h"
61145+#if CONFIG_UNSAND_FILTER
61146+#include "libavutil/rpi_sand_fns.h"
61147+#endif
61148
61149 #define FF_INTERNAL_FIELDS 1
61150 #include "framequeue.h"
61151@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFor
61152     }
61153 }
61154
61155+#if CONFIG_UNSAND_FILTER
61156+static int has_sand_format(const AVFilterFormats * const ff)
61157+{
61158+    int i;
61159+    for (i = 0; i != ff->nb_formats; ++i) {
61160+        if (av_rpi_is_sand_format(ff->formats[i])) {
61161+            return 1;
61162+        }
61163+    }
61164+    return 0;
61165+}
61166+#endif
61167+
61168 /**
61169  * Perform one round of query_formats() and merging formats lists on the
61170  * filter graph.
61171@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *
61172         for (j = 0; j < filter->nb_inputs; j++) {
61173             AVFilterLink *link = filter->inputs[j];
61174             int convert_needed = 0;
61175+            unsigned int extra_convert_tried = 0;
61176
61177             if (!link)
61178                 continue;
61179@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *
61180             )
61181 #undef MERGE_DISPATCH
61182
61183-            if (convert_needed) {
61184+            while (convert_needed) {
61185                 AVFilterContext *convert;
61186                 const AVFilter *filter;
61187                 AVFilterLink *inlink, *outlink;
61188                 char inst_name[30];
61189+                int can_retry = 0;
61190+
61191+                convert_needed = 0;
61192
61193                 if (graph->disable_auto_convert) {
61194                     av_log(log_ctx, AV_LOG_ERROR,
61195@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *
61196                 /* couldn't merge format lists. auto-insert conversion filter */
61197                 switch (link->type) {
61198                 case AVMEDIA_TYPE_VIDEO:
61199-                    if (!(filter = avfilter_get_by_name("scale"))) {
61200-                        av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
61201-                               "not present, cannot convert pixel formats.\n");
61202-                        return AVERROR(EINVAL);
61203-                    }
61204-
61205-                    snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
61206-                             scaler_count++);
61207+#if CONFIG_UNSAND_FILTER
61208+                    // Only try each extra conversion once
61209+                    // The unsand output pad should never trigger has_sand_format
61210+                    // but it is better to be safe
61211+                    if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) {
61212+                        if (!(filter = avfilter_get_by_name("unsand"))) {
61213+                            av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
61214+                                   "not present, cannot convert pixel formats.\n");
61215+                            return AVERROR(EINVAL);
61216+                        }
61217+
61218+                        snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
61219+                                 scaler_count++);
61220+
61221+                        if ((ret = avfilter_graph_create_filter(&convert, filter,
61222+                                                                inst_name, "", NULL,
61223+                                                                graph)) < 0)
61224+                            return ret;
61225
61226-                    if ((ret = avfilter_graph_create_filter(&convert, filter,
61227-                                                            inst_name, graph->scale_sws_opts, NULL,
61228-                                                            graph)) < 0)
61229-                        return ret;
61230+                        extra_convert_tried |= 1;
61231+                        can_retry = 1;
61232+                    }
61233+                    else
61234+#endif
61235+                    {
61236+                        if (!(filter = avfilter_get_by_name("scale"))) {
61237+                            av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
61238+                                   "not present, cannot convert pixel formats.\n");
61239+                            return AVERROR(EINVAL);
61240+                        }
61241+
61242+                        snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
61243+                                 scaler_count++);
61244+
61245+                        if ((ret = avfilter_graph_create_filter(&convert, filter,
61246+                                                                inst_name, graph->scale_sws_opts, NULL,
61247+                                                                graph)) < 0)
61248+                            return ret;
61249+                    }
61250                     break;
61251                 case AVMEDIA_TYPE_AUDIO:
61252                     if (!(filter = avfilter_get_by_name("aresample"))) {
61253@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *
61254                     av_assert0(outlink-> in_channel_layouts->refcount > 0);
61255                     av_assert0(outlink->out_channel_layouts->refcount > 0);
61256                 }
61257-                if (!ff_merge_formats( inlink->in_formats,  inlink->out_formats,  inlink->type) ||
61258-                    !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
61259+                // If we have added an extra filter we must merge the input
61260+                // side but we can have another go at the output
61261+                if (!ff_merge_formats( inlink->in_formats,  inlink->out_formats,  inlink->type))
61262                     ret = AVERROR(ENOSYS);
61263+                else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
61264+                {
61265+                    if (can_retry) {
61266+                        link = outlink;
61267+                        convert_needed = 1;
61268+                        continue;
61269+                    }
61270+                    ret = AVERROR(ENOSYS);
61271+                }
61272                 if (inlink->type == AVMEDIA_TYPE_AUDIO &&
61273                     (!ff_merge_samplerates(inlink->in_samplerates,
61274                                            inlink->out_samplerates) ||
61275--- a/libavfilter/buffersrc.c
61276+++ b/libavfilter/buffersrc.c
61277@@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_intern
61278
61279         switch (ctx->outputs[0]->type) {
61280         case AVMEDIA_TYPE_VIDEO:
61281-            CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
61282+            CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
61283                                      frame->format, frame->pts);
61284             break;
61285         case AVMEDIA_TYPE_AUDIO:
61286--- /dev/null
61287+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
61288@@ -0,0 +1,1336 @@
61289+/*
61290+ * This file is part of FFmpeg.
61291+ *
61292+ * FFmpeg is free software; you can redistribute it and/or
61293+ * modify it under the terms of the GNU Lesser General Public
61294+ * License as published by the Free Software Foundation; either
61295+ * version 2.1 of the License, or (at your option) any later version.
61296+ *
61297+ * FFmpeg is distributed in the hope that it will be useful,
61298+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
61299+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
61300+ * Lesser General Public License for more details.
61301+ *
61302+ * You should have received a copy of the GNU Lesser General Public
61303+ * License along with FFmpeg; if not, write to the Free Software
61304+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
61305+ */
61306+
61307+/**
61308+ * @file
61309+ * deinterlace video filter - V4L2 M2M
61310+ */
61311+
61312+#include <drm_fourcc.h>
61313+
61314+#include <linux/videodev2.h>
61315+
61316+#include <dirent.h>
61317+#include <fcntl.h>
61318+#include <poll.h>
61319+#include <stdatomic.h>
61320+#include <stdio.h>
61321+#include <string.h>
61322+#include <sys/ioctl.h>
61323+#include <sys/mman.h>
61324+#include <unistd.h>
61325+
61326+#include "libavutil/avassert.h"
61327+#include "libavutil/avstring.h"
61328+#include "libavutil/common.h"
61329+#include "libavutil/hwcontext.h"
61330+#include "libavutil/hwcontext_drm.h"
61331+#include "libavutil/internal.h"
61332+#include "libavutil/mathematics.h"
61333+#include "libavutil/opt.h"
61334+#include "libavutil/pixdesc.h"
61335+#include "libavutil/time.h"
61336+
61337+#define FF_INTERNAL_FIELDS 1
61338+#include "framequeue.h"
61339+#include "filters.h"
61340+#include "avfilter.h"
61341+#include "formats.h"
61342+#include "internal.h"
61343+#include "video.h"
61344+
61345+typedef struct V4L2Queue V4L2Queue;
61346+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
61347+
61348+typedef struct V4L2PlaneInfo {
61349+    int bytesperline;
61350+    size_t length;
61351+} V4L2PlaneInfo;
61352+
61353+typedef struct V4L2Buffer {
61354+    int enqueued;
61355+    int reenqueue;
61356+    int fd;
61357+    struct v4l2_buffer buffer;
61358+    AVFrame frame;
61359+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
61360+    int num_planes;
61361+    V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
61362+    AVDRMFrameDescriptor drm_frame;
61363+    V4L2Queue *q;
61364+} V4L2Buffer;
61365+
61366+typedef struct V4L2Queue {
61367+    struct v4l2_format format;
61368+    int num_buffers;
61369+    V4L2Buffer *buffers;
61370+    DeintV4L2M2MContextShared *ctx;
61371+} V4L2Queue;
61372+
61373+typedef struct pts_stats_s
61374+{
61375+    void * logctx;
61376+    const char * name;  // For debug
61377+    unsigned int last_count;
61378+    unsigned int last_interval;
61379+    int64_t last_pts;
61380+} pts_stats_t;
61381+
61382+#define PTS_TRACK_SIZE 32
61383+typedef struct pts_track_el_s
61384+{
61385+    uint32_t n;
61386+    unsigned int interval;
61387+    AVFrame * props;
61388+} pts_track_el_t;
61389+
61390+typedef struct pts_track_s
61391+{
61392+    uint32_t n;
61393+    uint32_t last_n;
61394+    int got_2;
61395+    void * logctx;
61396+    pts_stats_t stats;
61397+    pts_track_el_t a[PTS_TRACK_SIZE];
61398+} pts_track_t;
61399+
61400+typedef struct DeintV4L2M2MContextShared {
61401+    void * logctx;  // For logging - will be NULL when done
61402+
61403+    int fd;
61404+    int done;
61405+    int width;
61406+    int height;
61407+    int orig_width;
61408+    int orig_height;
61409+    atomic_uint refcount;
61410+
61411+    AVBufferRef *hw_frames_ctx;
61412+
61413+    unsigned int field_order;
61414+
61415+    pts_track_t track;
61416+
61417+    V4L2Queue output;
61418+    V4L2Queue capture;
61419+} DeintV4L2M2MContextShared;
61420+
61421+typedef struct DeintV4L2M2MContext {
61422+    const AVClass *class;
61423+
61424+    DeintV4L2M2MContextShared *shared;
61425+} DeintV4L2M2MContext;
61426+
61427+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
61428+{
61429+    return stats->last_interval;
61430+}
61431+
61432+// Pick 64 for max last count - that is >1sec at 60fps
61433+#define STATS_LAST_COUNT_MAX 64
61434+#define STATS_INTERVAL_MAX (1 << 30)
61435+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
61436+{
61437+    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
61438+        if (stats->last_count < STATS_LAST_COUNT_MAX)
61439+            ++stats->last_count;
61440+        return;
61441+    }
61442+
61443+    if (stats->last_pts != AV_NOPTS_VALUE) {
61444+        const int64_t interval = pts - stats->last_pts;
61445+
61446+        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
61447+            stats->last_count >= STATS_LAST_COUNT_MAX) {
61448+            if (stats->last_interval != 0)
61449+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
61450+                       __func__, stats->name, interval, stats->last_count);
61451+            stats->last_interval = 0;
61452+        }
61453+        else {
61454+            const int64_t frame_time = interval / (int64_t)stats->last_count;
61455+
61456+            if (frame_time != stats->last_interval)
61457+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
61458+                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
61459+            stats->last_interval = frame_time;
61460+        }
61461+    }
61462+
61463+    stats->last_pts = pts;
61464+    stats->last_count = 1;
61465+}
61466+
61467+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
61468+{
61469+    *stats = (pts_stats_t){
61470+        .logctx = logctx,
61471+        .name = name,
61472+        .last_count = 1,
61473+        .last_interval = 0,
61474+        .last_pts = AV_NOPTS_VALUE
61475+    };
61476+}
61477+
61478+static inline uint32_t pts_track_next_n(pts_track_t * const trk)
61479+{
61480+    if (++trk->n == 0)
61481+        trk->n = 1;
61482+    return trk->n;
61483+}
61484+
61485+static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
61486+{
61487+    uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
61488+    pts_track_el_t * t;
61489+
61490+    // As a first guess assume that n==0 means last frame
61491+    if (n == 0) {
61492+        n = trk->last_n;
61493+        if (n == 0)
61494+            goto fail;
61495+    }
61496+
61497+    t = trk->a + (n & (PTS_TRACK_SIZE - 1));
61498+
61499+    if (t->n != n) {
61500+        av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
61501+        goto fail;
61502+    }
61503+
61504+    // 1st frame is simple - just believe it
61505+    if (n != trk->last_n) {
61506+        trk->last_n = n;
61507+        trk->got_2 = 0;
61508+        return av_frame_copy_props(dst, t->props);
61509+    }
61510+
61511+    // Only believe in a single interpolated frame
61512+    if (trk->got_2)
61513+        goto fail;
61514+    trk->got_2 = 1;
61515+
61516+    av_frame_copy_props(dst, t->props);
61517+
61518+
61519+    // If we can't guess - don't
61520+    if (t->interval == 0) {
61521+        dst->best_effort_timestamp = AV_NOPTS_VALUE;
61522+        dst->pts = AV_NOPTS_VALUE;
61523+        dst->pkt_dts = AV_NOPTS_VALUE;
61524+    }
61525+    else {
61526+        if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
61527+            dst->best_effort_timestamp += t->interval / 2;
61528+        if (dst->pts != AV_NOPTS_VALUE)
61529+            dst->pts += t->interval / 2;
61530+        if (dst->pkt_dts != AV_NOPTS_VALUE)
61531+            dst->pkt_dts += t->interval / 2;
61532+    }
61533+
61534+    return 0;
61535+
61536+fail:
61537+    trk->last_n = 0;
61538+    trk->got_2 = 0;
61539+    dst->pts = AV_NOPTS_VALUE;
61540+    dst->pkt_dts = AV_NOPTS_VALUE;
61541+    return 0;
61542+}
61543+
61544+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
61545+{
61546+    const uint32_t n = pts_track_next_n(trk);
61547+    pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1));
61548+
61549+    pts_stats_add(&trk->stats, src->pts);
61550+
61551+    t->n = n;
61552+    t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
61553+    av_frame_unref(t->props);
61554+    av_frame_copy_props(t->props, src);
61555+
+    // The previous interval is now known rather than guessed, so
+    // back-fill it into the previous track element.  In most cases
+    // this happens before that guessed value is ever consumed.
61559+    if (t->interval != 0) {
61560+        pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
61561+        prev_t->interval = t->interval;
61562+    }
61563+
61564+    // In case deinterlace interpolates frames use every other usec
61565+    return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2};
61566+}
61567+
61568+static void pts_track_uninit(pts_track_t * const trk)
61569+{
61570+    unsigned int i;
61571+    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
61572+        trk->a[i].n = 0;
61573+        av_frame_free(&trk->a[i].props);
61574+    }
61575+}
61576+
61577+static int pts_track_init(pts_track_t * const trk, void *logctx)
61578+{
61579+    unsigned int i;
61580+    trk->n = 1;
61581+    pts_stats_init(&trk->stats, logctx, "track");
61582+    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
61583+        trk->a[i].n = 0;
61584+        if ((trk->a[i].props = av_frame_alloc()) == NULL) {
61585+            pts_track_uninit(trk);
61586+            return AVERROR(ENOMEM);
61587+        }
61588+    }
61589+    return 0;
61590+}
61591+
61592+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
61593+{
61594+    struct v4l2_capability cap;
61595+    int ret;
61596+
61597+    memset(&cap, 0, sizeof(cap));
61598+    ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
61599+    if (ret < 0)
61600+        return ret;
61601+
61602+    if (!(cap.capabilities & V4L2_CAP_STREAMING))
61603+        return AVERROR(EINVAL);
61604+
61605+    if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
61606+        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
61607+        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
61608+
61609+        return 0;
61610+    }
61611+
61612+    if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
61613+        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
61614+        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
61615+
61616+        return 0;
61617+    }
61618+
61619+    return AVERROR(EINVAL);
61620+}
61621+
61622+static int deint_v4l2m2m_try_format(V4L2Queue *queue)
61623+{
61624+    struct v4l2_format *fmt        = &queue->format;
61625+    DeintV4L2M2MContextShared *ctx = queue->ctx;
61626+    int ret, field;
61627+
61628+    ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
61629+    if (ret)
61630+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
61631+
61632+    if (V4L2_TYPE_IS_OUTPUT(fmt->type))
61633+        field = V4L2_FIELD_INTERLACED_TB;
61634+    else
61635+        field = V4L2_FIELD_NONE;
61636+
61637+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
61638+        fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
61639+        fmt->fmt.pix_mp.field = field;
61640+        fmt->fmt.pix_mp.width = ctx->width;
61641+        fmt->fmt.pix_mp.height = ctx->height;
61642+    } else {
61643+        fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
61644+        fmt->fmt.pix.field = field;
61645+        fmt->fmt.pix.width = ctx->width;
61646+        fmt->fmt.pix.height = ctx->height;
61647+    }
61648+
61649+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
61650+		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
61651+		 fmt->fmt.pix_mp.pixelformat,
61652+		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
61653+
61654+    ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
61655+    if (ret)
61656+        return AVERROR(EINVAL);
61657+
61658+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
61659+		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
61660+		 fmt->fmt.pix_mp.pixelformat,
61661+		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
61662+
61663+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
61664+        if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 &&
61665+             fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) ||
61666+            fmt->fmt.pix_mp.field != field) {
61667+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
61668+
61669+            return AVERROR(EINVAL);
61670+        }
61671+    } else {
61672+        if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 &&
61673+             fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) ||
61674+            fmt->fmt.pix.field != field) {
61675+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
61676+
61677+            return AVERROR(EINVAL);
61678+        }
61679+    }
61680+
61681+    return 0;
61682+}
61683+
61684+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
61685+{
61686+    struct v4l2_format *fmt        = &queue->format;
61687+    DeintV4L2M2MContextShared *ctx = queue->ctx;
61688+    int ret;
61689+
61690+    struct v4l2_selection sel = {
61691+        .type = fmt->type,
61692+        .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
61693+    };
61694+
61695+    // This works for most single object 4:2:0 types
61696+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
61697+        fmt->fmt.pix_mp.pixelformat = pixelformat;
61698+        fmt->fmt.pix_mp.field = field;
61699+        fmt->fmt.pix_mp.width = width;
61700+        fmt->fmt.pix_mp.height = ysize / pitch;
61701+        fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
61702+        fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
61703+    } else {
61704+        fmt->fmt.pix.pixelformat = pixelformat;
61705+        fmt->fmt.pix.field = field;
61706+        fmt->fmt.pix.width = width;
61707+        fmt->fmt.pix.height = height;
61708+        fmt->fmt.pix.sizeimage = 0;
61709+        fmt->fmt.pix.bytesperline = 0;
61710+    }
61711+
61712+    ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
61713+    if (ret) {
61714+        ret = AVERROR(errno);
61715+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
61716+        return ret;
61717+    }
61718+
61719+    if (pixelformat != fmt->fmt.pix.pixelformat) {
61720+        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
61721+        return AVERROR(EINVAL);
61722+    }
61723+
61724+    ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
61725+    if (ret) {
61726+        ret = AVERROR(errno);
61727+        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
61728+    }
61729+
61730+    sel.r.width = width;
61731+    sel.r.height = height;
61732+    sel.r.left = 0;
61733+    sel.r.top = 0;
61734+    sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
61735+    sel.flags = V4L2_SEL_FLAG_LE;
61736+
61737+    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
61738+    if (ret) {
61739+        ret = AVERROR(errno);
61740+        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
61741+    }
61742+
61743+    return 0;
61744+}
61745+
61746+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
61747+{
61748+    int ret;
61749+
61750+    ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
61751+    if (ctx->fd < 0)
61752+        return AVERROR(errno);
61753+
61754+    ret = deint_v4l2m2m_prepare_context(ctx);
61755+    if (ret)
61756+        goto fail;
61757+
61758+    ret = deint_v4l2m2m_try_format(&ctx->capture);
61759+    if (ret)
61760+        goto fail;
61761+
61762+    ret = deint_v4l2m2m_try_format(&ctx->output);
61763+    if (ret)
61764+        goto fail;
61765+
61766+    return 0;
61767+
61768+fail:
61769+    close(ctx->fd);
61770+    ctx->fd = -1;
61771+
61772+    return ret;
61773+}
61774+
61775+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
61776+{
61777+    int ret = AVERROR(EINVAL);
61778+    struct dirent *entry;
61779+    char node[PATH_MAX];
61780+    DIR *dirp;
61781+
61782+    dirp = opendir("/dev");
61783+    if (!dirp)
61784+        return AVERROR(errno);
61785+
61786+    for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
61787+
61788+        if (strncmp(entry->d_name, "video", 5))
61789+            continue;
61790+
61791+        snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
61792+        av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
61793+        ret = deint_v4l2m2m_probe_device(ctx, node);
61794+        if (!ret)
61795+            break;
61796+    }
61797+
61798+    closedir(dirp);
61799+
61800+    if (ret) {
61801+        av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
61802+        ctx->fd = -1;
61803+
61804+        return ret;
61805+    }
61806+
61807+    av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
61808+
61809+    return 0;
61810+}
61811+
61812+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
61813+{
61814+    int ret;
61815+
61816+    ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
61817+    if (ret < 0)
61818+        return AVERROR(errno);
61819+
61820+    buf->enqueued = 1;
61821+
61822+    return 0;
61823+}
61824+
61825+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
61826+{
61827+    struct v4l2_exportbuffer expbuf;
61828+    int i, ret;
61829+    uint64_t mod = DRM_FORMAT_MOD_LINEAR;
61830+    uint32_t fmt = 0;
61831+
61832+    switch (pixelformat) {
61833+    case V4L2_PIX_FMT_NV12:
61834+        fmt = DRM_FORMAT_NV12;
61835+        break;
61836+    case V4L2_PIX_FMT_YUV420:
61837+        fmt = DRM_FORMAT_YUV420;
61838+        break;
61839+    default:
61840+        return AVERROR(EINVAL);
61841+    }
61842+
61843+    avbuf->drm_frame.layers[0].format = fmt;
61844+
61845+    for (i = 0; i < avbuf->num_planes; i++) {
61846+        memset(&expbuf, 0, sizeof(expbuf));
61847+
61848+        expbuf.index = avbuf->buffer.index;
61849+        expbuf.type = avbuf->buffer.type;
61850+        expbuf.plane = i;
61851+
61852+        ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
61853+        if (ret < 0)
61854+            return AVERROR(errno);
61855+
61856+        avbuf->fd = expbuf.fd;
61857+
61858+        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
61859+            /* drm frame */
61860+            avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
61861+            avbuf->drm_frame.objects[i].fd = expbuf.fd;
61862+            avbuf->drm_frame.objects[i].format_modifier = mod;
61863+        } else {
61864+            /* drm frame */
61865+            avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
61866+            avbuf->drm_frame.objects[0].fd = expbuf.fd;
61867+            avbuf->drm_frame.objects[0].format_modifier = mod;
61868+        }
61869+    }
61870+
61871+    return 0;
61872+}
61873+
61874+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
61875+{
61876+    struct v4l2_format *fmt = &queue->format;
61877+    DeintV4L2M2MContextShared *ctx = queue->ctx;
61878+    struct v4l2_requestbuffers req;
61879+    int ret, i, j, multiplanar;
61880+    uint32_t memory;
61881+
61882+    memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
61883+        V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
61884+
61885+    multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
61886+
61887+    memset(&req, 0, sizeof(req));
61888+    req.count = queue->num_buffers;
61889+    req.memory = memory;
61890+    req.type = fmt->type;
61891+
61892+    ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
61893+    if (ret < 0) {
61894+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
61895+
61896+        return AVERROR(errno);
61897+    }
61898+
61899+    queue->num_buffers = req.count;
61900+    queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
61901+    if (!queue->buffers) {
61902+        av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
61903+
61904+        return AVERROR(ENOMEM);
61905+    }
61906+
61907+    for (i = 0; i < queue->num_buffers; i++) {
61908+        V4L2Buffer *buf = &queue->buffers[i];
61909+
61910+        buf->enqueued = 0;
61911+        buf->fd = -1;
61912+        buf->q = queue;
61913+
61914+        buf->buffer.type = fmt->type;
61915+        buf->buffer.memory = memory;
61916+        buf->buffer.index = i;
61917+
61918+        if (multiplanar) {
61919+            buf->buffer.length = VIDEO_MAX_PLANES;
61920+            buf->buffer.m.planes = buf->planes;
61921+        }
61922+
61923+        ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
61924+        if (ret < 0) {
61925+            ret = AVERROR(errno);
61926+
61927+            goto fail;
61928+        }
61929+
61930+        if (multiplanar)
61931+            buf->num_planes = buf->buffer.length;
61932+        else
61933+            buf->num_planes = 1;
61934+
61935+        for (j = 0; j < buf->num_planes; j++) {
61936+            V4L2PlaneInfo *info = &buf->plane_info[j];
61937+
61938+            if (multiplanar) {
61939+                info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
61940+                info->length = buf->buffer.m.planes[j].length;
61941+            } else {
61942+                info->bytesperline = fmt->fmt.pix.bytesperline;
61943+                info->length = buf->buffer.length;
61944+            }
61945+        }
61946+
61947+        if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
61948+            ret = deint_v4l2m2m_enqueue_buffer(buf);
61949+            if (ret)
61950+                goto fail;
61951+
61952+            ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
61953+            if (ret)
61954+                goto fail;
61955+        }
61956+    }
61957+
61958+    return 0;
61959+
61960+fail:
61961+    for (i = 0; i < queue->num_buffers; i++)
61962+        if (queue->buffers[i].fd >= 0)
61963+            close(queue->buffers[i].fd);
61964+    av_free(queue->buffers);
61965+    queue->buffers = NULL;
61966+
61967+    return ret;
61968+}
61969+
61970+static int deint_v4l2m2m_streamon(V4L2Queue *queue)
61971+{
61972+    DeintV4L2M2MContextShared * const ctx = queue->ctx;
61973+    int type = queue->format.type;
61974+    int ret;
61975+
61976+    ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
61977+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
61978+    if (ret < 0)
61979+        return AVERROR(errno);
61980+
61981+    return 0;
61982+}
61983+
61984+static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
61985+{
61986+    DeintV4L2M2MContextShared * const ctx = queue->ctx;
61987+    int type = queue->format.type;
61988+    int ret;
61989+
61990+    ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
61991+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
61992+    if (ret < 0)
61993+        return AVERROR(errno);
61994+
61995+    return 0;
61996+}
61997+
61998+// timeout in ms
61999+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
62000+{
62001+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
62002+    DeintV4L2M2MContextShared *ctx = queue->ctx;
62003+    struct v4l2_buffer buf = { 0 };
62004+    V4L2Buffer* avbuf = NULL;
62005+    struct pollfd pfd;
62006+    short events;
62007+    int ret;
62008+
62009+    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
62010+        events =  POLLOUT | POLLWRNORM;
62011+    else
62012+        events = POLLIN | POLLRDNORM;
62013+
62014+    pfd.events = events;
62015+    pfd.fd = ctx->fd;
62016+
62017+    for (;;) {
62018+        ret = poll(&pfd, 1, timeout);
62019+        if (ret > 0)
62020+            break;
62021+        if (errno == EINTR)
62022+            continue;
62023+        return NULL;
62024+    }
62025+
62026+    if (pfd.revents & POLLERR)
62027+        return NULL;
62028+
62029+    if (pfd.revents & events) {
62030+        memset(&buf, 0, sizeof(buf));
62031+        buf.memory = V4L2_MEMORY_MMAP;
62032+        buf.type = queue->format.type;
62033+        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
62034+            memset(planes, 0, sizeof(planes));
62035+            buf.length = VIDEO_MAX_PLANES;
62036+            buf.m.planes = planes;
62037+        }
62038+
62039+        ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
62040+        if (ret) {
62041+            if (errno != EAGAIN)
62042+                av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
62043+                       av_err2str(AVERROR(errno)));
62044+            return NULL;
62045+        }
62046+
62047+        avbuf = &queue->buffers[buf.index];
62048+        avbuf->enqueued = 0;
62049+        avbuf->buffer = buf;
62050+        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
62051+            memcpy(avbuf->planes, planes, sizeof(planes));
62052+            avbuf->buffer.m.planes = avbuf->planes;
62053+        }
62054+        return avbuf;
62055+    }
62056+
62057+    return NULL;
62058+}
62059+
62060+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
62061+{
62062+    int i;
62063+    V4L2Buffer *buf = NULL;
62064+
62065+    for (i = 0; i < queue->num_buffers; i++)
62066+        if (!queue->buffers[i].enqueued) {
62067+            buf = &queue->buffers[i];
62068+            break;
62069+        }
62070+    return buf;
62071+}
62072+
62073+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
62074+{
62075+    int i;
62076+    V4L2Buffer *buf = NULL;
62077+
62078+    if (!queue || !queue->buffers)
62079+        return;
62080+    for (i = 0; i < queue->num_buffers; i++) {
62081+        buf = &queue->buffers[i];
62082+        if (queue->buffers[i].enqueued)
62083+            av_frame_unref(&buf->frame);
62084+    }
62085+}
62086+
62087+static void recycle_q(V4L2Queue * const queue)
62088+{
62089+    V4L2Buffer* avbuf;
62090+    while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
62091+        av_frame_unref(&avbuf->frame);
62092+    }
62093+}
62094+
62095+static int count_enqueued(V4L2Queue *queue)
62096+{
62097+    int i;
62098+    int n = 0;
62099+
62100+    if (queue->buffers == NULL)
62101+        return 0;
62102+
62103+    for (i = 0; i < queue->num_buffers; i++)
62104+        if (queue->buffers[i].enqueued)
62105+            ++n;
62106+    return n;
62107+}
62108+
62109+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
62110+{
62111+    DeintV4L2M2MContextShared *const ctx = queue->ctx;
62112+    AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
62113+    V4L2Buffer *buf;
62114+    int i;
62115+
62116+    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
62117+        recycle_q(queue);
62118+
62119+    buf = deint_v4l2m2m_find_free_buf(queue);
62120+    if (!buf) {
62121+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
62122+        return AVERROR(EAGAIN);
62123+    }
62124+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
62125+        for (i = 0; i < drm_desc->nb_objects; i++)
62126+            buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
62127+    else
62128+        buf->buffer.m.fd = drm_desc->objects[0].fd;
62129+
62130+    buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
62131+        frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
62132+            V4L2_FIELD_INTERLACED_BT;
62133+
62134+    if (ctx->field_order != buf->buffer.field) {
62135+        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
62136+        ctx->field_order = buf->buffer.field;
62137+    }
62138+
62139+    buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
62140+
62141+    buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
62142+
62143+    av_frame_move_ref(&buf->frame, frame);
62144+
62145+    return deint_v4l2m2m_enqueue_buffer(buf);
62146+}
62147+
62148+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
62149+{
62150+    if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
62151+        V4L2Queue *capture = &ctx->capture;
62152+        V4L2Queue *output  = &ctx->output;
62153+        int i;
62154+
62155+        av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
62156+
62157+        if (ctx->fd >= 0) {
62158+            deint_v4l2m2m_streamoff(capture);
62159+            deint_v4l2m2m_streamoff(output);
62160+        }
62161+
62162+        if (capture->buffers)
62163+            for (i = 0; i < capture->num_buffers; i++) {
62164+                capture->buffers[i].q = NULL;
62165+                if (capture->buffers[i].fd >= 0)
62166+                    close(capture->buffers[i].fd);
62167+            }
62168+
62169+        deint_v4l2m2m_unref_queued(output);
62170+
62171+        av_buffer_unref(&ctx->hw_frames_ctx);
62172+
62173+        if (capture->buffers)
62174+            av_free(capture->buffers);
62175+
62176+        if (output->buffers)
62177+            av_free(output->buffers);
62178+
62179+        if (ctx->fd >= 0) {
62180+            close(ctx->fd);
62181+            ctx->fd = -1;
62182+        }
62183+
62184+        av_free(ctx);
62185+    }
62186+}
62187+
62188+static void v4l2_free_buffer(void *opaque, uint8_t *unused)
62189+{
62190+    V4L2Buffer *buf                = opaque;
62191+    DeintV4L2M2MContextShared *ctx = buf->q->ctx;
62192+
62193+    if (!ctx->done)
62194+        deint_v4l2m2m_enqueue_buffer(buf);
62195+
62196+    deint_v4l2m2m_destroy_context(ctx);
62197+}
62198+
62199+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
62200+{
62201+    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
62202+    AVDRMLayerDescriptor *layer;
62203+
62204+    /* fill the DRM frame descriptor */
62205+    drm_desc->nb_objects = avbuf->num_planes;
62206+    drm_desc->nb_layers = 1;
62207+
62208+    layer = &drm_desc->layers[0];
62209+    layer->nb_planes = avbuf->num_planes;
62210+
62211+    for (int i = 0; i < avbuf->num_planes; i++) {
62212+        layer->planes[i].object_index = i;
62213+        layer->planes[i].offset = 0;
62214+        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
62215+    }
62216+
62217+    switch (layer->format) {
62218+    case DRM_FORMAT_YUYV:
62219+        layer->nb_planes = 1;
62220+        break;
62221+
62222+    case DRM_FORMAT_NV12:
62223+    case DRM_FORMAT_NV21:
62224+        if (avbuf->num_planes > 1)
62225+            break;
62226+
62227+        layer->nb_planes = 2;
62228+
62229+        layer->planes[1].object_index = 0;
62230+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
62231+            height;
62232+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
62233+        break;
62234+
62235+    case DRM_FORMAT_YUV420:
62236+        if (avbuf->num_planes > 1)
62237+            break;
62238+
62239+        layer->nb_planes = 3;
62240+
62241+        layer->planes[1].object_index = 0;
62242+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
62243+            height;
62244+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
62245+
62246+        layer->planes[2].object_index = 0;
62247+        layer->planes[2].offset = layer->planes[1].offset +
62248+            ((avbuf->plane_info[0].bytesperline *
62249+              height) >> 2);
62250+        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
62251+        break;
62252+
62253+    default:
62254+        drm_desc->nb_layers = 0;
62255+        break;
62256+    }
62257+
62258+    return (uint8_t *) drm_desc;
62259+}
62260+
62261+// Dequeue one deinterlaced frame from queue into frame. timeout in ms (0 = poll)
62262+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
62263+{
62264+    DeintV4L2M2MContextShared *ctx = queue->ctx;
62265+    V4L2Buffer* avbuf;
62266+
62267+    av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
62268+
62269+    avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
62270+    if (!avbuf) {
62271+        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
62272+        return AVERROR(EAGAIN);
62273+    }
62274+
62275+    // Fill in PTS and ancillary info from src frame
62276+    // we will want to overwrite some fields as only the pts/dts
62277+    // fields are updated with new timing in this function
62278+    pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
62279+
62280+    frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
62281+                            sizeof(avbuf->drm_frame), v4l2_free_buffer,
62282+                            avbuf, AV_BUFFER_FLAG_READONLY);
62283+    if (!frame->buf[0]) {
62284+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
62285+        return AVERROR(ENOMEM);
62286+    }
62287+
62288+    atomic_fetch_add(&ctx->refcount, 1);
62289+
62290+    frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
62291+    frame->format = AV_PIX_FMT_DRM_PRIME;
62292+    if (ctx->hw_frames_ctx)
62293+        frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
62294+    frame->height = ctx->height;
62295+    frame->width = ctx->width;
62296+
62297+    // Deinterlaced output is progressive
62298+    frame->interlaced_frame = 0;
62299+    frame->top_field_first = 0;
62300+    // Pkt duration halved
62301+    frame->pkt_duration /= 2;
62302+
62303+    if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
62304+        av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
62305+        frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
62306+    }
62307+
62308+    av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
62309+    return 0;
62310+}
62311+
62312+static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
62313+{
62314+    AVFilterLink *inlink           = outlink->src->inputs[0];
62315+    AVFilterContext *avctx         = outlink->src;
62316+    DeintV4L2M2MContext *priv      = avctx->priv;
62317+    DeintV4L2M2MContextShared *ctx = priv->shared;
62318+    int ret;
62319+
62320+    ctx->height = avctx->inputs[0]->h;
62321+    ctx->width = avctx->inputs[0]->w;
62322+
62323+    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
62324+
62325+    outlink->time_base           = inlink->time_base;
62326+    outlink->w                   = inlink->w;
62327+    outlink->h                   = inlink->h;
62328+    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
62329+    outlink->format              = inlink->format;
62330+    outlink->frame_rate = (AVRational) {1, 0};  // Deny knowledge of frame rate
62331+
62332+    ret = deint_v4l2m2m_find_device(ctx);
62333+    if (ret)
62334+        return ret;
62335+
62336+    if (inlink->hw_frames_ctx) {
62337+        ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
62338+        if (!ctx->hw_frames_ctx)
62339+            return AVERROR(ENOMEM);
62340+    }
62341+    return 0;
62342+}
62343+
62344+static int deint_v4l2m2m_query_formats(AVFilterContext *avctx)
62345+{
62346+    static const enum AVPixelFormat pixel_formats[] = {
62347+        AV_PIX_FMT_DRM_PRIME,
62348+        AV_PIX_FMT_YUV420P,
62349+        AV_PIX_FMT_NONE,
62350+    };
62351+
62352+    return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats));
62353+}
62354+
62355+static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
62356+{
62357+    const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ||
62358+            drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID);
62359+
62360+    switch (drm_desc->layers[0].format) {
62361+    case DRM_FORMAT_YUV420:
62362+        if (is_linear)
62363+            return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0;
62364+        break;
62365+    case DRM_FORMAT_NV12:
62366+        if (is_linear)
62367+            return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0;
62368+        break;
62369+    default:
62370+        break;
62371+    }
62372+    return 0;
62373+}
62374+
62375+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
62376+{
62377+    AVFilterContext *avctx         = link->dst;
62378+    DeintV4L2M2MContext *priv      = avctx->priv;
62379+    DeintV4L2M2MContextShared *ctx = priv->shared;
62380+    V4L2Queue *capture             = &ctx->capture;
62381+    V4L2Queue *output              = &ctx->output;
62382+    int ret;
62383+
62384+    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
62385+          __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
62386+    av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
62387+           avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
62388+
62389+    if (ctx->field_order == V4L2_FIELD_ANY) {
62390+        const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
62391+        const uint32_t pixelformat = desc_pixelformat(drm_desc);
62392+
62393+        if (pixelformat == 0) {
62394+            av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
62395+                   av_fourcc2str(drm_desc->layers[0].format),
62396+                   drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
62397+            return AVERROR(EINVAL);
62398+        }
62399+
62400+        ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
62401+        ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
62402+
62403+        av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
62404+           drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
62405+
62406+        ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
62407+        if (ret)
62408+            return ret;
62409+
62410+        ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
62411+        if (ret)
62412+            return ret;
62413+
62414+        ret = deint_v4l2m2m_allocate_buffers(capture);
62415+        if (ret)
62416+            return ret;
62417+
62418+        ret = deint_v4l2m2m_streamon(capture);
62419+        if (ret)
62420+            return ret;
62421+
62422+        ret = deint_v4l2m2m_allocate_buffers(output);
62423+        if (ret)
62424+            return ret;
62425+
62426+        ret = deint_v4l2m2m_streamon(output);
62427+        if (ret)
62428+            return ret;
62429+
62430+        if (in->top_field_first)
62431+            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
62432+        else
62433+            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
62434+
62435+    }
62436+
62437+    ret = deint_v4l2m2m_enqueue_frame(output, in);
62438+
62439+    av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
62440+    return ret;
62441+}
62442+
62443+static int deint_v4l2m2m_activate(AVFilterContext *avctx)
62444+{
62445+    DeintV4L2M2MContext * const priv = avctx->priv;
62446+    DeintV4L2M2MContextShared *const s = priv->shared;
62447+    AVFilterLink * const outlink = avctx->outputs[0];
62448+    AVFilterLink * const inlink = avctx->inputs[0];
62449+    int n = 0;
62450+    int cn = 99;
62451+    int instatus = 0;
62452+    int64_t inpts = 0;
62453+    int did_something = 0;
62454+
62455+    av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
62456+
62457+    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
62458+
62459+    ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
62460+
62461+    if (!ff_outlink_frame_wanted(outlink)) {
62462+        av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
62463+    }
62464+    else if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
62465+    {
62466+        AVFrame * frame = av_frame_alloc();
62467+        int rv;
62468+
62469+again:
62470+        recycle_q(&s->output);
62471+        n = count_enqueued(&s->output);
62472+
62473+        if (frame == NULL) {
62474+            av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
62475+            return AVERROR(ENOMEM);
62476+        }
62477+
62478+        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
62479+        if (rv != 0) {
62480+            av_frame_free(&frame);
62481+            if (rv != AVERROR(EAGAIN)) {
62482+                av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
62483+                return rv;
62484+            }
62485+        }
62486+        else {
62487+            frame->interlaced_frame = 0;
62488+            // frame is always consumed by ff_filter_frame, even on error. NOTE(review):
62489+            // the EOF "goto again" below reuses this consumed pointer - confirm fix
62490+            rv = ff_filter_frame(outlink, frame);
62491+
62492+            if (instatus != 0) {
62493+                av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
62494+                goto again;
62495+            }
62496+
62497+            av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
62498+            did_something = 1;
62499+        }
62500+
62501+        cn = count_enqueued(&s->capture);
62502+    }
62503+
62504+    if (instatus != 0) {
62505+        ff_outlink_set_status(outlink, instatus, inpts);
62506+        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
62507+        return 0;
62508+    }
62509+
62510+    recycle_q(&s->output);
62511+    n = count_enqueued(&s->output);
62512+
62513+    while (n < 6) {
62514+        AVFrame * frame;
62515+        int rv;
62516+
62517+        if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
62518+            av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
62519+            return rv;
62520+        }
62521+
62522+        if (frame == NULL) {
62523+            av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
62524+            break;
62525+        }
62526+
62527+        rv = deint_v4l2m2m_filter_frame(inlink, frame);
62528+        av_frame_free(&frame);
62529+
62530+        if (rv != 0)
62531+            return rv;
62532+
62533+        av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
62534+        ++n;
62535+    }
62536+
62537+    if (n < 6) {
62538+        ff_inlink_request_frame(inlink);
62539+        did_something = 1;
62540+        av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
62541+    }
62542+
62543+    if (n > 4 && ff_outlink_frame_wanted(outlink)) {
62544+        ff_filter_set_ready(avctx, 1);
62545+        did_something = 1;
62546+        av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
62547+    }
62548+
62549+    av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
62550+    return did_something ? 0 : FFERROR_NOT_READY;
62551+}
62552+
62553+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
62554+{
62555+    DeintV4L2M2MContext * const priv = avctx->priv;
62556+    DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
62557+
62558+    if (!ctx) {
62559+        av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
62560+        return AVERROR(ENOMEM);
62561+    }
62562+    priv->shared = ctx;
62563+    ctx->logctx = priv;
62564+    ctx->fd = -1;
62565+    ctx->output.ctx = ctx;
62566+    ctx->output.num_buffers = 8;
62567+    ctx->capture.ctx = ctx;
62568+    ctx->capture.num_buffers = 12;
62569+    ctx->done = 0;
62570+    ctx->field_order = V4L2_FIELD_ANY;
62571+
62572+    pts_track_init(&ctx->track, priv);
62573+
62574+    atomic_init(&ctx->refcount, 1);
62575+
62576+    return 0;
62577+}
62578+
62579+static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
62580+{
62581+    DeintV4L2M2MContext *priv = avctx->priv;
62582+    DeintV4L2M2MContextShared *ctx = priv->shared;
62583+
62584+    ctx->done = 1;
62585+    ctx->logctx = NULL;  // Log to NULL works, log to missing crashes
62586+    pts_track_uninit(&ctx->track);
62587+    deint_v4l2m2m_destroy_context(ctx);
62588+}
62589+
62590+static const AVOption deinterlace_v4l2m2m_options[] = {
62591+    { NULL },
62592+};
62593+
62594+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
62595+
62596+static const AVFilterPad deint_v4l2m2m_inputs[] = {
62597+    {
62598+        .name         = "default",
62599+        .type         = AVMEDIA_TYPE_VIDEO,
62600+    },
62601+    { NULL }
62602+};
62603+
62604+static const AVFilterPad deint_v4l2m2m_outputs[] = {
62605+    {
62606+        .name          = "default",
62607+        .type          = AVMEDIA_TYPE_VIDEO,
62608+        .config_props  = deint_v4l2m2m_config_props,
62609+    },
62610+    { NULL }
62611+};
62612+
62613+AVFilter ff_vf_deinterlace_v4l2m2m = {
62614+    .name           = "deinterlace_v4l2m2m",
62615+    .description    = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
62616+    .priv_size      = sizeof(DeintV4L2M2MContext),
62617+    .init           = &deint_v4l2m2m_init,
62618+    .uninit         = &deint_v4l2m2m_uninit,
62619+    .query_formats  = &deint_v4l2m2m_query_formats,
62620+    .inputs         = deint_v4l2m2m_inputs,
62621+    .outputs        = deint_v4l2m2m_outputs,
62622+    .priv_class     = &deinterlace_v4l2m2m_class,
62623+    .activate       = deint_v4l2m2m_activate,
62624+};
62625--- /dev/null
62626+++ b/libavfilter/vf_unsand.c
62627@@ -0,0 +1,234 @@
62628+/*
62629+ * Copyright (c) 2007 Bobby Bingham
62630+ *
62631+ * This file is part of FFmpeg.
62632+ *
62633+ * FFmpeg is free software; you can redistribute it and/or
62634+ * modify it under the terms of the GNU Lesser General Public
62635+ * License as published by the Free Software Foundation; either
62636+ * version 2.1 of the License, or (at your option) any later version.
62637+ *
62638+ * FFmpeg is distributed in the hope that it will be useful,
62639+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
62640+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
62641+ * Lesser General Public License for more details.
62642+ *
62643+ * You should have received a copy of the GNU Lesser General Public
62644+ * License along with FFmpeg; if not, write to the Free Software
62645+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
62646+ */
62647+
62648+/**
62649+ * @file
62650+ * format and noformat video filters
62651+ */
62652+
62653+#include <string.h>
62654+
62655+#include "libavutil/internal.h"
62656+#include "libavutil/mem.h"
62657+#include "libavutil/pixdesc.h"
62658+#include "libavutil/opt.h"
62659+#include "libavutil/rpi_sand_fns.h"
62660+
62661+#include "avfilter.h"
62662+#include "formats.h"
62663+#include "internal.h"
62664+#include "video.h"
62665+
62666+typedef struct UnsandContext {
62667+    const AVClass *class;
62668+} UnsandContext;
62669+
62670+static av_cold void uninit(AVFilterContext *ctx)
62671+{
62672+//    UnsandContext *s = ctx->priv;
62673+}
62674+
62675+static av_cold int init(AVFilterContext *ctx)
62676+{
62677+//    UnsandContext *s = ctx->priv;
62678+
62679+    return 0;
62680+}
62681+
62682+
62683+static int filter_frame(AVFilterLink *link, AVFrame *in)
62684+{
62685+    AVFilterLink * const outlink = link->dst->outputs[0];
62686+    AVFrame *out = NULL;
62687+    int rv = 0;
62688+
62689+    if (outlink->format == in->format) {
62690+        // If nothing to do then do nothing
62691+        out = in;
62692+    }
62693+    else
62694+    {
62695+        if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
62696+        {
62697+            rv = AVERROR(ENOMEM);
62698+            goto fail;
62699+        }
62700+        if (av_rpi_sand_to_planar_frame(out, in) != 0)
62701+        {
62702+            rv = -1;
62703+            goto fail;
62704+        }
62705+
62706+        av_frame_free(&in);
62707+    }
62708+
62709+    return ff_filter_frame(outlink, out);
62710+
62711+fail:
62712+    av_frame_free(&out);
62713+    av_frame_free(&in);
62714+    return rv;
62715+}
62716+
62717+#if 0
62718+static void dump_fmts(const AVFilterFormats * fmts)
62719+{
62720+    int i;
62721+    if (fmts== NULL) {
62722+        printf("NULL\n");
62723+        return;
62724+    }
62725+    for (i = 0; i < fmts->nb_formats; ++i) {
62726+        printf(" %d", fmts->formats[i]);
62727+    }
62728+    printf("\n");
62729+}
62730+#endif
62731+
62732+static int query_formats(AVFilterContext *ctx)
62733+{
62734+//    UnsandContext *s = ctx->priv;
62735+    int ret;
62736+
62737+    // If we aren't connected at both ends then just do nothing
62738+    if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
62739+        return 0;
62740+
62741+//    printf("Unsand: %s in: ", __func__);
62742+//    dump_fmts(ctx->inputs[0]->in_formats);
62743+//    printf("Unsand: %s out: ", __func__);
62744+//    dump_fmts(ctx->outputs[0]->out_formats);
62745+
62746+    // Our output formats depend on our input formats and we can't/don't
62747+    // want to convert between bit depths so we need to wait for the source
62748+    // to have an opinion before we do
62749+    if (ctx->inputs[0]->in_formats == NULL)
62750+        return AVERROR(EAGAIN);
62751+
62752+    // Accept anything
62753+    if (ctx->inputs[0]->out_formats == NULL &&
62754+        (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0)
62755+        return ret;
62756+
62757+    // Filter out sand formats
62758+
62759+    // Generate a container if we don't already have one
62760+    if (ctx->outputs[0]->in_formats == NULL)
62761+    {
62762+        // Somewhat rubbish way of ensuring we have a good structure
62763+        const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
62764+        AVFilterFormats *formats = ff_make_format_list(out_fmts);
62765+
62766+        if (formats == NULL)
62767+            return AVERROR(ENOMEM);
62768+        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
62769+            return ret;
62770+    }
62771+
62772+    // Replace old format list with new filtered list derived from what our
62773+    // input says it can do (NOTE(review): av_malloc below lacks a NULL check)
62774+    {
62775+        const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
62776+        AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
62777+        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
62778+        int i;
62779+        int n = 0;
62780+        int seen_420p = 0;
62781+        int seen_420p10 = 0;
62782+
62783+        for (i = 0; i < src_ff->nb_formats; ++i) {
62784+            const enum AVPixelFormat f = src_ff->formats[i];
62785+
62786+            switch (f){
62787+                case AV_PIX_FMT_YUV420P:
62788+                case AV_PIX_FMT_SAND128:
62789+                case AV_PIX_FMT_RPI4_8:
62790+                    if (!seen_420p) {
62791+                        seen_420p = 1;
62792+                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
62793+                    }
62794+                    break;
62795+                case AV_PIX_FMT_SAND64_10:
62796+                case AV_PIX_FMT_YUV420P10:
62797+                case AV_PIX_FMT_RPI4_10:
62798+                    if (!seen_420p10) {
62799+                        seen_420p10 = 1;
62800+                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
62801+                    }
62802+                    break;
62803+                default:
62804+                    dst_fmts[n++] = f;
62805+                    break;
62806+            }
62807+        }
62808+
62809+        av_freep(&dst_ff->formats);
62810+        dst_ff->formats = dst_fmts;
62811+        dst_ff->nb_formats = n;
62812+    }
62813+
62814+//    printf("Unsand: %s calc: ", __func__);
62815+//    dump_fmts(ctx->outputs[0]->in_formats);
62816+
62817+    return 0;
62818+}
62819+
62820+
62821+#define OFFSET(x) offsetof(UnsandContext, x)
62822+static const AVOption unsand_options[] = {
62823+    { NULL }
62824+};
62825+
62826+
62827+AVFILTER_DEFINE_CLASS(unsand);
62828+
62829+static const AVFilterPad avfilter_vf_unsand_inputs[] = {
62830+    {
62831+        .name             = "default",
62832+        .type             = AVMEDIA_TYPE_VIDEO,
62833+        .filter_frame = filter_frame,
62834+    },
62835+    { NULL }
62836+};
62837+
62838+static const AVFilterPad avfilter_vf_unsand_outputs[] = {
62839+    {
62840+        .name = "default",
62841+        .type = AVMEDIA_TYPE_VIDEO
62842+    },
62843+    { NULL }
62844+};
62845+
62846+AVFilter ff_vf_unsand = {
62847+    .name          = "unsand",
62848+    .description   = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
62849+
62850+    .init          = init,
62851+    .uninit        = uninit,
62852+
62853+    .query_formats = query_formats,
62854+
62855+    .priv_size     = sizeof(UnsandContext),
62856+    .priv_class    = &unsand_class,
62857+
62858+    .inputs        = avfilter_vf_unsand_inputs,
62859+    .outputs       = avfilter_vf_unsand_outputs,
62860+};
62861+
62862--- a/libavformat/utils.c
62863+++ b/libavformat/utils.c
62864@@ -3051,6 +3051,40 @@ static int has_codec_parameters(AVStream
62865     return 1;
62866 }
62867
62868+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
62869+// This should be quite general purpose but avoid possible conflicts
62870+// by limiting usage to cases where we know it works.
62871+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
62872+{
62873+    // Only try fallback if we know it is supported (HEVC only)
62874+    const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
62875+        avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
62876+    int err;
62877+
62878+    // Failed to find fallback or we are already at the fallback
62879+    if (new_codec == NULL || new_codec == old_codec)
62880+    {
62881+        return AVERROR_DECODER_NOT_FOUND;
62882+    }
62883+
62884+    // * This may be dodgy - header says to not use this function,
62885+    //   especially if we are going to reopen the context...
62886+    //   (but it does seem to work for our cases)
62887+    if (avcodec_is_open(avctx)) {
62888+        avcodec_close(avctx);
62889+    }
62890+
62891+    if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
62892+    {
62893+        return err;
62894+    }
62895+
62896+    return 0;
62897+}
62898+#else
62899+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
62900+#endif
62901+
62902 /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
62903 static int try_decode_frame(AVFormatContext *s, AVStream *st,
62904                             const AVPacket *avpkt, AVDictionary **options)
62905@@ -3085,7 +3119,11 @@ static int try_decode_frame(AVFormatCont
62906         av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
62907         if (s->codec_whitelist)
62908             av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
62909-        ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
62910+        if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
62911+        {
62912+            // Try fallback if it looks worth a try
62913+            ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
62914+        }
62915         if (!options)
62916             av_dict_free(&thread_opt);
62917         if (ret < 0) {
62918@@ -3116,6 +3154,14 @@ static int try_decode_frame(AVFormatCont
62919         if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
62920             avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
62921             ret = avcodec_send_packet(avctx, &pkt);
62922+
62923+            // If we are going to want to fall back we should know here
62924+            if (ret == AVERROR_DECODER_NOT_FOUND) {
62925+                if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
62926+                    break;
62927+                continue;
62928+            }
62929+
62930             if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
62931                 break;
62932             if (ret >= 0)
62933@@ -3726,9 +3772,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
62934         // Try to just open decoders, in case this is enough to get parameters.
62935         if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
62936             if (codec && !avctx->codec)
62937-                if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
62938-                    av_log(ic, AV_LOG_WARNING,
62939-                           "Failed to open codec in %s\n",__FUNCTION__);
62940+            {
62941+                int err;
62942+
62943+                if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
62944+                {
62945+                    if (err == AVERROR_DECODER_NOT_FOUND) {
62946+                        err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
62947+                    }
62948+                    if (err < 0) {
62949+                        av_log(ic, AV_LOG_WARNING,
62950+                               "Failed to open codec in %s\n",__FUNCTION__);
62951+                    }
62952+                }
62953+            }
62954         }
62955         if (!options)
62956             av_dict_free(&thread_opt);
62957--- a/libavutil/Makefile
62958+++ b/libavutil/Makefile
62959@@ -68,6 +68,7 @@ HEADERS = adler32.h
62960           rational.h                                                    \
62961           replaygain.h                                                  \
62962           ripemd.h                                                      \
62963+	  rpi_sand_fns.h                                                \
62964           samplefmt.h                                                   \
62965           sha.h                                                         \
62966           sha512.h                                                      \
62967@@ -86,6 +87,7 @@ HEADERS = adler32.h
62968           tx.h                                                          \
62969
62970 HEADERS-$(CONFIG_LZO)                   += lzo.h
62971+HEADERS-$(CONFIG_RPI)                   += rpi_sand_fn_pw.h
62972
62973 ARCH_HEADERS = bswap.h                                                  \
62974                intmath.h                                                \
62975@@ -180,6 +182,7 @@ OBJS-$(CONFIG_LZO)
62976 OBJS-$(CONFIG_MEDIACODEC)               += hwcontext_mediacodec.o
62977 OBJS-$(CONFIG_OPENCL)                   += hwcontext_opencl.o
62978 OBJS-$(CONFIG_QSV)                      += hwcontext_qsv.o
62979+OBJS-$(CONFIG_SAND)                     += rpi_sand_fns.o
62980 OBJS-$(CONFIG_VAAPI)                    += hwcontext_vaapi.o
62981 OBJS-$(CONFIG_VIDEOTOOLBOX)             += hwcontext_videotoolbox.o
62982 OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
62983--- a/libavutil/aarch64/Makefile
62984+++ b/libavutil/aarch64/Makefile
62985@@ -1,4 +1,6 @@
62986 OBJS += aarch64/cpu.o                                                 \
62987         aarch64/float_dsp_init.o                                      \
62988
62989-NEON-OBJS += aarch64/float_dsp_neon.o
62990+NEON-OBJS += aarch64/float_dsp_neon.o                                 \
62991+             aarch64/rpi_sand_neon.o                                  \
62992+
62993--- /dev/null
62994+++ b/libavutil/aarch64/rpi_sand_neon.S
62995@@ -0,0 +1,781 @@
62996+/*
62997+Copyright (c) 2021 Michael Eiler
62998+
62999+Redistribution and use in source and binary forms, with or without
63000+modification, are permitted provided that the following conditions are met:
63001+    * Redistributions of source code must retain the above copyright
63002+      notice, this list of conditions and the following disclaimer.
63003+    * Redistributions in binary form must reproduce the above copyright
63004+      notice, this list of conditions and the following disclaimer in the
63005+      documentation and/or other materials provided with the distribution.
63006+    * Neither the name of the copyright holder nor the
63007+      names of its contributors may be used to endorse or promote products
63008+      derived from this software without specific prior written permission.
63009+
63010+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
63011+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
63012+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
63013+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
63014+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
63015+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
63016+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
63017+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
63018+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63019+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63020+
63021+Authors: Michael Eiler <eiler.mike@gmail.com>
63022+*/
63023+
63024+#include "asm.S"
63025+
63026+// void ff_rpi_sand8_lines_to_planar_y8(
63027+//   uint8_t * dest,            : x0
63028+//   unsigned int dst_stride,   : w1
63029+//   const uint8_t * src,       : x2
63030+//   unsigned int src_stride1,  : w3, always 128
63031+//   unsigned int src_stride2,  : w4
63032+//   unsigned int _x,           : w5
63033+//   unsigned int y,            : w6
63034+//   unsigned int _w,           : w7
63035+//   unsigned int h);           : [sp, #0]
63036+
63037+function ff_rpi_sand8_lines_to_planar_y8, export=1
63038+    // w15 contains the number of rows we need to process
63039+    ldr w15, [sp, #0]
63040+
63041+    // w8 will contain the number of blocks per row
63042+    // w8 = floor(_w/stride1) -- NOTE(review): code below reads w1 (dst_stride), not w7 (_w); assumes they are equal -- verify
63043+    // stride1 is assumed to always be 128
63044+    mov w8, w1
63045+    lsr w8, w8, #7
63046+
63047+    // in case the width of the image is not a multiple of 128, there will
63048+    // be an incomplete block at the end of every row
63049+    // w9 contains the number of pixels stored within this block
63050+    // w9 = _w - w8 * 128
63051+    lsl w9, w8, #7
63052+    sub w9, w7, w9
63053+
63054+    // this is the value we have to add to the src pointer after reading a complete block
63055+    // it will move the address to the start of the next block
63056+    // w10 = stride2 * stride1 - stride1
63057+    mov w10, w4
63058+    lsl w10, w10, #7
63059+    sub w10, w10, #128
63060+
63061+    // w11 is the row offset, meaning the start offset of the first block of every column
63062+    // this will be increased with stride1 within every iteration of the row_loop
63063+    eor w11, w11, w11
63064+
63065+    // w12 = 0, processed row count
63066+    eor w12, w12, w12
63067+row_loop:
63068+    // start of the first block within the current row
63069+    // x13 = row offset + src
63070+    mov x13, x2
63071+    add x13, x13, x11
63072+
63073+    // w14 = 0, processed block count
63074+    eor w14, w14, w14
63075+
63076+    cmp w8, #0
63077+    beq no_main_y8
63078+
63079+block_loop:
63080+    // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
63081+    // fortunately these aren't callee saved ones, meaning we don't need to backup them
63082+    ld1 { v0.16b,  v1.16b,  v2.16b,  v3.16b}, [x13], #64
63083+    ld1 { v4.16b,  v5.16b,  v6.16b,  v7.16b}, [x13], #64
63084+
63085+    // write these registers back to the destination vector and increase the dst address by 128
63086+    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
63087+    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x0], #64
63088+
63089+    // move the source register to the beginning of the next block (x13 = src + block offset)
63090+    add x13, x13, x10
63091+    // increase the block counter
63092+    add w14, w14, #1
63093+
63094+    // continue with the block_loop if we haven't copied all full blocks yet
63095+    cmp w8, w14
63096+    bgt block_loop
63097+
63098+    // handle the last block at the end of each row
63099+    // at most 127 byte values copied from src to dst
63100+no_main_y8:
63101+    eor w5, w5, w5 // i = 0
63102+incomplete_block_loop_y8:
63103+    cmp w5, w9
63104+    bge incomplete_block_loop_end_y8
63105+
63106+    ldrb w6, [x13]
63107+    strb w6, [x0]
63108+    add x13, x13, #1
63109+    add x0, x0, #1
63110+
63111+    add w5, w5, #1
63112+    b incomplete_block_loop_y8
63113+incomplete_block_loop_end_y8:
63114+
63115+
63116+    // increase the row offset by 128 (stride1)
63117+    add w11, w11, #128
63118+    // increment the row counter
63119+    add w12, w12, #1
63120+
63121+    // process the next row if we haven't finished yet
63122+    cmp w15, w12
63123+    bgt row_loop
63124+
63125+    ret
63126+endfunc
63127+
63128+
63129+
63130+// void ff_rpi_sand8_lines_to_planar_c8(
63131+//   uint8_t * dst_u,           : x0
63132+//   unsigned int dst_stride_u, : w1 == width
63133+//   uint8_t * dst_v,           : x2
63134+//   unsigned int dst_stride_v, : w3 == width
63135+//   const uint8_t * src,       : x4
63136+//   unsigned int stride1,      : w5 == 128
63137+//   unsigned int stride2,      : w6
63138+//   unsigned int _x,           : w7
63139+//   unsigned int y,            : [sp, #0]
63140+//   unsigned int _w,           : [sp, #8]
63141+//   unsigned int h);           : [sp, #16]
63142+
63143+function ff_rpi_sand8_lines_to_planar_c8, export=1
63144+    // w7 = width
63145+    ldr w7, [sp, #8]
63146+
63147+    // w15 contains the number of rows we need to process
63148+    // counts down
63149+    ldr w15, [sp, #16]
63150+
63151+    // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
63152+    mov w8, w7
63153+    lsr w8, w8, #6
63154+
63155+    // number of pixels in block at the end of every row
63156+    // w9 = _w - (w8 * 64)
63157+    lsl w9, w8, #6
63158+    sub w9, w7, w9
63159+
63160+    // Skip at the end of the line to account for stride
63161+    sub w12, w1, w7
63162+
63163+    // address delta to the beginning of the next block
63164+    // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
63165+    lsl w10, w6, #7
63166+    sub w10, w10, #128
63167+
63168+    // w11 = row address start offset = 0
63169+    eor w11, w11, w11
63170+
63171+row_loop_c8:
63172+    // start of the first block within the current row
63173+    // x13 = row offset + src
63174+    mov x13, x4
63175+    add x13, x13, x11
63176+
63177+    // w14 = 0, processed block count
63178+    eor w14, w14, w14
63179+
63180+    cmp w8, #0
63181+    beq no_main_c8
63182+
63183+block_loop_c8:
63184+    // load the full block -> 128 bytes, the block contains 64 interleaved U and V values
63185+    ld2 { v0.16b,  v1.16b }, [x13], #32
63186+    ld2 { v2.16b,  v3.16b }, [x13], #32
63187+    ld2 { v4.16b,  v5.16b }, [x13], #32
63188+    ld2 { v6.16b,  v7.16b }, [x13], #32
63189+
63190+    // swap register so that we can write them out with a single instruction
63191+    mov v16.16b, v1.16b
63192+    mov v17.16b, v3.16b
63193+    mov v18.16b, v5.16b
63194+    mov v1.16b, v2.16b
63195+    mov v2.16b, v4.16b
63196+    mov v3.16b, v6.16b
63197+    mov v4.16b, v16.16b
63198+    mov v5.16b, v17.16b
63199+    mov v6.16b, v18.16b
63200+
63201+    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
63202+    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x2], #64
63203+
63204+    // increment row counter and move src to the beginning of the next block
63205+    add w14, w14, #1
63206+    add x13, x13, x10
63207+
63208+    // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
63209+    cmp w8, w14
63210+    bgt block_loop_c8
63211+
63212+no_main_c8:
63213+    // handle incomplete block at the end of every row
63214+    eor w5, w5, w5 // w5 = pixel counter for the incomplete trailing block
63215+incomplete_block_loop_c8:
63216+    cmp w5, w9
63217+    bge incomplete_block_loop_end_c8
63218+
63219+    ldrb w1, [x13]
63220+    strb w1, [x0]
63221+    add x13, x13, #1
63222+
63223+    ldrb w1, [x13]
63224+    strb w1, [x2]
63225+    add x13, x13, #1
63226+
63227+    add x0, x0, #1
63228+    add x2, x2, #1
63229+
63230+    add w5, w5, #1
63231+    b incomplete_block_loop_c8
63232+incomplete_block_loop_end_c8:
63233+
63234+    // increase row_offset by stride1
63235+    add w11, w11, #128
63236+    add x0, x0, w12, sxtw
63237+    add x2, x2, w12, sxtw
63238+
63239+    // jump to row_loop_c8 if the row count is smaller than the height
63240+    subs w15, w15, #1
63241+    bgt row_loop_c8
63242+
63243+    ret
63244+endfunc
63245+
63246+//void ff_rpi_sand30_lines_to_planar_c16(
63247+//  uint8_t * dst_u,            // [x0]
63248+//  unsigned int dst_stride_u,  // [w1] == _w*2
63249+//  uint8_t * dst_v,            // [x2]
63250+//  unsigned int dst_stride_v,  // [w3] == _w*2
63251+//  const uint8_t * src,        // [x4]
63252+//  unsigned int stride1,       // [w5] == 128
63253+//  unsigned int stride2,       // [w6]
63254+//  unsigned int _x,            // [w7] == 0
63255+//  unsigned int y,             // [sp, #0] == 0
63256+//  unsigned int _w,            // [sp, #8] -> w3
63257+//  unsigned int h);            // [sp, #16] -> w7
63258+
63259+.macro rpi_sand30_lines_to_planar_c16_block_half
63260+    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
63261+
63262+    xtn v4.4h, v0.4s
63263+    ushr v0.4s, v0.4s, #10
63264+    xtn v5.4h, v0.4s
63265+    ushr v0.4s, v0.4s, #10
63266+    xtn v6.4h, v0.4s
63267+    xtn2 v4.8h, v1.4s
63268+    ushr v1.4s, v1.4s, #10
63269+    xtn2 v5.8h, v1.4s
63270+    ushr v1.4s, v1.4s, #10
63271+    xtn2 v6.8h, v1.4s
63272+    and v4.16b, v4.16b, v16.16b
63273+    and v5.16b, v5.16b, v16.16b
63274+    and v6.16b, v6.16b, v16.16b
63275+    st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
63276+
63277+    xtn v4.4h, v2.4s
63278+    ushr v2.4s, v2.4s, #10
63279+    xtn v5.4h, v2.4s
63280+    ushr v2.4s, v2.4s, #10
63281+    xtn v6.4h, v2.4s
63282+    xtn2 v4.8h, v3.4s
63283+    ushr v3.4s, v3.4s, #10
63284+    xtn2 v5.8h, v3.4s
63285+    ushr v3.4s, v3.4s, #10
63286+    xtn2 v6.8h, v3.4s
63287+    and v4.16b, v4.16b, v16.16b
63288+    and v5.16b, v5.16b, v16.16b
63289+    and v6.16b, v6.16b, v16.16b
63290+    st3 { v4.8h, v5.8h, v6.8h }, [sp]
63291+    sub sp, sp, #48
63292+.endm
63293+
63294+function ff_rpi_sand30_lines_to_planar_c16, export=1
63295+    stp x19, x20, [sp, #-48]!
63296+    stp x21, x22, [sp, #16]
63297+    stp x23, x24, [sp, #32]
63298+
63299+    ldr w3, [sp, #48+8]    // w3 = width
63300+    ldr w7, [sp, #48+16]   // w7 = height
63301+
63302+    // reserve space on the stack for intermediate results
63303+    sub sp, sp, #256
63304+
63305+    // number of 128byte blocks per row, w8 = width / 48
63306+    mov w9, #48
63307+    udiv w8, w3, w9
63308+
63309+    // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
63310+    mul w9, w8, w9
63311+    sub w9, w3, w9
63312+
63313+    // row offset, the beginning of the next row to process
63314+    eor w10, w10, w10
63315+
63316+    // offset to the beginning of the next block, w11 = stride2 * 128 - 128
63317+    lsl w11, w6, #7
63318+    sub w11, w11, #128
63319+
63320+    // decrease the height by one and in case of remaining pixels increase the block count by one
63321+    sub w7, w7, #1
63322+    cmp w9, #0
63323+    cset w19, ne    // w19 == 1 iff remaining pixels != 0
63324+    add w8, w8, w19
63325+
63326+    // bytes we have to move dst back by at the end of every row
63327+    mov w21, #48*2
63328+    mul w21, w21, w8
63329+    sub w21, w1, w21
63330+
63331+    mov w20, #0     // w20 = flag, last row processed
63332+
63333+    mov x12, #0x03ff03ff03ff03ff
63334+    dup v16.2d, x12
63335+
63336+    // iterate through rows, row counter = w12 = 0
63337+    eor w12, w12, w12
63338+row_loop_c16:
63339+    cmp w12, w7
63340+    bge row_loop_c16_fin
63341+
63342+    // address of row data = src + row_offset
63343+    mov x13, x4
63344+    add x13, x13, x10
63345+
63346+    eor w14, w14, w14
63347+block_loop_c16:
63348+    cmp w14, w8
63349+    bge block_loop_c16_fin
63350+
63351+    rpi_sand30_lines_to_planar_c16_block_half
63352+
63353+    ld2 { v0.8h, v1.8h }, [sp], #32
63354+    ld2 { v2.8h, v3.8h }, [sp], #32
63355+    ld2 { v4.8h, v5.8h }, [sp]
63356+    sub sp, sp, #64
63357+
63358+    st1 { v0.8h }, [x0], #16
63359+    st1 { v2.8h }, [x0], #16
63360+    st1 { v4.8h }, [x0], #16
63361+    st1 { v1.8h }, [x2], #16
63362+    st1 { v3.8h }, [x2], #16
63363+    st1 { v5.8h }, [x2], #16
63364+
63365+    rpi_sand30_lines_to_planar_c16_block_half
63366+
63367+    ld2 { v0.8h, v1.8h }, [sp], #32
63368+    ld2 { v2.8h, v3.8h }, [sp], #32
63369+    ld2 { v4.8h, v5.8h }, [sp]
63370+    sub sp, sp, #64
63371+
63372+    st1 { v0.8h }, [x0], #16
63373+    st1 { v2.8h }, [x0], #16
63374+    st1 { v4.8h }, [x0], #16
63375+    st1 { v1.8h }, [x2], #16
63376+    st1 { v3.8h }, [x2], #16
63377+    st1 { v5.8h }, [x2], #16
63378+
63379+    add x13, x13, x11 // offset to next block
63380+    add w14, w14, #1
63381+    b block_loop_c16
63382+block_loop_c16_fin:
63383+
63384+    add w10, w10, #128
63385+    add w12, w12, #1
63386+    add x0, x0, w21, sxtw  // move dst pointers back by x21
63387+    add x2, x2, w21, sxtw
63388+    b row_loop_c16
63389+row_loop_c16_fin:
63390+
63391+    cmp w20, #1
63392+    beq row_loop_c16_fin2
63393+    mov w20, #1
63394+    sub w8, w8, w19 // decrease block count by w19
63395+    add w7, w7, #1 // increase height
63396+    b row_loop_c16
63397+
63398+row_loop_c16_fin2:
63399+    sub x0, x0, w21, sxtw // re-add x21 in case of the last row
63400+    sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
63401+
63402+    // last incomplete block to be finished
63403+    // read operations are fine, stride2 is more than large enough even if rem_pix is 0
63404+    rpi_sand30_lines_to_planar_c16_block_half
63405+    ld2 { v0.8h, v1.8h }, [sp], #32
63406+    ld2 { v2.8h, v3.8h }, [sp], #32
63407+    ld2 { v4.8h, v5.8h }, [sp], #32
63408+    rpi_sand30_lines_to_planar_c16_block_half
63409+    ld2 { v0.8h, v1.8h }, [sp], #32
63410+    ld2 { v2.8h, v3.8h }, [sp], #32
63411+    ld2 { v4.8h, v5.8h }, [sp]
63412+    sub sp, sp, #160
63413+
63414+    mov x4, sp
63415+    eor w20, w20, w20
63416+rem_pix_c16_loop:
63417+    cmp w20, w9
63418+    bge rem_pix_c16_fin
63419+
63420+    ldr w22, [x4], #4
63421+    str w22, [x0], #2
63422+    lsr w22, w22, #16
63423+    str w22, [x2], #2
63424+
63425+    add w20, w20, #1
63426+    b rem_pix_c16_loop
63427+rem_pix_c16_fin:
63428+
63429+    add sp, sp, #256
63430+
63431+    ldp x23, x24, [sp, #32]
63432+    ldp x21, x22, [sp, #16]
63433+    ldp x19, x20, [sp], #48
63434+    ret
63435+endfunc
63436+
63437+
63438+
63439+//void ff_rpi_sand30_lines_to_planar_p010(
63440+//  uint8_t * dest,
63441+//  unsigned int dst_stride,
63442+//  const uint8_t * src,
63443+//  unsigned int src_stride1,
63444+//  unsigned int src_stride2,
63445+//  unsigned int _x,
63446+//  unsigned int y,
63447+//  unsigned int _w,
63448+//  unsigned int h);
63449+
63450+// void ff_rpi_sand30_lines_to_planar_y16(
63451+//   uint8_t * dest,            : x0
63452+//   unsigned int dst_stride,   : w1
63453+//   const uint8_t * src,       : x2
63454+//   unsigned int src_stride1,  : w3, always 128
63455+//   unsigned int src_stride2,  : w4
63456+//   unsigned int _x,           : w5
63457+//   unsigned int y,            : w6
63458+//   unsigned int _w,           : w7
63459+//   unsigned int h);           : [sp, #0]
63460+//
63461+// Assumes that we are starting on a stripe boundary and that overreading
63462+// within the stripe is OK. However it does respect the dest size for writes
63463+
63464+function ff_rpi_sand30_lines_to_planar_y16, export=1
63465+                lsl             w4,  w4,  #7
63466+                sub             w4,  w4,  #64
63467+                sub             w1,  w1,  w7, lsl #1
63468+                uxtw            x6,  w6
63469+                add             x8,  x2,  x6, lsl #7
63470+                ldr             w6,  [sp, #0]
63471+
63472+10:
63473+                mov             x2,  x8
63474+                mov             w5,  w7
63475+1:
63476+                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
63477+                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
63478+
63479+                subs            w5,  w5,  #96
63480+
63481+                // v0, v1
63482+
63483+                shrn            v18.4h,  v0.4s,   #14
63484+                xtn             v16.4h,  v0.4s
63485+                shrn            v17.4h,  v0.4s,   #10
63486+
63487+                shrn2           v18.8h,  v1.4s,   #14
63488+                xtn2            v16.8h,  v1.4s
63489+                shrn2           v17.8h,  v1.4s,   #10
63490+
63491+                ushr            v18.8h,  v18.8h,  #6
63492+                bic             v16.8h,  #0xfc,   lsl #8
63493+                bic             v17.8h,  #0xfc,   lsl #8
63494+
63495+                // v2, v3
63496+
63497+                shrn            v21.4h,  v2.4s,   #14
63498+                xtn             v19.4h,  v2.4s
63499+                shrn            v20.4h,  v2.4s,   #10
63500+
63501+                shrn2           v21.8h,  v3.4s,   #14
63502+                xtn2            v19.8h,  v3.4s
63503+                shrn2           v20.8h,  v3.4s,   #10
63504+
63505+                ushr            v21.8h,  v21.8h,  #6
63506+                bic             v19.8h,  #0xfc,   lsl #8
63507+                bic             v20.8h,  #0xfc,   lsl #8
63508+
63509+                // v4, v5
63510+
63511+                shrn            v24.4h,  v4.4s,   #14
63512+                xtn             v22.4h,  v4.4s
63513+                shrn            v23.4h,  v4.4s,   #10
63514+
63515+                shrn2           v24.8h,  v5.4s,   #14
63516+                xtn2            v22.8h,  v5.4s
63517+                shrn2           v23.8h,  v5.4s,   #10
63518+
63519+                ushr            v24.8h,  v24.8h,  #6
63520+                bic             v22.8h,  #0xfc,   lsl #8
63521+                bic             v23.8h,  #0xfc,   lsl #8
63522+
63523+                // v6, v7
63524+
63525+                shrn            v27.4h,  v6.4s,   #14
63526+                xtn             v25.4h,  v6.4s
63527+                shrn            v26.4h,  v6.4s,   #10
63528+
63529+                shrn2           v27.8h,  v7.4s,   #14
63530+                xtn2            v25.8h,  v7.4s
63531+                shrn2           v26.8h,  v7.4s,   #10
63532+
63533+                ushr            v27.8h,  v27.8h,  #6
63534+                bic             v25.8h,  #0xfc,   lsl #8
63535+                bic             v26.8h,  #0xfc,   lsl #8
63536+
63537+                blt             2f
63538+
63539+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
63540+                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
63541+                st3             {v22.8h, v23.8h, v24.8h}, [x0], #48
63542+                st3             {v25.8h, v26.8h, v27.8h}, [x0], #48
63543+
63544+                bne             1b
63545+
63546+11:
63547+                subs            w6,  w6,  #1
63548+                add             x0,  x0,  w1,  uxtw
63549+                add             x8,  x8,  #128
63550+                bne             10b
63551+
63552+                ret
63553+
63554+// Partial final write
63555+2:
63556+                cmp             w5,  #48-96
63557+                blt             1f
63558+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
63559+                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
63560+                beq             11b
63561+                mov             v16.16b, v22.16b
63562+                mov             v17.16b, v23.16b
63563+                sub             w5,  w5,  #48
63564+                mov             v18.16b, v24.16b
63565+                mov             v19.16b, v25.16b
63566+                mov             v20.16b, v26.16b
63567+                mov             v21.16b, v27.16b
63568+1:
63569+                cmp             w5,  #24-96
63570+                blt             1f
63571+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
63572+                beq             11b
63573+                mov             v16.16b, v19.16b
63574+                mov             v17.16b, v20.16b
63575+                sub             w5,  w5,  #24
63576+                mov             v18.16b, v21.16b
63577+1:
63578+                cmp             w5,  #12-96
63579+                blt             1f
63580+                st3             {v16.4h, v17.4h, v18.4h}, [x0], #24
63581+                beq             11b
63582+                mov             v16.2d[0], v16.2d[1]
63583+                sub             w5,  w5,  #12
63584+                mov             v17.2d[0], v17.2d[1]
63585+                mov             v18.2d[0], v18.2d[1]
63586+1:
63587+                cmp             w5,  #6-96
63588+                blt             1f
63589+                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
63590+                st3             {v16.h, v17.h, v18.h}[1], [x0], #6
63591+                beq             11b
63592+                mov             v16.2s[0], v16.2s[1]
63593+                sub             w5,  w5,  #6
63594+                mov             v17.2s[0], v17.2s[1]
63595+                mov             v18.2s[0], v18.2s[1]
63596+1:
63597+                cmp             w5,  #3-96
63598+                blt             1f
63599+                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
63600+                beq             11b
63601+                mov             v16.4h[0], v16.4h[1]
63602+                sub             w5,  w5,  #3
63603+                mov             v17.4h[0], v17.4h[1]
63604+1:
63605+                cmp             w5,  #2-96
63606+                blt             1f
63607+                st2             {v16.h, v17.h}[0], [x0], #4
63608+                b               11b
63609+1:
63610+                st1             {v16.h}[0], [x0], #2
63611+                b               11b
63612+
63613+endfunc
63614+
63615+// void ff_rpi_sand30_lines_to_planar_y8(
63616+//   uint8_t * dest,            : x0
63617+//   unsigned int dst_stride,   : w1
63618+//   const uint8_t * src,       : x2
63619+//   unsigned int src_stride1,  : w3, always 128
63620+//   unsigned int src_stride2,  : w4
63621+//   unsigned int _x,           : w5
63622+//   unsigned int y,            : w6
63623+//   unsigned int _w,           : w7
63624+//   unsigned int h);           : [sp, #0]
63625+//
63626+// Assumes that we are starting on a stripe boundary and that overreading
63627+// within the stripe is OK. However it does respect the dest size for writes
63628+
63629+function ff_rpi_sand30_lines_to_planar_y8, export=1
63630+                lsl             w4,  w4,  #7
63631+                sub             w4,  w4,  #64
63632+                sub             w1,  w1,  w7
63633+                uxtw            x6,  w6
63634+                add             x8,  x2,  x6, lsl #7
63635+                ldr             w6,  [sp, #0]
63636+
63637+10:
63638+                mov             x2,  x8
63639+                mov             w5,  w7
63640+1:
63641+                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
63642+                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
63643+
63644+                subs            w5,  w5,  #96
63645+
63646+                // v0, v1
63647+
63648+                shrn            v18.4h,  v0.4s,   #16
63649+                xtn             v16.4h,  v0.4s
63650+                shrn            v17.4h,  v0.4s,   #12
63651+
63652+                shrn2           v18.8h,  v1.4s,   #16
63653+                xtn2            v16.8h,  v1.4s
63654+                shrn2           v17.8h,  v1.4s,   #12
63655+
63656+                shrn            v18.8b,  v18.8h,  #6
63657+                shrn            v16.8b,  v16.8h,  #2
63658+                xtn             v17.8b,  v17.8h
63659+
63660+                // v2, v3
63661+
63662+                shrn            v21.4h,  v2.4s,   #16
63663+                xtn             v19.4h,  v2.4s
63664+                shrn            v20.4h,  v2.4s,   #12
63665+
63666+                shrn2           v21.8h,  v3.4s,   #16
63667+                xtn2            v19.8h,  v3.4s
63668+                shrn2           v20.8h,  v3.4s,   #12
63669+
63670+                shrn2           v18.16b, v21.8h,  #6
63671+                shrn2           v16.16b, v19.8h,  #2
63672+                xtn2            v17.16b, v20.8h
63673+
63674+                // v4, v5
63675+
63676+                shrn            v24.4h,  v4.4s,   #16
63677+                xtn             v22.4h,  v4.4s
63678+                shrn            v23.4h,  v4.4s,   #12
63679+
63680+                shrn2           v24.8h,  v5.4s,   #16
63681+                xtn2            v22.8h,  v5.4s
63682+                shrn2           v23.8h,  v5.4s,   #12
63683+
63684+                shrn            v21.8b,  v24.8h,  #6
63685+                shrn            v19.8b,  v22.8h,  #2
63686+                xtn             v20.8b,  v23.8h
63687+
63688+                // v6, v7
63689+
63690+                shrn            v27.4h,  v6.4s,   #16
63691+                xtn             v25.4h,  v6.4s
63692+                shrn            v26.4h,  v6.4s,   #12
63693+
63694+                shrn2           v27.8h,  v7.4s,   #16
63695+                xtn2            v25.8h,  v7.4s
63696+                shrn2           v26.8h,  v7.4s,   #12
63697+
63698+                shrn2           v21.16b, v27.8h,  #6
63699+                shrn2           v19.16b, v25.8h,  #2
63700+                xtn2            v20.16b, v26.8h
63701+
63702+                blt             2f
63703+
63704+                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
63705+                st3             {v19.16b, v20.16b, v21.16b}, [x0], #48
63706+
63707+                bne             1b
63708+
63709+11:
63710+                subs            w6,  w6,  #1
63711+                add             x0,  x0,  w1,  uxtw
63712+                add             x8,  x8,  #128
63713+                bne             10b
63714+
63715+                ret
63716+
63717+// Partial final write
63718+2:
63719+                cmp             w5,  #48-96
63720+                blt             1f
63721+                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
63722+                beq             11b
63723+                mov             v16.16b, v22.16b
63724+                mov             v17.16b, v23.16b
63725+                sub             w5,  w5,  #48
63726+                mov             v18.16b, v24.16b
63727+1:
63728+                cmp             w5,  #24-96
63729+                blt             1f
63730+                st3             {v16.8b, v17.8b, v18.8b}, [x0], #24
63731+                beq             11b
63732+                mov             v16.2d[0], v16.2d[1]
63733+                sub             w5,  w5,  #24
63734+                mov             v17.2d[0], v17.2d[1]
63735+                mov             v18.2d[0], v18.2d[1]
63736+1:
63737+                cmp             w5,  #12-96
63738+                blt             1f
63739+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
63740+                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
63741+                st3             {v16.b, v17.b, v18.b}[2], [x0], #3
63742+                st3             {v16.b, v17.b, v18.b}[3], [x0], #3
63743+                beq             11b
63744+                mov             v16.2s[0], v16.2s[1]
63745+                sub             w5,  w5,  #12
63746+                mov             v17.2s[0], v17.2s[1]
63747+                mov             v18.2s[0], v18.2s[1]
63748+1:
63749+                cmp             w5,  #6-96
63750+                blt             1f
63751+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
63752+                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
63753+                beq             11b
63754+                mov             v16.4h[0], v16.4h[1]
63755+                sub             w5,  w5,  #6
63756+                mov             v17.4h[0], v17.4h[1]
63757+                mov             v18.4h[0], v18.4h[1]
63758+1:
63759+                cmp             w5,  #3-96
63760+                blt             1f
63761+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
63762+                beq             11b
63763+                mov             v16.8b[0], v16.8b[1]
63764+                sub             w5,  w5,  #3
63765+                mov             v17.8b[0], v17.8b[1]
63766+1:
63767+                cmp             w5,  #2-96
63768+                blt             1f
63769+                st2             {v16.b, v17.b}[0], [x0], #2
63770+                b               11b
63771+1:
63772+                st1             {v16.b}[0], [x0], #1
63773+                b               11b
63774+
63775+endfunc
63776+
63777--- /dev/null
63778+++ b/libavutil/aarch64/rpi_sand_neon.h
63779@@ -0,0 +1,59 @@
63780+/*
63781+Copyright (c) 2021 Michael Eiler
63782+
63783+Redistribution and use in source and binary forms, with or without
63784+modification, are permitted provided that the following conditions are met:
63785+    * Redistributions of source code must retain the above copyright
63786+      notice, this list of conditions and the following disclaimer.
63787+    * Redistributions in binary form must reproduce the above copyright
63788+      notice, this list of conditions and the following disclaimer in the
63789+      documentation and/or other materials provided with the distribution.
63790+    * Neither the name of the copyright holder nor the
63791+      names of its contributors may be used to endorse or promote products
63792+      derived from this software without specific prior written permission.
63793+
63794+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
63795+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
63796+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
63797+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
63798+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
63799+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
63800+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
63801+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
63802+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63803+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63804+
63805+Authors: Michael Eiler <eiler.mike@gmail.com>
63806+*/
63807+
63808+#pragma once
63809+
63810+#ifdef __cplusplus
63811+extern "C" {
63812+#endif
63813+
63814+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
63815+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
63816+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
63817+
63818+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
63819+  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
63820+  unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
63821+  unsigned int _w, unsigned int h);
63822+
63823+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
63824+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
63825+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
63826+
63827+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
63828+  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
63829+  unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
63830+
63831+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
63832+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
63833+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
63834+
63835+#ifdef __cplusplus
63836+}
63837+#endif
63838+
63839--- a/libavutil/arm/Makefile
63840+++ b/libavutil/arm/Makefile
63841@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o
63842
63843 NEON-OBJS += arm/float_dsp_init_neon.o                                  \
63844              arm/float_dsp_neon.o                                       \
63845+             arm/rpi_sand_neon.o                                        \
63846--- /dev/null
63847+++ b/libavutil/arm/rpi_sand_neon.S
63848@@ -0,0 +1,925 @@
63849+/*
63850+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
63851+All rights reserved.
63852+
63853+Redistribution and use in source and binary forms, with or without
63854+modification, are permitted provided that the following conditions are met:
63855+    * Redistributions of source code must retain the above copyright
63856+      notice, this list of conditions and the following disclaimer.
63857+    * Redistributions in binary form must reproduce the above copyright
63858+      notice, this list of conditions and the following disclaimer in the
63859+      documentation and/or other materials provided with the distribution.
63860+    * Neither the name of the copyright holder nor the
63861+      names of its contributors may be used to endorse or promote products
63862+      derived from this software without specific prior written permission.
63863+
63864+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
63865+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
63866+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
63867+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
63868+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
63869+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
63870+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
63871+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
63872+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63873+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63874+
63875+Authors: John Cox
63876+*/
63877+
63878+#include "libavutil/arm/asm.S"
63879+
63880+
63881+@ General notes:
63882+@ Having done some timing on this in sand8->y8 (Pi4)
63883+@  vst1 (680fps) is a bit faster than vstm (660fps)
63884+@  vldm (680fps) is noticably faster than vld1 (480fps)
63885+@  (or it might be that a mix is what is required)
63886+@
63887+@ At least on a Pi4 it is no more expensive to have a single auto-inc register
63888+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
63889+@ the latter was better)
63890+@
63891+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
63892+@ the memory is uncached.
63893+@ As these are Sand -> planar we can assume that src is going to be aligned but
63894+@ it is possible that dest isn't (converting to .yuv or other packed format).
63895+@ Luckily vst1 is faster than vstm :-) so all is well
63896+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
63897+@ .8 stores would let us do non-word aligned stores into uncached but it
63898+@ probably isn't worth it.
63899+
63900+
63901+
63902+
63903+@ void ff_rpi_sand128b_stripe_to_8_10(
63904+@   uint8_t * dest,             // [r0]
63905+@   const uint8_t * src1,       // [r1]
63906+@   const uint8_t * src2,       // [r2]
63907+@   unsigned int lines);        // [r3]
63908+
+.macro  stripe2_to_8, bit_depth        @ narrow one line from 2 stripes of \bit_depth-bit samples to 8-bit
+        vpush    {q4-q7}               @ q4-q7 are callee-saved
+1:
+        vldm     r1!, {q0-q7}          @ 128 bytes (64 u16 samples) from src1
+        subs     r3, #1                @ one line consumed
+        vldm     r2!, {q8-q15}         @ 128 bytes (64 u16 samples) from src2
+        vqrshrn.u16 d0,  q0,  #\bit_depth - 8
+        vqrshrn.u16 d1,  q1,  #\bit_depth - 8
+        vqrshrn.u16 d2,  q2,  #\bit_depth - 8
+        vqrshrn.u16 d3,  q3,  #\bit_depth - 8
+        vqrshrn.u16 d4,  q4,  #\bit_depth - 8
+        vqrshrn.u16 d5,  q5,  #\bit_depth - 8
+        vqrshrn.u16 d6,  q6,  #\bit_depth - 8
+        vqrshrn.u16 d7,  q7,  #\bit_depth - 8
+        vqrshrn.u16 d8,  q8,  #\bit_depth - 8
+        vqrshrn.u16 d9,  q9,  #\bit_depth - 8
+        vqrshrn.u16 d10, q10, #\bit_depth - 8
+        vqrshrn.u16 d11, q11, #\bit_depth - 8
+        vqrshrn.u16 d12, q12, #\bit_depth - 8
+        vqrshrn.u16 d13, q13, #\bit_depth - 8
+        vqrshrn.u16 d14, q14, #\bit_depth - 8
+        vqrshrn.u16 d15, q15, #\bit_depth - 8
+        vstm     r0!, {q0-q7}          @ 128 rounded/saturated 8-bit samples out
+        bne      1b
+        vpop     {q4-q7}
+        bx       lr
+.endm
+
+function ff_rpi_sand128b_stripe_to_8_10, export=1
+        stripe2_to_8     10
+endfunc
63940+
63941+@ void ff_rpi_sand8_lines_to_planar_y8(
63942+@   uint8_t * dest,             // [r0]
63943+@   unsigned int dst_stride,    // [r1]
63944+@   const uint8_t * src,        // [r2]
63945+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
63946+@   unsigned int src_stride2,   // [sp, #0]  -> r3
63947+@   unsigned int _x,            // [sp, #4]  Ignored - 0
63948+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
63949+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
63950+@   unsigned int h);            // [sp, #16] -> r7
63951+@
63952+@ Assumes that we are starting on a stripe boundary and that overreading
63953+@ within the stripe is OK. However it does respect the dest size for writing
63954+
+function ff_rpi_sand8_lines_to_planar_y8, export=1
+                push            {r4-r8, lr}     @ +24 bytes of stack: args now at sp+24
+                ldr             r3,  [sp, #24]
+                ldr             r6,  [sp, #36]
+                ldr             r7,  [sp, #32]  @ y
+                lsl             r3,  #7         @ r3 = src_stride2 << 7 (bytes)
+                sub             r1,  r6         @ dst_stride -= w (r0 ends each line advanced by w)
+                add             r8,  r2,  r7,  lsl #7   @ src += y * 128 (stride1 assumed 128)
+                ldr             r7,  [sp, #40]  @ h
+
+10:                                             @ per-output-line loop (r7 lines left)
+                mov             r2,  r8
+                add             r4,  r0,  #24   @ NOTE(review): r4 appears unused below - confirm
+                mov             r5,  r6         @ r5 = width remaining
+                mov             lr,  #0         @ NOTE(review): lr appears unused below - confirm
+1:                                              @ along the line: 128 bytes per stripe
+                vldm            r2,  {q8-q15}
+                add             r2,  r3         @ step to next stripe, same line
+                subs            r5,  #128
+                blt             2f
+                vst1.8          {d16, d17, d18, d19}, [r0]!
+                vst1.8          {d20, d21, d22, d23}, [r0]!
+                vst1.8          {d24, d25, d26, d27}, [r0]!
+                vst1.8          {d28, d29, d30, d31}, [r0]!
+                bne             1b
+11:                                             @ line done: step to next row
+                subs            r7,  #1
+                add             r0,  r1
+                add             r8,  #128       @ next src line (stride1 == 128)
+                bne             10b
+
+                pop             {r4-r8, pc}
+
+@ Partial final write (r5 = remaining width - 128)
+2:
+                cmp             r5,  #64-128
+                blt             1f
+                vst1.8          {d16, d17, d18, d19}, [r0]!
+                vst1.8          {d20, d21, d22, d23}, [r0]!
+                beq             11b
+                vmov            q8,  q12
+                vmov            q9,  q13
+                sub             r5,  #64
+                vmov            q10, q14
+                vmov            q11, q15
+1:
+                cmp             r5,  #32-128
+                blt             1f
+                vst1.8          {d16, d17, d18, d19}, [r0]!
+                beq             11b
+                vmov            q8,  q10
+                sub             r5,  #32
+                vmov            q9,  q11
+1:
+                cmp             r5,  #16-128
+                blt             1f
+                vst1.8          {d16, d17}, [r0]!
+                beq             11b
+                sub             r5,  #16
+                vmov            q8,  q9
+1:
+                cmp             r5,  #8-128
+                blt             1f
+                vst1.8          {d16}, [r0]!
+                beq             11b
+                sub             r5,  #8
+                vmov            d16, d17
+1:
+                cmp             r5,  #4-128
+                blt             1f
+                vst1.32         {d16[0]}, [r0]!
+                beq             11b
+                sub             r5,  #4
+                vshr.u64        d16, #32
+1:
+                cmp             r5,  #2-128
+                blt             1f
+                vst1.16         {d16[0]}, [r0]!
+                beq             11b
+                vst1.8          {d16[2]}, [r0]!
+                b               11b
+1:
+                vst1.8          {d16[0]}, [r0]!
+                b               11b
+endfunc
64040+
64041+@ void ff_rpi_sand8_lines_to_planar_c8(
64042+@   uint8_t * dst_u,            // [r0]
64043+@   unsigned int dst_stride_u,  // [r1]
64044+@   uint8_t * dst_v,            // [r2]
64045+@   unsigned int dst_stride_v,  // [r3]
64046+@   const uint8_t * src,        // [sp, #0]  -> r4, r5
64047+@   unsigned int stride1,       // [sp, #4]  128
64048+@   unsigned int stride2,       // [sp, #8]  -> r8
64049+@   unsigned int _x,            // [sp, #12] 0
64050+@   unsigned int y,             // [sp, #16] (r7 in prefix)
64051+@   unsigned int _w,            // [sp, #20] -> r12, r6
64052+@   unsigned int h);            // [sp, #24] -> r7
64053+@
64054+@ Assumes that we are starting on a stripe boundary and that overreading
64055+@ within the stripe is OK. However it does respect the dest size for writing
64056+
+function ff_rpi_sand8_lines_to_planar_c8, export=1
+                push            {r4-r8, lr}     @ +24
+
+                ldr             r5,  [sp, #24]
+                ldr             r8,  [sp, #32]
+                ldr             r7,  [sp, #40]
+                ldr             r6,  [sp, #44]
+                lsl             r8,  #7
+                add             r5,  r5,  r7,  lsl #7
+                sub             r1,  r1,  r6
+                sub             r3,  r3,  r6
+                ldr             r7,  [sp, #48]
+                vpush           {q4-q7}
+
+10:
+                mov             r4,  r5
+                mov             r12, r6
+1:
+                subs            r12, #64
+                vldm            r4,  {q0-q7}
+                add             r4,  r8
+                it              gt
+                vldmgt          r4,  {q8-q15}   @ 2nd 128B block only when >64 UV pairs remain
+                add             r4,  r8
+
+                vuzp.8          q0,  q1         @ deinterleave even bytes (U) from odd bytes (V)
+                vuzp.8          q2,  q3
+                vuzp.8          q4,  q5
+                vuzp.8          q6,  q7
+
+                vuzp.8          q8,  q9
+                vuzp.8          q10, q11
+                vuzp.8          q12, q13
+                vuzp.8          q14, q15
+                subs            r12, #64        @ 128 pairs consumed per full pass (2 x 64)
+
+                @ Rearrange regs so we can use vst1 with 4 regs
+                vswp            q1,  q2
+                vswp            q5,  q6
+                vswp            q9,  q10
+                vswp            q13, q14
+                blt             2f
+
+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
+                vst1.8          {d8,  d9,  d10, d11}, [r0]!
+                vst1.8          {d16, d17, d18, d19}, [r0]!
+                vst1.8          {d24, d25, d26, d27}, [r0]!
+
+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
+                vst1.8          {d12, d13, d14, d15}, [r2]!
+                vst1.8          {d20, d21, d22, d23}, [r2]!
+                vst1.8          {d28, d29, d30, d31}, [r2]!
+                bne             1b
+11:
+                subs            r7,  #1
+                add             r5,  #128
+                add             r0,  r1
+                add             r2,  r3
+                bne             10b
+                vpop            {q4-q7}
+                pop             {r4-r8,pc}
+
+2:                                              @ Partial final write (r12 = remaining - 128)
+                cmp             r12, #64-128
+                blt             1f
+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
+                vst1.8          {d8,  d9,  d10, d11}, [r0]!
+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
+                vst1.8          {d12, d13, d14, d15}, [r2]!
+                beq             11b
+                sub             r12, #64
+                vmov            q0,  q8
+                vmov            q1,  q9
+                vmov            q2,  q10
+                vmov            q3,  q11
+                vmov            q4,  q12
+                vmov            q5,  q13
+                vmov            q6,  q14
+                vmov            q7,  q15
+1:
+                cmp             r12, #32-128
+                blt             1f
+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
+                beq             11b
+                sub             r12, #32
+                vmov            q0,  q4
+                vmov            q1,  q5
+                vmov            q2,  q6
+                vmov            q3,  q7
+1:
+                cmp             r12, #16-128
+                blt             1f
+                vst1.8          {d0,  d1 }, [r0]!
+                vst1.8          {d4,  d5 }, [r2]!
+                beq             11b
+                sub             r12, #16
+                vmov            q0,  q1
+                vmov            q2,  q3
+1:
+                cmp             r12, #8-128
+                blt             1f
+                vst1.8          {d0}, [r0]!
+                vst1.8          {d4}, [r2]!
+                beq             11b
+                sub             r12, #8
+                vmov            d0,  d1
+                vmov            d4,  d5
+1:
+                cmp             r12, #4-128
+                blt             1f
+                vst1.32         {d0[0]}, [r0]!
+                vst1.32         {d4[0]}, [r2]!
+                beq             11b
+                sub             r12, #4
+                vmov            s0,  s1
+                vmov            s8,  s9
+1:
+                cmp             r12, #2-128
+                blt             1f
+                vst1.16         {d0[0]}, [r0]!
+                vst1.16         {d4[0]}, [r2]!
+                beq             11b
+                vst1.8          {d0[2]}, [r0]!
+                vst1.8          {d4[2]}, [r2]!
+                b               11b
+1:
+                vst1.8          {d0[0]}, [r0]!
+                vst1.8          {d4[0]}, [r2]!
+                b               11b
+endfunc
64188+
64189+
64190+
64191+@ void ff_rpi_sand30_lines_to_planar_y16(
64192+@   uint8_t * dest,             // [r0]
64193+@   unsigned int dst_stride,    // [r1]
64194+@   const uint8_t * src,        // [r2]
64195+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64196+@   unsigned int src_stride2,   // [sp, #0]  -> r3
64197+@   unsigned int _x,            // [sp, #4]  Ignored - 0
64198+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
64199+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64200+@   unsigned int h);            // [sp, #16] -> r7
64201+@
64202+@ Assumes that we are starting on a stripe boundary and that overreading
64203+@ within the stripe is OK. However it does respect the dest size for writing
64204+
+function ff_rpi_sand30_lines_to_planar_y16, export=1
+                push            {r4-r8, lr}     @ +24
+                ldr             r3,  [sp, #24]
+                ldr             r6,  [sp, #36]
+                ldr             r7,  [sp, #32]  @ y
+                mov             r12, #48        @ post-inc for the paired vst3s (2 x 24 bytes)
+                sub             r3,  #1
+                lsl             r3,  #7         @ r3 = (src_stride2 - 1) << 7
+                sub             r1,  r1,  r6,  lsl #1
+                add             r8,  r2,  r7,  lsl #7
+                ldr             r7,  [sp, #40]
+
+10:
+                mov             r2,  r8
+                add             r4,  r0,  #24
+                mov             r5,  r6
+                mov             lr,  #0
+1:
+                vldm            r2!, {q10-q13}
+                add             lr,  #64        @ 64 bytes (16 words, 3 x 10-bit samples each) consumed
+
+                vshrn.u32       d4 , q10, #14    @ Cannot vshrn.u32 #20!
+                ands            lr,  #127       @ wrap after 128 src bytes (one stripe line)
+                vshrn.u32       d2,  q10, #10
+                vmovn.u32       d0,  q10
+
+                vshrn.u32       d5,  q11, #14
+                it              eq
+                addeq           r2,  r3
+                vshrn.u32       d3,  q11, #10
+                vmovn.u32       d1,  q11
+
+                subs            r5,  #48
+                vshr.u16        q2,  #6
+                vbic.u16        q0,  #0xfc00
+                vbic.u16        q1,  #0xfc00
+
+                vshrn.u32       d20, q12, #14
+                vshrn.u32       d18, q12, #10
+                vmovn.u32       d16, q12
+
+                vshrn.u32       d21, q13, #14
+                vshrn.u32       d19, q13, #10
+                vmovn.u32       d17, q13
+
+                vshr.u16        q10, #6
+                vbic.u16        q8,  #0xfc00
+                vbic.u16        q9 , #0xfc00
+                blt             2f
+
+                vst3.16         {d0,  d2,  d4},  [r0], r12
+                vst3.16         {d1,  d3,  d5},  [r4], r12
+                vst3.16         {d16, d18, d20}, [r0], r12
+                vst3.16         {d17, d19, d21}, [r4], r12
+
+                bne             1b
+
+11:
+                subs            r7,  #1
+                add             r0,  r1
+                add             r8,  #128
+                bne             10b
+
+                pop             {r4-r8, pc}
+
+@ Partial final write (r5 = remaining width - 48)
+2:
+                cmp             r5,  #24-48
+                blt             1f
+                vst3.16         {d0,  d2,  d4},  [r0], r12
+                vst3.16         {d1,  d3,  d5},  [r4]
+                beq             11b
+                vmov            q0,  q8
+                sub             r5,  #24
+                vmov            q1,  q9
+                vmov            q2,  q10
+1:
+                cmp             r5,  #12-48
+                blt             1f
+                vst3.16         {d0,  d2,  d4},  [r0]!
+                beq             11b
+                vmov            d0, d1
+                sub             r5, #12
+                vmov            d2, d3
+                vmov            d4, d5
+1:
+                cmp             r5,  #6-48
+                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
+                blt             1f
+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
+                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
+                add             r0,  #12
+                beq             11b
+                vmov            s0,  s1
+                sub             r5,  #6
+                vmov            s4,  s5
+                vmov            s8,  s9
+1:
+                cmp             r5, #3-48
+                blt             1f
+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
+                beq             11b
+                sub             r5, #3
+                vshr.u32        d0, #16
+                vshr.u32        d2, #16
+1:
+                cmp             r5, #2-48
+                blt             1f
+                vst2.16         {d0[0], d2[0]}, [r0]!
+                b               11b
+1:
+                vst1.16         {d0[0]}, [r0]!
+                b               11b
+
+endfunc
64320+
64321+
64322+@ void ff_rpi_sand30_lines_to_planar_c16(
64323+@   uint8_t * dst_u,            // [r0]
64324+@   unsigned int dst_stride_u,  // [r1]
64325+@   uint8_t * dst_v,            // [r2]
64326+@   unsigned int dst_stride_v,  // [r3]
64327+@   const uint8_t * src,        // [sp, #0]  -> r4, r5
64328+@   unsigned int stride1,       // [sp, #4]  128
64329+@   unsigned int stride2,       // [sp, #8]  -> r8
64330+@   unsigned int _x,            // [sp, #12] 0
64331+@   unsigned int y,             // [sp, #16] (r7 in prefix)
64332+@   unsigned int _w,            // [sp, #20] -> r6, r9
64333+@   unsigned int h);            // [sp, #24] -> r7
64334+@
64335+@ Assumes that we are starting on a stripe boundary and that overreading
64336+@ within the stripe is OK. However it does respect the dest size for writing
64337+
+function ff_rpi_sand30_lines_to_planar_c16, export=1
+                push            {r4-r10, lr}    @ +32
+                ldr             r5,  [sp, #32]
+                ldr             r8,  [sp, #40]
+                ldr             r7,  [sp, #48]
+                ldr             r9,  [sp, #52]
+                mov             r12, #48        @ post-inc for the paired vst3s (2 x 24 bytes)
+                sub             r8,  #1
+                lsl             r8,  #7         @ r8 = (stride2 - 1) << 7
+                add             r5,  r5,  r7,  lsl #7
+                sub             r1,  r1,  r9,  lsl #1
+                sub             r3,  r3,  r9,  lsl #1
+                ldr             r7,  [sp, #56]
+10:
+                mov             lr,  #0
+                mov             r4,  r5
+                mov             r6,  r9
+1:
+                vldm            r4!, {q0-q3}
+                add             lr,  #64        @ 64 bytes (16 words, 3 x 10-bit samples each) consumed
+
+                @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
+                vshrn.u32       d20, q0,  #14
+                vmovn.u32       d18, q0
+                vshrn.u32       d0,  q0,  #10
+                ands            lr,  #127       @ wrap after 128 src bytes (one stripe line)
+
+                vshrn.u32       d21, q1,  #14
+                vmovn.u32       d19, q1
+                vshrn.u32       d1,  q1,  #10
+
+                vshrn.u32       d22, q2,  #10
+                vmovn.u32       d2,  q2
+                vshrn.u32       d4,  q2,  #14
+
+                add             r10, r0,  #24
+                vshrn.u32       d23, q3,  #10
+                vmovn.u32       d3,  q3
+                vshrn.u32       d5,  q3,  #14
+
+                it              eq
+                addeq           r4,  r8
+                vuzp.16         q0,  q11
+                vuzp.16         q9,  q1
+                vuzp.16         q10, q2
+
+                @ q0   V0, V3,..
+                @ q9   U0, U3...
+                @ q10  U1, U4...
+                @ q11  U2, U5,..
+                @ q1   V1, V4,
+                @ q2   V2, V5,..
+
+                subs            r6,  #24
+                vbic.u16        q11, #0xfc00
+                vbic.u16        q9,  #0xfc00
+                vshr.u16        q10, #6
+                vshr.u16        q2,  #6
+                vbic.u16        q0,  #0xfc00
+                vbic.u16        q1,  #0xfc00
+
+                blt             2f
+
+                vst3.16         {d18, d20, d22}, [r0],  r12
+                vst3.16         {d19, d21, d23}, [r10]
+                add             r10, r2,  #24
+                vst3.16         {d0,  d2,  d4},  [r2],  r12
+                vst3.16         {d1,  d3,  d5},  [r10]
+
+                bne             1b
+
+11:
+                subs            r7,  #1
+                add             r5,  #128
+                add             r0,  r1
+                add             r2,  r3
+                bne             10b
+
+                pop             {r4-r10, pc}
+
+@ Partial final write
+2:
+                cmp             r6,  #-12       @ r6 = remaining - 24; >= -12 means >= 12 pixels left
+                blt             1f
+                vst3.16         {d18, d20, d22}, [r0]!
+                vst3.16         {d0,  d2,  d4},  [r2]!
+                beq             11b
+                vmov            d18, d19
+                vmov            d20, d21
+                vmov            d22, d23
+                sub             r6,  #12
+                vmov            d0,  d1
+                vmov            d2,  d3
+                vmov            d4,  d5
+1:
+                cmp             r6,  #-18
+                @ Rezip here as it makes the remaining tail handling easier
+                vzip.16         d0,  d18
+                vzip.16         d2,  d20
+                vzip.16         d4,  d22
+                blt             1f
+                vst3.16         {d0[1],  d2[1],  d4[1]},  [r0]!
+                vst3.16         {d0[0],  d2[0],  d4[0]},  [r2]!
+                vst3.16         {d0[3],  d2[3],  d4[3]},  [r0]!
+                vst3.16         {d0[2],  d2[2],  d4[2]},  [r2]!
+                beq             11b
+                vmov            d0,  d18
+                vmov            d2,  d20
+                sub             r6,  #6
+                vmov            d4,  d22
+1:
+                cmp             r6,  #-21
+                blt             1f
+                vst3.16         {d0[1], d2[1], d4[1]}, [r0]!
+                vst3.16         {d0[0], d2[0], d4[0]}, [r2]!
+                beq             11b
+                vmov            s4,  s5
+                sub             r6,  #3
+                vmov            s0,  s1
+1:
+                cmp             r6,  #-22
+                blt             1f
+                vst2.16         {d0[1], d2[1]}, [r0]!
+                vst2.16         {d0[0], d2[0]}, [r2]!
+                b               11b
+1:
+                vst1.16         {d0[1]}, [r0]!
+                vst1.16         {d0[0]}, [r2]!
+                b               11b
+
+endfunc
64469+
64470+@ void ff_rpi_sand30_lines_to_planar_p010(
64471+@   uint8_t * dest,             // [r0]
64472+@   unsigned int dst_stride,    // [r1]
64473+@   const uint8_t * src,        // [r2]
64474+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64475+@   unsigned int src_stride2,   // [sp, #0]  -> r3
64476+@   unsigned int _x,            // [sp, #4]  Ignored - 0
64477+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
64478+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64479+@   unsigned int h);            // [sp, #16] -> r7
64480+@
64481+@ Assumes that we are starting on a stripe boundary and that overreading
64482+@ within the stripe is OK. However it does respect the dest size for writing
64483+
64484+function ff_rpi_sand30_lines_to_planar_p010, export=1
64485+                push            {r4-r8, lr}     @ +24
64486+                ldr             r3,  [sp, #24]
64487+                ldr             r6,  [sp, #36]
64488+                ldr             r7,  [sp, #32]  @ y
64489+                mov             r12, #48
64490+                vmov.u16        q15, #0xffc0
64491+                sub             r3,  #1
64492+                lsl             r3,  #7
64493+                sub             r1,  r1,  r6,  lsl #1
64494+                add             r8,  r2,  r7,  lsl #7
64495+                ldr             r7,  [sp, #40]
64496+
64497+10:
64498+                mov             r2,  r8
64499+                add             r4,  r0,  #24
64500+                mov             r5,  r6
64501+                mov             lr,  #0
64502+1:
64503+                vldm            r2!, {q10-q13}
64504+                add             lr,  #64
64505+
64506+                vshl.u32        q14, q10, #6
64507+                ands            lr,  #127
64508+                vshrn.u32       d4,  q10, #14
64509+                vshrn.u32       d2,  q10, #4
64510+                vmovn.u32       d0,  q14
64511+
64512+                vshl.u32        q14, q11, #6
64513+                it              eq
64514+                addeq           r2,  r3
64515+                vshrn.u32       d5,  q11, #14
64516+                vshrn.u32       d3,  q11, #4
64517+                vmovn.u32       d1,  q14
64518+
64519+                subs            r5,  #48
64520+                vand            q2,  q15
64521+                vand            q1,  q15
64522+                vand            q0,  q15
64523+
64524+                vshl.u32        q14, q12, #6
64525+                vshrn.u32       d20, q12, #14
64526+                vshrn.u32       d18, q12, #4
64527+                vmovn.u32       d16, q14
64528+
64529+                vshl.u32        q14, q13, #6
64530+                vshrn.u32       d21, q13, #14
64531+                vshrn.u32       d19, q13, #4
64532+                vmovn.u32       d17, q14
64533+
64534+                vand            q10, q15
64535+                vand            q9,  q15
64536+                vand            q8,  q15
64537+                blt             2f
64538+
64539+                vst3.16         {d0,  d2,  d4},  [r0], r12
64540+                vst3.16         {d1,  d3,  d5},  [r4], r12
64541+                vst3.16         {d16, d18, d20}, [r0], r12
64542+                vst3.16         {d17, d19, d21}, [r4], r12
64543+
64544+                bne             1b
64545+
64546+11:
64547+                subs            r7,  #1
64548+                add             r0,  r1
64549+                add             r8,  #128
64550+                bne             10b
64551+
64552+                pop             {r4-r8, pc}
64553+
64554+@ Partial final write
64555+2:
64556+                cmp             r5,  #24-48
64557+                blt             1f
64558+                vst3.16         {d0,  d2,  d4},  [r0], r12
64559+                vst3.16         {d1,  d3,  d5},  [r4]
64560+                beq             11b
64561+                vmov            q0,  q8
64562+                sub             r5,  #24
64563+                vmov            q1,  q9
64564+                vmov            q2,  q10
64565+1:
64566+                cmp             r5,  #12-48
64567+                blt             1f
64568+                vst3.16         {d0,  d2,  d4},  [r0]!
64569+                beq             11b
64570+                vmov            d0, d1
64571+                sub             r5, #12
64572+                vmov            d2, d3
64573+                vmov            d4, d5
64574+1:
64575+                cmp             r5,  #6-48
64576+                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
64577+                blt             1f
64578+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
64579+                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
64580+                add             r0,  #12
64581+                beq             11b
64582+                vmov            s0,  s1
64583+                sub             r5,  #6
64584+                vmov            s4,  s5
64585+                vmov            s8,  s9
64586+1:
64587+                cmp             r5, #3-48
64588+                blt             1f
64589+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
64590+                beq             11b
64591+                sub             r5, #3
64592+                vshr.u32        d0, #16
64593+                vshr.u32        d2, #16
64594+1:
64595+                cmp             r5, #2-48
64596+                blt             1f
64597+                vst2.16         {d0[0], d2[0]}, [r0]!
64598+                b               11b
64599+1:
64600+                vst1.16         {d0[0]}, [r0]!
64601+                b               11b
64602+
64603+endfunc
64604+
64605+
64606+@ void ff_rpi_sand30_lines_to_planar_y8(
64607+@   uint8_t * dest,             // [r0]
64608+@   unsigned int dst_stride,    // [r1]
64609+@   const uint8_t * src,        // [r2]
64610+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64611+@   unsigned int src_stride2,   // [sp, #0]  -> r3
64612+@   unsigned int _x,            // [sp, #4]  Ignored - 0
64613+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
64614+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64615+@   unsigned int h);            // [sp, #16] -> r7
64616+@
64617+@ Assumes that we are starting on a stripe boundary and that overreading
64618+@ within the stripe is OK. However it does respect the dest size for writing
64619+
64620+function ff_rpi_sand30_lines_to_planar_y8, export=1
64621+                push            {r4-r8, lr}     @ +24
64622+                ldr             r3,  [sp, #24]
64623+                ldr             r6,  [sp, #36]
64624+                ldr             r7,  [sp, #32]  @ y
64625+                mov             r12, #48
64626+                lsl             r3,  #7
64627+                sub             r1,  r1,  r6
64628+                add             r8,  r2,  r7,  lsl #7
64629+                ldr             r7,  [sp, #40]
64630+
64631+10:
64632+                mov             r2,  r8
64633+                add             r4,  r0,  #24
64634+                mov             r5,  r6
64635+1:
64636+                vldm            r2,  {q8-q15}
64637+
64638+                subs            r5,  #96
64639+
64640+                vmovn.u32       d0,  q8
64641+                vshrn.u32       d2,  q8,  #12
64642+                vshrn.u32       d4,  q8,  #16    @ Cannot vshrn.u32 #20!
64643+
64644+                add             r2,  r3
64645+
64646+                vmovn.u32       d1,  q9
64647+                vshrn.u32       d3,  q9,  #12
64648+                vshrn.u32       d5,  q9,  #16
64649+
64650+                pld             [r2, #0]
64651+
64652+                vshrn.u16       d0,  q0,  #2
64653+                vmovn.u16       d1,  q1
64654+                vshrn.u16       d2,  q2,  #6
64655+
64656+                vmovn.u32       d16, q10
64657+                vshrn.u32       d18, q10, #12
64658+                vshrn.u32       d20, q10, #16
64659+
64660+                vmovn.u32       d17, q11
64661+                vshrn.u32       d19, q11, #12
64662+                vshrn.u32       d21, q11, #16
64663+
64664+                pld             [r2, #64]
64665+
64666+                vshrn.u16       d4,  q8,  #2
64667+                vmovn.u16       d5,  q9
64668+                vshrn.u16       d6,  q10, #6
64669+
64670+                vmovn.u32       d16, q12
64671+                vshrn.u32       d18, q12, #12
64672+                vshrn.u32       d20, q12, #16
64673+
64674+                vmovn.u32       d17, q13
64675+                vshrn.u32       d19, q13, #12
64676+                vshrn.u32       d21, q13, #16
64677+
64678+                vshrn.u16       d16, q8,  #2
64679+                vmovn.u16       d17, q9
64680+                vshrn.u16       d18, q10, #6
64681+
64682+                vmovn.u32       d20, q14
64683+                vshrn.u32       d22, q14, #12
64684+                vshrn.u32       d24, q14, #16
64685+
64686+                vmovn.u32       d21, q15
64687+                vshrn.u32       d23, q15, #12
64688+                vshrn.u32       d25, q15, #16
64689+
64690+                vshrn.u16       d20, q10, #2
64691+                vmovn.u16       d21, q11
64692+                vshrn.u16       d22, q12, #6
64693+
64694+                blt             2f
64695+
64696+                vst3.8          {d0,  d1,  d2},  [r0], r12
64697+                vst3.8          {d4,  d5,  d6},  [r4], r12
64698+                vst3.8          {d16, d17, d18}, [r0], r12
64699+                vst3.8          {d20, d21, d22}, [r4], r12
64700+
64701+                bne             1b
64702+
64703+11:
64704+                subs            r7,  #1
64705+                add             r0,  r1
64706+                add             r8,  #128
64707+                bne             10b
64708+
64709+                pop             {r4-r8, pc}
64710+
64711+@ Partial final write
64712+2:
64713+                cmp             r5,  #48-96
64714+                blt             1f
64715+                vst3.8          {d0,  d1,  d2},  [r0], r12
64716+                vst3.8          {d4,  d5,  d6},  [r4], r12
64717+                beq             11b
64718+                vmov            q0,  q8
64719+                vmov            q2,  q10
64720+                sub             r5,  #48
64721+                vmov            d2,  d18
64722+                vmov            d6,  d22
64723+1:
64724+                cmp             r5,  #24-96
64725+                blt             1f
64726+                vst3.8          {d0,  d1,  d2},  [r0]!
64727+                beq             11b
64728+                vmov            q0,  q2
64729+                sub             r5,  #24
64730+                vmov            d2,  d6
64731+1:
64732+                cmp             r5,  #12-96
64733+                blt             1f
64734+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
64735+                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
64736+                vst3.8          {d0[2], d1[2], d2[2]}, [r0]!
64737+                vst3.8          {d0[3], d1[3], d2[3]}, [r0]!
64738+                beq             11b
64739+                vmov            s0,  s1
64740+                sub             r5,  #12
64741+                vmov            s2,  s3
64742+                vmov            s4,  s5
64743+1:
64744+                cmp             r5,  #6-96
64745+                blt             1f
64746+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
64747+                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
64748+                add             r0,  #12
64749+                beq             11b
64750+                vshr.u32        d0,  #16
64751+                sub             r5,  #6
64752+                vshr.u32        d1,  #16
64753+                vshr.u32        d2,  #16
64754+1:
64755+                cmp             r5, #3-96
64756+                blt             1f
64757+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
64758+                beq             11b
64759+                sub             r5, #3
64760+                vshr.u32        d0, #8
64761+                vshr.u32        d1, #8
64762+1:
64763+                cmp             r5, #2-96
64764+                blt             1f
64765+                vst2.8          {d0[0], d1[0]}, [r0]!
64766+                b               11b
64767+1:
64768+                vst1.8          {d0[0]}, [r0]!
64769+                b               11b
64770+
64771+endfunc
64772+
64773+
64774--- /dev/null
64775+++ b/libavutil/arm/rpi_sand_neon.h
64776@@ -0,0 +1,110 @@
64777+/*
64778+Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
64779+All rights reserved.
64780+
64781+Redistribution and use in source and binary forms, with or without
64782+modification, are permitted provided that the following conditions are met:
64783+    * Redistributions of source code must retain the above copyright
64784+      notice, this list of conditions and the following disclaimer.
64785+    * Redistributions in binary form must reproduce the above copyright
64786+      notice, this list of conditions and the following disclaimer in the
64787+      documentation and/or other materials provided with the distribution.
64788+    * Neither the name of the copyright holder nor the
64789+      names of its contributors may be used to endorse or promote products
64790+      derived from this software without specific prior written permission.
64791+
64792+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
64793+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
64794+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
64795+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
64796+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
64797+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
64798+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
64799+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
64800+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
64801+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64802+
64803+Authors: John Cox
64804+*/
64805+
64806+#ifndef AVUTIL_ARM_SAND_NEON_H
64807+#define AVUTIL_ARM_SAND_NEON_H
64808+
64809+void ff_rpi_sand128b_stripe_to_8_10(
64810+  uint8_t * dest,             // [r0]
64811+  const uint8_t * src1,       // [r1]
64812+  const uint8_t * src2,       // [r2]
64813+  unsigned int lines);        // [r3]
64814+
64815+void ff_rpi_sand8_lines_to_planar_y8(
64816+  uint8_t * dest,             // [r0]
64817+  unsigned int dst_stride,    // [r1]
64818+  const uint8_t * src,        // [r2]
64819+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64820+  unsigned int src_stride2,   // [sp, #0]  -> r3
64821+  unsigned int _x,            // [sp, #4]  Ignored - 0
64822+  unsigned int y,             // [sp, #8]  (r7 in prefix)
64823+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64824+  unsigned int h);            // [sp, #16] -> r7
64825+
64826+void ff_rpi_sand8_lines_to_planar_c8(
64827+  uint8_t * dst_u,            // [r0]
64828+  unsigned int dst_stride_u,  // [r1]
64829+  uint8_t * dst_v,            // [r2]
64830+  unsigned int dst_stride_v,  // [r3]
64831+  const uint8_t * src,        // [sp, #0]  -> r4, r5
64832+  unsigned int stride1,       // [sp, #4]  128
64833+  unsigned int stride2,       // [sp, #8]  -> r8
64834+  unsigned int _x,            // [sp, #12] 0
64835+  unsigned int y,             // [sp, #16] (r7 in prefix)
64836+  unsigned int _w,            // [sp, #20] -> r12, r6
64837+  unsigned int h);            // [sp, #24] -> r7
64838+
64839+void ff_rpi_sand30_lines_to_planar_y16(
64840+  uint8_t * dest,             // [r0]
64841+  unsigned int dst_stride,    // [r1]
64842+  const uint8_t * src,        // [r2]
64843+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64844+  unsigned int src_stride2,   // [sp, #0]  -> r3
64845+  unsigned int _x,            // [sp, #4]  Ignored - 0
64846+  unsigned int y,             // [sp, #8]  (r7 in prefix)
64847+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64848+  unsigned int h);            // [sp, #16] -> r7
64849+
64850+void ff_rpi_sand30_lines_to_planar_c16(
64851+  uint8_t * dst_u,            // [r0]
64852+  unsigned int dst_stride_u,  // [r1]
64853+  uint8_t * dst_v,            // [r2]
64854+  unsigned int dst_stride_v,  // [r3]
64855+  const uint8_t * src,        // [sp, #0]  -> r4, r5
64856+  unsigned int stride1,       // [sp, #4]  128
64857+  unsigned int stride2,       // [sp, #8]  -> r8
64858+  unsigned int _x,            // [sp, #12] 0
64859+  unsigned int y,             // [sp, #16] (r7 in prefix)
64860+  unsigned int _w,            // [sp, #20] -> r6, r9
64861+  unsigned int h);            // [sp, #24] -> r7
64862+
64863+void ff_rpi_sand30_lines_to_planar_p010(
64864+  uint8_t * dest,             // [r0]
64865+  unsigned int dst_stride,    // [r1]
64866+  const uint8_t * src,        // [r2]
64867+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64868+  unsigned int src_stride2,   // [sp, #0]  -> r3
64869+  unsigned int _x,            // [sp, #4]  Ignored - 0
64870+  unsigned int y,             // [sp, #8]  (r7 in prefix)
64871+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64872+  unsigned int h);            // [sp, #16] -> r7
64873+
64874+void ff_rpi_sand30_lines_to_planar_y8(
64875+  uint8_t * dest,             // [r0]
64876+  unsigned int dst_stride,    // [r1]
64877+  const uint8_t * src,        // [r2]
64878+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
64879+  unsigned int src_stride2,   // [sp, #0]  -> r3
64880+  unsigned int _x,            // [sp, #4]  Ignored - 0
64881+  unsigned int y,             // [sp, #8]  (r7 in prefix)
64882+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
64883+  unsigned int h);            // [sp, #16] -> r7
64884+
64885+#endif // AVUTIL_ARM_SAND_NEON_H
64886+
64887--- a/libavutil/frame.c
64888+++ b/libavutil/frame.c
64889@@ -16,6 +16,8 @@
64890  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
64891  */
64892
64893+#include "config.h"
64894+
64895 #include "channel_layout.h"
64896 #include "avassert.h"
64897 #include "buffer.h"
64898@@ -26,6 +28,9 @@
64899 #include "mem.h"
64900 #include "samplefmt.h"
64901 #include "hwcontext.h"
64902+#if CONFIG_SAND
64903+#include "rpi_sand_fns.h"
64904+#endif
64905
64906 #if FF_API_FRAME_GET_SET
64907 MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
64908@@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *fra
64909         (frame->crop_top + frame->crop_bottom) >= frame->height)
64910         return AVERROR(ERANGE);
64911
64912+#if CONFIG_SAND
64913+    // Sand cannot be cropped - do not try
64914+    if (av_rpi_is_sand_format(frame->format))
64915+        return 0;
64916+#endif
64917+
64918     desc = av_pix_fmt_desc_get(frame->format);
64919     if (!desc)
64920         return AVERROR_BUG;
64921--- a/libavutil/frame.h
64922+++ b/libavutil/frame.h
64923@@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *fra
64924  */
64925 const char *av_frame_side_data_name(enum AVFrameSideDataType type);
64926
64927+
64928+static inline int av_frame_cropped_width(const AVFrame * const frame)
64929+{
64930+    return frame->width - (frame->crop_left + frame->crop_right);
64931+}
64932+static inline int av_frame_cropped_height(const AVFrame * const frame)
64933+{
64934+    return frame->height - (frame->crop_top + frame->crop_bottom);
64935+}
64936+
64937 /**
64938  * @}
64939  */
64940--- a/libavutil/hwcontext_drm.c
64941+++ b/libavutil/hwcontext_drm.c
64942@@ -19,8 +19,10 @@
64943 #include <fcntl.h>
64944 #include <sys/mman.h>
64945 #include <unistd.h>
64946+#include <sys/ioctl.h>
64947
64948 #include <drm.h>
64949+#include <libdrm/drm_fourcc.h>
64950 #include <xf86drm.h>
64951
64952 #include "avassert.h"
64953@@ -28,6 +30,11 @@
64954 #include "hwcontext_drm.h"
64955 #include "hwcontext_internal.h"
64956 #include "imgutils.h"
64957+#include "libavutil/rpi_sand_fns.h"
64958+
64959+#include <linux/mman.h>
64960+#include <linux/dma-buf.h>
64961+#include <linux/dma-heap.h>
64962
64963
64964 static void drm_device_free(AVHWDeviceContext *hwdev)
64965@@ -43,6 +50,11 @@ static int drm_device_create(AVHWDeviceC
64966     AVDRMDeviceContext *hwctx = hwdev->hwctx;
64967     drmVersionPtr version;
64968
64969+    if (device == NULL) {
64970+      hwctx->fd = -1;
64971+      return 0;
64972+    }
64973+
64974     hwctx->fd = open(device, O_RDWR);
64975     if (hwctx->fd < 0)
64976         return AVERROR(errno);
64977@@ -85,18 +97,37 @@ static int drm_get_buffer(AVHWFramesCont
64978 typedef struct DRMMapping {
64979     // Address and length of each mmap()ed region.
64980     int nb_regions;
64981+    unsigned int dmaflags;
64982     void *address[AV_DRM_MAX_PLANES];
64983     size_t length[AV_DRM_MAX_PLANES];
64984+    int fds[AV_DRM_MAX_PLANES];
64985 } DRMMapping;
64986
64987+static int dmasync(const int fd, const unsigned int flags)
64988+{
64989+    struct dma_buf_sync sync = {
64990+        .flags = flags
64991+    };
64992+    while (ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
64993+        const int err = errno;
64994+        if (errno == EINTR)
64995+            continue;
64996+        av_log(NULL, AV_LOG_WARNING, "%s: ioctl failed: flags=%#x\n", __func__, flags);
64997+        return -err;
64998+    }
64999+    return 0;
65000+}
65001+
65002 static void drm_unmap_frame(AVHWFramesContext *hwfc,
65003                             HWMapDescriptor *hwmap)
65004 {
65005     DRMMapping *map = hwmap->priv;
65006     int i;
65007
65008-    for (i = 0; i < map->nb_regions; i++)
65009+    for (i = 0; i < map->nb_regions; i++) {
65010         munmap(map->address[i], map->length[i]);
65011+        dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
65012+    }
65013
65014     av_free(map);
65015 }
65016@@ -114,15 +145,28 @@ static int drm_map_frame(AVHWFramesConte
65017     if (!map)
65018         return AVERROR(ENOMEM);
65019
65020+    for (i = 0; i < AV_DRM_MAX_PLANES; i++)
65021+        map->fds[i] = -1;
65022+
65023     mmap_prot = 0;
65024-    if (flags & AV_HWFRAME_MAP_READ)
65025+    if (flags & AV_HWFRAME_MAP_READ) {
65026+        map->dmaflags |= DMA_BUF_SYNC_READ;
65027         mmap_prot |= PROT_READ;
65028-    if (flags & AV_HWFRAME_MAP_WRITE)
65029+    }
65030+    if (flags & AV_HWFRAME_MAP_WRITE) {
65031+        map->dmaflags |= DMA_BUF_SYNC_WRITE;
65032         mmap_prot |= PROT_WRITE;
65033+    }
65034+
65035+    if (dst->format == AV_PIX_FMT_NONE)
65036+        dst->format = hwfc->sw_format;
65037
65038     av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES);
65039     for (i = 0; i < desc->nb_objects; i++) {
65040-        addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED,
65041+        dmasync(desc->objects[i].fd, DMA_BUF_SYNC_START | map->dmaflags);
65042+        map->fds[i] = desc->objects[i].fd;
65043+
65044+        addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED | MAP_POPULATE,
65045                     desc->objects[i].fd, 0);
65046         if (addr == MAP_FAILED) {
65047             err = AVERROR(errno);
65048@@ -151,6 +195,23 @@ static int drm_map_frame(AVHWFramesConte
65049
65050     dst->width  = src->width;
65051     dst->height = src->height;
65052+    dst->crop_top    = src->crop_top;
65053+    dst->crop_bottom = src->crop_bottom;
65054+    dst->crop_left   = src->crop_left;
65055+    dst->crop_right  = src->crop_right;
65056+
65057+#if CONFIG_SAND
65058+    // Rework for sand frames
65059+    if (av_rpi_is_sand_frame(dst)) {
65060+        // As it stands the sand formats hold stride2 in linesize[3]
65061+        // linesize[0] & [1] contain stride1 which is always 128 for everything we do
65062+        // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
65063+        dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
65064+        dst->linesize[0] = 128;
65065+        dst->linesize[1] = 128;
65066+        // *** Are we sure src->height is actually what we want ???
65067+    }
65068+#endif
65069
65070     err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
65071                                 &drm_unmap_frame, map);
65072@@ -160,7 +221,9 @@ static int drm_map_frame(AVHWFramesConte
65073     return 0;
65074
65075 fail:
65076-    for (i = 0; i < desc->nb_objects; i++) {
65077+    for (i = 0; i < AV_DRM_MAX_PLANES; i++) {
65078+        if (map->fds[i] != -1)
65079+            dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
65080         if (map->address[i])
65081             munmap(map->address[i], map->length[i]);
65082     }
65083@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW
65084                                     enum AVHWFrameTransferDirection dir,
65085                                     enum AVPixelFormat **formats)
65086 {
65087-    enum AVPixelFormat *pix_fmts;
65088+    enum AVPixelFormat *p;
65089
65090-    pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
65091-    if (!pix_fmts)
65092+    p = *formats = av_malloc_array(3, sizeof(*p));
65093+    if (!p)
65094         return AVERROR(ENOMEM);
65095
65096-    pix_fmts[0] = ctx->sw_format;
65097-    pix_fmts[1] = AV_PIX_FMT_NONE;
65098+    // **** Offer native sand too ????
65099+    *p++ =
65100+#if CONFIG_SAND
65101+        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
65102+            AV_PIX_FMT_YUV420P :
65103+        ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
65104+            AV_PIX_FMT_YUV420P10LE :
65105+#endif
65106+            ctx->sw_format;
65107+
65108+#if CONFIG_SAND
65109+    if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
65110+        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
65111+        *p++ = AV_PIX_FMT_NV12;
65112+#endif
65113
65114-    *formats = pix_fmts;
65115+    *p = AV_PIX_FMT_NONE;
65116     return 0;
65117 }
65118
65119@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr
65120     map = av_frame_alloc();
65121     if (!map)
65122         return AVERROR(ENOMEM);
65123-    map->format = dst->format;
65124
65125+    // Map to default
65126+    map->format = AV_PIX_FMT_NONE;
65127     err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
65128     if (err)
65129         goto fail;
65130
65131-    map->width  = dst->width;
65132-    map->height = dst->height;
65133+#if 0
65134+    av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
65135+           map->hwfc_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
65136+           map->width, map->height,
65137+           map->linesize[0],
65138+           map->linesize[1],
65139+           map->linesize[2],
65140+           map->linesize[3],
65141+           dst->width, dst->height,
65142+           dst->linesize[0],
65143+           dst->linesize[1],
65144+           dst->linesize[2]);
65145+#endif
65146+#if CONFIG_SAND
65147+    if (av_rpi_is_sand_frame(map)) {
65148+        // Preserve crop - later ffmpeg code assumes that we have done so,
65149+        // in that it overwrites any crop that we create with the old values
65150+        unsigned int stride2 = map->linesize[3];
65151+        const unsigned int w = FFMIN(dst->width, map->width);
65152+        const unsigned int h = FFMIN(dst->height, map->height);
65153+
65154+        map->crop_top = 0;
65155+        map->crop_bottom = 0;
65156+        map->crop_left = 0;
65157+        map->crop_right = 0;
65158+
65159+        if (av_rpi_sand_to_planar_frame(dst, map) != 0)
65160+        {
65161+            av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
65162+            err = AVERROR(EINVAL);
65163+            goto fail;
65164+        }
65165+
65166+        dst->width = w;
65167+        dst->height = h;
65168+    }
65169+    else
65170+#endif
65171+    {
65172+        // Kludge mapped h/w s.t. frame_copy works
65173+        map->width  = dst->width;
65174+        map->height = dst->height;
65175+        err = av_frame_copy(dst, map);
65176+    }
65177
65178-    err = av_frame_copy(dst, map);
65179     if (err)
65180+    {
65181+        av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
65182         goto fail;
65183+    }
65184
65185     err = 0;
65186 fail:
65187@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram
65188     int err;
65189
65190     if (src->width > hwfc->width || src->height > hwfc->height)
65191+    {
65192+        av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height);
65193         return AVERROR(EINVAL);
65194+    }
65195
65196     map = av_frame_alloc();
65197     if (!map)
65198--- a/libavutil/pixdesc.c
65199+++ b/libavutil/pixdesc.c
65200@@ -2371,6 +2371,50 @@ static const AVPixFmtDescriptor av_pix_f
65201         .name = "vulkan",
65202         .flags = AV_PIX_FMT_FLAG_HWACCEL,
65203     },
65204+    [AV_PIX_FMT_SAND128] = {
65205+        .name = "sand128",
65206+        .nb_components = 3,
65207+        .log2_chroma_w = 1,
65208+        .log2_chroma_h = 1,
65209+        .comp = {
65210+            { 0, 1, 0, 0, 8, 0, 7, 1 },        /* Y */
65211+            { 1, 2, 0, 0, 8, 1, 7, 1 },        /* U */
65212+            { 1, 2, 1, 0, 8, 1, 7, 2 },        /* V */
65213+        },
65214+        .flags = 0,
65215+    },
65216+    [AV_PIX_FMT_SAND64_10] = {
65217+        .name = "sand64_10",
65218+        .nb_components = 3,
65219+        .log2_chroma_w = 1,
65220+        .log2_chroma_h = 1,
65221+        .comp = {
65222+            { 0, 2, 0, 0, 10, 0, 9, 1 },        /* Y */
65223+            { 1, 4, 0, 0, 10, 3, 9, 1 },        /* U */
65224+            { 1, 4, 2, 0, 10, 3, 9, 3 },        /* V */
65225+        },
65226+        .flags = 0,
65227+    },
65228+    [AV_PIX_FMT_SAND64_16] = {
65229+        .name = "sand64_16",
65230+        .nb_components = 3,
65231+        .log2_chroma_w = 1,
65232+        .log2_chroma_h = 1,
65233+        .comp = {
65234+            { 0, 2, 0, 0, 16, 0, 15, 1 },        /* Y */
65235+            { 1, 4, 0, 0, 16, 3, 15, 1 },        /* U */
65236+            { 1, 4, 2, 0, 16, 3, 15, 3 },        /* V */
65237+        },
65238+        .flags = 0,
65239+    },
65240+    [AV_PIX_FMT_RPI4_8] = {
65241+        .name = "rpi4_8",
65242+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
65243+    },
65244+    [AV_PIX_FMT_RPI4_10] = {
65245+        .name = "rpi4_10",
65246+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
65247+    },
65248 };
65249 #if FF_API_PLUS1_MINUS1
65250 FF_ENABLE_DEPRECATION_WARNINGS
65251--- a/libavutil/pixfmt.h
65252+++ b/libavutil/pixfmt.h
65253@@ -357,6 +357,12 @@ enum AVPixelFormat {
65254
65255     AV_PIX_FMT_Y210BE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
65256     AV_PIX_FMT_Y210LE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
65257+// RPI - deliberately not guarded by an ifdef so calling programs can reference these
65258+    AV_PIX_FMT_SAND128,    ///< 4:2:0  8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
65259+    AV_PIX_FMT_SAND64_10,  ///< 4:2:0 10-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
65260+    AV_PIX_FMT_SAND64_16,  ///< 4:2:0 16-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
65261+    AV_PIX_FMT_RPI4_8,
65262+    AV_PIX_FMT_RPI4_10,
65263
65264     AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
65265 };
65266--- /dev/null
65267+++ b/libavutil/rpi_sand_fn_pw.h
65268@@ -0,0 +1,227 @@
65269+/*
65270+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
65271+All rights reserved.
65272+
65273+Redistribution and use in source and binary forms, with or without
65274+modification, are permitted provided that the following conditions are met:
65275+    * Redistributions of source code must retain the above copyright
65276+      notice, this list of conditions and the following disclaimer.
65277+    * Redistributions in binary form must reproduce the above copyright
65278+      notice, this list of conditions and the following disclaimer in the
65279+      documentation and/or other materials provided with the distribution.
65280+    * Neither the name of the copyright holder nor the
65281+      names of its contributors may be used to endorse or promote products
65282+      derived from this software without specific prior written permission.
65283+
65284+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
65285+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
65286+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
65287+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
65288+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
65289+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
65290+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
65291+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65292+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
65293+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65294+
65295+Authors: John Cox
65296+*/
65297+
65298+// * Included twice from rpi_sand_fn with different PW
65299+
65300+#define STRCAT(x,y) x##y
65301+
65302+#if PW == 1
65303+#define pixel uint8_t
65304+#define FUNC(f) STRCAT(f, 8)
65305+#elif PW == 2
65306+#define pixel uint16_t
65307+#define FUNC(f) STRCAT(f, 16)
65308+#else
65309+#error Unexpected PW
65310+#endif
65311+
65312+// Fetches a single patch - offscreen fixup not done here
65313+// w <= stride1
65314+// unclipped
65315+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
65316+                             const uint8_t * src,
65317+                             unsigned int stride1, unsigned int stride2,
65318+                             unsigned int _x, unsigned int y,
65319+                             unsigned int _w, unsigned int h)
65320+{
65321+    const unsigned int x = _x;
65322+    const unsigned int w = _w;
65323+    const unsigned int mask = stride1 - 1;
65324+
65325+#if PW == 1 && HAVE_SAND_ASM
65326+    if (_x == 0) {
65327+        ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
65328+                                     src, stride1, stride2, _x, y, _w, h);
65329+        return;
65330+    }
65331+#endif
65332+
65333+    if ((x & ~mask) == ((x + w) & ~mask)) {
65334+        // All in one sand stripe
65335+        const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
65336+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
65337+            memcpy(dst, p, w);
65338+        }
65339+    }
65340+    else
65341+    {
65342+        // Two+ stripe
65343+        const unsigned int sstride = stride1 * stride2;
65344+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
65345+        const uint8_t * p2 = p1 + sstride - (x & mask);
65346+        const unsigned int w1 = stride1 - (x & mask);
65347+        const unsigned int w3 = (x + w) & mask;
65348+        const unsigned int w2 = w - (w1 + w3);
65349+
65350+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
65351+            unsigned int j;
65352+            const uint8_t * p = p2;
65353+            uint8_t * d = dst;
65354+            memcpy(d, p1, w1);
65355+            d += w1;
65356+            for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
65357+                memcpy(d, p, stride1);
65358+            }
65359+            memcpy(d, p, w3);
65360+        }
65361+    }
65362+}
65363+
65364+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
65365+
65366+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
65367+                             uint8_t * dst_v, const unsigned int dst_stride_v,
65368+                             const uint8_t * src,
65369+                             unsigned int stride1, unsigned int stride2,
65370+                             unsigned int _x, unsigned int y,
65371+                             unsigned int _w, unsigned int h)
65372+{
65373+    const unsigned int x = _x * 2;
65374+    const unsigned int w = _w * 2;
65375+    const unsigned int mask = stride1 - 1;
65376+
65377+#if PW == 1 && HAVE_SAND_ASM
65378+    if (_x == 0) {
65379+        ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
65380+                                     src, stride1, stride2, _x, y, _w, h);
65381+        return;
65382+    }
65383+#endif
65384+
65385+    if ((x & ~mask) == ((x + w) & ~mask)) {
65386+        // All in one sand stripe
65387+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
65388+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
65389+            pixel * du = (pixel *)dst_u;
65390+            pixel * dv = (pixel *)dst_v;
65391+            const pixel * p = (const pixel *)p1;
65392+            for (unsigned int k = 0; k < w; k += 2 * PW) {
65393+                *du++ = *p++;
65394+                *dv++ = *p++;
65395+            }
65396+        }
65397+    }
65398+    else
65399+    {
65400+        // Two+ stripe
65401+        const unsigned int sstride = stride1 * stride2;
65402+        const unsigned int sstride_p = (sstride - stride1) / PW;
65403+
65404+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
65405+        const uint8_t * p2 = p1 + sstride - (x & mask);
65406+        const unsigned int w1 = stride1 - (x & mask);
65407+        const unsigned int w3 = (x + w) & mask;
65408+        const unsigned int w2 = w - (w1 + w3);
65409+
65410+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
65411+            unsigned int j;
65412+            const pixel * p = (const pixel *)p1;
65413+            pixel * du = (pixel *)dst_u;
65414+            pixel * dv = (pixel *)dst_v;
65415+            for (unsigned int k = 0; k < w1; k += 2 * PW) {
65416+                *du++ = *p++;
65417+                *dv++ = *p++;
65418+            }
65419+            for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
65420+                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
65421+                    *du++ = *p++;
65422+                    *dv++ = *p++;
65423+                }
65424+            }
65425+            for (unsigned int k = 0; k < w3; k += 2 * PW) {
65426+                *du++ = *p++;
65427+                *dv++ = *p++;
65428+            }
65429+        }
65430+    }
65431+}
65432+
65433+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
65434+                             unsigned int stride1, unsigned int stride2,
65435+                             const uint8_t * src_u, const unsigned int src_stride_u,
65436+                             const uint8_t * src_v, const unsigned int src_stride_v,
65437+                             unsigned int _x, unsigned int y,
65438+                             unsigned int _w, unsigned int h)
65439+{
65440+    const unsigned int x = _x * 2;
65441+    const unsigned int w = _w * 2;
65442+    const unsigned int mask = stride1 - 1;
65443+    if ((x & ~mask) == ((x + w) & ~mask)) {
65444+        // All in one sand stripe
65445+        uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
65446+        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
65447+            const pixel * su = (const pixel *)src_u;
65448+            const pixel * sv = (const pixel *)src_v;
65449+            pixel * p = (pixel *)p1;
65450+            for (unsigned int k = 0; k < w; k += 2 * PW) {
65451+                *p++ = *su++;
65452+                *p++ = *sv++;
65453+            }
65454+        }
65455+    }
65456+    else
65457+    {
65458+        // Two+ stripe
65459+        const unsigned int sstride = stride1 * stride2;
65460+        const unsigned int sstride_p = (sstride - stride1) / PW;
65461+
65462+        const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
65463+        const uint8_t * p2 = p1 + sstride - (x & mask);
65464+        const unsigned int w1 = stride1 - (x & mask);
65465+        const unsigned int w3 = (x + w) & mask;
65466+        const unsigned int w2 = w - (w1 + w3);
65467+
65468+        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
65469+            unsigned int j;
65470+            const pixel * su = (const pixel *)src_u;
65471+            const pixel * sv = (const pixel *)src_v;
65472+            pixel * p = (pixel *)p1;
65473+            for (unsigned int k = 0; k < w1; k += 2 * PW) {
65474+                *p++ = *su++;
65475+                *p++ = *sv++;
65476+            }
65477+            for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
65478+                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
65479+                    *p++ = *su++;
65480+                    *p++ = *sv++;
65481+                }
65482+            }
65483+            for (unsigned int k = 0; k < w3; k += 2 * PW) {
65484+                *p++ = *su++;
65485+                *p++ = *sv++;
65486+            }
65487+        }
65488+    }
65489+}
65490+
65491+
65492+#undef pixel
65493+#undef STRCAT
65494+#undef FUNC
65495+
65496--- /dev/null
65497+++ b/libavutil/rpi_sand_fns.c
65498@@ -0,0 +1,445 @@
65499+/*
65500+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
65501+All rights reserved.
65502+
65503+Redistribution and use in source and binary forms, with or without
65504+modification, are permitted provided that the following conditions are met:
65505+    * Redistributions of source code must retain the above copyright
65506+      notice, this list of conditions and the following disclaimer.
65507+    * Redistributions in binary form must reproduce the above copyright
65508+      notice, this list of conditions and the following disclaimer in the
65509+      documentation and/or other materials provided with the distribution.
65510+    * Neither the name of the copyright holder nor the
65511+      names of its contributors may be used to endorse or promote products
65512+      derived from this software without specific prior written permission.
65513+
65514+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
65515+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
65516+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
65517+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
65518+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
65519+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
65520+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
65521+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65522+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
65523+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65524+
65525+Authors: John Cox
65526+*/
65527+
65528+#include "config.h"
65529+#include <stdint.h>
65530+#include <string.h>
65531+#include "rpi_sand_fns.h"
65532+#include "avassert.h"
65533+#include "frame.h"
65534+
65535+#if ARCH_ARM && HAVE_NEON
65536+#include "arm/rpi_sand_neon.h"
65537+#define HAVE_SAND_ASM 1
65538+#elif ARCH_AARCH64 && HAVE_NEON
65539+#include "aarch64/rpi_sand_neon.h"
65540+#define HAVE_SAND_ASM 1
65541+#else
65542+#define HAVE_SAND_ASM 0
65543+#endif
65544+
65545+#define PW 1
65546+#include "rpi_sand_fn_pw.h"
65547+#undef PW
65548+
65549+#define PW 2
65550+#include "rpi_sand_fn_pw.h"
65551+#undef PW
65552+
65553+#if 1
65554+// Simple round
65555+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
65556+{
65557+    const unsigned int rnd = (1 << shr) >> 1;
65558+    const uint16_t * src = (const uint16_t *)_src;
65559+
65560+    for (; n != 0; --n) {
65561+        *dst++ = (*src++ + rnd) >> shr;
65562+    }
65563+}
65564+#else
65565+// Dithered variation
65566+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
65567+{
65568+    unsigned int rnd = (1 << shr) >> 1;
65569+    const unsigned int mask = ((1 << shr) - 1);
65570+    const uint16_t * src = (const uint16_t *)_src;
65571+
65572+    for (; n != 0; --n) {
65573+        rnd = *src++ + (rnd & mask);
65574+        *dst++ = rnd >> shr;
65575+    }
65576+}
65577+#endif
65578+
65579+// Fetches a single patch - offscreen fixup not done here
65580+// w <= stride1
65581+// unclipped
65582+// _x & _w in pixels, strides in bytes
65583+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
65584+                             const uint8_t * src,
65585+                             unsigned int stride1, unsigned int stride2,
65586+                             unsigned int _x, unsigned int y,
65587+                             unsigned int _w, unsigned int h)
65588+{
65589+    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
65590+    const unsigned int xskip0 = _x - (x0 >> 2) * 3;
65591+    const unsigned int x1 = ((_x + _w) / 3) * 4;
65592+    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
65593+    const unsigned int mask = stride1 - 1;
65594+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
65595+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
65596+
65597+#if HAVE_SAND_ASM
65598+    if (_x == 0) {
65599+        ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
65600+        return;
65601+    }
65602+#endif
65603+
65604+    if (x0 == x1) {
65605+        // *******************
65606+        // Partial single word xfer
65607+        return;
65608+    }
65609+
65610+    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
65611+    {
65612+        unsigned int x = x0;
65613+        const uint32_t * p = (const uint32_t *)p0;
65614+        uint16_t * d = (uint16_t *)dst;
65615+
65616+        if (xskip0 != 0) {
65617+            const uint32_t p3 = *p++;
65618+
65619+            if (xskip0 == 1)
65620+                *d++ = (p3 >> 10) & 0x3ff;
65621+            *d++ = (p3 >> 20) & 0x3ff;
65622+
65623+            if (((x += 4) & mask) == 0)
65624+                p += slice_inc;
65625+        }
65626+
65627+        while (x != x1) {
65628+            const uint32_t p3 = *p++;
65629+            *d++ = p3 & 0x3ff;
65630+            *d++ = (p3 >> 10) & 0x3ff;
65631+            *d++ = (p3 >> 20) & 0x3ff;
65632+
65633+            if (((x += 4) & mask) == 0)
65634+                p += slice_inc;
65635+        }
65636+
65637+        if (xrem1 != 0) {
65638+            const uint32_t p3 = *p;
65639+
65640+            *d++ = p3 & 0x3ff;
65641+            if (xrem1 == 2)
65642+                *d++ = (p3 >> 10) & 0x3ff;
65643+        }
65644+    }
65645+}
65646+
65647+
65648+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
65649+                             uint8_t * dst_v, const unsigned int dst_stride_v,
65650+                             const uint8_t * src,
65651+                             unsigned int stride1, unsigned int stride2,
65652+                             unsigned int _x, unsigned int y,
65653+                             unsigned int _w, unsigned int h)
65654+{
65655+    const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word
65656+    const unsigned int xskip0 = _x - (x0 >> 3) * 3;
65657+    const unsigned int x1 = ((_x + _w) / 3) * 8;
65658+    const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3;
65659+    const unsigned int mask = stride1 - 1;
65660+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
65661+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
65662+
65663+#if HAVE_SAND_ASM
65664+    if (_x == 0) {
65665+        ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
65666+                                       src, stride1, stride2, _x, y, _w, h);
65667+        return;
65668+    }
65669+#endif
65670+
65671+    if (x0 == x1) {
65672+        // *******************
65673+        // Partial single word xfer
65674+        return;
65675+    }
65676+
65677+    for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
65678+    {
65679+        unsigned int x = x0;
65680+        const uint32_t * p = (const uint32_t *)p0;
65681+        uint16_t * du = (uint16_t *)dst_u;
65682+        uint16_t * dv = (uint16_t *)dst_v;
65683+
65684+        if (xskip0 != 0) {
65685+            const uint32_t p3a = *p++;
65686+            const uint32_t p3b = *p++;
65687+
65688+            if (xskip0 == 1)
65689+            {
65690+                *du++ = (p3a >> 20) & 0x3ff;
65691+                *dv++ = (p3b >>  0) & 0x3ff;
65692+            }
65693+            *du++ = (p3b >> 10) & 0x3ff;
65694+            *dv++ = (p3b >> 20) & 0x3ff;
65695+
65696+            if (((x += 8) & mask) == 0)
65697+                p += slice_inc;
65698+        }
65699+
65700+        while (x != x1) {
65701+            const uint32_t p3a = *p++;
65702+            const uint32_t p3b = *p++;
65703+
65704+            *du++ = p3a & 0x3ff;
65705+            *dv++ = (p3a >> 10) & 0x3ff;
65706+            *du++ = (p3a >> 20) & 0x3ff;
65707+            *dv++ = p3b & 0x3ff;
65708+            *du++ = (p3b >> 10) & 0x3ff;
65709+            *dv++ = (p3b >> 20) & 0x3ff;
65710+
65711+            if (((x += 8) & mask) == 0)
65712+                p += slice_inc;
65713+        }
65714+
65715+        if (xrem1 != 0) {
65716+            const uint32_t p3a = *p++;
65717+            const uint32_t p3b = *p++;
65718+
65719+            *du++ = p3a & 0x3ff;
65720+            *dv++ = (p3a >> 10) & 0x3ff;
65721+            if (xrem1 == 2)
65722+            {
65723+                *du++ = (p3a >> 20) & 0x3ff;
65724+                *dv++ = p3b & 0x3ff;
65725+            }
65726+        }
65727+    }
65728+}
65729+
65730+// Fetches a single patch - offscreen fixup not done here
65731+// w <= stride1
65732+// simple lose-bottom-2-bits truncation
65733+// _x & _w in pixels, strides in bytes
65734+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
65735+                             const uint8_t * src,
65736+                             unsigned int stride1, unsigned int stride2,
65737+                             unsigned int _x, unsigned int y,
65738+                             unsigned int _w, unsigned int h)
65739+{
65740+    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
65741+    const unsigned int xskip0 = _x - (x0 >> 2) * 3;
65742+    const unsigned int x1 = ((_x + _w) / 3) * 4;
65743+    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
65744+    const unsigned int mask = stride1 - 1;
65745+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
65746+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
65747+
65748+#if HAVE_SAND_ASM
65749+    if (_x == 0) {
65750+        ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
65751+        return;
65752+    }
65753+#endif
65754+
65755+    if (x0 == x1) {
65756+        // *******************
65757+        // Partial single word xfer
65758+        return;
65759+    }
65760+
65761+    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
65762+    {
65763+        unsigned int x = x0;
65764+        const uint32_t * p = (const uint32_t *)p0;
65765+        uint8_t * d = dst;
65766+
65767+        if (xskip0 != 0) {
65768+            const uint32_t p3 = *p++;
65769+
65770+            if (xskip0 == 1)
65771+                *d++ = (p3 >> 12) & 0xff;
65772+            *d++ = (p3 >> 22) & 0xff;
65773+
65774+            if (((x += 4) & mask) == 0)
65775+                p += slice_inc;
65776+        }
65777+
65778+        while (x != x1) {
65779+            const uint32_t p3 = *p++;
65780+            *d++ = (p3 >> 2) & 0xff;
65781+            *d++ = (p3 >> 12) & 0xff;
65782+            *d++ = (p3 >> 22) & 0xff;
65783+
65784+            if (((x += 4) & mask) == 0)
65785+                p += slice_inc;
65786+        }
65787+
65788+        if (xrem1 != 0) {
65789+            const uint32_t p3 = *p;
65790+
65791+            *d++ = (p3 >> 2) & 0xff;
65792+            if (xrem1 == 2)
65793+                *d++ = (p3 >> 12) & 0xff;
65794+        }
65795+    }
65796+}
65797+
65798+
65799+
65800+// w/h in pixels
65801+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
65802+                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
65803+                         unsigned int w, unsigned int h, const unsigned int shr)
65804+{
65805+    const unsigned int n = dst_stride1 / 2;
65806+    unsigned int j;
65807+
65808+    // This is true for our current layouts
65809+    av_assert0(dst_stride1 == src_stride1);
65810+
65811+    // As we have the same stride1 for src & dest and src is wider than dest
65812+    // then if we loop on src we can always write contiguously to dest
65813+    // We make no effort to copy an exact width - round up to nearest src stripe
65814+    // as we will always have storage in dest for that
65815+
65816+#if ARCH_ARM && HAVE_NEON
65817+    if (shr == 3 && src_stride1 == 128) {
65818+        for (j = 0; j + n < w; j += dst_stride1) {
65819+            uint8_t * d = dst + j * dst_stride2;
65820+            const uint8_t * s1 = src + j * 2 * src_stride2;
65821+            const uint8_t * s2 = s1 + src_stride1 * src_stride2;
65822+
65823+            ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
65824+        }
65825+    }
65826+    else
65827+#endif
65828+    {
65829+        for (j = 0; j + n < w; j += dst_stride1) {
65830+            uint8_t * d = dst + j * dst_stride2;
65831+            const uint8_t * s1 = src + j * 2 * src_stride2;
65832+            const uint8_t * s2 = s1 + src_stride1 * src_stride2;
65833+
65834+            for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
65835+                cpy16_to_8(d, s1, n, shr);
65836+                cpy16_to_8(d + n, s2, n, shr);
65837+            }
65838+        }
65839+    }
65840+
65841+    // Fix up a trailing dest half stripe
65842+    if (j < w) {
65843+        uint8_t * d = dst + j * dst_stride2;
65844+        const uint8_t * s1 = src + j * 2 * src_stride2;
65845+
65846+        for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
65847+            cpy16_to_8(d, s1, n, shr);
65848+        }
65849+    }
65850+}
65851+
65852+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
65853+{
65854+    const int w = av_frame_cropped_width(src);
65855+    const int h = av_frame_cropped_height(src);
65856+    const int x = src->crop_left;
65857+    const int y = src->crop_top;
65858+
65859+    // We will crop as part of the conversion
65860+    dst->crop_top = 0;
65861+    dst->crop_left = 0;
65862+    dst->crop_bottom = 0;
65863+    dst->crop_right = 0;
65864+
65865+    switch (src->format){
65866+        case AV_PIX_FMT_SAND128:
65867+        case AV_PIX_FMT_RPI4_8:
65868+            switch (dst->format){
65869+                case AV_PIX_FMT_YUV420P:
65870+                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
65871+                                             src->data[0],
65872+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65873+                                             x, y, w, h);
65874+                    av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
65875+                                             dst->data[2], dst->linesize[2],
65876+                                             src->data[1],
65877+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65878+                                             x/2, y/2,  w/2, h/2);
65879+                    break;
65880+                case AV_PIX_FMT_NV12:
65881+                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
65882+                                             src->data[0],
65883+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65884+                                             x, y, w, h);
65885+                    av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
65886+                                             src->data[1],
65887+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65888+                                             x/2, y/2, w, h/2);
65889+                    break;
65890+                default:
65891+                    return -1;
65892+            }
65893+            break;
65894+        case AV_PIX_FMT_SAND64_10:
65895+            switch (dst->format){
65896+                case AV_PIX_FMT_YUV420P10:
65897+                    av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
65898+                                             src->data[0],
65899+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65900+                                             x*2, y, w*2, h);
65901+                    av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
65902+                                             dst->data[2], dst->linesize[2],
65903+                                             src->data[1],
65904+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65905+                                             x, y/2,  w, h/2);
65906+                    break;
65907+                default:
65908+                    return -1;
65909+            }
65910+            break;
65911+        case AV_PIX_FMT_RPI4_10:
65912+            switch (dst->format){
65913+                case AV_PIX_FMT_YUV420P10:
65914+                    av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
65915+                                             src->data[0],
65916+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65917+                                             x, y, w, h);
65918+                    av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
65919+                                             dst->data[2], dst->linesize[2],
65920+                                             src->data[1],
65921+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65922+                                             x/2, y/2, w/2, h/2);
65923+                    break;
65924+                case AV_PIX_FMT_NV12:
65925+                    av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
65926+                                             src->data[0],
65927+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65928+                                             x, y, w, h);
65929+                    av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
65930+                                             src->data[1],
65931+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
65932+                                             x/2, y/2, w, h/2);
65933+                    break;
65934+                default:
65935+                    return -1;
65936+            }
65937+            break;
65938+        default:
65939+            return -1;
65940+    }
65941+
65942+    return av_frame_copy_props(dst, src);
65943+}
65944--- /dev/null
65945+++ b/libavutil/rpi_sand_fns.h
65946@@ -0,0 +1,188 @@
65947+/*
65948+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
65949+All rights reserved.
65950+
65951+Redistribution and use in source and binary forms, with or without
65952+modification, are permitted provided that the following conditions are met:
65953+    * Redistributions of source code must retain the above copyright
65954+      notice, this list of conditions and the following disclaimer.
65955+    * Redistributions in binary form must reproduce the above copyright
65956+      notice, this list of conditions and the following disclaimer in the
65957+      documentation and/or other materials provided with the distribution.
65958+    * Neither the name of the copyright holder nor the
65959+      names of its contributors may be used to endorse or promote products
65960+      derived from this software without specific prior written permission.
65961+
65962+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
65963+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
65964+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
65965+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
65966+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
65967+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
65968+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
65969+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65970+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
65971+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65972+
65973+Authors: John Cox
65974+*/
65975+
65976+#ifndef AVUTIL_RPI_SAND_FNS
65977+#define AVUTIL_RPI_SAND_FNS
65978+
65979+#include "libavutil/frame.h"
65980+
65981+// For all these fns _x & _w are measured as coord * PW
65982+// For the C fns coords are in chroma pels (so luma / 2)
65983+// Strides are in bytes
65984+
65985+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
65986+                             const uint8_t * src,
65987+                             unsigned int stride1, unsigned int stride2,
65988+                             unsigned int _x, unsigned int y,
65989+                             unsigned int _w, unsigned int h);
65990+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
65991+                             const uint8_t * src,
65992+                             unsigned int stride1, unsigned int stride2,
65993+                             unsigned int _x, unsigned int y,
65994+                             unsigned int _w, unsigned int h);
65995+
65996+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
65997+                             uint8_t * dst_v, const unsigned int dst_stride_v,
65998+                             const uint8_t * src,
65999+                             unsigned int stride1, unsigned int stride2,
66000+                             unsigned int _x, unsigned int y,
66001+                             unsigned int _w, unsigned int h);
66002+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
66003+                             uint8_t * dst_v, const unsigned int dst_stride_v,
66004+                             const uint8_t * src,
66005+                             unsigned int stride1, unsigned int stride2,
66006+                             unsigned int _x, unsigned int y,
66007+                             unsigned int _w, unsigned int h);
66008+
66009+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
66010+                             unsigned int stride1, unsigned int stride2,
66011+                             const uint8_t * src_u, const unsigned int src_stride_u,
66012+                             const uint8_t * src_v, const unsigned int src_stride_v,
66013+                             unsigned int _x, unsigned int y,
66014+                             unsigned int _w, unsigned int h);
66015+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
66016+                             unsigned int stride1, unsigned int stride2,
66017+                             const uint8_t * src_u, const unsigned int src_stride_u,
66018+                             const uint8_t * src_v, const unsigned int src_stride_v,
66019+                             unsigned int _x, unsigned int y,
66020+                             unsigned int _w, unsigned int h);
66021+
66022+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
66023+                             const uint8_t * src,
66024+                             unsigned int stride1, unsigned int stride2,
66025+                             unsigned int _x, unsigned int y,
66026+                             unsigned int _w, unsigned int h);
66027+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
66028+                             uint8_t * dst_v, const unsigned int dst_stride_v,
66029+                             const uint8_t * src,
66030+                             unsigned int stride1, unsigned int stride2,
66031+                             unsigned int _x, unsigned int y,
66032+                             unsigned int _w, unsigned int h);
66033+
66034+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
66035+                             const uint8_t * src,
66036+                             unsigned int stride1, unsigned int stride2,
66037+                             unsigned int _x, unsigned int y,
66038+                             unsigned int _w, unsigned int h);
66039+
66040+// w/h in pixels
66041+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
66042+                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
66043+                         unsigned int w, unsigned int h, const unsigned int shr);
66044+
66045+
66046+// dst must contain required pixel format & allocated data buffers
66047+// Cropping on the src buffer will be honoured and dst crop will be set to zero
66048+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
66049+
66050+
66051+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
66052+{
66053+#ifdef RPI_ZC_SAND128_ONLY
66054+    // If we are sure we only support 128 byte sand formats replace the
66055+    // var with a constant which should allow for better optimisation
66056+    return 128;
66057+#else
66058+    return frame->linesize[0];
66059+#endif
66060+}
66061+
66062+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
66063+{
66064+    return frame->linesize[3];
66065+}
66066+
66067+
66068+static inline int av_rpi_is_sand_format(const int format)
66069+{
66070+    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);
66071+}
66072+
66073+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
66074+{
66075+    return av_rpi_is_sand_format(frame->format);
66076+}
66077+
66078+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
66079+{
66080+    return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
66081+}
66082+
66083+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
66084+{
66085+    return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
66086+}
66087+
66088+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
66089+{
66090+    return (frame->format == AV_PIX_FMT_RPI4_10);
66091+}
66092+
66093+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
66094+{
66095+    return av_rpi_is_sand8_frame(frame) ? 0 : 1;
66096+}
66097+
66098+// If x is measured in bytes (not pixels) then this works for sand64_16 as
66099+// well as sand128 - but in the general case we work that out
66100+
66101+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
66102+{
66103+    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
66104+    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
66105+    const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
66106+    const unsigned int x1 = x & (stride1 - 1);
66107+    const unsigned int x2 = x ^ x1;
66108+
66109+    return x1 + stride1 * y + stride2 * x2;
66110+}
66111+
66112+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
66113+{
66114+    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
66115+    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
66116+    const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
66117+    const unsigned int x1 = x & (stride1 - 1);
66118+    const unsigned int x2 = x ^ x1;
66119+
66120+    return x1 + stride1 * y_c + stride2 * x2;
66121+}
66122+
66123+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
66124+{
66125+    return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
66126+}
66127+
66128+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
66129+{
66130+    return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
66131+}
66132+
66133+#endif
66134+
66135--- /dev/null
66136+++ b/pi-util/BUILD.txt
66137@@ -0,0 +1,59 @@
66138+Building Pi FFmpeg
66139+==================
66140+
66141+Currently only building on a Pi is supported.
66142+This builds ffmpeg the way I've tested it
66143+
66144+Get all dependencies - the current package dependencies are good enough
66145+
66146+$ sudo apt-get build-dep ffmpeg
66147+
66148+Configure using the pi-util/conf_native.sh script
66149+-------------------------------------------------
66150+
66151+This sets the normal release options and creates an output dir to build into
66152+The directory name will depend on system and options but will be under out/
66153+
66154+There are a few choices here
66155+ --mmal  build including the legacy mmal-based decoders and zero-copy code
66156+         this requires appropriate libraries which currently will exist for
66157+         armv7 but not arm64
66158+ --noshared
66159+         Build a static image rather than a shared library one.  Static is
66160+         easier for testing as there is no need to worry about library
66161+         paths being confused and therefore running the wrong code.  Shared
66162+         is what is needed, in most cases, when building for use by other
66163+         programs.
66164+
66165+So for a static build
66166+---------------------
66167+
66168+$ pi-util/conf_native.sh --noshared
66169+
66170+$ make -j8 -C out/<wherever the script said it was building to>
66171+
66172+You can now run ffmpeg directly from where it was built
66173+
66174+For a shared build
66175+------------------
66176+
66177+$ pi-util/conf_native.sh
66178+
66179+You will normally want an install target if shared. Note that the script has
66180+set this up to be generated in out/<builddir>/install, you don't have to worry
66181+about overwriting your system libs.
66182+
66183+$ make -j8 -C out/<builddir> install
66184+
66185+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
66186+built or install the image on the system - you have to be careful to get rid
66187+of all other ffmpeg libs or confusion may result.  There is a little script
66188+that wipes all other versions - obviously use with care!
66189+
66190+$ sudo pi-util/clean_usr_libs.sh
66191+
66192+Then simply copying from the install to /usr works
66193+
66194+$ sudo cp -r out/<builddir>/install/* /usr
66195+
66196+
66197--- /dev/null
66198+++ b/pi-util/NOTES.txt
66199@@ -0,0 +1,69 @@
66200+Notes on the hevc_rpi decoder & associated support code
66201+-------------------------------------------------------
66202+
66203+There are 3 main parts to the existing code:
66204+
66205+1) The decoder - this is all in libavcodec as rpi_hevc*.
66206+
66207+2) A few filters to deal with Sand frames and a small patch to
66208+automatically select the sand->i420 converter when required.
66209+
66210+3) A kludge in ffmpeg.c to display the decoded video. This could & should
66211+be converted into a proper ffmpeg display module.
66212+
66213+
66214+Decoder
66215+-------
66216+
66217+The decoder is a modified version of the existing ffmpeg hevc decoder.
66218+Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
66219+More complex bitstreams can be up to ~200% faster but particularly easy
66220+streams can cut its advantage down to ~50%.  This means that a Pi3+ can
66221+display nearly all 8-bit 1080p30 streams and with some overclocking it can
66222+display most lower bitrate 10-bit 1080p30 streams - this latter case is
66223+not helped by the requirement to downsample to 8-bit before display on a
66224+Pi.
66225+
66226+It has had co-processor offload added for inter-pred and large block
66227+residual transform.  Various parts have had optimized ARM NEON assembler
66228+added and the existing ARM asm sections have been profiled and
66229+re-optimized for A53. The main C code has been substantially reworked at
66230+its lower levels in an attempt to optimize it and minimize memory
66231+bandwidth. To some extent code paths that deal with frame types that it
66232+doesn't support have been pruned.
66233+
66234+It outputs frames in Broadcom Sand format. This is a somewhat annoying
66235+layout that doesn't fit into ffmpeg's standard frame descriptions. It has
66236+vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for
66237+the stripe followed by interleaved U & V, that is then followed by the Y
66238+for the next stripe, etc. The final stripe is always padded to
66239+stripe-width. This is used in an attempt to help with cache locality and
66240+cut down on the number of dram bank switches. It is annoying to use for
66241+inter-pred with conventional processing but the way the Pi QPU (which is
66242+used for inter-pred) works means that it has negligible downsides here and
66243+the improved memory performance exceeds the overhead of the increased
66244+complexity in the rest of the code.
66245+
66246+Frames must be allocated out of GPU memory (as otherwise they can't be
66247+accessed by the co-processors). Utility functions (in rpi_zc.c) have been
66248+written to make this easier. As the frames are already in GPU memory they
66249+can be displayed by the Pi h/w without any further copying.
66250+
66251+
66252+Known non-features
66253+------------------
66254+
66255+Frame allocation should probably be done in some other way in order to fit
66256+into the standard framework better.
66257+
66258+Sand frames are currently declared as software frames, there is an
66259+argument that they should be hardware frames but they aren't really.
66260+
66261+There must be a better way of auto-selecting the hevc_rpi decoder over the
66262+normal s/w hevc decoder, but I became confused by the existing h/w
66263+acceleration framework and what I wanted to do didn't seem to fit in
66264+neatly.
66265+
66266+Display should be a proper device rather than a kludge in ffmpeg.c
66267+
66268+
66269--- /dev/null
66270+++ b/pi-util/TESTMESA.txt
66271@@ -0,0 +1,82 @@
66272+# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
66273+
66274+# These assume that the drm_mmal test for Sand8 has been built on this Pi
66275+# as build relies on many of the same files
66276+
66277+# 1st get everything required to build ffmpeg
66278+# If sources aren't already enabled on your Pi then enable them
66279+sudo su
66280+sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
66281+sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
66282+mv /tmp/sources.list /etc/apt/
66283+mv /tmp/raspi.list /etc/apt/sources.list.d/
66284+apt update
66285+
66286+# Get dependencies
66287+sudo apt build-dep ffmpeg
66288+
66289+sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
66290+
66291+# Enable H265 V4L2 request decoder
66292+sudo su
66293+echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
66294+# You may also want to add more CMA if you are going to try 4k videos
66295+# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
66296+# dtoverlay=vc4-fkms-v3d,cma-512
66297+reboot
66298+# Check it has turned up
66299+ls -la /dev/video*
66300+# This should include video19
66301+# crw-rw----+ 1 root video 81, 7 Aug  4 17:25 /dev/video19
66302+
66303+# Currently on the Pi the linux headers from the debian distro don't match
66304+# the kernel that we ship and we need to update them - hopefully this step
66305+# will be unneeded in the future
66306+sudo apt install git bc bison flex libssl-dev make
66307+git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
66308+cd linux
66309+KERNEL=kernel7l
66310+make bcm2711_defconfig
66311+make headers_install
66312+sudo cp -r usr/include/linux /usr/include
66313+cd ..
66314+
66315+# Config - this builds a statically linked ffmpeg which is easier for testing
66316+pi-util/conf_native.sh --noshared
66317+
66318+# Build (this is a bit dull)
66319+# If you want to poke the source the libavdevice/egl_vout.c contains the
66320+# output code -
66321+cd out/armv7-static-rel
66322+
66323+# Check that you have actually configured V4L2 request
66324+grep HEVC_V4L2REQUEST config.h
66325+# You are hoping for
66326+# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
66327+# if you get 0 then the config has failed
66328+
66329+make -j6
66330+
66331+# Grab test streams
66332+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
66333+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
66334+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
66335+
66336+# Test i420 output (works currently)
66337+./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
66338+
66339+# Test Sand8 output - doesn't currently work but should once you have
66340+# Sand8 working in drm_mmal. I can't guarantee that this will work as
66341+# I can't test this path with a known working format, but the debug looks
66342+# good.  If this doesn't work & drm_mmal does with sand8 then come back to me
66343+# The "show_all 1" forces vout to display every frame otherwise it drops any
66344+# frame that would cause it to block
66345+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
66346+
66347+# Test Sand30 - doesn't currently work
66348+# (Beware that when FFmpeg errors out it often leaves your terminal window
66349+# in a state where you need to reset it)
66350+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
66351+
66352+
66353+
66354--- /dev/null
66355+++ b/pi-util/clean_usr_libs.sh
66356@@ -0,0 +1,26 @@
66357+set -e
66358+U=/usr/lib/arm-linux-gnueabihf
66359+rm -f $U/libavcodec.*
66360+rm -f $U/libavdevice.*
66361+rm -f $U/libavfilter.*
66362+rm -f $U/libavformat.*
66363+rm -f $U/libavutil.*
66364+rm -f $U/libswresample.*
66365+rm -f $U/libswscale.*
66366+U=/usr/lib/arm-linux-gnueabihf/neon/vfp
66367+rm -f $U/libavcodec.*
66368+rm -f $U/libavdevice.*
66369+rm -f $U/libavfilter.*
66370+rm -f $U/libavformat.*
66371+rm -f $U/libavutil.*
66372+rm -f $U/libswresample.*
66373+rm -f $U/libswscale.*
66374+U=/usr/lib/aarch64-linux-gnu
66375+rm -f $U/libavcodec.*
66376+rm -f $U/libavdevice.*
66377+rm -f $U/libavfilter.*
66378+rm -f $U/libavformat.*
66379+rm -f $U/libavutil.*
66380+rm -f $U/libswresample.*
66381+rm -f $U/libswscale.*
66382+
66383--- /dev/null
66384+++ b/pi-util/conf_arm64_native.sh
66385@@ -0,0 +1,45 @@
66386+echo "Configure for ARM64 native build"
66387+
66388+#RPI_KEEPS="-save-temps=obj"
66389+
66390+SHARED_LIBS="--enable-shared"
66391+if [ "$1" == "--noshared" ]; then
66392+  SHARED_LIBS="--disable-shared"
66393+  echo Static libs
66394+  OUT=out/arm64-static-rel
66395+else
66396+  echo Shared libs
66397+  OUT=out/arm64-shared-rel
66398+fi
66399+
66400+mkdir -p $OUT
66401+cd $OUT
66402+
66403+A=aarch64-linux-gnu
66404+USR_PREFIX=`pwd`/install
66405+LIB_PREFIX=$USR_PREFIX/lib/$A
66406+INC_PREFIX=$USR_PREFIX/include/$A
66407+
66408+../../configure \
66409+ --prefix=$USR_PREFIX\
66410+ --libdir=$LIB_PREFIX\
66411+ --incdir=$INC_PREFIX\
66412+ --disable-stripping\
66413+ --disable-thumb\
66414+ --disable-mmal\
66415+ --enable-sand\
66416+ --enable-v4l2-request\
66417+ --enable-libdrm\
66418+ --enable-epoxy\
66419+ --enable-libudev\
66420+ --enable-vout-drm\
66421+ --enable-vout-egl\
66422+ $SHARED_LIBS\
66423+ --extra-cflags="-ggdb"
66424+
66425+# --enable-decoder=hevc_rpi\
66426+# --enable-extra-warnings\
66427+# --arch=armv71\
66428+
66429+# gcc option for getting asm listing
66430+# -Wa,-ahls
66431--- /dev/null
66432+++ b/pi-util/conf_h265.2016.csv
66433@@ -0,0 +1,195 @@
66434+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
66435+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
66436+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
66437+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
66438+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
66439+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
66440+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
66441+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
66442+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
66443+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
66444+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
66445+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
66446+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
66447+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
66448+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
66449+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
66450+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
66451+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
66452+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
66453+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
66454+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
66455+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
66456+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
66457+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
66458+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
66459+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
66460+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
66461+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
66462+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
66463+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
66464+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
66465+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
66466+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
66467+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
66468+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
66469+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
66470+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
66471+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
66472+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
66473+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
66474+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
66475+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
66476+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
66477+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
66478+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
66479+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
66480+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
66481+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
66482+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
66483+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
66484+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
66485+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
66486+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
66487+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
66488+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
66489+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
66490+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
66491+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
66492+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
66493+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
66494+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
66495+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
66496+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
66497+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
66498+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
66499+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
66500+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
66501+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
66502+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
66503+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
66504+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
66505+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
66506+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
66507+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
66508+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
66509+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
66510+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
66511+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
66512+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
66513+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
66514+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
66515+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
66516+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
66517+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
66518+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
66519+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
66520+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
66521+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
66522+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
66523+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
66524+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
66525+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
66526+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
66527+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
66528+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
66529+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
66530+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
66531+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
66532+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
66533+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
66534+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
66535+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
66536+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
66537+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
66538+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
66539+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
66540+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
66541+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
66542+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
66543+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
66544+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
66545+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
66546+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
66547+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
66548+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
66549+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
66550+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
66551+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
66552+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
66553+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
66554+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
66555+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
66556+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
66557+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
66558+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
66559+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
66560+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
66561+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
66562+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
66563+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
66564+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
66565+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
66566+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
66567+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
66568+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
66569+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
66570+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
66571+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
66572+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
66573+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
66574+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
66575+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
66576+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
66577+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
66578+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
66579+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
66580+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
66581+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
66582+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
66583+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
66584+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
66585+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
66586+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
66587+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
66588+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
66589+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
66590+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
66591+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
66592+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
66593+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
66594+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
66595+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
66596+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
66597+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
66598+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
66599+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
66600+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
66601+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
66602+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
66603+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
66604+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
66605+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
66606+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
66607+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
66608+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
66609+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
66610+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
66611+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
66612+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
66613+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
66614+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
66615+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
66616+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
66617+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
66618+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
66619+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
66620+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
66621+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
66622+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
66623+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
66624+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
66625+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
66626+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
66627+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
66628+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
66629--- /dev/null
66630+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
66631@@ -0,0 +1,147 @@
66632+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
66633+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
66634+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
66635+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
66636+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
66637+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
66638+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
66639+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
66640+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
66641+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
66642+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
66643+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
66644+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
66645+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
66646+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
66647+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
66648+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
66649+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
66650+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
66651+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
66652+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
66653+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
66654+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
66655+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
66656+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
66657+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
66658+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
66659+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
66660+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
66661+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
66662+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
66663+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
66664+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
66665+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
66666+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
66667+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
66668+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
66669+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
66670+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
66671+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
66672+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
66673+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
66674+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
66675+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
66676+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
66677+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
66678+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
66679+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
66680+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
66681+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
66682+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
66683+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
66684+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
66685+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
66686+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
66687+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
66688+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
66689+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
66690+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
66691+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
66692+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
66693+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
66694+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
66695+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
66696+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
66697+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
66698+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
66699+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
66700+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
66701+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
66702+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
66703+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
66704+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
66705+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
66706+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
66707+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
66708+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
66709+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
66710+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
66711+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
66712+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
66713+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
66714+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
66715+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
66716+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
66717+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
66718+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
66719+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
66720+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
66721+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
66722+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
66723+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
66724+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
66725+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
66726+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
66727+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
66728+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
66729+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
66730+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
66731+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
66732+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
66733+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
66734+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
66735+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
66736+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
66737+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
66738+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
66739+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
66740+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
66741+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
66742+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
66743+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
66744+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
66745+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
66746+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
66747+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
66748+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
66749+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
66750+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
66751+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
66752+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
66753+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
66754+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
66755+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
66756+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
66757+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
66758+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
66759+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
66760+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
66761+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
66762+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
66763+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
66764+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
66765+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
66766+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
66767+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
66768+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
66769+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
66770+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
66771+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
66772+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
66773+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
66774+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
66775+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
66776+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
66777+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
66778+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
66779--- /dev/null
66780+++ b/pi-util/conf_h265.csv
66781@@ -0,0 +1,144 @@
66782+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
66783+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
66784+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
66785+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
66786+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
66787+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
66788+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
66789+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
66790+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
66791+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
66792+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
66793+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
66794+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
66795+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
66796+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
66797+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
66798+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
66799+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
66800+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
66801+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
66802+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
66803+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
66804+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
66805+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
66806+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
66807+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
66808+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
66809+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
66810+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
66811+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
66812+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
66813+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
66814+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
66815+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
66816+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
66817+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
66818+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
66819+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
66820+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
66821+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
66822+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
66823+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
66824+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
66825+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
66826+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
66827+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
66828+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
66829+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
66830+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
66831+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
66832+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
66833+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
66834+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
66835+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
66836+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
66837+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
66838+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
66839+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
66840+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
66841+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
66842+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
66843+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
66844+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
66845+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
66846+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
66847+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
66848+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
66849+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
66850+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
66851+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
66852+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
66853+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
66854+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
66855+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
66856+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
66857+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
66858+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
66859+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
66860+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
66861+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
66862+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
66863+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
66864+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
66865+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
66866+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
66867+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
66868+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
66869+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
66870+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
66871+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
66872+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
66873+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
66874+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
66875+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
66876+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
66877+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
66878+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
66879+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
66880+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
66881+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
66882+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
66883+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
66884+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
66885+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
66886+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
66887+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
66888+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
66889+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
66890+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
66891+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
66892+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
66893+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
66894+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
66895+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
66896+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
66897+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
66898+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
66899+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
66900+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
66901+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
66902+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
66903+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
66904+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
66905+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
66906+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
66907+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
66908+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
66909+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
66910+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
66911+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
66912+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
66913+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
66914+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
66915+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
66916+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
66917+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
66918+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
66919+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
66920+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
66921+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
66922+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
66923+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
66924+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
66925+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
66926--- /dev/null
66927+++ b/pi-util/conf_native.sh
66928@@ -0,0 +1,106 @@
66929+echo "Configure for native build"
66930+
66931+FFSRC=`pwd`
66932+MC=`dpkg --print-architecture`
66933+BUILDBASE=$FFSRC/out
66934+
66935+#RPI_KEEPS="-save-temps=obj"
66936+RPI_KEEPS=""
66937+
66938+NOSHARED=
66939+MMAL=
66940+
66941+while [ "$1" != "" ] ; do
66942+    case $1 in
66943+	--noshared)
66944+	    NOSHARED=1
66945+	    ;;
66946+	--mmal)
66947+	    MMAL=1
66948+	    ;;
66949+	*)
66950+	    echo "Usage $0: [--noshared] [--mmal]"
66951+	    exit 1
66952+	    ;;
66953+    esac
66954+    shift
66955+done
66956+
66957+
66958+MCOPTS=
66959+RPI_INCLUDES=
66960+RPI_LIBDIRS=
66961+RPI_DEFINES=
66962+RPI_EXTRALIBS=
66963+
66964+if [ "$MC" == "arm64" ]; then
66965+  echo "M/C aarch64"
66966+  A=aarch64-linux-gnu
66967+  B=arm64
66968+elif [ "$MC" == "armhf" ]; then
66969+  echo "M/C armv7"
66970+  A=arm-linux-gnueabihf
66971+  B=armv7
66972+  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
66973+  RPI_DEFINES=-mfpu=neon-vfpv4
66974+else
66975+  echo Unexpected architecture $MC
66976+  exit 1
66977+fi
66978+
66979+if [ $MMAL ]; then
66980+  RPI_OPT_VC=/opt/vc
66981+  RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
66982+  RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
66983+  RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
66984+  RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
66985+  RPIOPTS="--enable-mmal --enable-rpi"
66986+else
66987+  RPIOPTS="--disable-mmal --enable-sand"
66988+fi
66989+
66990+C=`lsb_release -sc`
66991+V=`cat RELEASE`
66992+
66993+SHARED_LIBS="--enable-shared"
66994+if [ $NOSHARED ]; then
66995+  SHARED_LIBS="--disable-shared"
66996+  OUT=$BUILDBASE/$B-$C-$V-static-rel
66997+  echo Static libs
66998+else
66999+  echo Shared libs
67000+  OUT=$BUILDBASE/$B-$C-$V-shared-rel
67001+fi
67002+
67003+USR_PREFIX=$OUT/install
67004+LIB_PREFIX=$USR_PREFIX/lib/$A
67005+INC_PREFIX=$USR_PREFIX/include/$A
67006+
67007+echo Destination directory: $OUT
67008+mkdir -p $OUT
67009+# Nothing under here need worry git - including this .gitignore!
67010+echo "**" > $BUILDBASE/.gitignore
67011+cd $OUT
67012+
67013+$FFSRC/configure \
67014+ --prefix=$USR_PREFIX\
67015+ --libdir=$LIB_PREFIX\
67016+ --incdir=$INC_PREFIX\
67017+ $MCOPTS\
67018+ --disable-stripping\
67019+ --disable-thumb\
67020+ --enable-v4l2-request\
67021+ --enable-libdrm\
67022+ --enable-vout-egl\
67023+ --enable-vout-drm\
67024+ $SHARED_LIBS\
67025+ $RPIOPTS\
67026+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
67027+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
67028+ --extra-ldflags="$RPI_LIBDIRS"\
67029+ --extra-libs="$RPI_EXTRALIBS"\
67030+ --extra-version="rpi"
67031+
67032+
67033+# gcc option for getting asm listing
67034+# -Wa,-ahls
67035--- /dev/null
67036+++ b/pi-util/ffconf.py
67037@@ -0,0 +1,215 @@
67038+#!/usr/bin/env python3
67039+
67040+import string
67041+import os
67042+import subprocess
67043+import re
67044+import argparse
67045+import sys
67046+import csv
67047+from stat import *
67048+
67049+CODEC_HEVC_RPI  = 1
67050+HWACCEL_RPI     = 2
67051+HWACCEL_DRM     = 3
67052+HWACCEL_VAAPI   = 4
67053+
67054+def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
67055+    hwaccel = ""
67056+    if dectype == HWACCEL_RPI:
67057+        hwaccel = "rpi"
67058+    elif dectype == HWACCEL_DRM:
67059+        hwaccel = "drm"
67060+    elif dectype == HWACCEL_VAAPI:
67061+        hwaccel = "vaapi"
67062+
67063+    pix_fmt = []
67064+    if pix == "8":
67065+        pix_fmt = ["-pix_fmt", "yuv420p"]
67066+    elif pix == "10":
67067+        pix_fmt = ["-pix_fmt", "yuv420p10le"]
67068+    elif pix == "12":
67069+        pix_fmt = ["-pix_fmt", "yuv420p12le"]
67070+
67071+    tmp_root = "/tmp"
67072+
67073+    names = srcname.split('/')
67074+    while len(names) > 1:
67075+        tmp_root = os.path.join(tmp_root, names[0])
67076+        del names[0]
67077+    name = names[0]
67078+
67079+    if not os.path.exists(tmp_root):
67080+        os.makedirs(tmp_root)
67081+
67082+    dec_file = os.path.join(tmp_root, name + ".dec.md5")
67083+    try:
67084+        os.remove(dec_file)
67085+    except:
67086+        pass
67087+
67088+    flog = open(os.path.join(tmp_root, name + ".log"), "wt")
67089+
67090+    ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
67091+
67092+    # Unaligned needed for cropping conformance
67093+    if hwaccel:
67094+        rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
67095+    else:
67096+        rstr = subprocess.call(
67097+            [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
67098+            stdout=flog, stderr=subprocess.STDOUT)
67099+
67100+    try:
67101+        m1 = None
67102+        m2 = None
67103+        with open(os.path.join(fileroot, md5_file)) as f:
67104+            for line in f:
67105+                m1 = re.search("[0-9a-f]{32}", line.lower())
67106+                if m1:
67107+                    break
67108+
67109+        with open(dec_file) as f:
67110+            m2 = re.search("[0-9a-f]{32}", f.readline())
67111+    except:
67112+        pass
67113+
67114+    if  m1 and m2 and m1.group() == m2.group():
67115+        print("Match: " + m1.group(), file=flog)
67116+        rv = 0
67117+    elif not m1:
67118+        print("****** Cannot find m1", file=flog)
67119+        rv = 3
67120+    elif not m2:
67121+        print("****** Cannot find m2", file=flog)
67122+        rv = 2
67123+    else:
67124+        print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
67125+        rv = 1
67126+    flog.close()
67127+    return rv
67128+
67129+def scandir(root):
67130+    aconf = []
67131+    ents = os.listdir(root)
67132+    ents.sort(key=str.lower)
67133+    for name in ents:
67134+        test_path = os.path.join(root, name)
67135+        if S_ISDIR(os.stat(test_path).st_mode):
67136+            files = os.listdir(test_path)
67137+            es_file = "?"
67138+            md5_file = "?"
67139+            for f in files:
67140+                (base, ext) = os.path.splitext(f)
67141+                if base[0] == '.':
67142+                    pass
67143+                elif ext == ".bit" or ext == ".bin":
67144+                    es_file = f
67145+                elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
67146+                    if md5_file == "?":
67147+                        md5_file = f
67148+                    elif base[-3:] == "yuv":
67149+                        md5_file = f
67150+            aconf.append((1, name, es_file, md5_file))
67151+    return aconf
67152+
67153+def runtest(name, tests):
67154+    if not tests:
67155+        return True
67156+    for t in tests:
67157+        if name[0:len(t)] == t or name.find("/" + t) != -1:
67158+            return True
67159+    return False
67160+
67161+def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
67162+    unx_failures = []
67163+    unx_success = []
67164+    failures = 0
67165+    successes = 0
67166+    for a in csva:
67167+        exp_test = int(a[0])
67168+        if (exp_test and runtest(a[1], tests)):
67169+            name = a[1]
67170+            print ("==== ", name, end="")
67171+            sys.stdout.flush()
67172+
67173+            rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
67174+            if (rv == 0):
67175+                successes += 1
67176+            else:
67177+                failures += 1
67178+
67179+            if (rv == 0):
67180+                if exp_test == 2:
67181+                    print(": * OK *")
67182+                    unx_success.append(name)
67183+                else:
67184+                    print(": ok")
67185+            elif exp_test == 2 and rv == 1:
67186+                print(": fail")
67187+            elif exp_test == 3 and rv == 2:
67188+                # Call an expected "crash" an abort
67189+                print(": abort")
67190+            else:
67191+                unx_failures.append(name)
67192+                if rv == 1:
67193+                    print(": * FAIL *")
67194+                elif (rv == 2) :
67195+                    print(": * CRASH *")
67196+                elif (rv == 3) :
67197+                    print(": * MD5 MISSING *")
67198+                else :
67199+                    print(": * BANG *")
67200+
67201+    if unx_failures or unx_success:
67202+        print("Unexpected Failures:", unx_failures)
67203+        print("Unexpected Success: ", unx_success)
67204+    else:
67205+        print("All tests normal:", successes, "ok,", failures, "failed")
67206+
67207+
67208+class ConfCSVDialect(csv.Dialect):
67209+    delimiter = ','
67210+    doublequote = True
67211+    lineterminator = '\n'
67212+    quotechar='"'
67213+    quoting = csv.QUOTE_MINIMAL
67214+    skipinitialspace = True
67215+    strict = True
67216+
67217+if __name__ == '__main__':
67218+
67219+    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
67220+    argp.add_argument("tests", nargs='*')
67221+    argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
67222+    argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
67223+    argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
67224+    argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
67225+    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
67226+    argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
67227+    argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
67228+    argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
67229+    args = argp.parse_args()
67230+
67231+    if args.csvgen:
67232+        csv.writer(sys.stdout).writerows(scandir(args.test_root))
67233+        exit(0)
67234+
67235+    with open(args.csv, 'rt') as csvfile:
67236+        csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
67237+
67238+    dectype = CODEC_HEVC_RPI
67239+    if os.path.exists("/dev/rpivid-hevcmem"):
67240+        dectype = HWACCEL_RPI
67241+    if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
67242+        dectype = HWACCEL_DRM
67243+
67244+    if args.pi4:
67245+        dectype = HWACCEL_RPI
67246+    elif args.drm:
67247+        dectype = HWACCEL_DRM
67248+    elif args.vaapi:
67249+        dectype = HWACCEL_VAAPI
67250+
67251+    doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
67252+
67253--- /dev/null
67254+++ b/pi-util/ffperf.py
67255@@ -0,0 +1,128 @@
67256+#!/usr/bin/env python3
67257+
67258+import time
67259+import string
67260+import os
67261+import tempfile
67262+import subprocess
67263+import re
67264+import argparse
67265+import sys
67266+import csv
67267+from stat import *
67268+
67269+class tstats:
67270+    close_threshold = 0.01
67271+
67272+    def __init__(self, stats_dict=None):
67273+        if stats_dict != None:
67274+            self.name = stats_dict["name"]
67275+            self.elapsed = float(stats_dict["elapsed"])
67276+            self.user = float(stats_dict["user"])
67277+            self.sys = float(stats_dict["sys"])
67278+
67279+    def times_str(self):
67280+        ctime = self.sys + self.user
67281+        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
67282+
67283+    def dict(self):
67284+        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
67285+
67286+    def is_close(self, other):
67287+        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
67288+
67289+    def __lt__(self, other):
67290+        return self.elapsed < other.elapsed
67291+    def __gt__(self, other):
67292+        return self.elapsed > other.elapsed
67293+
67294+    def time_file(name, prefix, ffmpeg="./ffmpeg"):
67295+        stats = tstats()
67296+        stats.name = name
67297+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
67298+        cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
67299+                                  "-vcodec", "hevc_rpi",
67300+                                  "-t", "30", "-i", prefix + name,
67301+                                  "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
67302+        pinfo = os.wait4(cproc.pid, 0)
67303+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
67304+        stats.elapsed = end_time - start_time
67305+        stats.user = pinfo[2].ru_utime
67306+        stats.sys = pinfo[2].ru_stime
67307+        return stats
67308+
67309+
67310+def common_prefix(s1, s2):
67311+    for i in range(min(len(s1),len(s2))):
67312+        if s1[i] != s2[i]:
67313+            return s1[:i]
67314+    return s1[:i+1]
67315+
67316+def main():
67317+    global flog
67318+
67319+    argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
67320+To blank the screen before starting use "xdg-screensaver activate"
67321+(For some reason this doesn't seem to work from within python).
67322+""")
67323+
67324+    argp.add_argument("streams", nargs='*')
67325+    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
67326+    argp.add_argument("--csv_in", help="CSV input filename")
67327+    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
67328+    argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
67329+    argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
67330+
67331+    args = argp.parse_args()
67332+
67333+    csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
67334+    csv_out.writeheader()
67335+
67336+    stats_in = {}
67337+    if args.csv_in != None:
67338+        with open(args.csv_in, 'r', newline='') as f_in:
67339+            stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
67340+
67341+    flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
67342+
67343+    streams = args.streams
67344+    if not streams:
67345+        if not stats_in:
67346+            print ("No source streams specified")
67347+            return 1
67348+        prefix = "" if args.prefix == None else args.prefix
67349+        streams = [k for k in stats_in]
67350+    elif args.prefix != None:
67351+        prefix = args.prefix
67352+    else:
67353+        prefix = streams[0]
67354+        for f in streams[1:]:
67355+            prefix = common_prefix(prefix, f)
67356+        pp = prefix.rpartition(os.sep)
67357+        prefix = pp[0] + pp[1]
67358+        streams = [s[len(prefix):] for s in streams]
67359+
67360+    for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
67361+        print ("====", f)
67362+
67363+        t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
67364+        for i in range(args.repeat):
67365+            t = tstats.time_file(f, prefix, args.ffmpeg)
67366+            print ("...", t.times_str())
67367+            if t0 > t:
67368+                t0 = t
67369+
67370+        if t0.name in stats_in:
67371+            pstat = stats_in[t0.name]
67372+            print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
67373+
67374+        csv_out.writerow(t0.dict())
67375+
67376+        print ()
67377+
67378+    return 0
67379+
67380+
67381+if __name__ == '__main__':
67382+    exit(main())
67383+
67384--- /dev/null
67385+++ b/pi-util/genpatch.sh
67386@@ -0,0 +1,35 @@
67387+set -e
67388+
67389+NOPATCH=
67390+if [ "$1" == "--notag" ]; then
67391+  shift
67392+  NOPATCH=1
67393+fi
67394+
67395+if [ "$1" == "" ]; then
67396+  echo Usage: $0 [--notag] \<patch_tag\>
67397+  echo e.g.: $0 mmal_4
67398+  exit 1
67399+fi
67400+
67401+VERSION=`cat RELEASE`
67402+if [ "$VERSION" == "" ]; then
67403+  echo Can\'t find version RELEASE
67404+  exit 1
67405+fi
67406+
67407+PATCHFILE=../ffmpeg-$VERSION-$1.patch
67408+
67409+if [ $NOPATCH ]; then
67410+  echo Not tagged
67411+else
67412+  # Only continue if we are all committed
67413+  git diff --name-status --exit-code
67414+
67415+  PATCHTAG=pi/$VERSION/$1
67416+  echo Tagging: $PATCHTAG
67417+
67418+  git tag $PATCHTAG
67419+fi
67420+echo Generating patch: $PATCHFILE
67421+git diff n$VERSION -- > $PATCHFILE
67422--- /dev/null
67423+++ b/pi-util/make_array.py
67424@@ -0,0 +1,23 @@
67425+#!/usr/bin/env python
67426+
67427+# Usage
67428+#   make_array file.bin
67429+#   Produces file.h with array of bytes.
67430+#
67431+import sys
67432+for file in sys.argv[1:]:
67433+  prefix,suffix = file.split('.')
67434+  assert suffix=='bin'
67435+  name=prefix.split('/')[-1]
67436+  print 'Converting',file
67437+  with open(prefix+'.h','wb') as out:
67438+    print >>out, 'static const unsigned char',name,'[] = {'
67439+    with open(file,'rb') as fd:
67440+      i = 0
67441+      for byte in fd.read():
67442+        print >>out, '0x%02x, ' % ord(byte),
67443+        i = i + 1
67444+        if i % 8 == 0:
67445+          print >>out, ' // %04x' % (i - 8)
67446+    print >>out,'};'
67447+
67448--- /dev/null
67449+++ b/pi-util/mkinst.sh
67450@@ -0,0 +1,5 @@
67451+set -e
67452+
67453+make install
67454+
67455+cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
67456--- /dev/null
67457+++ b/pi-util/patkodi.sh
67458@@ -0,0 +1,9 @@
67459+set -e
67460+KODIBASE=/home/jc/rpi/kodi/xbmc
67461+JOBS=-j20
67462+make $JOBS
67463+git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
67464+make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
67465+make -C $KODIBASE/build install
67466+
67467+
67468--- /dev/null
67469+++ b/pi-util/perfcmp.py
67470@@ -0,0 +1,101 @@
67471+#!/usr/bin/env python3
67472+
67473+import time
67474+import string
67475+import os
67476+import tempfile
67477+import subprocess
67478+import re
67479+import argparse
67480+import sys
67481+import csv
67482+from stat import *
67483+
67484+class tstats:
67485+    close_threshold = 0.01
67486+
67487+    def __init__(self, stats_dict=None):
67488+        if stats_dict != None:
67489+            self.name = stats_dict["name"]
67490+            self.elapsed = float(stats_dict["elapsed"])
67491+            self.user = float(stats_dict["user"])
67492+            self.sys = float(stats_dict["sys"])
67493+
67494+    def times_str(self):
67495+        ctime = self.sys + self.user
67496+        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
67497+
67498+    def dict(self):
67499+        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
67500+
67501+    def is_close(self, other):
67502+        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
67503+
67504+    def __lt__(self, other):
67505+        return self.elapsed < other.elapsed
67506+    def __gt__(self, other):
67507+        return self.elapsed > other.elapsed
67508+
67509+    def time_file(name, prefix):
67510+        stats = tstats()
67511+        stats.name = name
67512+        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
67513+        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
67514+                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
67515+        pinfo = os.wait4(cproc.pid, 0)
67516+        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
67517+        stats.elapsed = end_time - start_time
67518+        stats.user = pinfo[2].ru_utime
67519+        stats.sys = pinfo[2].ru_stime
67520+        return stats
67521+
67522+
67523+def common_prefix(s1, s2):
67524+    for i in range(min(len(s1),len(s2))):
67525+        if s1[i] != s2[i]:
67526+            return s1[:i]
67527+    return s1[:i+1]
67528+
67529+def main():
67530+    argp = argparse.ArgumentParser(description="FFmpeg performance compare")
67531+
67532+    argp.add_argument("stream0", help="CSV to compare")
67533+    argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
67534+
67535+    args = argp.parse_args()
67536+
67537+    with open(args.stream0, 'r', newline='') as f_in:
67538+        stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
67539+    with open(args.stream1, 'r', newline='') as f_in:
67540+        stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
67541+
67542+    print (args.stream0, "<<-->>", args.stream1)
67543+    print ()
67544+
67545+    for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
67546+       if not (f in stats0) :
67547+           print ("           XX               :", f)
67548+           continue
67549+       if not (f in stats1) :
67550+           print ("       XX                   :", f)
67551+           continue
67552+
67553+       s0 = stats0[f]
67554+       s1 = stats1[f]
67555+
67556+       pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
67557+       thresh = 0.3
67558+       tc = 6
67559+
67560+       nchar = min(tc - 1, int(abs(pcent) / thresh))
67561+       cc = "  --  " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
67562+
67563+       print ("%6.2f %s%6.2f (%+5.2f) : %s" %
67564+           (s0.elapsed, cc, s1.elapsed, pcent, f))
67565+
67566+    return 0
67567+
67568+
67569+if __name__ == '__main__':
67570+    exit(main())
67571+
67572--- /dev/null
67573+++ b/pi-util/qem.sh
67574@@ -0,0 +1,9 @@
67575+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
67576+QASM=python\ ../local/bin/qasm.py
67577+SRC_FILE=libavcodec/rpi_hevc_shader.qasm
67578+DST_BASE=shader
67579+
67580+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
67581+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
67582+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
67583+
67584--- /dev/null
67585+++ b/pi-util/v3dusage.py
67586@@ -0,0 +1,128 @@
67587+#!/usr/bin/env python
67588+
67589+import sys
67590+import argparse
67591+import re
67592+
67593+def do_logparse(logname):
67594+
67595+    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
67596+    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
67597+    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
67598+    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
67599+
67600+    ttotal = {'idle':0.0}
67601+    tstart = {}
67602+    qctotal = {}
67603+    qtstotal = {}
67604+    l2hits = {}
67605+    l2total = {}
67606+    time0 = None
67607+    idle_start = None
67608+    qpu_op_no = 0
67609+    op_count = 0
67610+
67611+    with open(logname, "rt") as infile:
67612+        for line in infile:
67613+            match = rmatch.match(line)
67614+            if match:
67615+#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
67616+                time = float(match.group(1))
67617+                unit = match.group(3)
67618+                opstart = not match.group(2)
67619+                optype = match.group(7)
67620+                hascb = match.group(8) != "0"
67621+
67622+                if unit == 'qpu1':
67623+                    unit = unit + "." + str(qpu_op_no)
67624+                    if not opstart:
67625+                        if hascb or optype == 'EXECUTE_SYNC':
67626+                            qpu_op_no = 0
67627+                        else:
67628+                            qpu_op_no += 1
67629+
67630+                # Ignore sync type
67631+                if optype == 'EXECUTE_SYNC':
67632+                    continue
67633+
67634+                if not time0:
67635+                    time0 = time
67636+
67637+                if opstart:
67638+                    tstart[unit] = time;
67639+                elif unit in tstart:
67640+                    op_count += 1
67641+                    if not unit in ttotal:
67642+                        ttotal[unit] = 0.0
67643+                    ttotal[unit] += time - tstart[unit]
67644+                    del tstart[unit]
67645+
67646+                if not idle_start and not tstart:
67647+                    idle_start = time
67648+                elif idle_start and tstart:
67649+                    ttotal['idle'] += time - idle_start
67650+                    idle_start = None
67651+
67652+            match = rqcycle.match(line)
67653+            if match:
67654+                unit = "qpu1." + str(qpu_op_no)
67655+                if not unit in qctotal:
67656+                    qctotal[unit] = 0
67657+                qctotal[unit] += int(match.group(2))
67658+
67659+            match = rqtscycle.match(line)
67660+            if match:
67661+                unit = "qpu1." + str(qpu_op_no)
67662+                if not unit in qtstotal:
67663+                    qtstotal[unit] = 0
67664+                qtstotal[unit] += int(match.group(2))
67665+
67666+            match = rl2hits.match(line)
67667+            if match:
67668+                unit = "qpu1." + str(qpu_op_no)
67669+                if not unit in l2total:
67670+                    l2total[unit] = 0
67671+                    l2hits[unit] = 0
67672+                l2total[unit] += int(match.group(3))
67673+                if match.group(2) == "hits":
67674+                    l2hits[unit] += int(match.group(3))
67675+
67676+
67677+    if not time0:
67678+        print "No v3d profile records found"
67679+    else:
67680+        tlogged = time - time0
67681+
67682+        print "Logged time:", tlogged, "  Op count:", op_count
67683+        for unit in sorted(ttotal):
67684+            print b'%6s: %10.3f    %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
67685+        print
67686+        for unit in sorted(qctotal):
67687+            if not unit in qtstotal:
67688+                qtstotal[unit] = 0;
67689+            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
67690+            if unit in l2total:
67691+                print b'        L2Total: %10d, hits:      %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
67692+
67693+
67694+
67695+if __name__ == '__main__':
67696+    argp = argparse.ArgumentParser(
67697+        formatter_class=argparse.RawDescriptionHelpFormatter,
67698+        description="QPU/VPU perf summary from VC logging",
67699+        epilog = """
67700+Will also summarise TMU stalls if logging requests set in qpu noflush param
67701+in the profiled code.
67702+
67703+Example use:
67704+  vcgencmd set_logging level=0xc0
67705+  <command to profile>
67706+  sudo vcdbg log msg >& t.log
67707+  v3dusage.py t.log
67708+""")
67709+
67710+    argp.add_argument("logfile")
67711+    args = argp.parse_args()
67712+
67713+    do_logparse(args.logfile)
67714+
67715--- a/tests/checkasm/Makefile
67716+++ b/tests/checkasm/Makefile
67717@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP)
67718 AVCODECOBJS-$(CONFIG_H264DSP)           += h264dsp.o
67719 AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
67720 AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
67721+AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
67722 AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
67723 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
67724+AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
67725 AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
67726 AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
67727
67728--- a/tests/checkasm/checkasm.c
67729+++ b/tests/checkasm/checkasm.c
67730@@ -121,6 +121,9 @@ static const struct {
67731     #if CONFIG_HUFFYUV_DECODER
67732         { "huffyuvdsp", checkasm_check_huffyuvdsp },
67733     #endif
67734+    #if CONFIG_IDCTDSP
67735+        { "idctdsp", checkasm_check_idctdsp },
67736+    #endif
67737     #if CONFIG_JPEG2000_DECODER
67738         { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
67739     #endif
67740@@ -145,6 +148,9 @@ static const struct {
67741     #if CONFIG_V210_ENCODER
67742         { "v210enc", checkasm_check_v210enc },
67743     #endif
67744+    #if CONFIG_VC1DSP
67745+        { "vc1dsp", checkasm_check_vc1dsp },
67746+    #endif
67747     #if CONFIG_VP8DSP
67748         { "vp8dsp", checkasm_check_vp8dsp },
67749     #endif
67750--- a/tests/checkasm/checkasm.h
67751+++ b/tests/checkasm/checkasm.h
67752@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void);
67753 void checkasm_check_hevc_idct(void);
67754 void checkasm_check_hevc_sao(void);
67755 void checkasm_check_huffyuvdsp(void);
67756+void checkasm_check_idctdsp(void);
67757 void checkasm_check_jpeg2000dsp(void);
67758 void checkasm_check_llviddsp(void);
67759 void checkasm_check_llviddspenc(void);
67760@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void);
67761 void checkasm_check_utvideodsp(void);
67762 void checkasm_check_v210dec(void);
67763 void checkasm_check_v210enc(void);
67764+void checkasm_check_vc1dsp(void);
67765 void checkasm_check_vf_eq(void);
67766 void checkasm_check_vf_gblur(void);
67767 void checkasm_check_vf_hflip(void);
67768--- /dev/null
67769+++ b/tests/checkasm/idctdsp.c
67770@@ -0,0 +1,98 @@
67771+/*
67772+ * Copyright (c) 2022 Ben Avison
67773+ *
67774+ * This file is part of FFmpeg.
67775+ *
67776+ * FFmpeg is free software; you can redistribute it and/or modify
67777+ * it under the terms of the GNU General Public License as published by
67778+ * the Free Software Foundation; either version 2 of the License, or
67779+ * (at your option) any later version.
67780+ *
67781+ * FFmpeg is distributed in the hope that it will be useful,
67782+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
67783+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
67784+ * GNU General Public License for more details.
67785+ *
67786+ * You should have received a copy of the GNU General Public License along
67787+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
67788+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
67789+ */
67790+
67791+#include <string.h>
67792+
67793+#include "checkasm.h"
67794+
67795+#include "libavcodec/idctdsp.h"
67796+
67797+#include "libavutil/common.h"
67798+#include "libavutil/internal.h"
67799+#include "libavutil/intreadwrite.h"
67800+#include "libavutil/mem_internal.h"
67801+
67802+#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
67803+
67804+typedef struct {
67805+    const char *name;
67806+    size_t offset;
67807+} test;
67808+
67809+#define RANDOMIZE_BUFFER16(name, size)          \
67810+    do {                                        \
67811+        int i;                                  \
67812+        for (i = 0; i < size; ++i) {            \
67813+            uint16_t r = rnd() % 0x201 - 0x100; \
67814+            AV_WN16A(name##0 + i, r);           \
67815+            AV_WN16A(name##1 + i, r);           \
67816+        }                                       \
67817+    } while (0)
67818+
67819+#define RANDOMIZE_BUFFER8(name, size)         \
67820+    do {                                      \
67821+        int i;                                \
67822+        for (i = 0; i < size; ++i) {          \
67823+            uint8_t r = rnd();                \
67824+            name##0[i] = r;                   \
67825+            name##1[i] = r;                   \
67826+        }                                     \
67827+    } while (0)
67828+
67829+static void check_add_put_clamped(void)
67830+{
67831+    /* Source buffers are only as big as needed, since any over-read won't affect results */
67832+    LOCAL_ALIGNED_16(int16_t, src0, [64]);
67833+    LOCAL_ALIGNED_16(int16_t, src1, [64]);
67834+    /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
67835+    LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
67836+    LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
67837+
67838+    AVCodecContext avctx = { 0 };
67839+    IDCTDSPContext h;
67840+
67841+    const test tests[] = {
67842+        IDCTDSP_TEST(add_pixels_clamped)
67843+        IDCTDSP_TEST(put_pixels_clamped)
67844+        IDCTDSP_TEST(put_signed_pixels_clamped)
67845+    };
67846+
67847+    ff_idctdsp_init(&h, &avctx);
67848+
67849+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
67850+        void (*func)(const int16_t *, uint8_t *, ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset);
67851+        if (check_func(func, "idctdsp.%s", tests[t].name)) {
67852+            declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
67853+            RANDOMIZE_BUFFER16(src, 64);
67854+            RANDOMIZE_BUFFER8(dst, 10 * 24);
67855+            call_ref(src0, dst0 + 24 + 8, 24);
67856+            call_new(src1, dst1 + 24 + 8, 24);
67857+            if (memcmp(dst0, dst1, 10 * 24))
67858+                fail();
67859+            bench_new(src1, dst1 + 24 + 8, 24);
67860+        }
67861+    }
67862+}
67863+
67864+void checkasm_check_idctdsp(void)
67865+{
67866+    check_add_put_clamped();
67867+    report("idctdsp");
67868+}
67869--- /dev/null
67870+++ b/tests/checkasm/vc1dsp.c
67871@@ -0,0 +1,452 @@
67872+/*
67873+ * Copyright (c) 2022 Ben Avison
67874+ *
67875+ * This file is part of FFmpeg.
67876+ *
67877+ * FFmpeg is free software; you can redistribute it and/or modify
67878+ * it under the terms of the GNU General Public License as published by
67879+ * the Free Software Foundation; either version 2 of the License, or
67880+ * (at your option) any later version.
67881+ *
67882+ * FFmpeg is distributed in the hope that it will be useful,
67883+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
67884+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
67885+ * GNU General Public License for more details.
67886+ *
67887+ * You should have received a copy of the GNU General Public License along
67888+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
67889+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
67890+ */
67891+
67892+#include <string.h>
67893+
67894+#include "checkasm.h"
67895+
67896+#include "libavcodec/vc1dsp.h"
67897+
67898+#include "libavutil/common.h"
67899+#include "libavutil/internal.h"
67900+#include "libavutil/intreadwrite.h"
67901+#include "libavutil/mem_internal.h"
67902+
67903+#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
67904+#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
67905+
67906+typedef struct {
67907+    const char *name;
67908+    size_t offset;
67909+    int width;
67910+    int height;
67911+} test;
67912+
67913+typedef struct matrix {
67914+    size_t width;
67915+    size_t height;
67916+    float d[];
67917+} matrix;
67918+
67919+static const matrix T8 = { 8, 8, {
67920+        12,  12,  12,  12,  12,  12,  12,  12,
67921+        16,  15,   9,   4,  -4,  -9, -15, -16,
67922+        16,   6,  -6, -16, -16,  -6,   6,  16,
67923+        15,  -4, -16,  -9,   9,  16,   4, -15,
67924+        12, -12, -12,  12,  12, -12, -12,  12,
67925+         9, -16,   4,  15, -15,  -4,  16,  -9,
67926+         6, -16,  16,  -6,  -6,  16, -16,   6,
67927+         4,  -9,  15, -16,  16, -15,   9,  -4
67928+} };
67929+
67930+static const matrix T4 = { 4, 4, {
67931+        17,  17,  17,  17,
67932+        22,  10, -10, -22,
67933+        17, -17, -17,  17,
67934+        10, -22,  22, -10
67935+} };
67936+
67937+static const matrix T8t = { 8, 8, {
67938+        12,  16,  16,  15,  12,   9,   6,   4,
67939+        12,  15,   6,  -4, -12, -16, -16,  -9,
67940+        12,   9,  -6, -16, -12,   4,  16,  15,
67941+        12,   4, -16,  -9,  12,  15,  -6, -16,
67942+        12,  -4, -16,   9,  12, -15,  -6,  16,
67943+        12,  -9,  -6,  16, -12,  -4,  16, -15,
67944+        12, -15,   6,   4, -12,  16, -16,   9,
67945+        12, -16,  16, -15,  12,  -9,   6,  -4
67946+} };
67947+
67948+static const matrix T4t = { 4, 4, {
67949+        17,  22,  17,  10,
67950+        17,  10, -17, -22,
67951+        17, -10, -17,  22,
67952+        17, -22,  17, -10
67953+} };
67954+
67955+static matrix *new_matrix(size_t width, size_t height)
67956+{
67957+    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
67958+    if (out == NULL) {
67959+        fprintf(stderr, "Memory allocation failure\n");
67960+        exit(EXIT_FAILURE);
67961+    }
67962+    out->width = width;
67963+    out->height = height;
67964+    return out;
67965+}
67966+
67967+static matrix *multiply(const matrix *a, const matrix *b)
67968+{
67969+    matrix *out;
67970+    if (a->width != b->height) {
67971+        fprintf(stderr, "Incompatible multiplication\n");
67972+        exit(EXIT_FAILURE);
67973+    }
67974+    out = new_matrix(b->width, a->height);
67975+    for (int j = 0; j < out->height; ++j)
67976+        for (int i = 0; i < out->width; ++i) {
67977+            float sum = 0;
67978+            for (int k = 0; k < a->width; ++k)
67979+                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
67980+            out->d[j * out->width + i] = sum;
67981+        }
67982+    return out;
67983+}
67984+
67985+static void normalise(matrix *a)
67986+{
67987+    for (int j = 0; j < a->height; ++j)
67988+        for (int i = 0; i < a->width; ++i) {
67989+            float *p = a->d + j * a->width + i;
67990+            *p *= 64;
67991+            if (a->height == 4)
67992+                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
67993+            else
67994+                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
67995+            if (a->width == 4)
67996+                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
67997+            else
67998+                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
67999+        }
68000+}
68001+
68002+static void divide_and_round_nearest(matrix *a, float by)
68003+{
68004+    for (int j = 0; j < a->height; ++j)
68005+        for (int i = 0; i < a->width; ++i) {
68006+            float *p = a->d + j * a->width + i;
68007+            *p = rintf(*p / by);
68008+        }
68009+}
68010+
68011+static void tweak(matrix *a)
68012+{
68013+    for (int j = 4; j < a->height; ++j)
68014+        for (int i = 0; i < a->width; ++i) {
68015+            float *p = a->d + j * a->width + i;
68016+            *p += 1;
68017+        }
68018+}
68019+
68020+/* The VC-1 spec places restrictions on the values permitted at three
68021+ * different stages:
68022+ * - D: the input coefficients in frequency domain
68023+ * - E: the intermediate coefficients, inverse-transformed only horizontally
68024+ * - R: the fully inverse-transformed coefficients
68025+ *
68026+ * To fully cater for the ranges specified requires various intermediate
68027+ * values to be held to 17-bit precision; yet these conditions do not appear
68028+ * to be utilised in real-world streams. At least some assembly
68029+ * implementations have chosen to restrict these values to 16-bit precision,
68030+ * to accelerate the decoding of real-world streams at the cost of strict
68031+ * adherence to the spec. To avoid our test marking these as failures,
68032+ * reduce our random inputs.
68033+ */
68034+#define ATTENUATION 4
68035+
68036+static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
68037+{
68038+    matrix *raw, *tmp, *D, *E, *R;
68039+    raw = new_matrix(width, height);
68040+    for (int i = 0; i < width * height; ++i)
68041+        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
68042+    tmp = multiply(height == 8 ? &T8 : &T4, raw);
68043+    D = multiply(tmp, width == 8 ? &T8t : &T4t);
68044+    normalise(D);
68045+    divide_and_round_nearest(D, 1);
68046+    for (int i = 0; i < width * height; ++i) {
68047+        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
68048+            /* Rare, so simply try again */
68049+            av_free(raw);
68050+            av_free(tmp);
68051+            av_free(D);
68052+            return generate_inverse_quantized_transform_coefficients(width, height);
68053+        }
68054+    }
68055+    E = multiply(D, width == 8 ? &T8 : &T4);
68056+    divide_and_round_nearest(E, 8);
68057+    for (int i = 0; i < width * height; ++i)
68058+        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
68059+            /* Rare, so simply try again */
68060+            av_free(raw);
68061+            av_free(tmp);
68062+            av_free(D);
68063+            av_free(E);
68064+            return generate_inverse_quantized_transform_coefficients(width, height);
68065+        }
68066+    R = multiply(height == 8 ? &T8t : &T4t, E);
68067+    tweak(R);
68068+    divide_and_round_nearest(R, 128);
68069+    for (int i = 0; i < width * height; ++i)
68070+        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
68071+            /* Rare, so simply try again */
68072+            av_free(raw);
68073+            av_free(tmp);
68074+            av_free(D);
68075+            av_free(E);
68076+            av_free(R);
68077+            return generate_inverse_quantized_transform_coefficients(width, height);
68078+        }
68079+    av_free(raw);
68080+    av_free(tmp);
68081+    av_free(E);
68082+    av_free(R);
68083+    return D;
68084+}
68085+
68086+#define RANDOMIZE_BUFFER16(name, size)        \
68087+    do {                                      \
68088+        int i;                                \
68089+        for (i = 0; i < size; ++i) {          \
68090+            uint16_t r = rnd();               \
68091+            AV_WN16A(name##0 + i, r);         \
68092+            AV_WN16A(name##1 + i, r);         \
68093+        }                                     \
68094+    } while (0)
68095+
68096+#define RANDOMIZE_BUFFER8(name, size)         \
68097+    do {                                      \
68098+        int i;                                \
68099+        for (i = 0; i < size; ++i) {          \
68100+            uint8_t r = rnd();                \
68101+            name##0[i] = r;                   \
68102+            name##1[i] = r;                   \
68103+        }                                     \
68104+    } while (0)
68105+
68106+#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
68107+    do {                                            \
68108+        uint8_t *p##0 = name##0, *p##1 = name##1;   \
68109+        int i = (size);                             \
68110+        while (i-- > 0) {                           \
68111+            int x = 0x80 | (rnd() & 0x7F);          \
68112+            x >>= rnd() % 9;                        \
68113+            if (rnd() & 1)                          \
68114+                x = -x;                             \
68115+            *p##1++ = *p##0++ = 0x80 + x;           \
68116+        }                                           \
68117+    } while (0)
68118+
68119+static void check_inv_trans_inplace(void)
68120+{
68121+    /* Inverse transform input coefficients are stored in a 16-bit buffer
68122+     * with row stride of 8 coefficients irrespective of transform size.
68123+     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
68124+     * are stored in column-major order, and the outputs are written back
68125+     * to the input buffer, so we oversize it slightly to catch overruns. */
68126+    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
68127+    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
68128+
68129+    VC1DSPContext h;
68130+
68131+    ff_vc1dsp_init(&h);
68132+
68133+    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
68134+        matrix *coeffs;
68135+        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
68136+        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
68137+        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
68138+        for (int j = 0; j < 8; ++j)
68139+            for (int i = 0; i < 8; ++i) {
68140+                int idx = 8 + i * 8 + j;
68141+                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
68142+            }
68143+        call_ref(inv_trans_in0 + 8);
68144+        call_new(inv_trans_in1 + 8);
68145+        if (memcmp(inv_trans_in0,  inv_trans_in1,  10 * 8 * sizeof (int16_t)))
68146+            fail();
68147+        bench_new(inv_trans_in1 + 8);
68148+        av_free(coeffs);
68149+    }
68150+}
68151+
68152+static void check_inv_trans_adding(void)
68153+{
68154+    /* Inverse transform input coefficients are stored in a 16-bit buffer
68155+     * with row stride of 8 coefficients irrespective of transform size. */
68156+    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
68157+    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
68158+
68159+    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
68160+     * added with saturation to an array of unsigned 8-bit values. Oversize
68161+     * this by 8 samples left and right and one row above and below. */
68162+    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
68163+    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
68164+
68165+    VC1DSPContext h;
68166+
68167+    const test tests[] = {
68168+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
68169+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
68170+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
68171+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
68172+        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
68173+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
68174+        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
68175+    };
68176+
68177+    ff_vc1dsp_init(&h);
68178+
68179+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
68180+        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
68181+        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
68182+            matrix *coeffs;
68183+            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
68184+            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
68185+            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
68186+            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
68187+            for (int j = 0; j < tests[t].height; ++j)
68188+                for (int i = 0; i < tests[t].width; ++i) {
68189+                    int idx = j * 8 + i;
68190+                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
68191+                }
68192+            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
68193+            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
68194+            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
68195+                fail();
68196+            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
68197+            av_free(coeffs);
68198+        }
68199+    }
68200+}
68201+
68202+static void check_loop_filter(void)
68203+{
68204+    /* Deblocking filter buffers are big enough to hold a 16x16 block,
68205+     * plus 16 columns left and 4 rows above to hold filter inputs
68206+     * (depending on whether v or h neighbouring block edge, oversized
68207+     * horizontally to maintain 16-byte alignment) plus 16 columns and
68208+     * 4 rows below to catch write overflows */
68209+    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
68210+    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
68211+
68212+    VC1DSPContext h;
68213+
68214+    const test tests[] = {
68215+        VC1DSP_TEST(vc1_v_loop_filter4)
68216+        VC1DSP_TEST(vc1_h_loop_filter4)
68217+        VC1DSP_TEST(vc1_v_loop_filter8)
68218+        VC1DSP_TEST(vc1_h_loop_filter8)
68219+        VC1DSP_TEST(vc1_v_loop_filter16)
68220+        VC1DSP_TEST(vc1_h_loop_filter16)
68221+    };
68222+
68223+    ff_vc1dsp_init(&h);
68224+
68225+    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
68226+        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
68227+        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
68228+        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
68229+            for (int count = 1000; count > 0; --count) {
68230+                int pq = rnd() % 31 + 1;
68231+                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
68232+                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
68233+                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
68234+                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
68235+                    fail();
68236+            }
68237+        }
68238+        for (int j = 0; j < 24; ++j)
68239+            for (int i = 0; i < 48; ++i)
68240+                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
68241+        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
68242+            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
68243+        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
68244+            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
68245+    }
68246+}
68247+
68248+#define TEST_UNESCAPE                                                                               \
68249+    do {                                                                                            \
68250+        for (int count = 100; count > 0; --count) {                                                 \
68251+            escaped_offset = rnd() & 7;                                                             \
68252+            unescaped_offset = rnd() & 7;                                                           \
68253+            escaped_len = (1u << ((rnd() % 8) + 3)) - (rnd() & 7);                                  \
68254+            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
68255+            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
68256+            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
68257+            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
68258+                fail();                                                                             \
68259+        }                                                                                           \
68260+    } while (0)
68261+
68262+static void check_unescape(void)
68263+{
68264+    /* This appears to be a typical length of buffer in use */
68265+#define LOG2_UNESCAPE_BUF_SIZE 17
68266+#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
68267+    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
68268+    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
68269+    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
68270+    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
68271+
68272+    VC1DSPContext h;
68273+
68274+    ff_vc1dsp_init(&h);
68275+
68276+    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
68277+        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
68278+        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
68279+
68280+        /* Test data which consists of escape sequences packed as tightly as possible */
68281+        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
68282+            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
68283+        TEST_UNESCAPE;
68284+
68285+        /* Test random data */
68286+        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
68287+        TEST_UNESCAPE;
68288+
68289+        /* Test data with escape sequences at random intervals */
68290+        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
68291+            int gap, gap_msb;
68292+            escaped1[x+0] = escaped0[x+0] = 0;
68293+            escaped1[x+1] = escaped0[x+1] = 0;
68294+            escaped1[x+2] = escaped0[x+2] = 3;
68295+            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
68296+            gap_msb = 2u << (rnd() % 8);
68297+            gap = (rnd() &~ -gap_msb) | gap_msb;
68298+            x += gap;
68299+        }
68300+        TEST_UNESCAPE;
68301+
68302+        /* Test data which is known to contain no escape sequences */
68303+        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
68304+        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
68305+        TEST_UNESCAPE;
68306+
68307+        /* Benchmark the no-escape-sequences case */
68308+        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
68309+    }
68310+}
68311+
68312+void checkasm_check_vc1dsp(void)
68313+{
68314+    check_inv_trans_inplace();
68315+    check_inv_trans_adding();
68316+    report("inv_trans");
68317+
68318+    check_loop_filter();
68319+    report("loop_filter");
68320+
68321+    check_unescape();
68322+    report("unescape_buffer");
68323+}
68324--- a/tests/fate/checkasm.mak
68325+++ b/tests/fate/checkasm.mak
68326@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
68327                 fate-checkasm-hevc_add_res                              \
68328                 fate-checkasm-hevc_idct                                 \
68329                 fate-checkasm-hevc_sao                                  \
68330+                fate-checkasm-idctdsp                                   \
68331                 fate-checkasm-jpeg2000dsp                               \
68332                 fate-checkasm-llviddsp                                  \
68333                 fate-checkasm-llviddspenc                               \
68334@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
68335                 fate-checkasm-sw_scale                                  \
68336                 fate-checkasm-v210dec                                   \
68337                 fate-checkasm-v210enc                                   \
68338+                fate-checkasm-vc1dsp                                    \
68339                 fate-checkasm-vf_blend                                  \
68340                 fate-checkasm-vf_colorspace                             \
68341                 fate-checkasm-vf_eq                                     \
68342