blob: 5579bd9bedca56727b97b80cb33cfc650b0235e9
1 | /* |
2 | * RV40 decoder motion compensation functions |
3 | * Copyright (c) 2008 Konstantin Shishkov |
4 | * |
5 | * This file is part of FFmpeg. |
6 | * |
7 | * FFmpeg is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU Lesser General Public |
9 | * License as published by the Free Software Foundation; either |
10 | * version 2.1 of the License, or (at your option) any later version. |
11 | * |
12 | * FFmpeg is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | * Lesser General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU Lesser General Public |
18 | * License along with FFmpeg; if not, write to the Free Software |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | */ |
21 | |
22 | /** |
23 | * @file |
24 | * RV40 decoder motion compensation functions |
25 | */ |
26 | |
27 | #include "libavutil/common.h" |
28 | #include "libavutil/intreadwrite.h" |
29 | #include "avcodec.h" |
30 | #include "h264qpel.h" |
31 | #include "mathops.h" |
32 | #include "pixels.h" |
33 | #include "rnd_avg.h" |
34 | #include "rv34dsp.h" |
35 | #include "libavutil/avassert.h" |
36 | |
/**
 * Generate the one-dimensional 6-tap RV40 luma filters for 8- and 16-pixel
 * wide blocks.
 *
 * With s[] the samples along the filter direction, every output is
 *   (s[-2] + s[3] - 5 * (s[-1] + s[2]) + C1 * s[0] + C2 * s[1]
 *    + (1 << (SHIFT - 1))) >> SHIFT
 * and is handed to OP together with the destination sample.  A crop table
 * pointer named "cm" is declared in each 8-pixel function for OP to use.
 *
 * @param OPNAME prefix of the generated function names
 * @param OP     two-argument store macro, invoked as OP(destination, value)
 */
#define RV40_LOWPASS(OPNAME, OP) \
/* Horizontal filter: h rows of 8 outputs, reading src[-2] .. src[10] per row. */\
static void OPNAME ## rv40_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                           const int h, const int C1, const int C2, const int SHIFT){\
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;\
    const int rnd = 1 << (SHIFT - 1);\
    int row, x;\
    for (row = 0; row < h; row++) {\
        for (x = 0; x < 8; x++) {\
            OP(dst[x], (src[x - 2] + src[x + 3] - 5 * (src[x - 1] + src[x + 2]) +\
                        src[x] * C1 + src[x + 1] * C2 + rnd) >> SHIFT);\
        }\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
/* Vertical filter: w columns of 8 outputs, reading rows -2 .. 10 per column. */\
static void OPNAME ## rv40_qpel8_v_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                           const int w, const int C1, const int C2, const int SHIFT){\
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;\
    const int rnd = 1 << (SHIFT - 1);\
    int col, k;\
    for (col = 0; col < w; col++) {\
        /* tap[k] holds source row k - 2; all rows are loaded before any store */\
        int tap[13];\
        for (k = 0; k < 13; k++)\
            tap[k] = src[(k - 2) * srcStride];\
        for (k = 0; k < 8; k++) {\
            OP(dst[k * dstStride], (tap[k] + tap[k + 5] - 5 * (tap[k + 1] + tap[k + 4]) +\
                                    tap[k + 2] * C1 + tap[k + 3] * C2 + rnd) >> SHIFT);\
        }\
        dst++;\
        src++;\
    }\
}\
\
/* 16-wide vertical filter built from four calls of the 8-wide one:        */\
/* two for the first 8 rows, two (with w - 8 columns each) 8 rows further. */\
static void OPNAME ## rv40_qpel16_v_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                            const int w, const int C1, const int C2, const int SHIFT){\
    uint8_t       *dst_low = dst + 8 * dstStride;\
    const uint8_t *src_low = src + 8 * srcStride;\
    OPNAME ## rv40_qpel8_v_lowpass(dst,         src,         dstStride, srcStride, 8,     C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_v_lowpass(dst + 8,     src + 8,     dstStride, srcStride, 8,     C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_v_lowpass(dst_low,     src_low,     dstStride, srcStride, w - 8, C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_v_lowpass(dst_low + 8, src_low + 8, dstStride, srcStride, w - 8, C1, C2, SHIFT);\
}\
\
/* 16-wide horizontal filter: 8 rows for both halves, then the remaining h - 8 rows. */\
static void OPNAME ## rv40_qpel16_h_lowpass(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride,\
                                            const int h, const int C1, const int C2, const int SHIFT){\
    uint8_t       *dst_low = dst + 8 * dstStride;\
    const uint8_t *src_low = src + 8 * srcStride;\
    OPNAME ## rv40_qpel8_h_lowpass(dst,         src,         dstStride, srcStride, 8,     C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_h_lowpass(dst + 8,     src + 8,     dstStride, srcStride, 8,     C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_h_lowpass(dst_low,     src_low,     dstStride, srcStride, h - 8, C1, C2, SHIFT);\
    OPNAME ## rv40_qpel8_h_lowpass(dst_low + 8, src_low + 8, dstStride, srcStride, h - 8, C1, C2, SHIFT);\
}
109 | |
/*
 * Helper for positions that need a single filter pass (DIR is h or v):
 * the low-pass filter works directly from src to dst.
 */
#define RV40_MC_1D(OPNAME, SIZE, POS, DIR, C1, C2, SHIFT) \
static void OPNAME ## rv40_qpel ## SIZE ## _ ## POS ## _c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## rv40_qpel ## SIZE ## _ ## DIR ## _lowpass(dst, src, stride, stride, SIZE, C1, C2, SHIFT);\
}

/*
 * Helper for positions that need both passes.  The horizontal pass always
 * uses the put_ variant and writes SIZE + 5 rows (starting two rows above
 * src) into a temporary buffer with a row pitch of SIZE; the vertical pass
 * then starts at the third buffer row and applies OPNAME to dst.
 */
#define RV40_MC_2D(OPNAME, SIZE, POS, HC1, HC2, HSHIFT, VC1, VC2, VSHIFT) \
static void OPNAME ## rv40_qpel ## SIZE ## _ ## POS ## _c(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t tmp[SIZE * (SIZE + 5)];\
    uint8_t * const tmp_center = tmp + SIZE * 2;\
    put_rv40_qpel ## SIZE ## _h_lowpass(tmp, src - 2 * stride, SIZE, stride, SIZE + 5, HC1, HC2, HSHIFT);\
    OPNAME ## rv40_qpel ## SIZE ## _v_lowpass(dst, tmp_center, stride, SIZE, SIZE, VC1, VC2, VSHIFT);\
}

/**
 * Generate the OPNAME rv40_qpel<SIZE>_mcXY_c() functions.
 *
 * X selects the horizontal and Y the vertical filter parameters
 * (C1, C2, SHIFT): 1 -> (52, 20, 6), 2 -> (20, 20, 5), 3 -> (20, 52, 6);
 * 0 means that no pass is run in that direction.  Positions 00, 20, 02 and
 * 33 are not generated here.
 */
#define RV40_MC(OPNAME, SIZE) \
RV40_MC_1D(OPNAME, SIZE, mc10, h, 52, 20, 6) \
RV40_MC_1D(OPNAME, SIZE, mc30, h, 20, 52, 6) \
RV40_MC_1D(OPNAME, SIZE, mc01, v, 52, 20, 6) \
RV40_MC_2D(OPNAME, SIZE, mc11, 52, 20, 6, 52, 20, 6) \
RV40_MC_2D(OPNAME, SIZE, mc21, 20, 20, 5, 52, 20, 6) \
RV40_MC_2D(OPNAME, SIZE, mc31, 20, 52, 6, 52, 20, 6) \
RV40_MC_2D(OPNAME, SIZE, mc12, 52, 20, 6, 20, 20, 5) \
RV40_MC_2D(OPNAME, SIZE, mc22, 20, 20, 5, 20, 20, 5) \
RV40_MC_2D(OPNAME, SIZE, mc32, 20, 52, 6, 20, 20, 5) \
RV40_MC_1D(OPNAME, SIZE, mc03, v, 20, 52, 6) \
RV40_MC_2D(OPNAME, SIZE, mc13, 52, 20, 6, 20, 52, 6) \
RV40_MC_2D(OPNAME, SIZE, mc23, 20, 20, 5, 20, 52, 6)
195 | |
196 | #define op_avg(a, b) a = (((a)+cm[b]+1)>>1) |
197 | #define op_put(a, b) a = cm[b] |
198 | |
199 | RV40_LOWPASS(put_ , op_put) |
200 | RV40_LOWPASS(avg_ , op_avg) |
201 | |
202 | #undef op_avg |
203 | #undef op_put |
204 | |
205 | RV40_MC(put_, 8) |
206 | RV40_MC(put_, 16) |
207 | RV40_MC(avg_, 8) |
208 | RV40_MC(avg_, 16) |
209 | |
210 | #define PIXOP2(OPNAME, OP) \ |
211 | static inline void OPNAME ## _pixels8_xy2_8_c(uint8_t *block, \ |
212 | const uint8_t *pixels, \ |
213 | ptrdiff_t line_size, \ |
214 | int h) \ |
215 | { \ |
216 | /* FIXME HIGH BIT DEPTH */ \ |
217 | int j; \ |
218 | \ |
219 | for (j = 0; j < 2; j++) { \ |
220 | int i; \ |
221 | const uint32_t a = AV_RN32(pixels); \ |
222 | const uint32_t b = AV_RN32(pixels + 1); \ |
223 | uint32_t l0 = (a & 0x03030303UL) + \ |
224 | (b & 0x03030303UL) + \ |
225 | 0x02020202UL; \ |
226 | uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + \ |
227 | ((b & 0xFCFCFCFCUL) >> 2); \ |
228 | uint32_t l1, h1; \ |
229 | \ |
230 | pixels += line_size; \ |
231 | for (i = 0; i < h; i += 2) { \ |
232 | uint32_t a = AV_RN32(pixels); \ |
233 | uint32_t b = AV_RN32(pixels + 1); \ |
234 | l1 = (a & 0x03030303UL) + \ |
235 | (b & 0x03030303UL); \ |
236 | h1 = ((a & 0xFCFCFCFCUL) >> 2) + \ |
237 | ((b & 0xFCFCFCFCUL) >> 2); \ |
238 | OP(*((uint32_t *) block), \ |
239 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); \ |
240 | pixels += line_size; \ |
241 | block += line_size; \ |
242 | a = AV_RN32(pixels); \ |
243 | b = AV_RN32(pixels + 1); \ |
244 | l0 = (a & 0x03030303UL) + \ |
245 | (b & 0x03030303UL) + \ |
246 | 0x02020202UL; \ |
247 | h0 = ((a & 0xFCFCFCFCUL) >> 2) + \ |
248 | ((b & 0xFCFCFCFCUL) >> 2); \ |
249 | OP(*((uint32_t *) block), \ |
250 | h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL)); \ |
251 | pixels += line_size; \ |
252 | block += line_size; \ |
253 | } \ |
254 | pixels += 4 - line_size * (h + 1); \ |
255 | block += 4 - line_size * h; \ |
256 | } \ |
257 | } \ |
258 | \ |
259 | CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_8_c, \ |
260 | OPNAME ## _pixels8_xy2_8_c, \ |
261 | 8) \ |
262 | |
263 | #define op_avg(a, b) a = rnd_avg32(a, b) |
264 | #define op_put(a, b) a = b |
265 | PIXOP2(avg, op_avg) |
266 | PIXOP2(put, op_put) |
267 | #undef op_avg |
268 | #undef op_put |
269 | |
/*
 * The mc33 position does not use the 6-tap filters: each function simply
 * forwards to the <OPNAME>_pixels<SIZE>_xy2_8_c() helper with SIZE rows.
 */
#define RV40_MC33(OPNAME, SIZE)                                               \
static void OPNAME ## _rv40_qpel ## SIZE ## _mc33_c(uint8_t *dst,             \
                                                    const uint8_t *src,       \
                                                    ptrdiff_t stride)         \
{                                                                             \
    OPNAME ## _pixels ## SIZE ## _xy2_8_c(dst, src, stride, SIZE);            \
}

RV40_MC33(put, 16)
RV40_MC33(avg, 16)
RV40_MC33(put, 8)
RV40_MC33(avg, 8)

#undef RV40_MC33
286 | |
/**
 * Rounding term added before the final shift in the chroma motion
 * compensation functions, looked up as rv40_bias[y >> 1][x >> 1].
 */
static const int rv40_bias[4][4] = {
    [0] = {  0, 16, 32, 16 },
    [1] = { 32, 28, 32, 28 },
    [2] = {  0, 32, 16, 32 },
    [3] = { 32, 28, 32, 28 },
};
293 | |
294 | #define RV40_CHROMA_MC(OPNAME, OP)\ |
295 | static void OPNAME ## rv40_chroma_mc4_c(uint8_t *dst /*align 8*/,\ |
296 | uint8_t *src /*align 1*/,\ |
297 | ptrdiff_t stride, int h, int x, int y)\ |
298 | {\ |
299 | const int A = (8-x) * (8-y);\ |
300 | const int B = ( x) * (8-y);\ |
301 | const int C = (8-x) * ( y);\ |
302 | const int D = ( x) * ( y);\ |
303 | int i;\ |
304 | int bias = rv40_bias[y>>1][x>>1];\ |
305 | \ |
306 | av_assert2(x<8 && y<8 && x>=0 && y>=0);\ |
307 | \ |
308 | if(D){\ |
309 | for(i = 0; i < h; i++){\ |
310 | OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + bias));\ |
311 | OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + bias));\ |
312 | OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + bias));\ |
313 | OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + bias));\ |
314 | dst += stride;\ |
315 | src += stride;\ |
316 | }\ |
317 | }else{\ |
318 | const int E = B + C;\ |
319 | const ptrdiff_t step = C ? stride : 1;\ |
320 | for(i = 0; i < h; i++){\ |
321 | OP(dst[0], (A*src[0] + E*src[step+0] + bias));\ |
322 | OP(dst[1], (A*src[1] + E*src[step+1] + bias));\ |
323 | OP(dst[2], (A*src[2] + E*src[step+2] + bias));\ |
324 | OP(dst[3], (A*src[3] + E*src[step+3] + bias));\ |
325 | dst += stride;\ |
326 | src += stride;\ |
327 | }\ |
328 | }\ |
329 | }\ |
330 | \ |
331 | static void OPNAME ## rv40_chroma_mc8_c(uint8_t *dst/*align 8*/,\ |
332 | uint8_t *src/*align 1*/,\ |
333 | ptrdiff_t stride, int h, int x, int y)\ |
334 | {\ |
335 | const int A = (8-x) * (8-y);\ |
336 | const int B = ( x) * (8-y);\ |
337 | const int C = (8-x) * ( y);\ |
338 | const int D = ( x) * ( y);\ |
339 | int i;\ |
340 | int bias = rv40_bias[y>>1][x>>1];\ |
341 | \ |
342 | av_assert2(x<8 && y<8 && x>=0 && y>=0);\ |
343 | \ |
344 | if(D){\ |
345 | for(i = 0; i < h; i++){\ |
346 | OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + bias));\ |
347 | OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + bias));\ |
348 | OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + bias));\ |
349 | OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + bias));\ |
350 | OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + bias));\ |
351 | OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + bias));\ |
352 | OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + bias));\ |
353 | OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + bias));\ |
354 | dst += stride;\ |
355 | src += stride;\ |
356 | }\ |
357 | }else{\ |
358 | const int E = B + C;\ |
359 | const ptrdiff_t step = C ? stride : 1;\ |
360 | for(i = 0; i < h; i++){\ |
361 | OP(dst[0], (A*src[0] + E*src[step+0] + bias));\ |
362 | OP(dst[1], (A*src[1] + E*src[step+1] + bias));\ |
363 | OP(dst[2], (A*src[2] + E*src[step+2] + bias));\ |
364 | OP(dst[3], (A*src[3] + E*src[step+3] + bias));\ |
365 | OP(dst[4], (A*src[4] + E*src[step+4] + bias));\ |
366 | OP(dst[5], (A*src[5] + E*src[step+5] + bias));\ |
367 | OP(dst[6], (A*src[6] + E*src[step+6] + bias));\ |
368 | OP(dst[7], (A*src[7] + E*src[step+7] + bias));\ |
369 | dst += stride;\ |
370 | src += stride;\ |
371 | }\ |
372 | }\ |
373 | } |
374 | |
375 | #define op_avg(a, b) a = (((a)+((b)>>6)+1)>>1) |
376 | #define op_put(a, b) a = ((b)>>6) |
377 | |
378 | RV40_CHROMA_MC(put_, op_put) |
379 | RV40_CHROMA_MC(avg_, op_avg) |
380 | |
/**
 * Generate the two weighted-prediction functions for size x size blocks.
 *
 * Both combine src1 and src2 sample by sample into dst; note that src1 is
 * scaled by w2 and src2 by w1.  All three buffers are walked with the same
 * stride.
 *
 *  - rv40_weight_func_rnd_<size>:   each product is reduced by 9 bits before
 *    the two are added, then 0x10 is added and the sum is shifted by 5.
 *  - rv40_weight_func_nornd_<size>: the products are added at full
 *    precision, then 0x10 is added and the sum is shifted by 5.
 *
 * The products are formed in unsigned arithmetic: a weight times a sample
 * can exceed INT_MAX, which would be undefined behaviour for signed int,
 * whereas unsigned arithmetic wraps.  For every input on which the signed
 * computation was well defined the stored byte is unchanged.
 */
#define RV40_WEIGHT_FUNC(size) \
static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\
    int i, j;\
\
    for (j = 0; j < size; j++) {\
        for (i = 0; i < size; i++)\
            dst[i] = ((((unsigned)w2 * src1[i]) >> 9) + (((unsigned)w1 * src2[i]) >> 9) + 0x10) >> 5;\
        src1 += stride;\
        src2 += stride;\
        dst  += stride;\
    }\
}\
static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\
    int i, j;\
\
    for (j = 0; j < size; j++) {\
        for (i = 0; i < size; i++)\
            dst[i] = ((unsigned)w2 * src1[i] + (unsigned)w1 * src2[i] + 0x10) >> 5;\
        src1 += stride;\
        src2 += stride;\
        dst  += stride;\
    }\
}

RV40_WEIGHT_FUNC(16)
RV40_WEIGHT_FUNC(8)
409 | |
/**
 * Dither values for the deblocking filter - left/top side.
 *
 * The strong loop filter adds entry [dmode + i], with i the line number
 * 0..3, to the weighted sums for the samples before the edge.
 */
static const uint8_t rv40_dither_l[16] = {
    0x40, 0x50, 0x20, 0x60,
    0x30, 0x50, 0x40, 0x30,
    0x50, 0x40, 0x50, 0x30,
    0x60, 0x20, 0x50, 0x40,
};
417 | |
/**
 * Dither values for the deblocking filter - right/bottom side.
 *
 * The strong loop filter adds entry [dmode + i], with i the line number
 * 0..3, to the weighted sums for the samples from the edge onwards.
 */
static const uint8_t rv40_dither_r[16] = {
    0x40, 0x30, 0x60, 0x20,
    0x50, 0x30, 0x30, 0x40,
    0x40, 0x40, 0x50, 0x30,
    0x20, 0x60, 0x30, 0x40,
};
425 | |
426 | #define CLIP_SYMM(a, b) av_clip(a, -(b), b) |
427 | /** |
428 | * weaker deblocking very similar to the one described in 4.4.2 of JVT-A003r1 |
429 | */ |
430 | static av_always_inline void rv40_weak_loop_filter(uint8_t *src, |
431 | const int step, |
432 | const ptrdiff_t stride, |
433 | const int filter_p1, |
434 | const int filter_q1, |
435 | const int alpha, |
436 | const int beta, |
437 | const int lim_p0q0, |
438 | const int lim_q1, |
439 | const int lim_p1) |
440 | { |
441 | const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; |
442 | int i, t, u, diff; |
443 | |
444 | for (i = 0; i < 4; i++, src += stride) { |
445 | int diff_p1p0 = src[-2*step] - src[-1*step]; |
446 | int diff_q1q0 = src[ 1*step] - src[ 0*step]; |
447 | int diff_p1p2 = src[-2*step] - src[-3*step]; |
448 | int diff_q1q2 = src[ 1*step] - src[ 2*step]; |
449 | |
450 | t = src[0*step] - src[-1*step]; |
451 | if (!t) |
452 | continue; |
453 | |
454 | u = (alpha * FFABS(t)) >> 7; |
455 | if (u > 3 - (filter_p1 && filter_q1)) |
456 | continue; |
457 | |
458 | t *= 1 << 2; |
459 | if (filter_p1 && filter_q1) |
460 | t += src[-2*step] - src[1*step]; |
461 | |
462 | diff = CLIP_SYMM((t + 4) >> 3, lim_p0q0); |
463 | src[-1*step] = cm[src[-1*step] + diff]; |
464 | src[ 0*step] = cm[src[ 0*step] - diff]; |
465 | |
466 | if (filter_p1 && FFABS(diff_p1p2) <= beta) { |
467 | t = (diff_p1p0 + diff_p1p2 - diff) >> 1; |
468 | src[-2*step] = cm[src[-2*step] - CLIP_SYMM(t, lim_p1)]; |
469 | } |
470 | |
471 | if (filter_q1 && FFABS(diff_q1q2) <= beta) { |
472 | t = (diff_q1q0 + diff_q1q2 + diff) >> 1; |
473 | src[ 1*step] = cm[src[ 1*step] - CLIP_SYMM(t, lim_q1)]; |
474 | } |
475 | } |
476 | } |
477 | |
/**
 * Weak loop filter with the samples of one line taken stride bytes apart
 * and the four filtered lines lying one byte apart.
 */
static void rv40_h_weak_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                    const int filter_p1, const int filter_q1,
                                    const int alpha, const int beta,
                                    const int lim_p0q0, const int lim_q1,
                                    const int lim_p1)
{
    const int       sample_step = stride;
    const ptrdiff_t line_step   = 1;

    rv40_weak_loop_filter(src, sample_step, line_step,
                          filter_p1, filter_q1,
                          alpha, beta,
                          lim_p0q0, lim_q1, lim_p1);
}
487 | |
/**
 * Weak loop filter with the samples of one line taken one byte apart and
 * the four filtered lines lying stride bytes apart.
 */
static void rv40_v_weak_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                    const int filter_p1, const int filter_q1,
                                    const int alpha, const int beta,
                                    const int lim_p0q0, const int lim_q1,
                                    const int lim_p1)
{
    const int       sample_step = 1;
    const ptrdiff_t line_step   = stride;

    rv40_weak_loop_filter(src, sample_step, line_step,
                          filter_p1, filter_q1,
                          alpha, beta,
                          lim_p0q0, lim_q1, lim_p1);
}
497 | |
498 | static av_always_inline void rv40_strong_loop_filter(uint8_t *src, |
499 | const int step, |
500 | const ptrdiff_t stride, |
501 | const int alpha, |
502 | const int lims, |
503 | const int dmode, |
504 | const int chroma) |
505 | { |
506 | int i; |
507 | |
508 | for(i = 0; i < 4; i++, src += stride){ |
509 | int sflag, p0, q0, p1, q1; |
510 | int t = src[0*step] - src[-1*step]; |
511 | |
512 | if (!t) |
513 | continue; |
514 | |
515 | sflag = (alpha * FFABS(t)) >> 7; |
516 | if (sflag > 1) |
517 | continue; |
518 | |
519 | p0 = (25*src[-3*step] + 26*src[-2*step] + 26*src[-1*step] + |
520 | 26*src[ 0*step] + 25*src[ 1*step] + |
521 | rv40_dither_l[dmode + i]) >> 7; |
522 | |
523 | q0 = (25*src[-2*step] + 26*src[-1*step] + 26*src[ 0*step] + |
524 | 26*src[ 1*step] + 25*src[ 2*step] + |
525 | rv40_dither_r[dmode + i]) >> 7; |
526 | |
527 | if (sflag) { |
528 | p0 = av_clip(p0, src[-1*step] - lims, src[-1*step] + lims); |
529 | q0 = av_clip(q0, src[ 0*step] - lims, src[ 0*step] + lims); |
530 | } |
531 | |
532 | p1 = (25*src[-4*step] + 26*src[-3*step] + 26*src[-2*step] + 26*p0 + |
533 | 25*src[ 0*step] + rv40_dither_l[dmode + i]) >> 7; |
534 | q1 = (25*src[-1*step] + 26*q0 + 26*src[ 1*step] + 26*src[ 2*step] + |
535 | 25*src[ 3*step] + rv40_dither_r[dmode + i]) >> 7; |
536 | |
537 | if (sflag) { |
538 | p1 = av_clip(p1, src[-2*step] - lims, src[-2*step] + lims); |
539 | q1 = av_clip(q1, src[ 1*step] - lims, src[ 1*step] + lims); |
540 | } |
541 | |
542 | src[-2*step] = p1; |
543 | src[-1*step] = p0; |
544 | src[ 0*step] = q0; |
545 | src[ 1*step] = q1; |
546 | |
547 | if(!chroma){ |
548 | src[-3*step] = (25*src[-1*step] + 26*src[-2*step] + |
549 | 51*src[-3*step] + 26*src[-4*step] + 64) >> 7; |
550 | src[ 2*step] = (25*src[ 0*step] + 26*src[ 1*step] + |
551 | 51*src[ 2*step] + 26*src[ 3*step] + 64) >> 7; |
552 | } |
553 | } |
554 | } |
555 | |
/**
 * Strong loop filter with the samples of one line taken stride bytes apart
 * and the four filtered lines lying one byte apart.
 */
static void rv40_h_strong_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                      const int alpha, const int lims,
                                      const int dmode, const int chroma)
{
    const int       sample_step = stride;
    const ptrdiff_t line_step   = 1;

    rv40_strong_loop_filter(src, sample_step, line_step,
                            alpha, lims, dmode, chroma);
}
562 | |
/**
 * Strong loop filter with the samples of one line taken one byte apart and
 * the four filtered lines lying stride bytes apart.
 */
static void rv40_v_strong_loop_filter(uint8_t *src, const ptrdiff_t stride,
                                      const int alpha, const int lims,
                                      const int dmode, const int chroma)
{
    const int       sample_step = 1;
    const ptrdiff_t line_step   = stride;

    rv40_strong_loop_filter(src, sample_step, line_step,
                            alpha, lims, dmode, chroma);
}
569 | |
570 | static av_always_inline int rv40_loop_filter_strength(uint8_t *src, |
571 | int step, ptrdiff_t stride, |
572 | int beta, int beta2, |
573 | int edge, |
574 | int *p1, int *q1) |
575 | { |
576 | int sum_p1p0 = 0, sum_q1q0 = 0, sum_p1p2 = 0, sum_q1q2 = 0; |
577 | int strong0 = 0, strong1 = 0; |
578 | uint8_t *ptr; |
579 | int i; |
580 | |
581 | for (i = 0, ptr = src; i < 4; i++, ptr += stride) { |
582 | sum_p1p0 += ptr[-2*step] - ptr[-1*step]; |
583 | sum_q1q0 += ptr[ 1*step] - ptr[ 0*step]; |
584 | } |
585 | |
586 | *p1 = FFABS(sum_p1p0) < (beta << 2); |
587 | *q1 = FFABS(sum_q1q0) < (beta << 2); |
588 | |
589 | if(!*p1 && !*q1) |
590 | return 0; |
591 | |
592 | if (!edge) |
593 | return 0; |
594 | |
595 | for (i = 0, ptr = src; i < 4; i++, ptr += stride) { |
596 | sum_p1p2 += ptr[-2*step] - ptr[-3*step]; |
597 | sum_q1q2 += ptr[ 1*step] - ptr[ 2*step]; |
598 | } |
599 | |
600 | strong0 = *p1 && (FFABS(sum_p1p2) < beta2); |
601 | strong1 = *q1 && (FFABS(sum_q1q2) < beta2); |
602 | |
603 | return strong0 && strong1; |
604 | } |
605 | |
/**
 * Edge strength measurement with the samples of one line taken stride
 * bytes apart and the four measured lines lying one byte apart.
 */
static int rv40_h_loop_filter_strength(uint8_t *src, ptrdiff_t stride,
                                       int beta, int beta2, int edge,
                                       int *p1, int *q1)
{
    const int       sample_step = stride;
    const ptrdiff_t line_step   = 1;

    return rv40_loop_filter_strength(src, sample_step, line_step,
                                     beta, beta2, edge, p1, q1);
}
612 | |
/**
 * Edge strength measurement with the samples of one line taken one byte
 * apart and the four measured lines lying stride bytes apart.
 */
static int rv40_v_loop_filter_strength(uint8_t *src, ptrdiff_t stride,
                                       int beta, int beta2, int edge,
                                       int *p1, int *q1)
{
    const int       sample_step = 1;
    const ptrdiff_t line_step   = stride;

    return rv40_loop_filter_strength(src, sample_step, line_step,
                                     beta, beta2, edge, p1, q1);
}
619 | |
620 | av_cold void ff_rv40dsp_init(RV34DSPContext *c) |
621 | { |
622 | H264QpelContext qpel; |
623 | |
624 | ff_rv34dsp_init(c); |
625 | ff_h264qpel_init(&qpel, 8); |
626 | |
627 | c->put_pixels_tab[0][ 0] = qpel.put_h264_qpel_pixels_tab[0][0]; |
628 | c->put_pixels_tab[0][ 1] = put_rv40_qpel16_mc10_c; |
629 | c->put_pixels_tab[0][ 2] = qpel.put_h264_qpel_pixels_tab[0][2]; |
630 | c->put_pixels_tab[0][ 3] = put_rv40_qpel16_mc30_c; |
631 | c->put_pixels_tab[0][ 4] = put_rv40_qpel16_mc01_c; |
632 | c->put_pixels_tab[0][ 5] = put_rv40_qpel16_mc11_c; |
633 | c->put_pixels_tab[0][ 6] = put_rv40_qpel16_mc21_c; |
634 | c->put_pixels_tab[0][ 7] = put_rv40_qpel16_mc31_c; |
635 | c->put_pixels_tab[0][ 8] = qpel.put_h264_qpel_pixels_tab[0][8]; |
636 | c->put_pixels_tab[0][ 9] = put_rv40_qpel16_mc12_c; |
637 | c->put_pixels_tab[0][10] = put_rv40_qpel16_mc22_c; |
638 | c->put_pixels_tab[0][11] = put_rv40_qpel16_mc32_c; |
639 | c->put_pixels_tab[0][12] = put_rv40_qpel16_mc03_c; |
640 | c->put_pixels_tab[0][13] = put_rv40_qpel16_mc13_c; |
641 | c->put_pixels_tab[0][14] = put_rv40_qpel16_mc23_c; |
642 | c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_c; |
643 | c->avg_pixels_tab[0][ 0] = qpel.avg_h264_qpel_pixels_tab[0][0]; |
644 | c->avg_pixels_tab[0][ 1] = avg_rv40_qpel16_mc10_c; |
645 | c->avg_pixels_tab[0][ 2] = qpel.avg_h264_qpel_pixels_tab[0][2]; |
646 | c->avg_pixels_tab[0][ 3] = avg_rv40_qpel16_mc30_c; |
647 | c->avg_pixels_tab[0][ 4] = avg_rv40_qpel16_mc01_c; |
648 | c->avg_pixels_tab[0][ 5] = avg_rv40_qpel16_mc11_c; |
649 | c->avg_pixels_tab[0][ 6] = avg_rv40_qpel16_mc21_c; |
650 | c->avg_pixels_tab[0][ 7] = avg_rv40_qpel16_mc31_c; |
651 | c->avg_pixels_tab[0][ 8] = qpel.avg_h264_qpel_pixels_tab[0][8]; |
652 | c->avg_pixels_tab[0][ 9] = avg_rv40_qpel16_mc12_c; |
653 | c->avg_pixels_tab[0][10] = avg_rv40_qpel16_mc22_c; |
654 | c->avg_pixels_tab[0][11] = avg_rv40_qpel16_mc32_c; |
655 | c->avg_pixels_tab[0][12] = avg_rv40_qpel16_mc03_c; |
656 | c->avg_pixels_tab[0][13] = avg_rv40_qpel16_mc13_c; |
657 | c->avg_pixels_tab[0][14] = avg_rv40_qpel16_mc23_c; |
658 | c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c; |
659 | c->put_pixels_tab[1][ 0] = qpel.put_h264_qpel_pixels_tab[1][0]; |
660 | c->put_pixels_tab[1][ 1] = put_rv40_qpel8_mc10_c; |
661 | c->put_pixels_tab[1][ 2] = qpel.put_h264_qpel_pixels_tab[1][2]; |
662 | c->put_pixels_tab[1][ 3] = put_rv40_qpel8_mc30_c; |
663 | c->put_pixels_tab[1][ 4] = put_rv40_qpel8_mc01_c; |
664 | c->put_pixels_tab[1][ 5] = put_rv40_qpel8_mc11_c; |
665 | c->put_pixels_tab[1][ 6] = put_rv40_qpel8_mc21_c; |
666 | c->put_pixels_tab[1][ 7] = put_rv40_qpel8_mc31_c; |
667 | c->put_pixels_tab[1][ 8] = qpel.put_h264_qpel_pixels_tab[1][8]; |
668 | c->put_pixels_tab[1][ 9] = put_rv40_qpel8_mc12_c; |
669 | c->put_pixels_tab[1][10] = put_rv40_qpel8_mc22_c; |
670 | c->put_pixels_tab[1][11] = put_rv40_qpel8_mc32_c; |
671 | c->put_pixels_tab[1][12] = put_rv40_qpel8_mc03_c; |
672 | c->put_pixels_tab[1][13] = put_rv40_qpel8_mc13_c; |
673 | c->put_pixels_tab[1][14] = put_rv40_qpel8_mc23_c; |
674 | c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_c; |
675 | c->avg_pixels_tab[1][ 0] = qpel.avg_h264_qpel_pixels_tab[1][0]; |
676 | c->avg_pixels_tab[1][ 1] = avg_rv40_qpel8_mc10_c; |
677 | c->avg_pixels_tab[1][ 2] = qpel.avg_h264_qpel_pixels_tab[1][2]; |
678 | c->avg_pixels_tab[1][ 3] = avg_rv40_qpel8_mc30_c; |
679 | c->avg_pixels_tab[1][ 4] = avg_rv40_qpel8_mc01_c; |
680 | c->avg_pixels_tab[1][ 5] = avg_rv40_qpel8_mc11_c; |
681 | c->avg_pixels_tab[1][ 6] = avg_rv40_qpel8_mc21_c; |
682 | c->avg_pixels_tab[1][ 7] = avg_rv40_qpel8_mc31_c; |
683 | c->avg_pixels_tab[1][ 8] = qpel.avg_h264_qpel_pixels_tab[1][8]; |
684 | c->avg_pixels_tab[1][ 9] = avg_rv40_qpel8_mc12_c; |
685 | c->avg_pixels_tab[1][10] = avg_rv40_qpel8_mc22_c; |
686 | c->avg_pixels_tab[1][11] = avg_rv40_qpel8_mc32_c; |
687 | c->avg_pixels_tab[1][12] = avg_rv40_qpel8_mc03_c; |
688 | c->avg_pixels_tab[1][13] = avg_rv40_qpel8_mc13_c; |
689 | c->avg_pixels_tab[1][14] = avg_rv40_qpel8_mc23_c; |
690 | c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c; |
691 | |
692 | c->put_chroma_pixels_tab[0] = put_rv40_chroma_mc8_c; |
693 | c->put_chroma_pixels_tab[1] = put_rv40_chroma_mc4_c; |
694 | c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c; |
695 | c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c; |
696 | |
697 | c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16; |
698 | c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8; |
699 | c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16; |
700 | c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8; |
701 | |
702 | c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter; |
703 | c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter; |
704 | c->rv40_strong_loop_filter[0] = rv40_h_strong_loop_filter; |
705 | c->rv40_strong_loop_filter[1] = rv40_v_strong_loop_filter; |
706 | c->rv40_loop_filter_strength[0] = rv40_h_loop_filter_strength; |
707 | c->rv40_loop_filter_strength[1] = rv40_v_loop_filter_strength; |
708 | |
709 | if (ARCH_AARCH64) |
710 | ff_rv40dsp_init_aarch64(c); |
711 | if (ARCH_ARM) |
712 | ff_rv40dsp_init_arm(c); |
713 | if (ARCH_X86) |
714 | ff_rv40dsp_init_x86(c); |
715 | } |
716 |