blob: fdaa292d3717b62897c0a718f855be76da488d05
1 | /* |
2 | * Copyright (C) 2004 The FFmpeg project |
3 | * |
4 | * This file is part of FFmpeg. |
5 | * |
6 | * FFmpeg is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * FFmpeg is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with FFmpeg; if not, write to the Free Software |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | */ |
20 | |
21 | /** |
22 | * @file |
23 | * Standard C DSP-oriented functions cribbed from the original VP3 |
24 | * source code. |
25 | */ |
26 | |
27 | #include "libavutil/attributes.h" |
28 | #include "libavutil/common.h" |
29 | #include "libavutil/intreadwrite.h" |
30 | |
31 | #include "avcodec.h" |
32 | #include "rnd_avg.h" |
33 | #include "vp3dsp.h" |
34 | |
/* Rounding bias used by the DC-only shortcut in idct(): it is shifted left by
 * 16 and added to the 16.16 product before the final >> 20. */
#define IdctAdjustBeforeShift 8
/* IDCT multipliers in 16.16 fixed point.  Numerically, xC<k>S<8-k> equals
 * round(65536 * cos(k * pi / 16)), which is the same value as
 * 65536 * sin((8 - k) * pi / 16). */
#define xC1S7 64277
#define xC2S6 60547
#define xC3S5 54491
#define xC4S4 46341
#define xC5S3 36410
#define xC6S2 25080
#define xC7S1 12785

/* Fixed-point multiply: the product is formed in SUINT, converted back to int
 * and only then shifted right by 16 (the cast binds tighter than >>).
 * SUINT is declared outside this file; presumably it is an unsigned type so
 * that an overflowing product wraps instead of being undefined - TODO confirm. */
#define M(a, b) ((int)((SUINT)(a) * (b)) >> 16)
45 | |
/**
 * Two-pass 8x8 inverse DCT in 16.16 fixed point.
 *
 * The first pass transforms the coefficients in place inside input; the
 * second pass transforms again and writes the result to dst.
 *
 * Layout, as implied by the indexing below: input[8 * c + r] ends up at
 * output row r, column c (dst[r * stride + c]).
 *
 * @param dst    top-left output pixel of the 8x8 area
 * @param stride distance in bytes between two output rows
 * @param input  64 coefficients; overwritten by the first pass and NOT
 *               cleared here (the callers in this file memset it afterwards)
 * @param type   1: overwrite dst with the result biased by +128 ("put");
 *               any other value: add the result to the existing dst pixels
 *               (the callers in this file pass 2 for that)
 */
static av_always_inline void idct(uint8_t *dst, ptrdiff_t stride,
                                  int16_t *input, int type)
{
    int16_t *ip = input;

    int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
    int Ed, Gd, Add, Bdd, Fd, Hd;

    int i;

    /* Inverse DCT on the rows now.
     * Each iteration handles the eight coefficients ip[0 * 8] .. ip[7 * 8],
     * i.e. elements spaced 8 apart in input, which map to one output row. */
    for (i = 0; i < 8; i++) {
        /* Check for non-zero values; an all-zero line is left untouched
         * because every result below would be zero as well. */
        if (ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
            ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8]) {
            /* Odd-indexed coefficients: M(c, v) scales v by c / 65536. */
            A = M(xC1S7, ip[1 * 8]) + M(xC7S1, ip[7 * 8]);
            B = M(xC7S1, ip[1 * 8]) - M(xC1S7, ip[7 * 8]);
            C = M(xC3S5, ip[3 * 8]) + M(xC5S3, ip[5 * 8]);
            D = M(xC3S5, ip[5 * 8]) - M(xC5S3, ip[3 * 8]);

            Ad = M(xC4S4, (A - C));
            Bd = M(xC4S4, (B - D));

            Cd = A + C;
            Dd = B + D;

            /* Even-indexed coefficients. */
            E = M(xC4S4, (ip[0 * 8] + ip[4 * 8]));
            F = M(xC4S4, (ip[0 * 8] - ip[4 * 8]));

            G = M(xC2S6, ip[2 * 8]) + M(xC6S2, ip[6 * 8]);
            H = M(xC6S2, ip[2 * 8]) - M(xC2S6, ip[6 * 8]);

            Ed = E - G;
            Gd = E + G;

            Add = F + Ad;
            Bdd = Bd - H;

            Fd = F - Ad;
            Hd = Bd + H;

            /* Final sequence of operations over-write original inputs.
             * The int results are stored back as int16_t. */
            ip[0 * 8] = Gd + Cd;
            ip[7 * 8] = Gd - Cd;

            ip[1 * 8] = Add + Hd;
            ip[2 * 8] = Add - Hd;

            ip[3 * 8] = Ed + Dd;
            ip[4 * 8] = Ed - Dd;

            ip[5 * 8] = Fd + Bdd;
            ip[6 * 8] = Fd - Bdd;
        }

        ip += 1; /* next row */
    }

    ip = input;

    /* Second pass: each iteration takes eight consecutive values ip[0..7]
     * and produces one vertical line of eight output pixels in dst. */
    for (i = 0; i < 8; i++) {
        /* Check for non-zero values (bitwise or faster than ||).
         * ip[0] is deliberately left out: a line with only ip[0] set is
         * handled by the cheaper constant-fill branch further down. */
        if (ip[1] | ip[2] | ip[3] |
            ip[4] | ip[5] | ip[6] | ip[7]) {
            A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]);
            B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]);
            C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]);
            D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]);

            Ad = M(xC4S4, (A - C));
            Bd = M(xC4S4, (B - D));

            Cd = A + C;
            Dd = B + D;

            /* + 8 rounds the final >> 4; E and F feed every output below. */
            E = M(xC4S4, (ip[0] + ip[4])) + 8;
            F = M(xC4S4, (ip[0] - ip[4])) + 8;

            /* In put mode, pre-add 16 * 128 so that every output comes out
             * biased by +128 after the >> 4. */
            if (type == 1) { // HACK
                E += 16 * 128;
                F += 16 * 128;
            }

            G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]);
            H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]);

            Ed = E - G;
            Gd = E + G;

            Add = F + Ad;
            Bdd = Bd - H;

            Fd = F - Ad;
            Hd = Bd + H;

            /* Final sequence of operations: scale down by >> 4, then either
             * store the clipped value (put) or add it to the existing pixel
             * and clip (add). */
            if (type == 1) {
                dst[0 * stride] = av_clip_uint8((Gd + Cd) >> 4);
                dst[7 * stride] = av_clip_uint8((Gd - Cd) >> 4);

                dst[1 * stride] = av_clip_uint8((Add + Hd) >> 4);
                dst[2 * stride] = av_clip_uint8((Add - Hd) >> 4);

                dst[3 * stride] = av_clip_uint8((Ed + Dd) >> 4);
                dst[4 * stride] = av_clip_uint8((Ed - Dd) >> 4);

                dst[5 * stride] = av_clip_uint8((Fd + Bdd) >> 4);
                dst[6 * stride] = av_clip_uint8((Fd - Bdd) >> 4);
            } else {
                dst[0 * stride] = av_clip_uint8(dst[0 * stride] + ((Gd + Cd) >> 4));
                dst[7 * stride] = av_clip_uint8(dst[7 * stride] + ((Gd - Cd) >> 4));

                dst[1 * stride] = av_clip_uint8(dst[1 * stride] + ((Add + Hd) >> 4));
                dst[2 * stride] = av_clip_uint8(dst[2 * stride] + ((Add - Hd) >> 4));

                dst[3 * stride] = av_clip_uint8(dst[3 * stride] + ((Ed + Dd) >> 4));
                dst[4 * stride] = av_clip_uint8(dst[4 * stride] + ((Ed - Dd) >> 4));

                dst[5 * stride] = av_clip_uint8(dst[5 * stride] + ((Fd + Bdd) >> 4));
                dst[6 * stride] = av_clip_uint8(dst[6 * stride] + ((Fd - Bdd) >> 4));
            }
        } else {
            /* Only ip[0] can be non-zero here, so all eight outputs share one
             * value.  (x + (8 << 16)) >> 20 is the "+ 8, >> 4" rounding of
             * the general branch folded into a single shift of the 16.16
             * product (assuming >> on negative ints is an arithmetic shift). */
            if (type == 1) {
                dst[0*stride] =
                dst[1*stride] =
                dst[2*stride] =
                dst[3*stride] =
                dst[4*stride] =
                dst[5*stride] =
                dst[6*stride] =
                dst[7*stride] = av_clip_uint8(128 + ((xC4S4 * ip[0] + (IdctAdjustBeforeShift << 16)) >> 20));
            } else {
                /* In add mode a zero ip[0] leaves dst untouched. */
                if (ip[0]) {
                    int v = (xC4S4 * ip[0] + (IdctAdjustBeforeShift << 16)) >> 20;
                    dst[0 * stride] = av_clip_uint8(dst[0 * stride] + v);
                    dst[1 * stride] = av_clip_uint8(dst[1 * stride] + v);
                    dst[2 * stride] = av_clip_uint8(dst[2 * stride] + v);
                    dst[3 * stride] = av_clip_uint8(dst[3 * stride] + v);
                    dst[4 * stride] = av_clip_uint8(dst[4 * stride] + v);
                    dst[5 * stride] = av_clip_uint8(dst[5 * stride] + v);
                    dst[6 * stride] = av_clip_uint8(dst[6 * stride] + v);
                    dst[7 * stride] = av_clip_uint8(dst[7 * stride] + v);
                }
            }
        }

        ip += 8; /* next column */
        dst++;
    }
}
196 | |
/**
 * Inverse-transform a coefficient block, store the pixels in dest and then
 * zero all 64 coefficients of the block.
 */
static void vp3_idct_put_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                           int16_t *block /* align 16 */)
{
    /* type 1 selects the overwrite ("put") path of idct() */
    idct(dest, stride, block, 1);

    /* leave the coefficient buffer fully cleared */
    memset(block, 0, 64 * sizeof(block[0]));
}
203 | |
/**
 * Inverse-transform a coefficient block, add the result to the pixels
 * already in dest and then zero all 64 coefficients of the block.
 */
static void vp3_idct_add_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                           int16_t *block /* align 16 */)
{
    /* any type other than 1 selects the accumulate ("add") path of idct() */
    idct(dest, stride, block, 2);

    /* leave the coefficient buffer fully cleared */
    memset(block, 0, 64 * sizeof(block[0]));
}
210 | |
/**
 * Add a single offset derived from block[0] to every pixel of an 8x8 area,
 * clipping each result to 0..255, then reset block[0] to zero.
 *
 * Only block[0] is read and written; the other coefficients are untouched.
 */
static void vp3_idct_dc_add_c(uint8_t *dest /* align 8 */, ptrdiff_t stride,
                              int16_t *block /* align 16 */)
{
    /* rounded offset: (block[0] + 15) / 32 */
    const int offset = (block[0] + 15) >> 5;
    int row, col;

    for (row = 0; row < 8; row++) {
        uint8_t *line = dest + row * stride;

        for (col = 0; col < 8; col++)
            line[col] = av_clip_uint8(line[col] + offset);
    }

    block[0] = 0;
}
229 | |
/**
 * Filter across the horizontal edge between the row at first_pixel and the
 * row above it, for eight consecutive pixels.
 *
 * For each column a value is computed from the two pixels above and the two
 * pixels at/below the edge, mapped through bounding_values, then added to
 * the pixel above the edge and subtracted from the pixel below it.
 *
 * bounding_values is indexed with (value + 4) >> 3, which can be negative,
 * so the pointer has to allow negative indices.
 */
static void vp3_v_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
                                int *bounding_values)
{
    int x;

    for (x = 0; x < 8; x++) {
        uint8_t *below = first_pixel + x;   /* first pixel under the edge */
        uint8_t *above = below - stride;    /* last pixel over the edge   */
        int delta, adjust;

        delta  = (above[-stride] - below[stride]) +
                 (below[0] - above[0]) * 3;
        adjust = bounding_values[(delta + 4) >> 3];

        above[0] = av_clip_uint8(above[0] + adjust);
        below[0] = av_clip_uint8(below[0] - adjust);
    }
}
246 | |
/**
 * Filter across the vertical edge between the column at first_pixel and the
 * column to its left, for eight consecutive rows.
 *
 * For each row a value is computed from the two pixels left of the edge and
 * the two pixels at/right of it, mapped through bounding_values, then added
 * to the pixel left of the edge and subtracted from the pixel right of it.
 *
 * bounding_values is indexed with (value + 4) >> 3, which can be negative,
 * so the pointer has to allow negative indices.
 */
static void vp3_h_loop_filter_c(uint8_t *first_pixel, ptrdiff_t stride,
                                int *bounding_values)
{
    uint8_t *row        = first_pixel;
    uint8_t *const stop = first_pixel + 8 * stride;

    /* walk row by row until the pointer reaches the ninth row */
    while (row != stop) {
        int delta  = (row[-2] - row[1]) + (row[0] - row[-1]) * 3;
        int adjust = bounding_values[(delta + 4) >> 3];

        row[-1] = av_clip_uint8(row[-1] + adjust);
        row[ 0] = av_clip_uint8(row[ 0] - adjust);

        row += stride;
    }
}
262 | |
/**
 * Write h rows of 8 bytes to dst, each row being no_rnd_avg32() applied to
 * the matching 8 bytes of src1 and src2, four bytes at a time.
 *
 * All three buffers are addressed with the same stride.  The sources are
 * read with AV_RN32 and the destination is written with AV_WN32A
 * (presumably the aligned-store variant - confirm in intreadwrite.h).
 */
static void put_no_rnd_pixels_l2(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
    int row, half;

    for (row = 0; row < h; row++) {
        const ptrdiff_t base = row * stride;

        /* each 8-byte row is handled as two 32-bit words */
        for (half = 0; half < 8; half += 4) {
            const uint32_t p = AV_RN32(&src1[base + half]);
            const uint32_t q = AV_RN32(&src2[base + half]);

            AV_WN32A(&dst[base + half], no_rnd_avg32(p, q));
        }
    }
}
279 | |
280 | av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags) |
281 | { |
282 | c->put_no_rnd_pixels_l2 = put_no_rnd_pixels_l2; |
283 | |
284 | c->idct_put = vp3_idct_put_c; |
285 | c->idct_add = vp3_idct_add_c; |
286 | c->idct_dc_add = vp3_idct_dc_add_c; |
287 | c->v_loop_filter = vp3_v_loop_filter_c; |
288 | c->h_loop_filter = vp3_h_loop_filter_c; |
289 | |
290 | if (ARCH_ARM) |
291 | ff_vp3dsp_init_arm(c, flags); |
292 | if (ARCH_PPC) |
293 | ff_vp3dsp_init_ppc(c, flags); |
294 | if (ARCH_X86) |
295 | ff_vp3dsp_init_x86(c, flags); |
296 | } |
297 |