blob: f532313441c05426679a378d629819416d3540cc
1 | /* |
2 | * Simple IDCT |
3 | * |
4 | * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> |
5 | * |
6 | * This file is part of FFmpeg. |
7 | * |
8 | * FFmpeg is free software; you can redistribute it and/or |
9 | * modify it under the terms of the GNU Lesser General Public |
10 | * License as published by the Free Software Foundation; either |
11 | * version 2.1 of the License, or (at your option) any later version. |
12 | * |
13 | * FFmpeg is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | * Lesser General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU Lesser General Public |
19 | * License along with FFmpeg; if not, write to the Free Software |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 | */ |
22 | |
23 | /** |
24 | * @file |
25 | * simpleidct in C. |
26 | */ |
27 | |
28 | /* Based upon some commented-out C code from mpeg2dec (idct_mmx.c |
29 | * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>). */ |
30 | |
31 | #include "simple_idct.h" |
32 | |
33 | #include "bit_depth_template.c" |
34 | |
35 | #undef W1 |
36 | #undef W2 |
37 | #undef W3 |
38 | #undef W4 |
39 | #undef W5 |
40 | #undef W6 |
41 | #undef W7 |
42 | #undef ROW_SHIFT |
43 | #undef COL_SHIFT |
44 | #undef DC_SHIFT |
45 | #undef MUL |
46 | #undef MAC |
47 | |
48 | #if BIT_DEPTH == 8 |
49 | |
50 | #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
51 | #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
52 | #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
53 | #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
54 | #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
55 | #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
56 | #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
57 | |
58 | #define ROW_SHIFT 11 |
59 | #define COL_SHIFT 20 |
60 | #define DC_SHIFT 3 |
61 | |
62 | #define MUL(a, b) MUL16(a, b) |
63 | #define MAC(a, b, c) MAC16(a, b, c) |
64 | |
65 | #elif BIT_DEPTH == 10 || BIT_DEPTH == 12 |
66 | |
67 | # if BIT_DEPTH == 10 |
68 | #define W1 22725 // 90901 |
69 | #define W2 21407 // 85627 |
70 | #define W3 19265 // 77062 |
71 | #define W4 16384 // 65535 |
72 | #define W5 12873 // 51491 |
73 | #define W6 8867 // 35468 |
74 | #define W7 4520 // 18081 |
75 | |
76 | # ifdef EXTRA_SHIFT |
77 | #define ROW_SHIFT 13 |
78 | #define COL_SHIFT 18 |
79 | #define DC_SHIFT 1 |
80 | # else |
81 | #define ROW_SHIFT 12 |
82 | #define COL_SHIFT 19 |
83 | #define DC_SHIFT 2 |
84 | # endif |
85 | |
86 | # else |
87 | #define W1 45451 |
88 | #define W2 42813 |
89 | #define W3 38531 |
90 | #define W4 32767 |
91 | #define W5 25746 |
92 | #define W6 17734 |
93 | #define W7 9041 |
94 | |
95 | #define ROW_SHIFT 16 |
96 | #define COL_SHIFT 17 |
97 | #define DC_SHIFT -1 |
98 | # endif |
99 | |
100 | #define MUL(a, b) ((a) * (b)) |
101 | #define MAC(a, b, c) ((a) += (b) * (c)) |
102 | |
103 | #else |
104 | |
105 | #error "Unsupported bitdepth" |
106 | |
107 | #endif |
108 | |
109 | #ifdef EXTRA_SHIFT |
110 | static inline void FUNC(idctRowCondDC_extrashift)(int16_t *row, int extra_shift) |
111 | #else |
112 | static inline void FUNC(idctRowCondDC)(int16_t *row, int extra_shift) |
113 | #endif |
114 | { |
115 | SUINT a0, a1, a2, a3, b0, b1, b2, b3; |
116 | |
117 | #if HAVE_FAST_64BIT |
118 | #define ROW0_MASK (0xffffLL << 48 * HAVE_BIGENDIAN) |
119 | if (((AV_RN64A(row) & ~ROW0_MASK) | AV_RN64A(row+4)) == 0) { |
120 | uint64_t temp; |
121 | if (DC_SHIFT - extra_shift >= 0) { |
122 | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; |
123 | } else { |
124 | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; |
125 | } |
126 | temp += temp * (1 << 16); |
127 | temp += temp * ((uint64_t) 1 << 32); |
128 | AV_WN64A(row, temp); |
129 | AV_WN64A(row + 4, temp); |
130 | return; |
131 | } |
132 | #else |
133 | if (!(AV_RN32A(row+2) | |
134 | AV_RN32A(row+4) | |
135 | AV_RN32A(row+6) | |
136 | row[1])) { |
137 | uint32_t temp; |
138 | if (DC_SHIFT - extra_shift >= 0) { |
139 | temp = (row[0] * (1 << (DC_SHIFT - extra_shift))) & 0xffff; |
140 | } else { |
141 | temp = ((row[0] + (1<<(extra_shift - DC_SHIFT-1))) >> (extra_shift - DC_SHIFT)) & 0xffff; |
142 | } |
143 | temp += temp * (1 << 16); |
144 | AV_WN32A(row, temp); |
145 | AV_WN32A(row+2, temp); |
146 | AV_WN32A(row+4, temp); |
147 | AV_WN32A(row+6, temp); |
148 | return; |
149 | } |
150 | #endif |
151 | |
152 | a0 = (W4 * row[0]) + (1 << (ROW_SHIFT + extra_shift - 1)); |
153 | a1 = a0; |
154 | a2 = a0; |
155 | a3 = a0; |
156 | |
157 | a0 += W2 * row[2]; |
158 | a1 += W6 * row[2]; |
159 | a2 -= W6 * row[2]; |
160 | a3 -= W2 * row[2]; |
161 | |
162 | b0 = MUL(W1, row[1]); |
163 | MAC(b0, W3, row[3]); |
164 | b1 = MUL(W3, row[1]); |
165 | MAC(b1, -W7, row[3]); |
166 | b2 = MUL(W5, row[1]); |
167 | MAC(b2, -W1, row[3]); |
168 | b3 = MUL(W7, row[1]); |
169 | MAC(b3, -W5, row[3]); |
170 | |
171 | if (AV_RN64A(row + 4)) { |
172 | a0 += W4*row[4] + W6*row[6]; |
173 | a1 += - W4*row[4] - W2*row[6]; |
174 | a2 += - W4*row[4] + W2*row[6]; |
175 | a3 += W4*row[4] - W6*row[6]; |
176 | |
177 | MAC(b0, W5, row[5]); |
178 | MAC(b0, W7, row[7]); |
179 | |
180 | MAC(b1, -W1, row[5]); |
181 | MAC(b1, -W5, row[7]); |
182 | |
183 | MAC(b2, W7, row[5]); |
184 | MAC(b2, W3, row[7]); |
185 | |
186 | MAC(b3, W3, row[5]); |
187 | MAC(b3, -W1, row[7]); |
188 | } |
189 | |
190 | row[0] = (int)(a0 + b0) >> (ROW_SHIFT + extra_shift); |
191 | row[7] = (int)(a0 - b0) >> (ROW_SHIFT + extra_shift); |
192 | row[1] = (int)(a1 + b1) >> (ROW_SHIFT + extra_shift); |
193 | row[6] = (int)(a1 - b1) >> (ROW_SHIFT + extra_shift); |
194 | row[2] = (int)(a2 + b2) >> (ROW_SHIFT + extra_shift); |
195 | row[5] = (int)(a2 - b2) >> (ROW_SHIFT + extra_shift); |
196 | row[3] = (int)(a3 + b3) >> (ROW_SHIFT + extra_shift); |
197 | row[4] = (int)(a3 - b3) >> (ROW_SHIFT + extra_shift); |
198 | } |
199 | |
/*
 * Column butterfly shared by the column functions below.
 *
 * The expansion site must provide `col`, pointing at the top coefficient of
 * a column whose rows are 8 int16_t apart (hence col[8*k]), and eight integer
 * locals a0..a3 / b0..b3 that receive the results. a0..a3 accumulate the
 * even-indexed inputs plus the rounding term for the later >> COL_SHIFT;
 * b0..b3 accumulate the odd-indexed inputs. The caller then forms the outputs
 * as (a_k + b_k) and (a_k - b_k), each shifted right by COL_SHIFT.
 * Inputs 4..7 are skipped when zero.
 *
 * The even-part products are formed in SUINT so that out-of-range
 * coefficients wrap instead of overflowing a signed int.
 */
#define IDCT_COLS do {                                          \
        a0 = (SUINT)W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));  \
        a1 = a0;                                                \
        a2 = a0;                                                \
        a3 = a0;                                                \
                                                                \
        a0 += (SUINT)W2*col[8*2];                               \
        a1 += (SUINT)W6*col[8*2];                               \
        a2 -= (SUINT)W6*col[8*2];                               \
        a3 -= (SUINT)W2*col[8*2];                               \
                                                                \
        b0 = MUL(W1, col[8*1]);                                 \
        b1 = MUL(W3, col[8*1]);                                 \
        b2 = MUL(W5, col[8*1]);                                 \
        b3 = MUL(W7, col[8*1]);                                 \
                                                                \
        MAC(b0,  W3, col[8*3]);                                 \
        MAC(b1, -W7, col[8*3]);                                 \
        MAC(b2, -W1, col[8*3]);                                 \
        MAC(b3, -W5, col[8*3]);                                 \
                                                                \
        if (col[8*4]) {                                         \
            a0 += (SUINT)W4*col[8*4];                           \
            a1 -= (SUINT)W4*col[8*4];                           \
            a2 -= (SUINT)W4*col[8*4];                           \
            a3 += (SUINT)W4*col[8*4];                           \
        }                                                       \
                                                                \
        if (col[8*5]) {                                         \
            MAC(b0,  W5, col[8*5]);                             \
            MAC(b1, -W1, col[8*5]);                             \
            MAC(b2,  W7, col[8*5]);                             \
            MAC(b3,  W3, col[8*5]);                             \
        }                                                       \
                                                                \
        if (col[8*6]) {                                         \
            a0 += (SUINT)W6*col[8*6];                           \
            a1 -= (SUINT)W2*col[8*6];                           \
            a2 += (SUINT)W2*col[8*6];                           \
            a3 -= (SUINT)W6*col[8*6];                           \
        }                                                       \
                                                                \
        if (col[8*7]) {                                         \
            MAC(b0,  W7, col[8*7]);                             \
            MAC(b1, -W5, col[8*7]);                             \
            MAC(b2,  W3, col[8*7]);                             \
            MAC(b3, -W1, col[8*7]);                             \
        }                                                       \
    } while (0)
249 | |
250 | #ifdef EXTRA_SHIFT |
251 | static inline void FUNC(idctSparseCol_extrashift)(int16_t *col) |
252 | #else |
253 | static inline void FUNC(idctSparseColPut)(pixel *dest, ptrdiff_t line_size, |
254 | int16_t *col) |
255 | { |
256 | SUINT a0, a1, a2, a3, b0, b1, b2, b3; |
257 | |
258 | IDCT_COLS; |
259 | |
260 | dest[0] = av_clip_pixel((int)(a0 + b0) >> COL_SHIFT); |
261 | dest += line_size; |
262 | dest[0] = av_clip_pixel((int)(a1 + b1) >> COL_SHIFT); |
263 | dest += line_size; |
264 | dest[0] = av_clip_pixel((int)(a2 + b2) >> COL_SHIFT); |
265 | dest += line_size; |
266 | dest[0] = av_clip_pixel((int)(a3 + b3) >> COL_SHIFT); |
267 | dest += line_size; |
268 | dest[0] = av_clip_pixel((int)(a3 - b3) >> COL_SHIFT); |
269 | dest += line_size; |
270 | dest[0] = av_clip_pixel((int)(a2 - b2) >> COL_SHIFT); |
271 | dest += line_size; |
272 | dest[0] = av_clip_pixel((int)(a1 - b1) >> COL_SHIFT); |
273 | dest += line_size; |
274 | dest[0] = av_clip_pixel((int)(a0 - b0) >> COL_SHIFT); |
275 | } |
276 | |
277 | static inline void FUNC(idctSparseColAdd)(pixel *dest, ptrdiff_t line_size, |
278 | int16_t *col) |
279 | { |
280 | int a0, a1, a2, a3, b0, b1, b2, b3; |
281 | |
282 | IDCT_COLS; |
283 | |
284 | dest[0] = av_clip_pixel(dest[0] + ((a0 + b0) >> COL_SHIFT)); |
285 | dest += line_size; |
286 | dest[0] = av_clip_pixel(dest[0] + ((a1 + b1) >> COL_SHIFT)); |
287 | dest += line_size; |
288 | dest[0] = av_clip_pixel(dest[0] + ((a2 + b2) >> COL_SHIFT)); |
289 | dest += line_size; |
290 | dest[0] = av_clip_pixel(dest[0] + ((a3 + b3) >> COL_SHIFT)); |
291 | dest += line_size; |
292 | dest[0] = av_clip_pixel(dest[0] + ((a3 - b3) >> COL_SHIFT)); |
293 | dest += line_size; |
294 | dest[0] = av_clip_pixel(dest[0] + ((a2 - b2) >> COL_SHIFT)); |
295 | dest += line_size; |
296 | dest[0] = av_clip_pixel(dest[0] + ((a1 - b1) >> COL_SHIFT)); |
297 | dest += line_size; |
298 | dest[0] = av_clip_pixel(dest[0] + ((a0 - b0) >> COL_SHIFT)); |
299 | } |
300 | |
301 | static inline void FUNC(idctSparseCol)(int16_t *col) |
302 | #endif |
303 | { |
304 | int a0, a1, a2, a3, b0, b1, b2, b3; |
305 | |
306 | IDCT_COLS; |
307 | |
308 | col[0 ] = ((a0 + b0) >> COL_SHIFT); |
309 | col[8 ] = ((a1 + b1) >> COL_SHIFT); |
310 | col[16] = ((a2 + b2) >> COL_SHIFT); |
311 | col[24] = ((a3 + b3) >> COL_SHIFT); |
312 | col[32] = ((a3 - b3) >> COL_SHIFT); |
313 | col[40] = ((a2 - b2) >> COL_SHIFT); |
314 | col[48] = ((a1 - b1) >> COL_SHIFT); |
315 | col[56] = ((a0 - b0) >> COL_SHIFT); |
316 | } |
317 | |
318 | #ifndef EXTRA_SHIFT |
319 | void FUNC(ff_simple_idct_put)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block) |
320 | { |
321 | pixel *dest = (pixel *)dest_; |
322 | int i; |
323 | |
324 | line_size /= sizeof(pixel); |
325 | |
326 | for (i = 0; i < 8; i++) |
327 | FUNC(idctRowCondDC)(block + i*8, 0); |
328 | |
329 | for (i = 0; i < 8; i++) |
330 | FUNC(idctSparseColPut)(dest + i, line_size, block + i); |
331 | } |
332 | |
333 | void FUNC(ff_simple_idct_add)(uint8_t *dest_, ptrdiff_t line_size, int16_t *block) |
334 | { |
335 | pixel *dest = (pixel *)dest_; |
336 | int i; |
337 | |
338 | line_size /= sizeof(pixel); |
339 | |
340 | for (i = 0; i < 8; i++) |
341 | FUNC(idctRowCondDC)(block + i*8, 0); |
342 | |
343 | for (i = 0; i < 8; i++) |
344 | FUNC(idctSparseColAdd)(dest + i, line_size, block + i); |
345 | } |
346 | |
347 | void FUNC(ff_simple_idct)(int16_t *block) |
348 | { |
349 | int i; |
350 | |
351 | for (i = 0; i < 8; i++) |
352 | FUNC(idctRowCondDC)(block + i*8, 0); |
353 | |
354 | for (i = 0; i < 8; i++) |
355 | FUNC(idctSparseCol)(block + i); |
356 | } |
357 | #endif |
358 |