1 | /* |
---|
2 | * jidctint.c |
---|
3 | * |
---|
4 | * Copyright (C) 1991-1998, Thomas G. Lane. |
---|
5 | * This file is part of the Independent JPEG Group's software. |
---|
6 | * For conditions of distribution and use, see the accompanying README file. |
---|
7 | * |
---|
8 | * This file contains a slow-but-accurate integer implementation of the |
---|
9 | * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine |
---|
10 | * must also perform dequantization of the input coefficients. |
---|
11 | * |
---|
12 | * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT |
---|
13 | * on each row (or vice versa, but it's more convenient to emit a row at |
---|
14 | * a time). Direct algorithms are also available, but they are much more |
---|
15 | * complex and seem not to be any faster when reduced to code. |
---|
16 | * |
---|
17 | * This implementation is based on an algorithm described in |
---|
18 | * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT |
---|
19 | * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, |
---|
20 | * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. |
---|
21 | * The primary algorithm described there uses 11 multiplies and 29 adds. |
---|
22 | * We use their alternate method with 12 multiplies and 32 adds. |
---|
23 | * The advantage of this method is that no data path contains more than one |
---|
24 | * multiplication; this allows a very simple and accurate implementation in |
---|
25 | * scaled fixed-point arithmetic, with a minimal number of shifts. |
---|
26 | */ |
---|
27 | |
---|
28 | #define JPEG_INTERNALS |
---|
29 | #include "jinclude.h" |
---|
30 | #include "jpeglib.h" |
---|
31 | #include "jdct.h" /* Private declarations for DCT subsystem */ |
---|
32 | |
---|
33 | #ifdef DCT_ISLOW_SUPPORTED |
---|
34 | |
---|
35 | |
---|
36 | /* |
---|
37 | * This module is specialized to the case DCTSIZE = 8. |
---|
38 | */ |
---|
39 | |
---|
40 | #if DCTSIZE != 8 |
---|
41 | Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ |
---|
42 | #endif |
---|
43 | |
---|
44 | |
---|
45 | /* |
---|
46 | * The poop on this scaling stuff is as follows: |
---|
47 | * |
---|
48 | * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) |
---|
49 | * larger than the true IDCT outputs. The final outputs are therefore |
---|
50 | * a factor of N larger than desired; since N=8 this can be cured by |
---|
51 | * a simple right shift at the end of the algorithm. The advantage of |
---|
52 | * this arrangement is that we save two multiplications per 1-D IDCT, |
---|
53 | * because the y0 and y4 inputs need not be divided by sqrt(N). |
---|
54 | * |
---|
55 | * We have to do addition and subtraction of the integer inputs, which |
---|
56 | * is no problem, and multiplication by fractional constants, which is |
---|
57 | * a problem to do in integer arithmetic. We multiply all the constants |
---|
58 | * by CONST_SCALE and convert them to integer constants (thus retaining |
---|
59 | * CONST_BITS bits of precision in the constants). After doing a |
---|
60 | * multiplication we have to divide the product by CONST_SCALE, with proper |
---|
61 | * rounding, to produce the correct output. This division can be done |
---|
62 | * cheaply as a right shift of CONST_BITS bits. We postpone shifting |
---|
63 | * as long as possible so that partial sums can be added together with |
---|
64 | * full fractional precision. |
---|
65 | * |
---|
66 | * The outputs of the first pass are scaled up by PASS1_BITS bits so that |
---|
67 | * they are represented to better-than-integral precision. These outputs |
---|
68 | * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word |
---|
69 | * with the recommended scaling. (To scale up 12-bit sample data further, an |
---|
70 | * intermediate INT32 array would be needed.) |
---|
71 | * |
---|
72 | * To avoid overflow of the 32-bit intermediate results in pass 2, we must |
---|
73 | * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis |
---|
74 | * shows that the values given below are the most effective. |
---|
75 | */ |
---|
76 | |
---|
/* Fixed-point precision settings.
 * CONST_BITS: fraction bits retained in the multiplier constants; it is 13
 *             for every sample width.
 * PASS1_BITS: extra up-scaling applied to the pass-1 (column) results; it is
 *             2 for 8-bit samples and 1 otherwise.
 */
#define CONST_BITS  13

#if BITS_IN_JSAMPLE == 8
#define PASS1_BITS  2
#else
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif
---|
84 | |
---|
85 | /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus |
---|
86 | * causing a lot of useless floating-point operations at run time. |
---|
87 | * To get around this we use the following pre-calculated constants. |
---|
88 | * If you change CONST_BITS you may want to add appropriate values. |
---|
89 | * (With a reasonable C compiler, you can just rely on the FIX() macro...) |
---|
90 | */ |
---|
91 | |
---|
/* Multiplier constants, one per FIX(x) value named in the identifier.
 * With CONST_BITS == 13 the values are spelled out as integer literals;
 * for any other CONST_BITS they are produced by the FIX() macro.
 */
#if CONST_BITS != 13

/* Non-default precision: defer to FIX(). */
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)

#else

/* CONST_BITS == 13: pre-calculated literals. */
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */

#endif
---|
119 | |
---|
120 | |
---|
121 | /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result. |
---|
122 | * For 8-bit samples with the recommended scaling, all the variable |
---|
123 | * and constant values involved are no more than 16 bits wide, so a |
---|
124 | * 16x16->32 bit multiply can be used instead of a full 32x32 multiply. |
---|
125 | * For 12-bit samples, a full 32-bit multiplication will be needed. |
---|
126 | */ |
---|
127 | |
---|
/* MULTIPLY(var, cnst): product of a variable and a constant.
 * Builds with 8-bit samples route through MULTIPLY16C16; every other
 * sample width uses an ordinary multiplication.
 */
#if BITS_IN_JSAMPLE != 8
#define MULTIPLY(var,cnst)  ((var) * (cnst))
#else
#define MULTIPLY(var,cnst)  MULTIPLY16C16(var,cnst)
#endif
---|
133 | |
---|
134 | |
---|
/* DEQUANTIZE(c, q): scale coefficient c by multiplier-table entry q.
 * The coefficient is cast to ISLOW_MULT_TYPE before the multiplication.
 * In this module both inputs and the result are 16 bits or less, so
 * either an int or a short multiply will work.
 */

#define DEQUANTIZE(c,q)  (((ISLOW_MULT_TYPE) (c)) * (q))
---|
141 | |
---|
142 | |
---|
143 | /* |
---|
144 | * Perform dequantization and inverse DCT on one block of coefficients. |
---|
145 | */ |
---|
146 | |
---|
147 | GLOBAL(void) |
---|
148 | jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
149 | JCOEFPTR coef_block, |
---|
150 | JSAMPARRAY output_buf, JDIMENSION output_col) |
---|
151 | { |
---|
152 | INT32 tmp0, tmp1, tmp2, tmp3; |
---|
153 | INT32 tmp10, tmp11, tmp12, tmp13; |
---|
154 | INT32 z1, z2, z3, z4, z5; |
---|
155 | JCOEFPTR inptr; |
---|
156 | ISLOW_MULT_TYPE * quantptr; |
---|
157 | int * wsptr; |
---|
158 | JSAMPROW outptr; |
---|
159 | JSAMPLE *range_limit = IDCT_range_limit(cinfo); |
---|
160 | int ctr; |
---|
161 | int workspace[DCTSIZE2]; /* buffers data between passes */ |
---|
162 | SHIFT_TEMPS |
---|
163 | |
---|
164 | /* Pass 1: process columns from input, store into work array. */ |
---|
165 | /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ |
---|
166 | /* furthermore, we scale the results by 2**PASS1_BITS. */ |
---|
167 | |
---|
168 | inptr = coef_block; |
---|
169 | quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table; |
---|
170 | wsptr = workspace; |
---|
171 | for (ctr = DCTSIZE; ctr > 0; ctr--) { |
---|
172 | /* Due to quantization, we will usually find that many of the input |
---|
173 | * coefficients are zero, especially the AC terms. We can exploit this |
---|
174 | * by short-circuiting the IDCT calculation for any column in which all |
---|
175 | * the AC terms are zero. In that case each output is equal to the |
---|
176 | * DC coefficient (with scale factor as needed). |
---|
177 | * With typical images and quantization tables, half or more of the |
---|
178 | * column DCT calculations can be simplified this way. |
---|
179 | */ |
---|
180 | |
---|
181 | if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && |
---|
182 | inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && |
---|
183 | inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && |
---|
184 | inptr[DCTSIZE*7] == 0) { |
---|
185 | /* AC terms all zero */ |
---|
186 | int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; |
---|
187 | |
---|
188 | wsptr[DCTSIZE*0] = dcval; |
---|
189 | wsptr[DCTSIZE*1] = dcval; |
---|
190 | wsptr[DCTSIZE*2] = dcval; |
---|
191 | wsptr[DCTSIZE*3] = dcval; |
---|
192 | wsptr[DCTSIZE*4] = dcval; |
---|
193 | wsptr[DCTSIZE*5] = dcval; |
---|
194 | wsptr[DCTSIZE*6] = dcval; |
---|
195 | wsptr[DCTSIZE*7] = dcval; |
---|
196 | |
---|
197 | inptr++; /* advance pointers to next column */ |
---|
198 | quantptr++; |
---|
199 | wsptr++; |
---|
200 | continue; |
---|
201 | } |
---|
202 | |
---|
203 | /* Even part: reverse the even part of the forward DCT. */ |
---|
204 | /* The rotator is sqrt(2)*c(-6). */ |
---|
205 | |
---|
206 | z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); |
---|
207 | z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); |
---|
208 | |
---|
209 | z1 = MULTIPLY(z2 + z3, FIX_0_541196100); |
---|
210 | tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); |
---|
211 | tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); |
---|
212 | |
---|
213 | z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
214 | z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); |
---|
215 | |
---|
216 | tmp0 = (z2 + z3) << CONST_BITS; |
---|
217 | tmp1 = (z2 - z3) << CONST_BITS; |
---|
218 | |
---|
219 | tmp10 = tmp0 + tmp3; |
---|
220 | tmp13 = tmp0 - tmp3; |
---|
221 | tmp11 = tmp1 + tmp2; |
---|
222 | tmp12 = tmp1 - tmp2; |
---|
223 | |
---|
224 | /* Odd part per figure 8; the matrix is unitary and hence its |
---|
225 | * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
---|
226 | */ |
---|
227 | |
---|
228 | tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); |
---|
229 | tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); |
---|
230 | tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); |
---|
231 | tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); |
---|
232 | |
---|
233 | z1 = tmp0 + tmp3; |
---|
234 | z2 = tmp1 + tmp2; |
---|
235 | z3 = tmp0 + tmp2; |
---|
236 | z4 = tmp1 + tmp3; |
---|
237 | z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ |
---|
238 | |
---|
239 | tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ |
---|
240 | tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ |
---|
241 | tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ |
---|
242 | tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ |
---|
243 | z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ |
---|
244 | z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ |
---|
245 | z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ |
---|
246 | z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ |
---|
247 | |
---|
248 | z3 += z5; |
---|
249 | z4 += z5; |
---|
250 | |
---|
251 | tmp0 += z1 + z3; |
---|
252 | tmp1 += z2 + z4; |
---|
253 | tmp2 += z2 + z3; |
---|
254 | tmp3 += z1 + z4; |
---|
255 | |
---|
256 | /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
---|
257 | |
---|
258 | wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); |
---|
259 | wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); |
---|
260 | wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); |
---|
261 | wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); |
---|
262 | wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); |
---|
263 | wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); |
---|
264 | wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); |
---|
265 | wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); |
---|
266 | |
---|
267 | inptr++; /* advance pointers to next column */ |
---|
268 | quantptr++; |
---|
269 | wsptr++; |
---|
270 | } |
---|
271 | |
---|
272 | /* Pass 2: process rows from work array, store into output array. */ |
---|
273 | /* Note that we must descale the results by a factor of 8 == 2**3, */ |
---|
274 | /* and also undo the PASS1_BITS scaling. */ |
---|
275 | |
---|
276 | wsptr = workspace; |
---|
277 | for (ctr = 0; ctr < DCTSIZE; ctr++) { |
---|
278 | outptr = output_buf[ctr] + output_col; |
---|
279 | /* Rows of zeroes can be exploited in the same way as we did with columns. |
---|
280 | * However, the column calculation has created many nonzero AC terms, so |
---|
281 | * the simplification applies less often (typically 5% to 10% of the time). |
---|
282 | * On machines with very fast multiplication, it's possible that the |
---|
283 | * test takes more time than it's worth. In that case this section |
---|
284 | * may be commented out. |
---|
285 | */ |
---|
286 | |
---|
287 | #ifndef NO_ZERO_ROW_TEST |
---|
288 | if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && |
---|
289 | wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { |
---|
290 | /* AC terms all zero */ |
---|
291 | JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) |
---|
292 | & RANGE_MASK]; |
---|
293 | |
---|
294 | outptr[0] = dcval; |
---|
295 | outptr[1] = dcval; |
---|
296 | outptr[2] = dcval; |
---|
297 | outptr[3] = dcval; |
---|
298 | outptr[4] = dcval; |
---|
299 | outptr[5] = dcval; |
---|
300 | outptr[6] = dcval; |
---|
301 | outptr[7] = dcval; |
---|
302 | |
---|
303 | wsptr += DCTSIZE; /* advance pointer to next row */ |
---|
304 | continue; |
---|
305 | } |
---|
306 | #endif |
---|
307 | |
---|
308 | /* Even part: reverse the even part of the forward DCT. */ |
---|
309 | /* The rotator is sqrt(2)*c(-6). */ |
---|
310 | |
---|
311 | z2 = (INT32) wsptr[2]; |
---|
312 | z3 = (INT32) wsptr[6]; |
---|
313 | |
---|
314 | z1 = MULTIPLY(z2 + z3, FIX_0_541196100); |
---|
315 | tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); |
---|
316 | tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); |
---|
317 | |
---|
318 | tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; |
---|
319 | tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; |
---|
320 | |
---|
321 | tmp10 = tmp0 + tmp3; |
---|
322 | tmp13 = tmp0 - tmp3; |
---|
323 | tmp11 = tmp1 + tmp2; |
---|
324 | tmp12 = tmp1 - tmp2; |
---|
325 | |
---|
326 | /* Odd part per figure 8; the matrix is unitary and hence its |
---|
327 | * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
---|
328 | */ |
---|
329 | |
---|
330 | tmp0 = (INT32) wsptr[7]; |
---|
331 | tmp1 = (INT32) wsptr[5]; |
---|
332 | tmp2 = (INT32) wsptr[3]; |
---|
333 | tmp3 = (INT32) wsptr[1]; |
---|
334 | |
---|
335 | z1 = tmp0 + tmp3; |
---|
336 | z2 = tmp1 + tmp2; |
---|
337 | z3 = tmp0 + tmp2; |
---|
338 | z4 = tmp1 + tmp3; |
---|
339 | z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ |
---|
340 | |
---|
341 | tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ |
---|
342 | tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ |
---|
343 | tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ |
---|
344 | tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ |
---|
345 | z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ |
---|
346 | z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ |
---|
347 | z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ |
---|
348 | z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ |
---|
349 | |
---|
350 | z3 += z5; |
---|
351 | z4 += z5; |
---|
352 | |
---|
353 | tmp0 += z1 + z3; |
---|
354 | tmp1 += z2 + z4; |
---|
355 | tmp2 += z2 + z3; |
---|
356 | tmp3 += z1 + z4; |
---|
357 | |
---|
358 | /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
---|
359 | |
---|
360 | outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, |
---|
361 | CONST_BITS+PASS1_BITS+3) |
---|
362 | & RANGE_MASK]; |
---|
363 | outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, |
---|
364 | CONST_BITS+PASS1_BITS+3) |
---|
365 | & RANGE_MASK]; |
---|
366 | outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, |
---|
367 | CONST_BITS+PASS1_BITS+3) |
---|
368 | & RANGE_MASK]; |
---|
369 | outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, |
---|
370 | CONST_BITS+PASS1_BITS+3) |
---|
371 | & RANGE_MASK]; |
---|
372 | outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, |
---|
373 | CONST_BITS+PASS1_BITS+3) |
---|
374 | & RANGE_MASK]; |
---|
375 | outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, |
---|
376 | CONST_BITS+PASS1_BITS+3) |
---|
377 | & RANGE_MASK]; |
---|
378 | outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, |
---|
379 | CONST_BITS+PASS1_BITS+3) |
---|
380 | & RANGE_MASK]; |
---|
381 | outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, |
---|
382 | CONST_BITS+PASS1_BITS+3) |
---|
383 | & RANGE_MASK]; |
---|
384 | |
---|
385 | wsptr += DCTSIZE; /* advance pointer to next row */ |
---|
386 | } |
---|
387 | } |
---|
388 | |
---|
389 | |
---|
390 | #ifdef HAVE_SSE2_INTEL_MNEMONICS |
---|
391 | |
---|
392 | /* |
---|
393 | * Intel SSE2 optimized Inverse Discrete Cosine Transform |
---|
394 | * |
---|
395 | * |
---|
396 | * Copyright (c) 2001-2002 Intel Corporation |
---|
397 | * All Rights Reserved |
---|
398 | * |
---|
399 | * |
---|
400 | * Authors: |
---|
401 | * Danilov G. |
---|
402 | * |
---|
403 | * |
---|
404 | *----------------------------------------------------------------------------- |
---|
405 | * |
---|
406 | * References: |
---|
407 | * K.R. Rao and P. Yip |
---|
408 | * Discrete Cosine Transform. |
---|
409 | * Algorithms, Advantages, Applications. |
---|
410 | * Academic Press, Inc, London, 1990. |
---|
411 | * JPEG Group's software. |
---|
412 | * This implementation is based on Appendix A.2 of the book (R&Y) ... |
---|
413 | * |
---|
414 | *----------------------------------------------------------------------------- |
---|
415 | */ |
---|
416 | |
---|
/* Ipp-prefixed integer aliases, grouped by width.  The suffix gives the
 * signedness (u = unsigned, s = signed); the number is the bit width the
 * name advertises, which presumes 8-bit char, 16-bit short and 32-bit int.
 */
typedef unsigned char  Ipp8u;
typedef signed char    Ipp8s;

typedef unsigned short Ipp16u;
typedef signed short   Ipp16s;

typedef unsigned int   Ipp32u;
typedef signed int     Ipp32s;
---|
424 | |
---|
/* Scaling and rounding constants for the SSE2 inverse DCT.
 *
 * BITS_INV_ACC selects the accuracy; the shift counts and rounding terms
 * below are all derived from it.
 */
#define BITS_INV_ACC	4

/* NOTE(review): SHIFT_INV_ROW and SHIFT_INV_COL are left without enclosing
 * parentheses on purpose.  They are substituted into immediate operands of
 * psrad/psraw inside __asm macros; confirm the inline assembler accepts a
 * parenthesized immediate before wrapping them.
 */
#define SHIFT_INV_ROW	16 - BITS_INV_ACC
#define SHIFT_INV_COL	1 + BITS_INV_ACC

/* Rounding terms.  The value macros are fully parenthesized so that they
 * expand safely inside larger C expressions.  RND_INV_COL and RND_INV_CORR
 * no longer carry the stray '=' that made them unusable in an expression.
 */
#define RND_INV_ROW	(1024 * (6 - BITS_INV_ACC))	/* 1 << (SHIFT_INV_ROW-1) */
#define RND_INV_COL	(16 * (BITS_INV_ACC - 3))	/* 1 << (SHIFT_INV_COL-1) */
#define RND_INV_CORR	(RND_INV_COL - 1)		/* correction -1.0 and round */

/* Per-row correction terms added to the row rounding constant. */
#define c_inv_corr_0	(-1024 * (6 - BITS_INV_ACC) + 65536)	/* -0.5 + (16.0 or 32.0) */
#define c_inv_corr_1	(1877 * (6 - BITS_INV_ACC))	/* 0.9167 */
#define c_inv_corr_2	(1236 * (6 - BITS_INV_ACC))	/* 0.6035 */
#define c_inv_corr_3	(680 * (6 - BITS_INV_ACC))	/* 0.3322 */
#define c_inv_corr_4	(0 * (6 - BITS_INV_ACC))	/* 0.0 */
#define c_inv_corr_5	(-569 * (6 - BITS_INV_ACC))	/* -0.278 */
#define c_inv_corr_6	(-512 * (6 - BITS_INV_ACC))	/* -0.25 */
#define c_inv_corr_7	(-651 * (6 - BITS_INV_ACC))	/* -0.3176 */

/* Rounding constant for row n = common rounding term + row correction. */
#define RND_INV_ROW_0	(RND_INV_ROW + c_inv_corr_0)
#define RND_INV_ROW_1	(RND_INV_ROW + c_inv_corr_1)
#define RND_INV_ROW_2	(RND_INV_ROW + c_inv_corr_2)
#define RND_INV_ROW_3	(RND_INV_ROW + c_inv_corr_3)
#define RND_INV_ROW_4	(RND_INV_ROW + c_inv_corr_4)
#define RND_INV_ROW_5	(RND_INV_ROW + c_inv_corr_5)
#define RND_INV_ROW_6	(RND_INV_ROW + c_inv_corr_6)
#define RND_INV_ROW_7	(RND_INV_ROW + c_inv_corr_7)
---|
450 | |
---|
/* Coefficient tables for the row pass (DCT_8_INV_ROW_2R).
 * Each table holds 32 16-bit entries (64 bytes) and is declared 16-byte
 * aligned; the row macro reads it with aligned 128-bit accesses at byte
 * offsets 0, 16, 32 and 48.
 */

/* Table for rows 0,4 - constants are multiplied by cos_4_16 */

__declspec(align(16)) short tab_i_04[] = {
	16384, 21407, 16384, 8867,
	-16384, 21407, 16384, -8867,
	16384, -8867, 16384, -21407,
	16384, 8867, -16384, -21407,
	22725, 19266, 19266, -4520,
	4520, 19266, 19266, -22725,
	12873, -22725, 4520, -12873,
	12873, 4520, -22725, -12873};

/* Table for rows 1,7 - constants are multiplied by cos_1_16 */

__declspec(align(16)) short tab_i_17[] = {
	22725, 29692, 22725, 12299,
	-22725, 29692, 22725, -12299,
	22725, -12299, 22725, -29692,
	22725, 12299, -22725, -29692,
	31521, 26722, 26722, -6270,
	6270, 26722, 26722, -31521,
	17855, -31521, 6270, -17855,
	17855, 6270, -31521, -17855};

/* Table for rows 2,6 - constants are multiplied by cos_2_16 */

__declspec(align(16)) short tab_i_26[] = {
	21407, 27969, 21407, 11585,
	-21407, 27969, 21407, -11585,
	21407, -11585, 21407, -27969,
	21407, 11585, -21407, -27969,
	29692, 25172, 25172, -5906,
	5906, 25172, 25172, -29692,
	16819, -29692, 5906, -16819,
	16819, 5906, -29692, -16819};

/* Table for rows 3,5 - constants are multiplied by cos_3_16 */

__declspec(align(16)) short tab_i_35[] = {
	19266, 25172, 19266, 10426,
	-19266, 25172, 19266, -10426,
	19266, -10426, 19266, -25172,
	19266, 10426, -19266, -25172,
	26722, 22654, 22654, -5315,
	5315, 22654, 22654, -26722,
	15137, -26722, 5315, -15137,
	15137, 5315, -26722, -15137};
---|
498 | |
---|
/* Rounding vectors for the row pass: round_i_n holds four copies of
 * RND_INV_ROW_n.  DCT_8_INV_ROW_2R adds one of these with paddd before the
 * arithmetic right shift by SHIFT_INV_ROW.
 * NOTE(review): the elements are declared long while the vectors are
 * consumed as four 32-bit lanes of one 16-byte operand, so this presumably
 * relies on a 32-bit long - confirm for the target compiler.
 */
__declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
	RND_INV_ROW_0,RND_INV_ROW_0};
__declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
	RND_INV_ROW_1,RND_INV_ROW_1};
__declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
	RND_INV_ROW_2,RND_INV_ROW_2};
__declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
	RND_INV_ROW_3,RND_INV_ROW_3};
__declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
	RND_INV_ROW_4,RND_INV_ROW_4};
__declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
	RND_INV_ROW_5,RND_INV_ROW_5};
__declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
	RND_INV_ROW_6,RND_INV_ROW_6};
__declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
	RND_INV_ROW_7,RND_INV_ROW_7};
---|
515 | |
---|
/* Constants for the column pass (DCT_8_INV_COL_8R), each replicated into all
 * eight 16-bit lanes and used as pmulhw multipliers.
 * NOTE(review): the original comments gave the scale as (2<<16), but the
 * stored values correspond to a scale of (1<<16) = 65536:
 *   tan(1*pi/16) * 65536 ~= 13036
 *   tan(2*pi/16) * 65536 ~= 27146
 *   tan(3*pi/16) * 65536 ~= 43790, stored as 43790 - 65536 = -21746
 *   cos(4*pi/16) * 65536 ~= 46341, stored as 46341 - 65536 = -19195
 * The last two do not fit a signed 16-bit lane, so they are stored minus
 * 65536; the column code adds the multiplicand back after pmulhw, which
 * appears to compensate for that - confirm against the algorithm reference.
 */
__declspec(align(16)) short tg_1_16[] = {
	13036, 13036, 13036, 13036, /* tg * (1<<16) + 0.5 */
	13036, 13036, 13036, 13036};
__declspec(align(16)) short tg_2_16[] = {
	27146, 27146, 27146, 27146, /* tg * (1<<16) + 0.5 */
	27146, 27146, 27146, 27146};
__declspec(align(16)) short tg_3_16[] = {
	-21746, -21746, -21746, -21746, /* tg * (1<<16) + 0.5, minus 65536 */
	-21746, -21746, -21746, -21746};
__declspec(align(16)) short cos_4_16[] = {
	-19195, -19195, -19195, -19195, /* cos * (1<<16) + 0.5, minus 65536 */
	-19195, -19195, -19195, -19195};
---|
528 | |
---|
/*
 * In this implementation the outputs of the 1-D iDCT are multiplied
 *    for rows 0,4 - by cos_4_16,
 *    for rows 1,7 - by cos_1_16,
 *    for rows 2,6 - by cos_2_16,
 *    for rows 3,5 - by cos_3_16
 * and are shifted to the left for better accuracy.
 *
 * For the constants used,
 *    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
 *
 *-----------------------------------------------------------------------------
 *
 * In the first stage the calculation is carried out for two rows at once.
 * The permutation for each output row is done in the second stage:
 *    t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
 *
 *-----------------------------------------------------------------------------
 */
---|
548 | |
---|
/* DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2): row pass for two rows at once.
 *
 * On entry xmm0 and xmm4 each hold one input row (eight 16-bit values).
 * On exit xmm1 holds the result for the row that came in xmm0 and xmm5 the
 * result for the row that came in xmm4, each packed back to 16 bits with
 * signed saturation (packssdw).
 *
 * TABLE  - coefficient table; read as four 16-byte vectors at byte offsets
 *          0, 16, 32 and 48 and combined with the shuffled input by pmaddwd.
 * ROUND1 - 16-byte rounding vector added to the sums of the xmm0 row.
 * ROUND2 - 16-byte rounding vector added to the sums of the xmm4 row.
 *
 * The 32-bit sums are shifted right arithmetically by SHIFT_INV_ROW before
 * packing.  All of xmm0..xmm7 are overwritten.  The macro expands to its own
 * __asm { } block in Intel mnemonic syntax.
 */
#define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm { \
	__asm pshuflw xmm1, xmm0, 10001000b \
	__asm pshuflw xmm0, xmm0, 11011101b \
	__asm pshufhw xmm1, xmm1, 10001000b \
	__asm pshufhw xmm0, xmm0, 11011101b \
	__asm movdqa xmm2, XMMWORD PTR [TABLE] \
	__asm pmaddwd xmm2, xmm1 \
	__asm movdqa xmm3, XMMWORD PTR [TABLE + 32] \
	__asm pmaddwd xmm3, xmm0 \
	__asm pmaddwd xmm1, XMMWORD PTR [TABLE + 16] \
	__asm pmaddwd xmm0, XMMWORD PTR [TABLE + 48] \
	__asm pshuflw xmm5, xmm4, 10001000b \
	__asm pshuflw xmm4, xmm4, 11011101b \
	__asm pshufhw xmm5, xmm5, 10001000b \
	__asm pshufhw xmm4, xmm4, 11011101b \
	__asm movdqa xmm6, XMMWORD PTR [TABLE] \
	__asm pmaddwd xmm6, xmm5 \
	__asm movdqa xmm7, XMMWORD PTR [TABLE + 32] \
	__asm pmaddwd xmm7, xmm4 \
	__asm pmaddwd xmm5, XMMWORD PTR [TABLE + 16] \
	__asm pmaddwd xmm4, XMMWORD PTR [TABLE + 48] \
	__asm pshufd xmm1, xmm1, 01001110b \
	__asm pshufd xmm0, xmm0, 01001110b \
	__asm paddd xmm2, XMMWORD PTR [ROUND1] \
	__asm paddd xmm3, xmm0 \
	__asm paddd xmm1, xmm2 \
	__asm pshufd xmm5, xmm5, 01001110b \
	__asm pshufd xmm4, xmm4, 01001110b \
	__asm movdqa xmm2, xmm1 \
	__asm psubd xmm2, xmm3 \
	__asm psrad xmm2, SHIFT_INV_ROW \
	__asm paddd xmm1, xmm3 \
	__asm psrad xmm1, SHIFT_INV_ROW \
	__asm packssdw xmm1, xmm2 \
	__asm paddd xmm6, XMMWORD PTR [ROUND2] \
	__asm paddd xmm7, xmm4 \
	__asm paddd xmm5, xmm6 \
	__asm movdqa xmm6, xmm5 \
	__asm psubd xmm6, xmm7 \
	__asm psrad xmm6, SHIFT_INV_ROW \
	__asm paddd xmm5, xmm7 \
	__asm psrad xmm5, SHIFT_INV_ROW \
	__asm packssdw xmm5, xmm6 \
	}
---|
593 | |
---|
/*
 *
 * The second stage - inverse DCTs of columns
 *
 * The inputs are multiplied
 *    for rows 0,4 - by cos_4_16,
 *    for rows 1,7 - by cos_1_16,
 *    for rows 2,6 - by cos_2_16,
 *    for rows 3,5 - by cos_3_16
 * and are shifted to the left for better accuracy.
 */
---|
605 | |
---|
/* DCT_8_INV_COL_8R(INP, OUTP): column pass over all eight rows.
 *
 * INP  - address expression of the input; rows are read as 16-byte vectors
 *        at INP + k*16 for k = 0..7.
 * OUTP - address expression of the output; rows are written as 16-byte
 *        vectors at OUTP + k*16 for k = 0..7.
 *
 * The arithmetic uses saturating 16-bit adds/subtracts (paddsw/psubsw) and
 * pmulhw against the tg_1_16, tg_2_16, tg_3_16 and cos_4_16 tables, which
 * are referenced by name.  Each output row is shifted right arithmetically
 * by SHIFT_INV_COL (psraw) before it is stored.
 *
 * OUTP rows 7 and 3 are used as scratch storage part-way through (stored,
 * reloaded, then overwritten with the final values).  Those stores come
 * after the loads of INP rows 7 and 3, and the caller in this file passes
 * the same buffer for INP and OUTP.  All of xmm0..xmm7 are overwritten.
 * The macro expands to its own __asm { } block in Intel mnemonic syntax.
 */
#define DCT_8_INV_COL_8R(INP, OUTP) __asm { \
	__asm movdqa xmm0, [INP + 5*16] \
	__asm movdqa xmm1, XMMWORD PTR tg_3_16 \
	__asm movdqa xmm2, xmm0 \
	__asm movdqa xmm3, [INP + 3*16] \
	__asm pmulhw xmm0, xmm1 \
	__asm movdqa xmm4, [INP + 7*16] \
	__asm pmulhw xmm1, xmm3 \
	__asm movdqa xmm5, XMMWORD PTR tg_1_16 \
	__asm movdqa xmm6, xmm4 \
	__asm pmulhw xmm4, xmm5 \
	__asm paddsw xmm0, xmm2 \
	__asm pmulhw xmm5, [INP + 1*16] \
	__asm paddsw xmm1, xmm3 \
	__asm movdqa xmm7, [INP + 6*16] \
	__asm paddsw xmm0, xmm3 \
	__asm movdqa xmm3, XMMWORD PTR tg_2_16 \
	__asm psubsw xmm2, xmm1 \
	__asm pmulhw xmm7, xmm3 \
	__asm movdqa xmm1, xmm0 \
	__asm pmulhw xmm3, [INP + 2*16] \
	__asm psubsw xmm5, xmm6 \
	__asm paddsw xmm4, [INP + 1*16] \
	__asm paddsw xmm0, xmm4 \
	__asm psubsw xmm4, xmm1 \
	__asm pshufhw xmm0, xmm0, 00011011b \
	__asm paddsw xmm7, [INP + 2*16] \
	__asm movdqa xmm6, xmm5 \
	__asm psubsw xmm3, [INP + 6*16] \
	__asm psubsw xmm5, xmm2 \
	__asm paddsw xmm6, xmm2 \
	__asm movdqa [OUTP + 7*16], xmm0 \
	__asm movdqa xmm1, xmm4 \
	__asm movdqa xmm2, XMMWORD PTR cos_4_16 \
	__asm paddsw xmm4, xmm5 \
	__asm movdqa xmm0, XMMWORD PTR cos_4_16 \
	__asm pmulhw xmm2, xmm4 \
	__asm pshufhw xmm6, xmm6, 00011011b \
	__asm movdqa [OUTP + 3*16], xmm6 \
	__asm psubsw xmm1, xmm5 \
	__asm movdqa xmm6, [INP + 0*16] \
	__asm pmulhw xmm0, xmm1 \
	__asm movdqa xmm5, [INP + 4*16] \
	__asm paddsw xmm4, xmm2 \
	__asm paddsw xmm5, xmm6 \
	__asm psubsw xmm6, [INP + 4*16] \
	__asm paddsw xmm0, xmm1 \
	__asm pshufhw xmm4, xmm4, 00011011b \
	__asm movdqa xmm2, xmm5 \
	__asm paddsw xmm5, xmm7 \
	__asm movdqa xmm1, xmm6 \
	__asm psubsw xmm2, xmm7 \
	__asm movdqa xmm7, [OUTP + 7*16] \
	__asm paddsw xmm6, xmm3 \
	__asm pshufhw xmm5, xmm5, 00011011b \
	__asm paddsw xmm7, xmm5 \
	__asm psubsw xmm1, xmm3 \
	__asm pshufhw xmm6, xmm6, 00011011b \
	__asm movdqa xmm3, xmm6 \
	__asm paddsw xmm6, xmm4 \
	__asm pshufhw xmm2, xmm2, 00011011b \
	__asm psraw xmm7, SHIFT_INV_COL \
	__asm movdqa [OUTP + 0*16], xmm7 \
	__asm movdqa xmm7, xmm1 \
	__asm paddsw xmm1, xmm0 \
	__asm psraw xmm6, SHIFT_INV_COL \
	__asm movdqa [OUTP + 1*16], xmm6 \
	__asm pshufhw xmm1, xmm1, 00011011b \
	__asm movdqa xmm6, [OUTP + 3*16] \
	__asm psubsw xmm7, xmm0 \
	__asm psraw xmm1, SHIFT_INV_COL \
	__asm movdqa [OUTP + 2*16], xmm1 \
	__asm psubsw xmm5, [OUTP + 7*16] \
	__asm paddsw xmm6, xmm2 \
	__asm psubsw xmm2, [OUTP + 3*16] \
	__asm psubsw xmm3, xmm4 \
	__asm psraw xmm7, SHIFT_INV_COL \
	__asm pshufhw xmm7, xmm7, 00011011b \
	__asm movdqa [OUTP + 5*16], xmm7 \
	__asm psraw xmm5, SHIFT_INV_COL \
	__asm movdqa [OUTP + 7*16], xmm5 \
	__asm psraw xmm6, SHIFT_INV_COL \
	__asm movdqa [OUTP + 3*16], xmm6 \
	__asm psraw xmm2, SHIFT_INV_COL \
	__asm movdqa [OUTP + 4*16], xmm2 \
	__asm psraw xmm3, SHIFT_INV_COL \
	__asm movdqa [OUTP + 6*16], xmm3 \
	}
---|
694 | |
---|
695 | /* |
---|
696 | * |
---|
697 | * Name: dct_8x8_inv_16s |
---|
698 | * Purpose: Inverse Discrete Cosine Transform 8x8 with |
---|
699 | * 2D buffer of short int data; four row passes, each on a pair of rows (0 and 4, 1 and 7, 3 and 5, 2 and 6), then one column pass whose input and output are both dst |
---|
700 | * Context: |
---|
701 | * void dct_8x8_inv_16s ( short *src, short *dst ) |
---|
702 | * Parameters: |
---|
703 | * src - Pointer to the source buffer (8 rows of 16 bytes; read with movdqa, so it must be 16-byte aligned) |
---|
704 | * dst - Pointer to the destination buffer (same layout, written with movdqa; the caller in this file passes the same buffer as src) |
---|
705 | * Notes: ecx and edx are overwritten without being saved; each row pass loads two rows into xmm0/xmm4 before the macro and stores xmm1/xmm5 after it |
---|
706 | */ |
---|
707 | |
---|
708 | GLOBAL(void) |
---|
709 | dct_8x8_inv_16s ( short *src, short *dst ) { |
---|
710 | |
---|
711 | __asm { |
---|
712 | |
---|
713 | mov ecx, src /* ecx = base of the source rows */ |
---|
714 | mov edx, dst /* edx = base of the destination rows */ |
---|
715 | |
---|
716 | movdqa xmm0, [ecx+0*16] /* row pass 1: source rows 0 and 4 */ |
---|
717 | movdqa xmm4, [ecx+4*16] |
---|
718 | DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4) |
---|
719 | movdqa [edx+0*16], xmm1 /* xmm1 -> destination row 0 */ |
---|
720 | movdqa [edx+4*16], xmm5 /* xmm5 -> destination row 4 */ |
---|
721 | |
---|
722 | movdqa xmm0, [ecx+1*16] /* row pass 2: source rows 1 and 7 */ |
---|
723 | movdqa xmm4, [ecx+7*16] |
---|
724 | DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7) |
---|
725 | movdqa [edx+1*16], xmm1 /* xmm1 -> destination row 1 */ |
---|
726 | movdqa [edx+7*16], xmm5 /* xmm5 -> destination row 7 */ |
---|
727 | |
---|
728 | movdqa xmm0, [ecx+3*16] /* row pass 3: source rows 3 and 5 */ |
---|
729 | movdqa xmm4, [ecx+5*16] |
---|
730 | DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5); |
---|
731 | movdqa [edx+3*16], xmm1 /* xmm1 -> destination row 3 */ |
---|
732 | movdqa [edx+5*16], xmm5 /* xmm5 -> destination row 5 */ |
---|
733 | |
---|
734 | movdqa xmm0, [ecx+2*16] /* row pass 4: source rows 2 and 6 */ |
---|
735 | movdqa xmm4, [ecx+6*16] |
---|
736 | DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6); |
---|
737 | movdqa [edx+2*16], xmm1 /* xmm1 -> destination row 2 */ |
---|
738 | movdqa [edx+6*16], xmm5 /* xmm5 -> destination row 6 */ |
---|
739 | |
---|
740 | /* column pass: both macro arguments are the destination buffer */ DCT_8_INV_COL_8R(edx+0, edx+0); |
---|
741 | } |
---|
742 | } |
---|
743 | |
---|
744 | |
---|
745 | /* |
---|
746 | * Name: |
---|
747 | * ownpj_QuantInv_8x8_16s |
---|
748 | * |
---|
749 | * Purpose: |
---|
750 | * Dequantize 8x8 block of DCT coefficients: pDst[i] receives the low 16 bits of pSrc[i] * pQTbl[i] for i = 0..63 |
---|
751 | * |
---|
752 | * Context: |
---|
753 | * void ownpj_QuantInv_8x8_16s( |
---|
754 | * short* pSrc, |
---|
755 | * short* pDst, |
---|
756 | * const unsigned short* pQTbl); |
---|
757 | * Notes: works in MMX registers mm0-mm3 and executes emms before returning; on the last pass the prefetch hint addresses the byte just past the 128-byte source block |
---|
758 | */ |
---|
759 | |
---|
760 | GLOBAL(void) |
---|
761 | ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl) |
---|
762 | { |
---|
763 | __asm { |
---|
764 | |
---|
765 | push ebx /* save every general register used below */ |
---|
766 | push ecx |
---|
767 | push edx |
---|
768 | push esi |
---|
769 | push edi |
---|
770 | |
---|
771 | mov esi, pSrc /* esi = read cursor */ |
---|
772 | mov edi, pDst /* edi = write cursor */ |
---|
773 | mov edx, pQTbl /* edx = quantization table cursor */ |
---|
774 | mov ecx, 4 /* 4 passes of 16 coefficients = 64 */ |
---|
775 | mov ebx, 32 /* bytes handled per pass (16 coefficients of 2 bytes) */ |
---|
776 | |
---|
777 | again: |
---|
778 | |
---|
779 | movq mm0, QWORD PTR [esi+0] /* load 16 coefficients, 4 per register */ |
---|
780 | movq mm1, QWORD PTR [esi+8] |
---|
781 | movq mm2, QWORD PTR [esi+16] |
---|
782 | movq mm3, QWORD PTR [esi+24] |
---|
783 | |
---|
784 | prefetcht0 [esi+ebx] ; fetch next cache line |
---|
785 | |
---|
786 | pmullw mm0, QWORD PTR [edx+0] /* multiply by the table entries; pmullw keeps the low word of each product */ |
---|
787 | pmullw mm1, QWORD PTR [edx+8] |
---|
788 | pmullw mm2, QWORD PTR [edx+16] |
---|
789 | pmullw mm3, QWORD PTR [edx+24] |
---|
790 | |
---|
791 | movq QWORD PTR [edi+0], mm0 /* store the 16 products */ |
---|
792 | movq QWORD PTR [edi+8], mm1 |
---|
793 | movq QWORD PTR [edi+16], mm2 |
---|
794 | movq QWORD PTR [edi+24], mm3 |
---|
795 | |
---|
796 | add esi, ebx /* advance all three cursors by 32 bytes */ |
---|
797 | add edi, ebx |
---|
798 | add edx, ebx |
---|
799 | dec ecx /* loop until the 4 passes are done */ |
---|
800 | jnz again |
---|
801 | |
---|
802 | emms /* reset the MMX state so x87 floating-point code can follow */ |
---|
803 | |
---|
804 | pop edi /* restore registers in reverse order of the pushes */ |
---|
805 | pop esi |
---|
806 | pop edx |
---|
807 | pop ecx |
---|
808 | pop ebx |
---|
809 | } |
---|
810 | } |
---|
811 | |
---|
812 | |
---|
813 | /* |
---|
814 | * Name: |
---|
815 | * ownpj_Add128_8x8_16s8u |
---|
816 | * |
---|
817 | * Purpose: |
---|
818 | * signed to unsigned conversion (level shift) |
---|
819 | * for an 8x8 block of 16-bit values: 128 is added to each value and the sum is packed to a byte with unsigned saturation (0..255) |
---|
820 | * |
---|
821 | * Context: |
---|
822 | * void ownpj_Add128_8x8_16s8u( |
---|
823 | * const short* pSrc, |
---|
824 | * unsigned char* pDst, |
---|
825 | * int DstStep); |
---|
826 | * Notes: pSrc is read with movdqa, so it must be 16-byte aligned; each output line is written as 8 bytes with movq, DstStep bytes after the previous one |
---|
827 | */ |
---|
828 | |
---|
829 | __declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080}; /* 0x0080 in every 16-bit lane when long is 32 bits wide; loaded into xmm7 below */ |
---|
830 | |
---|
831 | GLOBAL(void) |
---|
832 | ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep) |
---|
833 | { |
---|
834 | __asm { |
---|
835 | push eax /* save every general register used below */ |
---|
836 | push ebx |
---|
837 | push ecx |
---|
838 | push edx |
---|
839 | push esi |
---|
840 | push edi |
---|
841 | |
---|
842 | mov esi, pSrc /* esi = read cursor */ |
---|
843 | mov edi, pDst /* edi = address of the current output line */ |
---|
844 | mov edx, DstStep /* edx = 1*DstStep */ |
---|
845 | mov ecx, 2 /* 2 passes of 4 lines = 8 lines */ |
---|
846 | mov ebx, edx |
---|
847 | mov eax, edx |
---|
848 | sal ebx, 1 /* ebx = 2*DstStep */ |
---|
849 | add eax, ebx /* eax = 3*DstStep */ |
---|
850 | movdqa xmm7, XMMWORD PTR const_128 /* xmm7 = the level-shift constant for all 8 words */ |
---|
851 | |
---|
852 | again: |
---|
853 | |
---|
854 | movdqa xmm0, XMMWORD PTR [esi+0] ; line 0 |
---|
855 | movdqa xmm1, XMMWORD PTR [esi+16] ; line 1 |
---|
856 | movdqa xmm2, XMMWORD PTR [esi+32] ; line 2 |
---|
857 | movdqa xmm3, XMMWORD PTR [esi+48] ; line 3 |
---|
858 | |
---|
859 | paddw xmm0, xmm7 /* add 128 to every word (paddw wraps, it does not saturate) */ |
---|
860 | paddw xmm1, xmm7 |
---|
861 | paddw xmm2, xmm7 |
---|
862 | paddw xmm3, xmm7 |
---|
863 | |
---|
864 | packuswb xmm0, xmm1 /* clamp to 0..255: low 8 bytes = line 0, high 8 bytes = line 1 */ |
---|
865 | packuswb xmm2, xmm3 /* low 8 bytes = line 2, high 8 bytes = line 3 */ |
---|
866 | |
---|
867 | movq QWORD PTR [edi], xmm0 ;0*DstStep |
---|
868 | movq QWORD PTR [edi+ebx], xmm2 ;2*DstStep |
---|
869 | |
---|
870 | psrldq xmm0, 8 /* shift the high 8 bytes (lines 1 and 3) down so movq can store them */ |
---|
871 | psrldq xmm2, 8 |
---|
872 | |
---|
873 | movq QWORD PTR [edi+edx], xmm0 ;1*DstStep |
---|
874 | movq QWORD PTR [edi+eax], xmm2 ;3*DstStep |
---|
875 | |
---|
876 | add edi, ebx /* together with the second add below: edi += 4*DstStep */ |
---|
877 | add esi, 64 /* skip the 4 source lines just consumed (16 bytes each) */ |
---|
878 | add edi, ebx |
---|
879 | dec ecx /* loop until both passes are done */ |
---|
880 | jnz again |
---|
881 | |
---|
882 | pop edi /* restore registers in reverse order of the pushes */ |
---|
883 | pop esi |
---|
884 | pop edx |
---|
885 | pop ecx |
---|
886 | pop ebx |
---|
887 | pop eax |
---|
888 | } |
---|
889 | } |
---|
890 | |
---|
891 | |
---|
892 | /* |
---|
893 | * Name: |
---|
894 | * ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R |
---|
895 | * |
---|
896 | * Purpose: |
---|
897 | * Inverse DCT transform, de-quantization and level shift |
---|
898 | * |
---|
899 | * Parameters: |
---|
900 | * pSrc - pointer to source |
---|
901 | * pDst - pointer to output array |
---|
902 | * DstStep - line offset for output data |
---|
903 | * pEncoderQuantTable - pointer to Quantization table |
---|
904 | * |
---|
905 | */ |
---|
906 | |
---|
907 | GLOBAL(void) |
---|
908 | ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R( |
---|
909 | short * pSrc, |
---|
910 | unsigned char * pDst, |
---|
911 | int DstStep, |
---|
912 | const unsigned short * pQuantInvTable) |
---|
913 | { |
---|
914 | |
---|
915 | __declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)]; |
---|
916 | Ipp16s * workbuf = (Ipp16s *)buf; |
---|
917 | |
---|
918 | ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable); |
---|
919 | dct_8x8_inv_16s(workbuf,workbuf); |
---|
920 | ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep); |
---|
921 | |
---|
922 | } |
---|
923 | |
---|
924 | GLOBAL(void) |
---|
925 | jpeg_idct_islow_sse2 ( |
---|
926 | j_decompress_ptr cinfo, |
---|
927 | jpeg_component_info * compptr, |
---|
928 | JCOEFPTR coef_block, |
---|
929 | JSAMPARRAY output_buf, |
---|
930 | JDIMENSION output_col) |
---|
931 | { |
---|
932 | int ctr; |
---|
933 | JCOEFPTR inptr; |
---|
934 | Ipp16u* quantptr; |
---|
935 | Ipp8u* wsptr; |
---|
936 | __declspec(align(16)) Ipp8u workspace[DCTSIZE2]; |
---|
937 | JSAMPROW outptr; |
---|
938 | |
---|
939 | inptr = coef_block; |
---|
940 | quantptr = (Ipp16u*)compptr->dct_table; |
---|
941 | wsptr = workspace; |
---|
942 | |
---|
943 | ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr); |
---|
944 | |
---|
945 | for(ctr = 0; ctr < DCTSIZE; ctr++) |
---|
946 | { |
---|
947 | outptr = output_buf[ctr] + output_col; |
---|
948 | |
---|
949 | outptr[0] = wsptr[0]; |
---|
950 | outptr[1] = wsptr[1]; |
---|
951 | outptr[2] = wsptr[2]; |
---|
952 | outptr[3] = wsptr[3]; |
---|
953 | outptr[4] = wsptr[4]; |
---|
954 | outptr[5] = wsptr[5]; |
---|
955 | outptr[6] = wsptr[6]; |
---|
956 | outptr[7] = wsptr[7]; |
---|
957 | |
---|
958 | wsptr += DCTSIZE; |
---|
959 | } |
---|
960 | } |
---|
961 | #endif /* HAVE_SSE2_INTEL_MNEMONICS */ |
---|
962 | |
---|
963 | #endif /* DCT_ISLOW_SUPPORTED */ |
---|