1 | /* |
---|
2 | * jidctfst.c |
---|
3 | * |
---|
4 | * Copyright (C) 1994-1998, Thomas G. Lane. |
---|
5 | * This file is part of the Independent JPEG Group's software. |
---|
6 | * For conditions of distribution and use, see the accompanying README file. |
---|
7 | * |
---|
8 | * This file contains a fast, not so accurate integer implementation of the |
---|
9 | * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine |
---|
10 | * must also perform dequantization of the input coefficients. |
---|
11 | * |
---|
12 | * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT |
---|
13 | * on each row (or vice versa, but it's more convenient to emit a row at |
---|
14 | * a time). Direct algorithms are also available, but they are much more |
---|
15 | * complex and seem not to be any faster when reduced to code. |
---|
16 | * |
---|
17 | * This implementation is based on Arai, Agui, and Nakajima's algorithm for |
---|
18 | * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in |
---|
19 | * Japanese, but the algorithm is described in the Pennebaker & Mitchell |
---|
20 | * JPEG textbook (see REFERENCES section in file README). The following code |
---|
21 | * is based directly on figure 4-8 in P&M. |
---|
22 | * While an 8-point DCT cannot be done in less than 11 multiplies, it is |
---|
23 | * possible to arrange the computation so that many of the multiplies are |
---|
24 | * simple scalings of the final outputs. These multiplies can then be |
---|
25 | * folded into the multiplications or divisions by the JPEG quantization |
---|
26 | * table entries. The AA&N method leaves only 5 multiplies and 29 adds |
---|
27 | * to be done in the DCT itself. |
---|
28 | * The primary disadvantage of this method is that with fixed-point math, |
---|
29 | * accuracy is lost due to imprecise representation of the scaled |
---|
30 | * quantization values. The smaller the quantization table entry, the less |
---|
31 | * precise the scaled value, so this implementation does worse with high- |
---|
32 | * quality-setting files than with low-quality ones. |
---|
33 | */ |
---|
34 | |
---|
35 | #define JPEG_INTERNALS |
---|
36 | #include "jinclude.h" |
---|
37 | #include "jpeglib.h" |
---|
38 | #include "jdct.h" /* Private declarations for DCT subsystem */ |
---|
39 | |
---|
40 | |
---|
41 | #ifdef DCT_IFAST_SUPPORTED |
---|
42 | |
---|
43 | |
---|
44 | /* |
---|
45 | * This module is specialized to the case DCTSIZE = 8. |
---|
46 | */ |
---|
47 | |
---|
48 | #if DCTSIZE != 8 |
---|
49 | Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ |
---|
50 | #endif |
---|
51 | |
---|
52 | |
---|
53 | /* Scaling decisions are generally the same as in the LL&M algorithm; |
---|
54 | * see jidctint.c for more details. However, we choose to descale |
---|
55 | * (right shift) multiplication products as soon as they are formed, |
---|
56 | * rather than carrying additional fractional bits into subsequent additions. |
---|
57 | * This compromises accuracy slightly, but it lets us save a few shifts. |
---|
58 | * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) |
---|
59 | * everywhere except in the multiplications proper; this saves a good deal |
---|
60 | * of work on 16-bit-int machines. |
---|
61 | * |
---|
62 | * The dequantized coefficients are not integers because the AA&N scaling |
---|
63 | * factors have been incorporated. We represent them scaled up by PASS1_BITS, |
---|
64 | * so that the first and second IDCT rounds have the same input scaling. |
---|
65 | * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to |
---|
66 | * avoid a descaling shift; this compromises accuracy rather drastically |
---|
67 | * for small quantization table entries, but it saves a lot of shifts. |
---|
68 | * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway, |
---|
69 | * so we use a much larger scaling factor to preserve accuracy. |
---|
70 | * |
---|
71 | * A final compromise is to represent the multiplicative constants to only |
---|
72 | * 8 fractional bits, rather than 13. This saves some shifting work on some |
---|
73 | * machines, and may also reduce the cost of multiplication (since there |
---|
74 | * are fewer one-bits in the constants). |
---|
75 | */ |
---|
76 | |
---|
/* Fixed-point precision settings.  The multiplier constants always carry
 * CONST_BITS fractional bits; only the pass-1 scaling depends on the
 * sample depth.
 */
#define CONST_BITS  8
#if BITS_IN_JSAMPLE != 8
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#else
#define PASS1_BITS  2
#endif
---|
84 | |
---|
85 | /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus |
---|
86 | * causing a lot of useless floating-point operations at run time. |
---|
87 | * To get around this we use the following pre-calculated constants. |
---|
88 | * If you change CONST_BITS you may want to add appropriate values. |
---|
89 | * (With a reasonable C compiler, you can just rely on the FIX() macro...) |
---|
90 | */ |
---|
91 | |
---|
/* Multiplier constants scaled by 2**CONST_BITS.  Literal values are given
 * for the 8-bit case; any other CONST_BITS falls back to the FIX() macro.
 */
#if CONST_BITS != 8
#define FIX_1_082392200  FIX(1.082392200)
#define FIX_1_414213562  FIX(1.414213562)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_2_613125930  FIX(2.613125930)
#else
#define FIX_1_082392200  ((INT32)  277)		/* FIX(1.082392200) */
#define FIX_1_414213562  ((INT32)  362)		/* FIX(1.414213562) */
#define FIX_1_847759065  ((INT32)  473)		/* FIX(1.847759065) */
#define FIX_2_613125930  ((INT32)  669)		/* FIX(2.613125930) */
#endif
---|
103 | |
---|
104 | |
---|
105 | /* We can gain a little more speed, with a further compromise in accuracy, |
---|
106 | * by omitting the addition in a descaling shift. This yields an incorrectly |
---|
107 | * rounded result half the time... |
---|
108 | */ |
---|
109 | |
---|
#if !defined(USE_ACCURATE_ROUNDING)
/* Replace the rounding DESCALE with a bare right shift (no rounding term). */
#undef DESCALE
#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
#endif
---|
114 | |
---|
115 | |
---|
/* Form the product of a DCTELEM variable and an INT32 fixed-point constant,
 * then drop the CONST_BITS fractional bits at once so the result is again
 * a DCTELEM.
 */

#define MULTIPLY(var,cnst)  ((DCTELEM) DESCALE((var) * (cnst), CONST_BITS))
---|
121 | |
---|
122 | |
---|
123 | /* Dequantize a coefficient by multiplying it by the multiplier-table |
---|
124 | * entry; produce a DCTELEM result. For 8-bit data a 16x16->16 |
---|
125 | * multiplication will do. For 12-bit data, the multiplier table is |
---|
126 | * declared INT32, so a 32-bit multiply will be used. |
---|
127 | */ |
---|
128 | |
---|
#if BITS_IN_JSAMPLE != 8
/* Wider samples: multiply, then descale by the excess scaling bits. */
#define DEQUANTIZE(coef,quantval)  \
	DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
#else
/* 8-bit samples: a plain multiply in IFAST_MULT_TYPE, no descaling shift. */
#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
#endif
---|
135 | |
---|
136 | |
---|
137 | /* Like DESCALE, but applies to a DCTELEM and produces an int. |
---|
138 | * We assume that int right shift is unsigned if INT32 right shift is. |
---|
139 | */ |
---|
140 | |
---|
#ifndef RIGHT_SHIFT_IS_UNSIGNED
/* The native >> already sign-extends: no temporary is needed. */
#define ISHIFT_TEMPS
#define IRIGHT_SHIFT(val,cnt)	((val) >> (cnt))
#else
/* The native >> zero-fills, so the vacated high bits of a negative value
 * are OR-ed back in by hand.  The operand is parked in ishift_temp so it
 * is evaluated only once.
 */
#if BITS_IN_JSAMPLE != 8
#define DCTELEMBITS  32		/* DCTELEM must be 32 bits */
#else
#define DCTELEMBITS  16		/* DCTELEM may be 16 or 32 bits */
#endif
#define ISHIFT_TEMPS	DCTELEM ishift_temp;
#define IRIGHT_SHIFT(val,cnt)  \
	((ishift_temp = (val)) >= 0 ? \
	 (ishift_temp >> (cnt)) : \
	 (ishift_temp >> (cnt)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(cnt))))
#endif

/* IDESCALE: shift a DCTELEM right by n bits and yield an int; the rounding
 * term is added only when accurate rounding has been requested.
 */
#ifndef USE_ACCURATE_ROUNDING
#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
#else
#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
#endif
---|
162 | |
---|
163 | #ifdef HAVE_MMX_INTEL_MNEMONICS |
---|
164 | __inline GLOBAL(void) |
---|
165 | jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
166 | JCOEFPTR coef_block, |
---|
167 | JSAMPARRAY output_buf, JDIMENSION output_col); |
---|
168 | __inline GLOBAL(void) |
---|
169 | jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
170 | JCOEFPTR coef_block, |
---|
171 | JSAMPARRAY output_buf, JDIMENSION output_col); |
---|
172 | #endif |
---|
173 | |
---|
174 | GLOBAL(void) |
---|
175 | jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
176 | JCOEFPTR coef_block, |
---|
177 | JSAMPARRAY output_buf, JDIMENSION output_col); |
---|
178 | |
---|
179 | |
---|
180 | #ifdef HAVE_MMX_INTEL_MNEMONICS |
---|
181 | GLOBAL(void) |
---|
182 | jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
183 | JCOEFPTR coef_block, |
---|
184 | JSAMPARRAY output_buf, JDIMENSION output_col) |
---|
185 | { |
---|
186 | if (MMXAvailable) |
---|
187 | jpeg_idct_ifast_mmx(cinfo, compptr, coef_block, output_buf, output_col); |
---|
188 | else |
---|
189 | jpeg_idct_ifast_orig(cinfo, compptr, coef_block, output_buf, output_col); |
---|
190 | } |
---|
191 | #else |
---|
192 | |
---|
193 | /* |
---|
194 | * Perform dequantization and inverse DCT on one block of coefficients. |
---|
195 | */ |
---|
196 | |
---|
197 | GLOBAL (void) |
---|
198 | jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
199 | JCOEFPTR coef_block, |
---|
200 | JSAMPARRAY output_buf, JDIMENSION output_col) |
---|
201 | { |
---|
202 | DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
---|
203 | DCTELEM tmp10, tmp11, tmp12, tmp13; |
---|
204 | DCTELEM z5, z10, z11, z12, z13; |
---|
205 | JCOEFPTR inptr; |
---|
206 | IFAST_MULT_TYPE * quantptr; |
---|
207 | int * wsptr; |
---|
208 | JSAMPROW outptr; |
---|
209 | JSAMPLE *range_limit = IDCT_range_limit(cinfo); |
---|
210 | int ctr; |
---|
211 | int workspace[DCTSIZE2]; /* buffers data between passes */ |
---|
212 | SHIFT_TEMPS /* for DESCALE */ |
---|
213 | ISHIFT_TEMPS /* for IDESCALE */ |
---|
214 | |
---|
215 | /* Pass 1: process columns from input, store into work array. */ |
---|
216 | |
---|
217 | inptr = coef_block; |
---|
218 | quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; |
---|
219 | wsptr = workspace; |
---|
220 | for (ctr = DCTSIZE; ctr > 0; ctr--) { |
---|
221 | /* Due to quantization, we will usually find that many of the input |
---|
222 | * coefficients are zero, especially the AC terms. We can exploit this |
---|
223 | * by short-circuiting the IDCT calculation for any column in which all |
---|
224 | * the AC terms are zero. In that case each output is equal to the |
---|
225 | * DC coefficient (with scale factor as needed). |
---|
226 | * With typical images and quantization tables, half or more of the |
---|
227 | * column DCT calculations can be simplified this way. |
---|
228 | */ |
---|
229 | |
---|
230 | if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && |
---|
231 | inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && |
---|
232 | inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && |
---|
233 | inptr[DCTSIZE*7] == 0) { |
---|
234 | /* AC terms all zero */ |
---|
235 | int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
236 | |
---|
237 | wsptr[DCTSIZE*0] = dcval; |
---|
238 | wsptr[DCTSIZE*1] = dcval; |
---|
239 | wsptr[DCTSIZE*2] = dcval; |
---|
240 | wsptr[DCTSIZE*3] = dcval; |
---|
241 | wsptr[DCTSIZE*4] = dcval; |
---|
242 | wsptr[DCTSIZE*5] = dcval; |
---|
243 | wsptr[DCTSIZE*6] = dcval; |
---|
244 | wsptr[DCTSIZE*7] = dcval; |
---|
245 | |
---|
246 | inptr++; /* advance pointers to next column */ |
---|
247 | quantptr++; |
---|
248 | wsptr++; |
---|
249 | continue; |
---|
250 | } |
---|
251 | |
---|
252 | /* Even part */ |
---|
253 | |
---|
254 | tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
255 | tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); |
---|
256 | tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); |
---|
257 | tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); |
---|
258 | |
---|
259 | tmp10 = tmp0 + tmp2; /* phase 3 */ |
---|
260 | tmp11 = tmp0 - tmp2; |
---|
261 | |
---|
262 | tmp13 = tmp1 + tmp3; /* phases 5-3 */ |
---|
263 | tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ |
---|
264 | |
---|
265 | tmp0 = tmp10 + tmp13; /* phase 2 */ |
---|
266 | tmp3 = tmp10 - tmp13; |
---|
267 | tmp1 = tmp11 + tmp12; |
---|
268 | tmp2 = tmp11 - tmp12; |
---|
269 | |
---|
270 | /* Odd part */ |
---|
271 | |
---|
272 | tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); |
---|
273 | tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); |
---|
274 | tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); |
---|
275 | tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); |
---|
276 | |
---|
277 | z13 = tmp6 + tmp5; /* phase 6 */ |
---|
278 | z10 = tmp6 - tmp5; |
---|
279 | z11 = tmp4 + tmp7; |
---|
280 | z12 = tmp4 - tmp7; |
---|
281 | |
---|
282 | tmp7 = z11 + z13; /* phase 5 */ |
---|
283 | tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ |
---|
284 | |
---|
285 | z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ |
---|
286 | tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ |
---|
287 | tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ |
---|
288 | |
---|
289 | tmp6 = tmp12 - tmp7; /* phase 2 */ |
---|
290 | tmp5 = tmp11 - tmp6; |
---|
291 | tmp4 = tmp10 + tmp5; |
---|
292 | |
---|
293 | wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); |
---|
294 | wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); |
---|
295 | wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); |
---|
296 | wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); |
---|
297 | wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5); |
---|
298 | wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); |
---|
299 | wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); |
---|
300 | wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); |
---|
301 | |
---|
302 | inptr++; /* advance pointers to next column */ |
---|
303 | quantptr++; |
---|
304 | wsptr++; |
---|
305 | } |
---|
306 | |
---|
307 | /* Pass 2: process rows from work array, store into output array. */ |
---|
308 | /* Note that we must descale the results by a factor of 8 == 2**3, */ |
---|
309 | /* and also undo the PASS1_BITS scaling. */ |
---|
310 | |
---|
311 | wsptr = workspace; |
---|
312 | for (ctr = 0; ctr < DCTSIZE; ctr++) { |
---|
313 | outptr = output_buf[ctr] + output_col; |
---|
314 | /* Rows of zeroes can be exploited in the same way as we did with columns. |
---|
315 | * However, the column calculation has created many nonzero AC terms, so |
---|
316 | * the simplification applies less often (typically 5% to 10% of the time). |
---|
317 | * On machines with very fast multiplication, it's possible that the |
---|
318 | * test takes more time than it's worth. In that case this section |
---|
319 | * may be commented out. |
---|
320 | */ |
---|
321 | |
---|
322 | #ifndef NO_ZERO_ROW_TEST |
---|
323 | if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && |
---|
324 | wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { |
---|
325 | /* AC terms all zero */ |
---|
326 | JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3) |
---|
327 | & RANGE_MASK]; |
---|
328 | |
---|
329 | outptr[0] = dcval; |
---|
330 | outptr[1] = dcval; |
---|
331 | outptr[2] = dcval; |
---|
332 | outptr[3] = dcval; |
---|
333 | outptr[4] = dcval; |
---|
334 | outptr[5] = dcval; |
---|
335 | outptr[6] = dcval; |
---|
336 | outptr[7] = dcval; |
---|
337 | |
---|
338 | wsptr += DCTSIZE; /* advance pointer to next row */ |
---|
339 | continue; |
---|
340 | } |
---|
341 | #endif |
---|
342 | |
---|
343 | /* Even part */ |
---|
344 | |
---|
345 | tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); |
---|
346 | tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); |
---|
347 | |
---|
348 | tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); |
---|
349 | tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) |
---|
350 | - tmp13; |
---|
351 | |
---|
352 | tmp0 = tmp10 + tmp13; |
---|
353 | tmp3 = tmp10 - tmp13; |
---|
354 | tmp1 = tmp11 + tmp12; |
---|
355 | tmp2 = tmp11 - tmp12; |
---|
356 | |
---|
357 | /* Odd part */ |
---|
358 | |
---|
359 | z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; |
---|
360 | z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; |
---|
361 | z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; |
---|
362 | z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; |
---|
363 | |
---|
364 | tmp7 = z11 + z13; /* phase 5 */ |
---|
365 | tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ |
---|
366 | |
---|
367 | z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ |
---|
368 | tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ |
---|
369 | tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ |
---|
370 | |
---|
371 | tmp6 = tmp12 - tmp7; /* phase 2 */ |
---|
372 | tmp5 = tmp11 - tmp6; |
---|
373 | tmp4 = tmp10 + tmp5; |
---|
374 | |
---|
375 | /* Final output stage: scale down by a factor of 8 and range-limit */ |
---|
376 | |
---|
377 | outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) |
---|
378 | & RANGE_MASK]; |
---|
379 | outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) |
---|
380 | & RANGE_MASK]; |
---|
381 | outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) |
---|
382 | & RANGE_MASK]; |
---|
383 | outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) |
---|
384 | & RANGE_MASK]; |
---|
385 | outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) |
---|
386 | & RANGE_MASK]; |
---|
387 | outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) |
---|
388 | & RANGE_MASK]; |
---|
389 | outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) |
---|
390 | & RANGE_MASK]; |
---|
391 | outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) |
---|
392 | & RANGE_MASK]; |
---|
393 | |
---|
394 | wsptr += DCTSIZE; /* advance pointer to next row */ |
---|
395 | } |
---|
396 | } |
---|
397 | |
---|
398 | #endif |
---|
399 | |
---|
400 | #ifdef HAVE_MMX_INTEL_MNEMONICS |
---|
401 | |
---|
402 | |
---|
403 | _inline GLOBAL(void) |
---|
404 | jpeg_idct_ifast_orig (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
405 | JCOEFPTR coef_block, |
---|
406 | JSAMPARRAY output_buf, JDIMENSION output_col) |
---|
407 | { |
---|
408 | DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
---|
409 | DCTELEM tmp10, tmp11, tmp12, tmp13; |
---|
410 | DCTELEM z5, z10, z11, z12, z13; |
---|
411 | JCOEFPTR inptr; |
---|
412 | IFAST_MULT_TYPE * quantptr; |
---|
413 | int * wsptr; |
---|
414 | JSAMPROW outptr; |
---|
415 | JSAMPLE *range_limit = IDCT_range_limit(cinfo); |
---|
416 | int ctr; |
---|
417 | int workspace[DCTSIZE2]; /* buffers data between passes */ |
---|
418 | SHIFT_TEMPS /* for DESCALE */ |
---|
419 | ISHIFT_TEMPS /* for IDESCALE */ |
---|
420 | |
---|
421 | /* Pass 1: process columns from input, store into work array. */ |
---|
422 | |
---|
423 | inptr = coef_block; |
---|
424 | quantptr = (IFAST_MULT_TYPE *) compptr->dct_table; |
---|
425 | wsptr = workspace; |
---|
426 | for (ctr = DCTSIZE; ctr > 0; ctr--) { |
---|
427 | /* Due to quantization, we will usually find that many of the input |
---|
428 | * coefficients are zero, especially the AC terms. We can exploit this |
---|
429 | * by short-circuiting the IDCT calculation for any column in which all |
---|
430 | * the AC terms are zero. In that case each output is equal to the |
---|
431 | * DC coefficient (with scale factor as needed). |
---|
432 | * With typical images and quantization tables, half or more of the |
---|
433 | * column DCT calculations can be simplified this way. |
---|
434 | */ |
---|
435 | |
---|
436 | if ((inptr[DCTSIZE*1] | inptr[DCTSIZE*2] | inptr[DCTSIZE*3] | |
---|
437 | inptr[DCTSIZE*4] | inptr[DCTSIZE*5] | inptr[DCTSIZE*6] | |
---|
438 | inptr[DCTSIZE*7]) == 0) { |
---|
439 | /* AC terms all zero */ |
---|
440 | int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
441 | |
---|
442 | wsptr[DCTSIZE*0] = dcval; |
---|
443 | wsptr[DCTSIZE*1] = dcval; |
---|
444 | wsptr[DCTSIZE*2] = dcval; |
---|
445 | wsptr[DCTSIZE*3] = dcval; |
---|
446 | wsptr[DCTSIZE*4] = dcval; |
---|
447 | wsptr[DCTSIZE*5] = dcval; |
---|
448 | wsptr[DCTSIZE*6] = dcval; |
---|
449 | wsptr[DCTSIZE*7] = dcval; |
---|
450 | |
---|
451 | inptr++; /* advance pointers to next column */ |
---|
452 | quantptr++; |
---|
453 | wsptr++; |
---|
454 | continue; |
---|
455 | } |
---|
456 | |
---|
457 | /* Even part */ |
---|
458 | |
---|
459 | tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
460 | tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); |
---|
461 | tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); |
---|
462 | tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); |
---|
463 | |
---|
464 | tmp10 = tmp0 + tmp2; /* phase 3 */ |
---|
465 | tmp11 = tmp0 - tmp2; |
---|
466 | |
---|
467 | tmp13 = tmp1 + tmp3; /* phases 5-3 */ |
---|
468 | tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ |
---|
469 | |
---|
470 | tmp0 = tmp10 + tmp13; /* phase 2 */ |
---|
471 | tmp3 = tmp10 - tmp13; |
---|
472 | tmp1 = tmp11 + tmp12; |
---|
473 | tmp2 = tmp11 - tmp12; |
---|
474 | |
---|
475 | /* Odd part */ |
---|
476 | |
---|
477 | tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); |
---|
478 | tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); |
---|
479 | tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); |
---|
480 | tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); |
---|
481 | |
---|
482 | z13 = tmp6 + tmp5; /* phase 6 */ |
---|
483 | z10 = tmp6 - tmp5; |
---|
484 | z11 = tmp4 + tmp7; |
---|
485 | z12 = tmp4 - tmp7; |
---|
486 | |
---|
487 | tmp7 = z11 + z13; /* phase 5 */ |
---|
488 | tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ |
---|
489 | |
---|
490 | z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ |
---|
491 | tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ |
---|
492 | tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ |
---|
493 | |
---|
494 | tmp6 = tmp12 - tmp7; /* phase 2 */ |
---|
495 | tmp5 = tmp11 - tmp6; |
---|
496 | tmp4 = tmp10 + tmp5; |
---|
497 | |
---|
498 | wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); |
---|
499 | wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); |
---|
500 | wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); |
---|
501 | wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); |
---|
502 | wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5); |
---|
503 | wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); |
---|
504 | wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); |
---|
505 | wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); |
---|
506 | |
---|
507 | inptr++; /* advance pointers to next column */ |
---|
508 | quantptr++; |
---|
509 | wsptr++; |
---|
510 | } |
---|
511 | |
---|
512 | /* Pass 2: process rows from work array, store into output array. */ |
---|
513 | /* Note that we must descale the results by a factor of 8 == 2**3, */ |
---|
514 | /* and also undo the PASS1_BITS scaling. */ |
---|
515 | |
---|
516 | wsptr = workspace; |
---|
517 | for (ctr = 0; ctr < DCTSIZE; ctr++) { |
---|
518 | outptr = output_buf[ctr] + output_col; |
---|
519 | /* Rows of zeroes can be exploited in the same way as we did with columns. |
---|
520 | * However, the column calculation has created many nonzero AC terms, so |
---|
521 | * the simplification applies less often (typically 5% to 10% of the time). |
---|
522 | * On machines with very fast multiplication, it's possible that the |
---|
523 | * test takes more time than it's worth. In that case this section |
---|
524 | * may be commented out. |
---|
525 | */ |
---|
526 | |
---|
527 | #ifndef NO_ZERO_ROW_TEST |
---|
528 | if ((wsptr[1] | wsptr[2] | wsptr[3] | wsptr[4] | wsptr[5] | wsptr[6] | |
---|
529 | wsptr[7]) == 0) { |
---|
530 | /* AC terms all zero */ |
---|
531 | JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3) |
---|
532 | & RANGE_MASK]; |
---|
533 | |
---|
534 | outptr[0] = dcval; |
---|
535 | outptr[1] = dcval; |
---|
536 | outptr[2] = dcval; |
---|
537 | outptr[3] = dcval; |
---|
538 | outptr[4] = dcval; |
---|
539 | outptr[5] = dcval; |
---|
540 | outptr[6] = dcval; |
---|
541 | outptr[7] = dcval; |
---|
542 | |
---|
543 | wsptr += DCTSIZE; /* advance pointer to next row */ |
---|
544 | continue; |
---|
545 | } |
---|
546 | #endif |
---|
547 | |
---|
548 | /* Even part */ |
---|
549 | |
---|
550 | tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); |
---|
551 | tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); |
---|
552 | |
---|
553 | tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); |
---|
554 | tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) |
---|
555 | - tmp13; |
---|
556 | |
---|
557 | tmp0 = tmp10 + tmp13; |
---|
558 | tmp3 = tmp10 - tmp13; |
---|
559 | tmp1 = tmp11 + tmp12; |
---|
560 | tmp2 = tmp11 - tmp12; |
---|
561 | |
---|
562 | /* Odd part */ |
---|
563 | |
---|
564 | z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; |
---|
565 | z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; |
---|
566 | z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; |
---|
567 | z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; |
---|
568 | |
---|
569 | tmp7 = z11 + z13; /* phase 5 */ |
---|
570 | tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ |
---|
571 | |
---|
572 | z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */ |
---|
573 | tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */ |
---|
574 | tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */ |
---|
575 | |
---|
576 | tmp6 = tmp12 - tmp7; /* phase 2 */ |
---|
577 | tmp5 = tmp11 - tmp6; |
---|
578 | tmp4 = tmp10 + tmp5; |
---|
579 | |
---|
580 | /* Final output stage: scale down by a factor of 8 and range-limit */ |
---|
581 | |
---|
582 | outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) |
---|
583 | & RANGE_MASK]; |
---|
584 | outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) |
---|
585 | & RANGE_MASK]; |
---|
586 | outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) |
---|
587 | & RANGE_MASK]; |
---|
588 | outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) |
---|
589 | & RANGE_MASK]; |
---|
590 | outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) |
---|
591 | & RANGE_MASK]; |
---|
592 | outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) |
---|
593 | & RANGE_MASK]; |
---|
594 | outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) |
---|
595 | & RANGE_MASK]; |
---|
596 | outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) |
---|
597 | & RANGE_MASK]; |
---|
598 | |
---|
599 | wsptr += DCTSIZE; /* advance pointer to next row */ |
---|
600 | } |
---|
601 | } |
---|
602 | |
---|
603 | |
---|
604 | static __int64 fix_141 = 0x5a825a825a825a82; |
---|
605 | static __int64 fix_184n261 = 0xcf04cf04cf04cf04; |
---|
606 | static __int64 fix_184 = 0x7641764176417641; |
---|
607 | static __int64 fix_n184 = 0x896f896f896f896f; |
---|
608 | static __int64 fix_108n184 = 0xcf04cf04cf04cf04; |
---|
609 | static __int64 const_0x0080 = 0x0080008000800080; |
---|
610 | |
---|
611 | |
---|
612 | __inline GLOBAL(void) |
---|
613 | jpeg_idct_ifast_mmx (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
---|
614 | JCOEFPTR inptr, |
---|
615 | JSAMPARRAY outptr, JDIMENSION output_col) |
---|
616 | { |
---|
617 | |
---|
618 | int16 workspace[DCTSIZE2 + 4]; /* buffers data between passes */ |
---|
619 | int16 *wsptr=workspace; |
---|
620 | int16 *quantptr=compptr->dct_table; |
---|
621 | |
---|
622 | __asm{ |
---|
623 | |
---|
624 | mov edi, quantptr |
---|
625 | mov ebx, inptr |
---|
626 | mov esi, wsptr |
---|
627 | add esi, 0x07 ;align wsptr to qword |
---|
628 | and esi, 0xfffffff8 ;align wsptr to qword |
---|
629 | |
---|
630 | mov eax, esi |
---|
631 | |
---|
632 | /* Odd part */ |
---|
633 | |
---|
634 | |
---|
635 | movq mm1, [ebx + 8*10] ;load inptr[DCTSIZE*5] |
---|
636 | |
---|
637 | pmullw mm1, [edi + 8*10] ;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); |
---|
638 | |
---|
639 | movq mm0, [ebx + 8*6] ;load inptr[DCTSIZE*3] |
---|
640 | |
---|
641 | pmullw mm0, [edi + 8*6] ;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); |
---|
642 | |
---|
643 | movq mm3, [ebx + 8*2] ;load inptr[DCTSIZE*1] |
---|
644 | movq mm2, mm1 ;copy tmp6 /* phase 6 */ |
---|
645 | |
---|
646 | pmullw mm3, [edi + 8*2] ;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); |
---|
647 | |
---|
648 | movq mm4, [ebx + 8*14] ;load inptr[DCTSIZE*1] |
---|
649 | paddw mm1, mm0 ;z13 = tmp6 + tmp5; |
---|
650 | |
---|
651 | pmullw mm4, [edi + 8*14] ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); |
---|
652 | psubw mm2, mm0 ;z10 = tmp6 - tmp5 |
---|
653 | |
---|
654 | psllw mm2, 2 ;shift z10 |
---|
655 | movq mm0, mm2 ;copy z10 |
---|
656 | |
---|
657 | pmulhw mm2, fix_184n261 ;MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */ |
---|
658 | movq mm5, mm3 ;copy tmp4 |
---|
659 | |
---|
660 | pmulhw mm0, fix_n184 ;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ |
---|
661 | paddw mm3, mm4 ;z11 = tmp4 + tmp7; |
---|
662 | |
---|
663 | movq mm6, mm3 ;copy z11 /* phase 5 */ |
---|
664 | psubw mm5, mm4 ;z12 = tmp4 - tmp7; |
---|
665 | |
---|
666 | psubw mm6, mm1 ;z11-z13 |
---|
667 | psllw mm5, 2 ;shift z12 |
---|
668 | |
---|
669 | movq mm4, [ebx + 8*12] ;load inptr[DCTSIZE*6], even part |
---|
670 | movq mm7, mm5 ;copy z12 |
---|
671 | |
---|
672 | pmulhw mm5, fix_108n184 ;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part |
---|
673 | paddw mm3, mm1 ;tmp7 = z11 + z13; |
---|
674 | |
---|
675 | |
---|
676 | /* Even part */ |
---|
677 | pmulhw mm7, fix_184 ;MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */ |
---|
678 | psllw mm6, 2 |
---|
679 | |
---|
680 | movq mm1, [ebx + 8*4] ;load inptr[DCTSIZE*2] |
---|
681 | |
---|
682 | pmullw mm1, [edi + 8*4] ;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); |
---|
683 | paddw mm0, mm5 ;tmp10 |
---|
684 | |
---|
685 | pmullw mm4, [edi + 8*12] ;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); |
---|
686 | paddw mm2, mm7 ;tmp12 |
---|
687 | |
---|
688 | pmulhw mm6, fix_141 ;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ |
---|
689 | psubw mm2, mm3 ;tmp6 = tmp12 - tmp7 |
---|
690 | |
---|
691 | movq mm5, mm1 ;copy tmp1 |
---|
692 | paddw mm1, mm4 ;tmp13= tmp1 + tmp3; /* phases 5-3 */ |
---|
693 | |
---|
694 | psubw mm5, mm4 ;tmp1-tmp3 |
---|
695 | psubw mm6, mm2 ;tmp5 = tmp11 - tmp6; |
---|
696 | |
---|
697 | movq [esi+8*0], mm1 ;save tmp13 in workspace |
---|
698 | psllw mm5, 2 ;shift tmp1-tmp3 |
---|
699 | |
---|
700 | movq mm7, [ebx + 8*0] ;load inptr[DCTSIZE*0] |
---|
701 | |
---|
702 | pmulhw mm5, fix_141 ;MULTIPLY(tmp1 - tmp3, FIX_1_414213562) |
---|
703 | paddw mm0, mm6 ;tmp4 = tmp10 + tmp5; |
---|
704 | |
---|
705 | pmullw mm7, [edi + 8*0] ;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
706 | |
---|
707 | movq mm4, [ebx + 8*8] ;load inptr[DCTSIZE*4] |
---|
708 | |
---|
709 | pmullw mm4, [edi + 8*8] ;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); |
---|
710 | psubw mm5, mm1 ;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ |
---|
711 | |
---|
712 | movq [esi+8*4], mm0 ;save tmp4 in workspace |
---|
713 | movq mm1, mm7 ;copy tmp0 /* phase 3 */ |
---|
714 | |
---|
715 | movq [esi+8*2], mm5 ;save tmp12 in workspace |
---|
716 | psubw mm1, mm4 ;tmp11 = tmp0 - tmp2; |
---|
717 | |
---|
718 | paddw mm7, mm4 ;tmp10 = tmp0 + tmp2; |
---|
719 | movq mm5, mm1 ;copy tmp11 |
---|
720 | |
---|
721 | paddw mm1, [esi+8*2] ;tmp1 = tmp11 + tmp12; |
---|
722 | movq mm4, mm7 ;copy tmp10 /* phase 2 */ |
---|
723 | |
---|
724 | paddw mm7, [esi+8*0] ;tmp0 = tmp10 + tmp13; |
---|
725 | |
---|
726 | psubw mm4, [esi+8*0] ;tmp3 = tmp10 - tmp13; |
---|
727 | movq mm0, mm7 ;copy tmp0 |
---|
728 | |
---|
729 | psubw mm5, [esi+8*2] ;tmp2 = tmp11 - tmp12; |
---|
730 | paddw mm7, mm3 ;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); |
---|
731 | |
---|
732 | psubw mm0, mm3 ;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); |
---|
733 | |
---|
734 | movq [esi + 8*0], mm7 ;wsptr[DCTSIZE*0] |
---|
735 | movq mm3, mm1 ;copy tmp1 |
---|
736 | |
---|
737 | movq [esi + 8*14], mm0 ;wsptr[DCTSIZE*7] |
---|
738 | paddw mm1, mm2 ;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); |
---|
739 | |
---|
740 | psubw mm3, mm2 ;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); |
---|
741 | |
---|
742 | movq [esi + 8*2], mm1 ;wsptr[DCTSIZE*1] |
---|
743 | movq mm1, mm4 ;copy tmp3 |
---|
744 | |
---|
745 | movq [esi + 8*12], mm3 ;wsptr[DCTSIZE*6] |
---|
746 | |
---|
747 | paddw mm4, [esi+8*4] ;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); |
---|
748 | |
---|
749 | psubw mm1, [esi+8*4] ;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); |
---|
750 | |
---|
751 | movq [esi + 8*8], mm4 |
---|
752 | movq mm7, mm5 ;copy tmp2 |
---|
753 | |
---|
754 | paddw mm5, mm6 ;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) |
---|
755 | |
---|
756 | movq [esi+8*6], mm1 ; |
---|
757 | psubw mm7, mm6 ;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); |
---|
758 | |
---|
759 | movq [esi + 8*4], mm5 |
---|
760 | |
---|
761 | movq [esi + 8*10], mm7 |
---|
762 | |
---|
763 | |
---|
764 | |
---|
765 | /*****************************************************************/ |
---|
766 | add edi, 8 |
---|
767 | add ebx, 8 |
---|
768 | add esi, 8 |
---|
769 | |
---|
770 | /*****************************************************************/ |
---|
771 | |
---|
772 | |
---|
773 | |
---|
774 | |
---|
775 | movq mm1, [ebx + 8*10] ;load inptr[DCTSIZE*5] |
---|
776 | |
---|
777 | pmullw mm1, [edi + 8*10] ;tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); |
---|
778 | |
---|
779 | movq mm0, [ebx + 8*6] ;load inptr[DCTSIZE*3] |
---|
780 | |
---|
781 | pmullw mm0, [edi + 8*6] ;tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); |
---|
782 | |
---|
783 | movq mm3, [ebx + 8*2] ;load inptr[DCTSIZE*1] |
---|
784 | movq mm2, mm1 ;copy tmp6 /* phase 6 */ |
---|
785 | |
---|
786 | pmullw mm3, [edi + 8*2] ;tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); |
---|
787 | |
---|
788 | movq mm4, [ebx + 8*14] ;load inptr[DCTSIZE*7] |
---|
789 | paddw mm1, mm0 ;z13 = tmp6 + tmp5; |
---|
790 | |
---|
791 | pmullw mm4, [edi + 8*14] ;tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); |
---|
792 | psubw mm2, mm0 ;z10 = tmp6 - tmp5 |
---|
793 | |
---|
794 | psllw mm2, 2 ;shift z10 |
---|
795 | movq mm0, mm2 ;copy z10 |
---|
796 | |
---|
797 | pmulhw mm2, fix_184n261 ;MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ |
---|
798 | movq mm5, mm3 ;copy tmp4 |
---|
799 | |
---|
800 | pmulhw mm0, fix_n184 ;MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */ |
---|
801 | paddw mm3, mm4 ;z11 = tmp4 + tmp7; |
---|
802 | |
---|
803 | movq mm6, mm3 ;copy z11 /* phase 5 */ |
---|
804 | psubw mm5, mm4 ;z12 = tmp4 - tmp7; |
---|
805 | |
---|
806 | psubw mm6, mm1 ;z11-z13 |
---|
807 | psllw mm5, 2 ;shift z12 |
---|
808 | |
---|
809 | movq mm4, [ebx + 8*12] ;load inptr[DCTSIZE*6], even part |
---|
810 | movq mm7, mm5 ;copy z12 |
---|
811 | |
---|
812 | pmulhw mm5, fix_108n184 ;MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part |
---|
813 | paddw mm3, mm1 ;tmp7 = z11 + z13; |
---|
814 | |
---|
815 | |
---|
816 | /* Even part */ |
---|
817 | pmulhw mm7, fix_184 ;MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ |
---|
818 | psllw mm6, 2 |
---|
819 | |
---|
820 | movq mm1, [ebx + 8*4] ;load inptr[DCTSIZE*2] |
---|
821 | |
---|
822 | pmullw mm1, [edi + 8*4] ;tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); |
---|
823 | paddw mm0, mm5 ;tmp10 |
---|
824 | |
---|
825 | pmullw mm4, [edi + 8*12] ;tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); |
---|
826 | paddw mm2, mm7 ;tmp12 |
---|
827 | |
---|
828 | pmulhw mm6, fix_141 ;tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */ |
---|
829 | psubw mm2, mm3 ;tmp6 = tmp12 - tmp7 |
---|
830 | |
---|
831 | movq mm5, mm1 ;copy tmp1 |
---|
832 | paddw mm1, mm4 ;tmp13= tmp1 + tmp3; /* phases 5-3 */ |
---|
833 | |
---|
834 | psubw mm5, mm4 ;tmp1-tmp3 |
---|
835 | psubw mm6, mm2 ;tmp5 = tmp11 - tmp6; |
---|
836 | |
---|
837 | movq [esi+8*0], mm1 ;save tmp13 in workspace |
---|
838 | psllw mm5, 2 ;shift tmp1-tmp3 |
---|
839 | |
---|
840 | movq mm7, [ebx + 8*0] ;load inptr[DCTSIZE*0] |
---|
841 | paddw mm0, mm6 ;tmp4 = tmp10 + tmp5; |
---|
842 | |
---|
843 | pmulhw mm5, fix_141 ;MULTIPLY(tmp1 - tmp3, FIX_1_414213562) |
---|
844 | |
---|
845 | pmullw mm7, [edi + 8*0] ;tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); |
---|
846 | |
---|
847 | movq mm4, [ebx + 8*8] ;load inptr[DCTSIZE*4] |
---|
848 | |
---|
849 | pmullw mm4, [edi + 8*8] ;tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); |
---|
850 | psubw mm5, mm1 ;tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */ |
---|
851 | |
---|
852 | movq [esi+8*4], mm0 ;save tmp4 in workspace |
---|
853 | movq mm1, mm7 ;copy tmp0 /* phase 3 */ |
---|
854 | |
---|
855 | movq [esi+8*2], mm5 ;save tmp12 in workspace |
---|
856 | psubw mm1, mm4 ;tmp11 = tmp0 - tmp2; |
---|
857 | |
---|
858 | paddw mm7, mm4 ;tmp10 = tmp0 + tmp2; |
---|
859 | movq mm5, mm1 ;copy tmp11 |
---|
860 | |
---|
861 | paddw mm1, [esi+8*2] ;tmp1 = tmp11 + tmp12; |
---|
862 | movq mm4, mm7 ;copy tmp10 /* phase 2 */ |
---|
863 | |
---|
864 | paddw mm7, [esi+8*0] ;tmp0 = tmp10 + tmp13; |
---|
865 | |
---|
866 | psubw mm4, [esi+8*0] ;tmp3 = tmp10 - tmp13; |
---|
867 | movq mm0, mm7 ;copy tmp0 |
---|
868 | |
---|
869 | psubw mm5, [esi+8*2] ;tmp2 = tmp11 - tmp12; |
---|
870 | paddw mm7, mm3 ;wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); |
---|
871 | |
---|
872 | psubw mm0, mm3 ;wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); |
---|
873 | |
---|
874 | movq [esi + 8*0], mm7 ;wsptr[DCTSIZE*0] |
---|
875 | movq mm3, mm1 ;copy tmp1 |
---|
876 | |
---|
877 | movq [esi + 8*14], mm0 ;wsptr[DCTSIZE*7] |
---|
878 | paddw mm1, mm2 ;wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); |
---|
879 | |
---|
880 | psubw mm3, mm2 ;wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); |
---|
881 | |
---|
882 | movq [esi + 8*2], mm1 ;wsptr[DCTSIZE*1] |
---|
883 | movq mm1, mm4 ;copy tmp3 |
---|
884 | |
---|
885 | movq [esi + 8*12], mm3 ;wsptr[DCTSIZE*6] |
---|
886 | |
---|
887 | paddw mm4, [esi+8*4] ;wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); |
---|
888 | |
---|
889 | psubw mm1, [esi+8*4] ;wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); |
---|
890 | |
---|
891 | movq [esi + 8*8], mm4 |
---|
892 | movq mm7, mm5 ;copy tmp2 |
---|
893 | |
---|
894 | paddw mm5, mm6 ;wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) |
---|
895 | |
---|
896 | movq [esi+8*6], mm1 ; |
---|
897 | psubw mm7, mm6 ;wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); |
---|
898 | |
---|
899 | movq [esi + 8*4], mm5 |
---|
900 | |
---|
901 | movq [esi + 8*10], mm7 |
---|
902 | |
---|
903 | |
---|
904 | |
---|
905 | |
---|
906 | /*****************************************************************/ |
---|
907 | |
---|
908 | /* Pass 2: process rows from work array, store into output array. */ |
---|
909 | /* Note that we must descale the results by a factor of 8 == 2**3, */ |
---|
910 | /* and also undo the PASS1_BITS scaling. */ |
---|
911 | |
---|
912 | /*****************************************************************/ |
---|
913 | /* Even part */ |
---|
914 | |
---|
915 | mov esi, eax |
---|
916 | mov eax, outptr |
---|
917 | |
---|
918 | // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); |
---|
919 | // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); |
---|
920 | // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); |
---|
921 | // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); |
---|
922 | movq mm0, [esi+8*0] ;wsptr[0,0],[0,1],[0,2],[0,3] |
---|
923 | |
---|
924 | movq mm1, [esi+8*1] ;wsptr[0,4],[0,5],[0,6],[0,7] |
---|
925 | movq mm2, mm0 |
---|
926 | |
---|
927 | movq mm3, [esi+8*2] ;wsptr[1,0],[1,1],[1,2],[1,3] |
---|
928 | paddw mm0, mm1 ;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] |
---|
929 | |
---|
930 | movq mm4, [esi+8*3] ;wsptr[1,4],[1,5],[1,6],[1,7] |
---|
931 | psubw mm2, mm1 ;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] |
---|
932 | |
---|
933 | movq mm6, mm0 |
---|
934 | movq mm5, mm3 |
---|
935 | |
---|
936 | paddw mm3, mm4 ;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] |
---|
937 | movq mm1, mm2 |
---|
938 | |
---|
939 | psubw mm5, mm4 ;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] |
---|
940 | punpcklwd mm0, mm3 ;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] |
---|
941 | |
---|
942 | movq mm7, [esi+8*7] ;wsptr[3,4],[3,5],[3,6],[3,7] |
---|
943 | punpckhwd mm6, mm3 ;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] |
---|
944 | |
---|
945 | movq mm3, [esi+8*4] ;wsptr[2,0],[2,1],[2,2],[2,3] |
---|
946 | punpckldq mm0, mm6 ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] |
---|
947 | |
---|
948 | punpcklwd mm1, mm5 ;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] |
---|
949 | movq mm4, mm3 |
---|
950 | |
---|
951 | movq mm6, [esi+8*6] ;wsptr[3,0],[3,1],[3,2],[3,3] |
---|
952 | punpckhwd mm2, mm5 ;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] |
---|
953 | |
---|
954 | movq mm5, [esi+8*5] ;wsptr[2,4],[2,5],[2,6],[2,7] |
---|
955 | punpckldq mm1, mm2 ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] |
---|
956 | |
---|
957 | |
---|
958 | paddw mm3, mm5 ;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] |
---|
959 | movq mm2, mm6 |
---|
960 | |
---|
961 | psubw mm4, mm5 ;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] |
---|
962 | paddw mm6, mm7 ;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] |
---|
963 | |
---|
964 | movq mm5, mm3 |
---|
965 | punpcklwd mm3, mm6 ;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] |
---|
966 | |
---|
967 | psubw mm2, mm7 ;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] |
---|
968 | punpckhwd mm5, mm6 ;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] |
---|
969 | |
---|
970 | movq mm7, mm4 |
---|
971 | punpckldq mm3, mm5 ;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] |
---|
972 | |
---|
973 | punpcklwd mm4, mm2 ;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] |
---|
974 | |
---|
975 | punpckhwd mm7, mm2 ;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] |
---|
976 | |
---|
977 | punpckldq mm4, mm7 ;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] |
---|
978 | movq mm6, mm1 |
---|
979 | |
---|
980 | // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] |
---|
981 | // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] |
---|
982 | |
---|
983 | |
---|
984 | movq mm2, mm0 |
---|
985 | punpckhdq mm6, mm4 ;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] |
---|
986 | |
---|
987 | punpckldq mm1, mm4 ;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] |
---|
988 | psllw mm6, 2 |
---|
989 | |
---|
990 | pmulhw mm6, fix_141 |
---|
991 | punpckldq mm0, mm3 ;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] |
---|
992 | |
---|
993 | punpckhdq mm2, mm3 ;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] |
---|
994 | movq mm7, mm0 |
---|
995 | |
---|
996 | // tmp0 = tmp10 + tmp13; |
---|
997 | // tmp3 = tmp10 - tmp13; |
---|
998 | paddw mm0, mm2 ;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] |
---|
999 | psubw mm7, mm2 ;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] |
---|
1000 | |
---|
1001 | // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; |
---|
1002 | psubw mm6, mm2 ;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] |
---|
1003 | // tmp1 = tmp11 + tmp12; |
---|
1004 | // tmp2 = tmp11 - tmp12; |
---|
1005 | movq mm5, mm1 |
---|
1006 | |
---|
1007 | |
---|
1008 | |
---|
1009 | /* Odd part */ |
---|
1010 | |
---|
1011 | // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; |
---|
1012 | // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; |
---|
1013 | // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; |
---|
1014 | // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; |
---|
1015 | movq mm3, [esi+8*0] ;wsptr[0,0],[0,1],[0,2],[0,3] |
---|
1016 | paddw mm1, mm6 ;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] |
---|
1017 | |
---|
1018 | movq mm4, [esi+8*1] ;wsptr[0,4],[0,5],[0,6],[0,7] |
---|
1019 | psubw mm5, mm6 ;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] |
---|
1020 | |
---|
1021 | movq mm6, mm3 |
---|
1022 | punpckldq mm3, mm4 ;wsptr[0,0],[0,1],[0,4],[0,5] |
---|
1023 | |
---|
1024 | punpckhdq mm4, mm6 ;wsptr[0,6],[0,7],[0,2],[0,3] |
---|
1025 | movq mm2, mm3 |
---|
1026 | |
---|
1027 | //Save tmp0 and tmp1 in wsptr |
---|
1028 | movq [esi+8*0], mm0 ;save tmp0 |
---|
1029 | paddw mm2, mm4 ;wsptr[xxx],[0,z11],[xxx],[0,z13] |
---|
1030 | |
---|
1031 | |
---|
1032 | //Continue with z10 --- z13 |
---|
1033 | movq mm6, [esi+8*2] ;wsptr[1,0],[1,1],[1,2],[1,3] |
---|
1034 | psubw mm3, mm4 ;wsptr[xxx],[0,z12],[xxx],[0,z10] |
---|
1035 | |
---|
1036 | movq mm0, [esi+8*3] ;wsptr[1,4],[1,5],[1,6],[1,7] |
---|
1037 | movq mm4, mm6 |
---|
1038 | |
---|
1039 | movq [esi+8*1], mm1 ;save tmp1 |
---|
1040 | punpckldq mm6, mm0 ;wsptr[1,0],[1,1],[1,4],[1,5] |
---|
1041 | |
---|
1042 | punpckhdq mm0, mm4 ;wsptr[1,6],[1,7],[1,2],[1,3] |
---|
1043 | movq mm1, mm6 |
---|
1044 | |
---|
1045 | //Save tmp2 and tmp3 in wsptr |
---|
1046 | paddw mm6, mm0 ;wsptr[xxx],[1,z11],[xxx],[1,z13] |
---|
1047 | movq mm4, mm2 |
---|
1048 | |
---|
1049 | //Continue with z10 --- z13 |
---|
1050 | movq [esi+8*2], mm5 ;save tmp2 |
---|
1051 | punpcklwd mm2, mm6 ;wsptr[xxx],[xxx],[0,z11],[1,z11] |
---|
1052 | |
---|
1053 | psubw mm1, mm0 ;wsptr[xxx],[1,z12],[xxx],[1,z10] |
---|
1054 | punpckhwd mm4, mm6 ;wsptr[xxx],[xxx],[0,z13],[1,z13] |
---|
1055 | |
---|
1056 | movq mm0, mm3 |
---|
1057 | punpcklwd mm3, mm1 ;wsptr[xxx],[xxx],[0,z12],[1,z12] |
---|
1058 | |
---|
1059 | movq [esi+8*3], mm7 ;save tmp3 |
---|
1060 | punpckhwd mm0, mm1 ;wsptr[xxx],[xxx],[0,z10],[1,z10] |
---|
1061 | |
---|
1062 | movq mm6, [esi+8*4] ;wsptr[2,0],[2,1],[2,2],[2,3] |
---|
1063 | punpckhdq mm0, mm2 ;wsptr[0,z10],[1,z10],[0,z11],[1,z11] |
---|
1064 | |
---|
1065 | movq mm7, [esi+8*5] ;wsptr[2,4],[2,5],[2,6],[2,7] |
---|
1066 | punpckhdq mm3, mm4 ;wsptr[0,z12],[1,z12],[0,z13],[1,z13] |
---|
1067 | |
---|
1068 | movq mm1, [esi+8*6] ;wsptr[3,0],[3,1],[3,2],[3,3] |
---|
1069 | movq mm4, mm6 |
---|
1070 | |
---|
1071 | punpckldq mm6, mm7 ;wsptr[2,0],[2,1],[2,4],[2,5] |
---|
1072 | movq mm5, mm1 |
---|
1073 | |
---|
1074 | punpckhdq mm7, mm4 ;wsptr[2,6],[2,7],[2,2],[2,3] |
---|
1075 | movq mm2, mm6 |
---|
1076 | |
---|
1077 | movq mm4, [esi+8*7] ;wsptr[3,4],[3,5],[3,6],[3,7] |
---|
1078 | paddw mm6, mm7 ;wsptr[xxx],[2,z11],[xxx],[2,z13] |
---|
1079 | |
---|
1080 | psubw mm2, mm7 ;wsptr[xxx],[2,z12],[xxx],[2,z10] |
---|
1081 | punpckldq mm1, mm4 ;wsptr[3,0],[3,1],[3,4],[3,5] |
---|
1082 | |
---|
1083 | punpckhdq mm4, mm5 ;wsptr[3,6],[3,7],[3,2],[3,3] |
---|
1084 | movq mm7, mm1 |
---|
1085 | |
---|
1086 | paddw mm1, mm4 ;wsptr[xxx],[3,z11],[xxx],[3,z13] |
---|
1087 | psubw mm7, mm4 ;wsptr[xxx],[3,z12],[xxx],[3,z10] |
---|
1088 | |
---|
1089 | movq mm5, mm6 |
---|
1090 | punpcklwd mm6, mm1 ;wsptr[xxx],[xxx],[2,z11],[3,z11] |
---|
1091 | |
---|
1092 | punpckhwd mm5, mm1 ;wsptr[xxx],[xxx],[2,z13],[3,z13] |
---|
1093 | movq mm4, mm2 |
---|
1094 | |
---|
1095 | punpcklwd mm2, mm7 ;wsptr[xxx],[xxx],[2,z12],[3,z12] |
---|
1096 | |
---|
1097 | punpckhwd mm4, mm7 ;wsptr[xxx],[xxx],[2,z10],[3,z10] |
---|
1098 | |
---|
1099 | punpckhdq mm4, mm6 ;wsptr[2,z10],[3,z10],[2,z11],[3,z11] |
---|
1100 | |
---|
1101 | punpckhdq mm2, mm5 ;wsptr[2,z12],[3,z12],[2,z13],[3,z13] |
---|
1102 | movq mm5, mm0 |
---|
1103 | |
---|
1104 | punpckldq mm0, mm4 ;wsptr[0,z10],[1,z10],[2,z10],[3,z10] |
---|
1105 | |
---|
1106 | punpckhdq mm5, mm4 ;wsptr[0,z11],[1,z11],[2,z11],[3,z11] |
---|
1107 | movq mm4, mm3 |
---|
1108 | |
---|
1109 | punpckhdq mm4, mm2 ;wsptr[0,z13],[1,z13],[2,z13],[3,z13] |
---|
1110 | movq mm1, mm5 |
---|
1111 | |
---|
1112 | punpckldq mm3, mm2 ;wsptr[0,z12],[1,z12],[2,z12],[3,z12] |
---|
1113 | // tmp7 = z11 + z13; /* phase 5 */ |
---|
1114 | // tmp8 = z11 - z13; /* phase 5 */ |
---|
1115 | psubw mm1, mm4 ;tmp8 |
---|
1116 | |
---|
1117 | paddw mm5, mm4 ;tmp7 |
---|
1118 | // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ |
---|
1119 | psllw mm1, 2 |
---|
1120 | |
---|
1121 | psllw mm0, 2 |
---|
1122 | |
---|
1123 | pmulhw mm1, fix_141 ;tmp21 |
---|
1124 | // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ |
---|
1125 | // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ |
---|
1126 | psllw mm3, 2 |
---|
1127 | movq mm7, mm0 |
---|
1128 | |
---|
1129 | pmulhw mm7, fix_n184 |
---|
1130 | movq mm6, mm3 |
---|
1131 | |
---|
1132 | movq mm2, [esi+8*0] ;tmp0,final1 |
---|
1133 | |
---|
1134 | pmulhw mm6, fix_108n184 |
---|
1135 | // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ |
---|
1136 | // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ |
---|
1137 | movq mm4, mm2 ;final1 |
---|
1138 | |
---|
1139 | pmulhw mm0, fix_184n261 |
---|
1140 | paddw mm2, mm5 ;tmp0+tmp7,final1 |
---|
1141 | |
---|
1142 | pmulhw mm3, fix_184 |
---|
1143 | psubw mm4, mm5 ;tmp0-tmp7,final1 |
---|
1144 | |
---|
1145 | // tmp6 = tmp22 - tmp7; /* phase 2 */ |
---|
1146 | psraw mm2, 5 ;outptr[0,0],[1,0],[2,0],[3,0],final1 |
---|
1147 | |
---|
1148 | paddsw mm2, const_0x0080 ;final1 |
---|
1149 | paddw mm7, mm6 ;tmp20 |
---|
1150 | psraw mm4, 5 ;outptr[0,7],[1,7],[2,7],[3,7],final1 |
---|
1151 | |
---|
1152 | paddsw mm4, const_0x0080 ;final1 |
---|
1153 | paddw mm3, mm0 ;tmp22 |
---|
1154 | |
---|
1155 | // tmp5 = tmp21 - tmp6; |
---|
1156 | psubw mm3, mm5 ;tmp6 |
---|
1157 | |
---|
1158 | // tmp4 = tmp20 + tmp5; |
---|
1159 | movq mm0, [esi+8*1] ;tmp1,final2 |
---|
1160 | psubw mm1, mm3 ;tmp5 |
---|
1161 | |
---|
1162 | movq mm6, mm0 ;final2 |
---|
1163 | paddw mm0, mm3 ;tmp1+tmp6,final2 |
---|
1164 | |
---|
1165 | /* Final output stage: scale down by a factor of 8 and range-limit */ |
---|
1166 | |
---|
1167 | |
---|
1168 | // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) |
---|
1169 | // & RANGE_MASK]; |
---|
1170 | // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) |
---|
1171 | // & RANGE_MASK]; final1 |
---|
1172 | |
---|
1173 | |
---|
1174 | // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) |
---|
1175 | // & RANGE_MASK]; |
---|
1176 | // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) |
---|
1177 | // & RANGE_MASK]; final2 |
---|
1178 | psubw mm6, mm3 ;tmp1-tmp6,final2 |
---|
1179 | psraw mm0, 5 ;outptr[0,1],[1,1],[2,1],[3,1] |
---|
1180 | |
---|
1181 | paddsw mm0, const_0x0080 |
---|
1182 | psraw mm6, 5 ;outptr[0,6],[1,6],[2,6],[3,6] |
---|
1183 | |
---|
1184 | paddsw mm6, const_0x0080 ;need to check this value |
---|
1185 | packuswb mm0, mm4 ;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] |
---|
1186 | |
---|
1187 | movq mm5, [esi+8*2] ;tmp2,final3 |
---|
1188 | packuswb mm2, mm6 ;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] |
---|
1189 | |
---|
1190 | // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) |
---|
1191 | // & RANGE_MASK]; |
---|
1192 | // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) |
---|
1193 | // & RANGE_MASK]; final3 |
---|
1194 | paddw mm7, mm1 ;tmp4 |
---|
1195 | movq mm3, mm5 |
---|
1196 | |
---|
1197 | paddw mm5, mm1 ;tmp2+tmp5 |
---|
1198 | psubw mm3, mm1 ;tmp2-tmp5 |
---|
1199 | |
---|
1200 | psraw mm5, 5 ;outptr[0,2],[1,2],[2,2],[3,2] |
---|
1201 | |
---|
1202 | paddsw mm5, const_0x0080 |
---|
1203 | movq mm4, [esi+8*3] ;tmp3,final4 |
---|
1204 | psraw mm3, 5 ;outptr[0,5],[1,5],[2,5],[3,5] |
---|
1205 | |
---|
1206 | paddsw mm3, const_0x0080 |
---|
1207 | |
---|
1208 | |
---|
1209 | // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) |
---|
1210 | // & RANGE_MASK]; |
---|
1211 | // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) |
---|
1212 | // & RANGE_MASK]; final4 |
---|
1213 | movq mm6, mm4 |
---|
1214 | paddw mm4, mm7 ;tmp3+tmp4 |
---|
1215 | |
---|
1216 | psubw mm6, mm7 ;tmp3-tmp4 |
---|
1217 | psraw mm4, 5 ;outptr[0,4],[1,4],[2,4],[3,4] |
---|
1218 | mov ecx, [eax] |
---|
1219 | |
---|
1220 | paddsw mm4, const_0x0080 |
---|
1221 | psraw mm6, 5 ;outptr[0,3],[1,3],[2,3],[3,3] |
---|
1222 | |
---|
1223 | paddsw mm6, const_0x0080 |
---|
1224 | packuswb mm5, mm4 ;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] |
---|
1225 | |
---|
1226 | packuswb mm6, mm3 ;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] |
---|
1227 | movq mm4, mm2 |
---|
1228 | |
---|
1229 | movq mm7, mm5 |
---|
1230 | punpcklbw mm2, mm0 ;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] |
---|
1231 | |
---|
1232 | punpckhbw mm4, mm0 ;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] |
---|
1233 | movq mm1, mm2 |
---|
1234 | |
---|
1235 | punpcklbw mm5, mm6 ;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] |
---|
1236 | add eax, 4 |
---|
1237 | |
---|
1238 | punpckhbw mm7, mm6 ;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] |
---|
1239 | |
---|
1240 | punpcklwd mm2, mm5 ;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] |
---|
1241 | add ecx, output_col |
---|
1242 | |
---|
1243 | movq mm6, mm7 |
---|
1244 | punpckhwd mm1, mm5 ;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] |
---|
1245 | |
---|
1246 | movq mm0, mm2 |
---|
1247 | punpcklwd mm6, mm4 ;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] |
---|
1248 | |
---|
1249 | mov ebx, [eax] |
---|
1250 | punpckldq mm2, mm6 ;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] |
---|
1251 | |
---|
1252 | add eax, 4 |
---|
1253 | movq mm3, mm1 |
---|
1254 | |
---|
1255 | add ebx, output_col |
---|
1256 | punpckhwd mm7, mm4 ;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] |
---|
1257 | |
---|
1258 | movq [ecx], mm2 |
---|
1259 | punpckhdq mm0, mm6 ;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] |
---|
1260 | |
---|
1261 | mov ecx, [eax] |
---|
1262 | add eax, 4 |
---|
1263 | add ecx, output_col |
---|
1264 | |
---|
1265 | movq [ebx], mm0 |
---|
1266 | punpckldq mm1, mm7 ;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] |
---|
1267 | |
---|
1268 | mov ebx, [eax] |
---|
1269 | |
---|
1270 | add ebx, output_col |
---|
1271 | punpckhdq mm3, mm7 ;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] |
---|
1272 | movq [ecx], mm1 |
---|
1273 | |
---|
1274 | |
---|
1275 | movq [ebx], mm3 |
---|
1276 | |
---|
1277 | |
---|
1278 | |
---|
1279 | /*******************************************************************/ |
---|
1280 | |
---|
1281 | |
---|
1282 | add esi, 64 |
---|
1283 | add eax, 4 |
---|
1284 | |
---|
1285 | /*******************************************************************/ |
---|
1286 | |
---|
1287 | // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); |
---|
1288 | // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); |
---|
1289 | // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); |
---|
1290 | // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); |
---|
1291 | movq mm0, [esi+8*0] ;wsptr[0,0],[0,1],[0,2],[0,3] |
---|
1292 | |
---|
1293 | movq mm1, [esi+8*1] ;wsptr[0,4],[0,5],[0,6],[0,7] |
---|
1294 | movq mm2, mm0 |
---|
1295 | |
---|
1296 | movq mm3, [esi+8*2] ;wsptr[1,0],[1,1],[1,2],[1,3] |
---|
1297 | paddw mm0, mm1 ;wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] |
---|
1298 | |
---|
1299 | movq mm4, [esi+8*3] ;wsptr[1,4],[1,5],[1,6],[1,7] |
---|
1300 | psubw mm2, mm1 ;wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] |
---|
1301 | |
---|
1302 | movq mm6, mm0 |
---|
1303 | movq mm5, mm3 |
---|
1304 | |
---|
1305 | paddw mm3, mm4 ;wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] |
---|
1306 | movq mm1, mm2 |
---|
1307 | |
---|
1308 | psubw mm5, mm4 ;wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] |
---|
1309 | punpcklwd mm0, mm3 ;wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] |
---|
1310 | |
---|
1311 | movq mm7, [esi+8*7] ;wsptr[3,4],[3,5],[3,6],[3,7] |
---|
1312 | punpckhwd mm6, mm3 ;wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] |
---|
1313 | |
---|
1314 | movq mm3, [esi+8*4] ;wsptr[2,0],[2,1],[2,2],[2,3] |
---|
1315 | punpckldq mm0, mm6 ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] |
---|
1316 | |
---|
1317 | punpcklwd mm1, mm5 ;wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] |
---|
1318 | movq mm4, mm3 |
---|
1319 | |
---|
1320 | movq mm6, [esi+8*6] ;wsptr[3,0],[3,1],[3,2],[3,3] |
---|
1321 | punpckhwd mm2, mm5 ;wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] |
---|
1322 | |
---|
1323 | movq mm5, [esi+8*5] ;wsptr[2,4],[2,5],[2,6],[2,7] |
---|
1324 | punpckldq mm1, mm2 ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] |
---|
1325 | |
---|
1326 | |
---|
1327 | paddw mm3, mm5 ;wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] |
---|
1328 | movq mm2, mm6 |
---|
1329 | |
---|
1330 | psubw mm4, mm5 ;wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] |
---|
1331 | paddw mm6, mm7 ;wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] |
---|
1332 | |
---|
1333 | movq mm5, mm3 |
---|
1334 | punpcklwd mm3, mm6 ;wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] |
---|
1335 | |
---|
1336 | psubw mm2, mm7 ;wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] |
---|
1337 | punpckhwd mm5, mm6 ;wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] |
---|
1338 | |
---|
1339 | movq mm7, mm4 |
---|
1340 | punpckldq mm3, mm5 ;wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] |
---|
1341 | |
---|
1342 | punpcklwd mm4, mm2 ;wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] |
---|
1343 | |
---|
1344 | punpckhwd mm7, mm2 ;wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] |
---|
1345 | |
---|
1346 | punpckldq mm4, mm7 ;wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] |
---|
1347 | movq mm6, mm1 |
---|
1348 | |
---|
1349 | // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] |
---|
1350 | // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] |
---|
1351 | |
---|
1352 | |
---|
1353 | movq mm2, mm0 |
---|
1354 | punpckhdq mm6, mm4 ;wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] |
---|
1355 | |
---|
1356 | punpckldq mm1, mm4 ;wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] |
---|
1357 | psllw mm6, 2 |
---|
1358 | |
---|
1359 | pmulhw mm6, fix_141 |
---|
1360 | punpckldq mm0, mm3 ;wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] |
---|
1361 | |
---|
1362 | punpckhdq mm2, mm3 ;wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] |
---|
1363 | movq mm7, mm0 |
---|
1364 | |
---|
1365 | // tmp0 = tmp10 + tmp13; |
---|
1366 | // tmp3 = tmp10 - tmp13; |
---|
1367 | paddw mm0, mm2 ;[0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] |
---|
1368 | psubw mm7, mm2 ;[0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] |
---|
1369 | |
---|
1370 | // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; |
---|
1371 | psubw mm6, mm2 ;wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] |
---|
1372 | // tmp1 = tmp11 + tmp12; |
---|
1373 | // tmp2 = tmp11 - tmp12; |
---|
1374 | movq mm5, mm1 |
---|
1375 | |
---|
1376 | |
---|
1377 | |
---|
1378 | /* Odd part */ |
---|
1379 | |
---|
1380 | // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; |
---|
1381 | // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; |
---|
1382 | // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; |
---|
1383 | // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; |
---|
1384 | movq mm3, [esi+8*0] ;wsptr[0,0],[0,1],[0,2],[0,3] |
---|
1385 | paddw mm1, mm6 ;[0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] |
---|
1386 | |
---|
1387 | movq mm4, [esi+8*1] ;wsptr[0,4],[0,5],[0,6],[0,7] |
---|
1388 | psubw mm5, mm6 ;[0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] |
---|
1389 | |
---|
1390 | movq mm6, mm3 |
---|
1391 | punpckldq mm3, mm4 ;wsptr[0,0],[0,1],[0,4],[0,5] |
---|
1392 | |
---|
1393 | punpckhdq mm4, mm6 ;wsptr[0,6],[0,7],[0,2],[0,3] |
---|
1394 | movq mm2, mm3 |
---|
1395 | |
---|
1396 | //Save tmp0 and tmp1 in wsptr |
---|
1397 | movq [esi+8*0], mm0 ;save tmp0 |
---|
1398 | paddw mm2, mm4 ;wsptr[xxx],[0,z11],[xxx],[0,z13] |
---|
1399 | |
---|
1400 | |
---|
1401 | //Continue with z10 --- z13 |
---|
1402 | movq mm6, [esi+8*2] ;wsptr[1,0],[1,1],[1,2],[1,3] |
---|
1403 | psubw mm3, mm4 ;wsptr[xxx],[0,z12],[xxx],[0,z10] |
---|
1404 | |
---|
1405 | movq mm0, [esi+8*3] ;wsptr[1,4],[1,5],[1,6],[1,7] |
---|
1406 | movq mm4, mm6 |
---|
1407 | |
---|
1408 | movq [esi+8*1], mm1 ;save tmp1 |
---|
1409 | punpckldq mm6, mm0 ;wsptr[1,0],[1,1],[1,4],[1,5] |
---|
1410 | |
---|
1411 | punpckhdq mm0, mm4 ;wsptr[1,6],[1,7],[1,2],[1,3] |
---|
1412 | movq mm1, mm6 |
---|
1413 | |
---|
1414 | //Save tmp2 and tmp3 in wsptr |
---|
1415 | paddw mm6, mm0 ;wsptr[xxx],[1,z11],[xxx],[1,z13] |
---|
1416 | movq mm4, mm2 |
---|
1417 | |
---|
1418 | //Continue with z10 --- z13 |
---|
1419 | movq [esi+8*2], mm5 ;save tmp2 |
---|
1420 | punpcklwd mm2, mm6 ;wsptr[xxx],[xxx],[0,z11],[1,z11] |
---|
1421 | |
---|
1422 | psubw mm1, mm0 ;wsptr[xxx],[1,z12],[xxx],[1,z10] |
---|
1423 | punpckhwd mm4, mm6 ;wsptr[xxx],[xxx],[0,z13],[1,z13] |
---|
1424 | |
---|
1425 | movq mm0, mm3 |
---|
1426 | punpcklwd mm3, mm1 ;wsptr[xxx],[xxx],[0,z12],[1,z12] |
---|
1427 | |
---|
1428 | movq [esi+8*3], mm7 ;save tmp3 |
---|
1429 | punpckhwd mm0, mm1 ;wsptr[xxx],[xxx],[0,z10],[1,z10] |
---|
1430 | |
---|
1431 | movq mm6, [esi+8*4] ;wsptr[2,0],[2,1],[2,2],[2,3] |
---|
1432 | punpckhdq mm0, mm2 ;wsptr[0,z10],[1,z10],[0,z11],[1,z11] |
---|
1433 | |
---|
1434 | movq mm7, [esi+8*5] ;wsptr[2,4],[2,5],[2,6],[2,7] |
---|
1435 | punpckhdq mm3, mm4 ;wsptr[0,z12],[1,z12],[0,z13],[1,z13] |
---|
1436 | |
---|
1437 | movq mm1, [esi+8*6] ;wsptr[3,0],[3,1],[3,2],[3,3] |
---|
1438 | movq mm4, mm6 |
---|
1439 | |
---|
1440 | punpckldq mm6, mm7 ;wsptr[2,0],[2,1],[2,4],[2,5] |
---|
1441 | movq mm5, mm1 |
---|
1442 | |
---|
1443 | punpckhdq mm7, mm4 ;wsptr[2,6],[2,7],[2,2],[2,3] |
---|
1444 | movq mm2, mm6 |
---|
1445 | |
---|
1446 | movq mm4, [esi+8*7] ;wsptr[3,4],[3,5],[3,6],[3,7] |
---|
1447 | paddw mm6, mm7 ;wsptr[xxx],[2,z11],[xxx],[2,z13] |
---|
1448 | |
---|
1449 | psubw mm2, mm7 ;wsptr[xxx],[2,z12],[xxx],[2,z10] |
---|
1450 | punpckldq mm1, mm4 ;wsptr[3,0],[3,1],[3,4],[3,5] |
---|
1451 | |
---|
1452 | punpckhdq mm4, mm5 ;wsptr[3,6],[3,7],[3,2],[3,3] |
---|
1453 | movq mm7, mm1 |
---|
1454 | |
---|
1455 | paddw mm1, mm4 ;wsptr[xxx],[3,z11],[xxx],[3,z13] |
---|
1456 | psubw mm7, mm4 ;wsptr[xxx],[3,z12],[xxx],[3,z10] |
---|
1457 | |
---|
1458 | movq mm5, mm6 |
---|
1459 | punpcklwd mm6, mm1 ;wsptr[xxx],[xxx],[2,z11],[3,z11] |
---|
1460 | |
---|
1461 | punpckhwd mm5, mm1 ;wsptr[xxx],[xxx],[2,z13],[3,z13] |
---|
1462 | movq mm4, mm2 |
---|
1463 | |
---|
1464 | punpcklwd mm2, mm7 ;wsptr[xxx],[xxx],[2,z12],[3,z12] |
---|
1465 | |
---|
1466 | punpckhwd mm4, mm7 ;wsptr[xxx],[xxx],[2,z10],[3,z10] |
---|
1467 | |
---|
1468 | punpckhdq mm4, mm6 ;wsptr[2,z10],[3,z10],[2,z11],[3,z11] |
---|
1469 | |
---|
1470 | punpckhdq mm2, mm5 ;wsptr[2,z12],[3,z12],[2,z13],[3,z13] |
---|
1471 | movq mm5, mm0 |
---|
1472 | |
---|
1473 | punpckldq mm0, mm4 ;wsptr[0,z10],[1,z10],[2,z10],[3,z10] |
---|
1474 | |
---|
1475 | punpckhdq mm5, mm4 ;wsptr[0,z11],[1,z11],[2,z11],[3,z11] |
---|
1476 | movq mm4, mm3 |
---|
1477 | |
---|
1478 | punpckhdq mm4, mm2 ;wsptr[0,z13],[1,z13],[2,z13],[3,z13] |
---|
1479 | movq mm1, mm5 |
---|
1480 | |
---|
1481 | punpckldq mm3, mm2 ;wsptr[0,z12],[1,z12],[2,z12],[3,z12] |
---|
1482 | // tmp7 = z11 + z13; /* phase 5 */ |
---|
1483 | // tmp8 = z11 - z13; /* phase 5 */ |
---|
1484 | psubw mm1, mm4 ;tmp8 |
---|
1485 | |
---|
1486 | paddw mm5, mm4 ;tmp7 |
---|
1487 | // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */ |
---|
1488 | psllw mm1, 2 |
---|
1489 | |
---|
1490 | psllw mm0, 2 |
---|
1491 | |
---|
1492 | pmulhw mm1, fix_141 ;tmp21 |
---|
1493 | // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */ |
---|
1494 | // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */ |
---|
1495 | psllw mm3, 2 |
---|
1496 | movq mm7, mm0 |
---|
1497 | |
---|
1498 | pmulhw mm7, fix_n184 |
---|
1499 | movq mm6, mm3 |
---|
1500 | |
---|
1501 | movq mm2, [esi+8*0] ;tmp0,final1 |
---|
1502 | |
---|
1503 | pmulhw mm6, fix_108n184 |
---|
1504 | // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */ |
---|
1505 | // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */ |
---|
1506 | movq mm4, mm2 ;final1 |
---|
1507 | |
---|
1508 | pmulhw mm0, fix_184n261 |
---|
1509 | paddw mm2, mm5 ;tmp0+tmp7,final1 |
---|
1510 | |
---|
1511 | pmulhw mm3, fix_184 |
---|
1512 | psubw mm4, mm5 ;tmp0-tmp7,final1 |
---|
1513 | |
---|
1514 | // tmp6 = tmp22 - tmp7; /* phase 2 */ |
---|
1515 | psraw mm2, 5 ;outptr[0,0],[1,0],[2,0],[3,0],final1 |
---|
1516 | |
---|
1517 | paddsw mm2, const_0x0080 ;final1 |
---|
1518 | paddw mm7, mm6 ;tmp20 |
---|
1519 | psraw mm4, 5 ;outptr[0,7],[1,7],[2,7],[3,7],final1 |
---|
1520 | |
---|
1521 | paddsw mm4, const_0x0080 ;final1 |
---|
1522 | paddw mm3, mm0 ;tmp22 |
---|
1523 | |
---|
1524 | // tmp5 = tmp21 - tmp6; |
---|
1525 | psubw mm3, mm5 ;tmp6 |
---|
1526 | |
---|
1527 | // tmp4 = tmp20 + tmp5; |
---|
1528 | movq mm0, [esi+8*1] ;tmp1,final2 |
---|
1529 | psubw mm1, mm3 ;tmp5 |
---|
1530 | |
---|
1531 | movq mm6, mm0 ;final2 |
---|
1532 | paddw mm0, mm3 ;tmp1+tmp6,final2 |
---|
1533 | |
---|
1534 | /* Final output stage: scale down by a factor of 8 and range-limit */ |
---|
1535 | |
---|
1536 | |
---|
1537 | // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) |
---|
1538 | // & RANGE_MASK]; |
---|
1539 | // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) |
---|
1540 | // & RANGE_MASK]; final1 |
---|
1541 | |
---|
1542 | |
---|
1543 | // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) |
---|
1544 | // & RANGE_MASK]; |
---|
1545 | // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) |
---|
1546 | // & RANGE_MASK]; final2 |
---|
1547 | psubw mm6, mm3 ;tmp1-tmp6,final2 |
---|
1548 | psraw mm0, 5 ;outptr[0,1],[1,1],[2,1],[3,1] |
---|
1549 | |
---|
1550 | paddsw mm0, const_0x0080 |
---|
1551 | psraw mm6, 5 ;outptr[0,6],[1,6],[2,6],[3,6] |
---|
1552 | |
---|
1553 | paddsw mm6, const_0x0080 ;need to check this value |
---|
1554 | packuswb mm0, mm4 ;out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] |
---|
1555 | |
---|
1556 | movq mm5, [esi+8*2] ;tmp2,final3 |
---|
1557 | packuswb mm2, mm6 ;out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] |
---|
1558 | |
---|
1559 | // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) |
---|
1560 | // & RANGE_MASK]; |
---|
1561 | // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) |
---|
1562 | // & RANGE_MASK]; final3 |
---|
1563 | paddw mm7, mm1 ;tmp4 |
---|
1564 | movq mm3, mm5 |
---|
1565 | |
---|
1566 | paddw mm5, mm1 ;tmp2+tmp5 |
---|
1567 | psubw mm3, mm1 ;tmp2-tmp5 |
---|
1568 | |
---|
1569 | psraw mm5, 5 ;outptr[0,2],[1,2],[2,2],[3,2] |
---|
1570 | |
---|
1571 | paddsw mm5, const_0x0080 |
---|
1572 | movq mm4, [esi+8*3] ;tmp3,final4 |
---|
1573 | psraw mm3, 5 ;outptr[0,5],[1,5],[2,5],[3,5] |
---|
1574 | |
---|
1575 | paddsw mm3, const_0x0080 |
---|
1576 | |
---|
1577 | |
---|
1578 | // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) |
---|
1579 | // & RANGE_MASK]; |
---|
1580 | // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) |
---|
1581 | // & RANGE_MASK]; final4 |
---|
1582 | movq mm6, mm4 |
---|
1583 | paddw mm4, mm7 ;tmp3+tmp4 |
---|
1584 | |
---|
1585 | psubw mm6, mm7 ;tmp3-tmp4 |
---|
1586 | psraw mm4, 5 ;outptr[0,4],[1,4],[2,4],[3,4] |
---|
1587 | mov ecx, [eax] |
---|
1588 | |
---|
1589 | paddsw mm4, const_0x0080 |
---|
1590 | psraw mm6, 5 ;outptr[0,3],[1,3],[2,3],[3,3] |
---|
1591 | |
---|
1592 | paddsw mm6, const_0x0080 |
---|
1593 | packuswb mm5, mm4 ;out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] |
---|
1594 | |
---|
1595 | packuswb mm6, mm3 ;out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] |
---|
1596 | movq mm4, mm2 |
---|
1597 | |
---|
1598 | movq mm7, mm5 |
---|
1599 | punpcklbw mm2, mm0 ;out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] |
---|
1600 | |
---|
1601 | punpckhbw mm4, mm0 ;out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] |
---|
1602 | movq mm1, mm2 |
---|
1603 | |
---|
1604 | punpcklbw mm5, mm6 ;out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] |
---|
1605 | add eax, 4 |
---|
1606 | |
---|
1607 | punpckhbw mm7, mm6 ;out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] |
---|
1608 | |
---|
1609 | punpcklwd mm2, mm5 ;out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] |
---|
1610 | add ecx, output_col |
---|
1611 | |
---|
1612 | movq mm6, mm7 |
---|
1613 | punpckhwd mm1, mm5 ;out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] |
---|
1614 | |
---|
1615 | movq mm0, mm2 |
---|
1616 | punpcklwd mm6, mm4 ;out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] |
---|
1617 | |
---|
1618 | mov ebx, [eax] |
---|
1619 | punpckldq mm2, mm6 ;out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] |
---|
1620 | |
---|
1621 | add eax, 4 |
---|
1622 | movq mm3, mm1 |
---|
1623 | |
---|
1624 | add ebx, output_col |
---|
1625 | punpckhwd mm7, mm4 ;out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] |
---|
1626 | |
---|
1627 | movq [ecx], mm2 |
---|
1628 | punpckhdq mm0, mm6 ;out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] |
---|
1629 | |
---|
1630 | mov ecx, [eax] |
---|
1631 | add eax, 4 |
---|
1632 | add ecx, output_col |
---|
1633 | |
---|
1634 | movq [ebx], mm0 |
---|
1635 | punpckldq mm1, mm7 ;out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] |
---|
1636 | |
---|
1637 | mov ebx, [eax] |
---|
1638 | |
---|
1639 | add ebx, output_col |
---|
1640 | punpckhdq mm3, mm7 ;out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] |
---|
1641 | movq [ecx], mm1 |
---|
1642 | |
---|
1643 | movq [ebx], mm3 |
---|
1644 | |
---|
1645 | emms |
---|
1646 | } |
---|
1647 | } |
---|
1648 | #endif |
---|
1649 | |
---|
1650 | #endif /* DCT_IFAST_SUPPORTED */ |
---|