source: trunk/third/firefox/jpeg/jdmerge.c @ 21695

Revision 21695, 30.0 KB checked in by rbasch, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r21694, which included commits to RCS files with non-trunk default branches.
Line 
1/*
2 * jdmerge.c
3 *
4 * Copyright (C) 1994-1996, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
7 *
8 * This file contains code for merged upsampling/color conversion.
9 *
10 * This file combines functions from jdsample.c and jdcolor.c;
11 * read those files first to understand what's going on.
12 *
13 * When the chroma components are to be upsampled by simple replication
14 * (ie, box filtering), we can save some work in color conversion by
15 * calculating all the output pixels corresponding to a pair of chroma
16 * samples at one time.  In the conversion equations
17 *      R = Y           + K1 * Cr
18 *      G = Y + K2 * Cb + K3 * Cr
19 *      B = Y + K4 * Cb
20 * only the Y term varies among the group of pixels corresponding to a pair
21 * of chroma samples, so the rest of the terms can be calculated just once.
22 * At typical sampling ratios, this eliminates half or three-quarters of the
23 * multiplications needed for color conversion.
24 *
25 * This file currently provides implementations for the following cases:
26 *      YCbCr => RGB color conversion only.
27 *      Sampling ratios of 2h1v or 2h2v.
28 *      No scaling needed at upsample time.
29 *      Corner-aligned (non-CCIR601) sampling alignment.
30 * Other special cases could be added, but in most applications these are
31 * the only common cases.  (For uncommon cases we fall back on the more
32 * general code in jdsample.c and jdcolor.c.)
33 */
34
35#define JPEG_INTERNALS
36#include "jinclude.h"
37#include "jpeglib.h"
38
39#ifdef UPSAMPLE_MERGING_SUPPORTED
40
#ifdef HAVE_MMX_INTEL_MNEMONICS
/*
 * 64-bit packed constants referenced by name from the inline assembly in
 * h2v2_merged_upsample_mmx.  Each value holds four 16-bit lanes; the lane
 * labels below are listed from the most significant lane to the least.
 */

/* Multipliers applied with pmulhw */
__int64 const1 = 0x59BA0000D24B59BA;    /* Cr_r Cr_b Cr_g Cr_r */
__int64 const2 = 0x00007168E9FA0000;    /* Cb_r Cb_b Cb_g Cb_r */
__int64 const5 = 0x0000D24B59BA0000;    /* Cr_b Cr_g Cr_r Cr_b */
__int64 const6 = 0x7168E9FA00007168;    /* Cb_b Cb_g Cb_r Cb_b */

/* Rounding terms added with paddsw: factors (One_Half/fix(x)) << 2 */
__int64 const05 = 0x0001000000000001;   /* Cr_r Cr_b Cr_g Cr_r */
__int64 const15 = 0x00000001FFFA0000;   /* Cb_r Cb_b Cb_g Cb_r */
__int64 const45 = 0x0000000000010000;   /* Cr_b Cr_g Cr_r Cr_b */
__int64 const55 = 0x0001FFFA00000001;   /* Cb_b Cb_g Cb_r Cb_b */
#endif
54
55/* Private subobject */
56
57typedef struct {
58  struct jpeg_upsampler pub;    /* public fields */
59
60  /* Pointer to routine to do actual upsampling/conversion of one row group */
61  JMETHOD(void, upmethod, (j_decompress_ptr cinfo,
62                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
63                           JSAMPARRAY output_buf));
64
65  /* Private state for YCC->RGB conversion */
66  int * Cr_r_tab;               /* => table for Cr to R conversion */
67  int * Cb_b_tab;               /* => table for Cb to B conversion */
68  INT32 * Cr_g_tab;             /* => table for Cr to G conversion */
69  INT32 * Cb_g_tab;             /* => table for Cb to G conversion */
70
71  /* For 2:1 vertical sampling, we produce two output rows at a time.
72   * We need a "spare" row buffer to hold the second output row if the
73   * application provides just a one-row buffer; we also use the spare
74   * to discard the dummy last row if the image height is odd.
75   */
76  JSAMPROW spare_row;
77  boolean spare_full;           /* T if spare buffer is occupied */
78
79  JDIMENSION out_row_width;     /* samples per output row */
80  JDIMENSION rows_to_go;        /* counts rows remaining in image */
81} my_upsampler;
82
83typedef my_upsampler * my_upsample_ptr;
84
/* Fixed-point arithmetic helpers for the conversion tables:
 *   SCALEBITS  number of fractional bits
 *   ONE_HALF   the value 0.5 in that fixed-point format (used for rounding)
 *   FIX(x)     floating-point constant x converted to fixed point, rounded
 */
#define SCALEBITS  16   /* speediest right-shift on some machines */
#define ONE_HALF   ((INT32) 1 << (SCALEBITS - 1))
#define FIX(x)     ((INT32) ((x) * (1L << SCALEBITS) + 0.5))
88
89
90/*
91 * Initialize tables for YCC->RGB colorspace conversion.
92 * This is taken directly from jdcolor.c; see that file for more info.
93 */
94
95LOCAL(void)
96build_ycc_rgb_table (j_decompress_ptr cinfo)
97{
98  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
99  int i;
100  INT32 x;
101  SHIFT_TEMPS
102
103  upsample->Cr_r_tab = (int *)
104    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
105                                (MAXJSAMPLE+1) * SIZEOF(int));
106  upsample->Cb_b_tab = (int *)
107    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
108                                (MAXJSAMPLE+1) * SIZEOF(int));
109  upsample->Cr_g_tab = (INT32 *)
110    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
111                                (MAXJSAMPLE+1) * SIZEOF(INT32));
112  upsample->Cb_g_tab = (INT32 *)
113    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
114                                (MAXJSAMPLE+1) * SIZEOF(INT32));
115
116  for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
117    /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
118    /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
119    /* Cr=>R value is nearest int to 1.40200 * x */
120    upsample->Cr_r_tab[i] = (int)
121                    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
122    /* Cb=>B value is nearest int to 1.77200 * x */
123    upsample->Cb_b_tab[i] = (int)
124                    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
125    /* Cr=>G value is scaled-up -0.71414 * x */
126    upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
127    /* Cb=>G value is scaled-up -0.34414 * x */
128    /* We also add in ONE_HALF so that need not do it in inner loop */
129    upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
130  }
131}
132
133
134/*
135 * Initialize for an upsampling pass.
136 */
137
138METHODDEF(void)
139start_pass_merged_upsample (j_decompress_ptr cinfo)
140{
141  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
142
143  /* Mark the spare buffer empty */
144  upsample->spare_full = FALSE;
145  /* Initialize total-height counter for detecting bottom of image */
146  upsample->rows_to_go = cinfo->output_height;
147}
148
149
150/*
151 * Control routine to do upsampling (and color conversion).
152 *
153 * The control routine just handles the row buffering considerations.
154 */
155
156METHODDEF(void)
157merged_2v_upsample (j_decompress_ptr cinfo,
158                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
159                    JDIMENSION in_row_groups_avail,
160                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
161                    JDIMENSION out_rows_avail)
162/* 2:1 vertical sampling case: may need a spare row. */
163{
164  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
165  JSAMPROW work_ptrs[2];
166  JDIMENSION num_rows;          /* number of rows returned to caller */
167
168  if (upsample->spare_full) {
169    /* If we have a spare row saved from a previous cycle, just return it. */
170    jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
171                      1, upsample->out_row_width);
172    num_rows = 1;
173    upsample->spare_full = FALSE;
174  } else {
175    /* Figure number of rows to return to caller. */
176    num_rows = 2;
177    /* Not more than the distance to the end of the image. */
178    if (num_rows > upsample->rows_to_go)
179      num_rows = upsample->rows_to_go;
180    /* And not more than what the client can accept: */
181    out_rows_avail -= *out_row_ctr;
182    if (num_rows > out_rows_avail)
183      num_rows = out_rows_avail;
184    /* Create output pointer array for upsampler. */
185    work_ptrs[0] = output_buf[*out_row_ctr];
186    if (num_rows > 1) {
187      work_ptrs[1] = output_buf[*out_row_ctr + 1];
188    } else {
189      work_ptrs[1] = upsample->spare_row;
190      upsample->spare_full = TRUE;
191    }
192    /* Now do the upsampling. */
193    (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
194  }
195
196  /* Adjust counts */
197  *out_row_ctr += num_rows;
198  upsample->rows_to_go -= num_rows;
199  /* When the buffer is emptied, declare this input row group consumed */
200  if (! upsample->spare_full)
201    (*in_row_group_ctr)++;
202}
203
204
205METHODDEF(void)
206merged_1v_upsample (j_decompress_ptr cinfo,
207                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
208                    JDIMENSION in_row_groups_avail,
209                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
210                    JDIMENSION out_rows_avail)
211/* 1:1 vertical sampling case: much easier, never need a spare row. */
212{
213  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
214
215  /* Just do the upsampling. */
216  (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
217                         output_buf + *out_row_ctr);
218  /* Adjust counts */
219  (*out_row_ctr)++;
220  (*in_row_group_ctr)++;
221}
222
223
224/*
225 * These are the routines invoked by the control routines to do
226 * the actual upsampling/conversion.  One row group is processed per call.
227 *
228 * Note: since we may be writing directly into application-supplied buffers,
229 * we have to be honest about the output width; we can't assume the buffer
230 * has been rounded up to an even width.
231 */
232
233
234/*
235 * Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
236 */
237
238METHODDEF(void)
239h2v1_merged_upsample (j_decompress_ptr cinfo,
240                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
241                      JSAMPARRAY output_buf)
242{
243 
244
245 my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
246  register int y, cred, cgreen, cblue;
247  int cb, cr;
248  register JSAMPROW outptr;
249  JSAMPROW inptr0, inptr1, inptr2;
250  JDIMENSION col;
251  /* copy these pointers into registers if possible */
252  register JSAMPLE * range_limit = cinfo->sample_range_limit;
253  int * Crrtab = upsample->Cr_r_tab;
254  int * Cbbtab = upsample->Cb_b_tab;
255  INT32 * Crgtab = upsample->Cr_g_tab;
256  INT32 * Cbgtab = upsample->Cb_g_tab;
257  SHIFT_TEMPS
258
259  inptr0 = input_buf[0][in_row_group_ctr];
260  inptr1 = input_buf[1][in_row_group_ctr];
261  inptr2 = input_buf[2][in_row_group_ctr];
262  outptr = output_buf[0];
263  /* Loop for each pair of output pixels */
264  for (col = cinfo->output_width >> 1; col > 0; col--) {
265    /* Do the chroma part of the calculation */
266    cb = GETJSAMPLE(*inptr1++);
267    cr = GETJSAMPLE(*inptr2++);
268    cred = Crrtab[cr];
269    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
270    cblue = Cbbtab[cb];
271    /* Fetch 2 Y values and emit 2 pixels */
272    y  = GETJSAMPLE(*inptr0++);
273    outptr[RGB_RED] =   range_limit[y + cred];
274    outptr[RGB_GREEN] = range_limit[y + cgreen];
275    outptr[RGB_BLUE] =  range_limit[y + cblue];
276    outptr += RGB_PIXELSIZE;
277    y  = GETJSAMPLE(*inptr0++);
278    outptr[RGB_RED] =   range_limit[y + cred];
279    outptr[RGB_GREEN] = range_limit[y + cgreen];
280    outptr[RGB_BLUE] =  range_limit[y + cblue];
281    outptr += RGB_PIXELSIZE;
282  }
283  /* If image width is odd, do the last output column separately */
284  if (cinfo->output_width & 1) {
285    cb = GETJSAMPLE(*inptr1);
286    cr = GETJSAMPLE(*inptr2);
287    cred = Crrtab[cr];
288    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
289    cblue = Cbbtab[cb];
290    y  = GETJSAMPLE(*inptr0);
291    outptr[RGB_RED] =   range_limit[y + cred];
292    outptr[RGB_GREEN] = range_limit[y + cgreen];
293    outptr[RGB_BLUE] =  range_limit[y + cblue];
294  }
295}
296
297
298/*
299 * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
300 */
301
302#ifdef HAVE_MMX_INTEL_MNEMONICS
303__inline METHODDEF(void)
304h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
305                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
306                      JSAMPARRAY output_buf);
307__inline METHODDEF(void)
308h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
309                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
310                      JSAMPARRAY output_buf);
311#endif
312 
313METHODDEF(void)
314h2v2_merged_upsample (j_decompress_ptr cinfo,
315                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
316                      JSAMPARRAY output_buf);
317
318#ifdef HAVE_MMX_INTEL_MNEMONICS
319METHODDEF(void)
320h2v2_merged_upsample (j_decompress_ptr cinfo,
321                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
322                      JSAMPARRAY output_buf)
323{
324if (MMXAvailable && (cinfo->image_width >= 8))
325        h2v2_merged_upsample_mmx (cinfo, input_buf, in_row_group_ctr, output_buf);
326else
327        h2v2_merged_upsample_orig (cinfo, input_buf, in_row_group_ctr, output_buf);
328
329}
330
331__inline METHODDEF(void)
332h2v2_merged_upsample_orig (j_decompress_ptr cinfo,
333                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
334                      JSAMPARRAY output_buf)
335{
336
337  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
338  register int y, cred, cgreen, cblue;
339  int cb, cr;
340  register JSAMPROW outptr0, outptr1;
341  JSAMPROW inptr00, inptr01, inptr1, inptr2;
342  JDIMENSION col;
343  /* copy these pointers into registers if possible */
344  register JSAMPLE * range_limit = cinfo->sample_range_limit;
345  int * Crrtab = upsample->Cr_r_tab;
346  int * Cbbtab = upsample->Cb_b_tab;
347  INT32 * Crgtab = upsample->Cr_g_tab;
348  INT32 * Cbgtab = upsample->Cb_g_tab;
349  SHIFT_TEMPS
350
351  inptr00 = input_buf[0][in_row_group_ctr*2];
352  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
353  inptr1 = input_buf[1][in_row_group_ctr];
354  inptr2 = input_buf[2][in_row_group_ctr];
355  outptr0 = output_buf[0];
356  outptr1 = output_buf[1];
357  /* Loop for each group of output pixels */
358  for (col = cinfo->output_width >> 1; col > 0; col--) {
359    /* Do the chroma part of the calculation */
360    cb = GETJSAMPLE(*inptr1++);
361    cr = GETJSAMPLE(*inptr2++);
362    cred = Crrtab[cr];
363    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
364    cblue = Cbbtab[cb];
365    /* Fetch 4 Y values and emit 4 pixels */
366    y  = GETJSAMPLE(*inptr00++);
367    outptr0[RGB_RED] =   range_limit[y + cred];
368    outptr0[RGB_GREEN] = range_limit[y + cgreen];
369    outptr0[RGB_BLUE] =  range_limit[y + cblue];
370    outptr0 += RGB_PIXELSIZE;
371    y  = GETJSAMPLE(*inptr00++);
372    outptr0[RGB_RED] =   range_limit[y + cred];
373    outptr0[RGB_GREEN] = range_limit[y + cgreen];
374    outptr0[RGB_BLUE] =  range_limit[y + cblue];
375    outptr0 += RGB_PIXELSIZE;
376    y  = GETJSAMPLE(*inptr01++);
377    outptr1[RGB_RED] =   range_limit[y + cred];
378    outptr1[RGB_GREEN] = range_limit[y + cgreen];
379    outptr1[RGB_BLUE] =  range_limit[y + cblue];
380    outptr1 += RGB_PIXELSIZE;
381    y  = GETJSAMPLE(*inptr01++);
382    outptr1[RGB_RED] =   range_limit[y + cred];
383    outptr1[RGB_GREEN] = range_limit[y + cgreen];
384    outptr1[RGB_BLUE] =  range_limit[y + cblue];
385    outptr1 += RGB_PIXELSIZE;
386  }
387  /* If image width is odd, do the last output column separately */
388  if (cinfo->output_width & 1) {
389    cb = GETJSAMPLE(*inptr1);
390    cr = GETJSAMPLE(*inptr2);
391    cred = Crrtab[cr];
392    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
393    cblue = Cbbtab[cb];
394    y  = GETJSAMPLE(*inptr00);
395    outptr0[RGB_RED] =   range_limit[y + cred];
396    outptr0[RGB_GREEN] = range_limit[y + cgreen];
397    outptr0[RGB_BLUE] =  range_limit[y + cblue];
398    y  = GETJSAMPLE(*inptr01);
399    outptr1[RGB_RED] =   range_limit[y + cred];
400    outptr1[RGB_GREEN] = range_limit[y + cgreen];
401    outptr1[RGB_BLUE] =  range_limit[y + cblue];
402  }
403}
404
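/*
 * Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical:
 * MMX inline-assembly version.  The assembly loop handles the output columns
 * in groups of 8; the remaining columns are finished by C code.
 */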
408__inline METHODDEF(void)
409h2v2_merged_upsample_mmx (j_decompress_ptr cinfo,
410                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
411                      JSAMPARRAY output_buf)
412{
413        // added for MMX
414  __int64 const128 = 0x0080008000800080;
415  __int64 empty = 0x0000000000000000;
416  __int64 davemask = 0x0000FFFFFFFF0000;
417  ////////////////////////////////
418
419  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
420  register int y, cred, cgreen, cblue;
421  int cb, cr;
422  register JSAMPROW outptr0, outptr1;
423  JSAMPROW inptr00, inptr01, inptr1, inptr2;
424  JDIMENSION col;
425  /* copy these pointers into registers if possible */
426  register JSAMPLE * range_limit = cinfo->sample_range_limit;
427  int * Crrtab = upsample->Cr_r_tab;
428  int * Cbbtab = upsample->Cb_b_tab;
429  INT32 * Crgtab = upsample->Cr_g_tab;
430  INT32 * Cbgtab = upsample->Cb_g_tab;
431  SHIFT_TEMPS
432 
433
434  // Added for MMX       
435  register int width = cinfo->image_width;
436  int cols = cinfo->output_width;
437  int cols_asm = (cols >> 3);
438  int diff = cols - (cols_asm<<3);
439  int cols_asm_copy = cols_asm;
440
441 ///////////////////////////////////////
442
443  inptr00 = input_buf[0][in_row_group_ctr*2];
444  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
445  inptr1 = input_buf[1][in_row_group_ctr];
446  inptr2 = input_buf[2][in_row_group_ctr];
447  outptr0 = output_buf[0];
448  outptr1 = output_buf[1];
449  /* Loop for each group of output pixels */
450
451           
452  _asm
453  {
454          mov esi, inptr00
455
456          mov eax, inptr01
457         
458          mov ebx, inptr2
459
460          mov ecx, inptr1
461
462          mov edi, outptr0
463
464          mov edx, outptr1
465
466do_next16:
467         
468          movd mm0, [ebx]                       ; Cr7 Cr6.....Cr1 Cr0
469
470          pxor mm6, mm6
471
472          punpcklbw mm0, mm0            ; Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0
473
474          movq mm7, const128
475
476          punpcklwd mm0, mm0            ; Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0
477
478          movq mm4, mm0
479
480          punpcklbw mm0, mm6            ; Cr0 Cr0 Cr0 Cr0
481
482          psubsw mm0, mm7                       ; Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128
483         
484          movd mm1, [ecx]                       ; Cb7 Cb6...... Cb1 Cb0
485                   
486          psllw mm0, 2                          ; left shift by 2 bits
487
488          punpcklbw mm1, mm1            ; Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0
489         
490          paddsw mm0, const05           ; add (one_half/fix(x)) << 2
491
492          punpcklwd mm1, mm1            ; Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0
493
494          movq mm5, mm1
495
496          pmulhw mm0, const1            ; multiply by (fix(x) >> 1)
497
498          punpcklbw mm1, mm6            ; Cb0 Cb0 Cb0 Cb0
499
500          punpckhbw mm4, mm6            ; Cr1 Cr1 Cr1 Cr1
501
502          psubsw mm1, mm7                       ; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
503
504          punpckhbw mm5, mm6            ; Cb1 Cb1 Cb1 Cb1
505
506          psllw mm1, 2                          ; left shift by 2 bits
507 
508          paddsw mm1, const15           ; add (one_half/fix(x)) << 2
509
510          psubsw mm4, mm7                       ; Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128
511                                               
512          psubsw mm5, mm7                       ; Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128
513
514          pmulhw mm1, const2            ; multiply by (fix(x) >> 1)
515
516          psllw mm4, 2                          ; left shift by 2 bits
517
518          psllw mm5, 2                          ; left shift by 2 bits
519
520          paddsw mm4, const45           ; add (one_half/fix(x)) << 2
521
522          movd mm7, [esi]                       ;  Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0
523
524          pmulhw mm4, const5            ; multiply by (fix(x) >> 1)
525
526          movq mm6, mm7
527
528          punpcklbw mm7, mm7            ; Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0
529
530          paddsw mm5, const55           ; add (one_half/fix(x)) << 2
531
532          paddsw  mm0, mm1                      ; cred0 cbl0 cgr0 cred0
533
534          movq mm1, mm7
535
536          pmulhw mm5, const6            ; multiply by (fix(x) >> 1)
537
538          movq  mm2, mm0                        ; cred0 cbl0 cgr0 cred0
539
540          punpcklwd mm7, mm6            ; Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0
541
542          pand mm2, davemask            ; 0 cbl0 cgr0 0
543
544          psrlq mm1, 16                         ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
545
546          psrlq mm2, 16                         ; 0 0 cbl0 cgr0
547
548          punpcklbw mm7, empty          ; Y1 Y0 Y0 Y0
549
550          paddsw mm4, mm5                       ; cbl1 cgr1 cred1 cbl1
551
552          movq  mm3, mm4                        ; cbl1 cgr1 cred1 cbl1
553
554          pand  mm3, davemask           ; 0 cgr1 cred1 0
555
556          paddsw mm7, mm0                       ; r1 b0 g0 r0
557
558          psllq mm3, 16                         ; cgr1 cred1 0 0
559
560          movq mm6, mm1                         ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
561       
562          por   mm2, mm3                        ; cgr1 cred1 cbl0 cgr0
563
564          punpcklbw mm6, empty          ; Y4 Y4 Y1 Y1
565
566          movd mm3, [eax]                       ; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
567         
568          paddsw mm6, mm2                       ; g4 r4 b1 g1
569
570          packuswb mm7, mm6                     ; g4 r4 b1 g1 r1 b0 g0 r0
571
572          movq mm6, mm3                         ; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
573
574          punpcklbw mm3, mm3            ; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
575
576          movq [edi], mm7                       ; move to memory g4 r4 b1 g1 r1 b0 g0 r0
577
578          movq mm5, mm3                         ; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
579
580          punpcklwd mm3, mm6            ; X X X X Y3 Y2 Y2 Y2
581
582          punpcklbw mm3, empty          ; Y3 Y2 Y2 Y2
583
584          psrlq mm5, 16                         ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
585
586          paddsw mm3, mm0                       ; r3 b2 g2 r2
587
588          movq mm6, mm5                         ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
589
590          movq mm0, mm1                         ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
591
592          punpckldq mm6, mm6            ; X X X X Y6 Y6 Y3 Y3
593
594          punpcklbw mm6, empty          ; Y6 Y6 Y3 Y3
595
596          psrlq mm1, 24                         ; 0 0 0 0 0 Y5 Y5 Y4
597         
598          paddsw mm6, mm2                       ; g6 r6 b3 g3
599
600          packuswb mm3, mm6                     ; g6 r6 b3 g3 r3 b2 g2 r2
601
602          movq mm2, mm5                         ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
603
604          psrlq mm0, 32                         ; 0 0 0 0 0 0 Y5 Y5
605
606          movq [edx], mm3                       ; move to memory g6 r6 b3 g3 r3 b2 g2 r2
607         
608          punpcklwd mm1, mm0            ; X X X X Y5 Y5 Y5 Y4
609
610          psrlq mm5, 24                         ; 0 0 0 0 0 Y7 Y7 Y6
611
612          movd mm0, [ebx]                       ; Cr9 Cr8.....Cr3 Cr2
613
614          psrlq mm2, 32                         ; 0 0 0 0 0 0 Y7 Y7     
615         
616          psrlq mm0, 16         
617
618          punpcklbw mm1, empty          ; Y5 Y5 Y5 Y4
619
620          punpcklwd mm5, mm2            ; X X X X Y7 Y7 Y7 Y6
621
622          paddsw mm1, mm4                       ; b5 g5 r5 b4
623         
624          punpcklbw mm5, empty          ; Y7 Y7 Y7 Y6       
625
626          pxor mm6, mm6                         ; clear mm6 registr
627         
628          punpcklbw mm0, mm0            ; X X X X Cr3 Cr3 Cr2 Cr2
629 
630          paddsw mm5, mm4                       ; b7 g7 r7 b6
631         
632          punpcklwd mm0, mm0            ; Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2
633
634          movq mm4, mm0
635
636          movd mm3, [ecx]                       ; Cb9 Cb8...... Cb3 Cb2
637         
638          punpcklbw mm0, mm6            ; Cr2 Cr2 Cr2 Cr2
639
640          psrlq mm3, 16
641
642          psubsw mm0, const128          ; Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128
643
644          punpcklbw mm3, mm3            ; X X X X Cb3 Cb3 Cb2 Cb2
645
646          psllw mm0, 2                          ; left shift by 2 bits
647
648          paddsw mm0, const05           ; add (one_half/fix(x)) << 2
649
650          punpcklwd mm3, mm3            ; Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2
651
652          movq mm7, mm3
653         
654          pmulhw mm0, const1            ; multiply by (fix(x) >> 1)               
655
656          punpcklbw mm3, mm6            ; Cb2 Cb2 Cb2 Cb2
657
658          psubsw mm3, const128          ; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
659
660          punpckhbw mm4, mm6            ; Cr3 Cr3 Cr3 Cr3
661         
662          psllw mm3, 2                          ; left shift by 2 bits
663
664          paddsw mm3, const15           ; add (one_half/fix(x)) << 2
665
666          punpckhbw mm7, mm6            ; Cb3 Cb3 Cb3 Cb3
667
668          pmulhw mm3, const2            ; multiply by (fix(x) >> 1)
669         
670          psubsw mm7, const128          ; Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128
671
672          paddsw  mm0, mm3                      ; cred2 cbl2 cgr2 cred2
673           
674          psllw mm7, 2                          ; left shift by 2 bits
675
676          psubsw mm4, const128          ; Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128
677         
678          movd mm3, [esi+4]                     ;  Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
679         
680          psllw mm4, 2                          ; left shift by 2 bits
681
682          paddsw mm7, const55           ; add (one_half/fix(x)) << 2
683                 
684          movq mm6, mm3                         ;  Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
685
686          movq  mm2, mm0
687                 
688          pand mm2, davemask
689
690          punpcklbw mm3, mm3            ; Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8
691
692          psrlq mm2, 16
693                 
694          paddsw mm4, const45           ; add (one_half/fix(x)) << 2
695
696          punpcklwd mm3, mm6            ; X X X X Y9 Y8 Y8 Y8
697         
698          pmulhw mm4, const5            ; multiply by (fix(x) >> 1)
699
700          pmulhw mm7, const6            ; multiply by (fix(x) >> 1)
701
702          punpcklbw mm3, empty          ; Y9 Y8 Y8 Y8
703         
704          paddsw mm4, mm7                       ; cbl3 cgr3 cred3 cbl3
705
706          paddsw mm3, mm0                       ; r9 b8 g8 r8
707
708          movq  mm7, mm4
709
710          packuswb mm1, mm3                     ; r9 b8 g8 r8 b5 g5 r5 b4
711
712          movd mm3, [eax+4]                     ; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
713         
714          pand  mm7, davemask
715
716          psrlq mm6, 8                          ; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
717
718          psllq mm7, 16
719                                                   
720          movq [edi+8], mm1                     ; move to memory r9 b8 g8 r8 b5 g5 r5 b4
721
722          por   mm2, mm7
723
724          movq mm7, mm3                         ; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
725
726          punpcklbw mm3, mm3            ; X X X X Y11 Y11 Y10 Y10
727
728          pxor mm1, mm1
729
730          punpcklwd mm3, mm7            ; X X X X Y11 Y10 Y10 Y10
731
732          punpcklbw mm3, mm1            ; Y11 Y10 Y10 Y10
733
734          psrlq mm7, 8                          ; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
735         
736          paddsw mm3, mm0                       ; r11 b10 g10 r10
737
738          movq mm0, mm7                         ; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
739
740          packuswb mm5, mm3                     ; r11 b10 g10 r10 b7 g7 r7 b6
741
742          punpcklbw mm7, mm7            ; X X X X Y14 Y14 Y11 Y11
743
744          movq [edx+8], mm5                     ; move to memory r11 b10 g10 r10 b7 g7 r7 b6
745
746          movq mm3, mm6                         ; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
747
748          punpcklbw mm6, mm6            ; X X X X Y12 Y12 Y9 Y9
749
750          punpcklbw mm7, mm1            ; Y14 Y14 Y11 Y11
751
752          punpcklbw mm6, mm1            ; Y12 Y12 Y9 Y9
753
754          paddsw mm7, mm2                       ; g14 r14 b11 g11
755
756          paddsw mm6, mm2                       ; g12 r12 b9 g9
757
758          psrlq mm3, 8                          ; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
759
760          movq mm1, mm3                         ; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
761
762          punpcklbw mm3, mm3            ; X X X X Y13 Y13 Y12 Y12
763
764          add esi, 8
765
766          psrlq mm3, 16                         ; X X X X X X Y13 Y13 modified on 09/24
767
768          punpcklwd mm1, mm3            ; X X X X Y13 Y13 Y13 Y12
769
770          add eax, 8
771
772          psrlq mm0, 8                          ; 0 0 Y23 Y22 Y19 Y18 Y15 Y14   
773
774          punpcklbw mm1, empty          ; Y13 Y13 Y13 Y12
775
776          movq mm5, mm0                         ; 0 0 Y23 Y22 Y19 Y18 Y15 Y14   
777
778          punpcklbw mm0, mm0            ; X X X X Y15 Y15 Y14 Y14
779
780          paddsw mm1, mm4                       ; b13 g13 r13 b12
781
782          psrlq mm0, 16                         ; X X X X X X Y15 Y15
783
784          add edi, 24
785         
786          punpcklwd mm5, mm0            ; X X X X Y15 Y15 Y15 Y14
787
788          packuswb mm6, mm1                     ; b13 g13 r13 b12 g12 r12 b9 g9
789
790          add edx, 24
791         
792          punpcklbw mm5, empty          ; Y15 Y15 Y15 Y14
793
794          add ebx, 4
795                 
796          paddsw mm5, mm4                       ; b15 g15 r15 b14
797
798          movq [edi-8], mm6             ; move to memory b13 g13 r13 b12 g12 r12 b9 g9
799
800          packuswb mm7, mm5                     ; b15 g15 r15 b14 g14 r14 b11 g11
801
802          add ecx, 4
803 
804          movq [edx-8], mm7             ; move to memory b15 g15 r15 b14 g14 r14 b11 g11
805
806          dec cols_asm
807         
808          jnz do_next16
809
810          EMMS
811                 
812          }
813
814         
815  inptr1 += (cols_asm_copy<<2);
816
817  inptr2 += (cols_asm_copy<<2);
818
819  inptr00 += (cols_asm_copy<<3);
820
821  inptr01 += (cols_asm_copy<<3);
822
823  outptr0 += cols_asm_copy*24;
824
825  outptr1 += cols_asm_copy*24;
826                 
827  //for (col = cinfo->output_width >> 1; col > 0; col--) {
828      /* Do the chroma part of the calculation */
829    /*cb = GETJSAMPLE(*inptr1++);
830    cr = GETJSAMPLE(*inptr2++);
831    cred = Crrtab[cr];
832    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
833    cblue = Cbbtab[cb];*/
834    /* Fetch 4 Y values and emit 4 pixels */
835    /*y  = GETJSAMPLE(*inptr00++);
836    outptr0[RGB_RED] =   range_limit[y + cred];
837    outptr0[RGB_GREEN] = range_limit[y + cgreen];
838    outptr0[RGB_BLUE] =  range_limit[y + cblue];
839    outptr0 += RGB_PIXELSIZE;
840    y  = GETJSAMPLE(*inptr00++);
841    outptr0[RGB_RED] =   range_limit[y + cred];
842    outptr0[RGB_GREEN] = range_limit[y + cgreen];
843    outptr0[RGB_BLUE] =  range_limit[y + cblue];
844    outptr0 += RGB_PIXELSIZE;
845    y  = GETJSAMPLE(*inptr01++);
846    outptr1[RGB_RED] =   range_limit[y + cred];
847    outptr1[RGB_GREEN] = range_limit[y + cgreen];
848    outptr1[RGB_BLUE] =  range_limit[y + cblue];
849    outptr1 += RGB_PIXELSIZE;
850    y  = GETJSAMPLE(*inptr01++);
851    outptr1[RGB_RED] =   range_limit[y + cred];
852    outptr1[RGB_GREEN] = range_limit[y + cgreen];
853    outptr1[RGB_BLUE] =  range_limit[y + cblue];
854    outptr1 += RGB_PIXELSIZE;
855  }       */
856
857
858  for (col = diff >> 1; col > 0; col--) {
859      /* Do the chroma part of the calculation */
860    cb = GETJSAMPLE(*inptr1++);
861    cr = GETJSAMPLE(*inptr2++);
862    cred = Crrtab[cr];
863    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
864    cblue = Cbbtab[cb];
865    /* Fetch 4 Y values and emit 4 pixels */
866    y  = GETJSAMPLE(*inptr00++);
867    outptr0[RGB_RED] =   range_limit[y + cred];
868    outptr0[RGB_GREEN] = range_limit[y + cgreen];
869    outptr0[RGB_BLUE] =  range_limit[y + cblue];
870    outptr0 += RGB_PIXELSIZE;
871    y  = GETJSAMPLE(*inptr00++);
872    outptr0[RGB_RED] =   range_limit[y + cred];
873    outptr0[RGB_GREEN] = range_limit[y + cgreen];
874    outptr0[RGB_BLUE] =  range_limit[y + cblue];
875    outptr0 += RGB_PIXELSIZE;
876    y  = GETJSAMPLE(*inptr01++);
877    outptr1[RGB_RED] =   range_limit[y + cred];
878    outptr1[RGB_GREEN] = range_limit[y + cgreen];
879    outptr1[RGB_BLUE] =  range_limit[y + cblue];
880    outptr1 += RGB_PIXELSIZE;
881    y  = GETJSAMPLE(*inptr01++);
882    outptr1[RGB_RED] =   range_limit[y + cred];
883    outptr1[RGB_GREEN] = range_limit[y + cgreen];
884    outptr1[RGB_BLUE] =  range_limit[y + cblue];
885    outptr1 += RGB_PIXELSIZE;
886  }       
887
888                                         
889  /* If image width is odd, do the last output column separately */
890  //if (cinfo->output_width & 1) {
891  if (diff & 1) {
892    cb = GETJSAMPLE(*inptr1);
893    cr = GETJSAMPLE(*inptr2);
894    cred = Crrtab[cr];
895    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
896    cblue = Cbbtab[cb];
897    y  = GETJSAMPLE(*inptr00);
898    outptr0[RGB_RED] =   range_limit[y + cred];
899    outptr0[RGB_GREEN] = range_limit[y + cgreen];
900    outptr0[RGB_BLUE] =  range_limit[y + cblue];
901    y  = GETJSAMPLE(*inptr01);
902    outptr1[RGB_RED] =   range_limit[y + cred];
903    outptr1[RGB_GREEN] = range_limit[y + cgreen];
904    outptr1[RGB_BLUE] =  range_limit[y + cblue];
905  }   
906}
907#else
908
909
910METHODDEF(void)
911h2v2_merged_upsample (j_decompress_ptr cinfo,
912                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
913                      JSAMPARRAY output_buf)
914{
915  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
916  register int y, cred, cgreen, cblue;
917  int cb, cr;
918  register JSAMPROW outptr0, outptr1;
919  JSAMPROW inptr00, inptr01, inptr1, inptr2;
920  JDIMENSION col;
921  /* copy these pointers into registers if possible */
922  register JSAMPLE * range_limit = cinfo->sample_range_limit;
923  int * Crrtab = upsample->Cr_r_tab;
924  int * Cbbtab = upsample->Cb_b_tab;
925  INT32 * Crgtab = upsample->Cr_g_tab;
926  INT32 * Cbgtab = upsample->Cb_g_tab;
927  SHIFT_TEMPS
928
929  inptr00 = input_buf[0][in_row_group_ctr*2];
930  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
931  inptr1 = input_buf[1][in_row_group_ctr];
932  inptr2 = input_buf[2][in_row_group_ctr];
933  outptr0 = output_buf[0];
934  outptr1 = output_buf[1];
935  /* Loop for each group of output pixels */
936  for (col = cinfo->output_width >> 1; col > 0; col--) {
937    /* Do the chroma part of the calculation */
938    cb = GETJSAMPLE(*inptr1++);
939    cr = GETJSAMPLE(*inptr2++);
940    cred = Crrtab[cr];
941    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
942    cblue = Cbbtab[cb];
943    /* Fetch 4 Y values and emit 4 pixels */
944    y  = GETJSAMPLE(*inptr00++);
945    outptr0[RGB_RED] =   range_limit[y + cred];
946    outptr0[RGB_GREEN] = range_limit[y + cgreen];
947    outptr0[RGB_BLUE] =  range_limit[y + cblue];
948    outptr0 += RGB_PIXELSIZE;
949    y  = GETJSAMPLE(*inptr00++);
950    outptr0[RGB_RED] =   range_limit[y + cred];
951    outptr0[RGB_GREEN] = range_limit[y + cgreen];
952    outptr0[RGB_BLUE] =  range_limit[y + cblue];
953    outptr0 += RGB_PIXELSIZE;
954    y  = GETJSAMPLE(*inptr01++);
955    outptr1[RGB_RED] =   range_limit[y + cred];
956    outptr1[RGB_GREEN] = range_limit[y + cgreen];
957    outptr1[RGB_BLUE] =  range_limit[y + cblue];
958    outptr1 += RGB_PIXELSIZE;
959    y  = GETJSAMPLE(*inptr01++);
960    outptr1[RGB_RED] =   range_limit[y + cred];
961    outptr1[RGB_GREEN] = range_limit[y + cgreen];
962    outptr1[RGB_BLUE] =  range_limit[y + cblue];
963    outptr1 += RGB_PIXELSIZE;
964  }
965  /* If image width is odd, do the last output column separately */
966  if (cinfo->output_width & 1) {
967    cb = GETJSAMPLE(*inptr1);
968    cr = GETJSAMPLE(*inptr2);
969    cred = Crrtab[cr];
970    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
971    cblue = Cbbtab[cb];
972    y  = GETJSAMPLE(*inptr00);
973    outptr0[RGB_RED] =   range_limit[y + cred];
974    outptr0[RGB_GREEN] = range_limit[y + cgreen];
975    outptr0[RGB_BLUE] =  range_limit[y + cblue];
976    y  = GETJSAMPLE(*inptr01);
977    outptr1[RGB_RED] =   range_limit[y + cred];
978    outptr1[RGB_GREEN] = range_limit[y + cgreen];
979    outptr1[RGB_BLUE] =  range_limit[y + cblue];
980  }
981}
982#endif
983
984
985/*
986 * Module initialization routine for merged upsampling/color conversion.
987 *
988 * NB: this is called under the conditions determined by use_merged_upsample()
989 * in jdmaster.c.  That routine MUST correspond to the actual capabilities
990 * of this module; no safety checks are made here.
991 */
992
993GLOBAL(void)
994jinit_merged_upsampler (j_decompress_ptr cinfo)
995{
996  my_upsample_ptr upsample;
997
998  upsample = (my_upsample_ptr)
999    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1000                                SIZEOF(my_upsampler));
1001  cinfo->upsample = (struct jpeg_upsampler *) upsample;
1002  upsample->pub.start_pass = start_pass_merged_upsample;
1003  upsample->pub.need_context_rows = FALSE;
1004
1005  upsample->out_row_width = cinfo->output_width * cinfo->out_color_components;
1006
1007  if (cinfo->max_v_samp_factor == 2) {
1008    upsample->pub.upsample = merged_2v_upsample;
1009    upsample->upmethod = h2v2_merged_upsample;
1010    /* Allocate a spare row buffer */
1011    upsample->spare_row = (JSAMPROW)
1012      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
1013                (size_t) (upsample->out_row_width * SIZEOF(JSAMPLE)));
1014  } else {
1015    upsample->pub.upsample = merged_1v_upsample;
1016    upsample->upmethod = h2v1_merged_upsample;
1017    /* No spare row needed */
1018    upsample->spare_row = NULL;
1019  }
1020
1021  build_ycc_rgb_table(cinfo);
1022}
1023
1024#endif /* UPSAMPLE_MERGING_SUPPORTED */
Note: See TracBrowser for help on using the repository browser.