Commits

Anonymous committed 97d722d

Added fixes from the main trunk for sdlmixer.sndarray
Added filter backend switch support from the main trunk.
Added minor blit macro optimisations from the main trunk.

  • Participants
  • Parent commits 46addbe
  • Branches pgreloaded

Comments (0)

Files changed (19)

 CRITICAL:
 =========
 * recheck and redesign X11 implementation for pygame.sdlext.scrap
-* merge transform/scale asm changes in rev. 1657:1669
 * complete physics collision and contacts and merge it back to the branch
 * make LayeredGroups class independent from sdl surfaces, where possible
 * check anything for possible integer/float overflows
 * use copy.copy and copy.deepcopy for consistent object copies
-* Merge transform smooth scale changes in rev. 1715:1717
 * Rewrite and fix up numpysurfarray and numpysndarray
 * Add prebuilt package for Win32 VC++ builds.
 * Refine quitting for SDL_QuitSubSystem wrapper.
 * Check display surface tracking for multiple calls to set_mode using
   different return variables.
 * Argument parsing must handle 64-bit conversions correctly.
+* Add palette color support to sdlext.transform (trunk rev. 2242).
+* Check trunk rev. 1918, 1921, 1922, 1933, 1953 (blit blend operations).
+* Check trunk rev. 1937, 1947 (blit blend for self).
+* Add surface.scroll (trunk rev. 1951).
 
 Things to ADD:
 ==============

doc/src/sdlexttransform.xml

       :class:`pygame2.sdl.video.Surface` with the same dimensions. 
     </desc>
   </func>
+  <func name="get_filtertype">
+    <call>get_filtertype () -> int</call>
+    <desc>
+      Gets the currently set filter type.
+    </desc>
+  </func>
+  <func name="set_filtertype">
+    <call>set_filtertype (type) -> int</call>
+    <desc>
+      Sets the filters to use to one of the supported filter types.
+      
+      TODO
+    </desc>
+  </func>
   <func name="laplacian">
     <call>laplacian (surface[, destsurface]) -> Surface</call>
     <desc>
     """
     read the fonts on unix
     """
+    import subprocess
     fonts = {}
 
     # we use the fc-list from fontconfig to get a list of fonts.
 
     try:
-        # note, we use popen3 for if fc-list isn't there to stop stderr
-        # printing.
-        flin, flout, flerr = os.popen3('fc-list : file family style')
-    except:
+        flout, flerr = subprocess.Popen('fc-list : file family style', shell=True,
+                                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                        close_fds=True).communicate()
+    except Exception:
         return fonts
 
     try:

lib/sdlmixer/numpysndarray.py

     else:
         data = sound.get_buffer ()
 
-    shape = (len (data) / channels * fmtbytes, )
+    shape = (len (data) // fmtbytes, )
     if channels > 1:
-        shape = (shape[0], 2)
+        shape = (shape[0] // channels, channels)
 
     # mixer.init () does not support different formats from the ones below,
     # so MSB/LSB stuff is silently ignored.
     if not info:
         raise pygame2.Error ("Mixer not initialized")
     fmtbytes = (abs (info[1]) & 0xff) >> 3
-    channels = mixer.get_num_channels ()
+    channels = info[2]
     data = sound.get_buffer ()
 
-    shape = (data.length / channels * fmtbytes, )
+    shape = (data.length // fmtbytes, )
     if channels > 1:
-        shape = (shape[0], 2)
+        shape = (shape[0] // channels, channels)
         
     # mixer.init () does not support different formats from the ones below,
     # so MSB/LSB stuff is silently ignored.

src/freetype/ft_font.c

     /* TODO */
 }
 
-
-
 /****************************************************
  * C API CALLS
  ****************************************************/

src/freetype/ft_wrap.c

         len = strlen(latin1_buffer);
 
         utf16_buffer = malloc((len + 1) * sizeof(FT_UInt16));
+        if (!utf16_buffer)
+            return NULL;
 
         for (i = 0; i < len; ++i)
             utf16_buffer[i] = (FT_UInt16)latin1_buffer[i];
     if (ft->library)
         FT_Done_FreeType(ft->library);
 
-    free(ft->_error_msg);
-    free(ft);
+    if (ft->_error_msg)
+        free (ft->_error_msg);
+    
+    free (ft);
 }
 
 int

src/freetype/pgfreetype.h

 /*
   pygame - Python Game Library
-  Copyright (C) 2000-2001 Pete Shinners
-  Copyright (C) 2008 Marcus von Appen
   Copyright (C) 2009 Vicent Marti
 
   This library is free software; you can redistribute it and/or

src/mask/maskmod.c

 _bitmask_threshold (bitmask_t *m, SDL_Surface *surf, SDL_Surface *surf2, 
     Uint32 color,  Uint32 threshold)
 {
-    int x, y, rshift, gshift, bshift, rshift2, gshift2, bshift2;
+    int x, y, rshift, gshift, bshift, rshift2, gshift2, bshift2, bpp1, bpp2;
     int rloss, gloss, bloss, rloss2, gloss2, bloss2;
     Uint8 *pixels, *pixels2;
     SDL_PixelFormat *format, *format2;
     rloss = format->Rloss;
     gloss = format->Gloss;
     bloss = format->Bloss;
+    bpp1 = format->BytesPerPixel;
 
     if (surf2)
     {
         gloss2 = format2->Gloss;
         bloss2 = format2->Bloss;
         pixels2 = (Uint8 *) surf2->pixels;
+        bpp2 = format2->BytesPerPixel;
     }
     else
     {
         rloss2 = gloss2 = bloss2 = 0;
         format2 = NULL;
         pixels2 = NULL;
+        bpp2 = 0;
     }
 
     SDL_GetRGBA (color, format, &r, &g, &b, &a);
         for (x=0; x < surf->w; x++)
         {
             /* the_color = surf->get_at(x,y) */
-            switch (format->BytesPerPixel)
+            switch (bpp1)
             {
             case 1:
                 the_color = (Uint32)*((Uint8 *) pixels);
 
             if (surf2)
             {
-                switch (format2->BytesPerPixel)
+                switch (bpp2)
                 {
                 case 1:
                     the_color2 = (Uint32)*((Uint8 *) pixels2);

src/sdl/surface.h

             ((Uint8)((argb & 0xff000000) >> 24)));                      \
     }
 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+#define SET_PIXEL24(buf,format,rgb)                                     \
+    *((buf) + ((format)->Rshift >> 3)) = (rgb)[0];                      \
+    *((buf) + ((format)->Gshift >> 3)) = (rgb)[1];                      \
+    *((buf) + ((format)->Bshift >> 3)) = (rgb)[2];
+#else
+#define SET_PIXEL24(buf,format,rgb)                                     \
+    *((buf) + 2 - ((format)->Rshift >> 3)) = (rgb)[0];                  \
+    *((buf) + 2 - ((format)->Gshift >> 3)) = (rgb)[1];                  \
+    *((buf) + 2 - ((format)->Bshift >> 3)) = (rgb)[2];
+#endif
+
 #define SET_PIXEL_AT(surface,format,_x,_y,color)                        \
     if ((_x) >= (surface)->clip_rect.x &&                               \
         (_x) <= (surface)->clip_rect.x + (surface)->clip_rect.w &&      \
             SDL_GetRGB ((color), (format), _rgb, _rgb+1, _rgb+2);       \
             _buf = (Uint8*)(((Uint8*)(surface)->pixels) + (_y) *        \
                 (surface)->pitch) + (_x) * 3;                           \
-            if (SDL_BYTEORDER == SDL_LIL_ENDIAN)                        \
-            {                                                           \
-                *(_buf + ((format)->Rshift >> 3)) = _rgb[0];            \
-                *(_buf + ((format)->Gshift >> 3)) = _rgb[1];            \
-                *(_buf + ((format)->Bshift >> 3)) = _rgb[2];            \
-            }                                                           \
-            else                                                        \
-            {                                                           \
-                *(_buf + 2 - ((format)->Rshift >> 3)) = _rgb[0];        \
-                *(_buf + 2 - ((format)->Gshift >> 3)) = _rgb[1];        \
-                *(_buf + 2 - ((format)->Bshift >> 3)) = _rgb[2];        \
-            }                                                           \
+            SET_PIXEL24(_buf, format, rgb);                             \
             break;                                                      \
         }                                                               \
         }                                                               \
     }
 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+#define GET_PIXEL_24(b) (b[0] + (b[1] << 8) + (b[2] << 16))
+#else
+#define GET_PIXEL_24(b) (b[2] + (b[1] << 8) + (b[0] << 16))
+#endif
+
 #define GET_PIXEL_AT(pxl,surface,bpp,_x,_y)                             \
     switch ((bpp))                                                      \
     {                                                                   \
     {                                                                   \
         Uint8* buf = ((Uint8 *) (((Uint8*)(surface)->pixels) + (_y) *   \
                 (surface)->pitch) + (_x) * 3);                          \
-        pxl = (SDL_BYTEORDER == SDL_LIL_ENDIAN) ?                       \
-            buf[0] + (buf[1] << 8) + (buf[2] << 16) :                   \
-            (buf[0] << 16) + (buf[1] << 8) + buf[2];                    \
+        pxl = GET_PIXEL_24(b);                                          \
         break;                                                          \
     }                                                                   \
     }
     default:                                      \
     {                                             \
         Uint8 *b = (Uint8 *) source;              \
-        pxl = (SDL_BYTEORDER == SDL_LIL_ENDIAN) ? \
-            b[0] + (b[1] << 8) + (b[2] << 16) :   \
-            (b[0] << 16) + (b[1] << 8) + b[2];    \
+        pxl = GET_PIXEL_24(b);                    \
     }                                             \
     break;                                        \
     }

src/sdlext/constantsmod.c

 #include "sdlextmod.h"
 #include "pgsdlext.h"
 #include "scrap.h"
+#include "filters.h"
 
 /* macros used to create each constant */
 #define DEC_CONSTS(x)  PyModule_AddIntConstant(module, #x, (int) #x)
     ADD_STRING_CONST (SCRAP_FORMAT_PPM);
     ADD_STRING_CONST (SCRAP_FORMAT_PBM);
 
+    DEC_CONSTS (FILTER_C);
+    DEC_CONSTS (FILTER_MMX);
+    DEC_CONSTS (FILTER_SSE);
+    
     MODINIT_RETURN(module);
 fail:
     Py_XDECREF (module);

src/sdlext/filters.c

 
 #include "filters.h"
 
+#if defined(__GNUC__)
+#if defined(__x86_64__)
+#include "filters_64.c"
+#elif defined(__i386__)
+#include "filters_32.c"
+#endif
+#endif /* __GNUC__ */
+
+FilterType
+pyg_filter_init_filterfuncs (FilterFuncs *filters, FilterType type)
+{
+    if (!filters)
+        return 0;
+    
+    filters->type = FILTER_C;
+    filters->shrink_X = pyg_filter_shrink_X_C;
+    filters->shrink_Y = pyg_filter_shrink_Y_C;
+    filters->expand_X = pyg_filter_expand_X_C;
+    filters->expand_Y = pyg_filter_expand_Y_C;
+    
+#if defined(FILTERS_SUPPORT_MMX)
+    if (type == FILTER_MMX && SDL_HasMMX ())
+    {
+        filters->type = FILTER_MMX;
+        filters->shrink_X = pyg_filter_shrink_X_MMX;
+        filters->shrink_Y = pyg_filter_shrink_Y_MMX;
+        filters->expand_X = pyg_filter_expand_X_MMX;
+        filters->expand_Y = pyg_filter_expand_Y_MMX;
+    }
+#endif /* FILTERS_SUPPORT_MMX */
+
+#if defined(FILTERS_SUPPORT_SSE)
+    if (type == FILTER_SSE && SDL_HasSSE ())
+    {
+        filters->type = FILTER_SSE;
+        filters->shrink_X = pyg_filter_shrink_X_SSE;
+        filters->shrink_Y = pyg_filter_shrink_Y_SSE;
+        filters->expand_X = pyg_filter_expand_X_SSE;
+        filters->expand_Y = pyg_filter_expand_Y_SSE;
+    }
+#endif /* FILTERS_SUPPORT_SSE */
+    return filters->type;
+}
+
 /* this function implements an area-averaging shrinking filter in the
  * X-dimension */
 void
     int x, y;
 
     int xspace = 0x10000 * srcwidth / dstwidth; /* must be > 1 */
-    int xrecip = (int) ((long long) 0x100000000 / xspace);
+    int xrecip = (int) (0x100000000LL / xspace);
     for (y = 0; y < height; y++)
     {
         Uint16 accumulate[4] = {0,0,0,0};
     }
 }
 
-void
-pyg_filter_shrink_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
-    int srcpitch, int dstpitch, int srcwidth, int dstwidth)
-{
-    int srcdiff = srcpitch - (srcwidth * 4);
-    int dstdiff = dstpitch - (dstwidth * 4);
-
-    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
-    int xrecip = (int) ((long long) 0x040000000 / xspace);
-    long long One64 = 0x4000400040004000ULL;
-#if defined(__GNUC__) && defined(__x86_64__)
-    long long srcdiff64 = srcdiff;
-    long long dstdiff64 = dstdiff;
-    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " movl             %5,      %%ecx;           " /* ecx == xcounter */
-        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          " /* inner X-loop */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm2;           "
-        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              4f;                       "
-        "3:                                          " /* prepare to output a pixel */
-        " movd          %%ecx,      %%mm2;           "
-        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
-        " punpcklwd     %%mm2,      %%mm2;           "
-        " punpckldq     %%mm2,      %%mm2;           "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm3,      %%mm5;           "
-        " movq          %%mm3,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm5,      %%mm3;           "
-        " paddw         %%mm6,      %%mm3;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
-        " movq          %%mm7,      %%mm5;           "
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm7,      %%mm6;           "
-        " pmulhw        %%mm7,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " packuswb      %%mm0,      %%mm2;           "
-        " movd          %%mm2,       (%1);           "
-        " add              %5,      %%ecx;           "
-        " add              $4,         %1;           "
-        " subl        $0x4000,      %%ecx;           "
-        "4:                                          " /* tail of inner X-loop */
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
-        : "m"(One64),   "m"(height), "m"(srcwidth),
-          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
-        : "%ecx","%edx"               /* clobbered */
-        );
-#elif defined(__GNUC__) && defined(__i386__)
-    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " movl             %5,      %%ecx;           " /* ecx == xcounter */
-        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          " /* inner X-loop */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm2;           "
-        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              4f;                       "
-        "3:                                          " /* prepare to output a pixel */
-        " movd          %%ecx,      %%mm2;           "
-        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
-        " punpcklwd     %%mm2,      %%mm2;           "
-        " punpckldq     %%mm2,      %%mm2;           "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm3,      %%mm5;           "
-        " movq          %%mm3,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm5,      %%mm3;           "
-        " paddw         %%mm6,      %%mm3;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
-        " movq          %%mm7,      %%mm5;           "
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm7,      %%mm6;           "
-        " pmulhw        %%mm7,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " packuswb      %%mm0,      %%mm2;           "
-        " movd          %%mm2,       (%1);           "
-        " add              %5,      %%ecx;           "
-        " add              $4,         %1;           "
-        " subl        $0x4000,      %%ecx;           "
-        "4:                                          " /* tail of inner X-loop */
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
-        : "m"(One64),   "m"(height), "m"(srcwidth),
-          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
-        : "%ecx","%edx"     /* clobbered */
-        );
-#endif
-}
-
 /* this function implements an area-averaging shrinking filter in the
  * Y-dimension */
 void
     int dstdiff = dstpitch - (width * 4);
     int x, y;
     int yspace = 0x10000 * srcheight / dstheight; /* must be > 1 */
-    int yrecip = (int) ((long long) 0x100000000 / yspace);
+    int yrecip = (int) (0x100000000LL / yspace);
     int ycounter = yspace;
 
     /* allocate and clear a memory area for storing the accumulator line */
     free (templine);
 }
 
-/* this function implements an area-averaging shrinking filter in the
- * Y-dimension */
-void
-pyg_filter_shrink_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
-    int dstpitch, int srcheight, int dstheight)
-{
-    Uint16 *templine;
-    int srcdiff = srcpitch - (width * 4);
-    int dstdiff = dstpitch - (width * 4);
-    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
-    int yrecip = (int) ((long long) 0x040000000 / yspace);
-    long long One64 = 0x4000400040004000ULL;
-
-    /* allocate and clear a memory area for storing the accumulator line */
-    templine = (Uint16 *) malloc((size_t) (dstpitch * 2));
-    if (templine == NULL)
-        return;
-    memset(templine, 0, (size_t) (dstpitch * 2));
-
-#if defined(__GNUC__) && defined(__x86_64__)
-    long long srcdiff64 = srcdiff;
-    long long dstdiff64 = dstdiff;
-    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
-        " movl             %5,      %%ecx;           " /* ecx == ycounter */
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " mov              %2,      %%rax;           " /* rax == accumulate */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          "
-        " movd           (%0),      %%mm1;           "
-        " add              $4,         %0;           "
-        " movq        (%%rax),      %%mm2;           "
-        " punpcklbw     %%mm0,      %%mm1;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm2,    (%%rax);           "
-        " add              $8,      %%rax;           "
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              6f;                       "
-        "3:                                          " /* prepare to output a line */
-        " movd          %%ecx,      %%mm1;           "
-        " movl             %4,      %%edx;           " /* edx = width */
-        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
-        " punpcklwd     %%mm1,      %%mm1;           "
-        " punpckldq     %%mm1,      %%mm1;           "
-        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
-        "4:                                          "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
-        " movq          %%mm6,      %%mm3;           "
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm3,      %%mm0;           "
-        " movq          %%mm3,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm4,      %%mm2;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm0,      %%mm3;           "
-        " paddw         %%mm2,      %%mm3;           "
-        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm1,      %%mm2;           "
-        " pmulhw        %%mm1,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " movq          %%mm3,    (%%rax);           "
-        " paddw         %%mm5,      %%mm4;           "
-        " add              $8,      %%rax;           "
-        " movq          %%mm7,      %%mm0;           "
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm7,      %%mm2;           "
-        " pmulhw        %%mm7,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " pxor          %%mm0,      %%mm0;           "
-        " packuswb      %%mm0,      %%mm4;           "
-        " movd          %%mm4,       (%1);           "
-        " add              $4,         %1;           "
-        " decl          %%edx;                       "
-        " jne              4b;                       "
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " addl             %5,      %%ecx;           "
-        " subl        $0x4000,      %%ecx;           "
-        "6:                                          " /* tail of outer Y-loop */
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
-        : "m"(templine),"m"(srcheight), "m"(width),     "m"(yspace),  
-          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
-        : "%ecx","%edx","%rax"          /* clobbered */
-        );
-#elif defined(__GNUC__) && defined(__i386__)
-    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
-        " movl             %5,      %%ecx;           " /* ecx == ycounter */
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " movl             %2,      %%eax;           " /* rax == accumulate */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          "
-        " movd           (%0),      %%mm1;           "
-        " add              $4,         %0;           "
-        " movq        (%%eax),      %%mm2;           "
-        " punpcklbw     %%mm0,      %%mm1;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm2,    (%%eax);           "
-        " add              $8,      %%eax;           "
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              6f;                       "
-        "3:                                          " /* prepare to output a line */
-        " movd          %%ecx,      %%mm1;           "
-        " movl             %4,      %%edx;           " /* edx = width */
-        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
-        " punpcklwd     %%mm1,      %%mm1;           "
-        " punpckldq     %%mm1,      %%mm1;           "
-        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
-        "4:                                          "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
-        " movq          %%mm6,      %%mm3;           "
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm3,      %%mm0;           "
-        " movq          %%mm3,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm4,      %%mm2;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm0,      %%mm3;           "
-        " paddw         %%mm2,      %%mm3;           "
-        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm1,      %%mm2;           "
-        " pmulhw        %%mm1,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " movq          %%mm3,    (%%eax);           "
-        " paddw         %%mm5,      %%mm4;           "
-        " add              $8,      %%eax;           "
-        " movq          %%mm7,      %%mm0;           "
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm7,      %%mm2;           "
-        " pmulhw        %%mm7,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " pxor          %%mm0,      %%mm0;           "
-        " packuswb      %%mm0,      %%mm4;           "
-        " movd          %%mm4,       (%1);           "
-        " add              $4,         %1;           "
-        " decl          %%edx;                       "
-        " jne              4b;                       "
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " addl             %5,      %%ecx;           "
-        " subl        $0x4000,      %%ecx;           "
-        "6:                                          " /* tail of outer Y-loop */
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
-        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
-          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
-        : "%ecx","%edx","%eax"           /* clobbered */
-        );
-
-#endif
-
-    /* free the temporary memory */
-    free (templine);
-}
-
 /* this function implements a bilinear filter in the X-dimension */
 void
 pyg_filter_expand_X_C (Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
     free (xmult1);
 }
 
-/* this function implements a bilinear filter in the X-dimension */
-void
-pyg_filter_expand_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
-    int srcpitch, int dstpitch, int srcwidth, int dstwidth)
-{
-    int *xidx0, *xmult0, *xmult1;
-    int x, y;
-    int factorwidth = 8;
-
-    /* Allocate memory for factors */
-    xidx0 = malloc((size_t) (dstwidth * 4));
-    if (xidx0 == NULL)
-        return;
-    xmult0 = (int *) malloc((size_t) (dstwidth * factorwidth));
-    xmult1 = (int *) malloc((size_t) (dstwidth * factorwidth));
-    if (xmult0 == NULL || xmult1 == NULL)
-    {
-        free (xidx0);
-        if (xmult0)
-            free (xmult0);
-        if (xmult1)
-            free (xmult1);
-    }
-
-    /* Create multiplier factors and starting indices and put them in arrays */
-    for (x = 0; x < dstwidth; x++)
-    {
-        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
-        int xm0 = 0x100 - xm1;
-        xidx0[x] = x * (srcwidth - 1) / dstwidth;
-        xmult1[x*2]   = xm1 | (xm1 << 16);
-        xmult1[x*2+1] = xm1 | (xm1 << 16);
-        xmult0[x*2]   = xm0 | (xm0 << 16);
-        xmult0[x*2+1] = xm0 | (xm0 << 16);
-    }
-
-    /* Do the scaling in raster order so we don't trash the cache */
-    for (y = 0; y < height; y++)
-    {
-        Uint8 *srcrow0 = srcpix + y * srcpitch;
-        Uint8 *dstrow = dstpix + y * dstpitch;
-        int *xm0 = xmult0;
-        int *xm1 = xmult1;
-        int *x0 = xidx0;
-#if defined(__GNUC__) && defined(__x86_64__)
-        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
-             " movl             %5,      %%ecx;           "
-             " pxor          %%mm0,      %%mm0;           "
-             "1:                                          "
-             " movsxl         (%3),      %%rax;           " /* get xidx0[x] */
-             " add              $4,         %3;           "
-             " movq           (%0),      %%mm1;           " /* load mult0 */
-             " add              $8,         %0;           "
-             " movq           (%1),      %%mm2;           " /* load mult1 */
-             " add              $8,         %1;           "
-             " movd   (%4,%%rax,4),      %%mm4;           "
-             " movd  4(%4,%%rax,4),      %%mm5;           "
-             " punpcklbw     %%mm0,      %%mm4;           "
-             " punpcklbw     %%mm0,      %%mm5;           "
-             " pmullw        %%mm1,      %%mm4;           "
-             " pmullw        %%mm2,      %%mm5;           "
-             " paddw         %%mm4,      %%mm5;           "
-             " psrlw            $8,      %%mm5;           "
-             " packuswb      %%mm0,      %%mm5;           "
-             " movd          %%mm5,       (%2);           "
-             " add              $4,         %2;           "
-             " decl          %%ecx;                       "
-             " jne              1b;                       "
-             " emms;                                      "
-             : "+r"(xm0),   "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
-             : "r"(srcrow0),"m"(dstwidth)  /* input */
-             : "%ecx","%rax"                /* clobbered */
-             );
-#elif defined(__GNUC__) && defined(__i386__)
-        int width = dstwidth;
-        long long One64 = 0x0100010001000100;
-        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
-             " pxor          %%mm0,      %%mm0;           "
-             " movq             %5,      %%mm7;           "
-             "1:                                          "
-             " movl           (%2),      %%eax;           " /* get xidx0[x] */
-             " add              $4,         %2;           "
-             " movq          %%mm7,      %%mm2;           "
-             " movq           (%0),      %%mm1;           " /* load mult0 */
-             " add              $8,         %0;           "
-             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
-             " movd   (%4,%%eax,4),      %%mm4;           "
-             " movd  4(%4,%%eax,4),      %%mm5;           "
-             " punpcklbw     %%mm0,      %%mm4;           "
-             " punpcklbw     %%mm0,      %%mm5;           "
-             " pmullw        %%mm1,      %%mm4;           "
-             " pmullw        %%mm2,      %%mm5;           "
-             " paddw         %%mm4,      %%mm5;           "
-             " psrlw            $8,      %%mm5;           "
-             " packuswb      %%mm0,      %%mm5;           "
-             " movd          %%mm5,       (%1);           "
-             " add              $4,         %1;           "
-             " decl             %3;                       "
-             " jne              1b;                       "
-             " emms;                                      "
-             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
-             : "S"(srcrow0), "m"(One64)    /* input */
-             : "%eax"            /* clobbered */
-             );
-#endif
-    }
-
-    /* free memory */
-    free (xidx0);
-    free (xmult0);
-    free (xmult1);
-}
-
 /* this function implements a bilinear filter in the Y-dimension */
 void
 pyg_filter_expand_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
         }
     }
 }
-
-/* this function implements a bilinear filter in the Y-dimension */
-void
-pyg_filter_expand_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
-    int dstpitch, int srcheight, int dstheight)
-{
-    int y;
-
-    for (y = 0; y < dstheight; y++)
-    {
-        int yidx0 = y * (srcheight - 1) / dstheight;
-        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
-        Uint8 *srcrow1 = srcrow0 + srcpitch;
-        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
-        int ymult0 = 0x0100 - ymult1;
-        Uint8 *dstrow = dstpix + y * dstpitch;
-#if defined(__GNUC__) && defined(__x86_64__)
-        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
-             " movl          %5,      %%ecx;                      "
-             " movd          %3,      %%mm1;                      "
-             " movd          %4,      %%mm2;                      "
-             " pxor       %%mm0,      %%mm0;                      "
-             " punpcklwd  %%mm1,      %%mm1;                      "
-             " punpckldq  %%mm1,      %%mm1;                      "
-             " punpcklwd  %%mm2,      %%mm2;                      "
-             " punpckldq  %%mm2,      %%mm2;                      "
-             "1:                                                  "
-             " movd        (%0),      %%mm4;                      "
-             " add           $4,         %0;                      "
-             " movd        (%1),      %%mm5;                      "
-             " add           $4,         %1;                      "
-             " punpcklbw  %%mm0,      %%mm4;                      "
-             " punpcklbw  %%mm0,      %%mm5;                      "
-             " pmullw     %%mm1,      %%mm4;                      "
-             " pmullw     %%mm2,      %%mm5;                      "
-             " paddw      %%mm4,      %%mm5;                      "
-             " psrlw         $8,      %%mm5;                      "
-             " packuswb   %%mm0,      %%mm5;                      "
-             " movd       %%mm5,       (%2);                      "
-             " add           $4,         %2;                      "
-             " decl       %%ecx;                                  "
-             " jne           1b;                                  "
-             " emms;                                              "
-             : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow)   /* outputs */
-             : "m"(ymult0),   "m"(ymult1),   "m"(width)    /* input */
-             : "%ecx"         /* clobbered */
-             );
-#elif defined(__GNUC__) && defined(__i386__)
-        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
-             " movl          %5,      %%eax;                      "
-             " movd          %3,      %%mm1;                      "
-             " movd          %4,      %%mm2;                      "
-             " pxor       %%mm0,      %%mm0;                      "
-             " punpcklwd  %%mm1,      %%mm1;                      "
-             " punpckldq  %%mm1,      %%mm1;                      "
-             " punpcklwd  %%mm2,      %%mm2;                      "
-             " punpckldq  %%mm2,      %%mm2;                      "
-             "1:                                                  "
-             " movd        (%0),      %%mm4;                      "
-             " add           $4,         %0;                      "
-             " movd        (%1),      %%mm5;                      "
-             " add           $4,         %1;                      "
-             " punpcklbw  %%mm0,     %%mm4;                       "
-             " punpcklbw  %%mm0,     %%mm5;                       "
-             " pmullw     %%mm1,     %%mm4;                       "
-             " pmullw     %%mm2,     %%mm5;                       "
-             " paddw      %%mm4,     %%mm5;                       "
-             " psrlw         $8,     %%mm5;                       "
-             " packuswb   %%mm0,     %%mm5;                       "
-             " movd       %%mm5,      (%2);                       "
-             " add           $4,        %2;                       "
-             " decl       %%eax;                                  "
-             " jne           1b;                                  "
-             " emms;                                              "
-             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* no outputs */
-             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
-             : "%eax"        /* clobbered */
-             );
-#endif
-    }
-}

src/sdlext/filters.h

 
 #include <SDL.h>
 
+typedef enum
+{
+    FILTER_C,
+    FILTER_MMX,
+    FILTER_SSE
+} FilterType;
+
+typedef struct
+{
+    FilterType type;
+    void       (*shrink_X)(Uint8 *, Uint8 *, int, int, int, int, int);
+    void       (*shrink_Y)(Uint8 *, Uint8 *, int, int, int, int, int);
+    void       (*expand_X)(Uint8 *, Uint8 *, int, int, int, int, int);
+    void       (*expand_Y)(Uint8 *, Uint8 *, int, int, int, int, int);
+} FilterFuncs;
+
+FilterType
+pyg_filter_init_filterfuncs (FilterFuncs *filters, FilterType type);
+
 void
 pyg_filter_shrink_X_C (Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
     int dstpitch, int srcwidth, int dstwidth);
 
 void
-pyg_filter_shrink_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
-    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
-
-void
 pyg_filter_shrink_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     int dstpitch, int srcheight, int dstheight);
 
 void
-pyg_filter_shrink_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
-    int dstpitch, int srcheight, int dstheight);
-
-void
 pyg_filter_expand_X_C (Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
     int dstpitch, int srcwidth, int dstwidth);
 
 void
-pyg_filter_expand_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
+pyg_filter_expand_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+    int dstpitch, int srcheight, int dstheight);
+
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#define FILTERS_SUPPORT_MMX
+#define FILTERS_SUPPORT_SSE
+
+void
+pyg_filter_shrink_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
     int srcpitch, int dstpitch, int srcwidth, int dstwidth);
 
 void
-pyg_filter_expand_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+pyg_filter_shrink_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     int dstpitch, int srcheight, int dstheight);
 
 void
+pyg_filter_expand_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
+    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void
 pyg_filter_expand_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     int dstpitch, int srcheight, int dstheight);
 
+void
+pyg_filter_shrink_X_SSE (Uint8 *srcpix, Uint8 *dstpix, int height,
+    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void
+pyg_filter_shrink_Y_SSE (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+    int dstpitch, int srcheight, int dstheight);
+
+void
+pyg_filter_expand_X_SSE (Uint8 *srcpix, Uint8 *dstpix, int height,
+    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void
+pyg_filter_expand_Y_SSE (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+    int dstpitch, int srcheight, int dstheight);
+
+#endif /* __GNUC__ */
+
 #endif /* _PYGAME_FILTERS_H_ */

src/sdlext/filters_32.c

+/*
+  pygame - Python Game Library
+  Copyright (C) 2000-2001  Pete Shinners
+  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Library General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Library General Public License for more details.
+
+  You should have received a copy of the GNU Library General Public
+  License along with this library; if not, write to the Free
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  Pete Shinners
+  pete@shinners.org
+*/
+
+/* Pentium 32 bit SSE/MMX smoothscale filter routines
+ * These are written for compilation with GCC only.
+ *
+ * This file should not depend on anything but the C standard library.
+ */
+
+#if !defined(__GNUC__) || !defined(__i386__) || defined(__x86_64__)
+#error "Pygame2 build bug: should not be compiling this file!"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* These functions implement an area-averaging shrinking filter in the X-dimension.
+ */
+void
+pyg_filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
+        " punpcklwd     %%mm2,      %%mm2;           "
+        " punpckldq     %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm3,      %%mm5;           "
+        " movq          %%mm3,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm5,      %%mm3;           "
+        " paddw         %%mm6,      %%mm3;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " movq          %%mm7,      %%mm5;           "
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm7,      %%mm6;           "
+        " pmulhw        %%mm7,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
+        : "%ecx","%edx"     /* clobbered */
+        );
+}
+
+void
+pyg_filter_shrink_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
+        " pshufw    $0, %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " pmulhuw       %%mm7,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
+        : "%ecx","%edx"     /* clobbered */
+        );
+}
+
+/* These functions implement an area-averaging shrinking filter in the Y-dimension.
+ */
+void
+pyg_filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %2,      %%eax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%eax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%eax);           "
+        " add              $8,      %%eax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " punpcklwd     %%mm1,      %%mm1;           "
+        " punpckldq     %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm3,      %%mm0;           "
+        " movq          %%mm3,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm4,      %%mm2;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm0,      %%mm3;           "
+        " paddw         %%mm2,      %%mm3;           "
+        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm1,      %%mm2;           "
+        " pmulhw        %%mm1,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " movq          %%mm3,    (%%eax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%eax;           "
+        " movq          %%mm7,      %%mm0;           "
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm7,      %%mm2;           "
+        " pmulhw        %%mm7,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " pxor          %%mm0,      %%mm0;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
+        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
+          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
+        : "%ecx","%edx","%eax"           /* clobbered */
+        );
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+void
+pyg_filter_shrink_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %2,      %%eax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%eax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%eax);           "
+        " add              $8,      %%eax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
+        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " movq          %%mm3,    (%%eax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%eax;           "
+        " pmulhuw       %%mm7,      %%mm4;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
+        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
+          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
+        : "%ecx","%edx","%eax"           /* clobbered */
+        );
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+/* These functions implement a bilinear filter in the X-dimension.
+ */
+void
+pyg_filter_expand_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int *xidx0, *xmult0, *xmult1;
+    int x, y;
+    int factorwidth = 8;
+  	long long One64 = 0x0100010001000100ULL;
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == 0) return;
+    xmult0 = (int *) malloc(dstwidth * factorwidth);
+    xmult1 = (int *) malloc(dstwidth * factorwidth);
+    if (xmult0 == 0 || xmult1 == 0)
+    {
+        free(xidx0);
+        if (xmult0) free(xmult0);
+        if (xmult1) free(xmult1);
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++)
+    {
+        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        int xm0 = 0x100 - xm1;
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+        xmult1[x*2]   = xm1 | (xm1 << 16);
+        xmult1[x*2+1] = xm1 | (xm1 << 16);
+        xmult0[x*2]   = xm0 | (xm0 << 16);
+        xmult0[x*2+1] = xm0 | (xm0 << 16);
+    }
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++)
+    {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        int *xm0 = xmult0;
+        int *x0 = xidx0;
+    	int width = dstwidth;
+        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
+             " pxor          %%mm0,      %%mm0;           "
+             " movq             %5,      %%mm7;           "
+             "1:                                          "
+             " movl           (%2),      %%eax;           " /* get xidx0[x] */
+             " add              $4,         %2;           "
+             " movq          %%mm7,      %%mm2;           "
+             " movq           (%0),      %%mm1;           " /* load mult0 */
+             " add              $8,         %0;           "
+             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
+             " movd   (%4,%%eax,4),      %%mm4;           "
+             " movd  4(%4,%%eax,4),      %%mm5;           "
+             " punpcklbw     %%mm0,      %%mm4;           "
+             " punpcklbw     %%mm0,      %%mm5;           "
+             " pmullw        %%mm1,      %%mm4;           "
+             " pmullw        %%mm2,      %%mm5;           "
+             " paddw         %%mm4,      %%mm5;           "
+             " psrlw            $8,      %%mm5;           "
+             " packuswb      %%mm0,      %%mm5;           "
+             " movd          %%mm5,       (%1);           "
+             " add              $4,         %1;           "
+             " decl             %3;                       "
+             " jne              1b;                       "
+             " emms;                                      "
+             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
+             : "S"(srcrow0), "m"(One64)    /* input */
+             : "%eax"            /* clobbered */
+             );
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult0);
+    free(xmult1);
+}
+
+void
+pyg_filter_expand_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int *xidx0, *xmult0, *xmult1;
+    int x, y;
+    int factorwidth = 8;
+  	long long One64 = 0x0100010001000100ULL;
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == 0) return;
+    xmult0 = (int *) malloc(dstwidth * factorwidth);
+    xmult1 = (int *) malloc(dstwidth * factorwidth);
+    if (xmult0 == 0 || xmult1 == 0)
+    {
+        free(xidx0);
+        if (xmult0) free(xmult0);
+        if (xmult1) free(xmult1);
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++)
+    {
+        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        int xm0 = 0x100 - xm1;
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+        xmult1[x*2]   = xm1 | (xm1 << 16);
+        xmult1[x*2+1] = xm1 | (xm1 << 16);
+        xmult0[x*2]   = xm0 | (xm0 << 16);
+        xmult0[x*2+1] = xm0 | (xm0 << 16);
+    }
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++)
+    {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        int *xm0 = xmult0;
+        int *x0 = xidx0;
+    	int width = dstwidth;
+        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
+             " pxor          %%mm0,      %%mm0;           "
+             " movq             %5,      %%mm7;           "
+             "1:                                          "
+             " movl           (%2),      %%eax;           " /* get xidx0[x] */
+             " add              $4,         %2;           "
+             " movq          %%mm7,      %%mm2;           "
+             " movq           (%0),      %%mm1;           " /* load mult0 */
+             " add              $8,         %0;           "
+             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
+             " movd   (%4,%%eax,4),      %%mm4;           "
+             " movd  4(%4,%%eax,4),      %%mm5;           "
+             " punpcklbw     %%mm0,      %%mm4;           "
+             " punpcklbw     %%mm0,      %%mm5;           "
+             " pmullw        %%mm1,      %%mm4;           "
+             " pmullw        %%mm2,      %%mm5;           "
+             " paddw         %%mm4,      %%mm5;           "
+             " psrlw            $8,      %%mm5;           "
+             " packuswb      %%mm0,      %%mm5;           "
+             " movd          %%mm5,       (%1);           "
+             " add              $4,         %1;           "
+             " decl             %3;                       "
+             " jne              1b;                       "
+             " emms;                                      "
+             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
+             : "S"(srcrow0), "m"(One64)    /* input */
+             : "%eax"            /* clobbered */
+             );
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult0);
+    free(xmult1);
+}
+
+/* These functions implement a bilinear filter in the Y-dimension.
+ */
+void
+pyg_filter_expand_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    int y;
+
+    for (y = 0; y < dstheight; y++)
+    {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x0100 - ymult1;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
+             " movl          %5,      %%eax;                      "
+             " movd          %3,      %%mm1;                      "
+             " movd          %4,      %%mm2;                      "
+             " pxor       %%mm0,      %%mm0;                      "
+             " punpcklwd  %%mm1,      %%mm1;                      "
+             " punpckldq  %%mm1,      %%mm1;                      "
+             " punpcklwd  %%mm2,      %%mm2;                      "
+             " punpckldq  %%mm2,      %%mm2;                      "
+             "1:                                                  "
+             " movd        (%0),      %%mm4;                      "
+             " add           $4,         %0;                      "
+             " movd        (%1),      %%mm5;                      "
+             " add           $4,         %1;                      "
+             " punpcklbw  %%mm0,     %%mm4;                       "
+             " punpcklbw  %%mm0,     %%mm5;                       "
+             " pmullw     %%mm1,     %%mm4;                       "
+             " pmullw     %%mm2,     %%mm5;                       "
+             " paddw      %%mm4,     %%mm5;                       "
+             " psrlw         $8,     %%mm5;                       "
+             " packuswb   %%mm0,     %%mm5;                       "
+             " movd       %%mm5,      (%2);                       "
+             " add           $4,        %2;                       "
+             " decl       %%eax;                                  "
+             " jne           1b;                                  "
+             " emms;                                              "
+             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* no outputs */
+             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
+             : "%eax"        /* clobbered */
+             );
+    }
+}
+
+void
+pyg_filter_expand_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    int y;
+
+    for (y = 0; y < dstheight; y++)
+    {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x0100 - ymult1;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
+             " movl          %5,      %%eax;                      "
+             " movd          %3,      %%mm1;                      "
+             " movd          %4,      %%mm2;                      "
+             " pxor       %%mm0,      %%mm0;                      "
+             " pshufw      $0, %%mm1, %%mm1;                      "
+             " pshufw      $0, %%mm2, %%mm2;                      "
+             "1:                                                  "
+             " movd        (%0),      %%mm4;                      "
+             " add           $4,         %0;                      "
+             " movd        (%1),      %%mm5;                      "
+             " add           $4,         %1;                      "
+             " punpcklbw  %%mm0,     %%mm4;                       "
+             " punpcklbw  %%mm0,     %%mm5;                       "
+             " pmullw     %%mm1,     %%mm4;                       "
+             " pmullw     %%mm2,     %%mm5;                       "
+             " paddw      %%mm4,     %%mm5;                       "
+             " psrlw         $8,     %%mm5;                       "
+             " packuswb   %%mm0,     %%mm5;                       "
+             " movd       %%mm5,      (%2);                       "
+             " add           $4,        %2;                       "
+             " decl       %%eax;                                  "
+             " jne           1b;                                  "
+             " emms;                                              "
+             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* no outputs */
+             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
+             : "%eax"        /* clobbered */
+             );
+    }
+}

src/sdlext/filters_64.c

+/*
+  pygame - Python Game Library
+  Copyright (C) 2000-2001  Pete Shinners
+  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Library General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Library General Public License for more details.
+
+  You should have received a copy of the GNU Library General Public
+  License along with this library; if not, write to the Free
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  Pete Shinners
+  pete@shinners.org
+*/
+
+/* Pentium 64 bit SSE/MMX smoothscale routines
+ * These are written for compilation with GCC only.
+ *
+ * This file should not depend on anything but the C standard library.
+ */
+
+#if !defined(__GNUC__) || !defined(__x86_64__)
+#error "Pygame2 build bug: should not be compiling this file!"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* These functions implement an area-averaging shrinking filter in the X-dimension.
+ */
+void
+pyg_filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
+        " punpcklwd     %%mm2,      %%mm2;           "
+        " punpckldq     %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm3,      %%mm5;           "
+        " movq          %%mm3,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm5,      %%mm3;           "
+        " paddw         %%mm6,      %%mm3;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " movq          %%mm7,      %%mm5;           "
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm7,      %%mm6;           "
+        " pmulhw        %%mm7,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
+        : "%ecx","%edx"               /* clobbered */
+        );
+}
+
+void
+pyg_filter_shrink_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
+        " pshufw    $0, %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " pmulhuw       %%mm7,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
+        : "%ecx","%edx"               /* clobbered */
+        );
+}
+
+/* These functions implement an area-averaging shrinking filter in the Y-dimension.
+ */
+void
+pyg_filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " mov              %2,      %%rax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%rax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%rax);           "
+        " add              $8,      %%rax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " punpcklwd     %%mm1,      %%mm1;           "
+        " punpckldq     %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm3,      %%mm0;           "
+        " movq          %%mm3,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm4,      %%mm2;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm0,      %%mm3;           "
+        " paddw         %%mm2,      %%mm3;           "
+        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm1,      %%mm2;           "
+        " pmulhw        %%mm1,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " movq          %%mm3,    (%%rax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%rax;           "
+        " movq          %%mm7,      %%mm0;           "
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm7,      %%mm2;           "
+        " pmulhw        %%mm7,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " pxor          %%mm0,      %%mm0;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
+        : "m"(templine),"m"(srcheight), "m"(width),     "m"(yspace),  
+          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
+        : "%ecx","%edx","%rax"          /* clobbered */
+        );
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+void
+pyg_filter_shrink_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " mov              %2,      %%rax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%rax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%rax);           "
+        " add              $8,      %%rax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
+        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " movq          %%mm3,    (%%rax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%rax;           "
+        " pmulhuw       %%mm7,      %%mm4;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "