Commits

Anonymous committed 97d722d

Added fixes from the main trunk for sdlmixer.sndarray.
Added filter backend switch support from the main trunk.
Added minor blit macro optimisations from the main trunk.

Comments (0)

Files changed (19)

 CRITICAL:
 =========
 * recheck and redesign X11 implementation for pygame.sdlext.scrap
-* merge transform/scale asm changes in rev. 1657:1669
 * complete physics collision and contacts and merge it back to the branch
 * make LayeredGroups class independent from sdl surfaces, where possible
 * check anything for possible integer/float overflows
 * use copy.copy and copy.deepcopy for consistent object copies
-* Merge transform smooth scale changes in rev. 1715:1717
 * Rewrite and fix up numpysurfarray and numpysndarray
 * Add prebuilt package for Win32 VC++ builds.
 * Refine quitting for SDL_QuitSubSystem wrapper.
 * Check display surface tracking for multiple calls to set_mode using
   different return variables.
 * Argument parsing must handle 64-bit conversions correctly.
+* Add palette color support to sdlext.transform (trunk rev. 2242).
+* Check trunk rev. 1918, 1921, 1922, 1933, 1953 (blit blend operations).
+* Check trunk rev. 1937, 1947 (blit blend for self).
+* Add surface.scroll (trunk rev. 1951).
 
 Things to ADD:
 ==============

doc/src/sdlexttransform.xml

       :class:`pygame2.sdl.video.Surface` with the same dimensions. 
     </desc>
   </func>
+  <func name="get_filtertype">
+    <call>get_filtertype () -> int</call>
+    <desc>
+      Gets the currently set filter type.
+    </desc>
+  </func>
+  <func name="set_filtertype">
+    <call>set_filtertype (type) -> int</call>
+    <desc>
+      Sets the filters to use to one of the supported filter types.
+      
+      TODO
+    </desc>
+  </func>
   <func name="laplacian">
     <call>laplacian (surface[, destsurface]) -> Surface</call>
     <desc>
     """
     read the fonts on unix
     """
+    import subprocess
     fonts = {}
 
     # we use the fc-list from fontconfig to get a list of fonts.
 
     try:
-        # note, we use popen3 for if fc-list isn't there to stop stderr
-        # printing.
-        flin, flout, flerr = os.popen3('fc-list : file family style')
-    except:
+        flout, flerr = subprocess.Popen('fc-list : file family style', shell=True,
+                                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                        close_fds=True).communicate()
+    except Exception:
         return fonts
 
     try:

lib/sdlmixer/numpysndarray.py

     else:
         data = sound.get_buffer ()
 
-    shape = (len (data) / channels * fmtbytes, )
+    shape = (len (data) // fmtbytes, )
     if channels > 1:
-        shape = (shape[0], 2)
+        shape = (shape[0] // channels, channels)
 
     # mixer.init () does not support different formats from the ones below,
     # so MSB/LSB stuff is silently ignored.
     if not info:
         raise pygame2.Error ("Mixer not initialized")
     fmtbytes = (abs (info[1]) & 0xff) >> 3
-    channels = mixer.get_num_channels ()
+    channels = info[2]
     data = sound.get_buffer ()
 
-    shape = (data.length / channels * fmtbytes, )
+    shape = (data.length // fmtbytes, )
     if channels > 1:
-        shape = (shape[0], 2)
+        shape = (shape[0] // channels, channels)
         
     # mixer.init () does not support different formats from the ones below,
     # so MSB/LSB stuff is silently ignored.

src/freetype/ft_font.c

     /* TODO */
 }
 
-
-
 /****************************************************
  * C API CALLS
  ****************************************************/

src/freetype/ft_wrap.c

         len = strlen(latin1_buffer);
 
         utf16_buffer = malloc((len + 1) * sizeof(FT_UInt16));
+        if (!utf16_buffer)
+            return NULL;
 
         for (i = 0; i < len; ++i)
             utf16_buffer[i] = (FT_UInt16)latin1_buffer[i];
     if (ft->library)
         FT_Done_FreeType(ft->library);
 
-    free(ft->_error_msg);
-    free(ft);
+    if (ft->_error_msg)
+        free (ft->_error_msg);
+    
+    free (ft);
 }
 
 int

src/freetype/pgfreetype.h

 /*
   pygame - Python Game Library
-  Copyright (C) 2000-2001 Pete Shinners
-  Copyright (C) 2008 Marcus von Appen
   Copyright (C) 2009 Vicent Marti
 
   This library is free software; you can redistribute it and/or

src/mask/maskmod.c

 _bitmask_threshold (bitmask_t *m, SDL_Surface *surf, SDL_Surface *surf2, 
     Uint32 color,  Uint32 threshold)
 {
-    int x, y, rshift, gshift, bshift, rshift2, gshift2, bshift2;
+    int x, y, rshift, gshift, bshift, rshift2, gshift2, bshift2, bpp1, bpp2;
     int rloss, gloss, bloss, rloss2, gloss2, bloss2;
     Uint8 *pixels, *pixels2;
     SDL_PixelFormat *format, *format2;
     rloss = format->Rloss;
     gloss = format->Gloss;
     bloss = format->Bloss;
+    bpp1 = format->BytesPerPixel;
 
     if (surf2)
     {
         gloss2 = format2->Gloss;
         bloss2 = format2->Bloss;
         pixels2 = (Uint8 *) surf2->pixels;
+        bpp2 = format2->BytesPerPixel;
     }
     else
     {
         rloss2 = gloss2 = bloss2 = 0;
         format2 = NULL;
         pixels2 = NULL;
+        bpp2 = 0;
     }
 
     SDL_GetRGBA (color, format, &r, &g, &b, &a);
         for (x=0; x < surf->w; x++)
         {
             /* the_color = surf->get_at(x,y) */
-            switch (format->BytesPerPixel)
+            switch (bpp1)
             {
             case 1:
                 the_color = (Uint32)*((Uint8 *) pixels);
 
             if (surf2)
             {
-                switch (format2->BytesPerPixel)
+                switch (bpp2)
                 {
                 case 1:
                     the_color2 = (Uint32)*((Uint8 *) pixels2);

src/sdl/surface.h

             ((Uint8)((argb & 0xff000000) >> 24)));                      \
     }
 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+#define SET_PIXEL24(buf,format,rgb)                                     \
+    *((buf) + ((format)->Rshift >> 3)) = (rgb)[0];                      \
+    *((buf) + ((format)->Gshift >> 3)) = (rgb)[1];                      \
+    *((buf) + ((format)->Bshift >> 3)) = (rgb)[2];
+#else
+#define SET_PIXEL24(buf,format,rgb)                                     \
+    *((buf) + 2 - ((format)->Rshift >> 3)) = (rgb)[0];                  \
+    *((buf) + 2 - ((format)->Gshift >> 3)) = (rgb)[1];                  \
+    *((buf) + 2 - ((format)->Bshift >> 3)) = (rgb)[2];
+#endif
+
 #define SET_PIXEL_AT(surface,format,_x,_y,color)                        \
     if ((_x) >= (surface)->clip_rect.x &&                               \
         (_x) <= (surface)->clip_rect.x + (surface)->clip_rect.w &&      \
             SDL_GetRGB ((color), (format), _rgb, _rgb+1, _rgb+2);       \
             _buf = (Uint8*)(((Uint8*)(surface)->pixels) + (_y) *        \
                 (surface)->pitch) + (_x) * 3;                           \
-            if (SDL_BYTEORDER == SDL_LIL_ENDIAN)                        \
-            {                                                           \
-                *(_buf + ((format)->Rshift >> 3)) = _rgb[0];            \
-                *(_buf + ((format)->Gshift >> 3)) = _rgb[1];            \
-                *(_buf + ((format)->Bshift >> 3)) = _rgb[2];            \
-            }                                                           \
-            else                                                        \
-            {                                                           \
-                *(_buf + 2 - ((format)->Rshift >> 3)) = _rgb[0];        \
-                *(_buf + 2 - ((format)->Gshift >> 3)) = _rgb[1];        \
-                *(_buf + 2 - ((format)->Bshift >> 3)) = _rgb[2];        \
-            }                                                           \
+            SET_PIXEL24(_buf, format, _rgb);                            \
             break;                                                      \
         }                                                               \
         }                                                               \
     }
 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+#define GET_PIXEL_24(b) (b[0] + (b[1] << 8) + (b[2] << 16))
+#else
+#define GET_PIXEL_24(b) (b[2] + (b[1] << 8) + (b[0] << 16))
+#endif
+
 #define GET_PIXEL_AT(pxl,surface,bpp,_x,_y)                             \
     switch ((bpp))                                                      \
     {                                                                   \
     {                                                                   \
         Uint8* buf = ((Uint8 *) (((Uint8*)(surface)->pixels) + (_y) *   \
                 (surface)->pitch) + (_x) * 3);                          \
-        pxl = (SDL_BYTEORDER == SDL_LIL_ENDIAN) ?                       \
-            buf[0] + (buf[1] << 8) + (buf[2] << 16) :                   \
-            (buf[0] << 16) + (buf[1] << 8) + buf[2];                    \
+        pxl = GET_PIXEL_24(buf);                                        \
         break;                                                          \
     }                                                                   \
     }
     default:                                      \
     {                                             \
         Uint8 *b = (Uint8 *) source;              \
-        pxl = (SDL_BYTEORDER == SDL_LIL_ENDIAN) ? \
-            b[0] + (b[1] << 8) + (b[2] << 16) :   \
-            (b[0] << 16) + (b[1] << 8) + b[2];    \
+        pxl = GET_PIXEL_24(b);                    \
     }                                             \
     break;                                        \
     }

src/sdlext/constantsmod.c

 #include "sdlextmod.h"
 #include "pgsdlext.h"
 #include "scrap.h"
+#include "filters.h"
 
 /* macros used to create each constant */
 #define DEC_CONSTS(x)  PyModule_AddIntConstant(module, #x, (int) x)
     ADD_STRING_CONST (SCRAP_FORMAT_PPM);
     ADD_STRING_CONST (SCRAP_FORMAT_PBM);
 
+    DEC_CONSTS (FILTER_C);
+    DEC_CONSTS (FILTER_MMX);
+    DEC_CONSTS (FILTER_SSE);
+    
     MODINIT_RETURN(module);
 fail:
     Py_XDECREF (module);

src/sdlext/filters.c

 
 #include "filters.h"
 
+#if defined(__GNUC__)
+#if defined(__x86_64__)
+#include "filters_64.c"
+#elif defined(__i386__)
+#include "filters_32.c"
+#endif
+#endif /* __GNUC__ */
+
+FilterType
+pyg_filter_init_filterfuncs (FilterFuncs *filters, FilterType type)
+{
+    if (!filters)
+        return 0;
+    
+    filters->type = FILTER_C;
+    filters->shrink_X = pyg_filter_shrink_X_C;
+    filters->shrink_Y = pyg_filter_shrink_Y_C;
+    filters->expand_X = pyg_filter_expand_X_C;
+    filters->expand_Y = pyg_filter_expand_Y_C;
+    
+#if defined(FILTERS_SUPPORT_MMX)
+    if (type == FILTER_MMX && SDL_HasMMX ())
+    {
+        filters->type = FILTER_MMX;
+        filters->shrink_X = pyg_filter_shrink_X_MMX;
+        filters->shrink_Y = pyg_filter_shrink_Y_MMX;
+        filters->expand_X = pyg_filter_expand_X_MMX;
+        filters->expand_Y = pyg_filter_expand_Y_MMX;
+    }
+#endif /* FILTERS_SUPPORT_MMX */
+
+#if defined(FILTERS_SUPPORT_SSE)
+    if (type == FILTER_SSE && SDL_HasSSE ())
+    {
+        filters->type = FILTER_SSE;
+        filters->shrink_X = pyg_filter_shrink_X_SSE;
+        filters->shrink_Y = pyg_filter_shrink_Y_SSE;
+        filters->expand_X = pyg_filter_expand_X_SSE;
+        filters->expand_Y = pyg_filter_expand_Y_SSE;
+    }
+#endif /* FILTERS_SUPPORT_SSE */
+    return filters->type;
+}
+
 /* this function implements an area-averaging shrinking filter in the
  * X-dimension */
 void
     int x, y;
 
     int xspace = 0x10000 * srcwidth / dstwidth; /* must be > 1 */
-    int xrecip = (int) ((long long) 0x100000000 / xspace);
+    int xrecip = (int) (0x100000000LL / xspace);
     for (y = 0; y < height; y++)
     {
         Uint16 accumulate[4] = {0,0,0,0};
     }
 }
 
-void
-pyg_filter_shrink_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
-    int srcpitch, int dstpitch, int srcwidth, int dstwidth)
-{
-    int srcdiff = srcpitch - (srcwidth * 4);
-    int dstdiff = dstpitch - (dstwidth * 4);
-
-    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
-    int xrecip = (int) ((long long) 0x040000000 / xspace);
-    long long One64 = 0x4000400040004000ULL;
-#if defined(__GNUC__) && defined(__x86_64__)
-    long long srcdiff64 = srcdiff;
-    long long dstdiff64 = dstdiff;
-    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " movl             %5,      %%ecx;           " /* ecx == xcounter */
-        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          " /* inner X-loop */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm2;           "
-        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              4f;                       "
-        "3:                                          " /* prepare to output a pixel */
-        " movd          %%ecx,      %%mm2;           "
-        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
-        " punpcklwd     %%mm2,      %%mm2;           "
-        " punpckldq     %%mm2,      %%mm2;           "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm3,      %%mm5;           "
-        " movq          %%mm3,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm5,      %%mm3;           "
-        " paddw         %%mm6,      %%mm3;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
-        " movq          %%mm7,      %%mm5;           "
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm7,      %%mm6;           "
-        " pmulhw        %%mm7,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " packuswb      %%mm0,      %%mm2;           "
-        " movd          %%mm2,       (%1);           "
-        " add              %5,      %%ecx;           "
-        " add              $4,         %1;           "
-        " subl        $0x4000,      %%ecx;           "
-        "4:                                          " /* tail of inner X-loop */
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
-        : "m"(One64),   "m"(height), "m"(srcwidth),
-          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
-        : "%ecx","%edx"               /* clobbered */
-        );
-#elif defined(__GNUC__) && defined(__i386__)
-    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " movl             %5,      %%ecx;           " /* ecx == xcounter */
-        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          " /* inner X-loop */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm2;           "
-        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              4f;                       "
-        "3:                                          " /* prepare to output a pixel */
-        " movd          %%ecx,      %%mm2;           "
-        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
-        " punpcklwd     %%mm2,      %%mm2;           "
-        " punpckldq     %%mm2,      %%mm2;           "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm3,      %%mm5;           "
-        " movq          %%mm3,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm4,      %%mm6;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm5,      %%mm3;           "
-        " paddw         %%mm6,      %%mm3;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
-        " movq          %%mm7,      %%mm5;           "
-        " psraw           $15,      %%mm5;           "
-        " pand          %%mm2,      %%mm5;           "
-        " movq          %%mm2,      %%mm6;           "
-        " psraw           $15,      %%mm6;           "
-        " pand          %%mm7,      %%mm6;           "
-        " pmulhw        %%mm7,      %%mm2;           "
-        " paddw         %%mm5,      %%mm2;           "
-        " paddw         %%mm6,      %%mm2;           "
-        " packuswb      %%mm0,      %%mm2;           "
-        " movd          %%mm2,       (%1);           "
-        " add              %5,      %%ecx;           "
-        " add              $4,         %1;           "
-        " subl        $0x4000,      %%ecx;           "
-        "4:                                          " /* tail of inner X-loop */
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
-        : "m"(One64),   "m"(height), "m"(srcwidth),
-          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
-        : "%ecx","%edx"     /* clobbered */
-        );
-#endif
-}
-
 /* this function implements an area-averaging shrinking filter in the
  * Y-dimension */
 void
     int dstdiff = dstpitch - (width * 4);
     int x, y;
     int yspace = 0x10000 * srcheight / dstheight; /* must be > 1 */
-    int yrecip = (int) ((long long) 0x100000000 / yspace);
+    int yrecip = (int) (0x100000000LL / yspace);
     int ycounter = yspace;
 
     /* allocate and clear a memory area for storing the accumulator line */
     free (templine);
 }
 
-/* this function implements an area-averaging shrinking filter in the
- * Y-dimension */
-void
-pyg_filter_shrink_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
-    int dstpitch, int srcheight, int dstheight)
-{
-    Uint16 *templine;
-    int srcdiff = srcpitch - (width * 4);
-    int dstdiff = dstpitch - (width * 4);
-    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
-    int yrecip = (int) ((long long) 0x040000000 / yspace);
-    long long One64 = 0x4000400040004000ULL;
-
-    /* allocate and clear a memory area for storing the accumulator line */
-    templine = (Uint16 *) malloc((size_t) (dstpitch * 2));
-    if (templine == NULL)
-        return;
-    memset(templine, 0, (size_t) (dstpitch * 2));
-
-#if defined(__GNUC__) && defined(__x86_64__)
-    long long srcdiff64 = srcdiff;
-    long long dstdiff64 = dstdiff;
-    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
-        " movl             %5,      %%ecx;           " /* ecx == ycounter */
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " mov              %2,      %%rax;           " /* rax == accumulate */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          "
-        " movd           (%0),      %%mm1;           "
-        " add              $4,         %0;           "
-        " movq        (%%rax),      %%mm2;           "
-        " punpcklbw     %%mm0,      %%mm1;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm2,    (%%rax);           "
-        " add              $8,      %%rax;           "
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              6f;                       "
-        "3:                                          " /* prepare to output a line */
-        " movd          %%ecx,      %%mm1;           "
-        " movl             %4,      %%edx;           " /* edx = width */
-        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
-        " punpcklwd     %%mm1,      %%mm1;           "
-        " punpckldq     %%mm1,      %%mm1;           "
-        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
-        "4:                                          "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
-        " movq          %%mm6,      %%mm3;           "
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm3,      %%mm0;           "
-        " movq          %%mm3,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm4,      %%mm2;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm0,      %%mm3;           "
-        " paddw         %%mm2,      %%mm3;           "
-        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm1,      %%mm2;           "
-        " pmulhw        %%mm1,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " movq          %%mm3,    (%%rax);           "
-        " paddw         %%mm5,      %%mm4;           "
-        " add              $8,      %%rax;           "
-        " movq          %%mm7,      %%mm0;           "
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm7,      %%mm2;           "
-        " pmulhw        %%mm7,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " pxor          %%mm0,      %%mm0;           "
-        " packuswb      %%mm0,      %%mm4;           "
-        " movd          %%mm4,       (%1);           "
-        " add              $4,         %1;           "
-        " decl          %%edx;                       "
-        " jne              4b;                       "
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " addl             %5,      %%ecx;           "
-        " subl        $0x4000,      %%ecx;           "
-        "6:                                          " /* tail of outer Y-loop */
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
-        : "m"(templine),"m"(srcheight), "m"(width),     "m"(yspace),  
-          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
-        : "%ecx","%edx","%rax"          /* clobbered */
-        );
-#elif defined(__GNUC__) && defined(__i386__)
-    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
-        " movl             %5,      %%ecx;           " /* ecx == ycounter */
-        " pxor          %%mm0,      %%mm0;           "
-        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
-        " punpcklwd     %%mm7,      %%mm7;           "
-        " punpckldq     %%mm7,      %%mm7;           "
-        "1:                                          " /* outer Y-loop */
-        " movl             %2,      %%eax;           " /* rax == accumulate */
-        " cmpl        $0x4000,      %%ecx;           "
-        " jbe              3f;                       "
-        " movl             %4,      %%edx;           " /* edx == width */
-        "2:                                          "
-        " movd           (%0),      %%mm1;           "
-        " add              $4,         %0;           "
-        " movq        (%%eax),      %%mm2;           "
-        " punpcklbw     %%mm0,      %%mm1;           "
-        " paddw         %%mm1,      %%mm2;           "
-        " movq          %%mm2,    (%%eax);           "
-        " add              $8,      %%eax;           "
-        " decl          %%edx;                       "
-        " jne              2b;                       "
-        " subl        $0x4000,      %%ecx;           "
-        " jmp              6f;                       "
-        "3:                                          " /* prepare to output a line */
-        " movd          %%ecx,      %%mm1;           "
-        " movl             %4,      %%edx;           " /* edx = width */
-        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
-        " punpcklwd     %%mm1,      %%mm1;           "
-        " punpckldq     %%mm1,      %%mm1;           "
-        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
-        "4:                                          "
-        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
-        " add              $4,         %0;           "
-        " punpcklbw     %%mm0,      %%mm4;           "
-        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
-        " movq          %%mm6,      %%mm3;           "
-        " psllw            $2,      %%mm4;           "
-        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm3,      %%mm0;           "
-        " movq          %%mm3,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm4,      %%mm2;           "
-        " pmulhw        %%mm4,      %%mm3;           "
-        " paddw         %%mm0,      %%mm3;           "
-        " paddw         %%mm2,      %%mm3;           "
-        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm1,      %%mm2;           "
-        " pmulhw        %%mm1,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " movq          %%mm3,    (%%eax);           "
-        " paddw         %%mm5,      %%mm4;           "
-        " add              $8,      %%eax;           "
-        " movq          %%mm7,      %%mm0;           "
-        " psraw           $15,      %%mm0;           "
-        " pand          %%mm4,      %%mm0;           "
-        " movq          %%mm4,      %%mm2;           "
-        " psraw           $15,      %%mm2;           "
-        " pand          %%mm7,      %%mm2;           "
-        " pmulhw        %%mm7,      %%mm4;           "
-        " paddw         %%mm0,      %%mm4;           "
-        " paddw         %%mm2,      %%mm4;           "
-        " pxor          %%mm0,      %%mm0;           "
-        " packuswb      %%mm0,      %%mm4;           "
-        " movd          %%mm4,       (%1);           "
-        " add              $4,         %1;           "
-        " decl          %%edx;                       "
-        " jne              4b;                       "
-        " add              %8,         %1;           " /* dstpix += dstdiff */
-        " addl             %5,      %%ecx;           "
-        " subl        $0x4000,      %%ecx;           "
-        "6:                                          " /* tail of outer Y-loop */
-        " add              %7,         %0;           " /* srcpix += srcdiff */
-        " decl             %3;                       "
-        " jne              1b;                       "
-        " emms;                                      "
-        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
-        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
-          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
-        : "%ecx","%edx","%eax"           /* clobbered */
-        );
-
-#endif
-
-    /* free the temporary memory */
-    free (templine);
-}
-
 /* this function implements a bilinear filter in the X-dimension */
 void
 pyg_filter_expand_X_C (Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
     free (xmult1);
 }
 
-/* this function implements a bilinear filter in the X-dimension */
-/* Horizontal expand (srcwidth < dstwidth) of a 32bpp image: every output
- * pixel blends two adjacent source pixels using 8.8 fixed-point weights
- * precomputed per destination column (xidx0 = left source index,
- * xmult0/xmult1 = weight pairs replicated across 16-bit lanes).  The
- * per-row inner loop is inline MMX assembly with separate x86_64 and
- * i386 encodings. */
-void
-pyg_filter_expand_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
-    int srcpitch, int dstpitch, int srcwidth, int dstwidth)
-{
-    int *xidx0, *xmult0, *xmult1;
-    int x, y;
-    int factorwidth = 8;
-
-    /* Allocate memory for factors */
-    xidx0 = malloc((size_t) (dstwidth * 4));
-    if (xidx0 == NULL)
-        return;
-    xmult0 = (int *) malloc((size_t) (dstwidth * factorwidth));
-    xmult1 = (int *) malloc((size_t) (dstwidth * factorwidth));
-    if (xmult0 == NULL || xmult1 == NULL)
-    {
-        free (xidx0);
-        if (xmult0)
-            free (xmult0);
-        if (xmult1)
-            free (xmult1);
-    }
-    /* NOTE(review): the failure branch above frees the buffers but never
-     * returns, so on allocation failure execution falls through and uses
-     * freed/NULL pointers below — a 'return;' is missing in that block. */
-
-    /* Create multiplier factors and starting indices and put them in arrays */
-    for (x = 0; x < dstwidth; x++)
-    {
-        /* xm1 = 8.8 fractional position between the two source pixels,
-         * xm0 = its complement; each is packed into all 16-bit lanes. */
-        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
-        int xm0 = 0x100 - xm1;
-        xidx0[x] = x * (srcwidth - 1) / dstwidth;
-        xmult1[x*2]   = xm1 | (xm1 << 16);
-        xmult1[x*2+1] = xm1 | (xm1 << 16);
-        xmult0[x*2]   = xm0 | (xm0 << 16);
-        xmult0[x*2+1] = xm0 | (xm0 << 16);
-    }
-
-    /* Do the scaling in raster order so we don't trash the cache */
-    for (y = 0; y < height; y++)
-    {
-        Uint8 *srcrow0 = srcpix + y * srcpitch;
-        Uint8 *dstrow = dstpix + y * dstpitch;
-        int *xm0 = xmult0;
-        int *xm1 = xmult1;
-        int *x0 = xidx0;
-#if defined(__GNUC__) && defined(__x86_64__)
-        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
-             " movl             %5,      %%ecx;           "
-             " pxor          %%mm0,      %%mm0;           "
-             "1:                                          "
-             " movsxl         (%3),      %%rax;           " /* get xidx0[x] */
-             " add              $4,         %3;           "
-             " movq           (%0),      %%mm1;           " /* load mult0 */
-             " add              $8,         %0;           "
-             " movq           (%1),      %%mm2;           " /* load mult1 */
-             " add              $8,         %1;           "
-             " movd   (%4,%%rax,4),      %%mm4;           "
-             " movd  4(%4,%%rax,4),      %%mm5;           "
-             " punpcklbw     %%mm0,      %%mm4;           "
-             " punpcklbw     %%mm0,      %%mm5;           "
-             " pmullw        %%mm1,      %%mm4;           "
-             " pmullw        %%mm2,      %%mm5;           "
-             " paddw         %%mm4,      %%mm5;           "
-             " psrlw            $8,      %%mm5;           "
-             " packuswb      %%mm0,      %%mm5;           "
-             " movd          %%mm5,       (%2);           "
-             " add              $4,         %2;           "
-             " decl          %%ecx;                       "
-             " jne              1b;                       "
-             " emms;                                      "
-             : "+r"(xm0),   "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
-             : "r"(srcrow0),"m"(dstwidth)  /* input */
-             : "%ecx","%rax"                /* clobbered */
-             );
-#elif defined(__GNUC__) && defined(__i386__)
-        int width = dstwidth;
-        long long One64 = 0x0100010001000100;
-        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
-             " pxor          %%mm0,      %%mm0;           "
-             " movq             %5,      %%mm7;           "
-             "1:                                          "
-             " movl           (%2),      %%eax;           " /* get xidx0[x] */
-             " add              $4,         %2;           "
-             " movq          %%mm7,      %%mm2;           "
-             " movq           (%0),      %%mm1;           " /* load mult0 */
-             " add              $8,         %0;           "
-             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
-             " movd   (%4,%%eax,4),      %%mm4;           "
-             " movd  4(%4,%%eax,4),      %%mm5;           "
-             " punpcklbw     %%mm0,      %%mm4;           "
-             " punpcklbw     %%mm0,      %%mm5;           "
-             " pmullw        %%mm1,      %%mm4;           "
-             " pmullw        %%mm2,      %%mm5;           "
-             " paddw         %%mm4,      %%mm5;           "
-             " psrlw            $8,      %%mm5;           "
-             " packuswb      %%mm0,      %%mm5;           "
-             " movd          %%mm5,       (%1);           "
-             " add              $4,         %1;           "
-             " decl             %3;                       "
-             " jne              1b;                       "
-             " emms;                                      "
-             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
-             : "S"(srcrow0), "m"(One64)    /* input */
-             : "%eax"            /* clobbered */
-             );
-#endif
-    }
-
-    /* free memory */
-    free (xidx0);
-    free (xmult0);
-    free (xmult1);
-}
-
 /* this function implements a bilinear filter in the Y-dimension */
 void
 pyg_filter_expand_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
         }
     }
 }
-
-/* this function implements a bilinear filter in the Y-dimension */
-/* Vertical expand (srcheight < dstheight) of a 32bpp image: each output
- * row blends the two nearest source rows with 8.8 fixed-point weights
- * ymult0/ymult1.  The per-pixel inner loop is inline MMX assembly with
- * x86_64 and i386 variants. */
-void
-pyg_filter_expand_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
-    int dstpitch, int srcheight, int dstheight)
-{
-    int y;
-
-    for (y = 0; y < dstheight; y++)
-    {
-        /* yidx0 = upper source row; ymult1 = 8.8 fraction toward the
-         * following row, ymult0 = its complement. */
-        int yidx0 = y * (srcheight - 1) / dstheight;
-        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
-        Uint8 *srcrow1 = srcrow0 + srcpitch;
-        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
-        int ymult0 = 0x0100 - ymult1;
-        Uint8 *dstrow = dstpix + y * dstpitch;
-#if defined(__GNUC__) && defined(__x86_64__)
-        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
-             " movl          %5,      %%ecx;                      "
-             " movd          %3,      %%mm1;                      "
-             " movd          %4,      %%mm2;                      "
-             " pxor       %%mm0,      %%mm0;                      "
-             " punpcklwd  %%mm1,      %%mm1;                      "
-             " punpckldq  %%mm1,      %%mm1;                      "
-             " punpcklwd  %%mm2,      %%mm2;                      "
-             " punpckldq  %%mm2,      %%mm2;                      "
-             "1:                                                  "
-             " movd        (%0),      %%mm4;                      "
-             " add           $4,         %0;                      "
-             " movd        (%1),      %%mm5;                      "
-             " add           $4,         %1;                      "
-             " punpcklbw  %%mm0,      %%mm4;                      "
-             " punpcklbw  %%mm0,      %%mm5;                      "
-             " pmullw     %%mm1,      %%mm4;                      "
-             " pmullw     %%mm2,      %%mm5;                      "
-             " paddw      %%mm4,      %%mm5;                      "
-             " psrlw         $8,      %%mm5;                      "
-             " packuswb   %%mm0,      %%mm5;                      "
-             " movd       %%mm5,       (%2);                      "
-             " add           $4,         %2;                      "
-             " decl       %%ecx;                                  "
-             " jne           1b;                                  "
-             " emms;                                              "
-             : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow)   /* outputs */
-             : "m"(ymult0),   "m"(ymult1),   "m"(width)    /* input */
-             : "%ecx"         /* clobbered */
-             );
-#elif defined(__GNUC__) && defined(__i386__)
-        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
-             " movl          %5,      %%eax;                      "
-             " movd          %3,      %%mm1;                      "
-             " movd          %4,      %%mm2;                      "
-             " pxor       %%mm0,      %%mm0;                      "
-             " punpcklwd  %%mm1,      %%mm1;                      "
-             " punpckldq  %%mm1,      %%mm1;                      "
-             " punpcklwd  %%mm2,      %%mm2;                      "
-             " punpckldq  %%mm2,      %%mm2;                      "
-             "1:                                                  "
-             " movd        (%0),      %%mm4;                      "
-             " add           $4,         %0;                      "
-             " movd        (%1),      %%mm5;                      "
-             " add           $4,         %1;                      "
-             " punpcklbw  %%mm0,     %%mm4;                       "
-             " punpcklbw  %%mm0,     %%mm5;                       "
-             " pmullw     %%mm1,     %%mm4;                       "
-             " pmullw     %%mm2,     %%mm5;                       "
-             " paddw      %%mm4,     %%mm5;                       "
-             " psrlw         $8,     %%mm5;                       "
-             " packuswb   %%mm0,     %%mm5;                       "
-             " movd       %%mm5,      (%2);                       "
-             " add           $4,        %2;                       "
-             " decl       %%eax;                                  "
-             " jne           1b;                                  "
-             " emms;                                              "
-             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* no outputs */
-             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
-             : "%eax"        /* clobbered */
-             );
-#endif
-    }
-}

src/sdlext/filters.h

 
 #include <SDL.h>
 
+/* Identifies which filter backend implementation is in use. */
+typedef enum
+{
+    FILTER_C,
+    FILTER_MMX,
+    FILTER_SSE
+} FilterType;
+
+/* Dispatch table for the smoothscale filter routines.  All four entries
+ * share the signature (srcpix, dstpix, rows-or-columns, srcpitch,
+ * dstpitch, src-extent, dst-extent); 'type' records which backend the
+ * pointers currently refer to. */
+typedef struct
+{
+    FilterType type;
+    void       (*shrink_X)(Uint8 *, Uint8 *, int, int, int, int, int);
+    void       (*shrink_Y)(Uint8 *, Uint8 *, int, int, int, int, int);
+    void       (*expand_X)(Uint8 *, Uint8 *, int, int, int, int, int);
+    void       (*expand_Y)(Uint8 *, Uint8 *, int, int, int, int, int);
+} FilterFuncs;
+
+/* Fills 'filters' with the function pointers for the requested backend
+ * and returns the FilterType actually installed (presumably falling
+ * back when the request is unsupported — confirm against the
+ * implementation). */
+FilterType
+pyg_filter_init_filterfuncs (FilterFuncs *filters, FilterType type);
+
 void
 pyg_filter_shrink_X_C (Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
     int dstpitch, int srcwidth, int dstwidth);
 
 void
-pyg_filter_shrink_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
-    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
-
-void
 pyg_filter_shrink_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     int dstpitch, int srcheight, int dstheight);
 
 void
-pyg_filter_shrink_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
-    int dstpitch, int srcheight, int dstheight);
-
-void
 pyg_filter_expand_X_C (Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
     int dstpitch, int srcwidth, int dstwidth);
 
 void
-pyg_filter_expand_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
+pyg_filter_expand_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+    int dstpitch, int srcheight, int dstheight);
+
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#define FILTERS_SUPPORT_MMX
+#define FILTERS_SUPPORT_SSE
+
+void
+pyg_filter_shrink_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
     int srcpitch, int dstpitch, int srcwidth, int dstwidth);
 
 void
-pyg_filter_expand_Y_C (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+pyg_filter_shrink_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     int dstpitch, int srcheight, int dstheight);
 
 void
+pyg_filter_expand_X_MMX (Uint8 *srcpix, Uint8 *dstpix, int height,
+    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void
 pyg_filter_expand_Y_MMX (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
     int dstpitch, int srcheight, int dstheight);
 
+void
+pyg_filter_shrink_X_SSE (Uint8 *srcpix, Uint8 *dstpix, int height,
+    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void
+pyg_filter_shrink_Y_SSE (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+    int dstpitch, int srcheight, int dstheight);
+
+void
+pyg_filter_expand_X_SSE (Uint8 *srcpix, Uint8 *dstpix, int height,
+    int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void
+pyg_filter_expand_Y_SSE (Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
+    int dstpitch, int srcheight, int dstheight);
+
+#endif /* __GNUC__ */
+
 #endif /* _PYGAME_FILTERS_H_ */

src/sdlext/filters_32.c

+/*
+  pygame - Python Game Library
+  Copyright (C) 2000-2001  Pete Shinners
+  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Library General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Library General Public License for more details.
+
+  You should have received a copy of the GNU Library General Public
+  License along with this library; if not, write to the Free
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  Pete Shinners
+  pete@shinners.org
+*/
+
+/* Pentium 32 bit SSE/MMX smoothscale filter routines
+ * These are written for compilation with GCC only.
+ *
+ * Apart from SDL's fixed-width integer typedefs (Uint8, Uint16), this
+ * file should not depend on anything but the C standard library.
+ */
+
+#if !defined(__GNUC__) || !defined(__i386__) || defined(__x86_64__)
+#error "Pygame2 build bug: should not be compiling this file!"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h> /* memset */
+
+#include <SDL.h>    /* Uint8, Uint16 */
+
+/* These functions implement an area-averaging shrinking filter in the X-dimension.
+ *
+ * Fixed point: 1.0 == 0x4000 (2.14).  'xspace' is the span of source
+ * pixels covered by one destination pixel, 'xrecip' its reciprocal used
+ * to normalise the per-pixel accumulator, and One64 replicates 0x4000
+ * into all four 16-bit lanes.
+ */
+void
+pyg_filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    /* row padding to skip after each line (pitch may exceed width*4) */
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* Plain MMX lacks an unsigned 16-bit multiply-high: the recurring
+     * "psraw $15 / pand / pmulhw / paddw" sequences below appear to
+     * emulate pmulhuw by adding sign-correction terms to the signed
+     * product — compare the SSE variant, which uses pmulhuw directly. */
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
+        " punpcklwd     %%mm2,      %%mm2;           "
+        " punpckldq     %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm3,      %%mm5;           "
+        " movq          %%mm3,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm5,      %%mm3;           "
+        " paddw         %%mm6,      %%mm3;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " movq          %%mm7,      %%mm5;           "
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm7,      %%mm6;           "
+        " pmulhw        %%mm7,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
+        : "%ecx","%edx"     /* clobbered */
+        );
+}
+
+/* SSE variant of pyg_filter_shrink_X_MMX: same area-average algorithm
+ * and fixed-point scheme (1.0 == 0x4000), but pshufw broadcasts a word
+ * across the register and pmulhuw performs the unsigned 16-bit
+ * multiply-high directly, avoiding the sign-fixup sequences of the
+ * plain-MMX version.  (The comment inside the asm template still says
+ * "MMX"; pshufw/pmulhuw are SSE extensions on MMX registers.) */
+void
+pyg_filter_shrink_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    /* row padding to skip after each line (pitch may exceed width*4) */
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
+        " pshufw    $0, %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " pmulhuw       %%mm7,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
+        : "%ecx","%edx"     /* clobbered */
+        );
+}
+
+/* These functions implement an area-averaging shrinking filter in the Y-dimension.
+ *
+ * Fixed point as in the X shrinker: 1.0 == 0x4000.  'yspace' is the
+ * span of source rows covered by one output row and 'yrecip' its
+ * reciprocal, used to normalise the accumulated line before output.
+ */
+void
+pyg_filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;   /* one output row of 16-bit-per-channel accumulators */
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    /* NOTE(review): memset requires <string.h>, which is not among this
+     * file's visible includes — confirm it is reached via another header. */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+
+    /* eax walks templine; the psraw/pand/pmulhw groups below appear to
+     * emulate an unsigned multiply-high with the signed pmulhw, as in
+     * the X-shrink MMX routine. */
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %2,      %%eax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%eax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%eax);           "
+        " add              $8,      %%eax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " punpcklwd     %%mm1,      %%mm1;           "
+        " punpckldq     %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm3,      %%mm0;           "
+        " movq          %%mm3,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm4,      %%mm2;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm0,      %%mm3;           "
+        " paddw         %%mm2,      %%mm3;           "
+        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm1,      %%mm2;           "
+        " pmulhw        %%mm1,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " movq          %%mm3,    (%%eax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%eax;           "
+        " movq          %%mm7,      %%mm0;           "
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm7,      %%mm2;           "
+        " pmulhw        %%mm7,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " pxor          %%mm0,      %%mm0;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
+        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
+          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
+        : "%ecx","%edx","%eax"           /* clobbered */
+        );
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+/* Shrink a 32bpp image vertically from srcheight to dstheight rows
+ * with an area-averaging filter.  Extended-MMX/SSE variant: relies on
+ * pshufw and pmulhuw, so it needs a CPU providing those instructions.
+ * width is the row width in pixels; srcpitch/dstpitch are the row
+ * strides in bytes.  Requires srcheight > dstheight so that
+ * yspace > 0x4000 (14.2 fixed point).  Returns silently if the
+ * accumulator line cannot be allocated.
+ * NOTE(review): this is the 32-bit x86 path - the asm keeps the
+ * accumulator pointer in %eax; a 64-bit twin follows in filters_64.c.
+ */
+void
+pyg_filter_shrink_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    /* dstpitch*2 bytes cover width pixels * 4 channels * sizeof(Uint16) */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %2,      %%eax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%eax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%eax);           "
+        " add              $8,      %%eax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
+        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " movq          %%mm3,    (%%eax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%eax;           "
+        " pmulhuw       %%mm7,      %%mm4;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
+        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
+          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
+        : "%ecx","%edx","%eax"           /* clobbered */
+        );
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+/* These functions implement a bilinear filter in the X-dimension.
+ */
+/* Expand a 32bpp image horizontally from srcwidth to dstwidth pixels
+ * with a bilinear filter (plain MMX variant).  height rows are
+ * processed in raster order; srcpitch/dstpitch are the row strides in
+ * bytes.  Per-column 8.8 fixed-point weights and starting source
+ * indices are precomputed into temporary arrays.  Returns silently if
+ * any temporary array cannot be allocated.
+ */
+void
+pyg_filter_expand_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int *xidx0, *xmult0, *xmult1;
+    int x, y;
+    int factorwidth = 8;
+    long long One64 = 0x0100010001000100ULL;
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == 0) return;
+    xmult0 = (int *) malloc(dstwidth * factorwidth);
+    xmult1 = (int *) malloc(dstwidth * factorwidth);
+    if (xmult0 == 0 || xmult1 == 0)
+    {
+        /* BUGFIX: bail out on allocation failure.  The original code
+         * fell through and wrote to the freed/NULL arrays below. */
+        free(xidx0);
+        free(xmult0); /* free(NULL) is a no-op, guards not needed */
+        free(xmult1);
+        return;
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++)
+    {
+        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        int xm0 = 0x100 - xm1;
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+        xmult1[x*2]   = xm1 | (xm1 << 16);
+        xmult1[x*2+1] = xm1 | (xm1 << 16);
+        xmult0[x*2]   = xm0 | (xm0 << 16);
+        xmult0[x*2+1] = xm0 | (xm0 << 16);
+    }
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++)
+    {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        int *xm0 = xmult0;
+        int *x0 = xidx0;
+        int width = dstwidth;
+        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
+             " pxor          %%mm0,      %%mm0;           "
+             " movq             %5,      %%mm7;           "
+             "1:                                          "
+             " movl           (%2),      %%eax;           " /* get xidx0[x] */
+             " add              $4,         %2;           "
+             " movq          %%mm7,      %%mm2;           "
+             " movq           (%0),      %%mm1;           " /* load mult0 */
+             " add              $8,         %0;           "
+             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
+             " movd   (%4,%%eax,4),      %%mm4;           "
+             " movd  4(%4,%%eax,4),      %%mm5;           "
+             " punpcklbw     %%mm0,      %%mm4;           "
+             " punpcklbw     %%mm0,      %%mm5;           "
+             " pmullw        %%mm1,      %%mm4;           "
+             " pmullw        %%mm2,      %%mm5;           "
+             " paddw         %%mm4,      %%mm5;           "
+             " psrlw            $8,      %%mm5;           "
+             " packuswb      %%mm0,      %%mm5;           "
+             " movd          %%mm5,       (%1);           "
+             " add              $4,         %1;           "
+             " decl             %3;                       "
+             " jne              1b;                       "
+             " emms;                                      "
+             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
+             : "S"(srcrow0), "m"(One64)    /* input */
+             : "%eax"            /* clobbered */
+             );
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult0);
+    free(xmult1);
+}
+
+/* Expand a 32bpp image horizontally from srcwidth to dstwidth pixels
+ * with a bilinear filter.  Same algorithm as the MMX variant; the
+ * inner loop body is identical here, only the entry point is kept
+ * separate so the backend switch can select it on SSE machines.
+ * Returns silently if any temporary array cannot be allocated.
+ */
+void
+pyg_filter_expand_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int *xidx0, *xmult0, *xmult1;
+    int x, y;
+    int factorwidth = 8;
+    long long One64 = 0x0100010001000100ULL;
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == 0) return;
+    xmult0 = (int *) malloc(dstwidth * factorwidth);
+    xmult1 = (int *) malloc(dstwidth * factorwidth);
+    if (xmult0 == 0 || xmult1 == 0)
+    {
+        /* BUGFIX: bail out on allocation failure.  The original code
+         * fell through and wrote to the freed/NULL arrays below. */
+        free(xidx0);
+        free(xmult0); /* free(NULL) is a no-op, guards not needed */
+        free(xmult1);
+        return;
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++)
+    {
+        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        int xm0 = 0x100 - xm1;
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+        xmult1[x*2]   = xm1 | (xm1 << 16);
+        xmult1[x*2+1] = xm1 | (xm1 << 16);
+        xmult0[x*2]   = xm0 | (xm0 << 16);
+        xmult0[x*2+1] = xm0 | (xm0 << 16);
+    }
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++)
+    {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        int *xm0 = xmult0;
+        int *x0 = xidx0;
+        int width = dstwidth;
+        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
+             " pxor          %%mm0,      %%mm0;           "
+             " movq             %5,      %%mm7;           "
+             "1:                                          "
+             " movl           (%2),      %%eax;           " /* get xidx0[x] */
+             " add              $4,         %2;           "
+             " movq          %%mm7,      %%mm2;           "
+             " movq           (%0),      %%mm1;           " /* load mult0 */
+             " add              $8,         %0;           "
+             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
+             " movd   (%4,%%eax,4),      %%mm4;           "
+             " movd  4(%4,%%eax,4),      %%mm5;           "
+             " punpcklbw     %%mm0,      %%mm4;           "
+             " punpcklbw     %%mm0,      %%mm5;           "
+             " pmullw        %%mm1,      %%mm4;           "
+             " pmullw        %%mm2,      %%mm5;           "
+             " paddw         %%mm4,      %%mm5;           "
+             " psrlw            $8,      %%mm5;           "
+             " packuswb      %%mm0,      %%mm5;           "
+             " movd          %%mm5,       (%1);           "
+             " add              $4,         %1;           "
+             " decl             %3;                       "
+             " jne              1b;                       "
+             " emms;                                      "
+             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
+             : "S"(srcrow0), "m"(One64)    /* input */
+             : "%eax"            /* clobbered */
+             );
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult0);
+    free(xmult1);
+}
+
+/* These functions implement a bilinear filter in the Y-dimension.
+ */
+/* Expand a 32bpp image vertically from srcheight to dstheight rows
+ * with a bilinear filter (plain MMX variant).  width is the row width
+ * in pixels; srcpitch/dstpitch are row strides in bytes.  Each output
+ * row blends the two neighbouring source rows with 8.8 fixed-point
+ * weights ymult0/ymult1; the punpcklwd/punpckldq pair broadcasts each
+ * weight to all four 16-bit lanes.
+ */
+void
+pyg_filter_expand_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    int y;
+
+    for (y = 0; y < dstheight; y++)
+    {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x0100 - ymult1;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
+             " movl          %5,      %%eax;                      "
+             " movd          %3,      %%mm1;                      "
+             " movd          %4,      %%mm2;                      "
+             " pxor       %%mm0,      %%mm0;                      "
+             " punpcklwd  %%mm1,      %%mm1;                      "
+             " punpckldq  %%mm1,      %%mm1;                      "
+             " punpcklwd  %%mm2,      %%mm2;                      "
+             " punpckldq  %%mm2,      %%mm2;                      "
+             "1:                                                  "
+             " movd        (%0),      %%mm4;                      "
+             " add           $4,         %0;                      "
+             " movd        (%1),      %%mm5;                      "
+             " add           $4,         %1;                      "
+             " punpcklbw  %%mm0,     %%mm4;                       "
+             " punpcklbw  %%mm0,     %%mm5;                       "
+             " pmullw     %%mm1,     %%mm4;                       "
+             " pmullw     %%mm2,     %%mm5;                       "
+             " paddw      %%mm4,     %%mm5;                       "
+             " psrlw         $8,     %%mm5;                       "
+             " packuswb   %%mm0,     %%mm5;                       "
+             " movd       %%mm5,      (%2);                       "
+             " add           $4,        %2;                       "
+             " decl       %%eax;                                  "
+             " jne           1b;                                  "
+             " emms;                                              "
+             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* read/write pointers */
+             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
+             : "%eax"        /* clobbered */
+             );
+    }
+}
+
+/* Expand a 32bpp image vertically from srcheight to dstheight rows
+ * with a bilinear filter.  Extended-MMX/SSE variant of
+ * pyg_filter_expand_Y_MMX: identical algorithm, but the two weight
+ * broadcasts use a single pshufw each instead of the
+ * punpcklwd/punpckldq pair.
+ */
+void
+pyg_filter_expand_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    int y;
+
+    for (y = 0; y < dstheight; y++)
+    {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x0100 - ymult1;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
+             " movl          %5,      %%eax;                      "
+             " movd          %3,      %%mm1;                      "
+             " movd          %4,      %%mm2;                      "
+             " pxor       %%mm0,      %%mm0;                      "
+             " pshufw      $0, %%mm1, %%mm1;                      "
+             " pshufw      $0, %%mm2, %%mm2;                      "
+             "1:                                                  "
+             " movd        (%0),      %%mm4;                      "
+             " add           $4,         %0;                      "
+             " movd        (%1),      %%mm5;                      "
+             " add           $4,         %1;                      "
+             " punpcklbw  %%mm0,     %%mm4;                       "
+             " punpcklbw  %%mm0,     %%mm5;                       "
+             " pmullw     %%mm1,     %%mm4;                       "
+             " pmullw     %%mm2,     %%mm5;                       "
+             " paddw      %%mm4,     %%mm5;                       "
+             " psrlw         $8,     %%mm5;                       "
+             " packuswb   %%mm0,     %%mm5;                       "
+             " movd       %%mm5,      (%2);                       "
+             " add           $4,        %2;                       "
+             " decl       %%eax;                                  "
+             " jne           1b;                                  "
+             " emms;                                              "
+             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* read/write pointers */
+             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
+             : "%eax"        /* clobbered */
+             );
+    }
+}

src/sdlext/filters_64.c

+/*
+  pygame - Python Game Library
+  Copyright (C) 2000-2001  Pete Shinners
+  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Library General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Library General Public License for more details.
+
+  You should have received a copy of the GNU Library General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  Pete Shinners
+  pete@shinners.org
+*/
+
+/* Pentium 64 bit SSE/MMX smoothscale routines
+ * These are written for compilation with GCC only.
+ *
+ * This file should not depend on anything but the C standard library.
+ */
+
+#if !defined(__GNUC__) || !defined(__x86_64__)
+#error "Pygame2 build bug: should not be compiling this file!"
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>  /* memset, used by the Y-shrink filters */
+
+/* These functions implement an area-averaging shrinking filter in the X-dimension.
+ */
+/* Shrink a 32bpp image horizontally from srcwidth to dstwidth pixels
+ * with an area-averaging filter (plain-MMX variant, x86_64 build).
+ * height rows are processed; srcpitch/dstpitch are row strides in
+ * bytes.  Requires srcwidth > dstwidth so xspace > 0x4000 (14.2 fixed
+ * point); xrecip is the reciprocal used for the final normalisation.
+ */
+void
+pyg_filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+    /* 64-bit copies: the asm adds these to 64-bit pointer registers
+     * with "add %7, %0", which reads a full quadword from memory */
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
+        " punpcklwd     %%mm2,      %%mm2;           "
+        " punpckldq     %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        /* plain MMX has no pmulhuw: each psraw/pand/pmulhw/paddw
+         * sequence below emulates an unsigned 16-bit high multiply */
+        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm3,      %%mm5;           "
+        " movq          %%mm3,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm4,      %%mm6;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm5,      %%mm3;           "
+        " paddw         %%mm6,      %%mm3;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " movq          %%mm7,      %%mm5;           "
+        " psraw           $15,      %%mm5;           "
+        " pand          %%mm2,      %%mm5;           "
+        " movq          %%mm2,      %%mm6;           "
+        " psraw           $15,      %%mm6;           "
+        " pand          %%mm7,      %%mm6;           "
+        " pmulhw        %%mm7,      %%mm2;           "
+        " paddw         %%mm5,      %%mm2;           "
+        " paddw         %%mm6,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
+        : "%ecx","%edx"               /* clobbered */
+        );
+}
+
+/* Shrink a 32bpp image horizontally from srcwidth to dstwidth pixels
+ * with an area-averaging filter (x86_64 build).  Extended-MMX/SSE
+ * variant of pyg_filter_shrink_X_MMX: pshufw broadcasts the weights
+ * and pmulhuw replaces the signed-multiply emulation sequences.
+ * Requires srcwidth > dstwidth so xspace > 0x4000.
+ */
+void
+pyg_filter_shrink_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = 0x40000000 / xspace;
+    long long One64 = 0x4000400040004000ULL;
+    /* 64-bit copies: the asm adds these to 64-bit pointer registers
+     * with "add %7, %0", which reads a full quadword from memory */
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
+        " pshufw    $0, %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " pmulhuw       %%mm7,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
+        : "%ecx","%edx"               /* clobbered */
+        );
+}
+
+/* These functions implement an area-averaging shrinking filter in the Y-dimension.
+ */
+/* Shrink a 32bpp image vertically from srcheight to dstheight rows
+ * with an area-averaging filter (plain-MMX variant, x86_64 build).
+ * width is the row width in pixels; srcpitch/dstpitch are row strides
+ * in bytes.  Requires srcheight > dstheight so yspace > 0x4000.
+ * Returns silently if the accumulator line cannot be allocated.
+ * The accumulator pointer is kept in %rax (64-bit), unlike the
+ * 32-bit build which uses %eax.
+ */
+void
+pyg_filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    /* dstpitch*2 bytes cover width pixels * 4 channels * sizeof(Uint16) */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+    /* 64-bit copies: the asm adds these to 64-bit pointer registers */
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " punpcklwd     %%mm7,      %%mm7;           "
+        " punpckldq     %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " mov              %2,      %%rax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%rax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%rax);           "
+        " add              $8,      %%rax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " punpcklwd     %%mm1,      %%mm1;           "
+        " punpckldq     %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        /* plain MMX has no pmulhuw: each psraw/pand/pmulhw/paddw
+         * sequence below emulates an unsigned 16-bit high multiply
+         * (mm0 is reused as scratch and re-zeroed before packuswb) */
+        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm3,      %%mm0;           "
+        " movq          %%mm3,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm4,      %%mm2;           "
+        " pmulhw        %%mm4,      %%mm3;           "
+        " paddw         %%mm0,      %%mm3;           "
+        " paddw         %%mm2,      %%mm3;           "
+        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm1,      %%mm2;           "
+        " pmulhw        %%mm1,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " movq          %%mm3,    (%%rax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%rax;           "
+        " movq          %%mm7,      %%mm0;           "
+        " psraw           $15,      %%mm0;           "
+        " pand          %%mm4,      %%mm0;           "
+        " movq          %%mm4,      %%mm2;           "
+        " psraw           $15,      %%mm2;           "
+        " pand          %%mm7,      %%mm2;           "
+        " pmulhw        %%mm7,      %%mm4;           "
+        " paddw         %%mm0,      %%mm4;           "
+        " paddw         %%mm2,      %%mm4;           "
+        " pxor          %%mm0,      %%mm0;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
+        : "m"(templine),"m"(srcheight), "m"(width),     "m"(yspace),  
+          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
+        : "%ecx","%edx","%rax"          /* clobbered */
+        );
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+void
+pyg_filter_shrink_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = 0x40000000 / yspace;
+    long long One64 = 0x4000400040004000ULL;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == 0) return;
+    memset(templine, 0, dstpitch * 2);
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " mov              %2,      %%rax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%rax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%rax);           "
+        " add              $8,      %%rax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
+        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " movq          %%mm3,    (%%rax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%rax;           "
+        " pmulhuw       %%mm7,      %%mm4;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        &qu