Commits

illume committed 7149634

Added smoothscale function from Richard Goedeken.

Changed it to work with subversion pygame.
Also made it take a destination surface argument like the other scale
functions now do.
ones. As well as releasing the GIL during processing for multiple threads.

Comments (0)

Files changed (8)

 # BUG	= fixed a bug that was (or could have been) crashing
 #
 #
+#
+#Changes from Marcus for ref counting bug fixes and
+
+Jun 25, 2007
+    Removed QNX support from scrap module. Thanks Marcus!
+    Added smoothscale(with MMX!) function from Richard Goedeken
+
+Jun 27, 2007
+    Fixes from Marcus for ref counting bugs.
+    Also using METH_NOARGS for functions with no arguments.
+     Which should make some functions slightly faster.
+    Thanks Marcus, and thanks Campbell Barton for spotting them.
+
 May 30, 2007
     Fixed some documentation.  mask, scrap, font modules.
     Fixed the mask.set_at get_at functions to raise a nice exception.

docs/ref/index.html

 <li><a href="mask.html#pygame.Mask">pygame.Mask</a> - <font size=-1>pygame object for representing 2d bitmasks</font></li>
 <ul>
 <li></li>
+<li><a href="mask.html#Mask.from_surface">Mask.from_surface</a> - <font size=-1>Returns the size of the mask.</font></li>
 <li><a href="mask.html#Mask.get_at">Mask.get_at</a> - <font size=-1>Returns nonzero if the bit at (x,y) is set.</font></li>
 <li><a href="mask.html#Mask.get_size">Mask.get_size</a> - <font size=-1>Returns the size of the mask.</font></li>
 <li><a href="mask.html#Mask.overlap">Mask.overlap</a> - <font size=-1>Returns nonzero if the masks overlap with the given offset.</font></li>
 <li><a href="transform.html#pygame.transform.rotozoom">pygame.transform.rotozoom</a> - <font size=-1>filtered scale and rotation</font></li>
 <li><a href="transform.html#pygame.transform.scale">pygame.transform.scale</a> - <font size=-1>resize to new resolution</font></li>
 <li><a href="transform.html#pygame.transform.scale2x">pygame.transform.scale2x</a> - <font size=-1>specialized image doubler</font></li>
+<li><a href="transform.html#pygame.transform.smoothscale">pygame.transform.smoothscale</a> - <font size=-1>scale a surface to an arbitrary size smoothly</font></li>
 </ul>
 <li><a href="pygame.html#pygame.quit">pygame.quit</a> - <font size=-1>uninitialize all pygame modules</font></li>
 <li><a href="pygame.html#pygame.register_quit">pygame.register_quit</a> - <font size=-1>register a function to be called when pygame quits</font></li>

docs/ref/transform.html

   <tr><td><a href="transform.html#pygame.transform.rotate">pygame.transform.rotate</a> - <font size=-1>rotate an image</font></td><td>rotate an image</td></tr>
   <tr><td><a href="transform.html#pygame.transform.rotozoom">pygame.transform.rotozoom</a> - <font size=-1>filtered scale and rotation</font></td><td>filtered scale and rotation</td></tr>
   <tr><td><a href="transform.html#pygame.transform.scale2x">pygame.transform.scale2x</a> - <font size=-1>specialized image doubler</font></td><td>specialized image doubler</td></tr>
+  <tr><td><a href="transform.html#pygame.transform.smoothscale">pygame.transform.smoothscale</a> - <font size=-1>scale a surface to an arbitrary size smoothly</font></td><td>scale a surface to an arbitrary size smoothly</td></tr>
   <tr><td><a href="transform.html#pygame.transform.chop">pygame.transform.chop</a> - <font size=-1>remove interior area of an image</font></td><td>remove interior area of an image</td></tr>
 </table></small></ul>
 <p><tt>A</tt> Surface transform is an operation that moves or resizes the pixels. All these functions take a Surface to operate on and return a new Surface with the results. </p>
 <br></ul>
 
 
+<a name="pygame.transform.smoothscale">
+<big><b>pygame.transform.smoothscale</big></b><br><ul>
+  <i>scale a surface to an arbitrary size smoothly</i><br>
+  <tt>pygame.transform.smoothscale(Surface, (width, height)): return Surface</tt><br>
+<p>Uses one of two different algorithms for scaling each dimension of the input surface as required. For shrinkage, the output pixels are area averages of the colors they cover. For expansion, a bilinear filter is used. For the amd64 and i686 architectures, optimized <tt>MMX</tt> routines are included and will run much faster than other machine types. The size is a 2 number sequence for (width, height). This function only works for 24-bit or 32-bit surfaces. An exception will be thrown if the input surface bit depth is less than 24. </p>
+<p>New in pygame <tt>1.8</tt> </p>
+<!--COMMENTS:pygame.transform.smoothscale--> &nbsp;<br> 
+<br></ul>
+
+
 <a name="pygame.transform.chop">
 <big><b>pygame.transform.chop</big></b><br><ul>
   <i>remove interior area of an image</i><br>

examples/scaletest.py

+#!/usr/bin/env python
+
+import sys, time
+import pygame
+
+def main():
+    # check input parameters
+    if len(sys.argv) < 2:
+        print"Usage: %s ImageFile [-t] [-convert_alpha]" % sys.argv[0]
+        print"       [-t] = Run Speed Test\n"
+        print"       [-convert_alpha] = Use convert_alpha() on the surf.\n"
+        return
+    imagefile = sys.argv[1]
+    bSpeedTest = False
+    if len(sys.argv) > 2 and sys.argv[2] == '-t':
+        bSpeedTest = True
+    # initialize display
+    pygame.display.init()
+    # load background image
+    background = pygame.image.load(imagefile)
+    if "-convert_alpha" in sys.argv:
+        screen = pygame.display.set_mode((1024, 768), pygame.FULLSCREEN)
+        background = background.convert_alpha()
+
+    if bSpeedTest:
+        SpeedTest(background)
+        return
+    screen = pygame.display.set_mode((1024, 768), pygame.FULLSCREEN)
+    # start fullscreen mode
+    # turn off the mouse pointer
+    pygame.mouse.set_visible(0)
+    # main loop
+    bRunning = True
+    bUp = False
+    bDown = False
+    bLeft = False
+    bRight = False
+    cursize = [background.get_width(), background.get_height()]
+    while(bRunning):
+        image = pygame.transform.smoothscale(background, cursize)
+        imgpos = image.get_rect(centerx=512, centery=384)
+        screen.fill((255,255,255))
+        screen.blit(image, imgpos)
+        pygame.display.flip()
+        for event in pygame.event.get():
+            if event.type == pygame.QUIT or (event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE):
+                bRunning = False
+            if event.type == pygame.KEYDOWN:
+                if event.key == pygame.K_UP:    bUp = True
+                if event.key == pygame.K_DOWN:  bDown = True
+                if event.key == pygame.K_LEFT:  bLeft = True
+                if event.key == pygame.K_RIGHT: bRight = True
+            if event.type == pygame.KEYUP:
+                if event.key == pygame.K_UP:    bUp = False
+                if event.key == pygame.K_DOWN:  bDown = False
+                if event.key == pygame.K_LEFT:  bLeft = False
+                if event.key == pygame.K_RIGHT: bRight = False
+        if bUp:
+            cursize[1] -= 2
+            if cursize[1] < 1: cursize[1] = 1
+        if bDown:
+            cursize[1] += 2
+        if bLeft:
+            cursize[0] -= 2
+            if cursize[0] < 1: cursize[0] = 1
+        if bRight:
+            cursize[0] += 2
+
+
+def SpeedTest(image):
+    print "Smoothscale Speed Test - Image Size %s\n" % str(image.get_size())
+    imgsize = [image.get_width(), image.get_height()]
+    duration = 0.0
+    for i in range(128):
+        shrinkx = (imgsize[0] * i) / 128
+        shrinky = (imgsize[1] * i) / 128
+        start = time.time()
+        tempimg = pygame.transform.smoothscale(image, (shrinkx, shrinky))
+        duration += (time.time() - start)
+        del tempimg
+    print "Average smooth shrink time: %i milliseconds." % int((duration / 128) * 1000)
+    duration = 0
+    for i in range(128):
+        expandx = (imgsize[0] * (i + 129)) / 128
+        expandy = (imgsize[1] * (i + 129)) / 128
+        start = time.time()
+        tempimg = pygame.transform.smoothscale(image, (expandx, expandy))
+        duration += (time.time() - start)
+        del tempimg
+    print "Average smooth expand time: %i milliseconds." % int((duration / 128) * 1000)
+    duration = 0.0
+    for i in range(128):
+        shrinkx = (imgsize[0] * i) / 128
+        shrinky = (imgsize[1] * i) / 128
+        start = time.time()
+        tempimg = pygame.transform.scale(image, (shrinkx, shrinky))
+        duration += (time.time() - start)
+        del tempimg
+    print "Average jaggy shrink time: %i milliseconds." % int((duration / 128) * 1000)
+    duration = 0
+    for i in range(128):
+        expandx = (imgsize[0] * (i + 129)) / 128
+        expandy = (imgsize[1] * (i + 129)) / 128
+        start = time.time()
+        tempimg = pygame.transform.scale(image, (expandx, expandy))
+        duration += (time.time() - start)
+        del tempimg
+    print "Average jaggy expand time: %i milliseconds." % int((duration / 128) * 1000)
+
+
+
+if __name__ == '__main__':
+    main()
 
 #define DOC_PYGAMETRANSFORMSCALE2X "pygame.transform.scale2x(Surface, DestSurface = None): Surface\nspecialized image doubler"
 
+#define DOC_PYGAMETRANSFORMSMOOTHSCALE "pygame.transform.smoothscale(Surface, (width, height)): return Surface\nscale a surface to an arbitrary size smoothly"
+
 #define DOC_PYGAMETRANSFORMCHOP "pygame.transform.chop(Surface, rect): return Surface\nremove interior area of an image"
 
 /*
   pygame - Python Game Library
   Copyright (C) 2000-2001  Pete Shinners
+  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
  *  surface transformations for pygame
  */
 #include "pygame.h"
+#include <SDL_cpuinfo.h>
 #include "pygamedocs.h"
 #include <math.h>
 
     return PySurface_New (newsurf);
 }
 
+
+
+
+/*
+ * smooth scale functions.
+ */
+
+
+/* this function implements an area-averaging shrinking filter in the X-dimension */
+static void filter_shrink_X_ONLYC(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+    int x, y;
+
+    int xspace = 0x10000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = (int) ((long long) 0x100000000 / xspace);
+    for (y = 0; y < height; y++)
+    {
+        Uint16 accumulate[4] = {0,0,0,0};
+        int xcounter = xspace;
+        for (x = 0; x < srcwidth; x++)
+        {
+            if (xcounter > 0x10000)
+            {
+                accumulate[0] += (Uint16) *srcpix++;
+                accumulate[1] += (Uint16) *srcpix++;
+                accumulate[2] += (Uint16) *srcpix++;
+                accumulate[3] += (Uint16) *srcpix++;
+                xcounter -= 0x10000;
+            }
+            else
+            {
+                /* write out a destination pixel */
+                *dstpix++ = (Uint8) (((accumulate[0] + ((srcpix[0] * xcounter) >> 16)) * xrecip) >> 16);
+                *dstpix++ = (Uint8) (((accumulate[1] + ((srcpix[1] * xcounter) >> 16)) * xrecip) >> 16);
+                *dstpix++ = (Uint8) (((accumulate[2] + ((srcpix[2] * xcounter) >> 16)) * xrecip) >> 16);
+                *dstpix++ = (Uint8) (((accumulate[3] + ((srcpix[3] * xcounter) >> 16)) * xrecip) >> 16);
+                /* reload the accumulator with the remainder of this pixel */
+                int xfrac = 0x10000 - xcounter;
+                accumulate[0] = (Uint16) ((*srcpix++ * xfrac) >> 16);
+                accumulate[1] = (Uint16) ((*srcpix++ * xfrac) >> 16);
+                accumulate[2] = (Uint16) ((*srcpix++ * xfrac) >> 16);
+                accumulate[3] = (Uint16) ((*srcpix++ * xfrac) >> 16);
+                xcounter = xspace - xfrac;
+            }
+        }
+        srcpix += srcdiff;
+        dstpix += dstdiff;
+    }
+}
+
+/* this function implements an area-averaging shrinking filter in the X-dimension */
+static void filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int srcdiff = srcpitch - (srcwidth * 4);
+    int dstdiff = dstpitch - (dstwidth * 4);
+    int x, y;
+
+    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
+    int xrecip = (int) ((long long) 0x040000000 / xspace);
+    long long One64 = 0x4000400040004000;
+#if defined(__GNUC__) && defined(__x86_64__)
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
+        " pshufw    $0, %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " pmulhuw       %%mm7,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)  /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
+        : "%ecx","%edx"               /* clobbered */
+        );
+#elif defined(__GNUC__) && defined(__i386__)
+    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
+        " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %5,      %%ecx;           " /* ecx == xcounter */
+        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          " /* inner X-loop */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm2;           "
+        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              4f;                       "
+        "3:                                          " /* prepare to output a pixel */
+        " movd          %%ecx,      %%mm2;           "
+        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
+        " pshufw    $0, %%mm2,      %%mm2;           "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
+        " pmulhuw       %%mm7,      %%mm2;           "
+        " packuswb      %%mm0,      %%mm2;           "
+        " movd          %%mm2,       (%1);           "
+        " add              %5,      %%ecx;           "
+        " add              $4,         %1;           "
+        " subl        $0x4000,      %%ecx;           "
+        "4:                                          " /* tail of inner X-loop */
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)                   /* outputs */
+        : "m"(One64),   "m"(height), "m"(srcwidth),
+          "m"(xspace),  "m"(xrecip), "m"(srcdiff),  "m"(dstdiff)  /* input */
+        : "%ecx","%edx"     /* clobbered */
+        );
+#endif
+}
+
+/* this function implements an area-averaging shrinking filter in the Y-dimension */
+static void filter_shrink_Y_ONLYC(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int x, y;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == NULL) return;
+    memset(templine, 0, dstpitch * 2);
+
+    int yspace = 0x10000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = (int) ((long long) 0x100000000 / yspace);
+    int ycounter = yspace;
+    for (y = 0; y < srcheight; y++)
+    {
+        Uint16 *accumulate = templine;
+        if (ycounter > 0x10000)
+        {
+            for (x = 0; x < width; x++)
+            {
+                *accumulate++ += (Uint16) *srcpix++;
+                *accumulate++ += (Uint16) *srcpix++;
+                *accumulate++ += (Uint16) *srcpix++;
+                *accumulate++ += (Uint16) *srcpix++;
+            }
+            ycounter -= 0x10000;
+        }
+        else
+        {
+            /* write out a destination line */
+            for (x = 0; x < width; x++)
+            {
+                *dstpix++ = (Uint8) (((*accumulate++ + ((*srcpix++ * ycounter) >> 16)) * yrecip) >> 16);
+                *dstpix++ = (Uint8) (((*accumulate++ + ((*srcpix++ * ycounter) >> 16)) * yrecip) >> 16);
+                *dstpix++ = (Uint8) (((*accumulate++ + ((*srcpix++ * ycounter) >> 16)) * yrecip) >> 16);
+                *dstpix++ = (Uint8) (((*accumulate++ + ((*srcpix++ * ycounter) >> 16)) * yrecip) >> 16);
+            }
+            dstpix += dstdiff;
+            /* reload the accumulator with the remainder of this line */
+            accumulate = templine;
+            srcpix -= 4 * width;
+            int yfrac = 0x10000 - ycounter;
+            for (x = 0; x < width; x++)
+            {
+                *accumulate++ = (Uint16) ((*srcpix++ * yfrac) >> 16);
+                *accumulate++ = (Uint16) ((*srcpix++ * yfrac) >> 16);
+                *accumulate++ = (Uint16) ((*srcpix++ * yfrac) >> 16);
+                *accumulate++ = (Uint16) ((*srcpix++ * yfrac) >> 16);
+            }
+            ycounter = yspace - yfrac;
+        }
+        srcpix += srcdiff;
+    } /* for (int y = 0; y < srcheight; y++) */
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+/* this function implements an area-averaging shrinking filter in the Y-dimension */
+static void filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    Uint16 *templine;
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 4);
+    int x, y;
+
+    /* allocate and clear a memory area for storing the accumulator line */
+    templine = (Uint16 *) malloc(dstpitch * 2);
+    if (templine == NULL) return;
+    memset(templine, 0, dstpitch * 2);
+
+    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
+    int yrecip = (int) ((long long) 0x040000000 / yspace);
+    long long One64 = 0x4000400040004000;
+#if defined(__GNUC__) && defined(__x86_64__)
+    long long srcdiff64 = srcdiff;
+    long long dstdiff64 = dstdiff;
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " mov              %2,      %%rax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%rax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%rax);           "
+        " add              $8,      %%rax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
+        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " movq          %%mm3,    (%%rax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%rax;           "
+        " pmulhuw       %%mm7,      %%mm4;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
+        : "m"(templine),"m"(srcheight), "m"(width),     "m"(yspace),  
+          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
+        : "%ecx","%edx","%rax"          /* clobbered */
+        );
+#elif defined(__GNUC__) && defined(__i386__)
+    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
+        " movl             %5,      %%ecx;           " /* ecx == ycounter */
+        " pxor          %%mm0,      %%mm0;           "
+        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
+        " pshufw    $0, %%mm7,      %%mm7;           "
+        "1:                                          " /* outer Y-loop */
+        " movl             %2,      %%eax;           " /* rax == accumulate */
+        " cmpl        $0x4000,      %%ecx;           "
+        " jbe              3f;                       "
+        " movl             %4,      %%edx;           " /* edx == width */
+        "2:                                          "
+        " movd           (%0),      %%mm1;           "
+        " add              $4,         %0;           "
+        " movq        (%%eax),      %%mm2;           "
+        " punpcklbw     %%mm0,      %%mm1;           "
+        " paddw         %%mm1,      %%mm2;           "
+        " movq          %%mm2,    (%%eax);           "
+        " add              $8,      %%eax;           "
+        " decl          %%edx;                       "
+        " jne              2b;                       "
+        " subl        $0x4000,      %%ecx;           "
+        " jmp              6f;                       "
+        "3:                                          " /* prepare to output a line */
+        " movd          %%ecx,      %%mm1;           "
+        " movl             %4,      %%edx;           " /* edx = width */
+        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
+        " pshufw    $0, %%mm1,      %%mm1;           "
+        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
+        "4:                                          "
+        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
+        " add              $4,         %0;           "
+        " punpcklbw     %%mm0,      %%mm4;           "
+        " movq        (%%eax),      %%mm5;           " /* mm5 = accumulate */
+        " movq          %%mm6,      %%mm3;           "
+        " psllw            $2,      %%mm4;           "
+        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
+        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
+        " movq          %%mm3,    (%%eax);           "
+        " paddw         %%mm5,      %%mm4;           "
+        " add              $8,      %%eax;           "
+        " pmulhuw       %%mm7,      %%mm4;           "
+        " packuswb      %%mm0,      %%mm4;           "
+        " movd          %%mm4,       (%1);           "
+        " add              $4,         %1;           "
+        " decl          %%edx;                       "
+        " jne              4b;                       "
+        " add              %8,         %1;           " /* dstpix += dstdiff */
+        " addl             %5,      %%ecx;           "
+        " subl        $0x4000,      %%ecx;           "
+        "6:                                          " /* tail of outer Y-loop */
+        " add              %7,         %0;           " /* srcpix += srcdiff */
+        " decl             %3;                       "
+        " jne              1b;                       "
+        " emms;                                      "
+        : "+r"(srcpix),  "+r"(dstpix)     /* outputs */
+        : "m"(templine), "m"(srcheight), "m"(width),  "m"(yspace),
+          "m"(yrecip),   "m"(srcdiff),   "m"(dstdiff),"m"(One64)  /* input */
+        : "%ecx","%edx","%eax"           /* clobbered */
+        );
+
+#endif
+
+    /* free the temporary memory */
+    free(templine);
+}
+
+/* this function implements a bilinear filter in the X-dimension */
+static void filter_expand_X_ONLYC(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int dstdiff = dstpitch - (dstwidth * 4);
+    int *xidx0, *xmult0, *xmult1;
+    int x, y;
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == NULL) return;
+    int factorwidth = 4;
+    xmult0 = (int *) malloc(dstwidth * factorwidth);
+    xmult1 = (int *) malloc(dstwidth * factorwidth);
+    if (xmult0 == NULL || xmult1 == NULL)
+    {
+        free(xidx0);
+        if (xmult0) free(xmult0);
+        if (xmult1) free(xmult1);
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++)
+    {
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+        xmult1[x] = 0x10000 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        xmult0[x] = 0x10000 - xmult1[x];
+    }
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++)
+    {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        for (x = 0; x < dstwidth; x++)
+        {
+            Uint8 *src = srcrow0 + xidx0[x] * 4;
+            int xm0 = xmult0[x];
+            int xm1 = xmult1[x];
+            *dstpix++ = (Uint8) (((src[0] * xm0) + (src[4] * xm1)) >> 16);
+            *dstpix++ = (Uint8) (((src[1] * xm0) + (src[5] * xm1)) >> 16);
+            *dstpix++ = (Uint8) (((src[2] * xm0) + (src[6] * xm1)) >> 16);
+            *dstpix++ = (Uint8) (((src[3] * xm0) + (src[7] * xm1)) >> 16);
+        }
+        dstpix += dstdiff;
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult0);
+    free(xmult1);
+}
+
+/* this function implements a bilinear filter in the X-dimension */
+static void filter_expand_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
+{
+    int dstdiff = dstpitch - (dstwidth * 4);
+    int *xidx0, *xmult0, *xmult1;
+    int x, y;
+
+    /* Allocate memory for factors */
+    xidx0 = malloc(dstwidth * 4);
+    if (xidx0 == NULL) return;
+    int factorwidth = 8;
+    xmult0 = (int *) malloc(dstwidth * factorwidth);
+    xmult1 = (int *) malloc(dstwidth * factorwidth);
+    if (xmult0 == NULL || xmult1 == NULL)
+    {
+        free(xidx0);
+        if (xmult0) free(xmult0);
+        if (xmult1) free(xmult1);
+    }
+
+    /* Create multiplier factors and starting indices and put them in arrays */
+    for (x = 0; x < dstwidth; x++)
+    {
+        xidx0[x] = x * (srcwidth - 1) / dstwidth;
+        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
+        int xm0 = 0x100 - xm1;
+        xmult1[x*2]   = xm1 | (xm1 << 16);
+        xmult1[x*2+1] = xm1 | (xm1 << 16);
+        xmult0[x*2]   = xm0 | (xm0 << 16);
+        xmult0[x*2+1] = xm0 | (xm0 << 16);
+    }
+
+    /* Do the scaling in raster order so we don't trash the cache */
+    for (y = 0; y < height; y++)
+    {
+        Uint8 *srcrow0 = srcpix + y * srcpitch;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+        int *xm0 = xmult0;
+        int *xm1 = xmult1;
+        int *x0 = xidx0;
+#if defined(__GNUC__) && defined(__x86_64__)
+        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
+             " movl             %5,      %%ecx;           "
+             " pxor          %%mm0,      %%mm0;           "
+             "1:                                          "
+             " movsxl         (%3),      %%rax;           " /* get xidx0[x] */
+             " add              $4,         %3;           "
+             " movq           (%0),      %%mm1;           " /* load mult0 */
+             " add              $8,         %0;           "
+             " movq           (%1),      %%mm2;           " /* load mult1 */
+             " add              $8,         %1;           "
+             " movd   (%4,%%rax,4),      %%mm4;           "
+             " movd  4(%4,%%rax,4),      %%mm5;           "
+             " punpcklbw     %%mm0,      %%mm4;           "
+             " punpcklbw     %%mm0,      %%mm5;           "
+             " pmullw        %%mm1,      %%mm4;           "
+             " pmullw        %%mm2,      %%mm5;           "
+             " paddw         %%mm4,      %%mm5;           "
+             " psrlw            $8,      %%mm5;           "
+             " packuswb      %%mm0,      %%mm5;           "
+             " movd          %%mm5,       (%2);           "
+             " add              $4,         %2;           "
+             " decl          %%ecx;                       "
+             " jne              1b;                       "
+             " emms;                                      "
+             : "+r"(xm0),   "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
+             : "r"(srcrow0),"m"(dstwidth)  /* input */
+             : "%ecx","%rax"                /* clobbered */
+             );
+#elif defined(__GNUC__) && defined(__i386__)
+    	int width = dstwidth;
+    	long long One64 = 0x0100010001000100;
+        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
+             " pxor          %%mm0,      %%mm0;           "
+             " movq             %5,      %%mm7;           "
+             "1:                                          "
+             " movl           (%2),      %%eax;           " /* get xidx0[x] */
+             " add              $4,         %2;           "
+             " movq          %%mm7,      %%mm2;           "
+             " movq           (%0),      %%mm1;           " /* load mult0 */
+             " add              $8,         %0;           "
+             " psubw         %%mm1,      %%mm2;           " /* load mult1 */
+             " movd   (%4,%%eax,4),      %%mm4;           "
+             " movd  4(%4,%%eax,4),      %%mm5;           "
+             " punpcklbw     %%mm0,      %%mm4;           "
+             " punpcklbw     %%mm0,      %%mm5;           "
+             " pmullw        %%mm1,      %%mm4;           "
+             " pmullw        %%mm2,      %%mm5;           "
+             " paddw         %%mm4,      %%mm5;           "
+             " psrlw            $8,      %%mm5;           "
+             " packuswb      %%mm0,      %%mm5;           "
+             " movd          %%mm5,       (%1);           "
+             " add              $4,         %1;           "
+             " decl             %3;                       "
+             " jne              1b;                       "
+             " emms;                                      "
+             : "+r"(xm0),    "+r"(dstrow), "+r"(x0), "+m"(width)  /* outputs */
+             : "S"(srcrow0), "m"(One64)    /* input */
+             : "%eax"            /* clobbered */
+             );
+#endif
+    }
+
+    /* free memory */
+    free(xidx0);
+    free(xmult0);
+    free(xmult1);
+}
+
+/* this function implements a bilinear filter in the Y-dimension */
+static void filter_expand_Y_ONLYC(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    int dstdiff = dstpitch - (width * 4);
+    int x, y;
+
+    for (y = 0; y < dstheight; y++)
+    {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x10000 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x10000 - ymult1;
+        for (x = 0; x < width; x++)
+        {
+            *dstpix++ = (Uint8) (((*srcrow0++ * ymult0) + (*srcrow1++ * ymult1)) >> 16);
+            *dstpix++ = (Uint8) (((*srcrow0++ * ymult0) + (*srcrow1++ * ymult1)) >> 16);
+            *dstpix++ = (Uint8) (((*srcrow0++ * ymult0) + (*srcrow1++ * ymult1)) >> 16);
+            *dstpix++ = (Uint8) (((*srcrow0++ * ymult0) + (*srcrow1++ * ymult1)) >> 16);
+        }
+    }
+}
+
+/* this function implements a bilinear filter in the Y-dimension */
+static void filter_expand_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
+{
+    int dstdiff = dstpitch - (width * 4);
+    int x, y;
+
+    for (y = 0; y < dstheight; y++)
+    {
+        int yidx0 = y * (srcheight - 1) / dstheight;
+        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
+        Uint8 *srcrow1 = srcrow0 + srcpitch;
+        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
+        int ymult0 = 0x0100 - ymult1;
+        Uint8 *dstrow = dstpix + y * dstpitch;
+#if defined(__GNUC__) && defined(__x86_64__)
+        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
+             " movl          %5,      %%ecx;                      "
+             " movd          %3,      %%mm1;                      "
+             " movd          %4,      %%mm2;                      "
+             " pxor       %%mm0,      %%mm0;                      "
+             " pshufw      $0, %%mm1, %%mm1;                      "
+             " pshufw      $0, %%mm2, %%mm2;                      "
+             "1:                                                  "
+             " movd        (%0),      %%mm4;                      "
+             " add           $4,         %0;                      "
+             " movd        (%1),      %%mm5;                      "
+             " add           $4,         %1;                      "
+             " punpcklbw  %%mm0,      %%mm4;                      "
+             " punpcklbw  %%mm0,      %%mm5;                      "
+             " pmullw     %%mm1,      %%mm4;                      "
+             " pmullw     %%mm2,      %%mm5;                      "
+             " paddw      %%mm4,      %%mm5;                      "
+             " psrlw         $8,      %%mm5;                      "
+             " packuswb   %%mm0,      %%mm5;                      "
+             " movd       %%mm5,       (%2);                      "
+             " add           $4,         %2;                      "
+             " decl       %%ecx;                                  "
+             " jne           1b;                                  "
+             " emms;                                              "
+             : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow)   /* outputs */
+             : "m"(ymult0),   "m"(ymult1),   "m"(width)    /* input */
+             : "%ecx"         /* clobbered */
+             );
+#elif defined(__GNUC__) && defined(__i386__)
+        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
+             " movl          %5,      %%eax;                      "
+             " movd          %3,      %%mm1;                      "
+             " movd          %4,      %%mm2;                      "
+             " pxor       %%mm0,      %%mm0;                      "
+             " pshufw      $0, %%mm1, %%mm1;                      "
+             " pshufw      $0, %%mm2, %%mm2;                      "
+             "1:                                                  "
+             " movd        (%0),      %%mm4;                      "
+             " add           $4,         %0;                      "
+             " movd        (%1),      %%mm5;                      "
+             " add           $4,         %1;                      "
+             " punpcklbw  %%mm0,     %%mm4;                       "
+             " punpcklbw  %%mm0,     %%mm5;                       "
+             " pmullw     %%mm1,     %%mm4;                       "
+             " pmullw     %%mm2,     %%mm5;                       "
+             " paddw      %%mm4,     %%mm5;                       "
+             " psrlw         $8,     %%mm5;                       "
+             " packuswb   %%mm0,     %%mm5;                       "
+             " movd       %%mm5,      (%2);                       "
+             " add           $4,        %2;                       "
+             " decl       %%eax;                                  "
+             " jne           1b;                                  "
+             " emms;                                              "
+             : "+r"(srcrow0), "+r"(srcrow1),"+r"(dstrow)   /* no outputs */
+             : "m"(ymult0),   "m"(ymult1),  "m"(width)    /* input */
+             : "%eax"        /* clobbered */
+             );
+#endif
+    }
+}
+
+static void convert_24_32(Uint8 *srcpix, int srcpitch, Uint8 *dstpix, int dstpitch, int width, int height)
+{
+    int srcdiff = srcpitch - (width * 3);
+    int dstdiff = dstpitch - (width * 4);
+    int x, y;
+
+    for (y = 0; y < height; y++)
+    {
+        for (x = 0; x < width; x++)
+        {
+            *dstpix++ = *srcpix++;
+            *dstpix++ = *srcpix++;
+            *dstpix++ = *srcpix++;
+            *dstpix++ = 0xff;
+        }
+        srcpix += srcdiff;
+        dstpix += dstdiff;
+    }
+}
+
+static void convert_32_24(Uint8 *srcpix, int srcpitch, Uint8 *dstpix, int dstpitch, int width, int height)
+{
+    int srcdiff = srcpitch - (width * 4);
+    int dstdiff = dstpitch - (width * 3);
+    int x, y;
+
+    for (y = 0; y < height; y++)
+    {
+        for (x = 0; x < width; x++)
+        {
+            *dstpix++ = *srcpix++;
+            *dstpix++ = *srcpix++;
+            *dstpix++ = *srcpix++;
+            srcpix++;
+        }
+        srcpix += srcdiff;
+        dstpix += dstdiff;
+    }
+}
+
+static void scalesmooth(SDL_Surface *src, SDL_Surface *dst)
+{
+	Uint8* srcpix = (Uint8*)src->pixels;
+	Uint8* dstpix = (Uint8*)dst->pixels;
+    Uint8* dst32 = NULL;
+	int srcpitch = src->pitch;
+	int dstpitch = dst->pitch;
+
+	int srcwidth = src->w;
+	int srcheight = src->h;
+	int dstwidth = dst->w;
+    int dstheight = dst->h;
+
+    int bpp = src->format->BytesPerPixel;
+
+    /* convert to 32-bit if necessary */
+    if (bpp == 3)
+    {
+        int newpitch = srcwidth * 4;
+        Uint8 *newsrc = (Uint8 *) malloc(newpitch * srcheight);
+        if (!newsrc) return;
+        convert_24_32(srcpix, srcpitch, newsrc, newpitch, srcwidth, srcheight);
+        srcpix = newsrc;
+        srcpitch = newpitch;
+        /* create a destination buffer for the 32-bit result */
+        dstpitch = dstwidth << 2;
+        dst32 = (Uint8 *) malloc(dstpitch * dstheight);
+        if (dst32 == NULL)
+        {
+            free(dst32);
+            return;
+        }
+        dstpix = dst32;
+    }
+
+    /* Create a temporary processing buffer if we will be scaling both X and Y */
+    Uint8 *temppix = NULL;
+    int tempwidth=0, temppitch=0, tempheight=0;
+    if (srcwidth != dstwidth && srcheight != dstheight)
+    {
+        tempwidth = dstwidth;
+        temppitch = tempwidth << 2;
+        tempheight = srcheight;
+        temppix = (Uint8 *) malloc(temppitch * tempheight);
+        if (temppix == NULL)
+        {
+            if (bpp == 3)
+            {
+                free(srcpix);
+                free(dstpix);
+            }
+            return;
+        }
+    }
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) /* MMX routines will only compile in GCC */
+    if (SDL_HasMMX())
+    {
+        /* Start the filter by doing X-scaling */
+        if (dstwidth < srcwidth) /* shrink */
+        {
+            if (srcheight != dstheight)
+                filter_shrink_X_MMX(srcpix, temppix, srcheight, srcpitch, temppitch, srcwidth, dstwidth);
+            else
+                filter_shrink_X_MMX(srcpix, dstpix, srcheight, srcpitch, dstpitch, srcwidth, dstwidth);
+        }
+        else if (dstwidth > srcwidth) /* expand */
+        {
+            if (srcheight != dstheight)
+                filter_expand_X_MMX(srcpix, temppix, srcheight, srcpitch, temppitch, srcwidth, dstwidth);
+            else
+                filter_expand_X_MMX(srcpix, dstpix, srcheight, srcpitch, dstpitch, srcwidth, dstwidth);
+        }
+        /* Now do the Y scale */
+        if (dstheight < srcheight) /* shrink */
+        {
+            if (srcwidth != dstwidth)
+                filter_shrink_Y_MMX(temppix, dstpix, tempwidth, temppitch, dstpitch, srcheight, dstheight);
+            else
+                filter_shrink_Y_MMX(srcpix, dstpix, srcwidth, srcpitch, dstpitch, srcheight, dstheight);
+        }
+        else if (dstheight > srcheight)  /* expand */
+        {
+            if (srcwidth != dstwidth)
+                filter_expand_Y_MMX(temppix, dstpix, tempwidth, temppitch, dstpitch, srcheight, dstheight);
+            else
+                filter_expand_Y_MMX(srcpix, dstpix, srcwidth, srcpitch, dstpitch, srcheight, dstheight);
+        }
+    }
+    else
+#endif
+    { /* No MMX -- use the C versions */
+        /* Start the filter by doing X-scaling */
+        if (dstwidth < srcwidth) /* shrink */
+        {
+            if (srcheight != dstheight)
+                filter_shrink_X_ONLYC(srcpix, temppix, srcheight, srcpitch, temppitch, srcwidth, dstwidth);
+            else
+                filter_shrink_X_ONLYC(srcpix, dstpix, srcheight, srcpitch, dstpitch, srcwidth, dstwidth);
+        }
+        else if (dstwidth > srcwidth) /* expand */
+        {
+            if (srcheight != dstheight)
+                filter_expand_X_ONLYC(srcpix, temppix, srcheight, srcpitch, temppitch, srcwidth, dstwidth);
+            else
+                filter_expand_X_ONLYC(srcpix, dstpix, srcheight, srcpitch, dstpitch, srcwidth, dstwidth);
+        }
+        /* Now do the Y scale */
+        if (dstheight < srcheight) /* shrink */
+        {
+            if (srcwidth != dstwidth)
+                filter_shrink_Y_ONLYC(temppix, dstpix, tempwidth, temppitch, dstpitch, srcheight, dstheight);
+            else
+                filter_shrink_Y_ONLYC(srcpix, dstpix, srcwidth, srcpitch, dstpitch, srcheight, dstheight);
+        }
+        else if (dstheight > srcheight)  /* expand */
+        {
+            if (srcwidth != dstwidth)
+                filter_expand_Y_ONLYC(temppix, dstpix, tempwidth, temppitch, dstpitch, srcheight, dstheight);
+            else
+                filter_expand_Y_ONLYC(srcpix, dstpix, srcwidth, srcpitch, dstpitch, srcheight, dstheight);
+        }
+    }
+
+    /* Convert back to 24-bit if necessary */
+    if (bpp == 3)
+    {
+        convert_32_24(dst32, dstpitch, (Uint8*)dst->pixels, dst->pitch, dstwidth, dstheight);
+        free(dst32);
+        dst32 = NULL;
+        free(srcpix);
+        srcpix = NULL;
+    }
+    /* free temporary buffer if necessary */
+    if (temppix != NULL) free(temppix);
+
+}
+
+
+static PyObject* surf_scalesmooth(PyObject* self, PyObject* arg)
+{
+	PyObject *surfobj, *surfobj2;
+	SDL_Surface* surf, *newsurf;
+	int width, height, bpp;
+    surfobj2 = NULL;
+
+    /*get all the arguments*/
+    if (!PyArg_ParseTuple (arg, "O!(ii)|O!", &PySurface_Type, &surfobj, 
+                           &width, &height, &PySurface_Type, &surfobj2))
+        return NULL;
+
+    if (width < 0 || height < 0)
+        return RAISE (PyExc_ValueError, "Cannot scale to negative size");
+
+    surf = PySurface_AsSurface (surfobj);
+
+    bpp = surf->format->BytesPerPixel;
+    if(bpp < 3 || bpp > 4)
+		return RAISE(PyExc_ValueError, "Only 24-bit or 32-bit surfaces can be smoothly scaled");
+
+	
+    if (!surfobj2)
+    {
+        newsurf = newsurf_fromsurf (surf, width, height);
+        if (!newsurf)
+            return NULL;
+    }
+    else
+        newsurf = PySurface_AsSurface (surfobj2);
+
+    /* check to see if the size is twice as big. */
+    if (newsurf->w != width || newsurf->h != height)
+        return RAISE (PyExc_ValueError, 
+                      "Destination surface not the given width or height.");
+
+
+    if(((width * bpp + 3) >> 2) > newsurf->pitch)
+        return RAISE(PyExc_ValueError, "SDL Error: destination surface pitch not 4-byte aligned.");
+
+	
+    if(width && height)
+    {
+        SDL_LockSurface(newsurf);
+        PySurface_Lock(surfobj);
+        Py_BEGIN_ALLOW_THREADS;
+
+        /* handle trivial case */
+        if (surf->w == width && surf->h == height) {
+            int y;
+            for (y = 0; y < height; y++) {
+                memcpy(newsurf->pixels + y * newsurf->pitch, 
+                       surf->pixels + y * surf->pitch, width * bpp);
+            }
+        }
+        else {
+            scalesmooth(surf, newsurf);
+        }
+        Py_END_ALLOW_THREADS;
+
+        PySurface_Unlock(surfobj);
+        SDL_UnlockSurface(newsurf);
+    }
+
+    if (surfobj2)
+    {
+        Py_INCREF (surfobj2);
+        return surfobj2;
+    }
+    else
+        return PySurface_New (newsurf);
+
+}
+
+
+
+
+
+
+
+
 static PyMethodDef transform_builtins[] =
 {
     { "scale", surf_scale, METH_VARARGS, DOC_PYGAMETRANSFORMSCALE },
     { "rotozoom", surf_rotozoom, METH_VARARGS, DOC_PYGAMETRANSFORMROTOZOOM},
     { "chop", surf_chop, METH_VARARGS, DOC_PYGAMETRANSFORMCHOP },
     { "scale2x", surf_scale2x, METH_VARARGS, DOC_PYGAMETRANSFORMSCALE2X },
+    { "smoothscale", surf_scalesmooth, METH_VARARGS, DOC_PYGAMETRANSFORMSMOOTHSCALE },
     
     { NULL, NULL, 0, NULL }
 };

src/transform.doc

 
 
 
+smoothscale
+scale a surface to an arbitrary size smoothly
+pygame.transform.smoothscale(Surface, (width, height), DestSurface = None): return Surface
+
+Uses one of two different algorithms for scaling each
+dimension of the input surface as required.  For shrinkage,
+the output pixels are area averages of the colors they
+cover.  For expansion, a bilinear filter is used. For the
+amd64 and i686 architectures, optimized MMX routines are
+included and will run much faster than other machine types.
+The size is a 2 number sequence for (width, height). This
+function only works for 24-bit or 32-bit surfaces.  An
+exception will be thrown if the input surface bit depth is
+less than 24.
+
+New in pygame 1.8
+<END>
+
+
+
 chop
 remove interior area of an image
 pygame.transform.chop(Surface, rect): return Surface

test/transform_test.py

         # the wrong size surface is past in.  Should raise an error.
         self.assertRaises(ValueError, pygame.transform.scale, s, (33,64), s3)
 
+        if 1:
+            s = pygame.Surface((32,32))
+            s2 = pygame.transform.smoothscale(s, (64,64))
+            s3 = s2.copy()
 
+            s3 = pygame.transform.smoothscale(s, (64,64), s3)
+            pygame.transform.smoothscale(s, (64,64), s2)
+
+            # the wrong size surface is past in.  Should raise an error.
+            self.assertRaises(ValueError, pygame.transform.smoothscale, s, (33,64), s3)
 
 
 if __name__ == '__main__':