Commits

ecsv committed d33b39e

Convert from mixed line endings to LF-only line endings

  • Parent commits 5a2352a

Comments (0)
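
The commit message does not say which tool performed the conversion. A common route is to let Git normalize the tree itself (a "* text=auto eol=lf" rule in .gitattributes followed by "git add --renormalize ." on Git 2.16 or newer); the sketch below is a hypothetical standalone alternative, shown only to illustrate what the change amounts to: every CRLF or bare CR becomes a single LF.

// Hypothetical normalizer, not part of the commit: rewrites each file named
// on the command line so that CRLF and bare CR both become LF.
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>

static bool normalize_to_lf(const std::string &path)
{
  std::ifstream in(path, std::ios::binary);
  std::string data((std::istreambuf_iterator<char>(in)),
                   std::istreambuf_iterator<char>());
  std::string fixed;
  fixed.reserve(data.size());
  for (std::string::size_type i = 0; i < data.size(); ++i)
  {
    if (data[i] == '\r')
    {
      fixed.push_back('\n');                      // CR and CRLF both map to LF
      if (i + 1 < data.size() && data[i + 1] == '\n')
        ++i;                                      // swallow the LF of a CRLF pair
    }
    else
      fixed.push_back(data[i]);
  }
  if (fixed == data)
    return false;                                 // already LF-only
  std::ofstream out(path, std::ios::binary | std::ios::trunc);
  out.write(fixed.data(), static_cast<std::streamsize>(fixed.size()));
  return true;
}

int main(int argc, char **argv)
{
  for (int i = 1; i < argc; ++i)
    if (normalize_to_lf(argv[i]))
      std::cout << "normalized " << argv[i] << '\n';
  return 0;
}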

Files changed (94)

-
-Microsoft Visual Studio Solution File, Format Version 9.00
-# Visual Studio 2005
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GlideHQ", "GlideHQ\MSVS\GlideHQ.vcproj", "{2DADDAA5-0F57-46ED-A974-747908DDC7F3}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BuildNo", "GlideHQ\MSVS\BuildNo.vcproj", "{4F7108EF-68E7-4BD3-9680-3957B9D2410F}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Glitch64", "Glitch64\MSVS\Glitch64.vcproj", "{462C2608-182F-452C-944F-AF201E19300B}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Glide64", "Glide64\MSVS\Glide64.vcproj", "{A4D13408-A794-4199-8FC7-4A9A32505005}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Debug|Win32.ActiveCfg = Debug|Win32
-		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Debug|Win32.Build.0 = Debug|Win32
-		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Release|Win32.ActiveCfg = Release|Win32
-		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Release|Win32.Build.0 = Release|Win32
-		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Debug|Win32.ActiveCfg = Debug|Win32
-		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Debug|Win32.Build.0 = Debug|Win32
-		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Release|Win32.ActiveCfg = Release|Win32
-		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Release|Win32.Build.0 = Release|Win32
-		{462C2608-182F-452C-944F-AF201E19300B}.Debug|Win32.ActiveCfg = Debug|Win32
-		{462C2608-182F-452C-944F-AF201E19300B}.Debug|Win32.Build.0 = Debug|Win32
-		{462C2608-182F-452C-944F-AF201E19300B}.Release|Win32.ActiveCfg = Release|Win32
-		{462C2608-182F-452C-944F-AF201E19300B}.Release|Win32.Build.0 = Release|Win32
-		{A4D13408-A794-4199-8FC7-4A9A32505005}.Debug|Win32.ActiveCfg = Debug|Win32
-		{A4D13408-A794-4199-8FC7-4A9A32505005}.Debug|Win32.Build.0 = Debug|Win32
-		{A4D13408-A794-4199-8FC7-4A9A32505005}.Release|Win32.ActiveCfg = Release|Win32
-		{A4D13408-A794-4199-8FC7-4A9A32505005}.Release|Win32.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
+
+Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual Studio 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GlideHQ", "GlideHQ\MSVS\GlideHQ.vcproj", "{2DADDAA5-0F57-46ED-A974-747908DDC7F3}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BuildNo", "GlideHQ\MSVS\BuildNo.vcproj", "{4F7108EF-68E7-4BD3-9680-3957B9D2410F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Glitch64", "Glitch64\MSVS\Glitch64.vcproj", "{462C2608-182F-452C-944F-AF201E19300B}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Glide64", "Glide64\MSVS\Glide64.vcproj", "{A4D13408-A794-4199-8FC7-4A9A32505005}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Debug|Win32.ActiveCfg = Debug|Win32
+		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Debug|Win32.Build.0 = Debug|Win32
+		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Release|Win32.ActiveCfg = Release|Win32
+		{2DADDAA5-0F57-46ED-A974-747908DDC7F3}.Release|Win32.Build.0 = Release|Win32
+		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Debug|Win32.ActiveCfg = Debug|Win32
+		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Debug|Win32.Build.0 = Debug|Win32
+		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Release|Win32.ActiveCfg = Release|Win32
+		{4F7108EF-68E7-4BD3-9680-3957B9D2410F}.Release|Win32.Build.0 = Release|Win32
+		{462C2608-182F-452C-944F-AF201E19300B}.Debug|Win32.ActiveCfg = Debug|Win32
+		{462C2608-182F-452C-944F-AF201E19300B}.Debug|Win32.Build.0 = Debug|Win32
+		{462C2608-182F-452C-944F-AF201E19300B}.Release|Win32.ActiveCfg = Release|Win32
+		{462C2608-182F-452C-944F-AF201E19300B}.Release|Win32.Build.0 = Release|Win32
+		{A4D13408-A794-4199-8FC7-4A9A32505005}.Debug|Win32.ActiveCfg = Debug|Win32
+		{A4D13408-A794-4199-8FC7-4A9A32505005}.Debug|Win32.Build.0 = Debug|Win32
+		{A4D13408-A794-4199-8FC7-4A9A32505005}.Release|Win32.ActiveCfg = Release|Win32
+		{A4D13408-A794-4199-8FC7-4A9A32505005}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

src/Glide64/3dmath.cpp

-/*
-* Glide64 - Glide video plugin for Nintendo 64 emulators.
-* Copyright (c) 2002  Dave2001
-* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
-//****************************************************************
-//
-// Glide64 - Glide Plugin for Nintendo 64 emulators
-// Project started on December 29th, 2001
-//
-// Authors:
-// Dave2001, original author, founded the project in 2001, left it in 2002
-// Gugaman, joined the project in 2002, left it in 2002
-// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-//
-//****************************************************************
-//
-// To modify Glide64:
-// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-//
-//****************************************************************
-
-#ifndef NOSSE
-#include <xmmintrin.h>
-#endif
-
-#include <math.h>
-#include "Gfx #1.3.h"
-#include "3dmath.h"
-
-void calc_light (VERTEX *v)
-{
-  float light_intensity = 0.0f;
-  register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
-  for (wxUint32 l=0; l<rdp.num_lights; l++)
-  {
-    light_intensity = DotProduct (rdp.light_vector[l], v->vec);
-    
-    if (light_intensity > 0.0f) 
-    {
-      color[0] += rdp.light[l].r * light_intensity;
-      color[1] += rdp.light[l].g * light_intensity;
-      color[2] += rdp.light[l].b * light_intensity;
-    }
-  }
-  
-  if (color[0] > 1.0f) color[0] = 1.0f;
-  if (color[1] > 1.0f) color[1] = 1.0f;
-  if (color[2] > 1.0f) color[2] = 1.0f;
-  
-  v->r = (wxUint8)(color[0]*255.0f);
-  v->g = (wxUint8)(color[1]*255.0f);
-  v->b = (wxUint8)(color[2]*255.0f);
-}
-
-//*
-void calc_linear (VERTEX *v)
-{
-  if (settings.force_calc_sphere)
-  {
-    calc_sphere(v);
-    return;
-  }
-  DECLAREALIGN16VAR(vec[3]);
-  
-  TransformVector (v->vec, vec, rdp.model);
-  //    TransformVector (v->vec, vec, rdp.combined);
-  NormalizeVector (vec);
-  float x, y;
-  if (!rdp.use_lookat)
-  {
-    x = vec[0];
-    y = vec[1];
-  }
-  else
-  {
-    x = DotProduct (rdp.lookat[0], vec);
-    y = DotProduct (rdp.lookat[1], vec);
-  }
-  
-  if (x > 1.0f)
-    x = 1.0f;
-  else if (x < -1.0f)
-    x = -1.0f;
-  if (y > 1.0f)
-    y = 1.0f;
-  else if (y < -1.0f)
-    y = -1.0f;
-  
-  if (rdp.cur_cache[0])
-  {
-    // scale >> 6 is size to map to
-    v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
-    v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
-  }
-  v->uv_scaled = 1;
-#ifdef EXTREME_LOGGING
-  FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
-#endif
-}
-
-void calc_sphere (VERTEX *v)
-{
-//  LRDP("calc_sphere\n");
-  DECLAREALIGN16VAR(vec[3]);
-  int s_scale, t_scale;
-  if (settings.hacks&hack_Chopper)
-  {
-    s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
-    t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
-  }
-  else
-  {
-    s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
-    t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
-  }
-  TransformVector (v->vec, vec, rdp.model);
-  //    TransformVector (v->vec, vec, rdp.combined);
-  NormalizeVector (vec);
-  float x, y;
-  if (!rdp.use_lookat)
-  {
-    x = vec[0];
-    y = vec[1];
-  }
-  else
-  {
-    x = DotProduct (rdp.lookat[0], vec);
-    y = DotProduct (rdp.lookat[1], vec);
-  }
-  v->ou = (x * 0.5f + 0.5f) * s_scale;
-  v->ov = (y * 0.5f + 0.5f) * t_scale;
-  v->uv_scaled = 1;
-#ifdef EXTREME_LOGGING
-  FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
-#endif
-}
-
-float DotProductC(register float *v1, register float *v2)
-{
-    register float result;
-    result = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
-    return(result);
-}
-
-void NormalizeVectorC(float *v)
-{
-    register float len;
-    len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
-    if (len > 0.0f)
-    {
-        v[0] /= len;
-        v[1] /= len;
-        v[2] /= len;
-    }
-}
-
-void TransformVectorC(float *src, float *dst, float mat[4][4])
-{
-  dst[0] = mat[0][0]*src[0] + mat[1][0]*src[1] + mat[2][0]*src[2];
-  dst[1] = mat[0][1]*src[0] + mat[1][1]*src[1] + mat[2][1]*src[2];
-  dst[2] = mat[0][2]*src[0] + mat[1][2]*src[1] + mat[2][2]*src[2];
-}
-
-void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
-{
-  dst[0] = mat[0][0]*src[0] + mat[0][1]*src[1] + mat[0][2]*src[2];
-  dst[1] = mat[1][0]*src[0] + mat[1][1]*src[1] + mat[1][2]*src[2];
-  dst[2] = mat[2][0]*src[0] + mat[2][1]*src[1] + mat[2][2]*src[2];
-}
-
-void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
-{
-  for (int i=0; i<4; i++)
-  {
-    for (int j=0; j<4; j++)
-    {
-      r[i][j] = m1[i][0] * m2[0][j] +
-                m1[i][1] * m2[1][j] +
-                m1[i][2] * m2[2][j] +
-                m1[i][3] * m2[3][j];
-    }
-  }
-}
-
-// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
-//                      and 3DNOW! 4x4 4x4 matrix multiplication
-// 2011-01-03 Balrog - removed because is in NASM format and not 64-bit compatible
-// This will need fixing.
-MULMATRIX MulMatrices = MulMatricesC;
-TRANSFORMVECTOR TransformVector = TransformVectorC;
-TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
-DOTPRODUCT DotProduct = DotProductC;
-NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
-
-void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
-{
-#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
-   /* [row][col]*/
-  typedef float v4sf __attribute__ ((vector_size (16)));
-  v4sf row0 = __builtin_ia32_loadups(m2[0]);
-  v4sf row1 = __builtin_ia32_loadups(m2[1]);
-  v4sf row2 = __builtin_ia32_loadups(m2[2]);
-  v4sf row3 = __builtin_ia32_loadups(m2[3]);
-
-  for (int i = 0; i < 4; ++i)
-  {
-    v4sf leftrow = __builtin_ia32_loadups(m1[i]);
-
-    // Fill tmp with four copies of leftrow[0]
-    v4sf tmp = leftrow;
-    tmp = _mm_shuffle_ps (tmp, tmp, 0);
-    // Calculate the four first summands
-    v4sf destrow = tmp * row0;
-
-    // Fill tmp with four copies of leftrow[1]
-    tmp = leftrow;
-    tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
-    destrow += tmp * row1;
-
-    // Fill tmp with four copies of leftrow[2]
-    tmp = leftrow;
-    tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
-    destrow += tmp * row2;
-
-    // Fill tmp with four copies of leftrow[3]
-    tmp = leftrow;
-    tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
-    destrow += tmp * row3;
-
-    __builtin_ia32_storeups(r[i], destrow);
-  }
- #elif !defined(NO_ASM) && !defined(NOSSE)
-  __asm
-  {
-    mov     eax, dword ptr [r]  
-      mov     ecx, dword ptr [m1]
-      mov     edx, dword ptr [m2]
-
-      movaps  xmm0,[edx]
-      movaps  xmm1,[edx+16]
-      movaps  xmm2,[edx+32]
-      movaps  xmm3,[edx+48]
-
-// r[0][0],r[0][1],r[0][2],r[0][3]
-
-      movaps  xmm4,xmmword ptr[ecx]
-      movaps  xmm5,xmm4
-      movaps  xmm6,xmm4
-      movaps  xmm7,xmm4
-
-      shufps  xmm4,xmm4,00000000b
-      shufps  xmm5,xmm5,01010101b
-      shufps  xmm6,xmm6,10101010b
-      shufps  xmm7,xmm7,11111111b
-
-      mulps   xmm4,xmm0
-      mulps   xmm5,xmm1
-      mulps   xmm6,xmm2
-      mulps   xmm7,xmm3
-
-      addps   xmm4,xmm5
-      addps   xmm4,xmm6
-      addps   xmm4,xmm7
-
-      movaps  xmmword ptr[eax],xmm4
-
-// r[1][0],r[1][1],r[1][2],r[1][3]
-
-      movaps  xmm4,xmmword ptr[ecx+16]
-      movaps  xmm5,xmm4
-      movaps  xmm6,xmm4
-      movaps  xmm7,xmm4
-
-      shufps  xmm4,xmm4,00000000b
-      shufps  xmm5,xmm5,01010101b
-      shufps  xmm6,xmm6,10101010b
-      shufps  xmm7,xmm7,11111111b
-
-      mulps   xmm4,xmm0
-      mulps   xmm5,xmm1
-      mulps   xmm6,xmm2
-      mulps   xmm7,xmm3
-
-      addps   xmm4,xmm5
-      addps   xmm4,xmm6
-      addps   xmm4,xmm7
-
-      movaps  xmmword ptr[eax+16],xmm4
-
-
-// r[2][0],r[2][1],r[2][2],r[2][3]
-
-      movaps  xmm4,xmmword ptr[ecx+32]
-      movaps  xmm5,xmm4
-      movaps  xmm6,xmm4
-      movaps  xmm7,xmm4
-
-      shufps  xmm4,xmm4,00000000b
-      shufps  xmm5,xmm5,01010101b
-      shufps  xmm6,xmm6,10101010b
-      shufps  xmm7,xmm7,11111111b
-
-      mulps   xmm4,xmm0
-      mulps   xmm5,xmm1
-      mulps   xmm6,xmm2
-      mulps   xmm7,xmm3
-
-      addps   xmm4,xmm5
-      addps   xmm4,xmm6
-      addps   xmm4,xmm7
-
-      movaps  xmmword ptr[eax+32],xmm4
-
-// r[3][0],r[3][1],r[3][2],r[3][3]
-
-      movaps  xmm4,xmmword ptr[ecx+48]
-      movaps  xmm5,xmm4
-      movaps  xmm6,xmm4
-      movaps  xmm7,xmm4
-
-      shufps  xmm4,xmm4,00000000b
-      shufps  xmm5,xmm5,01010101b
-      shufps  xmm6,xmm6,10101010b
-      shufps  xmm7,xmm7,11111111b
-
-      mulps   xmm4,xmm0
-      mulps   xmm5,xmm1
-      mulps   xmm6,xmm2
-      mulps   xmm7,xmm3
-
-      addps   xmm4,xmm5
-      addps   xmm4,xmm6
-      addps   xmm4,xmm7
-
-      movaps  xmmword ptr[eax+48],xmm4
-    }
-#endif // _WIN32
-  }
-
-
-
-  void math_init()
-  {
-#ifndef _DEBUG
-    int IsSSE = FALSE;
-    int edx, eax;
-#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
-    GLIDE64_TRY
-    {
-  #if defined(__x86_64__)
-      asm volatile(" cpuid;        "
-        : "=a"(eax), "=d"(edx)
-        : "0"(1)
-        : "rbx", "rcx"
-        );
-  #else
-      asm volatile(" push %%ebx;   "
-        " push %%ecx;   "
-        " cpuid;        "
-        " pop %%ecx;    "
-        " pop %%ebx;    "
-        : "=a"(eax), "=d"(edx)
-        : "0"(1)
-        :
-      );
-  #endif
-    }
-    GLIDE64_CATCH
-      { return; }
-    // Check for SSE
-    if (edx & (1 << 25))
-      IsSSE = TRUE;
-#elif !defined(NO_ASM) && !defined(NOSSE)
-    DWORD dwEdx;
-    __try
-    {
-      __asm 
-      {
-        mov  eax,1
-          cpuid
-          mov dwEdx,edx
-        }  
-      }
-      __except(EXCEPTION_EXECUTE_HANDLER)
-      {
-        return;
-      }
-
-      if (dwEdx & (1<<25)) 
-      {
-        if (dwEdx & (1<<24))
-        {      
-          __try
-          {
-            __asm xorps xmm0, xmm0
-              IsSSE = TRUE;
-          }
-          __except(EXCEPTION_EXECUTE_HANDLER)
-          {
-            return;
-          }
-        }
-      }
-#endif // _WIN32
-      if (IsSSE)
-      {
-        MulMatrices = MulMatricesSSE;
-        LOG("3DNOW! detected.\n");
-      }
-
-#endif //_DEBUG
-    }
+/*
+* Glide64 - Glide video plugin for Nintendo 64 emulators.
+* Copyright (c) 2002  Dave2001
+* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+//****************************************************************
+//
+// Glide64 - Glide Plugin for Nintendo 64 emulators
+// Project started on December 29th, 2001
+//
+// Authors:
+// Dave2001, original author, founded the project in 2001, left it in 2002
+// Gugaman, joined the project in 2002, left it in 2002
+// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
+// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
+//
+//****************************************************************
+//
+// To modify Glide64:
+// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
+// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
+//
+//****************************************************************
+
+#ifndef NOSSE
+#include <xmmintrin.h>
+#endif
+
+#include <math.h>
+#include "Gfx #1.3.h"
+#include "3dmath.h"
+
+void calc_light (VERTEX *v)
+{
+  float light_intensity = 0.0f;
+  register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
+  for (wxUint32 l=0; l<rdp.num_lights; l++)
+  {
+    light_intensity = DotProduct (rdp.light_vector[l], v->vec);
+    
+    if (light_intensity > 0.0f) 
+    {
+      color[0] += rdp.light[l].r * light_intensity;
+      color[1] += rdp.light[l].g * light_intensity;
+      color[2] += rdp.light[l].b * light_intensity;
+    }
+  }
+  
+  if (color[0] > 1.0f) color[0] = 1.0f;
+  if (color[1] > 1.0f) color[1] = 1.0f;
+  if (color[2] > 1.0f) color[2] = 1.0f;
+  
+  v->r = (wxUint8)(color[0]*255.0f);
+  v->g = (wxUint8)(color[1]*255.0f);
+  v->b = (wxUint8)(color[2]*255.0f);
+}
+
+//*
+void calc_linear (VERTEX *v)
+{
+  if (settings.force_calc_sphere)
+  {
+    calc_sphere(v);
+    return;
+  }
+  DECLAREALIGN16VAR(vec[3]);
+  
+  TransformVector (v->vec, vec, rdp.model);
+  //    TransformVector (v->vec, vec, rdp.combined);
+  NormalizeVector (vec);
+  float x, y;
+  if (!rdp.use_lookat)
+  {
+    x = vec[0];
+    y = vec[1];
+  }
+  else
+  {
+    x = DotProduct (rdp.lookat[0], vec);
+    y = DotProduct (rdp.lookat[1], vec);
+  }
+  
+  if (x > 1.0f)
+    x = 1.0f;
+  else if (x < -1.0f)
+    x = -1.0f;
+  if (y > 1.0f)
+    y = 1.0f;
+  else if (y < -1.0f)
+    y = -1.0f;
+  
+  if (rdp.cur_cache[0])
+  {
+    // scale >> 6 is size to map to
+    v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
+    v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
+  }
+  v->uv_scaled = 1;
+#ifdef EXTREME_LOGGING
+  FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
+#endif
+}
+
+void calc_sphere (VERTEX *v)
+{
+//  LRDP("calc_sphere\n");
+  DECLAREALIGN16VAR(vec[3]);
+  int s_scale, t_scale;
+  if (settings.hacks&hack_Chopper)
+  {
+    s_scale = min(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
+    t_scale = min(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
+  }
+  else
+  {
+    s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
+    t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
+  }
+  TransformVector (v->vec, vec, rdp.model);
+  //    TransformVector (v->vec, vec, rdp.combined);
+  NormalizeVector (vec);
+  float x, y;
+  if (!rdp.use_lookat)
+  {
+    x = vec[0];
+    y = vec[1];
+  }
+  else
+  {
+    x = DotProduct (rdp.lookat[0], vec);
+    y = DotProduct (rdp.lookat[1], vec);
+  }
+  v->ou = (x * 0.5f + 0.5f) * s_scale;
+  v->ov = (y * 0.5f + 0.5f) * t_scale;
+  v->uv_scaled = 1;
+#ifdef EXTREME_LOGGING
+  FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
+#endif
+}
+
+float DotProductC(register float *v1, register float *v2)
+{
+    register float result;
+    result = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
+    return(result);
+}
+
+void NormalizeVectorC(float *v)
+{
+    register float len;
+    len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
+    if (len > 0.0f)
+    {
+        v[0] /= len;
+        v[1] /= len;
+        v[2] /= len;
+    }
+}
+
+void TransformVectorC(float *src, float *dst, float mat[4][4])
+{
+  dst[0] = mat[0][0]*src[0] + mat[1][0]*src[1] + mat[2][0]*src[2];
+  dst[1] = mat[0][1]*src[0] + mat[1][1]*src[1] + mat[2][1]*src[2];
+  dst[2] = mat[0][2]*src[0] + mat[1][2]*src[1] + mat[2][2]*src[2];
+}
+
+void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
+{
+  dst[0] = mat[0][0]*src[0] + mat[0][1]*src[1] + mat[0][2]*src[2];
+  dst[1] = mat[1][0]*src[0] + mat[1][1]*src[1] + mat[1][2]*src[2];
+  dst[2] = mat[2][0]*src[0] + mat[2][1]*src[1] + mat[2][2]*src[2];
+}
+
+void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
+{
+  for (int i=0; i<4; i++)
+  {
+    for (int j=0; j<4; j++)
+    {
+      r[i][j] = m1[i][0] * m2[0][j] +
+                m1[i][1] * m2[1][j] +
+                m1[i][2] * m2[2][j] +
+                m1[i][3] * m2[3][j];
+    }
+  }
+}
+
+// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
+//                      and 3DNOW! 4x4 4x4 matrix multiplication
+// 2011-01-03 Balrog - removed because is in NASM format and not 64-bit compatible
+// This will need fixing.
+MULMATRIX MulMatrices = MulMatricesC;
+TRANSFORMVECTOR TransformVector = TransformVectorC;
+TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
+DOTPRODUCT DotProduct = DotProductC;
+NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
+
+void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
+{
+#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
+   /* [row][col]*/
+  typedef float v4sf __attribute__ ((vector_size (16)));
+  v4sf row0 = __builtin_ia32_loadups(m2[0]);
+  v4sf row1 = __builtin_ia32_loadups(m2[1]);
+  v4sf row2 = __builtin_ia32_loadups(m2[2]);
+  v4sf row3 = __builtin_ia32_loadups(m2[3]);
+
+  for (int i = 0; i < 4; ++i)
+  {
+    v4sf leftrow = __builtin_ia32_loadups(m1[i]);
+
+    // Fill tmp with four copies of leftrow[0]
+    v4sf tmp = leftrow;
+    tmp = _mm_shuffle_ps (tmp, tmp, 0);
+    // Calculate the four first summands
+    v4sf destrow = tmp * row0;
+
+    // Fill tmp with four copies of leftrow[1]
+    tmp = leftrow;
+    tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
+    destrow += tmp * row1;
+
+    // Fill tmp with four copies of leftrow[2]
+    tmp = leftrow;
+    tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
+    destrow += tmp * row2;
+
+    // Fill tmp with four copies of leftrow[3]
+    tmp = leftrow;
+    tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
+    destrow += tmp * row3;
+
+    __builtin_ia32_storeups(r[i], destrow);
+  }
+ #elif !defined(NO_ASM) && !defined(NOSSE)
+  __asm
+  {
+    mov     eax, dword ptr [r]  
+      mov     ecx, dword ptr [m1]
+      mov     edx, dword ptr [m2]
+
+      movaps  xmm0,[edx]
+      movaps  xmm1,[edx+16]
+      movaps  xmm2,[edx+32]
+      movaps  xmm3,[edx+48]
+
+// r[0][0],r[0][1],r[0][2],r[0][3]
+
+      movaps  xmm4,xmmword ptr[ecx]
+      movaps  xmm5,xmm4
+      movaps  xmm6,xmm4
+      movaps  xmm7,xmm4
+
+      shufps  xmm4,xmm4,00000000b
+      shufps  xmm5,xmm5,01010101b
+      shufps  xmm6,xmm6,10101010b
+      shufps  xmm7,xmm7,11111111b
+
+      mulps   xmm4,xmm0
+      mulps   xmm5,xmm1
+      mulps   xmm6,xmm2
+      mulps   xmm7,xmm3
+
+      addps   xmm4,xmm5
+      addps   xmm4,xmm6
+      addps   xmm4,xmm7
+
+      movaps  xmmword ptr[eax],xmm4
+
+// r[1][0],r[1][1],r[1][2],r[1][3]
+
+      movaps  xmm4,xmmword ptr[ecx+16]
+      movaps  xmm5,xmm4
+      movaps  xmm6,xmm4
+      movaps  xmm7,xmm4
+
+      shufps  xmm4,xmm4,00000000b
+      shufps  xmm5,xmm5,01010101b
+      shufps  xmm6,xmm6,10101010b
+      shufps  xmm7,xmm7,11111111b
+
+      mulps   xmm4,xmm0
+      mulps   xmm5,xmm1
+      mulps   xmm6,xmm2
+      mulps   xmm7,xmm3
+
+      addps   xmm4,xmm5
+      addps   xmm4,xmm6
+      addps   xmm4,xmm7
+
+      movaps  xmmword ptr[eax+16],xmm4
+
+
+// r[2][0],r[2][1],r[2][2],r[2][3]
+
+      movaps  xmm4,xmmword ptr[ecx+32]
+      movaps  xmm5,xmm4
+      movaps  xmm6,xmm4
+      movaps  xmm7,xmm4
+
+      shufps  xmm4,xmm4,00000000b
+      shufps  xmm5,xmm5,01010101b
+      shufps  xmm6,xmm6,10101010b
+      shufps  xmm7,xmm7,11111111b
+
+      mulps   xmm4,xmm0
+      mulps   xmm5,xmm1
+      mulps   xmm6,xmm2
+      mulps   xmm7,xmm3
+
+      addps   xmm4,xmm5
+      addps   xmm4,xmm6
+      addps   xmm4,xmm7
+
+      movaps  xmmword ptr[eax+32],xmm4
+
+// r[3][0],r[3][1],r[3][2],r[3][3]
+
+      movaps  xmm4,xmmword ptr[ecx+48]
+      movaps  xmm5,xmm4
+      movaps  xmm6,xmm4
+      movaps  xmm7,xmm4
+
+      shufps  xmm4,xmm4,00000000b
+      shufps  xmm5,xmm5,01010101b
+      shufps  xmm6,xmm6,10101010b
+      shufps  xmm7,xmm7,11111111b
+
+      mulps   xmm4,xmm0
+      mulps   xmm5,xmm1
+      mulps   xmm6,xmm2
+      mulps   xmm7,xmm3
+
+      addps   xmm4,xmm5
+      addps   xmm4,xmm6
+      addps   xmm4,xmm7
+
+      movaps  xmmword ptr[eax+48],xmm4
+    }
+#endif // _WIN32
+  }
+
+
+
+  void math_init()
+  {
+#ifndef _DEBUG
+    int IsSSE = FALSE;
+    int edx, eax;
+#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
+    GLIDE64_TRY
+    {
+  #if defined(__x86_64__)
+      asm volatile(" cpuid;        "
+        : "=a"(eax), "=d"(edx)
+        : "0"(1)
+        : "rbx", "rcx"
+        );
+  #else
+      asm volatile(" push %%ebx;   "
+        " push %%ecx;   "
+        " cpuid;        "
+        " pop %%ecx;    "
+        " pop %%ebx;    "
+        : "=a"(eax), "=d"(edx)
+        : "0"(1)
+        :
+      );
+  #endif
+    }
+    GLIDE64_CATCH
+      { return; }
+    // Check for SSE
+    if (edx & (1 << 25))
+      IsSSE = TRUE;
+#elif !defined(NO_ASM) && !defined(NOSSE)
+    DWORD dwEdx;
+    __try
+    {
+      __asm 
+      {
+        mov  eax,1
+          cpuid
+          mov dwEdx,edx
+        }  
+      }
+      __except(EXCEPTION_EXECUTE_HANDLER)
+      {
+        return;
+      }
+
+      if (dwEdx & (1<<25)) 
+      {
+        if (dwEdx & (1<<24))
+        {      
+          __try
+          {
+            __asm xorps xmm0, xmm0
+              IsSSE = TRUE;
+          }
+          __except(EXCEPTION_EXECUTE_HANDLER)
+          {
+            return;
+          }
+        }
+      }
+#endif // _WIN32
+      if (IsSSE)
+      {
+        MulMatrices = MulMatricesSSE;
+        LOG("3DNOW! detected.\n");
+      }
+
+#endif //_DEBUG
+    }
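
The dispatch comments in this file note that the hand-written matrix-multiply assembly was dropped because it is "in NASM format and not 64-bit compatible" and "will need fixing". Below is a hedged sketch of one possible portable replacement, using the <xmmintrin.h> intrinsics the file already includes instead of GCC builtins or MSVC inline assembly; it is illustrative only and not code from the commit.

// Hypothetical portable SSE matrix multiply: same signature as MulMatricesC,
// so math_init() could assign it to MulMatrices on any SSE-capable target.
#include <xmmintrin.h>

void MulMatricesSSE_intrin(float m1[4][4], float m2[4][4], float r[4][4])
{
  // Load the four rows of the right-hand matrix once.
  __m128 row0 = _mm_loadu_ps(m2[0]);
  __m128 row1 = _mm_loadu_ps(m2[1]);
  __m128 row2 = _mm_loadu_ps(m2[2]);
  __m128 row3 = _mm_loadu_ps(m2[3]);

  for (int i = 0; i < 4; ++i)
  {
    __m128 left = _mm_loadu_ps(m1[i]);
    // r[i] = m1[i][0]*m2[0] + m1[i][1]*m2[1] + m1[i][2]*m2[2] + m1[i][3]*m2[3],
    // built by broadcasting each element of the left row across a register.
    __m128 acc = _mm_mul_ps(_mm_shuffle_ps(left, left, _MM_SHUFFLE(0, 0, 0, 0)), row0);
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(left, left, _MM_SHUFFLE(1, 1, 1, 1)), row1));
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(left, left, _MM_SHUFFLE(2, 2, 2, 2)), row2));
    acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(left, left, _MM_SHUFFLE(3, 3, 3, 3)), row3));
    _mm_storeu_ps(r[i], acc);
  }
}

Unlike the movaps-based inline assembly, the unaligned loads and stores here do not require 16-byte-aligned matrices.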

src/Glide64/3dmath.h

-/*
-* Glide64 - Glide video plugin for Nintendo 64 emulators.
-* Copyright (c) 2002  Dave2001
-* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
-*
-* This program is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License as published by
-* the Free Software Foundation; either version 2 of the License, or
-* any later version.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-*
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-
-//****************************************************************
-//
-// Glide64 - Glide Plugin for Nintendo 64 emulators
-// Project started on December 29th, 2001
-//
-// Authors:
-// Dave2001, original author, founded the project in 2001, left it in 2002
-// Gugaman, joined the project in 2002, left it in 2002
-// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-//
-//****************************************************************
-//
-// To modify Glide64:
-// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-//
-//****************************************************************
-
-void calc_light (VERTEX *v);
-void calc_linear (VERTEX *v);
-void calc_sphere (VERTEX *v);
-
-void math_init();
-
-typedef void (*MULMATRIX)(float m1[4][4],float m2[4][4],float r[4][4]); 
-extern MULMATRIX MulMatrices;
-typedef void (*TRANSFORMVECTOR)(float *src,float *dst,float mat[4][4]); 
-extern TRANSFORMVECTOR TransformVector;
-extern TRANSFORMVECTOR InverseTransformVector;
-typedef float (*DOTPRODUCT)(register float *v1, register float *v2);
-extern DOTPRODUCT DotProduct;
-typedef void (*NORMALIZEVECTOR)(float *v);
-extern NORMALIZEVECTOR NormalizeVector;
+/*
+* Glide64 - Glide video plugin for Nintendo 64 emulators.
+* Copyright (c) 2002  Dave2001
+* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+//****************************************************************
+//
+// Glide64 - Glide Plugin for Nintendo 64 emulators
+// Project started on December 29th, 2001
+//
+// Authors:
+// Dave2001, original author, founded the project in 2001, left it in 2002
+// Gugaman, joined the project in 2002, left it in 2002
+// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
+// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
+//
+//****************************************************************
+//
+// To modify Glide64:
+// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
+// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
+//
+//****************************************************************
+
+void calc_light (VERTEX *v);
+void calc_linear (VERTEX *v);
+void calc_sphere (VERTEX *v);
+
+void math_init();
+
+typedef void (*MULMATRIX)(float m1[4][4],float m2[4][4],float r[4][4]); 
+extern MULMATRIX MulMatrices;
+typedef void (*TRANSFORMVECTOR)(float *src,float *dst,float mat[4][4]); 
+extern TRANSFORMVECTOR TransformVector;
+extern TRANSFORMVECTOR InverseTransformVector;
+typedef float (*DOTPRODUCT)(register float *v1, register float *v2);
+extern DOTPRODUCT DotProduct;
+typedef void (*NORMALIZEVECTOR)(float *v);
+extern NORMALIZEVECTOR NormalizeVector;
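
For reference, a hypothetical caller (not part of this commit) showing how the dispatch pointers declared above are used once math_init() has run. It mirrors the include order of 3dmath.cpp and assumes C++11 alignas, since the inline-assembly SSE multiply loads its operands with the aligned movaps instruction.

#include "Gfx #1.3.h"   // provides VERTEX etc., as in 3dmath.cpp's own include order
#include "3dmath.h"

void example_usage()
{
  alignas(16) float model[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
  alignas(16) float proj[4][4]  = {{2,0,0,0},{0,2,0,0},{0,0,2,0},{0,0,0,1}};
  alignas(16) float combined[4][4];
  float normal[4] = {0.0f, 1.0f, 0.0f, 0.0f};
  float out[4];

  math_init();                          // selects the SSE multiply when the CPU supports it
  MulMatrices(model, proj, combined);   // combined = model * proj
  TransformVector(normal, out, combined);
  NormalizeVector(out);
  float d = DotProduct(normal, out);    // cosine of the angle between the two unit vectors
  (void)d;
}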

src/Glide64/3dmathSIMD.asm

-;/*
-;* Glide64 - Glide video plugin for Nintendo 64 emulators.
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-;*/
-;
-;****************************************************************
-;
-; Glide64 - Glide Plugin for Nintendo 64 emulators
-; Project started on December 29th, 2001
-;
-; Authors:
-; Dave2001, original author, founded the project in 2001, left it in 2002
-; Gugaman, joined the project in 2002, left it in 2002
-; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-;
-;****************************************************************
-;
-; To modify Glide64:
-; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-;
-;****************************************************************
-
-%include "inc/c32.mac"
-
-segment .text
-
-proc DetectSIMD
-      %$func    arg 
-      %$iedx    arg 
-      %$iecx    arg 
-      mov       eax,[ebp + %$func]
-      cpuid
-      mov       eax,[ebp + %$iedx]
-      mov       [eax],edx
-      mov       eax,[ebp + %$iecx]
-      mov       [eax],ecx
-endproc ;DetectSIMD
-
-;****************************************************************
-;
-;                     ******** SSE ********
-;
-;****************************************************************
-
-proc TransformVectorSSE
-CPU P3 
-      %$src     arg           ; float *src     
-      %$dst     arg           ; float *dst     
-      %$mat     arg           ; float mat[4][4]
-
-      mov       ecx,[ebp + %$src]
-      mov       eax,[ebp + %$dst]
-      mov       edx,[ebp + %$mat]
-                   
-      movss     xmm0,[ecx]    ; 0 0 0 src[0]
-      movss     xmm5,[edx]    ; 0 0 0 mat[0][0]
-      movhps    xmm5,[edx+4]  ; mat[0][2] mat[0][1] 0 mat[0][0]
-      shufps    xmm0,xmm0, 0  ; src[0] src[0] src[0] src[0]
-      movss     xmm1,[ecx+4]  ; 0 0 0 src[1]
-      movss     xmm3,[edx+16] ; 0 0 0 mat[1][0]
-      movhps    xmm3,[edx+20] ; mat[1][2] mat[1][1] 0 mat[1][0]
-      shufps    xmm1,xmm1, 0  ; src[1] src[1] src[1] src[1]
-      mulps     xmm0,xmm5     ; mat[0][2]*src[0] mat[0][1]*src[0] 0 mat[0][0]*src[0]
-      mulps     xmm1,xmm3     ; mat[1][2]*src[1] mat[1][1]*src[1] 0 mat[1][0]*src[1]
-      movss     xmm2,[ecx+8]  ; 0 0 0 src[2]
-      shufps    xmm2,xmm2, 0  ; src[2] src[2] src[2] src[2]
-      movss     xmm4,[edx+32] ; 0 0 0 mat[2][0]
-      movhps    xmm4,[edx+36] ; mat[2][2] mat[2][1] 0 mat[2][0]
-      addps     xmm0,xmm1     ; mat[0][2]*src[0]+mat[1][2]*src[1] mat[0][1]*src[0]+mat[1][1]*src[1] 0 mat[0][0]*src[0]+mat[1][0]*src[1]
-      mulps     xmm2,xmm4     ; mat[2][2]*src[2] mat[2][1]*src[2] 0 mat[2][0]*src[2]
-      addps     xmm0,xmm2     ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] 0 mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-      movss     [eax],xmm0    ; mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-      movhps    [eax+4],xmm0  ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2]
-
-endproc ;TransformVectorSSE
-
-proc MulMatricesSSE
-CPU P3 
-      %$m1      arg  ; float m1[4][4] 
-      %$m2      arg  ; float m2[4][4] 
-      %$r       arg  ; float r[4][4]  
-
-      mov       eax,[ebp + %$r]      
-      mov       ecx,[ebp + %$m1]
-      mov       edx,[ebp + %$m2]
-      
-      movaps    xmm0,[edx]
-      movaps    xmm1,[edx+16]
-      movaps    xmm2,[edx+32]
-      movaps    xmm3,[edx+48]
-      
-      ; r[0][0],r[0][1],r[0][2],r[0][3]
-      
-      movaps    xmm4,[ecx]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax],xmm4
-      
-      ; r[1][0],r[1][1],r[1][2],r[1][3]
-      
-      movaps    xmm4,[ecx+16]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax+16],xmm4
-      
-      
-      ; r[2][0],r[2][1],r[2][2],r[2][3]
-      
-      movaps    xmm4,[ecx+32]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax+32],xmm4
-      
-      ; r[3][0],r[3][1],r[3][2],r[3][3]
-      
-      movaps    xmm4,[ecx+48]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax+48],xmm4
-
-endproc ;MulMatricesSSE
-
-proc NormalizeVectorSSE
-CPU P3
-      %$v arg
-      
-      mov edx, [ebp + %$v]
-      movaps xmm0, [edx]      ; x y z 0
-      movaps xmm2, xmm0       ; x y z 0
-      mulps  xmm0, xmm0       ; x*x y*y z*z 0
-      movaps xmm1, xmm0       ; x*x y*y z*z 0
-      shufps xmm0, xmm1, 0x4e ; z*z 0 x*x y*y
-      addps  xmm0, xmm1       ; x*x+z*z y*y z*z+x*x y*y
-      movaps xmm1, xmm0       ; x*x+z*z y*y z*z+x*x y*y
-      shufps xmm1, xmm1, 0x11 ; y*y z*z+x*x y*y z*z+x*x
-      addps  xmm0, xmm1       ; x*x+z*z+y*y
-      rsqrtps xmm0, xmm0      ; 1.0/sqrt(x*x+z*z+y*y)
-      mulps  xmm2, xmm0       ; x/sqrt(x*x+z*z+y*y) y/sqrt(x*x+z*z+y*y) z/sqrt(x*x+z*z+y*y) 0
-      movaps [edx], xmm2
-      
-endproc ;NormalizeVectorSSE
-
-;****************************************************************
-;
-;                     ******** SSE3 ********
-;
-;****************************************************************
-
-proc DotProductSSE3
-CPU PRESCOTT
-      %$v1 arg
-      %$v2 arg
-      
-      mov eax,[ebp + %$v1]
-      mov edx,[ebp + %$v2]
-      movaps xmm0, [eax]
-      mulps xmm0, [edx]
-      haddps xmm0, xmm0
-      haddps xmm0, xmm0
-;      movss eax, xmm0
-      
-endproc ;DotProductSSE3
-
-;****************************************************************
-;
-;                     ******** 3DNOW ********
-;
-;****************************************************************
-
-proc TransformVector3DNOW
-CPU 586
-      %$src       arg           ; float *src     
-      %$dst       arg           ; float *dst     
-      %$mat       arg           ; float mat[4][4]
-
-    femms
-      mov         ecx,[ebp + %$src]
-      mov         eax,[ebp + %$dst]
-      mov         edx,[ebp + %$mat]
-      movq        mm0,[ecx]     ; src[1] src[0]
-      movd        mm2,[ecx+8]   ; 0 src[2]
-      movq        mm1,mm0       ; src[1] src[0]
-      punpckldq   mm0,mm0       ; src[0] src[0]
-      punpckhdq   mm1,mm1       ; src[1] src[1]
-      punpckldq   mm2,mm2       ; src[2] src[2]
-      movq        mm3,mm0       ; src[0] src[0]
-      movq        mm4,mm1       ; src[1] src[1]
-      movq        mm5,mm2       ; src[2] src[2]
-      pfmul       mm0,[edx]     ; src[0]*mat[0][1] src[0]*mat[0][0]
-      pfmul       mm3,[edx+8]   ; 0 src[0]*mat[0][2]
-      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[1]*mat[1][0]
-      pfmul       mm4,[edx+24]  ; 0 src[1]*mat[1][2]
-      pfmul       mm2,[edx+32]  ; src[2]*mat[2][1] src[2]*mat[2][0]
-      pfmul       mm5,[edx+40]  ; 0 src[2]*mat[2][2]
-      pfadd       mm0,mm1       ; src[0]*mat[0][1]+src[1]*mat[1][1] src[0]*mat[0][0]+src[1]*mat[1][0]
-      pfadd       mm3,mm4       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]
-      pfadd       mm0,mm2       ; src[0]*mat[0][1]+src[1]*mat[1][1]+src[2]*mat[2][1] src[0]*mat[0][0]+src[1]*mat[1][0]+src[2]*mat[2][0]
-      pfadd       mm3,mm5       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]+src[2]*mat[2][2]
-      movq        [eax],mm0     ; mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-      movd        [eax+8],mm3   ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2]
-      femms
-
-endproc ;TransformVector3DNOW
-
-proc InverseTransformVector3DNOW
-CPU 586
-      %$src       arg           ; float *src       
-      %$dst       arg           ; float *dst       
-      %$mat       arg           ; float mat[4][4]  
-
-    femms
-      mov         ecx,[ebp + %$src]
-      mov         eax,[ebp + %$dst]
-      mov         edx,[ebp + %$mat]
-      movq        mm0,[ecx]     ; src[1] src[0]
-      movd        mm4,[ecx+8]   ; 0 src[2]
-      movq        mm1,mm0       ; src[1] src[0]
-      pfmul       mm0,[edx]     ; src[1]*mat[0][1] src[0]*mat[0][0]
-      movq        mm5,mm4       ; 0 src[2]
-      pfmul       mm4,[edx+8]   ; 0 src[2]*mat[0][2]
-      movq        mm2,mm1       ; src[1] src[0]
-      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[0]*mat[1][0]
-      movq        mm6,mm5       ; 0 src[2]
-      pfmul       mm5,[edx+24]  ; 0 src[2]*mat[1][2]
-      movq        mm3,mm2       ; src[1] src[0]
-      pfmul       mm2,[edx+32]  ; src[1]*mat[2][1] src[0]*mat[2][0]
-      movq        mm7,mm6       ; 0 src[2]
-      pfmul       mm6,[edx+40]  ; 0 src[2]*mat[2][2]
-      pfacc       mm0,mm4       ; src[2]*mat[0][2] src[1]*mat[0][1]+src[0]*mat[0][0]
-      pfacc       mm1,mm5       ; src[2]*mat[1][2] src[1]*mat[1][1]+src[0]*mat[1][0]
-      pfacc       mm2,mm6       ; src[2]*mat[2][2] src[1]*mat[2][1]+src[0]*mat[2][0]
-      pfacc       mm0,mm1       ; src[2]*mat[1][2]+src[1]*mat[1][1]+src[0]*mat[1][0] src[2]*mat[0][2]+src[1]*mat[0][1]+src[0]*mat[0][0]
-      pfacc       mm2,mm3       ; 0 src[2]*mat[2][2]+src[1]*mat[2][1]+src[0]*mat[2][0]
-      movq        [eax],mm0     ; mat[1][0]*src[0]+mat[1][1]*src[1]+mat[1][2]*src[2] mat[0][0]*src[0]+mat[0][1]*src[1]+mat[0][2]*src[2]
-      movd        [eax+8],mm2   ; mat[2][0]*src[0]+mat[2][1]*src[1]+mat[2][2]*src[2]
-      femms                    
-
-endproc ;InverseTransformVector3DNOW
-
-proc MulMatrices3DNOW
-CPU 586
-      %$m1        arg    ; float m1[4][4] 
-      %$m2        arg    ; float m2[4][4] 
-      %$r         arg    ; float r[4][4]  
-
-    femms
-      mov         ecx,[ebp + %$m1]
-      mov         eax,[ebp + %$r]
-      mov         edx,[ebp + %$m2]
-      
-      movq        mm0,[ecx]
-      movq        mm1,[ecx+8]
-      movq        mm4,[edx]
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      movq        mm0,[ecx+16]
-      pfadd       mm3,mm2
-      movq        mm1,[ecx+24]
-      movq        [eax],mm7
-      movq        mm4,[edx]
-      movq        [eax+8],mm3
-      
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      movq        mm0,[ecx+32]
-      pfadd       mm3,mm2
-      movq        mm1,[ecx+40]
-      movq        [eax+16],mm7
-      movq        mm4,[edx]
-      movq        [eax+24],mm3
-      
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      movq        mm0,[ecx+48]
-      pfadd       mm3,mm2
-      movq        mm1,[ecx+56]
-      movq        [eax+32],mm7
-      movq        mm4,[edx]
-      movq        [eax+40],mm3
-      
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      pfadd       mm3,mm2
-      movq        [eax+48],mm7
-      movq        [eax+56],mm3
-      femms
-
-endproc ;MulMatrices3DNOW
-
-proc DotProduct3DNOW
-CPU 586
-      %$v1        arg
-      %$v2        arg
-      
-      femms
-      mov         edx,[ebp + %$v1]
-      mov         eax,[ebp + %$v2]
-      movq        mm0,[edx]
-      movq        mm3,[eax]
-      pfmul       mm0,mm3
-      movq        mm2,[edx+8]
-      movq        mm1,[eax+8]
-      pfacc       mm0,mm0
-      pfmul       mm1,mm2
-      pfadd       mm0,mm1
-      movd        eax,mm0
-      femms
-
-endproc ;DotProduct3DNOW
-
-proc NormalizeVector3DNOW
-CPU 586
-      %$v          arg
-      
-      femms
-      mov          edx,[ebp + %$v]
-      movq         mm0,[edx]
-      movq         mm3,[edx+8]
-      movq         mm1,mm0
-      movq         mm2,mm3
-      pfmul        mm0,mm0
-      pfmul        mm3,mm3
-      pfacc        mm0,mm0
-      pfadd        mm0,mm3
-      ;movq mm4,mm0 ; prepare for 24bit precision
-      ;punpckldq mm4,mm4 ; prepare for 24bit precision
-      pfrsqrt      mm0,mm0 ; 15bit precision 1/sqrtf(v)
-      ;movq mm3,mm0
-      ;pfmul mm0,mm0
-      ;pfrsqit1 mm0,mm4
-      ;pfrcpit2 mm0,mm3 ; 24bit precision 1/sqrtf(v)
-      pfmul        mm1,mm0
-      pfmul        mm2,mm0
-      movq         [edx],mm1
-      movq         [edx+8],mm2
-      femms
-      
-endproc ;NormalizeVector3DNOW
+;/*
+;* Glide64 - Glide video plugin for Nintendo 64 emulators.
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+;*/
+;
+;****************************************************************
+;
+; Glide64 - Glide Plugin for Nintendo 64 emulators
+; Project started on December 29th, 2001
+;
+; Authors:
+; Dave2001, original author, founded the project in 2001, left it in 2002
+; Gugaman, joined the project in 2002, left it in 2002
+; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
+; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
+;
+;****************************************************************
+;
+; To modify Glide64:
+; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
+; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
+;
+;****************************************************************
+
+%include "inc/c32.mac"
+
+segment .text
+
+proc DetectSIMD
+      %$func    arg 
+      %$iedx    arg 
+      %$iecx    arg 
+      mov       eax,[ebp + %$func]
+      cpuid
+      mov       eax,[ebp + %$iedx]
+      mov       [eax],edx
+      mov       eax,[ebp + %$iecx]
+      mov       [eax],ecx
+endproc ;DetectSIMD
+
+;****************************************************************
+;
+;                     ******** SSE ********
+;
+;****************************************************************
+
+proc TransformVectorSSE
+CPU P3 
+      %$src     arg           ; float *src     
+      %$dst     arg           ; float *dst     
+      %$mat     arg           ; float mat[4][4]
+
+      mov       ecx,[ebp + %$src]
+      mov       eax,[ebp + %$dst]
+      mov       edx,[ebp + %$mat]
+                   
+      movss     xmm0,[ecx]    ; 0 0 0 src[0]
+      movss     xmm5,[edx]    ; 0 0 0 mat[0][0]
+      movhps    xmm5,[edx+4]  ; mat[0][2] mat[0][1] 0 mat[0][0]
+      shufps    xmm0,xmm0, 0  ; src[0] src[0] src[0] src[0]
+      movss     xmm1,[ecx+4]  ; 0 0 0 src[1]
+      movss     xmm3,[edx+16] ; 0 0 0 mat[1][0]
+      movhps    xmm3,[edx+20] ; mat[1][2] mat[1][1] 0 mat[1][0]
+      shufps    xmm1,xmm1, 0  ; src[1] src[1] src[1] src[1]
+      mulps     xmm0,xmm5     ; mat[0][2]*src[0] mat[0][1]*src[0] 0 mat[0][0]*src[0]
+      mulps     xmm1,xmm3     ; mat[1][2]*src[1] mat[1][1]*src[1] 0 mat[1][0]*src[1]
+      movss     xmm2,[ecx+8]  ; 0 0 0 src[2]
+      shufps    xmm2,xmm2, 0  ; src[2] src[2] src[2] src[2]
+      movss     xmm4,[edx+32] ; 0 0 0 mat[2][0]
+      movhps    xmm4,[edx+36] ; mat[2][2] mat[2][1] 0 mat[2][0]
+      addps     xmm0,xmm1     ; mat[0][2]*src[0]+mat[1][2]*src[1] mat[0][1]*src[0]+mat[1][1]*src[1] 0 mat[0][0]*src[0]+mat[1][0]*src[1]
+      mulps     xmm2,xmm4     ; mat[2][2]*src[2] mat[2][1]*src[2] 0 mat[2][0]*src[2]
+      addps     xmm0,xmm2     ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] 0 mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
+      movss     [eax],xmm0    ; mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
+      movhps    [eax+4],xmm0  ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2]
+
+endproc ;TransformVectorSSE
+
+proc MulMatricesSSE
+CPU P3 
+      %$m1      arg  ; float m1[4][4] 
+      %$m2      arg  ; float m2[4][4] 
+      %$r       arg  ; float r[4][4]  
+
+      mov       eax,[ebp + %$r]      
+      mov       ecx,[ebp + %$m1]
+      mov       edx,[ebp + %$m2]
+      
+      movaps    xmm0,[edx]
+      movaps    xmm1,[edx+16]
+      movaps    xmm2,[edx+32]
+      movaps    xmm3,[edx+48]
+      
+      ; r[0][0],r[0][1],r[0][2],r[0][3]
+      
+      movaps    xmm4,[ecx]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax],xmm4
+      
+      ; r[1][0],r[1][1],r[1][2],r[1][3]
+      
+      movaps    xmm4,[ecx+16]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax+16],xmm4
+      
+      
+      ; r[2][0],r[2][1],r[2][2],r[2][3]
+      
+      movaps    xmm4,[ecx+32]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax+32],xmm4
+      
+      ; r[3][0],r[3][1],r[3][2],r[3][3]
+      
+      movaps    xmm4,[ecx+48]
+      movaps    xmm5,xmm4
+      movaps    xmm6,xmm4
+      movaps    xmm7,xmm4
+      
+      shufps    xmm4,xmm4,00000000b
+      shufps    xmm5,xmm5,01010101b
+      shufps    xmm6,xmm6,10101010b
+      shufps    xmm7,xmm7,11111111b
+      
+      mulps     xmm4,xmm0
+      mulps     xmm5,xmm1
+      mulps     xmm6,xmm2
+      mulps     xmm7,xmm3
+      
+      addps     xmm4,xmm5
+      addps     xmm4,xmm6
+      addps     xmm4,xmm7
+      
+      movaps    [eax+48],xmm4
+
+endproc ;MulMatricesSSE
+
+proc NormalizeVectorSSE
+CPU P3
+      %$v arg
+      
+      mov edx, [ebp + %$v]
+      movaps xmm0, [edx]      ; x y z 0
+      movaps xmm2, xmm0       ; x y z 0
+      mulps  xmm0, xmm0       ; x*x y*y z*z 0
+      movaps xmm1, xmm0       ; x*x y*y z*z 0
+      shufps xmm0, xmm1, 0x4e ; z*z 0 x*x y*y
+      addps  xmm0, xmm1       ; x*x+z*z y*y z*z+x*x y*y
+      movaps xmm1, xmm0       ; x*x+z*z y*y z*z+x*x y*y
+      shufps xmm1, xmm1, 0x11 ; y*y z*z+x*x y*y z*z+x*x
+      addps  xmm0, xmm1       ; x*x+z*z+y*y
+      rsqrtps xmm0, xmm0      ; 1.0/sqrt(x*x+z*z+y*y)
+      mulps  xmm2, xmm0       ; x/sqrt(x*x+z*z+y*y) y/sqrt(x*x+z*z+y*y) z/sqrt(x*x+z*z+y*y) 0
+      movaps [edx], xmm2
+      
+endproc ;NormalizeVectorSSE
+
+;****************************************************************
+;
+;                     ******** SSE3 ********
+;
+;****************************************************************
+
+proc DotProductSSE3
+CPU PRESCOTT
+      %$v1 arg
+      %$v2 arg
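+      ; Illustrative comment, not part of the original source: computes
+      ;   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3]
+      ; and leaves the sum replicated in every lane of xmm0 (see the note on the
+      ; disabled movss below).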
+      
+      mov eax,[ebp + %$v1]
+      mov edx,[ebp + %$v2]
+      movaps xmm0, [eax]
+      mulps xmm0, [edx]
+      haddps xmm0, xmm0       ; pairwise horizontal add
+      haddps xmm0, xmm0       ; every lane of xmm0 now holds the full sum
+;      movss eax, xmm0        ; disabled: movss cannot write a general register; the result stays in xmm0
+      
+endproc ;DotProductSSE3
+
+;****************************************************************
+;
+;                     ******** 3DNOW ********
+;
+;****************************************************************
+
+proc TransformVector3DNOW
+CPU 586
+      %$src       arg           ; float *src     
+      %$dst       arg           ; float *dst     
+      %$mat       arg           ; float mat[4][4]
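+      ; Illustrative comment, not part of the original source: equivalent scalar form is
+      ;   dst[i] = mat[0][i]*src[0] + mat[1][i]*src[1] + mat[2][i]*src[2]   for i = 0..2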
+
+      femms
+      mov         ecx,[ebp + %$src]
+      mov         eax,[ebp + %$dst]
+      mov         edx,[ebp + %$mat]
+      movq        mm0,[ecx]     ; src[1] src[0]
+      movd        mm2,[ecx+8]   ; 0 src[2]
+      movq        mm1,mm0       ; src[1] src[0]
+      punpckldq   mm0,mm0       ; src[0] src[0]
+      punpckhdq   mm1,mm1       ; src[1] src[1]
+      punpckldq   mm2,mm2       ; src[2] src[2]
+      movq        mm3,mm0       ; src[0] src[0]
+      movq        mm4,mm1       ; src[1] src[1]
+      movq        mm5,mm2       ; src[2] src[2]
+      pfmul       mm0,[edx]     ; src[0]*mat[0][1] src[0]*mat[0][0]
+      pfmul       mm3,[edx+8]   ; 0 src[0]*mat[0][2]
+      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[1]*mat[1][0]
+      pfmul       mm4,[edx+24]  ; 0 src[1]*mat[1][2]
+      pfmul       mm2,[edx+32]  ; src[2]*mat[2][1] src[2]*mat[2][0]
+      pfmul       mm5,[edx+40]  ; 0 src[2]*mat[2][2]
+      pfadd       mm0,mm1       ; src[0]*mat[0][1]+src[1]*mat[1][1] src[0]*mat[0][0]+src[1]*mat[1][0]
+      pfadd       mm3,mm4       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]
+      pfadd       mm0,mm2       ; src[0]*mat[0][1]+src[1]*mat[1][1]+src[2]*mat[2][1] src[0]*mat[0][0]+src[1]*mat[1][0]+src[2]*mat[2][0]
+      pfadd       mm3,mm5       ; 0 src[0]*mat[0][2]+src[1]*mat[1][2]+src[2]*mat[2][2]
+      movq        [eax],mm0     ; mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
+      movd        [eax+8],mm3   ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2]
+      femms
+
+endproc ;TransformVector3DNOW
+
+proc InverseTransformVector3DNOW
+CPU 586
+      %$src       arg           ; float *src       
+      %$dst       arg           ; float *dst       
+      %$mat       arg           ; float mat[4][4]  
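+      ; Illustrative comment, not part of the original source: equivalent scalar form is
+      ;   dst[i] = mat[i][0]*src[0] + mat[i][1]*src[1] + mat[i][2]*src[2]   for i = 0..2
+      ; (the same transform as above, but with the matrix indices transposed).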
+
+      femms
+      mov         ecx,[ebp + %$src]
+      mov         eax,[ebp + %$dst]
+      mov         edx,[ebp + %$mat]
+      movq        mm0,[ecx]     ; src[1] src[0]
+      movd        mm4,[ecx+8]   ; 0 src[2]
+      movq        mm1,mm0       ; src[1] src[0]
+      pfmul       mm0,[edx]     ; src[1]*mat[0][1] src[0]*mat[0][0]
+      movq        mm5,mm4       ; 0 src[2]
+      pfmul       mm4,[edx+8]   ; 0 src[2]*mat[0][2]
+      movq        mm2,mm1       ; src[1] src[0]
+      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[0]*mat[1][0]
+      movq        mm6,mm5       ; 0 src[2]
+      pfmul       mm5,[edx+24]  ; 0 src[2]*mat[1][2]
+      movq        mm3,mm2       ; src[1] src[0]
+      pfmul       mm2,[edx+32]  ; src[1]*mat[2][1] src[0]*mat[2][0]
+      movq        mm7,mm6       ; 0 src[2]
+      pfmul       mm6,[edx+40]  ; 0 src[2]*mat[2][2]
+      pfacc       mm0,mm4       ; src[2]*mat[0][2] src[1]*mat[0][1]+src[0]*mat[0][0]
+      pfacc       mm1,mm5       ; src[2]*mat[1][2] src[1]*mat[1][1]+src[0]*mat[1][0]
+      pfacc       mm2,mm6       ; src[2]*mat[2][2] src[1]*mat[2][1]+src[0]*mat[2][0]
+      pfacc       mm0,mm1       ; src[2]*mat[1][2]+src[1]*mat[1][1]+src[0]*mat[1][0] src[2]*mat[0][2]+src[1]*mat[0][1]+src[0]*mat[0][0]
+      pfacc       mm2,mm3       ; src[1]+src[0] (never stored) src[2]*mat[2][2]+src[1]*mat[2][1]+src[0]*mat[2][0]
+      movq        [eax],mm0     ; mat[1][0]*src[0]+mat[1][1]*src[1]+mat[1][2]*src[2] mat[0][0]*src[0]+mat[0][1]*src[1]+mat[0][2]*src[2]
+      movd        [eax+8],mm2   ; mat[2][0]*src[0]+mat[2][1]*src[1]+mat[2][2]*src[2]
+      femms                    
+
+endproc ;InverseTransformVector3DNOW
+
+proc MulMatrices3DNOW
+CPU 586
+      %$m1        arg    ; float m1[4][4] 
+      %$m2        arg    ; float m2[4][4] 
+      %$r         arg    ; float r[4][4]  
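+      ; Illustrative comment, not part of the original source: computes the same
+      ; product as MulMatricesSSE above,
+      ;   r[i][j] = m1[i][0]*m2[0][j] + m1[i][1]*m2[1][j] + m1[i][2]*m2[2][j] + m1[i][3]*m2[3][j]
+      ; using 64-bit MMX/3DNow! registers; each of the four unrolled blocks below
+      ; produces one output row, stored as two quadwords.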
+
+      femms
+      mov         ecx,[ebp + %$m1]
+      mov         eax,[ebp + %$r]
+      mov         edx,[ebp + %$m2]
+      
+      movq        mm0,[ecx]     ; m1[0][1] m1[0][0]
+      movq        mm1,[ecx+8]   ; m1[0][3] m1[0][2]
+      movq        mm4,[edx]     ; m2[0][1] m2[0][0]
+      punpckhdq   mm2,mm0       ; m1[0][1] in the high half (low half is don't-care)
+      movq        mm5,[edx+16]  ; m2[1][1] m2[1][0]
+      punpckhdq   mm3,mm1       ; m1[0][3] in the high half (low half is don't-care)
+      movq        mm6,[edx+32]  ; m2[2][1] m2[2][0]
+      punpckldq   mm0,mm0       ; m1[0][0] m1[0][0]
+      punpckldq   mm1,mm1       ; m1[0][2] m1[0][2]
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      movq        mm0,[ecx+16]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+24]
+      movq        mm0,[ecx+16]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+24]
+      
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      movq        mm0,[ecx+32]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+40]
+      movq        [eax+16],mm7
+      movq        mm4,[edx]
+      movq        [eax+24],mm3
+      
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      movq        mm0,[ecx+48]
+      pfadd       mm3,mm2
+      movq        mm1,[ecx+56]
+      movq        [eax+32],mm7
+      movq        mm4,[edx]
+      movq        [eax+40],mm3
+      
+      punpckhdq   mm2,mm0
+      movq        mm5,[edx+16]
+      punpckhdq   mm3,mm1
+      movq        mm6,[edx+32]
+      punpckldq   mm0,mm0
+      punpckldq   mm1,mm1
+      pfmul       mm4,mm0
+      punpckhdq   mm2,mm2
+      pfmul       mm0,[edx+8]
+      movq        mm7,[edx+48]
+      pfmul       mm5,mm2
+      punpckhdq   mm3,mm3
+      pfmul       mm2,[edx+24]
+      pfmul       mm6,mm1
+      pfadd       mm5,mm4
+      pfmul       mm1,[edx+40]
+      pfadd       mm2,mm0
+      pfmul       mm7,mm3
+      pfadd       mm6,mm5
+      pfmul       mm3,[edx+56]
+      pfadd       mm2,mm1
+      pfadd       mm7,mm6
+      pfadd       mm3,mm2
+      movq        [eax+48],mm7
+      movq        [eax+56],mm3
+      femms
+
+endproc ;MulMatrices3DNOW
+
+proc DotProduct3DNOW
+CPU 586
+      %$v1        arg
+      %$v2        arg
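+      ; Illustrative comment, not part of the original source: returns
+      ;   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]
+      ; as the raw float bit pattern in eax (the fourth component never reaches the low dword).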
+      
+      femms
+      mov         edx,[ebp + %$v1]
+      mov         eax,[ebp + %$v2]
+      movq        mm0,[edx]     ; v1[1] v1[0]
+      movq        mm3,[eax]     ; v2[1] v2[0]
+      pfmul       mm0,mm3       ; v1[1]*v2[1] v1[0]*v2[0]
+      movq        mm2,[edx+8]   ; v1[3] v1[2]
+      movq        mm1,[eax+8]   ; v2[3] v2[2]
+      pfacc       mm0,mm0       ; v1[0]*v2[0]+v1[1]*v2[1] in both halves
+      pfmul       mm1,mm2       ; v1[3]*v2[3] v1[2]*v2[2]
+      pfadd       mm0,mm1       ; low half now holds v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]
+      movd        eax,mm0       ; return the low dword (bit pattern of the float result)
+      femms
+
+endproc ;DotProduct3DNOW
+
+proc NormalizeVector3DNOW
+CPU 586
+      %$v          arg
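+      ; Illustrative comment, not part of the original source: presumably the 3DNow!
+      ; counterpart of NormalizeVectorSSE above, i.e.
+      ;   len = sqrt(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);  v[i] /= len;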
+      
+      femms
+      mov          edx,[ebp + %$v]
+      movq         mm0,[edx]    ; v[1] v[0]
+      movq         mm3,[edx+8]  ; v[3] v[2]
+      movq         mm1,mm0      ; v[1] v[0]