Commits

ecsv committed b59ee72

Remove switchable asm implementations used during the port to C

Comments (0)

Files changed (16)

projects/msvc10/mupen64plus-video-glide64mk2.vcxproj

     </ClCompile>
     <ClCompile Include="..\..\src\Glitch64\textures.cpp" />
   </ItemGroup>
-  <ItemGroup>
-    <None Include="..\..\src\Glide64\3dmathSIMD.asm" />
-    <None Include="..\..\src\Glide64\FixedPoint.asm" />
-    <None Include="..\..\src\Glide64\Texture.asm">
-      <FileType>Document</FileType>
-    </None>
-  </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets" />
-</Project>
+</Project>

projects/msvc10/mupen64plus-video-glide64mk2.vcxproj.filters

     <Filter Include="Glide64\Windows">
       <UniqueIdentifier>{c6ba9864-aee3-4434-954b-8c5679c2959c}</UniqueIdentifier>
     </Filter>
-    <Filter Include="asm">
-      <UniqueIdentifier>{b8d8c948-32b5-4ac7-b558-cd807470175c}</UniqueIdentifier>
-    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\src\Glitch64\geometry.cpp">
       <Filter>Glitch64</Filter>
     </ClCompile>
   </ItemGroup>
-  <ItemGroup>
-    <None Include="..\..\src\Glide64\3dmathSIMD.asm">
-      <Filter>asm</Filter>
-    </None>
-    <None Include="..\..\src\Glide64\FixedPoint.asm">
-      <Filter>asm</Filter>
-    </None>
-    <None Include="..\..\src\Glide64\Texture.asm">
-      <Filter>asm</Filter>
-    </None>
-  </ItemGroup>
-</Project>
+</Project>

projects/unix/Makefile

 OBJDIRS = $(dir $(OBJECTS))
 $(shell $(MKDIR) $(OBJDIRS))
 
-# development only support for using asm on i386 linux; the block is active only when OLD_ASM is set to 1
-ifeq ($(OLD_ASM),1)
-# Only Texture.asm is listed here; SOURCE is set with '=' (not '+=').
-SOURCE = $(SRCDIR)/Glide64/Texture.asm
-#CFLAGS += -DOLDASM_asmTextureCRC
-#CFLAGS += -DOLDASM_asmLoad4bCI
-#CFLAGS += -DOLDASM_asmLoad4bIAPal
-#CFLAGS += -DOLDASM_asmLoad4bIA
-#CFLAGS += -DOLDASM_asmLoad4bI
-#CFLAGS += -DOLDASM_asmLoad16bRGBA
-#CFLAGS += -DOLDASM_asmLoad16bIA
-#CFLAGS += -DOLDASM_asmLoad8bCI
-#CFLAGS += -DOLDASM_asmLoad8bIA8
-#CFLAGS += -DOLDASM_asmLoad8bIA4
-#CFLAGS += -DOLDASM_asmLoad8bI
-#CFLAGS += -DOLDASM_asmMirror8bS
-#CFLAGS += -DOLDASM_asmWrap8bS
-#CFLAGS += -DOLDASM_asmClamp8bS
-#CFLAGS += -DOLDASM_asmMirror16bS
-#CFLAGS += -DOLDASM_asmWrap16bS
-#CFLAGS += -DOLDASM_asmClamp16bS
-#CFLAGS += -DOLDASM_asmMirror32bS
-#CFLAGS += -DOLDASM_asmWrap32bS
-#CFLAGS += -DOLDASM_asmClamp32bS
-#CFLAGS += -DOLDASM_asmTexConv_ARGB1555_ARGB4444
-#CFLAGS += -DOLDASM_asmTexConv_AI88_ARGB4444
-#CFLAGS += -DOLDASM_asmTexConv_AI44_ARGB4444
-#CFLAGS += -DOLDASM_asmTexConv_A8_ARGB4444
-#CFLAGS += -DOLDASM_asmLoadBlock
-#CFLAGS += -DOLDASM_asmLoadTile
-# Each commented-out line above defines one OLDASM_<routine> macro; the C++ wrappers test these with #ifdef to call the asm routine instead of the C port.
-OBJECTS += $(patsubst $(SRCDIR)/%.asm, $(OBJDIR)/%.o, $(filter %.asm, $(SOURCE)))
-# Assembler setup: nasm producing 32-bit ELF objects (-felf), with the Glide64 source directory on the include path (presumably so "inc/c32.mac" resolves).
-AS          = nasm
-COMPILE.asm = $(AS) -O6 -felf -I$(SRCDIR)/Glide64/
-# Pattern rule: assemble $(SRCDIR)/<path>.asm into $(OBJDIR)/<path>.o.
-$(OBJDIR)/%.o: $(SRCDIR)/%.asm
-	$(COMPILE.asm) -o $@ $<
-
-endif
-
 # build targets
 TARGET = mupen64plus-video-glide64mk2$(POSTFIX).$(SO_EXTENSION)
 

src/Glide64/3dmathSIMD.asm

-;/*
-;* Glide64 - Glide video plugin for Nintendo 64 emulators.
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-;*/
-;
-;****************************************************************
-;
-; Glide64 - Glide Plugin for Nintendo 64 emulators
-; Project started on December 29th, 2001
-;
-; Authors:
-; Dave2001, original author, founded the project in 2001, left it in 2002
-; Gugaman, joined the project in 2002, left it in 2002
-; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-;
-;****************************************************************
-;
-; To modify Glide64:
-; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-;
-;****************************************************************
-
-%include "inc/c32.mac"
-
-segment .text
-
-;-----------------------------------------------------------------------
-; DetectSIMD(func, iedx, iecx)
-; Executes CPUID with eax = func and stores the resulting edx and ecx
-; through the pointers iedx and iecx.
-; In:    three stack arguments, read relative to ebp via the proc/arg macros
-; Out:   [iedx] = edx, [iecx] = ecx as returned by CPUID
-; Clobb: eax, ecx, edx (ebx is saved and restored below)
-; Note:  ecx is not initialised before CPUID, so leaves that take a
-;        sub-leaf index in ecx are not reliably supported.
-;-----------------------------------------------------------------------
-proc DetectSIMD
-      %$func    arg 
-      %$iedx    arg 
-      %$iecx    arg 
-      push      ebx                  ; CPUID overwrites ebx, which the 32-bit C ABI treats as callee-saved
-      mov       eax,[ebp + %$func]
-      cpuid
-      mov       eax,[ebp + %$iedx]
-      mov       [eax],edx
-      mov       eax,[ebp + %$iecx]
-      mov       [eax],ecx
-      pop       ebx                  ; balanced with the push above, so the macro epilogue sees the same esp
-endproc ;DetectSIMD
-
-;****************************************************************
-;
-;                     ******** SSE ********
-;
-;****************************************************************
-
-proc TransformVectorSSE ; dst[j] = src[0]*mat[0][j] + src[1]*mat[1][j] + src[2]*mat[2][j] for j = 0..2
-CPU P3                  ; restrict the assembler to the Pentium III (SSE) instruction set
-      %$src     arg           ; float *src     
-      %$dst     arg           ; float *dst     
-      %$mat     arg           ; float mat[4][4]
-      ; Register roles: ecx = src, eax = dst, edx = mat; xmm0..xmm5 are used as scratch.
-      mov       ecx,[ebp + %$src]
-      mov       eax,[ebp + %$dst]
-      mov       edx,[ebp + %$mat]
-      ; Lane 1 of every matrix register stays 0: movss fills lane 0 (zeroing the rest), movhps fills lanes 2-3.
-      movss     xmm0,[ecx]    ; 0 0 0 src[0]
-      movss     xmm5,[edx]    ; 0 0 0 mat[0][0]
-      movhps    xmm5,[edx+4]  ; mat[0][2] mat[0][1] 0 mat[0][0]
-      shufps    xmm0,xmm0, 0  ; src[0] src[0] src[0] src[0]
-      movss     xmm1,[ecx+4]  ; 0 0 0 src[1]
-      movss     xmm3,[edx+16] ; 0 0 0 mat[1][0]
-      movhps    xmm3,[edx+20] ; mat[1][2] mat[1][1] 0 mat[1][0]
-      shufps    xmm1,xmm1, 0  ; src[1] src[1] src[1] src[1]
-      mulps     xmm0,xmm5     ; mat[0][2]*src[0] mat[0][1]*src[0] 0 mat[0][0]*src[0]
-      mulps     xmm1,xmm3     ; mat[1][2]*src[1] mat[1][1]*src[1] 0 mat[1][0]*src[1]
-      movss     xmm2,[ecx+8]  ; 0 0 0 src[2]
-      shufps    xmm2,xmm2, 0  ; src[2] src[2] src[2] src[2]
-      movss     xmm4,[edx+32] ; 0 0 0 mat[2][0]
-      movhps    xmm4,[edx+36] ; mat[2][2] mat[2][1] 0 mat[2][0]
-      addps     xmm0,xmm1     ; mat[0][2]*src[0]+mat[1][2]*src[1] mat[0][1]*src[0]+mat[1][1]*src[1] 0 mat[0][0]*src[0]+mat[1][0]*src[1]
-      mulps     xmm2,xmm4     ; mat[2][2]*src[2] mat[2][1]*src[2] 0 mat[2][0]*src[2]
-      addps     xmm0,xmm2     ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] 0 mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-      movss     [eax],xmm0    ; mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-      movhps    [eax+4],xmm0  ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2]
-      ; Column mat[i][3] and row mat[3] are never read, and only dst[0..2] are written.
-endproc ;TransformVectorSSE
-
-proc MulMatricesSSE ; r = m1 * m2 for row-major float[4][4]: r[i][j] = sum over k of m1[i][k]*m2[k][j]
-CPU P3              ; restrict the assembler to the Pentium III (SSE) instruction set
-      %$m1      arg  ; float m1[4][4] 
-      %$m2      arg  ; float m2[4][4] 
-      %$r       arg  ; float r[4][4]  
-      ; Every load and store below is movaps, so m1, m2 and r must all be 16-byte aligned.
-      mov       eax,[ebp + %$r]      
-      mov       ecx,[ebp + %$m1]
-      mov       edx,[ebp + %$m2]
-      ; xmm0..xmm3 = rows 0..3 of m2, loaded once and reused for all four output rows
-      movaps    xmm0,[edx]
-      movaps    xmm1,[edx+16]
-      movaps    xmm2,[edx+32]
-      movaps    xmm3,[edx+48]
-      
-      ; r[0][0],r[0][1],r[0][2],r[0][3]
-      
-      movaps    xmm4,[ecx]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      ; xmm4..xmm7 = m1[0][0], m1[0][1], m1[0][2], m1[0][3], each broadcast to all four lanes
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      ; scale row k of m2 by m1[0][k]
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      ; sum the four scaled rows into xmm4 = row 0 of r
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax],xmm4
-      
-      ; r[1][0],r[1][1],r[1][2],r[1][3]
-      ; (same broadcast / multiply / add sequence, using row 1 of m1)
-      movaps    xmm4,[ecx+16]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax+16],xmm4
-      
-      
-      ; r[2][0],r[2][1],r[2][2],r[2][3]
-      
-      movaps    xmm4,[ecx+32]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax+32],xmm4
-      
-      ; r[3][0],r[3][1],r[3][2],r[3][3]
-      
-      movaps    xmm4,[ecx+48]
-      movaps    xmm5,xmm4
-      movaps    xmm6,xmm4
-      movaps    xmm7,xmm4
-      
-      shufps    xmm4,xmm4,00000000b
-      shufps    xmm5,xmm5,01010101b
-      shufps    xmm6,xmm6,10101010b
-      shufps    xmm7,xmm7,11111111b
-      
-      mulps     xmm4,xmm0
-      mulps     xmm5,xmm1
-      mulps     xmm6,xmm2
-      mulps     xmm7,xmm3
-      
-      addps     xmm4,xmm5
-      addps     xmm4,xmm6
-      addps     xmm4,xmm7
-      
-      movaps    [eax+48],xmm4
-
-endproc ;MulMatricesSSE
-
-proc NormalizeVectorSSE ; scales the four floats at v, in place, by an approximate 1/sqrt of the sum of their squares
-CPU P3                  ; restrict the assembler to the Pentium III (SSE) instruction set
-      %$v arg           ; float *v, read and written with movaps (must be 16-byte aligned)
-      ; The lane comments below (lowest lane first) assume v[3] == 0; otherwise v[3]*v[3] is part of the sum as well.
-      mov edx, [ebp + %$v]
-      movaps xmm0, [edx]      ; x y z 0
-      movaps xmm2, xmm0       ; x y z 0
-      mulps  xmm0, xmm0       ; x*x y*y z*z 0
-      movaps xmm1, xmm0       ; x*x y*y z*z 0
-      shufps xmm0, xmm1, 0x4e ; z*z 0 x*x y*y
-      addps  xmm0, xmm1       ; x*x+z*z y*y z*z+x*x y*y
-      movaps xmm1, xmm0       ; x*x+z*z y*y z*z+x*x y*y
-      shufps xmm1, xmm1, 0x11 ; y*y z*z+x*x y*y z*z+x*x
-      addps  xmm0, xmm1       ; x*x+z*z+y*y
-      rsqrtps xmm0, xmm0      ; 1.0/sqrt(x*x+z*z+y*y)
-      mulps  xmm2, xmm0       ; x/sqrt(x*x+z*z+y*y) y/sqrt(x*x+z*z+y*y) z/sqrt(x*x+z*z+y*y) 0
-      movaps [edx], xmm2
-      ; NOTE(review): rsqrtps is a low-precision approximation, and an all-zero input gives rsqrt(0) = +inf, so 0*inf = NaN is stored; there is no guard here.
-endproc ;NormalizeVectorSSE
-
-;****************************************************************
-;
-;                     ******** SSE3 ********
-;
-;****************************************************************
-
-proc DotProductSSE3 ; leaves v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3] in every lane of xmm0
-CPU PRESCOTT        ; haddps is an SSE3 instruction, hence the Prescott instruction set
-      %$v1 arg
-      %$v2 arg
-      ; Both vectors are accessed as aligned 16-byte operands (movaps, and mulps with a memory source).
-      mov eax,[ebp + %$v1]
-      mov edx,[ebp + %$v2]
-      movaps xmm0, [eax]
-      mulps xmm0, [edx]       ; element-wise products of the four float pairs
-      haddps xmm0, xmm0       ; pairwise sums
-      haddps xmm0, xmm0       ; every lane now holds the sum of all four products
-;      movss eax, xmm0
-      ; NOTE(review): the store above is commented out, so the sum stays in xmm0 and eax still holds the v1 pointer at endproc - confirm how callers are meant to receive the result.
-endproc ;DotProductSSE3
-
-;****************************************************************
-;
-;                     ******** 3DNOW ********
-;
-;****************************************************************
-
-proc TransformVector3DNOW ; dst[j] = src[0]*mat[0][j] + src[1]*mat[1][j] + src[2]*mat[2][j] for j = 0..2
-CPU 586
-      %$src       arg           ; float *src     
-      %$dst       arg           ; float *dst     
-      %$mat       arg           ; float mat[4][4]
-      ; Register roles: ecx = src, eax = dst, edx = mat. Lane comments list the high half first, then the low half.
-    femms                       ; fast EMMS: reset the shared MMX/x87 state before using the mm registers
-      mov         ecx,[ebp + %$src]
-      mov         eax,[ebp + %$dst]
-      mov         edx,[ebp + %$mat]
-      movq        mm0,[ecx]     ; src[1] src[0]
-      movd        mm2,[ecx+8]   ; 0 src[2]
-      movq        mm1,mm0       ; src[1] src[0]
-      punpckldq   mm0,mm0       ; src[0] src[0]
-      punpckhdq   mm1,mm1       ; src[1] src[1]
-      punpckldq   mm2,mm2       ; src[2] src[2]
-      movq        mm3,mm0       ; src[0] src[0]
-      movq        mm4,mm1       ; src[1] src[1]
-      movq        mm5,mm2       ; src[2] src[2]
-      pfmul       mm0,[edx]     ; src[0]*mat[0][1] src[0]*mat[0][0]
-      pfmul       mm3,[edx+8]   ; src[0]*mat[0][3] src[0]*mat[0][2]  (high half is never stored)
-      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[1]*mat[1][0]
-      pfmul       mm4,[edx+24]  ; src[1]*mat[1][3] src[1]*mat[1][2]  (high half is never stored)
-      pfmul       mm2,[edx+32]  ; src[2]*mat[2][1] src[2]*mat[2][0]
-      pfmul       mm5,[edx+40]  ; src[2]*mat[2][3] src[2]*mat[2][2]  (high half is never stored)
-      pfadd       mm0,mm1       ; src[0]*mat[0][1]+src[1]*mat[1][1] src[0]*mat[0][0]+src[1]*mat[1][0]
-      pfadd       mm3,mm4       ; (unused) src[0]*mat[0][2]+src[1]*mat[1][2]
-      pfadd       mm0,mm2       ; src[0]*mat[0][1]+src[1]*mat[1][1]+src[2]*mat[2][1] src[0]*mat[0][0]+src[1]*mat[1][0]+src[2]*mat[2][0]
-      pfadd       mm3,mm5       ; (unused) src[0]*mat[0][2]+src[1]*mat[1][2]+src[2]*mat[2][2]
-      movq        [eax],mm0     ; mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
-      movd        [eax+8],mm3   ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2]
-      femms                     ; reset the state again so x87 code can follow
-
-endproc ;TransformVector3DNOW
-
-proc InverseTransformVector3DNOW ; dst[i] = mat[i][0]*src[0] + mat[i][1]*src[1] + mat[i][2]*src[2] for i = 0..2
-CPU 586
-      %$src       arg           ; float *src       
-      %$dst       arg           ; float *dst       
-      %$mat       arg           ; float mat[4][4]  
-      ; Register roles: ecx = src, eax = dst, edx = mat. Lane comments list the high half first, then the low half.
-    femms                       ; fast EMMS: reset the shared MMX/x87 state before using the mm registers
-      mov         ecx,[ebp + %$src]
-      mov         eax,[ebp + %$dst]
-      mov         edx,[ebp + %$mat]
-      movq        mm0,[ecx]     ; src[1] src[0]
-      movd        mm4,[ecx+8]   ; 0 src[2]
-      movq        mm1,mm0       ; src[1] src[0]
-      pfmul       mm0,[edx]     ; src[1]*mat[0][1] src[0]*mat[0][0]
-      movq        mm5,mm4       ; 0 src[2]
-      pfmul       mm4,[edx+8]   ; 0 src[2]*mat[0][2]
-      movq        mm2,mm1       ; src[1] src[0]
-      pfmul       mm1,[edx+16]  ; src[1]*mat[1][1] src[0]*mat[1][0]
-      movq        mm6,mm5       ; 0 src[2]
-      pfmul       mm5,[edx+24]  ; 0 src[2]*mat[1][2]
-      movq        mm3,mm2       ; src[1] src[0]
-      pfmul       mm2,[edx+32]  ; src[1]*mat[2][1] src[0]*mat[2][0]
-      movq        mm7,mm6       ; 0 src[2]  (mm7 is never read afterwards)
-      pfmul       mm6,[edx+40]  ; 0 src[2]*mat[2][2]
-      pfacc       mm0,mm4       ; src[2]*mat[0][2] src[1]*mat[0][1]+src[0]*mat[0][0]
-      pfacc       mm1,mm5       ; src[2]*mat[1][2] src[1]*mat[1][1]+src[0]*mat[1][0]
-      pfacc       mm2,mm6       ; src[2]*mat[2][2] src[1]*mat[2][1]+src[0]*mat[2][0]
-      pfacc       mm0,mm1       ; src[2]*mat[1][2]+src[1]*mat[1][1]+src[0]*mat[1][0] src[2]*mat[0][2]+src[1]*mat[0][1]+src[0]*mat[0][0]
-      pfacc       mm2,mm3       ; (unused: src[1]+src[0]) src[2]*mat[2][2]+src[1]*mat[2][1]+src[0]*mat[2][0]
-      movq        [eax],mm0     ; mat[1][0]*src[0]+mat[1][1]*src[1]+mat[1][2]*src[2] mat[0][0]*src[0]+mat[0][1]*src[1]+mat[0][2]*src[2]
-      movd        [eax+8],mm2   ; mat[2][0]*src[0]+mat[2][1]*src[1]+mat[2][2]*src[2]
-      femms                    
-
-endproc ;InverseTransformVector3DNOW
-
-proc MulMatrices3DNOW ; r = m1 * m2 for row-major float[4][4]: r[i][j] = sum over k of m1[i][k]*m2[k][j]
-CPU 586
-      %$m1        arg    ; float m1[4][4] 
-      %$m2        arg    ; float m2[4][4] 
-      %$r         arg    ; float r[4][4]  
-      ; Register roles: ecx = m1, edx = m2, eax = r. Each of the four sections ends with mm7 = r[i][0..1] and mm3 = r[i][2..3].
-    femms                ; fast EMMS: reset the shared MMX/x87 state before using the mm registers
-      mov         ecx,[ebp + %$m1]
-      mov         eax,[ebp + %$r]
-      mov         edx,[ebp + %$m2]
-      ; ---- row 0 of r ----
-      movq        mm0,[ecx]          ; m1[0][1] m1[0][0]
-      movq        mm1,[ecx+8]        ; m1[0][3] m1[0][2]
-      movq        mm4,[edx]          ; m2[0][1] m2[0][0]
-      punpckhdq   mm2,mm0            ; high half = m1[0][1]; the low half is stale until the self-unpack below
-      movq        mm5,[edx+16]       ; m2[1][1] m2[1][0]
-      punpckhdq   mm3,mm1            ; high half = m1[0][3]; the low half is stale until the self-unpack below
-      movq        mm6,[edx+32]       ; m2[2][1] m2[2][0]
-      punpckldq   mm0,mm0            ; m1[0][0] in both halves
-      punpckldq   mm1,mm1            ; m1[0][2] in both halves
-      pfmul       mm4,mm0            ; m1[0][0] * m2[0][0..1]
-      punpckhdq   mm2,mm2            ; m1[0][1] in both halves
-      pfmul       mm0,[edx+8]        ; m1[0][0] * m2[0][2..3]
-      movq        mm7,[edx+48]       ; m2[3][1] m2[3][0]
-      pfmul       mm5,mm2            ; m1[0][1] * m2[1][0..1]
-      punpckhdq   mm3,mm3            ; m1[0][3] in both halves
-      pfmul       mm2,[edx+24]       ; m1[0][1] * m2[1][2..3]
-      pfmul       mm6,mm1            ; m1[0][2] * m2[2][0..1]
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]       ; m1[0][2] * m2[2][2..3]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3            ; m1[0][3] * m2[3][0..1]
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]       ; m1[0][3] * m2[3][2..3]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6            ; mm7 = r[0][1] r[0][0]
-      movq        mm0,[ecx+16]       ; start loading row 1 of m1 before row 0 is stored
-      pfadd       mm3,mm2            ; mm3 = r[0][3] r[0][2]
-      movq        mm1,[ecx+24]
-      movq        [eax],mm7
-      movq        mm4,[edx]
-      movq        [eax+8],mm3
-      ; ---- row 1 of r: mm0/mm1 = row 1 of m1 and mm4 = m2[0][0..1] were loaded above ----
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      movq        mm0,[ecx+32]
-      pfadd       mm3,mm2
-      movq        mm1,[ecx+40]
-      movq        [eax+16],mm7
-      movq        mm4,[edx]
-      movq        [eax+24],mm3
-      ; ---- row 2 of r: mm0/mm1 = row 2 of m1 and mm4 = m2[0][0..1] were loaded above ----
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      movq        mm0,[ecx+48]
-      pfadd       mm3,mm2
-      movq        mm1,[ecx+56]
-      movq        [eax+32],mm7
-      movq        mm4,[edx]
-      movq        [eax+40],mm3
-      ; ---- row 3 of r: mm0/mm1 = row 3 of m1 and mm4 = m2[0][0..1] were loaded above ----
-      punpckhdq   mm2,mm0
-      movq        mm5,[edx+16]
-      punpckhdq   mm3,mm1
-      movq        mm6,[edx+32]
-      punpckldq   mm0,mm0
-      punpckldq   mm1,mm1
-      pfmul       mm4,mm0
-      punpckhdq   mm2,mm2
-      pfmul       mm0,[edx+8]
-      movq        mm7,[edx+48]
-      pfmul       mm5,mm2
-      punpckhdq   mm3,mm3
-      pfmul       mm2,[edx+24]
-      pfmul       mm6,mm1
-      pfadd       mm5,mm4
-      pfmul       mm1,[edx+40]
-      pfadd       mm2,mm0
-      pfmul       mm7,mm3
-      pfadd       mm6,mm5
-      pfmul       mm3,[edx+56]
-      pfadd       mm2,mm1
-      pfadd       mm7,mm6
-      pfadd       mm3,mm2
-      movq        [eax+48],mm7
-      movq        [eax+56],mm3
-      femms                          ; reset the state again so x87 code can follow
-
-endproc ;MulMatrices3DNOW
-
-proc DotProduct3DNOW ; eax = bit pattern of the float v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2]
-CPU 586
-      %$v1        arg
-      %$v2        arg
-      ; Lane comments list the high half first, then the low half.
-      femms                   ; fast EMMS: reset the shared MMX/x87 state before using the mm registers
-      mov         edx,[ebp + %$v1]
-      mov         eax,[ebp + %$v2]
-      movq        mm0,[edx]   ; v1[1] v1[0]
-      movq        mm3,[eax]   ; v2[1] v2[0]
-      pfmul       mm0,mm3     ; v1[1]*v2[1] v1[0]*v2[0]
-      movq        mm2,[edx+8] ; v1[3] v1[2]
-      movq        mm1,[eax+8] ; v2[3] v2[2]
-      pfacc       mm0,mm0     ; both halves = v1[0]*v2[0] + v1[1]*v2[1]
-      pfmul       mm1,mm2     ; v1[3]*v2[3] v1[2]*v2[2]
-      pfadd       mm0,mm1     ; low half = three-component dot product; the high half also includes v1[3]*v2[3]
-      movd        eax,mm0     ; only the low half is copied out, as raw float bits
-      femms
-      ; NOTE(review): the result is left in eax as a bit pattern rather than on the x87 stack - confirm this matches the prototype callers use.
-endproc ;DotProduct3DNOW
-
-proc NormalizeVector3DNOW ; scales the four floats at v, in place, by an approximate 1/sqrt(v[0]^2 + v[1]^2 + v[2]^2)
-CPU 586
-      %$v          arg
-      ; Lane comments list the high half first, then the low half.
-      femms                    ; fast EMMS: reset the shared MMX/x87 state before using the mm registers
-      mov          edx,[ebp + %$v]
-      movq         mm0,[edx]   ; v[1] v[0]
-      movq         mm3,[edx+8] ; v[3] v[2]
-      movq         mm1,mm0     ; keep the unsquared v[1] v[0]
-      movq         mm2,mm3     ; keep the unsquared v[3] v[2]
-      pfmul        mm0,mm0     ; v[1]^2 v[0]^2
-      pfmul        mm3,mm3     ; v[3]^2 v[2]^2
-      pfacc        mm0,mm0     ; both halves = v[0]^2 + v[1]^2
-      pfadd        mm0,mm3     ; low half = v[0]^2 + v[1]^2 + v[2]^2 (the high half adds v[3]^2 instead)
-      ;movq mm4,mm0 ; prepare for 24bit precision
-      ;punpckldq mm4,mm4 ; prepare for 24bit precision
-      pfrsqrt      mm0,mm0 ; 15bit precision 1/sqrtf(v)
-      ;movq mm3,mm0
-      ;pfmul mm0,mm0
-      ;pfrsqit1 mm0,mm4
-      ;pfrcpit2 mm0,mm3 ; 24bit precision 1/sqrtf(v)
-      pfmul        mm1,mm0     ; scale v[0], v[1]
-      pfmul        mm2,mm0     ; scale v[2], v[3]
-      movq         [edx],mm1
-      movq         [edx+8],mm2
-      femms
-      ; The commented-out refinement steps are disabled, so only the low-precision pfrsqrt estimate is applied; v[3] is rescaled and written back too.
-endproc ;NormalizeVector3DNOW

src/Glide64/FixedPoint.asm

-;/*
-;* Glide64 - Glide video plugin for Nintendo 64 emulators.
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-;*/
-;
-;****************************************************************
-;
-; Glide64 - Glide Plugin for Nintendo 64 emulators
-; Project started on December 29th, 2001
-;
-; Authors:
-; Dave2001, original author, founded the project in 2001, left it in 2002
-; Gugaman, joined the project in 2002, left it in 2002
-; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-;
-;****************************************************************
-;
-; To modify Glide64:
-; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-;
-;****************************************************************
-
-%include "inc/c32.mac"
-
-segment .text
-
-; (x * y) >> 16   - signed multiply through a 64-bit intermediate; the low 32 bits of the shifted product are returned in eax
-proc imul16 
-CPU 586
-
-  %$x   arg 
-  %$y   arg 
-  mov   eax, [ebp + %$x]
-  mov   edx, [ebp + %$y]
-  imul  edx        ; edx:eax = signed 64-bit product x*y
-  shrd  eax,edx,16 ; eax = bits 16..47 of the product
-
-endproc ;imul16 
-
-;(x * y) >> 14   - signed multiply through a 64-bit intermediate; the low 32 bits of the shifted product are returned in eax
-proc imul14 
-CPU 586
-
-  %$x   arg 
-  %$y   arg 
-  mov   eax, [ebp + %$x]
-  mov   edx, [ebp + %$y]
-  imul  edx        ; edx:eax = signed 64-bit product x*y
-  shrd  eax,edx,14 ; eax = bits 14..45 of the product
-
-endproc ;imul14
-
-;(x << 16) / y
-;
-; Signed divide of x, widened to 64 bits and shifted left by 16, by y.
-; In:    two stack arguments x and y, read relative to ebp via the proc/arg macros
-; Out:   eax = quotient
-; Clobb: eax, ecx, edx, flags (ebx is no longer touched)
-; Note:  idiv raises a divide error when y == 0 or when the quotient does
-;        not fit in 32 bits; neither case is checked here.
-proc idiv16
-CPU 586
-
-  %$x   arg 
-  %$y   arg 
-  mov   eax, [ebp + %$x]
-  mov   ecx, [ebp + %$y]   ; divisor goes in caller-saved ecx so callee-saved ebx is left intact
-  mov   edx,eax   
-  sar   edx,16             ; edx = upper 32 bits of the sign-extended (x << 16)
-  shl   eax,16             ; eax = lower 32 bits of (x << 16)
-  idiv  ecx                ; eax = edx:eax / y
-
-endproc ;idiv16

src/Glide64/MiClWr16b.h

 //
 //****************************************************************
 
-extern "C" void asmMirror16bS(uint8_t *tex, uint8_t *start, int width, int height, int mask, int line, int full, int count);
-extern "C" void asmWrap16bS(uint8_t *tex, uint8_t *start, int height, int mask, int line, int full, int count);
-extern "C" void asmClamp16bS(uint8_t *tex, uint8_t *constant, int height, int line, int full, int count);
-
 static inline void mirror16bS(uint8_t *tex, uint8_t *start, int width, int height, int mask, int line, int full, int count)
 {
   uint16_t *v8;
   int line = line_full - (count << 1);
   if (line < 0) return;
   unsigned char *start = tex + (mask_width << 1);
-#ifdef OLDASM_asmMirror16bS
-  asmMirror16bS (tex, start, mask_width, height, mask_mask, line, line_full, count);
-#else
   mirror16bS (tex, start, mask_width, height, mask_mask, line, line_full, count);
-#endif
 }
 
 //****************************************************************
   int line = line_full - (count << 2);
   if (line < 0) return;
   unsigned char * start = tex + (mask_width << 1);
-#ifdef OLDASM_asmWrap16bS
-  asmWrap16bS (tex, start, height, mask_mask, line, line_full, count);
-#else
   wrap16bS (tex, start, height, mask_mask, line, line_full, count);
-#endif
 }
 
 //****************************************************************
   int line_full = real_width << 1;
   int line = width << 1;
 
-#ifdef OLDASM_asmClamp16bS
-  asmClamp16bS (dest, constant, real_height, line, line_full, count);
-#else
   clamp16bS (dest, constant, real_height, line, line_full, count);
-#endif
 }
 
 //****************************************************************

src/Glide64/MiClWr32b.h

 //
 //****************************************************************
 
-extern "C" void asmMirror32bS(uint8_t *tex, uint8_t *start, int width, int height, int mask, int line, int full, int count);
-extern "C" void asmWrap32bS(uint8_t *tex, uint8_t *start, int height, int mask, int line, int full, int count);
-extern "C" void asmClamp32bS(uint8_t *tex, uint8_t *constant, int height, int line, int full, int count);
-
 static inline void mirror32bS(uint8_t *tex, uint8_t *start, int width, int height, int mask, int line, int full, int count)
 {
   uint32_t *v8;
 	int line = line_full - (count << 2);
 	if (line < 0) return;
 	unsigned char * start = tex + (mask_width << 2);
-#ifdef OLDASM_asmMirror32bS
-	asmMirror32bS (tex, start, mask_width, height, mask_mask, line, line_full, count);
-#else
 	mirror32bS (tex, start, mask_width, height, mask_mask, line, line_full, count);
-#endif
 }
 
 //****************************************************************
 	int line = line_full - (count << 2);
 	if (line < 0) return;
 	unsigned char * start = tex + (mask_width << 2);
-#ifdef OLDASM_asmWrap32bS
-	asmWrap32bS (tex, start, height, mask_mask, line, line_full, count);
-#else
 	wrap32bS (tex, start, height, mask_mask, line, line_full, count);
-#endif
 }
 
 //****************************************************************
 	
 	int line_full = real_width << 2;
 	int line = width << 2;
-#ifdef OLDASM_asmClamp32bS
-	asmClamp32bS (dest, constant, real_height, line, line_full, count);
-#else
 	clamp32bS (dest, constant, real_height, line, line_full, count);
-#endif
 }
 
 //****************************************************************

src/Glide64/MiClWr8b.h

 //****************************************************************
 // 8-bit Horizontal Mirror
 
-extern "C" void asmMirror8bS(uint8_t *tex, uint8_t *start, int width, int height, int mask, int line, int full, int count);
-extern "C" void asmWrap8bS(uint8_t *tex, uint8_t *start, int height, int mask, int line, int full, int count);
-extern "C" void asmClamp8bS(uint8_t *tex, uint8_t *constant, int height, int line, int full, int count);
-
 static inline void mirror8bS(uint8_t *tex, uint8_t *start, int width, int height, int mask, int line, int full, int count)
 {
   uint8_t *v8;
   int line = line_full - (count);
   if (line < 0) return;
   unsigned char * start = tex + (mask_width);
-#ifdef OLDASM_asmMirror8bS
-  asmMirror8bS (tex, start, mask_width, height, mask_mask, line, line_full, count);
-#else
   mirror8bS (tex, start, mask_width, height, mask_mask, line, line_full, count);
-#endif
 }
 
 //****************************************************************
   int line = line_full - (count << 2);
   if (line < 0) return;
   unsigned char * start = tex + (mask_width);
-#ifdef OLDASM_asmWrap8bS
-  asmWrap8bS (tex, start, height, mask_mask, line, line_full, count);
-#else
   wrap8bS (tex, start, height, mask_mask, line, line_full, count);
-#endif
 }
 
 //****************************************************************
 
   int line_full = real_width;
   int line = width;
-#ifdef OLDASM_asmClamp8bS
-  asmClamp8bS (dest, constant, real_height, line, line_full, count);
-#else
   clamp8bS (dest, constant, real_height, line, line_full, count);
-#endif
 }
 
 //****************************************************************

src/Glide64/TexCache.cpp

 }
 
 //****************************************************************
-extern "C" int asmTextureCRC(uint8_t *addr, int width, int height, int line);
-
 uint32_t textureCRC(uint8_t *addr, int width, int height, int line)
 {
-#ifdef OLDASM_asmTextureCRC
-  return asmTextureCRC(addr, width, height, line);
-#else
   uint32_t crc = 0;
   uint32_t *pixelpos;
   unsigned int i;
   }
 
   return crc;
-#endif
 }
 // GetTexInfo - gets information for either t0 or t1, checks if in cache & fills tex_found
 

src/Glide64/TexConv.h

 //
 //****************************************************************
 
-extern "C" void asmTexConv_ARGB1555_ARGB4444(uint8_t *src, uint8_t *dst, int size);
-extern "C" void asmTexConv_AI88_ARGB4444(uint8_t *src, uint8_t *dst, int size);
-extern "C" void asmTexConv_AI44_ARGB4444(uint8_t *src, uint8_t *dst, int size);
-extern "C" void asmTexConv_A8_ARGB4444(uint8_t *src, uint8_t *dst, int size);
-
-
 static inline void texConv_ARGB1555_ARGB4444(uint8_t *src, uint8_t *dst, int size)
 {
   uint32_t *v3;
   int size = (width * height) >> 1;	// Hiroshi Morii <koolsmoky@users.sourceforge.net>
   // 2 pixels are converted in one loop
   // NOTE: width * height must be a multiple of 2
-#ifdef OLDASM_asmTexConv_ARGB1555_ARGB4444
-  asmTexConv_ARGB1555_ARGB4444(src, dst, size);
-#else
   texConv_ARGB1555_ARGB4444(src, dst, size);
-#endif
 }
 
 void TexConv_AI88_ARGB4444 (unsigned char * src, unsigned char * dst, int width, int height)
   int size = (width * height) >> 1;	// Hiroshi Morii <koolsmoky@users.sourceforge.net>
   // 2 pixels are converted in one loop
   // NOTE: width * height must be a multiple of 2
-#ifdef OLDASM_asmTexConv_AI88_ARGB4444
-  asmTexConv_AI88_ARGB4444(src, dst, size);
-#else
   texConv_AI88_ARGB4444(src, dst, size);
-#endif
 }
 
 void TexConv_AI44_ARGB4444 (unsigned char * src, unsigned char * dst, int width, int height)
   int size = (width * height) >> 2;	// Hiroshi Morii <koolsmoky@users.sourceforge.net>
   // 4 pixels are converted in one loop
   // NOTE: width * height must be a multiple of 4
-#ifdef OLDASM_asmTexConv_AI44_ARGB4444
-  asmTexConv_AI44_ARGB4444(src, dst, size);
-#else
   texConv_AI44_ARGB4444(src, dst, size);
-#endif
 }
 
 void TexConv_A8_ARGB4444 (unsigned char * src, unsigned char * dst, int width, int height)
   int size = (width * height) >> 2;	// Hiroshi Morii <koolsmoky@users.sourceforge.net>
   // 4 pixels are converted in one loop
   // NOTE: width * height must be a multiple of 4
-#ifdef OLDASM_asmTexConv_A8_ARGB4444
-  asmTexConv_A8_ARGB4444(src, dst, size);
-#else
   texConv_A8_ARGB4444(src, dst, size);
-#endif
 }
 

src/Glide64/TexLoad16b.h

 //
 //****************************************************************
 
-extern "C" void asmLoad16bRGBA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
-extern "C" void asmLoad16bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
-
 static inline void load16bRGBA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext)
 {
   uint32_t *v6;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 2)) << 1;
 
-#ifdef OLDASM_asmLoad16bRGBA
-  asmLoad16bRGBA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
   load16bRGBA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
 
   return (1 << 16) | GR_TEXFMT_ARGB_1555;
 }
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 2)) << 1;
 
-#ifdef OLDASM_asmLoad16bIA
-  asmLoad16bIA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
   load16bIA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
 
   return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
 }

src/Glide64/TexLoad4b.h

 
 #include <stdint.h>
 
-extern "C" void asmLoad4bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, uint16_t line, int ext, uint16_t *pal);
-extern "C" void asmLoad4bIAPal(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal);
-extern "C" void asmLoad4bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
-extern "C" void asmLoad4bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
-
 static inline void load4bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, uint16_t line, int ext, uint16_t *pal)
 {
   uint8_t *v7;
   {
     //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference.
     //Thanks to angrylion for the advice
-#ifdef OLDASM_asmLoad4bI
-    asmLoad4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
     load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
     return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
   }
 
   wxUIntPtr pal = wxPtrToUInt(rdp.pal_8 + (rdp.tiles[tile].palette << 4));
   if (rdp.tlut_mode == 2)
   {
-#ifdef OLDASM_asmLoad4bCI
-    asmLoad4bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
-#else
     load4bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
-#endif
     
     return (1 << 16) | GR_TEXFMT_ARGB_1555;
   }
 
-#ifdef OLDASM_asmLoad4bIAPal
-    asmLoad4bIAPal ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
-#else
     load4bIAPal ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
-#endif
   return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
 }
 
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 4));
-#ifdef OLDASM_asmLoad4bIA
-  asmLoad4bIA ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
   load4bIA ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
   return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
 }
 
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 4));
-#ifdef OLDASM_asmLoad4bI
-  asmLoad4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
   load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
   
   return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
 }

src/Glide64/TexLoad8b.h

 //****************************************************************
 #include <stdint.h>
 
-extern "C" void asmLoad8bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal);
-extern "C" void asmLoad8bIA8(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal);
-extern "C" void asmLoad8bIA4(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
-extern "C" void asmLoad8bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
-
 static inline void load8bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal)
 {
   uint8_t *v7;
     case 0: //palette is not used
       //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference.
       //Thanks to angrylion for the advice
-#ifdef OLDASM_asmLoad8bI
-      asmLoad8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
       load8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
       return /*(0 << 16) | */GR_TEXFMT_ALPHA_8;
     case 2: //color palette
       ext <<= 1;
-#ifdef OLDASM_asmLoad8bCI
-      asmLoad8bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal);
-#else
       load8bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal);
-#endif
       return (1 << 16) | GR_TEXFMT_ARGB_1555;
     default: //IA palette
       ext <<= 1;
-#ifdef OLDASM_asmLoad8bIA8
-      asmLoad8bIA8 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal);
-#else
       load8bIA8 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal);
-#endif
       return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
   }
 }
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 3));
-#ifdef OLDASM_asmLoad8bIA4
-  asmLoad8bIA4 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
   load8bIA4 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
   return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
 } 
 
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 3));
-#ifdef OLDASM_asmLoad8bI
-  asmLoad8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#else
   load8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
-#endif
   return /*(0 << 16) | */GR_TEXFMT_ALPHA_8;
 }
 

src/Glide64/Texture.asm

-;/*
-;* Glide64 - Glide video plugin for Nintendo 64 emulators.
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-;*/
-;
-;****************************************************************
-;
-; Glide64 - Glide Plugin for Nintendo 64 emulators
-; Project started on December 29th, 2001
-;
-; Authors:
-; Dave2001, original author, founded the project in 2001, left it in 2002
-; Gugaman, joined the project in 2002, left it in 2002
-; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
-; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
-;
-;****************************************************************
-;
-; To modify Glide64:
-; * Write your name and (optional) email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
-; * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
-;
-;****************************************************************
-
-%include "inc/c32.mac"
-
-segment .text
-
-
-;****************************************************************
-;
-;               ******** Textures load ********
-;
-;****************************************************************
-
-
-;****************************************************************
-;4b textures load
-;****************************************************************
-
-
-;****************************************************************
-; Size: 0, Format: 2
-; 2009 ported to NASM - Sergey (Gonetz) Lipski
-
-proc asmLoad4bCI                ; 4-bit CI texture load: every source nibble selects a 16-bit entry from pal
-CPU 586                         ; NASM directive limiting the accepted instruction set
-        %$src     arg           ; source base address; also the origin for the 0x7FF offset wrap
-        %$dst     arg           ; destination; receives one 16-bit value per source nibble
-        %$wid_64  arg           ; 8-byte source groups per row; must be >= 1 (counter is tested after dec)
-        %$height  arg           ; number of rows; must be >= 1 (counter is tested after dec)
-        %$line    arg           ; added to the source offset between rows
-        %$ext     arg           ; added to the destination pointer between rows
-        %$pal     arg           ; base of the 16-bit lookup table
-ci4:                            ; NOTE(review): the ror-by-1 below presumably turns RGBA5551 entries into ARGB1555 - confirm against the C caller
-        push ebx                ; ebx, esi and edi are restored at .end_y_loop
-        push esi
-        push edi
-
-        mov ebx,[ebp + %$pal]   ; ebx = lookup table base
-        mov esi,[ebp + %$src]   ; esi = running source pointer
-        mov edi,[ebp + %$dst]   ; edi = running destination pointer
-        mov ecx,[ebp + %$height] ; ecx = rows remaining
-.y_loop:                        ; each pass converts one row with .x_loop and, if rows remain, a second with .x_loop_2
-        push ecx                ; park the row counter; ecx is reused below
-        mov ecx,[ebp + %$wid_64]
-.x_loop:                        ; one pass = 8 source bytes in, 32 destination bytes out
-        push ecx                ; park the group counter; cx becomes the pixel scratch register
-
-        mov eax,[esi]           ; read all 8 pixels (eight 4-bit indices)
-        bswap eax               ; reverse byte order so the first byte in memory lands in bits 31-24
-        add esi,4
-        mov edx,eax             ; edx keeps the whole dword while eax is used for extraction
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,1                ; rotate the 16-bit entry right by one bit (bit 0 moves to bit 15)
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; * same conversion repeated for the second dword of the 8-byte group
-        mov eax,[esi]           ; read all 8 pixels (eight 4-bit indices)
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,1                ; rotate the 16-bit entry right by one bit (bit 0 moves to bit 15)
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-        ; * end of the repeated conversion
-
-        pop ecx                 ; group counter back from the stack
-
-        dec ecx
-        jnz .x_loop
-
-        pop ecx                 ; row counter back from the stack
-        dec ecx
-        jz near .end_y_loop     ; that was the last row; nothing is left on the stack but the saved registers
-        push ecx
-
-        mov eax,esi             ; advance the source by line, keeping its offset from src within 0..0x7FF
-        add eax,[ebp + %$line]
-        mov esi,[ebp + %$src]
-        sub eax,esi             ; eax = byte offset from the source base
-        and eax,0x7FF           ; wrap the offset
-        add esi,eax
-        add edi,[ebp + %$ext]
-
-        mov ecx,[ebp + %$wid_64]
- .x_loop_2:                     ; second row of the pair: the two dwords of each 8-byte group are read in swapped order
-        push ecx
-
-        mov eax,[esi+4]         ; read all 8 pixels, taking the second dword of the group first
-        bswap eax
-        mov edx,eax
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,1                ; rotate the 16-bit entry right by one bit (bit 0 moves to bit 15)
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; * same conversion for the first dword of the group, plus the wrapped pointer advance
-        mov eax,[esi]           ; read all 8 pixels from the first dword of the group
-        bswap eax
-        mov edx,esi             ; edx is only an address temporary until it is reloaded from eax below
-        add edx,8               ; step past the whole 8-byte group
-        mov esi,[ebp + %$src]
-        sub edx,esi             ; edx = byte offset from the source base
-        and edx,0x7FF           ; wrap the offset
-        add esi,edx
-        mov edx,eax             ; edx = pixel dword again
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,1                ; rotate the 16-bit entry right by one bit (bit 0 moves to bit 15)
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-        ; * end of the repeated conversion
-
-        pop ecx                 ; group counter back from the stack
-
-        dec ecx
-        jnz .x_loop_2
-
-        mov eax,esi             ; advance the source by line, keeping its offset from src within 0..0x7FF
-        add eax,[ebp + %$line]
-        mov esi,[ebp + %$src]
-        sub eax,esi             ; eax = byte offset from the source base
-        and eax,0x7FF           ; wrap the offset
-        add esi,eax
-        add edi,[ebp + %$ext]
-
-        pop ecx                 ; row counter back from the stack
-        dec ecx
-        jnz .y_loop
-
-.end_y_loop:                    ; common exit: undo the three pushes made at entry
-        pop edi
-        pop esi
-        pop ebx
-endproc ;asmLoad4bCI
-
-proc asmLoad4bIAPal             ; 4-bit palettized load: every source nibble selects a 16-bit entry from pal, stored byte-swapped
-CPU 586                         ; NASM directive limiting the accepted instruction set
-        %$src     arg           ; source base address; also the origin for the 0x7FF offset wrap
-        %$dst     arg           ; destination; receives one 16-bit value per source nibble
-        %$wid_64  arg           ; 8-byte source groups per row; must be >= 1 (counter is tested after dec)
-        %$height  arg           ; number of rows; must be >= 1 (counter is tested after dec)
-        %$line    arg           ; added to the source offset between rows
-        %$ext     arg           ; added to the destination pointer between rows
-        %$pal     arg           ; base of the 16-bit lookup table
-ia4pal:                         ; NOTE(review): the ror-by-8 below presumably reorders the intensity/alpha bytes for the 88 output format - confirm against the C caller
-        push ebx                ; ebx, esi and edi are restored at .end_y_loop
-        push esi
-        push edi
-
-        mov ebx,[ebp + %$pal]   ; ebx = lookup table base
-        mov esi,[ebp + %$src]   ; esi = running source pointer
-        mov edi,[ebp + %$dst]   ; edi = running destination pointer
-        mov ecx,[ebp + %$height] ; ecx = rows remaining
-.y_loop:                        ; each pass converts one row with .x_loop and, if rows remain, a second with .x_loop_2
-        push ecx                ; park the row counter; ecx is reused below
-        mov ecx,[ebp + %$wid_64]
-.x_loop:                        ; one pass = 8 source bytes in, 32 destination bytes out
-        push ecx                ; park the group counter; cx becomes the pixel scratch register
-
-        mov eax,[esi]           ; read all 8 pixels (eight 4-bit indices)
-        bswap eax               ; reverse byte order so the first byte in memory lands in bits 31-24
-        add esi,4
-        mov edx,eax             ; edx keeps the whole dword while eax is used for extraction
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,8                ; swap the two bytes of the 16-bit entry
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; * same conversion repeated for the second dword of the 8-byte group
-        mov eax,[esi]           ; read all 8 pixels (eight 4-bit indices)
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,8                ; swap the two bytes of the 16-bit entry
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-        ; * end of the repeated conversion
-
-        pop ecx                 ; group counter back from the stack
-
-        dec ecx
-        jnz .x_loop
-
-        pop ecx                 ; row counter back from the stack
-        dec ecx
-        jz near .end_y_loop     ; that was the last row; nothing is left on the stack but the saved registers
-        push ecx
-
-        mov eax,esi             ; advance the source by line, keeping its offset from src within 0..0x7FF
-        add eax,[ebp + %$line]
-        mov esi,[ebp + %$src]
-        sub eax,esi             ; eax = byte offset from the source base
-        and eax,0x7FF           ; wrap the offset
-        add esi,eax
-        add edi,[ebp + %$ext]
-
-        mov ecx,[ebp + %$wid_64]
-.x_loop_2:                      ; second row of the pair: the two dwords of each 8-byte group are read in swapped order
-        push ecx
-
-        mov eax,[esi+4]         ; read all 8 pixels, taking the second dword of the group first
-        bswap eax
-        mov edx,eax
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,8                ; swap the two bytes of the 16-bit entry
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; * same conversion for the first dword of the group, plus the wrapped pointer advance
-        mov eax,[esi]           ; read all 8 pixels from the first dword of the group
-        bswap eax
-        mov edx,esi             ; edx is only an address temporary until it is reloaded from eax below
-        add edx,8               ; step past the whole 8-byte group
-        mov esi,[ebp + %$src]
-        sub edx,esi             ; edx = byte offset from the source base
-        and edx,0x7FF           ; wrap the offset
-        add esi,edx
-        mov edx,eax             ; edx = pixel dword again
-
-        ; 1st dword output: nibbles from bits 31-24 {
-        shr eax,23
-        and eax,0x1E            ; (bits 27-24) * 2 = byte offset into the table
-        mov cx,[ebx+eax]
-        ror cx,8                ; swap the two bytes of the 16-bit entry
-        shl ecx,16              ; this pixel becomes the upper half of the output dword
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E            ; (bits 31-28) * 2
-        mov cx,[ebx+eax]        ; this pixel becomes the lower half
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 2nd dword output: nibbles from bits 23-16 {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E            ; (bits 19-16) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E            ; (bits 23-20) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 3rd dword output: nibbles from bits 15-8 {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E            ; (bits 11-8) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E            ; (bits 15-12) * 2, lower half
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; 4th dword output: nibbles from bits 7-0 {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E            ; (bits 3-0) * 2, upper half
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3               ; edx itself is consumed here; no nibbles remain after this one
-        and edx,0x1E            ; (bits 7-4) * 2, lower half
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-        ; * end of the repeated conversion
-
-        pop ecx                 ; group counter back from the stack
-
-        dec ecx
-        jnz .x_loop_2
-
-        mov eax,esi             ; advance the source by line, keeping its offset from src within 0..0x7FF
-        add eax,[ebp + %$line]
-        mov esi,[ebp + %$src]
-        sub eax,esi             ; eax = byte offset from the source base
-        and eax,0x7FF           ; wrap the offset
-        add esi,eax
-        add edi,[ebp + %$ext]
-
-        pop ecx                 ; row counter back from the stack
-        dec ecx
-        jnz .y_loop
-
-.end_y_loop:                    ; common exit: undo the three pushes made at entry
-        pop edi
-        pop esi
-        pop ebx
-endproc ;asmLoad4bIAPal
-
-;****************************************************************
-; Size: 0, Format: 3
-;
-; ** BY GUGAMAN **
-; 2009 ported to NASM - Sergey (Gonetz) Lipski
-
-proc asmLoad4bIA
-CPU 586
-        %$src     arg
-        %$dst     arg
-        %$wid_64  arg
-        %$height  arg
-        %$line    arg
-        %$ext     arg
-ia4:
-        push ebx
-        push esi
-        push edi
-
-        mov esi,[ebp + %$src]
-        mov edi,[ebp + %$dst]
-        mov ecx,[ebp + %$height]
-.y_loop:
-        push ecx
-        mov ecx,[ebp + %$wid_64]
-.x_loop:
-        push ecx
-
-        mov eax,[esi]           ; read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        ; 1st dword {
-        xor ecx,ecx
-
-        ; pixel #1
-        ;       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        ;       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 ;Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 ; Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        ; pixel #2
-        ;       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        ;       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 ;Alpha
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 ; Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        ; pixel #3
-        ;       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        ;       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        ;Alpha
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 ; Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        ; pixel #4
-        ;       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        ;       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,12 ;Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,8 ; Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-; 2nd dword {
-        xor ecx,ecx
-
-        ; pixel #5
-        ;       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
-        ;       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,8 ;Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,12 ; Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        ; pixel #6
-        ;       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
-        ;       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        ;Alpha
-        mov eax,edx
-        shl eax,4
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx     ; Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        ; pixel #7
-        ;       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
-        ;       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        ;Alpha
-        mov eax,edx
-        shl eax,16
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,12 ; Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        ; pixel #8
-        ;       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
-        ;       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,28 ;Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,24 ; Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        ; }
-
-        ; * copy
-        mov eax,[esi]           ; read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        ; 1st dword {
-        xor ecx,ecx
-
-        ; pixel #1
-        ;       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        ;       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 ;Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 ; Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        ; pixel #2
-        ;       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        ;       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 ;Alpha
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 ; Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        ; pixel #3
-        ;       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        ;       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        ;Alpha
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 ; Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        ; pixel #4
-        ;       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        ;       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,12 ;Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1