Anonymous avatar Anonymous committed 6f76dc7

some cleanup and asm fixes

Comments (0)

Files changed (5)

 #ifdef UNIMP_LOG
     if (settings.log_unk)
     {
-      sprintf (out_buf, "COLOR combine not found: %08lx, #1: (%s-%s)*%s+%s, #2: (%s-%s)*%s+%s\n",
+      sprintf (out_buf, "COLOR combine not found: %08x, #1: (%s-%s)*%s+%s, #2: (%s-%s)*%s+%s\n",
         actual_combine,
         Mode0[rdp.cycle1&0xF], Mode1[(rdp.cycle1>>4)&0xF], Mode2[(rdp.cycle1>>8)&0x1F], Mode3[(rdp.cycle1>>13)&7],
         Mode0[rdp.cycle2&0xF], Mode1[(rdp.cycle2>>4)&0xF], Mode2[(rdp.cycle2>>8)&0x1F], Mode3[(rdp.cycle2>>13)&7]);
 #ifdef UNIMP_LOG
       if (settings.log_unk)
       {
-        sprintf (out_buf, "ALPHA combine not found: %08lx, #1: (%s-%s)*%s+%s, #2: (%s-%s)*%s+%s\n",
+        sprintf (out_buf, "ALPHA combine not found: %08x, #1: (%s-%s)*%s+%s, #2: (%s-%s)*%s+%s\n",
           actual_combine,
           Alpha0[(rdp.cycle1>>16)&7], Alpha1[(rdp.cycle1>>19)&7], Alpha2[(rdp.cycle1>>22)&7], Alpha3[(rdp.cycle1>>25)&7],
           Alpha0[(rdp.cycle2>>16)&7], Alpha1[(rdp.cycle2>>19)&7], Alpha2[(rdp.cycle2>>22)&7], Alpha3[(rdp.cycle2>>25)&7]);
   const char *extensions = grGetString (GR_EXTENSION);
   if (const char * extstr = strstr(extensions, "COMBINE")) {
     if (!strncmp(extstr, "COMBINE", 7)) {
-      LOG ("extensions ");
+      LOG ("extensions ");
       char strColorCombineExt[] = "grColorCombineExt";
-      cmb.grColorCombineExt = (GRCOLORCOMBINEEXT) grGetProcAddress(strColorCombineExt);
+      cmb.grColorCombineExt = (GRCOLORCOMBINEEXT) grGetProcAddress(strColorCombineExt);
       char strAlphaCombineExt[] = "grAlphaCombineExt";
-      cmb.grAlphaCombineExt = (GRCOLORCOMBINEEXT) grGetProcAddress(strAlphaCombineExt);
-      char strTexColorCombineExt[] = "grTexColorCombineExt";
+      cmb.grAlphaCombineExt = (GRCOLORCOMBINEEXT) grGetProcAddress(strAlphaCombineExt);
+      char strTexColorCombineExt[] = "grTexColorCombineExt";
       cmb.grTexColorCombineExt = (GRTEXCOLORCOMBINEEXT) grGetProcAddress(strTexColorCombineExt);
-      char strTexAlphaCombineExt[] = "grTexAlphaCombineExt";
+      char strTexAlphaCombineExt[] = "grTexAlphaCombineExt";
       cmb.grTexAlphaCombineExt = (GRTEXCOLORCOMBINEEXT) grGetProcAddress(strTexAlphaCombineExt);
-      char strConstantColorValueExt[] = "grConstantColorValueExt";
+      char strConstantColorValueExt[] = "grConstantColorValueExt";
       cmb.grConstantColorValueExt = (GRCONSTANTCOLORVALUEEXT) grGetProcAddress(strConstantColorValueExt);
       if (cmb.grColorCombineExt && cmb.grAlphaCombineExt &&
         cmb.grTexColorCombineExt && cmb.grTexAlphaCombineExt)
 	if (CheckKeyPressed(LOGKEY,0x8000)) return;
 #endif
 
-	sprintf (out_buf, "%08lx: (%08lx, %08lx) ", rdp.pc[rdp.pc_i]-8, rdp.cmd0, rdp.cmd1);
+	sprintf (out_buf, "%08x: (%08x, %08x) ", rdp.pc[rdp.pc_i]-8, rdp.cmd0, rdp.cmd1);
 	rdp_err << out_buf;
 
 	va_list ap2;
 
 #include <string.h>
 #include <stdlib.h>
+#include <xlocale.h>
 
 #ifndef _WIN32
 #include <sys/time.h>
   rdp.scale_1024 = settings.scr_res_x / 1024.0f;
   rdp.scale_768 = settings.scr_res_y / 768.0f;
   
-  # warning not sure why res_scl_x isn't being set (commented out)
+//  # warning not sure why res_scl_x isn't being set (commented out)
 //  float res_scl_x = (float)settings.res_x / 320.0f;
   float res_scl_y = (float)settings.res_y / 240.0f;
 
         options |= DUMP_TEX;
 
       ghq_dmptex_toggle_key = 0;
-
+      // Wide-character copy of the ROM name handed to ext_ghq_init below.
+      // The buffer is zero-initialised and one slot is held back from the
+      // conversion, so wName stays NUL-terminated even when the name fills
+      // the buffer or mbstowcs reports a conversion error.
+      wchar_t wName[21] = {0};
+      mbstowcs(wName, rdp.RomName, sizeof(wName) / sizeof(wName[0]) - 1);
       settings.ghq_use = (int)ext_ghq_init(voodoo.max_tex_size, // max texture width supported by hardware
         voodoo.max_tex_size, // max texture height supported by hardware
         voodoo.sup_32bit_tex?32:16, // max texture bpp supported by hardware
         options,
         settings.ghq_cache_size * 1024*1024, // cache texture to system memory
         NULL,
-        NULL,
-        #warning romname/pluginpath needed for GlideHQ support
 //        pluginPath.wchar_str(), // plugin path
-//        rdp.RomName.wchar_str(), // name of ROM. must be no longer than 256 characters
+        wName, // name of ROM. must be no longer than 256 characters
+        #warning pluginpath needed for GlideHQ support
         DisplayLoadProgress);
     }
   }
 input:    none
 output:   none
 *******************************************************************/
-#warning ChangeWindow unimplemented
+//#warning ChangeWindow unimplemented
 EXPORT void CALL ChangeWindow (void)
 {
   LOG ("CALL ChangeWindow()\n");
 
   strncpy(rdp.RomName, name, sizeof(name));
   ReadSpecialSettings (name);
-  
-//  WriteLog(M64MSG_INFO, "fb_clear %d fb_smart %d\n", settings.fb_depth_clear, settings.fb_smart);
-  
+    
   ClearCache ();
 
   CheckDRAMSize();
     
     }
 
-    #warning screen capture disabled!
+//    #warning screen capture disabled!
     /*if (capture_screen)
     {
       //char path[256];
     frame_count ++;
 
     // Open/close debugger?
-    // #warning hotkeys disabled
     if (CheckKeyPressed(G64_VK_SCROLL, 0x0001))
     {
       if (!debugging)
 }
 #endif
 
-// #warning CheckKeyPressed disabled for now
 
 int CheckKeyPressed(int key, int mask)
 {
 #include <stdint.h>
 wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
 {
-    if (wid_64 < 1) wid_64 = 1;
-    if (height < 1) height = 1;
-    int ext = (real_width - (wid_64 << 3)) << 1;
-    unsigned short * pal = rdp.pal_8;
-    
-    #warning case for rdp.tlut_mode = 0 is not implemented!
-      //case 0: //palette is not used
+  if (wid_64 < 1) wid_64 = 1;
+  if (height < 1) height = 1;
+  int ext = (real_width - (wid_64 << 3)) << 1;
+  unsigned short * pal = rdp.pal_8;
+
+  switch (rdp.tlut_mode) {    
+    //#warning case for rdp.tlut_mode = 0 is not implemented!
+    case 0: //palette is not used
       //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. 
       //Thanks to angrylion for the advice
 
-    if (rdp.tlut_mode == 2)
+    {
+        #if !defined(__GNUC__) && !defined(NO_ASM)
+      __asm {  
+        mov esi,dword ptr [src]  
+          mov edi,dword ptr [dst]  
+
+          mov ecx,dword ptr [height]  
+          y_loop:  
+        push ecx  
+
+          mov ecx,dword ptr [wid_64]  
+          x_loop:  
+        mov eax,dword ptr [esi]          // read all 4 pixels  
+          add esi,4  
+
+          mov dword ptr [edi],eax // save dword 
+          add edi,4  
+
+          mov eax,dword ptr [esi]          // read all 4 pixels  
+          add esi,4  
+
+          mov dword ptr [edi],eax // save dword 
+          add edi,4  
+            // *  
+
+          dec ecx  
+          jnz x_loop  
+
+          pop ecx  
+          dec ecx  
+          jz end_y_loop  
+          push ecx  
+
+          add esi,dword ptr [line]  
+          add edi,dword ptr [ext]  
+
+          mov ecx,dword ptr [wid_64]  
+          x_loop_2:  
+        mov eax,dword ptr [esi+4]          // read both pixels  
+
+          mov dword ptr [edi],eax //save dword 
+          add edi,4  
+
+          mov eax,dword ptr [esi]          // read both pixels  
+          add esi,8  
+
+          mov dword ptr [edi],eax //save dword 
+          add edi,4  
+            // *  
+
+          dec ecx  
+          jnz x_loop_2  
+
+          add esi,dword ptr [line]  
+          add edi,dword ptr [ext]  
+
+          pop ecx  
+          dec ecx  
+          jnz y_loop  
+
+          end_y_loop:  
+      }  
+#elif !defined(NO_ASM)
+   //printf("Load8bI\n");
+      int lTemp, lHeight = (int) height;
+      asm volatile (
+        "1:                     \n"  // y_loop6
+        "mov %[wid_64], %%eax   \n"
+        "mov %%eax, %[temp]     \n"
+        "2:                     \n"  // x_loop6
+        "mov (%[src]), %%eax    \n"          // read all 4 pixels  
+        "add $4, %[src]         \n"
+
+        "mov %%eax, (%[dst])    \n" // save dword 
+        "add $4, %[dst]         \n"
+
+        "mov (%[src]), %%eax    \n"          // read all 4 pixels  
+        "add $4, %[src]         \n"
+
+        "mov %%eax, (%[dst])    \n" // save dword 
+        "add $4, %[dst]         \n"
+         // *  
+
+        "decl %[temp]          \n"
+        "jnz 2b                \n" // x_loop6
+
+        "decl %[height]        \n"
+        "jz 4f                 \n" // end_y_loop6
+
+        "add %[line], %[src]   \n"
+        "add %[ext], %[dst]    \n"
+
+        "mov %[wid_64], %%eax   \n"
+        "mov %%eax, %[temp]     \n"
+        "3:                     \n"  // x_loop_26
+        "mov 4(%[src]), %%eax   \n"          // read both pixels  
+
+        "mov %%eax, (%[dst])    \n" //save dword 
+        "add $4, %[dst]         \n"
+
+        "mov (%[src]), %%eax    \n"          // read both pixels  
+        "add $8, %[src]         \n"
+
+        "mov %%eax, (%[dst])    \n" //save dword 
+        "add $4, %[dst]         \n"
+
+        "decl %[temp]          \n"
+        "jnz 3b                \n"  // x_loop_26
+
+        "add %[line], %[src]   \n"
+        "add %[ext], %[dst]    \n"
+
+        "decl %[height]        \n"
+        "jnz 1b                \n"  // y_loop6
+
+        "4:                    \n"  // end_y_loop6
+        : [temp]"=m"(lTemp), [src]"+S"(src), [dst]"+D"(dst), [height]"+g"(lHeight)
+        : [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
+        : "memory", "cc", "eax", "edx"
+        );  
+#endif
+      // asmLoad8bI (src, dst, wid_64, height, line, ext);
+      return /*(0 << 16) | */GR_TEXFMT_ALPHA_8;
+    }
+    case 2: //color palette
     {
 #if !defined(__GNUC__) && !defined(NO_ASM)
-        __asm {
-            mov ebx,dword ptr [pal]
-                
-                mov esi,dword ptr [src]
-                mov edi,dword ptr [dst]
-                
-                mov ecx,dword ptr [height]
-y_loop:
-            push ecx
-                
-                mov ecx,dword ptr [wid_64]
-x_loop:
-            push ecx
-                
-                mov eax,dword ptr [esi]     // read all 4 pixels
-                bswap eax
-                add esi,4
-                mov edx,eax
-                
+      __asm {
+        mov ebx,dword ptr [pal]
+
+          mov esi,dword ptr [src]
+          mov edi,dword ptr [dst]
+
+          mov ecx,dword ptr [height]
+          y_loop:
+        push ecx
+
+          mov ecx,dword ptr [wid_64]
+          x_loop:
+        push ecx
+
+          mov eax,dword ptr [esi]     // read all 4 pixels
+          bswap eax
+          add esi,4
+          mov edx,eax
+
                 // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
-                
+
                 // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
-                
+
                 // * copy
-                mov eax,dword ptr [esi]     // read all 4 pixels
-                bswap eax
-                add esi,4
-                mov edx,eax
-                
+          mov eax,dword ptr [esi]     // read all 4 pixels
+          bswap eax
+          add esi,4
+          mov edx,eax
+
                 // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
-                
+
                 // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
                 // *
-                
-                pop ecx
-                
-                dec ecx
-                jnz x_loop
-                
-                pop ecx
-                dec ecx
-                jz end_y_loop
-                push ecx
-                
-                add esi,dword ptr [line]
-                add edi,dword ptr [ext]
-                
-                mov ecx,dword ptr [wid_64]
-x_loop_2:
-            push ecx
-                
-                mov eax,dword ptr [esi+4]       // read all 4 pixels
-                bswap eax
-                mov edx,eax
-                
+
+          pop ecx
+
+          dec ecx
+          jnz x_loop
+
+          pop ecx
+          dec ecx
+          jz end_y_loop
+          push ecx
+
+          add esi,dword ptr [line]
+          add edi,dword ptr [ext]
+
+          mov ecx,dword ptr [wid_64]
+          x_loop_2:
+        push ecx
+
+          mov eax,dword ptr [esi+4]       // read all 4 pixels
+          bswap eax
+          mov edx,eax
+
                 // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
-                
+
                 // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
-                
+
                 // * copy
-                mov eax,dword ptr [esi]     // read all 4 pixels
-                bswap eax
-                add esi,8
-                mov edx,eax
-                
+          mov eax,dword ptr [esi]     // read all 4 pixels
+          bswap eax
+          add esi,8
+          mov edx,eax
+
                 // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
-                
+
                 // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,1
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,1
-                
-                mov dword ptr [edi],ecx
-                add edi,4
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,1
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,1
+
+          mov dword ptr [edi],ecx
+          add edi,4
                 // }
                 // *
-                
-                pop ecx
-                
-                dec ecx
-                jnz x_loop_2
-                
-                add esi,dword ptr [line]
-                add edi,dword ptr [ext]
-                
-                pop ecx
-                dec ecx
-                jnz y_loop
-                
-end_y_loop:
-        }
+
+          pop ecx
+
+          dec ecx
+          jnz x_loop_2
+
+          add esi,dword ptr [line]
+          add edi,dword ptr [ext]
+
+          pop ecx
+          dec ecx
+          jnz y_loop
+
+          end_y_loop:
+      }
 #elif !defined(NO_ASM)
        //printf("Load8bCI1\n");
-       long lTempX, lTempY, lHeight = (long) height;
-       intptr_t fake_eax, fake_edx;
-       asm volatile (
-             "1:                     \n"  // y_loop4
-             "mov %[c], %[tempy]     \n"
-                
-             "mov %[wid_64], %%ecx   \n"
-             "2:                     \n"  // x_loop4
-             "mov %[c], %[tempx]     \n"
-             
-             "mov (%[src]), %%eax      \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
+      long lTempX, lTempY, lHeight = (long) height;
+      intptr_t fake_eax, fake_edx;
+      asm volatile (
+        "1:                     \n"  // y_loop4
+        "mov %[c], %[tempy]     \n"
+
+        "mov %[wid_64], %%ecx   \n"
+        "2:                     \n"  // x_loop4
+        "mov %[c], %[tempx]     \n"
+
+        "mov (%[src]), %%eax      \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "add $4, %[src]           \n"
+        "mov %%eax, %%edx        \n"
+
              // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
              // }
-                
+
              // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
              // }
-                
+
              // * copy
-             "mov (%[src]), %%eax      \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
+        "mov (%[src]), %%eax      \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "add $4, %[src]           \n"
+        "mov %%eax, %%edx        \n"
+
              // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
              // }
-                
+
              // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-                
-             "mov %[tempx], %[c]     \n"
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
 
-             "dec %%ecx               \n"
-             "jnz 2b                  \n"  // x_loop4
-             
-             "mov %[tempy], %[c]      \n"
-             "dec %%ecx               \n"
-             "jz 4f                   \n"  // end_y_loop4
-             "mov %[c], %[tempy]      \n"
-             
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-             
-             "mov %[wid_64], %%ecx   \n"
-             "3:                     \n"  // x_loop_24
-             "mov %[c], %[tempx]     \n"
-             
-             "mov 4(%[src]), %%eax     \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-                
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "add $8, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-             
-             "mov %[tempx], %[c]      \n"
-             "dec %%ecx               \n"
-             "jnz 3b                  \n"  // x_loop_24
-             
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-             
-             "mov %[tempy], %[c]      \n"
-             "dec %%ecx               \n"
-             "jnz 1b                  \n"  // y_loop4
-             
-             "4:                      \n"  // end_y_loop4
-             : [tempx]"=m"(lTempX), [tempy]"=m"(lTempY), [a] "=&a" (fake_eax), [d] "=&d" (fake_edx), [src]"+S"(src), [dst]"+D"(dst), [c]"+c"(lHeight)
-             : [pal] "r" (pal), [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
-             : "memory", "cc"
-             );
-#endif
-    return (1 << 16) | GR_TEXFMT_ARGB_1555;
-    }
-    else
-    {
-#if !defined(__GNUC__) && !defined(NO_ASM)
-        __asm {
-            mov ebx,dword ptr [pal]
-                
-                mov esi,dword ptr [src]
-                mov edi,dword ptr [dst]
-                
-                mov ecx,dword ptr [height]
-ia_y_loop:
-            push ecx
-                
-                mov ecx,dword ptr [wid_64]
-ia_x_loop:
-            push ecx
-                
-                mov eax,dword ptr [esi]     // read all 4 pixels
-                bswap eax
-                add esi,4
-                mov edx,eax
-                
-                // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                
-                // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                
-                // * copy
-                mov eax,dword ptr [esi]     // read all 4 pixels
-                bswap eax
-                add esi,4
-                mov edx,eax
-                
-                // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                
-                // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                // *
-                
-                pop ecx
-                
-                dec ecx
-                jnz ia_x_loop
-                
-                pop ecx
-                dec ecx
-                jz ia_end_y_loop
-                push ecx
-                
-                add esi,dword ptr [line]
-                add edi,dword ptr [ext]
-                
-                mov ecx,dword ptr [wid_64]
-ia_x_loop_2:
-            push ecx
-                
-                mov eax,dword ptr [esi+4]       // read all 4 pixels
-                bswap eax
-                mov edx,eax
-                
-                // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                
-                // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                
-                // * copy
-                mov eax,dword ptr [esi]     // read all 4 pixels
-                bswap eax
-                add esi,8
-                mov edx,eax
-                
-                // 1st dword output {
-                shr eax,15
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                mov eax,edx
-                shr eax,23
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                
-                // 2nd dword output {
-                mov eax,edx
-                shl eax,1
-                and eax,0x1FE
-                mov cx,word ptr [ebx+eax]
-                ror cx,8
-                shl ecx,16
-                
-                shr edx,7
-                and edx,0x1FE
-                mov cx,word ptr [ebx+edx]
-                ror cx,8
-                
-                mov dword ptr [edi],ecx
-                add edi,4
-                // }
-                // *
-                
-                pop ecx
-                
-                dec ecx
-                jnz ia_x_loop_2
-                
-                add esi,dword ptr [line]
-                add edi,dword ptr [ext]
-                
-                pop ecx
-                dec ecx
-                jnz ia_y_loop
-                
-ia_end_y_loop:
-    }
-#elif !defined(NO_ASM)
-       //printf("Load8bCI1\n");
-       long lTempX, lTempY, lHeight = (long) height;
-        intptr_t fake_eax, fake_edx;
-       asm volatile (
-             "1:                      \n"  // ia_y_loop2
-             "mov %[c], %[tempy]      \n"
-                
-             "mov %[wid_64], %%ecx   \n"
-             "2:                     \n"  // ia_x_loop2
-             "mov %[c], %[tempx]     \n"
-             
-             "mov (%[src]), %%eax      \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-                
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-                
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-                
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-                
-             "mov %[tempx], %[c]      \n"
-             "dec %%ecx               \n"
-             "jnz 2b                  \n"  // ia_x_loop2
-             
-             "mov %[tempy], %[c]      \n"
-             "dec %%ecx               \n"
-             "jz 4f                   \n"  // ia_end_y_loop2
-             "mov %[c], %[tempy]      \n"
-                
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-             
-             "mov %[wid_64], %%ecx    \n"
-             "3:                      \n"  // ia_x_loop_22
-             "mov %[c], %[tempx]      \n"
-             
-             "mov 4(%[src]), %%eax     \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-                
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 4 pixels
-             "bswap %%eax             \n"
-             "add $8, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $15, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $23, %%eax          \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1FE, %%eax       \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $7, %%edx           \n"
-             "and $0x1FE, %%edx       \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
              // }
              // *
 
-             "mov %[tempx], %[c]      \n"
-             "dec %%ecx               \n"
-             "jnz 3b                  \n"  // ia_x_loop_22
-             
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-             
-             "mov %[tempy], %[c]      \n"
-             "dec %%ecx               \n"
-             "jnz 1b                  \n"  // ia_y_loop2
-             
-             "4:                      \n"  // ia_end_y_loop2
-             : [tempx]"=m"(lTempX), [tempy]"=m"(lTempY), [a] "=&a" (fake_eax), [d] "=&d" (fake_edx), [src]"+S"(src), [dst]"+D"(dst), [c]"+c"(lHeight)
-             : [pal] "r" (pal), [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
-             : "memory", "cc"
-             );
+        "mov %[tempx], %[c]     \n"
+
+        "dec %%ecx               \n"
+        "jnz 2b                  \n"  // x_loop4
+
+        "mov %[tempy], %[c]      \n"
+        "dec %%ecx               \n"
+        "jz 4f                   \n"  // end_y_loop4
+        "mov %[c], %[tempy]      \n"
+
+        "add %[line], %[src]     \n"
+        "add %[ext], %[dst]      \n"
+
+        "mov %[wid_64], %%ecx   \n"
+        "3:                     \n"  // x_loop_24
+        "mov %[c], %[tempx]     \n"
+
+        "mov 4(%[src]), %%eax     \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "mov %%eax, %%edx        \n"
+
+             // 1st dword output {
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // 2nd dword output {
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // * copy
+        "mov (%[src]), %%eax      \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "add $8, %[src]           \n"
+        "mov %%eax, %%edx        \n"
+
+             // 1st dword output {
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // 2nd dword output {
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $1, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $1, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+             // *
+
+        "mov %[tempx], %[c]      \n"
+        "dec %%ecx               \n"
+        "jnz 3b                  \n"  // x_loop_24
+
+        "add %[line], %[src]     \n"
+        "add %[ext], %[dst]      \n"
+
+        "mov %[tempy], %[c]      \n"
+        "dec %%ecx               \n"
+        "jnz 1b                  \n"  // y_loop4
+
+        "4:                      \n"  // end_y_loop4
+        : [tempx]"=m"(lTempX), [tempy]"=m"(lTempY), [a] "=&a" (fake_eax), [d] "=&d" (fake_edx), [src]"+S"(src), [dst]"+D"(dst), [c]"+c"(lHeight)
+        : [pal] "r" (pal), [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
+        : "memory", "cc"
+        );
 #endif
-    return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
+      return (1 << 16) | GR_TEXFMT_ARGB_1555;
     }
-    
-    return 0;
+    default: //IA palette
+    {
+#if !defined(__GNUC__) && !defined(NO_ASM)
+      __asm {
+        mov ebx,dword ptr [pal]
+
+          mov esi,dword ptr [src]
+          mov edi,dword ptr [dst]
+
+          mov ecx,dword ptr [height]
+          ia_y_loop:
+        push ecx
+
+          mov ecx,dword ptr [wid_64]
+          ia_x_loop:
+        push ecx
+
+          mov eax,dword ptr [esi]     // read all 4 pixels
+          bswap eax
+          add esi,4
+          mov edx,eax
+
+                // 1st dword output {
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+
+                // 2nd dword output {
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+
+                // * copy
+          mov eax,dword ptr [esi]     // read all 4 pixels
+          bswap eax
+          add esi,4
+          mov edx,eax
+
+                // 1st dword output {
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+
+                // 2nd dword output {
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+                // *
+
+          pop ecx
+
+          dec ecx
+          jnz ia_x_loop
+
+          pop ecx
+          dec ecx
+          jz ia_end_y_loop
+          push ecx
+
+          add esi,dword ptr [line]
+          add edi,dword ptr [ext]
+
+          mov ecx,dword ptr [wid_64]
+          ia_x_loop_2:
+        push ecx
+
+          mov eax,dword ptr [esi+4]       // read all 4 pixels
+          bswap eax
+          mov edx,eax
+
+                // 1st dword output {
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+
+                // 2nd dword output {
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+
+                // * copy
+          mov eax,dword ptr [esi]     // read all 4 pixels
+          bswap eax
+          add esi,8
+          mov edx,eax
+
+                // 1st dword output {
+          shr eax,15
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          mov eax,edx
+          shr eax,23
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+
+                // 2nd dword output {
+          mov eax,edx
+          shl eax,1
+          and eax,0x1FE
+          mov cx,word ptr [ebx+eax]
+          ror cx,8
+          shl ecx,16
+
+          shr edx,7
+          and edx,0x1FE
+          mov cx,word ptr [ebx+edx]
+          ror cx,8
+
+          mov dword ptr [edi],ecx
+          add edi,4
+                // }
+                // *
+
+          pop ecx
+
+          dec ecx
+          jnz ia_x_loop_2
+
+          add esi,dword ptr [line]
+          add edi,dword ptr [ext]
+
+          pop ecx
+          dec ecx
+          jnz ia_y_loop
+
+          ia_end_y_loop:
+      }
+#elif !defined(NO_ASM)
+       //printf("Load8bCI1\n");
+      long lTempX, lTempY, lHeight = (long) height;
+      intptr_t fake_eax, fake_edx;
+      asm volatile (
+        "1:                      \n"  // ia_y_loop2
+        "mov %[c], %[tempy]      \n"
+
+        "mov %[wid_64], %%ecx   \n"
+        "2:                     \n"  // ia_x_loop2
+        "mov %[c], %[tempx]     \n"
+
+        "mov (%[src]), %%eax      \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "add $4, %[src]           \n"
+        "mov %%eax, %%edx        \n"
+
+             // 1st dword output {
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // 2nd dword output {
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // * copy
+        "mov (%[src]), %%eax      \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "add $4, %[src]           \n"
+        "mov %%eax, %%edx        \n"
+
+             // 1st dword output {
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // 2nd dword output {
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+             // *
+
+        "mov %[tempx], %[c]      \n"
+        "dec %%ecx               \n"
+        "jnz 2b                  \n"  // ia_x_loop2
+
+        "mov %[tempy], %[c]      \n"
+        "dec %%ecx               \n"
+        "jz 4f                   \n"  // ia_end_y_loop2
+        "mov %[c], %[tempy]      \n"
+
+        "add %[line], %[src]     \n"
+        "add %[ext], %[dst]      \n"
+
+        "mov %[wid_64], %%ecx    \n"
+        "3:                      \n"  // ia_x_loop_22
+        "mov %[c], %[tempx]      \n"
+
+        "mov 4(%[src]), %%eax     \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "mov %%eax, %%edx        \n"
+
+             // 1st dword output {
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // 2nd dword output {
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // * copy
+        "mov (%[src]), %%eax      \n"      // read all 4 pixels
+        "bswap %%eax             \n"
+        "add $8, %[src]           \n"
+        "mov %%eax, %%edx        \n"
+
+             // 1st dword output {
+        "shr $15, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "mov %%edx, %%eax        \n"
+        "shr $23, %%eax          \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+
+             // 2nd dword output {
+        "mov %%edx, %%eax        \n"
+        "shl $1, %%eax           \n"
+        "and $0x1FE, %%eax       \n"
+        "mov (%[pal],%[a]), %%cx \n"
+        "ror $8, %%cx            \n"
+        "shl $16, %%ecx          \n"
+
+        "shr $7, %%edx           \n"
+        "and $0x1FE, %%edx       \n"
+        "mov (%[pal],%[d]), %%cx \n"
+        "ror $8, %%cx            \n"
+
+        "mov %%ecx, (%[dst])      \n"
+        "add $4, %[dst]           \n"
+             // }
+             // *
+
+        "mov %[tempx], %[c]      \n"
+        "dec %%ecx               \n"
+        "jnz 3b                  \n"  // ia_x_loop_22
+
+        "add %[line], %[src]     \n"
+        "add %[ext], %[dst]      \n"
+
+        "mov %[tempy], %[c]      \n"
+        "dec %%ecx               \n"
+        "jnz 1b                  \n"  // ia_y_loop2
+
+        "4:                      \n"  // ia_end_y_loop2
+        : [tempx]"=m"(lTempX), [tempy]"=m"(lTempY), [a] "=&a" (fake_eax), [d] "=&d" (fake_edx), [src]"+S"(src), [dst]"+D"(dst), [c]"+c"(lHeight)
+        : [pal] "r" (pal), [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
+        : "memory", "cc"
+        );
+#endif
+      return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
+    }
+  }
+  return 0;
 }
 
 //****************************************************************
 
 wxUint32 Load8bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)  
 { 
-    if (rdp.tlut_mode != 0)
-        return Load8bCI (dst, src, wid_64, height, line, real_width, tile);
+  if (rdp.tlut_mode != 0)
+    return Load8bCI (dst, src, wid_64, height, line, real_width, tile);
 
-    if (wid_64 < 1) wid_64 = 1;  
-    if (height < 1) height = 1;  
-    int ext = (real_width - (wid_64 << 3));  
+  if (wid_64 < 1) wid_64 = 1;  
+  if (height < 1) height = 1;  
+  int ext = (real_width - (wid_64 << 3));  
 #if !defined(__GNUC__) && !defined(NO_ASM)
-    __asm {  
-        mov esi,dword ptr [src]  
-            mov edi,dword ptr [dst]  
-            
-            mov ecx,dword ptr [height]  
-y_loop:  
-        push ecx  
-            
-            mov ecx,dword ptr [wid_64]  
-x_loop:  
-        mov eax,dword ptr [esi]          // read all 4 pixels  
-            add esi,4  
-            
-            xor ebx,ebx 
-            mov edx,eax 
-            shr eax,4//all alpha 
-            and eax,0x0F0F0F0F 
-            or ebx,eax 
-            mov eax,edx//intensity 
-            shl eax,4 
-            and eax,0xF0F0F0F0 
-            or ebx,eax 
-            
-            mov dword ptr [edi],ebx // save dword 
-            add edi,4  
-            
-            mov eax,dword ptr [esi]          // read all 4 pixels  
-            add esi,4  
-            
-            xor ebx,ebx 
-            mov edx,eax 
-            shr eax,4//all alpha 
-            and eax,0x0F0F0F0F 
-            or ebx,eax 
-            mov eax,edx//intensity 
-            shl eax,4 
-            and eax,0xF0F0F0F0 
-            or ebx,eax 
-            
-            mov dword ptr [edi],ebx // save dword 
-            add edi,4  
+  __asm {  
+    mov esi,dword ptr [src]  
+      mov edi,dword ptr [dst]  
+
+      mov ecx,dword ptr [height]  
+      y_loop:  
+    push ecx  
+
+      mov ecx,dword ptr [wid_64]  
+      x_loop:  
+    mov eax,dword ptr [esi]          // read all 4 pixels  
+      add esi,4  
+
+      xor ebx,ebx 
+      mov edx,eax 
+      shr eax,4//all alpha 
+      and eax,0x0F0F0F0F 
+      or ebx,eax 
+      mov eax,edx//intensity 
+      shl eax,4 
+      and eax,0xF0F0F0F0 
+      or ebx,eax 
+
+      mov dword ptr [edi],ebx // save dword 
+      add edi,4  
+
+      mov eax,dword ptr [esi]          // read all 4 pixels  
+      add esi,4  
+
+      xor ebx,ebx 
+      mov edx,eax 
+      shr eax,4//all alpha 
+      and eax,0x0F0F0F0F 
+      or ebx,eax 
+      mov eax,edx//intensity 
+      shl eax,4 
+      and eax,0xF0F0F0F0 
+      or ebx,eax 
+
+      mov dword ptr [edi],ebx // save dword 
+      add edi,4  
             // *  
-            
-            dec ecx  
-            jnz x_loop  
-            
-            pop ecx  
-            dec ecx  
-            jz end_y_loop  
-            push ecx  
-            
-            add esi,dword ptr [line]  
-            add edi,dword ptr [ext]  
-            
-            mov ecx,dword ptr [wid_64]  
-x_loop_2:  
-        mov eax,dword ptr [esi+4]          // read both pixels  
-            
-            xor ebx,ebx 
-            mov edx,eax 
-            shr eax,4//all alpha 
-            and eax,0x0F0F0F0F 
-            or ebx,eax 
-            mov eax,edx//intensity 
-            shl eax,4 
-            and eax,0xF0F0F0F0 
-            or ebx,eax 
-            
-            mov dword ptr [edi],ebx //save dword 
-            add edi,4  
-            
-            mov eax,dword ptr [esi]          // read both pixels  
-            add esi,8  
-            
-            xor ebx,ebx 
-            mov edx,eax 
-            shr eax,4//all alpha 
-            and eax,0x0F0F0F0F 
-            or ebx,eax 
-            mov eax,edx//intensity 
-            shl eax,4 
-            and eax,0xF0F0F0F0 
-            or ebx,eax 
-            
-            mov dword ptr [edi],ebx //save dword 
-            add edi,4  
+
+      dec ecx  
+      jnz x_loop  
+
+      pop ecx  
+      dec ecx  
+      jz end_y_loop  
+      push ecx  
+
+      add esi,dword ptr [line]  
+      add edi,dword ptr [ext]  
+
+      mov ecx,dword ptr [wid_64]  
+      x_loop_2:  
+    mov eax,dword ptr [esi+4]          // read both pixels  
+
+      xor ebx,ebx 
+      mov edx,eax 
+      shr eax,4//all alpha 
+      and eax,0x0F0F0F0F 
+      or ebx,eax 
+      mov eax,edx//intensity 
+      shl eax,4 
+      and eax,0xF0F0F0F0 
+      or ebx,eax 
+
+      mov dword ptr [edi],ebx //save dword 
+      add edi,4  
+
+      mov eax,dword ptr [esi]          // read both pixels  
+      add esi,8  
+
+      xor ebx,ebx 
+      mov edx,eax 
+      shr eax,4//all alpha 
+      and eax,0x0F0F0F0F 
+      or ebx,eax 
+      mov eax,edx//intensity 
+      shl eax,4 
+      and eax,0xF0F0F0F0 
+      or ebx,eax 
+
+      mov dword ptr [edi],ebx //save dword 
+      add edi,4  
             // *  
-            
-            dec ecx  
-            jnz x_loop_2  
-            
-            add esi,dword ptr [line]  
-            add edi,dword ptr [ext]  
-            
-            pop ecx  
-            dec ecx  
-            jnz y_loop  
-            
-end_y_loop:  
-    }  
+
+      dec ecx  
+      jnz x_loop_2  
+
+      add esi,dword ptr [line]  
+      add edi,dword ptr [ext]  
+
+      pop ecx  
+      dec ecx  
+      jnz y_loop  
+
+      end_y_loop:  
+  }  
 #elif !defined(NO_ASM)
    //printf("Load8bIA\n");
-   int lTemp, lHeight = (int) height;
-   asm volatile (
-         "1:                     \n"  // y_loop5
-         "mov %[wid_64], %%eax    \n"
-         "mov %%eax, %[temp]      \n"
-         "2:                      \n"  // x_loop5
-         "mov (%[src]), %%eax     \n"          // read all 4 pixels  
-         "add $4, %[src]          \n"
-         
-         "xor %%ecx, %%ecx       \n"
-         "mov %%eax, %%edx       \n"
-         "shr $4, %%eax          \n"//all alpha 
-         "and $0x0F0F0F0F, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         "mov %%edx, %%eax       \n"//intensity 
-         "shl $4, %%eax          \n"
-         "and $0xF0F0F0F0, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         
-         "mov %%ecx, (%[dst])     \n" // save dword 
-         "add $4, %[dst]          \n"
-         
-         "mov (%[src]), %%eax     \n"          // read all 4 pixels  
-         "add $4, %[src]          \n"
-         
-         "xor %%ecx, %%ecx       \n"
-         "mov %%eax, %%edx       \n"
-         "shr $4, %%eax          \n"//all alpha 
-         "and $0x0F0F0F0F, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         "mov %%edx, %%eax       \n"//intensity 
-         "shl $4, %%eax          \n"
-         "and $0xF0F0F0F0, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         
-         "mov %%ecx, (%[dst])    \n" // save dword 
-         "add $4, %[dst]         \n"
-            
-         "decl %[temp]           \n"
-         "jnz 2b                 \n"  // x_loop5
-         
-         "decl %[height]         \n"
-         "jz 4f                  \n"  // end_y_loop5
-         
-         "add %[line], %[src]    \n"
-         "add %[ext], %[dst]     \n"
-         
-         "mov %[wid_64], %%eax    \n"
-         "mov %%eax, %[temp]      \n"
-         "3:                      \n"  // x_loop_25
-         "mov 4(%[src]), %%eax    \n"          // read both pixels  
-         
-         "xor %%ecx, %%ecx       \n"
-         "mov %%eax, %%edx       \n"
-         "shr $4, %%eax          \n"//all alpha 
-         "and $0x0F0F0F0F, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         "mov %%edx, %%eax       \n"//intensity 
-         "shl $4, %%eax          \n"
-         "and $0xF0F0F0F0, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         
-         "mov %%ecx, (%[dst])     \n" //save dword 
-         "add $4, %[dst]          \n"
-         
-         "mov (%[src]), %%eax     \n"          // read both pixels  
-         "add $8, %[src]          \n"
-         
-         "xor %%ecx, %%ecx       \n"
-         "mov %%eax, %%edx       \n"
-         "shr $4, %%eax          \n"//all alpha 
-         "and $0x0F0F0F0F, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         "mov %%edx, %%eax       \n"//intensity 
-         "shl $4, %%eax          \n"
-         "and $0xF0F0F0F0, %%eax \n"
-         "or %%eax, %%ecx        \n"
-         
-         "mov %%ecx, (%[dst])     \n" //save dword 
-         "add $4, %[dst]          \n"
+  int lTemp, lHeight = (int) height;
+  asm volatile (
+    "1:                     \n"  // y_loop5
+    "mov %[wid_64], %%eax    \n"
+    "mov %%eax, %[temp]      \n"
+    "2:                      \n"  // x_loop5
+    "mov (%[src]), %%eax     \n"          // read all 4 pixels  
+    "add $4, %[src]          \n"
+
+    "xor %%ecx, %%ecx       \n"
+    "mov %%eax, %%edx       \n"
+    "shr $4, %%eax          \n"//all alpha 
+    "and $0x0F0F0F0F, %%eax \n"
+    "or %%eax, %%ecx        \n"
+    "mov %%edx, %%eax       \n"//intensity 
+    "shl $4, %%eax          \n"
+    "and $0xF0F0F0F0, %%eax \n"
+    "or %%eax, %%ecx        \n"
+
+    "mov %%ecx, (%[dst])     \n" // save dword 
+    "add $4, %[dst]          \n"
+
+    "mov (%[src]), %%eax     \n"          // read all 4 pixels  
+    "add $4, %[src]          \n"
+
+    "xor %%ecx, %%ecx       \n"
+    "mov %%eax, %%edx       \n"
+    "shr $4, %%eax          \n"//all alpha 
+    "and $0x0F0F0F0F, %%eax \n"
+    "or %%eax, %%ecx        \n"
+    "mov %%edx, %%eax       \n"//intensity 
+    "shl $4, %%eax          \n"
+    "and $0xF0F0F0F0, %%eax \n"
+    "or %%eax, %%ecx        \n"
+
+    "mov %%ecx, (%[dst])    \n" // save dword 
+    "add $4, %[dst]         \n"
+
+    "decl %[temp]           \n"
+    "jnz 2b                 \n"  // x_loop5
+
+    "decl %[height]         \n"
+    "jz 4f                  \n"  // end_y_loop5
+
+    "add %[line], %[src]    \n"
+    "add %[ext], %[dst]     \n"
+
+    "mov %[wid_64], %%eax    \n"
+    "mov %%eax, %[temp]      \n"
+    "3:                      \n"  // x_loop_25
+    "mov 4(%[src]), %%eax    \n"          // read both pixels  
+
+    "xor %%ecx, %%ecx       \n"
+    "mov %%eax, %%edx       \n"
+    "shr $4, %%eax          \n"//all alpha 
+    "and $0x0F0F0F0F, %%eax \n"
+    "or %%eax, %%ecx        \n"
+    "mov %%edx, %%eax       \n"//intensity 
+    "shl $4, %%eax          \n"
+    "and $0xF0F0F0F0, %%eax \n"
+    "or %%eax, %%ecx        \n"
+
+    "mov %%ecx, (%[dst])     \n" //save dword 
+    "add $4, %[dst]          \n"
+
+    "mov (%[src]), %%eax     \n"          // read both pixels  
+    "add $8, %[src]          \n"
+
+    "xor %%ecx, %%ecx       \n"
+    "mov %%eax, %%edx       \n"
+    "shr $4, %%eax          \n"//all alpha 
+    "and $0x0F0F0F0F, %%eax \n"
+    "or %%eax, %%ecx        \n"
+    "mov %%edx, %%eax       \n"//intensity 
+    "shl $4, %%eax          \n"
+    "and $0xF0F0F0F0, %%eax \n"
+    "or %%eax, %%ecx        \n"
+
+    "mov %%ecx, (%[dst])     \n" //save dword 
+    "add $4, %[dst]          \n"
          // *  
-         
-         "decl %[temp]           \n"
-         "jnz 3b                 \n"  // x_loop_25
-         
-         "add %[line], %[src]    \n"
-         "add %[ext], %[dst]     \n"
-         
-         "decl %[height]         \n"
-         "jnz 1b                 \n"  // y_loop5
-         
-         "4:                     \n"  // end_y_loop5
-           : [temp]"=m"(lTemp), [src] "+S"(src), [dst] "+D"(dst), [height] "+g"(lHeight)
-           : [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
-           : "memory", "cc", "eax", "edx", "ecx"
-           );
+
+    "decl %[temp]           \n"
+    "jnz 3b                 \n"  // x_loop_25
+
+    "add %[line], %[src]    \n"
+    "add %[ext], %[dst]     \n"
+
+    "decl %[height]         \n"
+    "jnz 1b                 \n"  // y_loop5
+
+    "4:                     \n"  // end_y_loop5
+    : [temp]"=m"(lTemp), [src] "+S"(src), [dst] "+D"(dst), [height] "+g"(lHeight)
+    : [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
+    : "memory", "cc", "eax", "edx", "ecx"
+    );
 #endif
-    return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;  
+  return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;  
 } 
 
 //****************************************************************
 
 wxUint32 Load8bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)  
 { 
-    if (rdp.tlut_mode != 0)
-        return Load8bCI (dst, src, wid_64, height, line, real_width, tile);
-    
-    if (wid_64 < 1) wid_64 = 1;  
-    if (height < 1) height = 1;  
-    int ext = (real_width - (wid_64 << 3));  
+  if (rdp.tlut_mode != 0)
+    return Load8bCI (dst, src, wid_64, height, line, real_width, tile);
+
+  if (wid_64 < 1) wid_64 = 1;  
+  if (height < 1) height = 1;  
+  int ext = (real_width - (wid_64 << 3));  
 #if !defined(__GNUC__) && !defined(NO_ASM)
-    __asm {  
-        mov esi,dword ptr [src]  
-            mov edi,dword ptr [dst]  
-            
-            mov ecx,dword ptr [height]  
-y_loop:  
-        push ecx  
-            
-            mov ecx,dword ptr [wid_64]  
-x_loop:  
-        mov eax,dword ptr [esi]          // read all 4 pixels  
-            add esi,4