Commits

ecsv  committed 1ff6142

Add C conversion of TexLoad4b assembler functions

  • Participants
  • Parent commits 73bffaf

Comments (0)

Files changed (2)

File projects/unix/Makefile

 
 SOURCE = $(SRCDIR)/Glide64/Texture.asm
 #CFLAGS += -DOLDASM_asmTextureCRC
+#CFLAGS += -DOLDASM_asmLoad4bCI
+#CFLAGS += -DOLDASM_asmLoad4bIAPal
+#CFLAGS += -DOLDASM_asmLoad4bIA
+#CFLAGS += -DOLDASM_asmLoad4bI
 #CFLAGS += -DOLDASM_asmLoadBlock
 #CFLAGS += -DOLDASM_asmLoadTile
 

File src/Glide64/TexLoad4b.h

 //
 //****************************************************************
 #include <stdint.h>
+
+extern "C" void asmLoad4bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, uint16_t line, int ext, uint16_t *pal);
+extern "C" void asmLoad4bIAPal(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal);
+extern "C" void asmLoad4bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
+extern "C" void asmLoad4bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext);
+
+#define ALOWORD(x)   (*((uint16_t*)&(x)))   // lvalue aliasing the first two bytes of x in memory: the low 16-bit word on little-endian hosts
+
+// Rotates `value` right by `count` bit positions within the width of T and
+// returns the rotated value. T is expected to be an unsigned integer type.
+// `count` is reduced modulo the bit width of T, so a rotation by 0 or by any
+// multiple of the width returns `value` unchanged.
+template<class T> static inline T __ROR__(T value, unsigned int count)
+{
+  const unsigned int nbits = sizeof(T) * 8;
+  count %= nbits;
+
+  // A zero rotation must bypass the shifts below: `value << nbits` would
+  // shift by the full width of T, which is undefined for 32-bit and wider T.
+  if (count == 0)
+    return value;
+
+  // Cast after every shift so that integer promotion of narrow types
+  // (e.g. uint16_t -> int) cannot leave bits above the width of T.
+  const T low = (T)(value << (nbits - count));
+  const T high = (T)(value >> count);
+  return (T)(high | low);
+}
+/* load4bCI: expands 4-bit palette-indexed texels from `src` into 16-bit
+ * texels at `dst`, handling two image rows per pass of the outer loop.
+ *
+ *   src     - source texel data; every 32-bit word is byte-swapped before its
+ *             eight 4-bit indices are extracted
+ *   dst     - output buffer; each index produces one 16-bit value
+ *   wid_64  - number of 8-byte source groups per row (each yields 32 output
+ *             bytes); must be >= 1 because the loops are do/while
+ *   height  - number of rows; must be >= 1 for the same reason
+ *   line    - byte offset added to the source position after each row
+ *   ext     - byte offset added to the destination position after each row
+ *   pal     - table of 16-bit entries; the 4-bit index selects entry 0..15
+ *
+ * Every palette entry is rotated right by one bit before it is stored.
+ * The source offset is reduced with mask 0x7FF whenever `line` is added and
+ * after each 8-byte step of an odd row; steps inside an even row are not
+ * masked. Odd rows decode the two 32-bit words of each 8-byte group in
+ * swapped order (presumably mirroring how the emulated texture memory
+ * stores odd lines; confirm against the caller).
+ */
+static inline void load4bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, uint16_t line, int ext, uint16_t *pal)
+{
+  uint8_t *v7;  // current source read position
+  uint8_t *v8;  // current destination write position
+  int v9;  // rows still to process
+  int v10;
+  int v11;
+  uint32_t v12;
+  uint8_t *v13;
+  uint32_t v14;
+  uint32_t *v15;
+  uint32_t v16;
+  uint8_t *v17;
+  uint32_t *v18;
+  int v19;
+  int v20;
+  uint32_t v21;
+  uint32_t v22;
+  uint32_t *v23;
+  uint32_t v24;
+  int v25;
+  int v26;
+
+  v7 = src;
+  v8 = dst;
+  v9 = height;
+  do
+  {
+    v25 = v9;
+    v10 = wid_64;
+    do  // even row: one 8-byte source group per iteration
+    {
+      v11 = v10;  // keep the counter; v10 is reused as scratch just below
+      v12 = __builtin_bswap32(*(uint32_t *)v7);  // first word of the group, byte-swapped
+      v13 = v7 + 4;
+      ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1E)), 1);  // index in bits 24-27; byte offset = 2 * index
+      v14 = v10 << 16;  // that entry goes to the high half of the output word
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 27) & 0x1E)), 1);  // index in bits 28-31 fills the low half
+      *(uint32_t *)v8 = v14;
+      v15 = (uint32_t *)(v8 + 4);
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 19) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 11) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v12 & 0x1E)), 1);  // index in bits 0-3
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 3) & 0x1E)), 1);  // index in bits 4-7
+      *v15 = v14;
+      ++v15;
+      v16 = __builtin_bswap32(*(uint32_t *)v13);  // second word of the group
+      v7 = v13 + 4;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 27) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 19) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 11) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v16 & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 3) & 0x1E)), 1);
+      *v15 = v14;
+      v8 = (uint8_t *)(v15 + 1);
+      v10 = v11 - 1;  // restore the counter from the saved copy and decrement
+    }
+    while ( v11 != 1 );
+    if ( v25 == 1 )  // that was the last row
+      break;
+    v26 = v25 - 1;
+    v17 = &src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF];  // advance by `line`, offset from src masked to 11 bits
+    v18 = (uint32_t *)&v8[ext];
+    v19 = wid_64;
+    do  // odd row: same decoding, but the two words of each group are taken in swapped order
+    {
+      v20 = v19;
+      v21 = __builtin_bswap32(*((uint32_t *)v17 + 1));  // second word of the group is decoded first
+      ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1E)), 1);
+      v22 = v19 << 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 27) & 0x1E)), 1);
+      *v18 = v22;
+      v23 = v18 + 1;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 19) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 11) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v21 & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 3) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      v24 = __builtin_bswap32(*(uint32_t *)v17);  // then the first word of the group
+      v17 = &src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF];  // step to the next group, offset masked to 11 bits
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 27) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 19) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 11) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v24 & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 3) & 0x1E)), 1);
+      *v23 = v22;
+      v18 = v23 + 1;
+      v19 = v20 - 1;
+    }
+    while ( v20 != 1 );
+    v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF];  // advance to the next even row, masked as above
+    v8 = (uint8_t *)((char *)v18 + ext);
+    v9 = v26 - 1;
+  }
+  while ( v26 != 1 );
+}
+/* load4bIAPal: expands 4-bit palette-indexed texels from `src` into 16-bit
+ * texels at `dst`, handling two image rows per pass of the outer loop.
+ *
+ *   src     - source texel data; every 32-bit word is byte-swapped before its
+ *             eight 4-bit indices are extracted
+ *   dst     - output buffer; each index produces one 16-bit value
+ *   wid_64  - number of 8-byte source groups per row (each yields 32 output
+ *             bytes); must be >= 1 because the loops are do/while
+ *   height  - number of rows; must be >= 1 for the same reason
+ *   line    - byte offset added to the source position after each row
+ *   ext     - byte offset added to the destination position after each row
+ *   pal     - table of 16-bit entries; the 4-bit index selects entry 0..15
+ *
+ * Every palette entry is rotated right by eight bits, i.e. its two bytes
+ * are exchanged, before it is stored. Apart from that rotation count the
+ * body performs the same steps as load4bCI: the source offset is reduced
+ * with mask 0x7FF whenever `line` is added and after each 8-byte step of an
+ * odd row, and odd rows decode the two words of each group in swapped order.
+ */
+static inline void load4bIAPal(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal)
+{
+  uint8_t *v7;  // current source read position
+  uint32_t *v8;  // current destination write position
+  int v9;  // rows still to process
+  int v10;
+  int v11;
+  uint32_t v12;
+  uint32_t *v13;
+  uint32_t v14;
+  uint32_t *v15;
+  uint32_t v16;
+  uint8_t *v17;
+  uint32_t *v18;
+  int v19;
+  int v20;
+  uint32_t v21;
+  uint32_t v22;
+  uint32_t *v23;
+  uint32_t v24;
+  int v25;
+  int v26;
+
+  v7 = src;
+  v8 = (uint32_t *)dst;
+  v9 = height;
+  do
+  {
+    v25 = v9;
+    v10 = wid_64;
+    do  // even row: one 8-byte source group per iteration
+    {
+      v11 = v10;  // keep the counter; v10 is reused as scratch just below
+      v12 = __builtin_bswap32(*(uint32_t *)v7);  // first word of the group, byte-swapped
+      v13 = (uint32_t *)(v7 + 4);
+      ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1E)), 8);  // index in bits 24-27; byte offset = 2 * index
+      v14 = v10 << 16;  // that entry goes to the high half of the output word
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 27) & 0x1E)), 8);  // index in bits 28-31 fills the low half
+      *v8 = v14;
+      v15 = v8 + 1;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 19) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 11) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v12 & 0x1E)), 8);  // index in bits 0-3
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 3) & 0x1E)), 8);  // index in bits 4-7
+      *v15 = v14;
+      ++v15;
+      v16 = __builtin_bswap32(*v13);  // second word of the group
+      v7 = (uint8_t *)(v13 + 1);
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 27) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 19) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 11) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v16 & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 3) & 0x1E)), 8);
+      *v15 = v14;
+      v8 = v15 + 1;
+      v10 = v11 - 1;  // restore the counter from the saved copy and decrement
+    }
+    while ( v11 != 1 );
+    if ( v25 == 1 )  // that was the last row
+      break;
+    v26 = v25 - 1;
+    v17 = &src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF];  // advance by `line`, offset from src masked to 11 bits
+    v18 = (uint32_t *)((char *)v8 + ext);
+    v19 = wid_64;
+    do  // odd row: same decoding, but the two words of each group are taken in swapped order
+    {
+      v20 = v19;
+      v21 = __builtin_bswap32(*((uint32_t *)v17 + 1));  // second word of the group is decoded first
+      ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1E)), 8);
+      v22 = v19 << 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 27) & 0x1E)), 8);
+      *v18 = v22;
+      v23 = v18 + 1;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 19) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 11) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v21 & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 3) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      v24 = __builtin_bswap32(*(uint32_t *)v17);  // then the first word of the group
+      v17 = &src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF];  // step to the next group, offset masked to 11 bits
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 27) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 19) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 11) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v24 & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 3) & 0x1E)), 8);
+      *v23 = v22;
+      v18 = v23 + 1;
+      v19 = v20 - 1;
+    }
+    while ( v20 != 1 );
+    v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF];  // advance to the next even row, masked as above
+    v8 = (uint32_t *)((char *)v18 + ext);
+    v9 = v26 - 1;
+  }
+  while ( v26 != 1 );
+}
+/* load4bIA: expands 4-bit texels from `src` into one byte each at `dst`,
+ * handling two image rows per pass of the outer loop. No palette is used.
+ *
+ * Each source nibble is split into its three upper bits and its lowest bit.
+ * In the output byte the lowest bit is replicated into bits 4-7, the three
+ * upper bits land in bits 1-3, and the topmost of them is copied into bit 0.
+ * (Going by the function name these are presumably a 3-bit intensity and a
+ * 1-bit alpha; confirm against the texture format documentation.)
+ *
+ *   src     - source texel data; every 32-bit word is byte-swapped before its
+ *             eight nibbles are extracted
+ *   dst     - output buffer; each 32-bit source word yields two 32-bit words
+ *   wid_64  - number of 8-byte source groups per row (each yields 16 output
+ *             bytes); must be >= 1 because the loops are do/while
+ *   height  - number of rows; must be >= 1 for the same reason
+ *   line    - byte offset added to the source pointer after each row
+ *   ext     - byte offset added to the destination pointer after each row
+ *
+ * Odd rows decode the two 32-bit words of each 8-byte group in swapped
+ * order. Unlike the palette variants, no 0x7FF mask is applied to the
+ * source position here.
+ */
+static inline void load4bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext)
+{
+  uint32_t *v6;  // current source read position
+  uint32_t *v7;  // current destination write position
+  int v8;  // rows still to process
+  int v9;
+  int v10;
+  uint32_t v11;
+  uint32_t *v12;
+  uint32_t v13;
+  uint32_t v14;
+  uint32_t v15;
+  uint32_t *v16;
+  uint32_t v17;
+  uint32_t v18;
+  uint32_t v19;
+  uint32_t v20;
+  uint32_t v21;
+  uint32_t v22;
+  uint32_t v23;
+  uint32_t v24;
+  uint32_t v25;
+  uint32_t v26;
+  uint32_t v27;
+  uint32_t v28;
+  uint32_t v29;
+  uint32_t v30;
+  uint32_t v31;
+  uint32_t v32;
+  uint32_t *v33;
+  uint32_t *v34;
+  int v35;
+  int v36;
+  uint32_t v37;
+  uint32_t v38;
+  uint32_t v39;
+  uint32_t *v40;
+  uint32_t v41;
+  uint32_t v42;
+  uint32_t v43;
+  uint32_t v44;
+  uint32_t v45;
+  uint32_t v46;
+  uint32_t v47;
+  uint32_t v48;
+  uint32_t v49;
+  uint32_t v50;
+  uint32_t v51;
+  uint32_t v52;
+  uint32_t v53;
+  uint32_t v54;
+  uint32_t v55;
+  uint32_t v56;
+  int v57;
+  int v58;
+
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height;
+  do
+  {
+    v57 = v8;
+    v9 = wid_64;
+    do  // even row: one 8-byte source group per iteration
+    {
+      v10 = v9;
+      v11 = __builtin_bswap32(*v6);  // first word of the group, byte-swapped
+      v12 = v6 + 1;
+      v13 = v11;  // untouched copy; v11 itself is shifted and masked below
+      v14 = 8 * (v11 & 0x100000) | 4 * (v11 & 0x100000) | 2 * (v11 & 0x100000) | v11 & 0x100000 | (((v11 >> 16) & 0xE00) >> 3) & 0x100 | (v11 >> 16) & 0xE00 | 8 * ((v11 >> 12) & 0x1000) | 4 * ((v11 >> 12) & 0x1000) | 2 * ((v11 >> 12) & 0x1000) | (v11 >> 12) & 0x1000 | (((v11 >> 28) & 0xE) >> 3) | (v11 >> 28) & 0xE | 8 * ((v11 >> 24) & 0x10) | 4 * ((v11 >> 24) & 0x10) | 2 * ((v11 >> 24) & 0x10) | (v11 >> 24) & 0x10;
+      v11 >>= 4;
+      v11 &= 0xE0000u;  // three upper bits of the nibble at source bits 20-23, now at output bits 17-19
+      v15 = v11 | v14;
+      v11 >>= 3;  // topmost of those bits lines up with output bit 16
+      *v7 = (((v13 << 8) & 0xE000000) >> 3) & 0x1000000 | (v13 << 8) & 0xE000000 | 8 * ((v13 << 12) & 0x10000000) | 4 * ((v13 << 12) & 0x10000000) | 2 * ((v13 << 12) & 0x10000000) | (v13 << 12) & 0x10000000 | v11 & 0x10000 | v15;
+      v16 = v7 + 1;
+      v17 = 16 * (uint16_t)v13 & 0x1000;  // source bit 8 moved to output bit 12
+      v18 = ((v13 & 0xE00) >> 3) & 0x100 | v13 & 0xE00 | 8 * v17 | 4 * v17 | 2 * v17 | v17 | (((v13 >> 12) & 0xE) >> 3) | (v13 >> 12) & 0xE | 8 * ((v13 >> 8) & 0x10) | 4 * ((v13 >> 8) & 0x10) | 2 * ((v13 >> 8) & 0x10) | (v13 >> 8) & 0x10;
+      v19 = v13 << 16;
+      v20 = 8 * (v19 & 0x100000) | 4 * (v19 & 0x100000) | 2 * (v19 & 0x100000) | v19 & 0x100000 | v18;
+      v21 = v13 << 12;
+      v21 &= 0xE0000u;
+      v22 = v21 | v20;
+      v21 >>= 3;
+      *v16 = (((v13 << 24) & 0xE000000) >> 3) & 0x1000000 | (v13 << 24) & 0xE000000 | 8 * ((v13 << 28) & 0x10000000) | 4 * ((v13 << 28) & 0x10000000) | 2 * ((v13 << 28) & 0x10000000) | (v13 << 28) & 0x10000000 | v21 & 0x10000 | v22;
+      ++v16;
+      v23 = __builtin_bswap32(*v12);  // second word of the group
+      v6 = v12 + 1;
+      v24 = v23;
+      v25 = 8 * (v23 & 0x100000) | 4 * (v23 & 0x100000) | 2 * (v23 & 0x100000) | v23 & 0x100000 | (((v23 >> 16) & 0xE00) >> 3) & 0x100 | (v23 >> 16) & 0xE00 | 8 * ((v23 >> 12) & 0x1000) | 4 * ((v23 >> 12) & 0x1000) | 2 * ((v23 >> 12) & 0x1000) | (v23 >> 12) & 0x1000 | (((v23 >> 28) & 0xE) >> 3) | (v23 >> 28) & 0xE | 8 * ((v23 >> 24) & 0x10) | 4 * ((v23 >> 24) & 0x10) | 2 * ((v23 >> 24) & 0x10) | (v23 >> 24) & 0x10;
+      v23 >>= 4;
+      v23 &= 0xE0000u;
+      v26 = v23 | v25;
+      v23 >>= 3;
+      *v16 = (((v24 << 8) & 0xE000000) >> 3) & 0x1000000 | (v24 << 8) & 0xE000000 | 8 * ((v24 << 12) & 0x10000000) | 4 * ((v24 << 12) & 0x10000000) | 2 * ((v24 << 12) & 0x10000000) | (v24 << 12) & 0x10000000 | v23 & 0x10000 | v26;
+      ++v16;
+      v27 = 16 * (uint16_t)v24 & 0x1000;
+      v28 = ((v24 & 0xE00) >> 3) & 0x100 | v24 & 0xE00 | 8 * v27 | 4 * v27 | 2 * v27 | v27 | (((v24 >> 12) & 0xE) >> 3) | (v24 >> 12) & 0xE | 8 * ((v24 >> 8) & 0x10) | 4 * ((v24 >> 8) & 0x10) | 2 * ((v24 >> 8) & 0x10) | (v24 >> 8) & 0x10;
+      v29 = v24 << 16;
+      v30 = 8 * (v29 & 0x100000) | 4 * (v29 & 0x100000) | 2 * (v29 & 0x100000) | v29 & 0x100000 | v28;
+      v31 = v24 << 12;
+      v31 &= 0xE0000u;
+      v32 = v31 | v30;
+      v31 >>= 3;
+      *v16 = (((v24 << 24) & 0xE000000) >> 3) & 0x1000000 | (v24 << 24) & 0xE000000 | 8 * ((v24 << 28) & 0x10000000) | 4 * ((v24 << 28) & 0x10000000) | 2 * ((v24 << 28) & 0x10000000) | (v24 << 28) & 0x10000000 | v31 & 0x10000 | v32;
+      v7 = v16 + 1;
+      v9 = v10 - 1;
+    }
+    while ( v10 != 1 );
+    if ( v57 == 1 )  // that was the last row
+      break;
+    v58 = v57 - 1;
+    v33 = (uint32_t *)((char *)v6 + line);  // advance source by `line` bytes (no masking)
+    v34 = (uint32_t *)((char *)v7 + ext);
+    v35 = wid_64;
+    do  // odd row: same decoding, but the two words of each group are taken in swapped order
+    {
+      v36 = v35;
+      v37 = __builtin_bswap32(v33[1]);  // second word of the group is decoded first
+      v38 = v37 >> 4;
+      v38 &= 0xE0000u;
+      v39 = v38 | 8 * (v37 & 0x100000) | 4 * (v37 & 0x100000) | 2 * (v37 & 0x100000) | v37 & 0x100000 | (((v37 >> 16) & 0xE00) >> 3) & 0x100 | (v37 >> 16) & 0xE00 | 8 * ((v37 >> 12) & 0x1000) | 4 * ((v37 >> 12) & 0x1000) | 2 * ((v37 >> 12) & 0x1000) | (v37 >> 12) & 0x1000 | (((v37 >> 28) & 0xE) >> 3) | (v37 >> 28) & 0xE | 8 * ((v37 >> 24) & 0x10) | 4 * ((v37 >> 24) & 0x10) | 2 * ((v37 >> 24) & 0x10) | (v37 >> 24) & 0x10;
+      v38 >>= 3;
+      *v34 = (((v37 << 8) & 0xE000000) >> 3) & 0x1000000 | (v37 << 8) & 0xE000000 | 8 * ((v37 << 12) & 0x10000000) | 4 * ((v37 << 12) & 0x10000000) | 2 * ((v37 << 12) & 0x10000000) | (v37 << 12) & 0x10000000 | v38 & 0x10000 | v39;
+      v40 = v34 + 1;
+      v41 = 16 * (uint16_t)v37 & 0x1000;
+      v42 = ((v37 & 0xE00) >> 3) & 0x100 | v37 & 0xE00 | 8 * v41 | 4 * v41 | 2 * v41 | v41 | (((v37 >> 12) & 0xE) >> 3) | (v37 >> 12) & 0xE | 8 * ((v37 >> 8) & 0x10) | 4 * ((v37 >> 8) & 0x10) | 2 * ((v37 >> 8) & 0x10) | (v37 >> 8) & 0x10;
+      v43 = v37 << 16;
+      v44 = 8 * (v43 & 0x100000) | 4 * (v43 & 0x100000) | 2 * (v43 & 0x100000) | v43 & 0x100000 | v42;
+      v45 = v37 << 12;
+      v45 &= 0xE0000u;
+      v46 = v45 | v44;
+      v45 >>= 3;
+      *v40 = (((v37 << 24) & 0xE000000) >> 3) & 0x1000000 | (v37 << 24) & 0xE000000 | 8 * ((v37 << 28) & 0x10000000) | 4 * ((v37 << 28) & 0x10000000) | 2 * ((v37 << 28) & 0x10000000) | (v37 << 28) & 0x10000000 | v45 & 0x10000 | v46;
+      ++v40;
+      v47 = __builtin_bswap32(*v33);  // then the first word of the group
+      v33 += 2;  // step past the whole 8-byte group
+      v48 = v47;
+      v49 = 8 * (v47 & 0x100000) | 4 * (v47 & 0x100000) | 2 * (v47 & 0x100000) | v47 & 0x100000 | (((v47 >> 16) & 0xE00) >> 3) & 0x100 | (v47 >> 16) & 0xE00 | 8 * ((v47 >> 12) & 0x1000) | 4 * ((v47 >> 12) & 0x1000) | 2 * ((v47 >> 12) & 0x1000) | (v47 >> 12) & 0x1000 | (((v47 >> 28) & 0xE) >> 3) | (v47 >> 28) & 0xE | 8 * ((v47 >> 24) & 0x10) | 4 * ((v47 >> 24) & 0x10) | 2 * ((v47 >> 24) & 0x10) | (v47 >> 24) & 0x10;
+      v47 >>= 4;
+      v47 &= 0xE0000u;
+      v50 = v47 | v49;
+      v47 >>= 3;
+      *v40 = (((v48 << 8) & 0xE000000) >> 3) & 0x1000000 | (v48 << 8) & 0xE000000 | 8 * ((v48 << 12) & 0x10000000) | 4 * ((v48 << 12) & 0x10000000) | 2 * ((v48 << 12) & 0x10000000) | (v48 << 12) & 0x10000000 | v47 & 0x10000 | v50;
+      ++v40;
+      v51 = 16 * (uint16_t)v48 & 0x1000;
+      v52 = ((v48 & 0xE00) >> 3) & 0x100 | v48 & 0xE00 | 8 * v51 | 4 * v51 | 2 * v51 | v51 | (((v48 >> 12) & 0xE) >> 3) | (v48 >> 12) & 0xE | 8 * ((v48 >> 8) & 0x10) | 4 * ((v48 >> 8) & 0x10) | 2 * ((v48 >> 8) & 0x10) | (v48 >> 8) & 0x10;
+      v53 = v48 << 16;
+      v54 = 8 * (v53 & 0x100000) | 4 * (v53 & 0x100000) | 2 * (v53 & 0x100000) | v53 & 0x100000 | v52;
+      v55 = v48 << 12;
+      v55 &= 0xE0000u;
+      v56 = v55 | v54;
+      v55 >>= 3;
+      *v40 = (((v48 << 24) & 0xE000000) >> 3) & 0x1000000 | (v48 << 24) & 0xE000000 | 8 * ((v48 << 28) & 0x10000000) | 4 * ((v48 << 28) & 0x10000000) | 2 * ((v48 << 28) & 0x10000000) | (v48 << 28) & 0x10000000 | v55 & 0x10000 | v56;
+      v34 = v40 + 1;
+      v35 = v36 - 1;
+    }
+    while ( v36 != 1 );
+    v6 = (uint32_t *)((char *)v33 + line);  // advance to the next even row
+    v7 = (uint32_t *)((char *)v34 + ext);
+    v8 = v58 - 1;
+  }
+  while ( v58 != 1 );
+}
+/* load4bI: expands 4-bit texels from `src` into one byte each at `dst`,
+ * handling two image rows per pass of the outer loop. No palette is used.
+ *
+ * Every source nibble n becomes the byte (n << 4) | n, i.e. the nibble is
+ * written into both halves of the output byte.
+ *
+ *   src     - source texel data; every 32-bit word is byte-swapped before its
+ *             eight nibbles are extracted
+ *   dst     - output buffer; each 32-bit source word yields two 32-bit words
+ *   wid_64  - number of 8-byte source groups per row (each yields 16 output
+ *             bytes); must be >= 1 because the loops are do/while
+ *   height  - number of rows; must be >= 1 for the same reason
+ *   line    - byte offset added to the source pointer after each row
+ *   ext     - byte offset added to the destination pointer after each row
+ *
+ * Odd rows decode the two 32-bit words of each 8-byte group in swapped
+ * order. No 0x7FF mask is applied to the source position here.
+ */
+static inline void load4bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext)
+{
+  uint32_t *v6;  // current source read position
+  uint32_t *v7;  // current destination write position
+  int v8;  // rows still to process
+  int v9;
+  int v10;
+  uint32_t v11;
+  uint32_t *v12;
+  uint32_t v13;
+  uint32_t v14;
+  uint32_t *v15;
+  uint32_t v16;
+  unsigned int v17;
+  unsigned int v18;
+  uint32_t v19;
+  uint32_t v20;
+  uint32_t *v21;
+  uint32_t *v22;
+  int v23;
+  int v24;
+  uint32_t v25;
+  uint32_t v26;
+  uint32_t *v27;
+  uint32_t v28;
+  uint32_t v29;
+  uint32_t v30;
+  uint32_t v31;
+  uint32_t v32;
+  int v33;
+  int v34;
+
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height;
+  do
+  {
+    v33 = v8;
+    v9 = wid_64;
+    do  // even row: one 8-byte source group per iteration
+    {
+      v10 = v9;
+      v11 = __builtin_bswap32(*v6);  // first word of the group, byte-swapped
+      v12 = v6 + 1;
+      v13 = v11;  // untouched copy; v11 itself is shifted below
+      v14 = 16 * ((v11 >> 16) & 0xF00) | (v11 >> 16) & 0xF00 | 16 * (v11 >> 28) | (v11 >> 28);  // output bytes 0 and 1 from source bits 28-31 and 24-27
+      v11 >>= 4;
+      *v7 = 16 * ((v13 << 8) & 0xF000000) | (v13 << 8) & 0xF000000 | 16 * (v11 & 0xF0000) | v11 & 0xF0000 | v14;  // adds output bytes 2 and 3 from source bits 20-23 and 16-19
+      v15 = v7 + 1;
+      v16 = v13 << 12;
+      *v15 = 16 * ((v13 << 24) & 0xF000000) | (v13 << 24) & 0xF000000 | 16 * (v16 & 0xF0000) | v16 & 0xF0000 | 16 * (v13 & 0xF00) | v13 & 0xF00 | 16 * ((uint16_t)v13 >> 12) | ((uint16_t)v13 >> 12);  // the four nibbles in the low 16 source bits
+      ++v15;
+      v17 = __builtin_bswap32(*v12);  // second word of the group
+      v6 = v12 + 1;
+      v18 = v17;
+      v19 = 16 * ((v17 >> 16) & 0xF00) | (v17 >> 16) & 0xF00 | 16 * (v17 >> 28) | (v17 >> 28);
+      v17 >>= 4;
+      *v15 = 16 * ((v18 << 8) & 0xF000000) | (v18 << 8) & 0xF000000 | 16 * (v17 & 0xF0000) | v17 & 0xF0000 | v19;
+      ++v15;
+      v20 = v18 << 12;
+      *v15 = 16 * ((v18 << 24) & 0xF000000) | (v18 << 24) & 0xF000000 | 16 * (v20 & 0xF0000) | v20 & 0xF0000 | 16 * (v18 & 0xF00) | v18 & 0xF00 | 16 * ((uint16_t)v18 >> 12) | ((uint16_t)v18 >> 12);
+      v7 = v15 + 1;
+      v9 = v10 - 1;
+    }
+    while ( v10 != 1 );
+    if ( v33 == 1 )  // that was the last row
+      break;
+    v34 = v33 - 1;
+    v21 = (uint32_t *)((char *)v6 + line);  // advance source by `line` bytes (no masking)
+    v22 = (uint32_t *)((char *)v7 + ext);
+    v23 = wid_64;
+    do  // odd row: same decoding, but the two words of each group are taken in swapped order
+    {
+      v24 = v23;
+      v25 = __builtin_bswap32(v21[1]);  // second word of the group is decoded first
+      v26 = v25 >> 4;
+      *v22 = 16 * ((v25 << 8) & 0xF000000) | (v25 << 8) & 0xF000000 | 16 * (v26 & 0xF0000) | v26 & 0xF0000 | 16 * ((v25 >> 16) & 0xF00) | (v25 >> 16) & 0xF00 | 16 * (v25 >> 28) | (v25 >> 28);
+      v27 = v22 + 1;
+      v28 = v25 << 12;
+      *v27 = 16 * ((v25 << 24) & 0xF000000) | (v25 << 24) & 0xF000000 | 16 * (v28 & 0xF0000) | v28 & 0xF0000 | 16 * (v25 & 0xF00) | v25 & 0xF00 | 16 * ((uint16_t)v25 >> 12) | ((uint16_t)v25 >> 12);
+      ++v27;
+      v29 = __builtin_bswap32(*v21);  // then the first word of the group
+      v21 += 2;  // step past the whole 8-byte group
+      v30 = v29;
+      v31 = 16 * ((v29 >> 16) & 0xF00) | (v29 >> 16) & 0xF00 | 16 * (v29 >> 28) | (v29 >> 28);
+      v29 >>= 4;
+      *v27 = 16 * ((v30 << 8) & 0xF000000) | (v30 << 8) & 0xF000000 | 16 * (v29 & 0xF0000) | v29 & 0xF0000 | v31;
+      ++v27;
+      v32 = v30 << 12;
+      *v27 = 16 * ((v30 << 24) & 0xF000000) | (v30 << 24) & 0xF000000 | 16 * (v32 & 0xF0000) | v32 & 0xF0000 | 16 * (v30 & 0xF00) | v30 & 0xF00 | 16 * ((uint16_t)v30 >> 12) | ((uint16_t)v30 >> 12);
+      v22 = v27 + 1;
+      v23 = v24 - 1;
+    }
+    while ( v24 != 1 );
+    v6 = (uint32_t *)((char *)v21 + line);  // advance to the next even row
+    v7 = (uint32_t *)((char *)v22 + ext);
+    v8 = v34 - 1;
+  }
+  while ( v34 != 1 );
+}
+
 //****************************************************************
 // Size: 0, Format: 2
 
 wxUint32 Load4bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
 {
-    if (wid_64 < 1) wid_64 = 1;
-    if (height < 1) height = 1;
-    int ext = (real_width - (wid_64 << 4)) << 1;
-    unsigned short * pal = (rdp.pal_8 + (rdp.tiles[tile].palette << 4));
-    if (rdp.tlut_mode != 3)
-    {
-#if !defined(__GNUC__) && !defined(NO_ASM)
-    __asm {
-        mov ebx,dword ptr [pal]
+  if (wid_64 < 1) wid_64 = 1;
+  if (height < 1) height = 1;
+  int ext = (real_width - (wid_64 << 4)) << 1;
 
-        mov esi,dword ptr [src]
-        mov edi,dword ptr [dst]
+  if (rdp.tlut_mode == 0)
+  {
+    //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference.
+    //Thanks to angrylion for the advice
+#ifdef OLDASM_asmLoad4bI
+    asmLoad4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
+#else
+    load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
+#endif
+    return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
+  }
 
-        mov ecx,dword ptr [height]
-y_loop:
-        push ecx
+  wxUIntPtr pal = wxPtrToUInt(rdp.pal_8 + (rdp.tiles[tile].palette << 4));
+  if (rdp.tlut_mode == 2)
+  {
+#ifdef OLDASM_asmLoad4bCI
+    asmLoad4bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
+#else
+    load4bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
+#endif
+    
+    return (1 << 16) | GR_TEXFMT_ARGB_1555;
+  }
 
-        mov ecx,dword ptr [wid_64]
-x_loop:
-        push ecx
-
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz end_y_loop
-        push ecx
-
-        add esi,dword ptr [line]
-        add edi,dword ptr [ext]
-
-        mov ecx,dword ptr [wid_64]
-x_loop_2:
-        push ecx
-
-        mov eax,dword ptr [esi+4]       // read all 8 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,8
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,1
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop_2
-        
-        add esi,dword ptr [line]
-        add edi,dword ptr [ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-    }
-#elif !defined(NO_ASM)
-       //printf("Load4bCI1\n");
-        // This way, gcc generates either a 32 bit or a 64 bit register
-        long lTempX, lTempY, lHeight = (long) height;
-        intptr_t fake_eax, fake_edx;
-       asm volatile (
-        "1:                 \n" // y_loop
-             "mov %[c], %[tempy]            \n"
-             
-             "mov %[wid_64], %%ecx    \n"
-        "2:                 \n" // x_loop
-             "mov %[c], %[tempx]            \n"
-             
-             "mov (%[src]), %%eax      \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7,%%eax            \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]),%%cx  \n"
-             "ror $1,%%cx             \n"
-             "shl $16,%%ecx           \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-             
-             "mov %[tempx], %[c]       \n"
-             
-             "dec %%ecx               \n"
-             "jnz 2b                  \n" // x_loop
-             
-             "mov %[tempy], %[c]       \n"
-             "dec %%ecx               \n"
-             "jz 4f                    \n" // end_y_loop
-             "mov %[c], %[tempy]       \n"
-
-             "add %[line], %[src]      \n"
-             "add %[ext], %[dst]       \n"
-
-             "mov %[wid_64], %%ecx    \n"
-             "3:                      \n" // x_loop_2
-             "mov %[c], %[tempx]       \n"
-             
-             "mov 4(%[src]), %%eax     \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "add $8, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $1, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $1, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-
-             "mov %[tempx], %[c]       \n"
-             
-             "dec %%ecx               \n"
-             "jnz 3b                  \n" // x_loop_2
-             
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-             
-             "mov %[tempy], %[c]       \n"
-             "dec %%ecx               \n"
-             "jnz 1b                  \n" // y_loop
-
-             "4:                      \n" // end_y_loop
-             : [tempx]"=m"(lTempX), [tempy]"=m"(lTempY), [a] "=&a"(fake_eax), [d] "=&d"(fake_edx), [src] "+S"(src), [dst] "+D"(dst), [c] "+c"(lHeight)
-             // pal needs to be in a register because its used in mov (%[pal],...), ...
-             : [pal] "r" (pal), [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
-             : "memory", "cc"
-             );
+#ifdef OLDASM_asmLoad4bIAPal
+    asmLoad4bIAPal ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
+#else
+    load4bIAPal ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
 #endif
-    }
-    else
-    {
-#if !defined(__GNUC__) && !defined(NO_ASM)
-    __asm {
-        mov ebx,dword ptr [pal]
-
-        mov esi,dword ptr [src]
-        mov edi,dword ptr [dst]
-
-        mov ecx,dword ptr [height]
-ia_y_loop:
-        push ecx
-
-        mov ecx,dword ptr [wid_64]
-ia_x_loop:
-        push ecx
-
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz ia_x_loop
-
-        pop ecx
-        dec ecx
-        jz ia_end_y_loop
-        push ecx
-
-        add esi,dword ptr [line]
-        add edi,dword ptr [ext]
-
-        mov ecx,dword ptr [wid_64]
-ia_x_loop_2:
-        push ecx
-
-        mov eax,dword ptr [esi+4]       // read all 8 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,8
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,word ptr [ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,word ptr [ebx+edx]
-        ror cx,8
-
-        mov dword ptr [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz ia_x_loop_2
-        
-        add esi,dword ptr [line]
-        add edi,dword ptr [ext]
-
-        pop ecx
-        dec ecx
-        jnz ia_y_loop
-
-ia_end_y_loop:
-    }
-#elif !defined(NO_ASM)
-       //printf("Load4bCI2\n");
-       long lTempX, lTempY, lHeight = (long) height;
-       intptr_t fake_eax, fake_edx;
-       asm volatile (
-             "1:                     \n"  // ia_y_loop
-             "mov %[c], %[tempy]     \n"
-
-             "mov %[wid_64], %%ecx   \n"
-             "2:                     \n"  // ia_x_loop
-             "mov %[c], %[tempx]     \n"
-             
-             "mov (%[src]), %%eax      \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "add $4, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8,%%cx             \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-
-             "mov %[tempx], %[c]     \n"
-             
-             "dec %%ecx               \n"
-             "jnz 2b                  \n"  // ia_x_loop
-             
-             "mov %[tempy], %[c]     \n"
-             "dec %%ecx               \n"
-             "jz 4f                  \n"  // ia_end_y_loop
-             "mov %[c], %[tempy]     \n"
-             
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-
-             "mov %[wid_64], %%ecx   \n"
-             "3:                     \n"  // ia_x_loop_2
-             "mov %[c], %[tempx]     \n"
-             
-             "mov 4(%[src]), %%eax     \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // * copy
-             "mov (%[src]), %%eax      \n"      // read all 8 pixels
-             "bswap %%eax             \n"
-             "add $8, %[src]           \n"
-             "mov %%eax, %%edx        \n"
-             
-             // 1st dword output {
-             "shr $23, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $27, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 2nd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $15, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $19, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 3rd dword output {
-             "mov %%edx, %%eax        \n"
-             "shr $7, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "mov %%edx, %%eax        \n"
-             "shr $11, %%eax          \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-
-             // 4th dword output {
-             "mov %%edx, %%eax        \n"
-             "shl $1, %%eax           \n"
-             "and $0x1E, %%eax        \n"
-             "mov (%[pal],%[a]), %%cx \n"
-             "ror $8, %%cx            \n"
-             "shl $16, %%ecx          \n"
-             
-             "shr $3, %%edx           \n"
-             "and $0x1E, %%edx        \n"
-             "mov (%[pal],%[d]), %%cx \n"
-             "ror $8, %%cx            \n"
-             
-             "mov %%ecx, (%[dst])      \n"
-             "add $4, %[dst]           \n"
-             // }
-             // *
-
-             "mov %[tempx], %[c]     \n"
-             
-             "dec %%ecx               \n"
-             "jnz 3b                  \n"  // ia_x_loop_2
-             
-             "add %[line], %[src]     \n"
-             "add %[ext], %[dst]      \n"
-             
-             "mov %[tempy], %[c]     \n"
-             "dec %%ecx               \n"
-             "jnz 1b                  \n"  // ia_y_loop
-             
-             "4:                      \n"  // ia_end_y_loop
-             : [tempx]"=m"(lTempX), [tempy]"=m"(lTempY), [a] "=&a"(fake_eax), [d] "=&d"(fake_edx), [src] "+S"(src), [dst] "+D"(dst), [c] "+c"(lHeight)
-             // pal needs to be in a register because its used in mov (%[pal],...), ...
-             : [pal] "r" (pal), [wid_64] "g" (wid_64), [line] "g" ((uintptr_t)line), [ext] "g" ((uintptr_t)ext)
-             : "memory", "cc"
-             );
-#endif
-        return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
-    }
-
-    return (1 << 16) | GR_TEXFMT_ARGB_1555;
+  return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
 }
 
 //****************************************************************
 
 wxUint32 Load4bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)
 {
-    if (rdp.tlut_mode != 0)
-        return Load4bCI (dst, src, wid_64, height, line, real_width, tile);
+  if (rdp.tlut_mode != 0)
+    return Load4bCI (dst, src, wid_64, height, line, real_width, tile);
 
-    if (wid_64 < 1) wid_64 = 1;
-    if (height < 1) height = 1;
-    int ext = (real_width - (wid_64 << 4));
-#if !defined(__GNUC__) && !defined(NO_ASM)
-    __asm {
-        mov esi,dword ptr [src]
-        mov edi,dword ptr [dst]
-
-        mov ecx,dword ptr [height]
-y_loop:
-        push ecx
-
-        mov ecx,dword ptr [wid_64]
-x_loop:
-        push ecx
-
-        mov eax,dword ptr [esi]     // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword {  
-        xor ecx,ecx
-
-        // pixel #1
-        //  IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        //  xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 //Alpha 
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #2
-        //  xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        //  xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 //Alpha 
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-                
-        // pixel #3
-        //  xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        //  xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha 
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #4
-        //  xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        //  AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx