Commits

bsmiles32 committed b85accf

Refactor jpeg module.

Comments (0)

Files changed (7)

projects/msvc8/mupen64plus-rsp-hle.vcproj

 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
 			>
 			<File
-				RelativePath="..\..\src\idct.c"
-				>
-			</File>
-			<File
 				RelativePath="..\..\src\jpeg.c"
 				>
 			</File>
 				RelativePath="..\..\src\hle.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\src\jpeg.h"
+				>
+			</File>
 		</Filter>
 	</Files>
 	<Globals>

projects/unix/Makefile

 SOURCE = \
 	$(SRCDIR)/main.c \
 	$(SRCDIR)/jpeg.c \
-	$(SRCDIR)/idct.c \
 	$(SRCDIR)/ucode3.cpp \
 	$(SRCDIR)/ucode2.cpp \
 	$(SRCDIR)/ucode1.cpp \
 
 void DebugMessage(int level, const char *message, ...);
 
-void idct(short *iblock, short *oblock);
-
-void ps_jpg_uncompress(OSTask_t *task);
-void ob_jpg_uncompress(OSTask_t *task);
 /*void ucode1(OSTask_t *task);
 void ucode2(OSTask_t *task);
 void ucode3(OSTask_t *task);

src/idct.c

-/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- *   Mupen64plus-rsp-hle - idct.c                                          *
- *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
- *   Copyright (C) 2012 Bobby Smiles                                       *
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- *   This program is distributed in the hope that it will be useful,       *
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
- *   GNU General Public License for more details.                          *
- *                                                                         *
- *   You should have received a copy of the GNU General Public License     *
- *   along with this program; if not, write to the                         *
- *   Free Software Foundation, Inc.,                                       *
- *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
- * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
-/**
- * Fast 2D IDCT using separable formulation and normalization
- * Computations use single precision floats
- * Implementation based on Wikipedia :
- * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
- **/
-
-// Normalized to C4 = 1
-#define C3   1.175875602f
-#define C6   0.541196100f       
- 
-#define K1   0.765366865f   //  C2-C6
-#define K2  -1.847759065f   // -C2-C6
-#define K3  -0.390180644f   //  C5-C3
-#define K4  -1.961570561f   // -C5-C3
-#define K5   1.501321110f   //  C1+C3-C5-C7
-#define K6   2.053119869f   //  C1+C3-C5+C7
-#define K7   3.072711027f   //  C1+C3+C5-C7
-#define K8   0.298631336f   // -C1+C3+C5-C7
-#define K9  -0.899976223f   //  C7-C3
-#define K10 -2.562915448f   // -C1-C3
-
-
-static void idct_1d(float *x, float *dst, unsigned every)
-{
-    float e[4];
-    float f[4];
-    float x26, x1357, x15, x37, x17, x35;
-
-    x15   =  K3 * (x[1] + x[5]);
-    x37   =  K4 * (x[3] + x[7]);
-    x17   =  K9 * (x[1] + x[7]);
-    x35   = K10 * (x[3] + x[5]);
-    x1357 =  C3 * (x[1] + x[3] + x[5] + x[7]);
-    x26   =  C6 * (x[2] + x[6]);
-
-    f[0] = x[0] + x[4];
-    f[1] = x[0] - x[4];
-    f[2] = x26 + K1*x[2];
-    f[3] = x26 + K2*x[6];
-
-    e[0] = x1357 + x15 + K5*x[1] + x17;
-    e[1] = x1357 + x37 + K7*x[3] + x35;
-    e[2] = x1357 + x15 + K6*x[5] + x35;
-    e[3] = x1357 + x37 + K8*x[7] + x17;
-
-    *dst = f[0] + f[2] + e[0]; dst += every;
-    *dst = f[1] + f[3] + e[1]; dst += every;
-    *dst = f[1] - f[3] + e[2]; dst += every;
-    *dst = f[0] - f[2] + e[3]; dst += every;
-    *dst = f[0] - f[2] - e[3]; dst += every;
-    *dst = f[1] - f[3] - e[2]; dst += every;
-    *dst = f[1] + f[3] - e[1]; dst += every;
-    *dst = f[0] + f[2] - e[0]; dst += every;
-}
-
-
-void idct(short *iblock, short *oblock)
-{
-    float x[8];
-    float tblock[64]; // temporary block
-
-    unsigned i = 0;
-    unsigned j = 0;
-
-    // idct 1d on rows (+transposition)
-    for(i=0; i<8; i++) {
-        for(j=0; j < 8; j++) {
-            x[j] = (float)iblock[i*8+j];
-        }
-
-        idct_1d(&x[0], &tblock[i], 8);
-    }
-
-    // idct 1d on columns (thanks to previous transposition)
-    for(i=0; i<8; i++) {
-        idct_1d(&tblock[i*8], &x[0], 1);
-
-        // c4 = 1 normalization implies a division by 8
-        for(j=0; j < 8; j++) {
-            oblock[i+j*8] = (short)x[j] >> 3;
-        }
-    }
-}
-
  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
-#include <string.h>
+#include <assert.h>
 #include <stdlib.h>
-#include <stdio.h>
+#include <stdint.h>
 
 #define M64P_PLUGIN_PROTOTYPES 1
 #include "m64p_types.h"
 #include "m64p_plugin.h"
 #include "hle.h"
 
-// transposed JPEG QTable
-static unsigned QTable_T[64] =
+#define SUBBLOCK_SIZE 64
+
+typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
+
+/* rdram operations */
+// FIXME: these functions deserve their own module
+static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count);
+static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count);
+static uint32_t rdram_read_u32(uint32_t address);
+static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
+
+/* helper functions */
+static uint8_t clamp_u8(int16_t x);
+static int16_t clamp_s16(int32_t x);
+static uint16_t clamp_RGBA_component(int16_t x);
+
+/* pixel conversion & foratting */
+static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
+static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
+
+/* tile line emitters */
+static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address);
+static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);
+
+/* macroblocks operations */
+static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
+static void DecodeMacroblock2(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
+static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
+static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
+
+/* subblocks operations */
+static void TransposeSubBlock(int16_t *dst, const int16_t *src);
+static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
+static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
+static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
+static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
+static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
+static void InverseDCT1D(const float * const x, float *dst, unsigned int stride);
+static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
+
+/* transposed dequantization table */
+static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] =
 {
     16, 12, 14, 14,  18,  24,  49,  72,
     11, 12, 13, 17,  22,  35,  64,  92,
     61, 55, 56, 62,  77,  92, 101,  99
 };
 
-// ZigZag indices
-static unsigned ZigZag[64] =
+/* zig-zag indices */
+static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] =
 {
      0,  1,  5,  6, 14, 15, 27, 28,
      2,  4,  7, 13, 16, 26, 29, 42,
     35, 36, 48, 49, 57, 58, 62, 63
 };
 
-// Lazy way of transposing a block
-static unsigned Transpose[64] =
+/* transposition indices */
+static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] =
 {
     0,  8, 16, 24, 32, 40, 48, 56,
     1,  9, 17, 25, 33, 41, 49, 57,
     7, 15, 23, 31, 39, 47, 55, 63
 };
 
-static const unsigned char clamp(short x)
+/* global functions */
+
+
+/***************************************************************************
+ * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
+ * Pokemon Stadium 2.
+ **************************************************************************/
+void jpeg_decode_PS(OSTask_t *task)
+{
+    int16_t qtables[3][SUBBLOCK_SIZE];
+    unsigned int mb;
+
+    if (task->flags & 0x1)
+    {
+        DebugMessage(M64MSG_WARNING, "jpeg_decode_PS: task yielding not implemented");
+        return;
+    }
+
+    uint32_t       address          = rdram_read_u32(task->data_ptr);
+    const uint32_t macroblock_count = rdram_read_u32(task->data_ptr + 4);
+    const uint32_t mode             = rdram_read_u32(task->data_ptr + 8);
+    const uint32_t qtableY_ptr      = rdram_read_u32(task->data_ptr + 12);
+    const uint32_t qtableU_ptr      = rdram_read_u32(task->data_ptr + 16);
+    const uint32_t qtableV_ptr      = rdram_read_u32(task->data_ptr + 20);
+
+    DebugMessage(M64MSG_VERBOSE, "jpeg_decode_PS: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
+            address,
+            macroblock_count,
+            mode,
+            qtableY_ptr,
+            qtableU_ptr,
+            qtableV_ptr);
+
+    if (mode != 0 && mode != 2)
+    {
+        DebugMessage(M64MSG_WARNING, "jpeg_decode_PS: invalid mode %d", mode);
+        return;
+    }
+    
+    const unsigned int subblock_count = mode + 4;
+    const unsigned int macroblock_size = 2*subblock_count*SUBBLOCK_SIZE;
+
+    rdram_read_many_u16((uint16_t*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
+    rdram_read_many_u16((uint16_t*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
+    rdram_read_many_u16((uint16_t*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
+
+    for (mb = 0; mb < macroblock_count; ++mb)
+    {
+        int16_t macroblock[macroblock_size];
+
+        rdram_read_many_u16((uint16_t*)macroblock, address, macroblock_size >> 1);
+        DecodeMacroblock2(macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
+
+        if (mode == 0)
+        {
+            EmitTilesMode0(EmitRGBATileLine, macroblock, address);
+        }
+        else
+        {
+            EmitTilesMode2(EmitRGBATileLine, macroblock, address);
+        }
+
+        address += macroblock_size;
+    }
+}
+
+/***************************************************************************
+ * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
+ **************************************************************************/
+void jpeg_decode_OB(OSTask_t *task)
+{
+    int16_t qtable[SUBBLOCK_SIZE];
+    unsigned int mb;
+
+    int32_t y_dc = 0;
+    int32_t u_dc = 0;
+    int32_t v_dc = 0;
+
+    uint32_t           address          = task->data_ptr;
+    const unsigned int macroblock_count = task->data_size;
+    const int          qscale           = task->yield_data_size;
+
+    DebugMessage(M64MSG_VERBOSE, "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
+            address,
+            macroblock_count,
+            qscale);
+
+    if (qscale != 0)
+    {
+        if (qscale > 0)
+        {
+            ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
+        }
+        else
+        {
+            RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
+        }
+    }
+
+    for (mb = 0; mb < macroblock_count; ++mb)
+    {
+        int16_t macroblock[6*SUBBLOCK_SIZE];
+
+        rdram_read_many_u16((uint16_t*)macroblock, address, 6*SUBBLOCK_SIZE);
+        DecodeMacroblock1(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
+        EmitTilesMode2(EmitYUVTileLine, macroblock, address);
+
+        address += (2*6*SUBBLOCK_SIZE);
+    }
+}
+
+
+/* local functions */
+static uint8_t clamp_u8(int16_t x)
 {
     return (x & (0xff00)) ? ((-x) >> 15) & 0xff : x;
 }
 
-static short saturate(int x)
+static int16_t clamp_s16(int32_t x)
 {
     if (x > 32767) { x = 32767; } else if (x < -32768) { x = -32768; }
     return x;
 }
 
-
-void ob_jpg_uncompress(OSTask_t *task)
-{
-    // Fetch arguments
-    unsigned pBuffer = task->data_ptr;
-    unsigned nMacroBlocks = task->data_size;
-    signed QScale = task->yield_data_size;
-
-    // Rescale QTable if needed
-    unsigned i;
-    unsigned qtable[64];
-    unsigned mb;
-
-    int y_dc = 0;
-    int u_dc = 0;
-    int v_dc = 0;
-
-    DebugMessage(M64MSG_VERBOSE, "OB Task: *buffer=%x, #MB=%d, Qscale=%d\n", pBuffer, nMacroBlocks, QScale);
-
-    if (QScale != 0) {
-        if (QScale > 0) {
-            for(i = 0; i < 64; i++) {
-                unsigned q  = QTable_T[i] * QScale;
-                if (q > 32767) q = 32767;
-                qtable[i] = q;
-            }
-        }
-        else {
-            unsigned Shift = -QScale;
-            for(i = 0; i < 64; i++) {
-                qtable[i] = QTable_T[i] >> Shift;
-            }
-        }
-    }
-
-    // foreach MB
-    for(mb=0; mb < nMacroBlocks; mb++) {
-        unsigned sb;
-        short macroblock[2][0x300/2];
-        unsigned y_offset = 0;
-
-        // load MB into short_buffer
-        unsigned offset = pBuffer + 0x300*mb;
-        for(i = 0; i < 0x300/2; i++) {
-            unsigned short s = rsp.RDRAM[(offset+0)^S8];
-            s <<= 8;
-            s += rsp.RDRAM[(offset+1)^S8];
-            macroblock[0][i] = s;
-            offset += 2;
-        }
-
-        // foreach SB
-        for(sb = 0; sb < 6; sb++) {
-
-            // apply delta to DC
-            int dc = (signed)macroblock[0][sb*0x40];
-            switch(sb) {
-            case 0: case 1: case 2: case 3: y_dc += dc; macroblock[1][sb*0x40] = y_dc & 0xffff; break;
-            case 4: u_dc += dc; macroblock[1][sb*0x40] = u_dc & 0xffff; break;
-            case 5: v_dc += dc; macroblock[1][sb*0x40] = v_dc & 0xffff; break;
-            }
-
-            // zigzag reordering
-            for(i = 1; i < 64; i++) {
-                macroblock[1][sb*0x40+i] = macroblock[0][sb*0x40+ZigZag[i]];
-            }
-
-            // Apply Dequantization
-            if (QScale != 0) {
-                for(i = 0; i < 64; i++) {
-                    int v = macroblock[1][sb*0x40+i] * qtable[i];
-                    macroblock[1][sb*0x40+i] = saturate(v);
-                }
-            }
-
-            // Transpose
-            for(i = 0; i < 64; i++) {
-                macroblock[0][sb*0x40+i] = macroblock[1][sb*0x40+Transpose[i]];
-            }
-
-            // Apply Invert Discrete Cosinus Transform
-            idct(&macroblock[0][sb*0x40], &macroblock[1][sb*0x40]);
-
-            // Clamp values between [0..255]
-            for(i = 0; i < 64; i++) {
-                macroblock[0][sb*0x40+i] = clamp(macroblock[1][sb*0x40+i]);
-            }
-        }
-
-        // Texel Formatting
-        offset = pBuffer + 0x300*mb;
-        for(i = 0; i < 8; i++) {
-            // U
-            rsp.RDRAM[(offset+0x00)^S8] = (unsigned char)macroblock[0][(0x200 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x04)^S8] = (unsigned char)macroblock[0][(0x202 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x08)^S8] = (unsigned char)macroblock[0][(0x204 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x0c)^S8] = (unsigned char)macroblock[0][(0x206 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x10)^S8] = (unsigned char)macroblock[0][(0x208 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x14)^S8] = (unsigned char)macroblock[0][(0x20a + i*0x10)/2];
-            rsp.RDRAM[(offset+0x18)^S8] = (unsigned char)macroblock[0][(0x20c + i*0x10)/2];
-            rsp.RDRAM[(offset+0x1c)^S8] = (unsigned char)macroblock[0][(0x20e + i*0x10)/2];
-            rsp.RDRAM[(offset+0x20)^S8] = (unsigned char)macroblock[0][(0x200 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x24)^S8] = (unsigned char)macroblock[0][(0x202 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x28)^S8] = (unsigned char)macroblock[0][(0x204 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x2c)^S8] = (unsigned char)macroblock[0][(0x206 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x30)^S8] = (unsigned char)macroblock[0][(0x208 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x34)^S8] = (unsigned char)macroblock[0][(0x20a + i*0x10)/2];
-            rsp.RDRAM[(offset+0x38)^S8] = (unsigned char)macroblock[0][(0x20c + i*0x10)/2];
-            rsp.RDRAM[(offset+0x3c)^S8] = (unsigned char)macroblock[0][(0x20e + i*0x10)/2];
-
-            // V
-            rsp.RDRAM[(offset+0x02)^S8] = (unsigned char)macroblock[0][(0x280 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x06)^S8] = (unsigned char)macroblock[0][(0x282 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x0a)^S8] = (unsigned char)macroblock[0][(0x284 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x0e)^S8] = (unsigned char)macroblock[0][(0x286 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x12)^S8] = (unsigned char)macroblock[0][(0x288 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x16)^S8] = (unsigned char)macroblock[0][(0x28a + i*0x10)/2];
-            rsp.RDRAM[(offset+0x1a)^S8] = (unsigned char)macroblock[0][(0x28c + i*0x10)/2];
-            rsp.RDRAM[(offset+0x1e)^S8] = (unsigned char)macroblock[0][(0x28e + i*0x10)/2];
-            rsp.RDRAM[(offset+0x22)^S8] = (unsigned char)macroblock[0][(0x280 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x26)^S8] = (unsigned char)macroblock[0][(0x282 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x2a)^S8] = (unsigned char)macroblock[0][(0x284 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x2e)^S8] = (unsigned char)macroblock[0][(0x286 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x32)^S8] = (unsigned char)macroblock[0][(0x288 + i*0x10)/2];
-            rsp.RDRAM[(offset+0x36)^S8] = (unsigned char)macroblock[0][(0x28a + i*0x10)/2];
-            rsp.RDRAM[(offset+0x3a)^S8] = (unsigned char)macroblock[0][(0x28c + i*0x10)/2];
-            rsp.RDRAM[(offset+0x3e)^S8] = (unsigned char)macroblock[0][(0x28e + i*0x10)/2];
-
-            // Ya/Yb
-            rsp.RDRAM[(offset+0x01)^S8] = (unsigned char)macroblock[0][(y_offset + 0x00)/2];
-            rsp.RDRAM[(offset+0x03)^S8] = (unsigned char)macroblock[0][(y_offset + 0x02)/2];
-            rsp.RDRAM[(offset+0x05)^S8] = (unsigned char)macroblock[0][(y_offset + 0x04)/2];
-            rsp.RDRAM[(offset+0x07)^S8] = (unsigned char)macroblock[0][(y_offset + 0x06)/2];
-            rsp.RDRAM[(offset+0x09)^S8] = (unsigned char)macroblock[0][(y_offset + 0x08)/2];
-            rsp.RDRAM[(offset+0x0b)^S8] = (unsigned char)macroblock[0][(y_offset + 0x0a)/2];
-            rsp.RDRAM[(offset+0x0d)^S8] = (unsigned char)macroblock[0][(y_offset + 0x0c)/2];
-            rsp.RDRAM[(offset+0x0f)^S8] = (unsigned char)macroblock[0][(y_offset + 0x0e)/2];
-            rsp.RDRAM[(offset+0x21)^S8] = (unsigned char)macroblock[0][(y_offset + 0x10)/2];
-            rsp.RDRAM[(offset+0x23)^S8] = (unsigned char)macroblock[0][(y_offset + 0x12)/2];
-            rsp.RDRAM[(offset+0x25)^S8] = (unsigned char)macroblock[0][(y_offset + 0x14)/2];
-            rsp.RDRAM[(offset+0x27)^S8] = (unsigned char)macroblock[0][(y_offset + 0x16)/2];
-            rsp.RDRAM[(offset+0x29)^S8] = (unsigned char)macroblock[0][(y_offset + 0x18)/2];
-            rsp.RDRAM[(offset+0x2b)^S8] = (unsigned char)macroblock[0][(y_offset + 0x1a)/2];
-            rsp.RDRAM[(offset+0x2d)^S8] = (unsigned char)macroblock[0][(y_offset + 0x1c)/2];
-            rsp.RDRAM[(offset+0x2f)^S8] = (unsigned char)macroblock[0][(y_offset + 0x1e)/2];
-
-            // Ya+1/Yb+1
-            rsp.RDRAM[(offset+0x11)^S8] = (unsigned char)macroblock[0][(y_offset + 0x80)/2];
-            rsp.RDRAM[(offset+0x13)^S8] = (unsigned char)macroblock[0][(y_offset + 0x82)/2];
-            rsp.RDRAM[(offset+0x15)^S8] = (unsigned char)macroblock[0][(y_offset + 0x84)/2];
-            rsp.RDRAM[(offset+0x17)^S8] = (unsigned char)macroblock[0][(y_offset + 0x86)/2];
-            rsp.RDRAM[(offset+0x19)^S8] = (unsigned char)macroblock[0][(y_offset + 0x88)/2];
-            rsp.RDRAM[(offset+0x1b)^S8] = (unsigned char)macroblock[0][(y_offset + 0x8a)/2];
-            rsp.RDRAM[(offset+0x1d)^S8] = (unsigned char)macroblock[0][(y_offset + 0x8c)/2];
-            rsp.RDRAM[(offset+0x1f)^S8] = (unsigned char)macroblock[0][(y_offset + 0x8e)/2];
-            rsp.RDRAM[(offset+0x31)^S8] = (unsigned char)macroblock[0][(y_offset + 0x90)/2];
-            rsp.RDRAM[(offset+0x33)^S8] = (unsigned char)macroblock[0][(y_offset + 0x92)/2];
-            rsp.RDRAM[(offset+0x35)^S8] = (unsigned char)macroblock[0][(y_offset + 0x94)/2];
-            rsp.RDRAM[(offset+0x37)^S8] = (unsigned char)macroblock[0][(y_offset + 0x96)/2];
-            rsp.RDRAM[(offset+0x39)^S8] = (unsigned char)macroblock[0][(y_offset + 0x98)/2];
-            rsp.RDRAM[(offset+0x3b)^S8] = (unsigned char)macroblock[0][(y_offset + 0x9a)/2];
-            rsp.RDRAM[(offset+0x3d)^S8] = (unsigned char)macroblock[0][(y_offset + 0x9c)/2];
-            rsp.RDRAM[(offset+0x3f)^S8] = (unsigned char)macroblock[0][(y_offset + 0x9e)/2];
-
-            offset += 0x40;
-            y_offset += (i == 3) ? 0xa0 : 0x20;
-        }
-    }
-}
-
-
-
-static short yuv2rgba16_clamp(short x)
+static uint16_t clamp_RGBA_component(int16_t x)
 {
     if (x > 0xff0) { x = 0xff0; } else if (x < 0) { x = 0; }
     return (x & 0xf80);
 }
 
+static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
+{
+    return (uint32_t)clamp_u8(u)  << 24
+        |  (uint32_t)clamp_u8(y1) << 16
+        |  (uint32_t)clamp_u8(v)  << 8
+        |  (uint32_t)clamp_u8(y2);
+}
 
-static unsigned short yuv2rgba16(float y, float u, float v)
+static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
 {
-    unsigned short r, g, b;
+    const float fY = (float)y + 2048.0f;
+    const float fU = (float)u;
+    const float fV = (float)v;
 
-    r = yuv2rgba16_clamp((short)(y            + 1.4025*v));
-    g = yuv2rgba16_clamp((short)(y - 0.3443*u - 0.7144*v));
-    b = yuv2rgba16_clamp((short)(y + 1.7729*u           ));
+    const uint16_t r = clamp_RGBA_component((int16_t)(fY             + 1.4025*fV));
+    const uint16_t g = clamp_RGBA_component((int16_t)(fY - 0.3443*fU - 0.7144*fV));
+    const uint16_t b = clamp_RGBA_component((int16_t)(fY + 1.7729*fU            ));
 
     return (r << 4) | (g >> 1) | (b >> 6) | 1;
 }
 
+static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address)
+{
+    uint32_t uyvy[8];
 
-void ps_jpg_uncompress(OSTask_t *task)
+    const int16_t * const v  = u + SUBBLOCK_SIZE;
+    const int16_t * const y2 = y + SUBBLOCK_SIZE;
+
+    uyvy[0] = GetUYVY(y[0],  y[1],  u[0], v[0]);
+    uyvy[1] = GetUYVY(y[2],  y[3],  u[1], v[1]);
+    uyvy[2] = GetUYVY(y[4],  y[5],  u[2], v[2]);
+    uyvy[3] = GetUYVY(y[6],  y[7],  u[3], v[3]);
+    uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
+    uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
+    uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
+    uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
+
+    rdram_write_many_u32(uyvy, address, 8);
+}
+
+static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address)
 {
-    unsigned int iMBsize, oMBsize, nSubBlocks, mb;
+    uint16_t rgba[16];
 
-    // arguments for pokemon stadium jpg decompression
-    static struct 
+    const int16_t * const v  = u + SUBBLOCK_SIZE;
+    const int16_t * const y2 = y + SUBBLOCK_SIZE;
+
+    rgba[0]  = GetRGBA(y[0],  u[0], v[0]);
+    rgba[1]  = GetRGBA(y[1],  u[0], v[0]);
+    rgba[2]  = GetRGBA(y[2],  u[1], v[1]);
+    rgba[3]  = GetRGBA(y[3],  u[1], v[1]);
+    rgba[4]  = GetRGBA(y[4],  u[2], v[2]);
+    rgba[5]  = GetRGBA(y[5],  u[2], v[2]);
+    rgba[6]  = GetRGBA(y[6],  u[3], v[3]);
+    rgba[7]  = GetRGBA(y[7],  u[3], v[3]);
+    rgba[8]  = GetRGBA(y2[0], u[4], v[4]);
+    rgba[9]  = GetRGBA(y2[1], u[4], v[4]);
+    rgba[10] = GetRGBA(y2[2], u[5], v[5]);
+    rgba[11] = GetRGBA(y2[3], u[5], v[5]);
+    rgba[12] = GetRGBA(y2[4], u[6], v[6]);
+    rgba[13] = GetRGBA(y2[5], u[6], v[6]);
+    rgba[14] = GetRGBA(y2[6], u[7], v[7]);
+    rgba[15] = GetRGBA(y2[7], u[7], v[7]);
+
+    rdram_write_many_u16(rgba, address, 16);
+}
+
+static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
+{
+    unsigned int i;
+
+    unsigned int y_offset = 0;
+    unsigned int u_offset = 2*SUBBLOCK_SIZE;
+
+    for (i = 0; i < 8; ++i)
     {
-         unsigned pMacroBlocks; // address of Macroblocks
-         unsigned nMacroBlocks; // # of Macroblocks
-         unsigned mode;         // specify subsampling mode (as far as I understand)
-         unsigned pQTables[3];  // address of QTable for Y,U,V channel
-    } ps_jpg_data;
+        emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
 
-    short QTables[3][64];
+        y_offset += 8;
+        u_offset += 8;
+        address += 32;
+    }
+}
 
-    unsigned i,j;
+static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
+{
+    unsigned int i;
 
-    // We don't support task yielding
-    if (task->flags & 0x1) {
-        DebugMessage(M64MSG_VERBOSE, "ps_jpg_uncompress doesn't support task yielding");
-        return;
+    unsigned int y_offset = 0;
+    unsigned int u_offset = 4*SUBBLOCK_SIZE;
+
+    for (i = 0; i < 8; ++i)
+    {
+        emit_line(&macroblock[y_offset],     &macroblock[u_offset], address);
+        emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
+
+        y_offset += (i == 3) ? SUBBLOCK_SIZE+16 : 16;
+        u_offset += 8;
+        address += 64;
+    }
+}
+
+static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
+{
+    int sb;
+
+    for (sb = 0; sb < 6; ++sb)
+    {
+        int16_t tmp_sb[SUBBLOCK_SIZE];
+
+        /* update DC */
+        int32_t dc = (int32_t)macroblock[0];
+        switch(sb)
+        {
+        case 0: case 1: case 2: case 3:
+                *y_dc += dc; macroblock[0] = *y_dc & 0xffff; break;
+        case 4: *u_dc += dc; macroblock[0] = *u_dc & 0xffff; break;
+        case 5: *v_dc += dc; macroblock[0] = *v_dc & 0xffff; break;
+        }
+
+        ZigZagSubBlock(tmp_sb, macroblock);
+        if (qtable != NULL) { MultSubBlocks(tmp_sb, tmp_sb, qtable, 0); }
+        TransposeSubBlock(macroblock, tmp_sb);
+        InverseDCTSubBlock(macroblock, macroblock);
+        
+        macroblock += SUBBLOCK_SIZE;
+    }
+}
+
+static void DecodeMacroblock2(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
+{
+    unsigned int sb;
+    unsigned int q = 0;
+
+    for (sb = 0; sb < subblock_count; ++sb)
+    {
+        int16_t tmp_sb[SUBBLOCK_SIZE];
+        const int isChromaSubBlock = (subblock_count - sb <= 2);
+
+        if (isChromaSubBlock) { ++q; }
+
+        MultSubBlocks(macroblock, macroblock, qtables[q], 4);
+        ZigZagSubBlock(tmp_sb, macroblock);
+        InverseDCTSubBlock(macroblock, tmp_sb);
+
+        macroblock += SUBBLOCK_SIZE;
     }
 
-    // Fetch arguments
-    memcpy(&ps_jpg_data, rsp.RDRAM+task->data_ptr, sizeof(ps_jpg_data));
+}
 
-    DebugMessage(M64MSG_VERBOSE, "SB Task: *MB=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, Qv=%x",
-        ps_jpg_data.pMacroBlocks,
-        ps_jpg_data.nMacroBlocks,
-        ps_jpg_data.mode,
-        ps_jpg_data.pQTables[0],
-        ps_jpg_data.pQTables[1],
-        ps_jpg_data.pQTables[2]);
+static void TransposeSubBlock(int16_t *dst, const int16_t *src)
+{
+    ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
+}
 
-    // Setup input & output MB size, and #of subblocks
-    iMBsize = (ps_jpg_data.mode == 0) ? 0x200 : 0x300;
-    oMBsize = (ps_jpg_data.mode == 0) ? 0x100 : 0x200;
-    nSubBlocks = ps_jpg_data.mode + 4;
+static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
+{
+    ReorderSubBlock(dst, src, ZIGZAG_TABLE);
+}
 
-    // Load QTables
-    for(j = 0; j < 3; j++) {
-        for(i = 0; i < 64; i++) {
-            unsigned short s = rsp.RDRAM[(ps_jpg_data.pQTables[j] + 2*i)^S8];
-            s <<= 8;
-            s |= rsp.RDRAM[(ps_jpg_data.pQTables[j] + 2*i+1)^S8];
-            QTables[j][i] = s;
+static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
+{
+    unsigned int i;
+
+    /* source and destination sublocks cannot overlap */
+    assert(abs(dst - src) > SUBBLOCK_SIZE);
+
+    for (i = 0; i < SUBBLOCK_SIZE; ++i)
+    {
+        dst[i] = src[table[i]];
+    }
+}
+
+static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
+{
+    unsigned int i;
+
+    for (i = 0; i < SUBBLOCK_SIZE; ++i)
+    {
+        int32_t v = src1[i] * src2[i];
+        dst[i] = clamp_s16(v) << shift;
+    }
+}
+
+static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
+{
+    unsigned int i;
+
+    for (i = 0; i < SUBBLOCK_SIZE; ++i)
+    {
+        int32_t v = src[i] * scale;
+        dst[i] = clamp_s16(v);
+    }
+}
+
+static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
+{
+    unsigned int i;
+
+    for (i = 0; i < SUBBLOCK_SIZE; ++i)
+    {
+        dst[i] = src[i] >> shift;
+    }
+}
+
+/***************************************************************************
+ * Fast 2D IDCT using separable formulation and normalization
+ * Computations use single precision floats
+ * Implementation based on Wikipedia :
+ * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
+ **************************************************************************/
+
+/* Normalized such as C4 = 1 */
+#define C3   1.175875602f
+#define C6   0.541196100f       
+#define K1   0.765366865f   //  C2-C6
+#define K2  -1.847759065f   // -C2-C6
+#define K3  -0.390180644f   //  C5-C3
+#define K4  -1.961570561f   // -C5-C3
+#define K5   1.501321110f   //  C1+C3-C5-C7
+#define K6   2.053119869f   //  C1+C3-C5+C7
+#define K7   3.072711027f   //  C1+C3+C5-C7
+#define K8   0.298631336f   // -C1+C3+C5-C7
+#define K9  -0.899976223f   //  C7-C3
+#define K10 -2.562915448f   // -C1-C3
+static void InverseDCT1D(const float * const x, float *dst, unsigned int stride)
+{
+    float e[4];
+    float f[4];
+    float x26, x1357, x15, x37, x17, x35;
+
+    x15   =  K3 * (x[1] + x[5]);
+    x37   =  K4 * (x[3] + x[7]);
+    x17   =  K9 * (x[1] + x[7]);
+    x35   = K10 * (x[3] + x[5]);
+    x1357 =  C3 * (x[1] + x[3] + x[5] + x[7]);
+    x26   =  C6 * (x[2] + x[6]);
+
+    f[0] = x[0] + x[4];
+    f[1] = x[0] - x[4];
+    f[2] = x26 + K1*x[2];
+    f[3] = x26 + K2*x[6];
+
+    e[0] = x1357 + x15 + K5*x[1] + x17;
+    e[1] = x1357 + x37 + K7*x[3] + x35;
+    e[2] = x1357 + x15 + K6*x[5] + x35;
+    e[3] = x1357 + x37 + K8*x[7] + x17;
+
+    *dst = f[0] + f[2] + e[0]; dst += stride;
+    *dst = f[1] + f[3] + e[1]; dst += stride;
+    *dst = f[1] - f[3] + e[2]; dst += stride;
+    *dst = f[0] - f[2] + e[3]; dst += stride;
+    *dst = f[0] - f[2] - e[3]; dst += stride;
+    *dst = f[1] - f[3] - e[2]; dst += stride;
+    *dst = f[1] + f[3] - e[1]; dst += stride;
+    *dst = f[0] + f[2] - e[0]; dst += stride;
+}
+#undef C3  
+#undef C6  
+#undef K1  
+#undef K2  
+#undef K3  
+#undef K4  
+#undef K5  
+#undef K6  
+#undef K7  
+#undef K8  
+#undef K9  
+#undef K10 
+
+static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
+{
+    float x[8];
+    float block[SUBBLOCK_SIZE];
+    unsigned int i, j;
+
+    /* idct 1d on rows (+transposition) */
+    for (i = 0; i < 8; ++i)
+    {
+        for (j = 0; j < 8; ++j)
+        {
+            x[j] = (float)src[i*8+j];
         }
+
+        InverseDCT1D(x, &block[i], 8);
     }
 
-    // foreach MB
-    for(mb=0; mb < ps_jpg_data.nMacroBlocks; mb++) {
-        unsigned sb;
-        short macroblock[2][0x300/2];
-        unsigned int y_offset, u_offset;
+    /* idct 1d on columns (thanks to previous transposition) */
+    for (i = 0; i < 8; ++i)
+    {
+        InverseDCT1D(&block[i*8], x, 1);
 
-        // load MB into short_buffer
-        unsigned offset = ps_jpg_data.pMacroBlocks + iMBsize*mb;
-        for(i = 0; i < iMBsize/2; i++) {
-            unsigned short s = rsp.RDRAM[(offset+0)^S8];
-            s <<= 8;
-            s |= rsp.RDRAM[(offset+1)^S8];
-            macroblock[0][i] = s;
-            offset += 2;
-        }
-
-        // Apply Dequantization (Y subblocks)
-        for(sb = 0; sb < nSubBlocks-2; sb++) {
-            for(i = 0; i < 64; i++) {
-                int v = macroblock[0][sb*0x40+i]*QTables[0][i];
-                macroblock[0][sb*0x40+i] = saturate(v) << 4;
-            }
-        }
-
-        // Apply Dequantization (U,V subblocks)
-        for(j = 1; sb < nSubBlocks; sb++, j++) {
-            for(i = 0; i < 64; i++) {
-                int v = macroblock[0][sb*0x40+i]*QTables[j][i];
-                macroblock[0][sb*0x40+i] = saturate(v) << 4;
-            }
-        }
-
-        // foreach SubBlocks
-        for(sb = 0; sb < nSubBlocks; sb++) {
-            // ZigZag (transposed)
-            for(i = 0; i < 64; i++) {
-                macroblock[1][sb*0x40+i] = macroblock[0][sb*0x40+ZigZag[i]];
-            }
-
-            // Apply Invert Discrete Cosinus Transform
-            idct(&macroblock[1][sb*0x40], &macroblock[0][sb*0x40]);
-        }
-
-        // Texel Formatting (RGBA16)
-        offset = ps_jpg_data.pMacroBlocks + iMBsize*mb;
-        y_offset = 0;
-        u_offset = oMBsize/2;
-
-        if (ps_jpg_data.mode == 0)
+        /* C4 = 1 normalization implies a division by 8 */
+        for (j = 0; j < 8; ++j)
         {
-            // I have not encountered this case in Pokemon stadium (but ucode disassembly say so...)
-
-            unsigned short rgba[16];
-            for(i = 0; i < 8; i++) {
-                rgba[0]  = yuv2rgba16((float)macroblock[0][y_offset+0]+2048.0f, (float)macroblock[0][u_offset+0], (float)macroblock[0][u_offset+64+0]);
-                rgba[1]  = yuv2rgba16((float)macroblock[0][y_offset+1]+2048.0f, (float)macroblock[0][u_offset+0], (float)macroblock[0][u_offset+64+0]);
-                rgba[2]  = yuv2rgba16((float)macroblock[0][y_offset+2]+2048.0f, (float)macroblock[0][u_offset+1], (float)macroblock[0][u_offset+64+1]);
-                rgba[3]  = yuv2rgba16((float)macroblock[0][y_offset+3]+2048.0f, (float)macroblock[0][u_offset+1], (float)macroblock[0][u_offset+64+1]);
-                rgba[4]  = yuv2rgba16((float)macroblock[0][y_offset+4]+2048.0f, (float)macroblock[0][u_offset+2], (float)macroblock[0][u_offset+64+2]);
-                rgba[5]  = yuv2rgba16((float)macroblock[0][y_offset+5]+2048.0f, (float)macroblock[0][u_offset+2], (float)macroblock[0][u_offset+64+2]);
-                rgba[6]  = yuv2rgba16((float)macroblock[0][y_offset+6]+2048.0f, (float)macroblock[0][u_offset+3], (float)macroblock[0][u_offset+64+3]);
-                rgba[7]  = yuv2rgba16((float)macroblock[0][y_offset+7]+2048.0f, (float)macroblock[0][u_offset+3], (float)macroblock[0][u_offset+64+3]);
-
-                rgba[8]  = yuv2rgba16((float)macroblock[0][y_offset+64+0]+2048.0f, (float)macroblock[0][u_offset+4], (float)macroblock[0][u_offset+64+4]);
-                rgba[9]  = yuv2rgba16((float)macroblock[0][y_offset+64+1]+2048.0f, (float)macroblock[0][u_offset+4], (float)macroblock[0][u_offset+64+4]);
-                rgba[10] = yuv2rgba16((float)macroblock[0][y_offset+64+2]+2048.0f, (float)macroblock[0][u_offset+5], (float)macroblock[0][u_offset+64+5]);
-                rgba[11] = yuv2rgba16((float)macroblock[0][y_offset+64+3]+2048.0f, (float)macroblock[0][u_offset+5], (float)macroblock[0][u_offset+64+5]);
-                rgba[12] = yuv2rgba16((float)macroblock[0][y_offset+64+4]+2048.0f, (float)macroblock[0][u_offset+6], (float)macroblock[0][u_offset+64+6]);
-                rgba[13] = yuv2rgba16((float)macroblock[0][y_offset+64+5]+2048.0f, (float)macroblock[0][u_offset+6], (float)macroblock[0][u_offset+64+6]);
-                rgba[14] = yuv2rgba16((float)macroblock[0][y_offset+64+6]+2048.0f, (float)macroblock[0][u_offset+7], (float)macroblock[0][u_offset+64+7]);
-                rgba[15] = yuv2rgba16((float)macroblock[0][y_offset+64+7]+2048.0f, (float)macroblock[0][u_offset+7], (float)macroblock[0][u_offset+64+7]);
-
-                for(j = 0; j < 16; j++) {
-                    rsp.RDRAM[(offset++)^S8] = (unsigned char)(rgba[j] >> 8);
-                    rsp.RDRAM[(offset++)^S8] = (unsigned char)(rgba[j] & 0xff);
-                }
-
-                y_offset += 8;
-                u_offset += 8;
-            }
-        }
-        else
-        {
-            unsigned short rgba[32];
-            for(i = 0; i < 8; i++) {
-                for(j = 0; j < 2; j++) {
-                    rgba[j*16+0]  = yuv2rgba16((float)macroblock[0][y_offset+0]+2048.0f, (float)macroblock[0][u_offset+0], (float)macroblock[0][u_offset+64+0]);
-                    rgba[j*16+1]  = yuv2rgba16((float)macroblock[0][y_offset+1]+2048.0f, (float)macroblock[0][u_offset+0], (float)macroblock[0][u_offset+64+0]);
-                    rgba[j*16+2]  = yuv2rgba16((float)macroblock[0][y_offset+2]+2048.0f, (float)macroblock[0][u_offset+1], (float)macroblock[0][u_offset+64+1]);
-                    rgba[j*16+3]  = yuv2rgba16((float)macroblock[0][y_offset+3]+2048.0f, (float)macroblock[0][u_offset+1], (float)macroblock[0][u_offset+64+1]);
-                    rgba[j*16+4]  = yuv2rgba16((float)macroblock[0][y_offset+4]+2048.0f, (float)macroblock[0][u_offset+2], (float)macroblock[0][u_offset+64+2]);
-                    rgba[j*16+5]  = yuv2rgba16((float)macroblock[0][y_offset+5]+2048.0f, (float)macroblock[0][u_offset+2], (float)macroblock[0][u_offset+64+2]);
-                    rgba[j*16+6]  = yuv2rgba16((float)macroblock[0][y_offset+6]+2048.0f, (float)macroblock[0][u_offset+3], (float)macroblock[0][u_offset+64+3]);
-                    rgba[j*16+7]  = yuv2rgba16((float)macroblock[0][y_offset+7]+2048.0f, (float)macroblock[0][u_offset+3], (float)macroblock[0][u_offset+64+3]);
-
-                    rgba[j*16+8]  = yuv2rgba16((float)macroblock[0][y_offset+64+0]+2048.0f, (float)macroblock[0][u_offset+4], (float)macroblock[0][u_offset+64+4]);
-                    rgba[j*16+9]  = yuv2rgba16((float)macroblock[0][y_offset+64+1]+2048.0f, (float)macroblock[0][u_offset+4], (float)macroblock[0][u_offset+64+4]);
-                    rgba[j*16+10] = yuv2rgba16((float)macroblock[0][y_offset+64+2]+2048.0f, (float)macroblock[0][u_offset+5], (float)macroblock[0][u_offset+64+5]);
-                    rgba[j*16+11] = yuv2rgba16((float)macroblock[0][y_offset+64+3]+2048.0f, (float)macroblock[0][u_offset+5], (float)macroblock[0][u_offset+64+5]);
-                    rgba[j*16+12] = yuv2rgba16((float)macroblock[0][y_offset+64+4]+2048.0f, (float)macroblock[0][u_offset+6], (float)macroblock[0][u_offset+64+6]);
-                    rgba[j*16+13] = yuv2rgba16((float)macroblock[0][y_offset+64+5]+2048.0f, (float)macroblock[0][u_offset+6], (float)macroblock[0][u_offset+64+6]);
-                    rgba[j*16+14] = yuv2rgba16((float)macroblock[0][y_offset+64+6]+2048.0f, (float)macroblock[0][u_offset+7], (float)macroblock[0][u_offset+64+7]);
-                    rgba[j*16+15] = yuv2rgba16((float)macroblock[0][y_offset+64+7]+2048.0f, (float)macroblock[0][u_offset+7], (float)macroblock[0][u_offset+64+7]);
-                    y_offset += 8;
-                }
-
-                for(j = 0; j < 32; j++) {
-                    rsp.RDRAM[(offset++)^S8] = (unsigned char)(rgba[j] >> 8);
-                    rsp.RDRAM[(offset++)^S8] = (unsigned char)(rgba[j] & 0xff);
-                }
-
-                if (i == 3) y_offset += 64;
-                u_offset += 8;
-            }
+            dst[i+j*8] = (int16_t)x[j] >> 3;
         }
     }
 }
 
+
+/* FIXME: assume presence of expansion pack */
+#define MEMMASK 0x7fffff
+
+static void rdram_read_many_u16(uint16_t *dst, uint32_t address, unsigned int count)
+{
+    while (count != 0)
+    {
+        uint16_t s = rsp.RDRAM[((address++)^S8) & MEMMASK];
+        s <<= 8;
+        s |= rsp.RDRAM[((address++)^S8) & MEMMASK];
+
+        *(dst++) = s;
+
+        --count;
+    }
+}
+
+static void rdram_write_many_u16(const uint16_t *src, uint32_t address, unsigned int count)
+{
+    while (count != 0)
+    {
+        rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
+        rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
+
+        --count;
+    }
+}
+
+static uint32_t rdram_read_u32(uint32_t address)
+{
+    uint32_t r = rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
+    r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
+    r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK]; r <<= 8;
+    r |= rsp.RDRAM[((address++) ^ S8) & MEMMASK];
+
+    return r;
+}
+
+static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count)
+{
+    while (count != 0)
+    {
+        rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 24);
+        rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 16);
+        rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*src >> 8);
+        rsp.RDRAM[((address++)^S8) & MEMMASK] = (uint8_t)(*(src++) & 0xff);
+
+        --count;
+    }
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ *   Mupen64plus-rsp-hle - jpeg.h                                          *
+ *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
+ *   Copyright (C) 2002 Hacktarux                                          *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#ifndef JPEG_H
+#define JPEG_H
+
+void jpeg_decode_PS(OSTask_t *task);
+void jpeg_decode_OB(OSTask_t *task);
+
+#endif
+
 #include "m64p_common.h"
 #include "m64p_plugin.h"
 #include "hle.h"
+#include "jpeg.h"
 
 /* global variables */
 RSP_INFO rsp;
       taskdone();
       return 1;
     case 0x2caa6: // Zelda OOT, Pokemon Stadium {1,2} jpg decompression
-        ps_jpg_uncompress(task);
+        jpeg_decode_PS(task);
         taskdone();
         return 1;
     case 0x130de: // Ogre Battle background decompression
-        ob_jpg_uncompress(task);
+        jpeg_decode_OB(task);
         taskdone();
         return 1;
     }