diff --git a/config.py b/config.py
index 6bb421e..54e40a5 100644
--- a/config.py
+++ b/config.py
@@ -23,6 +23,8 @@ else:
 
 
 # [step 3]>> 以下配置可以优化体验，但大部分场合下并不需要修改
+# Height of the chat window
+CHATBOT_HEIGHT = 1117
 
 # 发送请求到OpenAI后，等待多久判定为超时
 TIMEOUT_SECONDS = 25
diff --git a/crazy_functions/test_project/cpp/longcode/jpgd.cpp b/crazy_functions/test_project/cpp/longcode/jpgd.cpp
new file mode 100644
index 0000000..36d06c8
--- /dev/null
+++ b/crazy_functions/test_project/cpp/longcode/jpgd.cpp
@@ -0,0 +1,3276 @@
+// jpgd.cpp - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+// Last updated Apr. 16, 2011
+// Alex Evans: Linear memory allocator (taken from jpge.h).
+//
+// Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
+//
+// Chroma upsampling quality: H2V2 is upsampled in the frequency domain, H2V1 and H1V2 are upsampled using point sampling.
+// Chroma upsampling reference: "Fast Scheme for Image Size Change in the Compressed Domain"
+// http://vision.ai.uiuc.edu/~dugad/research/dct/index.html
+
+#include "jpgd.h"
+#include <string.h>
+
+#include <assert.h>
+// BEGIN EPIC MOD
+#define JPGD_ASSERT(x) { assert(x); CA_ASSUME(x); } (void)0 // Runs assert(x), then CA_ASSUME(x) (CA_ASSUME is defined outside this file); the trailing (void)0 absorbs the caller's ';'.
+// END EPIC MOD
+
+#ifdef _MSC_VER
+#pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable
+#endif
+
+// Set to 1 to enable frequency-domain chroma upsampling on images using H2V2 subsampling (0 = faster nearest-neighbor sampling).
+// This is slower, but results in higher quality on images with highly saturated colors.
+#define JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING 1
+
+#define JPGD_TRUE (1) // Integer value used for true.
+#define JPGD_FALSE (0) // Integer value used for false.
+
+#define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b)) // Larger of a and b; an argument may be evaluated twice, so avoid side effects.
+#define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b)) // Smaller of a and b; an argument may be evaluated twice, so avoid side effects.
+
+namespace jpgd {
+
+	static inline void *jpgd_malloc(size_t nSize) { return FMemory::Malloc(nSize); } // Allocation hook: forwards nSize to FMemory::Malloc and returns its result.
+	static inline void jpgd_free(void *p) { FMemory::Free(p); } // Deallocation hook: forwards p to FMemory::Free.
+
+// BEGIN EPIC MOD
+//@UE3 - use UE3 BGRA encoding instead of assuming RGBA
+	// Pixel format identifiers; copied from IImageWrapper.h.
+	enum ERGBFormatJPG
+	{
+		Invalid = -1,
+		RGBA =  0,
+		BGRA =  1,
+		Gray =  2,
+	};
+	static ERGBFormatJPG jpg_format; // File-scope format selection. NOTE(review): presumably read by the output conversion code outside this chunk - confirm.
+// END EPIC MOD
+
+	// DCT coefficients are stored in this sequence: g_ZAG[k] is the index, within a 64-entry (8x8) block, of the k-th coefficient in stored order.
+	static int g_ZAG[64] = {  0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
+
+	enum JPEG_MARKER // Marker codes. NOTE(review): values look like the JPEG marker bytes that follow 0xFF in a stream (get_char() pads with 0xFF 0xD9, i.e. M_EOI) - confirm against the spec.
+	{
+		M_SOF0  = 0xC0, M_SOF1  = 0xC1, M_SOF2  = 0xC2, M_SOF3  = 0xC3, M_SOF5  = 0xC5, M_SOF6  = 0xC6, M_SOF7  = 0xC7, M_JPG   = 0xC8,
+		M_SOF9  = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT   = 0xC4, M_DAC   = 0xCC,
+		M_RST0  = 0xD0, M_RST1  = 0xD1, M_RST2  = 0xD2, M_RST3  = 0xD3, M_RST4  = 0xD4, M_RST5  = 0xD5, M_RST6  = 0xD6, M_RST7  = 0xD7,
+		M_SOI   = 0xD8, M_EOI   = 0xD9, M_SOS   = 0xDA, M_DQT   = 0xDB, M_DNL   = 0xDC, M_DRI   = 0xDD, M_DHP   = 0xDE, M_EXP   = 0xDF,
+		M_APP0  = 0xE0, M_APP15 = 0xEF, M_JPG0  = 0xF0, M_JPG13 = 0xFD, M_COM   = 0xFE, M_TEM   = 0x01, M_ERROR = 0x100, RST0   = 0xD0
+	};
+
+	enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 }; // Grayscale plus the four chroma subsampling layouts named in the file header comment.
+
+#define CONST_BITS  13 // Fractional bits of the FIX_* constants below (each is x * 2^13, rounded).
+#define PASS1_BITS  2 // Extra fractional bits kept in the row-pass output (see the descale amounts in Row and Col).
+#define SCALEDONE ((int32)1) // The integer 1 as int32; shifted to form the rounding term in DESCALE.
+
+#define FIX_0_298631336  ((int32)2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((int32)3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((int32)4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((int32)6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((int32)7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((int32)9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((int32)12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((int32)15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((int32)16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((int32)16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((int32)20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((int32)25172)       /* FIX(3.072711026) */
+
+#define DESCALE(x,n)  (((x) + (SCALEDONE << ((n)-1))) >> (n)) // Right shift by n with rounding (adds half of 2^n first).
+#define DESCALE_ZEROSHIFT(x,n)  (((x) + (128 << (n)) + (SCALEDONE << ((n)-1))) >> (n)) // As DESCALE, but the result is additionally offset by +128.
+
+#define MULTIPLY(var, cnst)  ((var) * (cnst)) // Plain multiply of the two operands.
+
+#define CLAMP(i) ((static_cast<uint>(i) > 255) ? (((~i) >> 31) & 0xFF) : (i)) // Clamps to 0..255 (negative -> 0, above 255 -> 255; assumes 32-bit int with arithmetic right shift). i is expanded unparenthesized in ~i and more than once, so pass a plain variable.
+
+	// 1D IDCT over one 8-coefficient row (first pass). NONZERO_COLS is the number of leading coefficients read from pSrc; the rest are taken as 0, so the compiler creates a fast path per value.
+	template <int NONZERO_COLS>
+	struct Row
+	{
+		static void idct(int* pTemp, const jpgd_block_t* pSrc) // Reads pSrc[0..NONZERO_COLS-1] and writes pTemp[0..7], keeping PASS1_BITS fractional bits.
+		{
+			// ACCESS_COL() will be optimized at compile time to either an array access, or 0.
+#define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
+
+			const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6); // Even-indexed coefficients 2 and 6.
+
+			const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+			const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+			const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+			const int tmp0 = (ACCESS_COL(0) + ACCESS_COL(4)) << CONST_BITS; // Coefficients 0 and 4, brought to the same CONST_BITS scale as the products above.
+			const int tmp1 = (ACCESS_COL(0) - ACCESS_COL(4)) << CONST_BITS;
+
+			const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+
+			const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1); // Odd-indexed coefficients 7, 5, 3 and 1.
+
+			const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+			const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+			const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
+			const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
+			const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
+			const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+
+			const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+			const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+			const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+			const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+
+			pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS-PASS1_BITS); // Outputs are written in mirrored pairs (0/7, 1/6, 2/5, 3/4): sum and difference of the even and odd parts.
+			pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS-PASS1_BITS);
+			pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS-PASS1_BITS);
+			pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS-PASS1_BITS);
+			pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS-PASS1_BITS);
+			pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS-PASS1_BITS);
+			pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS-PASS1_BITS);
+			pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS-PASS1_BITS);
+		}
+	};
+
+	template <>
+	struct Row<0> // Specialization for a row with no non-zero coefficients.
+	{
+		static void idct(int* pTemp, const jpgd_block_t* pSrc) // Does nothing: pTemp is left unwritten.
+		{
+#ifdef _MSC_VER
+			pTemp; pSrc; // No-op references to the parameters; presumably there to silence MSVC unused-parameter warnings.
+#endif
+		}
+	};
+
+	template <>
+	struct Row<1> // Specialization for a row where only coefficient 0 is read.
+	{
+		static void idct(int* pTemp, const jpgd_block_t* pSrc) // Writes the same scaled value to all eight outputs pTemp[0..7].
+		{
+			const int dcval = (pSrc[0] << PASS1_BITS); // Same PASS1_BITS output scale as the general Row<N> path.
+
+			pTemp[0] = dcval;
+			pTemp[1] = dcval;
+			pTemp[2] = dcval;
+			pTemp[3] = dcval;
+			pTemp[4] = dcval;
+			pTemp[5] = dcval;
+			pTemp[6] = dcval;
+			pTemp[7] = dcval;
+		}
+	};
+
+	// 1D IDCT down one column (second pass). NONZERO_ROWS is the number of leading rows read from pTemp; the rest are taken as 0, so the compiler creates a fast path per value.
+	template <int NONZERO_ROWS>
+	struct Col
+	{
+		static void idct(uint8* pDst_ptr, const int* pTemp) // Reads pTemp[r*8] for r < NONZERO_ROWS and writes eight clamped bytes to pDst_ptr[8*r], r = 0..7.
+		{
+			// ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
+#define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
+
+			const int z2 = ACCESS_ROW(2); // Even-indexed rows 2 and 6.
+			const int z3 = ACCESS_ROW(6);
+
+			const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+			const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+			const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+			const int tmp0 = (ACCESS_ROW(0) + ACCESS_ROW(4)) << CONST_BITS; // Rows 0 and 4, brought to the same CONST_BITS scale as the products above.
+			const int tmp1 = (ACCESS_ROW(0) - ACCESS_ROW(4)) << CONST_BITS;
+
+			const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+
+			const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1); // Odd-indexed rows 7, 5, 3 and 1.
+
+			const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+			const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+			const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
+			const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
+			const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
+			const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+
+			const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+			const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+			const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+			const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+
+			int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS+PASS1_BITS+3); // Removes the CONST_BITS and PASS1_BITS scaling plus 3 more bits, adds 128, then clamps to a byte below.
+			pDst_ptr[8*0] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*7] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*1] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*6] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*2] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*5] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*3] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS+PASS1_BITS+3);
+			pDst_ptr[8*4] = (uint8)CLAMP(i);
+		}
+	};
+
+	template <>
+	struct Col<1> // Specialization for a column where only row 0 is read.
+	{
+		static void idct(uint8* pDst_ptr, const int* pTemp) // Writes one clamped byte value to all eight rows of this column.
+		{
+			int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS+3); // Removes the row-pass PASS1_BITS scaling plus 3 more bits and adds 128.
+			const uint8 dcval_clamped = (uint8)CLAMP(dcval);
+			pDst_ptr[0*8] = dcval_clamped;
+			pDst_ptr[1*8] = dcval_clamped;
+			pDst_ptr[2*8] = dcval_clamped;
+			pDst_ptr[3*8] = dcval_clamped;
+			pDst_ptr[4*8] = dcval_clamped;
+			pDst_ptr[5*8] = dcval_clamped;
+			pDst_ptr[6*8] = dcval_clamped;
+			pDst_ptr[7*8] = dcval_clamped;
+		}
+	};
+
+	static const uint8 s_idct_row_table[] = // Indexed by (block_max_zag - 1) * 8 + row: selects the Row<N> variant that idct() runs for each of the 8 rows.
+	{
+		1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
+		4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
+		6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
+		6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
+		8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
+		8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
+		8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
+		8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
+	};
+
+	static const uint8 s_idct_col_table[] = { 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; // Indexed by block_max_zag - 1: selects the Col<N> variant that idct() runs for every column.
+
+	void idct(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag) // 8x8 inverse DCT: reads 64 coefficients from pSrc_ptr, writes 64 clamped bytes to pDst_ptr. block_max_zag (1..64) selects the fast paths; presumably the count of coefficients present in stored order - confirm with callers.
+	{
+		JPGD_ASSERT(block_max_zag >= 1);
+		JPGD_ASSERT(block_max_zag <= 64);
+
+		if (block_max_zag == 1) // Only coefficient 0 is used: the whole block is one constant value.
+		{
+			int k = ((pSrc_ptr[0] + 4) >> 3) + 128; // Rounded divide by 8, then offset by 128.
+			k = CLAMP(k);
+			k = k | (k<<8); // Replicate the byte into all four bytes of the int.
+			k = k | (k<<16);
+
+			for (int i = 8; i > 0; i--)
+			{
+				*(int*)&pDst_ptr[0] = k; // Two int stores fill one 8-byte row; NOTE(review): assumes 4-byte int and that pDst_ptr tolerates int-sized stores.
+				*(int*)&pDst_ptr[4] = k;
+				pDst_ptr += 8;
+			}
+			return;
+		}
+
+		int temp[64]; // Row-pass output, consumed by the column pass.
+
+		const jpgd_block_t* pSrc = pSrc_ptr;
+		int* pTemp = temp;
+
+		const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8]; // Eight entries: the Row<N> variant to use for each row.
+		int i;
+		for (i = 8; i > 0; i--, pRow_tab++)
+		{
+			switch (*pRow_tab)
+			{
+			case 0: Row<0>::idct(pTemp, pSrc); break;
+			case 1: Row<1>::idct(pTemp, pSrc); break;
+			case 2: Row<2>::idct(pTemp, pSrc); break;
+			case 3: Row<3>::idct(pTemp, pSrc); break;
+			case 4: Row<4>::idct(pTemp, pSrc); break;
+			case 5: Row<5>::idct(pTemp, pSrc); break;
+			case 6: Row<6>::idct(pTemp, pSrc); break;
+			case 7: Row<7>::idct(pTemp, pSrc); break;
+			case 8: Row<8>::idct(pTemp, pSrc); break;
+			}
+
+			pSrc += 8;
+			pTemp += 8;
+		}
+
+		pTemp = temp;
+
+		const int nonzero_rows = s_idct_col_table[block_max_zag - 1]; // One Col<N> variant is used for all eight columns.
+		for (i = 8; i > 0; i--)
+		{
+			switch (nonzero_rows)
+			{
+			case 1: Col<1>::idct(pDst_ptr, pTemp); break;
+			case 2: Col<2>::idct(pDst_ptr, pTemp); break;
+			case 3: Col<3>::idct(pDst_ptr, pTemp); break;
+			case 4: Col<4>::idct(pDst_ptr, pTemp); break;
+			case 5: Col<5>::idct(pDst_ptr, pTemp); break;
+			case 6: Col<6>::idct(pDst_ptr, pTemp); break;
+			case 7: Col<7>::idct(pDst_ptr, pTemp); break;
+			case 8: Col<8>::idct(pDst_ptr, pTemp); break;
+			}
+
+			pTemp++;
+			pDst_ptr++;
+		}
+	}
+
+	void idct_4x4(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr) // Inverse DCT that reads only the top-left 4x4 coefficients of pSrc_ptr and writes a full 8x8 block of bytes to pDst_ptr.
+	{
+		int temp[64]; // Only the first four rows are written and read.
+		int* pTemp = temp;
+		const jpgd_block_t* pSrc = pSrc_ptr;
+
+		for (int i = 4; i > 0; i--) // Row pass over the first four rows, four coefficients each.
+		{
+			Row<4>::idct(pTemp, pSrc);
+			pSrc += 8;
+			pTemp += 8;
+		}
+
+		pTemp = temp;
+		for (int i = 8; i > 0; i--) // Column pass over all eight columns, reading rows 0..3 of temp.
+		{
+			Col<4>::idct(pDst_ptr, pTemp);
+			pTemp++;
+			pDst_ptr++;
+		}
+	}
+
+	// Retrieves one character from the input stream, refilling the input buffer when it is empty. Once no more input is available, successive calls alternate between 0xFF and 0xD9.
+	inline uint jpeg_decoder::get_char()
+	{
+		// Any bytes remaining in buffer?
+		if (!m_in_buf_left)
+		{
+			// Try to get more bytes.
+			prep_in_buffer();
+			// Still nothing to get?
+			if (!m_in_buf_left)
+			{
+				// Pad the end of the stream with 0xFF 0xD9 (EOI marker); m_tem_flag toggles on every padded call to pick which byte to return.
+				int t = m_tem_flag;
+				m_tem_flag ^= 1;
+				if (t)
+					return 0xD9;
+				else
+					return 0xFF;
+			}
+		}
+
+		uint c = *m_pIn_buf_ofs++; // Consume one byte from the buffer.
+		m_in_buf_left--;
+
+		return c;
+	}
+
+	// Same as previous method, except it reports through *pPadding_flag whether the returned character is a pad character (true) or real input (false).
+	inline uint jpeg_decoder::get_char(bool *pPadding_flag)
+	{
+		if (!m_in_buf_left)
+		{
+			prep_in_buffer(); // Try to get more bytes.
+			if (!m_in_buf_left)
+			{
+				*pPadding_flag = true; // No input left: return padding, alternating 0xFF and 0xD9 via m_tem_flag.
+				int t = m_tem_flag;
+				m_tem_flag ^= 1;
+				if (t)
+					return 0xD9;
+				else
+					return 0xFF;
+			}
+		}
+
+		*pPadding_flag = false;
+
+		uint c = *m_pIn_buf_ofs++; // Consume one byte from the buffer.
+		m_in_buf_left--;
+
+		return c;
+	}
+
+	// Inserts a previously retrieved character back into the input buffer by stepping the read pointer back one byte and storing q there. NOTE(review): assumes there is room before the current read position.
+	inline void jpeg_decoder::stuff_char(uint8 q)
+	{
+		*(--m_pIn_buf_ofs) = q;
+		m_in_buf_left++;
+	}
+
+	// Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered.
+	inline uint8 jpeg_decoder::get_octet()
+	{
+		bool padding_flag;
+		int c = get_char(&padding_flag);
+
+		if (c == 0xFF)
+		{
+			if (padding_flag) // End-of-input padding: pass it through.
+				return 0xFF;
+
+			c = get_char(&padding_flag); // Look at the byte following the 0xFF.
+			if (padding_flag)
+			{
+				stuff_char(0xFF); // Put the real 0xFF back so it is seen again on the next read.
+				return 0xFF;
+			}
+
+			if (c == 0x00) // 0xFF 0x00: both bytes are consumed and a single 0xFF is returned.
+				return 0xFF;
+			else
+			{
+				stuff_char(static_cast<uint8>(c)); // Any other pair: push both bytes back (in reverse order) so the pair stays in the buffer.
+				stuff_char(0xFF);
+				return 0xFF;
+			}
+		}
+
+		return static_cast<uint8>(c);
+	}
+
+	// Retrieves num_bits bits from the top of the bit buffer, refilling it two bytes at a time from the input stream. Does not recognize markers.
+	inline uint jpeg_decoder::get_bits(int num_bits)
+	{
+		if (!num_bits)
+			return 0;
+
+		uint i = m_bit_buf >> (32 - num_bits); // The result is the top num_bits bits of the bit buffer.
+
+		if ((m_bits_left -= num_bits) <= 0)
+		{
+			m_bit_buf <<= (num_bits += m_bits_left); // m_bits_left is <= 0 here, so this shifts by num_bits reduced by the shortfall.
+
+			uint c1 = get_char();
+			uint c2 = get_char();
+			m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2; // Place the two new bytes in the low 16 bits.
+
+			m_bit_buf <<= -m_bits_left; // Shift by the shortfall, completing the num_bits shift.
+
+			m_bits_left += 16; // Account for the 16 bits just loaded.
+
+			JPGD_ASSERT(m_bits_left >= 0);
+		}
+		else
+			m_bit_buf <<= num_bits;
+
+		return i;
+	}
+
+	// Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
+	inline uint jpeg_decoder::get_bits_no_markers(int num_bits)
+	{
+		if (!num_bits)
+			return 0;
+
+		uint i = m_bit_buf >> (32 - num_bits); // The result is the top num_bits bits of the bit buffer.
+
+		if ((m_bits_left -= num_bits) <= 0)
+		{
+			m_bit_buf <<= (num_bits += m_bits_left); // m_bits_left is <= 0 here, so this shifts by num_bits reduced by the shortfall.
+
+			if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF)) // Slow path: fewer than two buffered bytes, or a 0xFF that get_octet() must interpret.
+			{
+				uint c1 = get_octet();
+				uint c2 = get_octet();
+				m_bit_buf |= (c1 << 8) | c2;
+			}
+			else
+			{
+				m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1]; // Fast path: take the next two bytes straight from the buffer.
+				m_in_buf_left -= 2;
+				m_pIn_buf_ofs += 2;
+			}
+
+			m_bit_buf <<= -m_bits_left; // Shift by the shortfall, completing the num_bits shift.
+
+			m_bits_left += 16; // Account for the 16 bits just loaded.
+
+			JPGD_ASSERT(m_bits_left >= 0);
+		}
+		else
+			m_bit_buf <<= num_bits;
+
+		return i;
+	}
+
+	// Decodes a Huffman encoded symbol using the tables in pH, consumes its code bits from the bit buffer, and returns the symbol.
+	inline int jpeg_decoder::huff_decode(huff_tables *pH)
+	{
+		int symbol;
+
+		// Check first 8-bits: do we have a complete symbol? A negative look_up entry means the code is longer than 8 bits.
+		if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0)
+		{
+			// Decode more bits, use a tree traversal to find symbol.
+			int ofs = 23; // Bit position in m_bit_buf of the 9th code bit.
+			do
+			{
+				symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))]; // The negated node value plus the next bit selects the tree entry.
+				ofs--;
+			} while (symbol < 0);
+
+			get_bits_no_markers(8 + (23 - ofs)); // Consume the 8 looked-up bits plus one bit per tree step.
+		}
+		else
+			get_bits_no_markers(pH->code_size[symbol]); // Consume exactly the code length of the symbol.
+
+		return symbol;
+	}
+
+	// Decodes a Huffman encoded symbol and also reads the extra bits that follow it; the low 4 bits of the symbol give their count. Returns the symbol and stores the extra bits in extra_bits.
+	inline int jpeg_decoder::huff_decode(huff_tables *pH, int& extra_bits)
+	{
+		int symbol;
+
+		// Check first 8-bits: do we have a complete symbol? A negative look_up2 entry means the code is longer than 8 bits.
+		if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0)
+		{
+			// Use a tree traversal to find symbol.
+			int ofs = 23; // Bit position in m_bit_buf of the 9th code bit.
+			do
+			{
+				symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))]; // The negated node value plus the next bit selects the tree entry.
+				ofs--;
+			} while (symbol < 0);
+
+			get_bits_no_markers(8 + (23 - ofs)); // Consume the 8 looked-up bits plus one bit per tree step.
+
+			extra_bits = get_bits_no_markers(symbol & 0xF);
+		}
+		else
+		{
+			JPGD_ASSERT(((symbol >> 8) & 31) == pH->code_size[symbol & 255] + ((symbol & 0x8000) ? (symbol & 15) : 0)); // Packed entry: low 8 bits = symbol, bits 8..12 = bits to consume, 0x8000 = extra bits are already stored in bits 16 and up.
+
+			if (symbol & 0x8000)
+			{
+				get_bits_no_markers((symbol >> 8) & 31); // Code and extra bits are skipped in one call; their value comes from the table entry.
+				extra_bits = symbol >> 16;
+			}
+			else
+			{
+				int code_size = (symbol >> 8) & 31;
+				int num_extra_bits = symbol & 0xF;
+				int bits = code_size + num_extra_bits;
+				if (bits <= (m_bits_left + 16)) // One combined read when enough bits are available; the mask then drops the code bits.
+					extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1);
+				else
+				{
+					get_bits_no_markers(code_size); // Otherwise read the code and the extra bits separately.
+					extra_bits = get_bits_no_markers(num_extra_bits);
+				}
+			}
+
+			symbol &= 0xFF; // Strip the packed fields, leaving only the symbol value.
+		}
+
+		return symbol;
+	}
+
+	// Tables and macro used to fully decode the DPCM differences.
+	static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 }; // s_extend_test[s] = 2^(s-1) for s >= 1.
+	static const int s_extend_offset[16] = { 0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767 }; // s_extend_offset[s] = 1 - 2^s for s >= 1.
+	static const int s_extend_mask[] = { 0, (1<<0), (1<<1), (1<<2), (1<<3), (1<<4), (1<<5), (1<<6), (1<<7), (1<<8), (1<<9), (1<<10), (1<<11), (1<<12), (1<<13), (1<<14), (1<<15), (1<<16) }; // Entry 0 is 0 and entry k is 1 << (k-1); not referenced in this part of the file.
+#define HUFF_EXTEND(x,s) ((x) < s_extend_test[s] ? (x) + s_extend_offset[s] : (x)) // Values of x below 2^(s-1) become x - (2^s - 1); others are returned unchanged. s is used unparenthesized as an array index.
+
+	// Clamps a value between 0-255: negative inputs give 0, inputs above 255 give 255 (assumes 32-bit int with arithmetic right shift).
+	inline uint8 jpeg_decoder::clamp(int i)
+	{
+		if (static_cast<uint>(i) > 255) // One unsigned compare catches both negative and too-large values.
+			i = (((~i) >> 31) & 0xFF); // ~i has its sign bit set only when i was non-negative, giving 0xFF; otherwise 0.
+
+		return static_cast<uint8>(i);
+	}
+
+	namespace DCT_Upsample
+	{
+		struct Matrix44 // 4x4 matrix of int with element-wise add/subtract helpers.
+		{
+			typedef int Element_Type;
+			enum { NUM_ROWS = 4, NUM_COLS = 4 };
+
+			Element_Type v[NUM_ROWS][NUM_COLS]; // Elements stored as v[row][col].
+
+			inline int rows() const { return NUM_ROWS; }
+			inline int cols() const { return NUM_COLS; }
+
+			inline const Element_Type & at(int r, int c) const { return v[r][c]; } // Unchecked element access.
+			inline       Element_Type & at(int r, int c)       { return v[r][c]; }
+
+			inline Matrix44() { } // Leaves the elements uninitialized.
+
+			inline Matrix44& operator += (const Matrix44& a) // Element-wise in-place addition.
+			{
+				for (int r = 0; r < NUM_ROWS; r++)
+				{
+					at(r, 0) += a.at(r, 0);
+					at(r, 1) += a.at(r, 1);
+					at(r, 2) += a.at(r, 2);
+					at(r, 3) += a.at(r, 3);
+				}
+				return *this;
+			}
+
+			inline Matrix44& operator -= (const Matrix44& a) // Element-wise in-place subtraction.
+			{
+				for (int r = 0; r < NUM_ROWS; r++)
+				{
+					at(r, 0) -= a.at(r, 0);
+					at(r, 1) -= a.at(r, 1);
+					at(r, 2) -= a.at(r, 2);
+					at(r, 3) -= a.at(r, 3);
+				}
+				return *this;
+			}
+
+			friend inline Matrix44 operator + (const Matrix44& a, const Matrix44& b) // Returns the element-wise sum a + b.
+			{
+				Matrix44 ret;
+				for (int r = 0; r < NUM_ROWS; r++)
+				{
+					ret.at(r, 0) = a.at(r, 0) + b.at(r, 0);
+					ret.at(r, 1) = a.at(r, 1) + b.at(r, 1);
+					ret.at(r, 2) = a.at(r, 2) + b.at(r, 2);
+					ret.at(r, 3) = a.at(r, 3) + b.at(r, 3);
+				}
+				return ret;
+			}
+
+			friend inline Matrix44 operator - (const Matrix44& a, const Matrix44& b) // Returns the element-wise difference a - b.
+			{
+				Matrix44 ret;
+				for (int r = 0; r < NUM_ROWS; r++)
+				{
+					ret.at(r, 0) = a.at(r, 0) - b.at(r, 0);
+					ret.at(r, 1) = a.at(r, 1) - b.at(r, 1);
+					ret.at(r, 2) = a.at(r, 2) - b.at(r, 2);
+					ret.at(r, 3) = a.at(r, 3) - b.at(r, 3);
+				}
+				return ret;
+			}
+
+			static inline void add_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b) // Writes a + b into the 8-wide block pDst, transposed: element (r, c) goes to pDst[c*8 + r].
+			{
+				for (int r = 0; r < 4; r++)
+				{
+					pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) + b.at(r, 0));
+					pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) + b.at(r, 1));
+					pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) + b.at(r, 2));
+					pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) + b.at(r, 3));
+				}
+			}
+
+			static inline void sub_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b) // Writes a - b into the 8-wide block pDst, transposed: element (r, c) goes to pDst[c*8 + r].
+			{
+				for (int r = 0; r < 4; r++)
+				{
+					pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) - b.at(r, 0));
+					pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) - b.at(r, 1));
+					pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) - b.at(r, 2));
+					pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) - b.at(r, 3));
+				}
+			}
+		};
+
+		const int FRACT_BITS = 10; // Fractional bits of the fixed-point constants produced by F().
+		const int SCALE = 1 << FRACT_BITS;
+
+		typedef int Temp_Type;
+#define D(i) (((i) + (SCALE >> 1)) >> FRACT_BITS) // Descale: right shift by FRACT_BITS after adding half of SCALE for rounding.
+#define F(i) ((int)((i) * SCALE + .5f)) // Float constant to fixed point: adds 0.5 then truncates, which rounds to nearest for non-negative values.
+
+		// Any decent C++ compiler will optimize this at compile time to a 0, or an array access. NUM_COLS and NUM_ROWS are whatever is in scope where the macro is expanded.
+#define AT(c, r) ((((c)>=NUM_COLS)||((r)>=NUM_ROWS)) ? 0 : pSrc[(c)+(r)*8])
+
+		// P_Q::calc fills the 4x4 matrices P and Q from the 8x8 block pSrc. NUM_ROWS/NUM_COLS = # of non-zero rows/cols in input matrix; AT() treats everything at or beyond them as 0.
+		template<int NUM_ROWS, int NUM_COLS>
+		struct P_Q
+		{
+			static void calc(Matrix44& P, Matrix44& Q, const jpgd_block_t* pSrc)
+			{
+				// 4x8 = 4x8 times 8x8, matrix 0 is constant
+				const Temp_Type X000 = AT(0, 0);
+				const Temp_Type X001 = AT(0, 1);
+				const Temp_Type X002 = AT(0, 2);
+				const Temp_Type X003 = AT(0, 3);
+				const Temp_Type X004 = AT(0, 4);
+				const Temp_Type X005 = AT(0, 5);
+				const Temp_Type X006 = AT(0, 6);
+				const Temp_Type X007 = AT(0, 7);
+				const Temp_Type X010 = D(F(0.415735f) * AT(1, 0) + F(0.791065f) * AT(3, 0) + F(-0.352443f) * AT(5, 0) + F(0.277785f) * AT(7, 0));
+				const Temp_Type X011 = D(F(0.415735f) * AT(1, 1) + F(0.791065f) * AT(3, 1) + F(-0.352443f) * AT(5, 1) + F(0.277785f) * AT(7, 1));
+				const Temp_Type X012 = D(F(0.415735f) * AT(1, 2) + F(0.791065f) * AT(3, 2) + F(-0.352443f) * AT(5, 2) + F(0.277785f) * AT(7, 2));
+				const Temp_Type X013 = D(F(0.415735f) * AT(1, 3) + F(0.791065f) * AT(3, 3) + F(-0.352443f) * AT(5, 3) + F(0.277785f) * AT(7, 3));
+				const Temp_Type X014 = D(F(0.415735f) * AT(1, 4) + F(0.791065f) * AT(3, 4) + F(-0.352443f) * AT(5, 4) + F(0.277785f) * AT(7, 4));
+				const Temp_Type X015 = D(F(0.415735f) * AT(1, 5) + F(0.791065f) * AT(3, 5) + F(-0.352443f) * AT(5, 5) + F(0.277785f) * AT(7, 5));
+				const Temp_Type X016 = D(F(0.415735f) * AT(1, 6) + F(0.791065f) * AT(3, 6) + F(-0.352443f) * AT(5, 6) + F(0.277785f) * AT(7, 6));
+				const Temp_Type X017 = D(F(0.415735f) * AT(1, 7) + F(0.791065f) * AT(3, 7) + F(-0.352443f) * AT(5, 7) + F(0.277785f) * AT(7, 7));
+				const Temp_Type X020 = AT(4, 0);
+				const Temp_Type X021 = AT(4, 1);
+				const Temp_Type X022 = AT(4, 2);
+				const Temp_Type X023 = AT(4, 3);
+				const Temp_Type X024 = AT(4, 4);
+				const Temp_Type X025 = AT(4, 5);
+				const Temp_Type X026 = AT(4, 6);
+				const Temp_Type X027 = AT(4, 7);
+				const Temp_Type X030 = D(F(0.022887f) * AT(1, 0) + F(-0.097545f) * AT(3, 0) + F(0.490393f) * AT(5, 0) + F(0.865723f) * AT(7, 0));
+				const Temp_Type X031 = D(F(0.022887f) * AT(1, 1) + F(-0.097545f) * AT(3, 1) + F(0.490393f) * AT(5, 1) + F(0.865723f) * AT(7, 1));
+				const Temp_Type X032 = D(F(0.022887f) * AT(1, 2) + F(-0.097545f) * AT(3, 2) + F(0.490393f) * AT(5, 2) + F(0.865723f) * AT(7, 2));
+				const Temp_Type X033 = D(F(0.022887f) * AT(1, 3) + F(-0.097545f) * AT(3, 3) + F(0.490393f) * AT(5, 3) + F(0.865723f) * AT(7, 3));
+				const Temp_Type X034 = D(F(0.022887f) * AT(1, 4) + F(-0.097545f) * AT(3, 4) + F(0.490393f) * AT(5, 4) + F(0.865723f) * AT(7, 4));
+				const Temp_Type X035 = D(F(0.022887f) * AT(1, 5) + F(-0.097545f) * AT(3, 5) + F(0.490393f) * AT(5, 5) + F(0.865723f) * AT(7, 5));
+				const Temp_Type X036 = D(F(0.022887f) * AT(1, 6) + F(-0.097545f) * AT(3, 6) + F(0.490393f) * AT(5, 6) + F(0.865723f) * AT(7, 6));
+				const Temp_Type X037 = D(F(0.022887f) * AT(1, 7) + F(-0.097545f) * AT(3, 7) + F(0.490393f) * AT(5, 7) + F(0.865723f) * AT(7, 7));
+
+				// 4x4 = 4x8 times 8x4, matrix 1 is constant
+				P.at(0, 0) = X000;
+				P.at(0, 1) = D(X001 * F(0.415735f) + X003 * F(0.791065f) + X005 * F(-0.352443f) + X007 * F(0.277785f));
+				P.at(0, 2) = X004;
+				P.at(0, 3) = D(X001 * F(0.022887f) + X003 * F(-0.097545f) + X005 * F(0.490393f) + X007 * F(0.865723f));
+				P.at(1, 0) = X010;
+				P.at(1, 1) = D(X011 * F(0.415735f) + X013 * F(0.791065f) + X015 * F(-0.352443f) + X017 * F(0.277785f));
+				P.at(1, 2) = X014;
+				P.at(1, 3) = D(X011 * F(0.022887f) + X013 * F(-0.097545f) + X015 * F(0.490393f) + X017 * F(0.865723f));
+				P.at(2, 0) = X020;
+				P.at(2, 1) = D(X021 * F(0.415735f) + X023 * F(0.791065f) + X025 * F(-0.352443f) + X027 * F(0.277785f));
+				P.at(2, 2) = X024;
+				P.at(2, 3) = D(X021 * F(0.022887f) + X023 * F(-0.097545f) + X025 * F(0.490393f) + X027 * F(0.865723f));
+				P.at(3, 0) = X030;
+				P.at(3, 1) = D(X031 * F(0.415735f) + X033 * F(0.791065f) + X035 * F(-0.352443f) + X037 * F(0.277785f));
+				P.at(3, 2) = X034;
+				P.at(3, 3) = D(X031 * F(0.022887f) + X033 * F(-0.097545f) + X035 * F(0.490393f) + X037 * F(0.865723f));
+				// 40 muls 24 adds
+
+				// 4x4 = 4x8 times 8x4, matrix 1 is constant
+				Q.at(0, 0) = D(X001 * F(0.906127f) + X003 * F(-0.318190f) + X005 * F(0.212608f) + X007 * F(-0.180240f));
+				Q.at(0, 1) = X002;
+				Q.at(0, 2) = D(X001 * F(-0.074658f) + X003 * F(0.513280f) + X005 * F(0.768178f) + X007 * F(-0.375330f));
+				Q.at(0, 3) = X006;
+				Q.at(1, 0) = D(X011 * F(0.906127f) + X013 * F(-0.318190f) + X015 * F(0.212608f) + X017 * F(-0.180240f));
+				Q.at(1, 1) = X012;
+				Q.at(1, 2) = D(X011 * F(-0.074658f) + X013 * F(0.513280f) + X015 * F(0.768178f) + X017 * F(-0.375330f));
+				Q.at(1, 3) = X016;
+				Q.at(2, 0) = D(X021 * F(0.906127f) + X023 * F(-0.318190f) + X025 * F(0.212608f) + X027 * F(-0.180240f));
+				Q.at(2, 1) = X022;
+				Q.at(2, 2) = D(X021 * F(-0.074658f) + X023 * F(0.513280f) + X025 * F(0.768178f) + X027 * F(-0.375330f));
+				Q.at(2, 3) = X026;
+				Q.at(3, 0) = D(X031 * F(0.906127f) + X033 * F(-0.318190f) + X035 * F(0.212608f) + X037 * F(-0.180240f));
+				Q.at(3, 1) = X032;
+				Q.at(3, 2) = D(X031 * F(-0.074658f) + X033 * F(0.513280f) + X035 * F(0.768178f) + X037 * F(-0.375330f));
+				Q.at(3, 3) = X036;
+				// 40 muls 24 adds
+			}
+		};
+
+		template<int NUM_ROWS, int NUM_COLS>
+		struct R_S // R_S::calc fills the 4x4 matrices R and S from the 8x8 block pSrc; AT() treats entries at or beyond NUM_ROWS/NUM_COLS as 0.
+		{
+			static void calc(Matrix44& R, Matrix44& S, const jpgd_block_t* pSrc)
+			{
+				// 4x8 = 4x8 times 8x8, matrix 0 is constant
+				const Temp_Type X100 = D(F(0.906127f) * AT(1, 0) + F(-0.318190f) * AT(3, 0) + F(0.212608f) * AT(5, 0) + F(-0.180240f) * AT(7, 0));
+				const Temp_Type X101 = D(F(0.906127f) * AT(1, 1) + F(-0.318190f) * AT(3, 1) + F(0.212608f) * AT(5, 1) + F(-0.180240f) * AT(7, 1));
+				const Temp_Type X102 = D(F(0.906127f) * AT(1, 2) + F(-0.318190f) * AT(3, 2) + F(0.212608f) * AT(5, 2) + F(-0.180240f) * AT(7, 2));
+				const Temp_Type X103 = D(F(0.906127f) * AT(1, 3) + F(-0.318190f) * AT(3, 3) + F(0.212608f) * AT(5, 3) + F(-0.180240f) * AT(7, 3));
+				const Temp_Type X104 = D(F(0.906127f) * AT(1, 4) + F(-0.318190f) * AT(3, 4) + F(0.212608f) * AT(5, 4) + F(-0.180240f) * AT(7, 4));
+				const Temp_Type X105 = D(F(0.906127f) * AT(1, 5) + F(-0.318190f) * AT(3, 5) + F(0.212608f) * AT(5, 5) + F(-0.180240f) * AT(7, 5));
+				const Temp_Type X106 = D(F(0.906127f) * AT(1, 6) + F(-0.318190f) * AT(3, 6) + F(0.212608f) * AT(5, 6) + F(-0.180240f) * AT(7, 6));
+				const Temp_Type X107 = D(F(0.906127f) * AT(1, 7) + F(-0.318190f) * AT(3, 7) + F(0.212608f) * AT(5, 7) + F(-0.180240f) * AT(7, 7));
+				const Temp_Type X110 = AT(2, 0);
+				const Temp_Type X111 = AT(2, 1);
+				const Temp_Type X112 = AT(2, 2);
+				const Temp_Type X113 = AT(2, 3);
+				const Temp_Type X114 = AT(2, 4);
+				const Temp_Type X115 = AT(2, 5);
+				const Temp_Type X116 = AT(2, 6);
+				const Temp_Type X117 = AT(2, 7);
+				const Temp_Type X120 = D(F(-0.074658f) * AT(1, 0) + F(0.513280f) * AT(3, 0) + F(0.768178f) * AT(5, 0) + F(-0.375330f) * AT(7, 0));
+				const Temp_Type X121 = D(F(-0.074658f) * AT(1, 1) + F(0.513280f) * AT(3, 1) + F(0.768178f) * AT(5, 1) + F(-0.375330f) * AT(7, 1));
+				const Temp_Type X122 = D(F(-0.074658f) * AT(1, 2) + F(0.513280f) * AT(3, 2) + F(0.768178f) * AT(5, 2) + F(-0.375330f) * AT(7, 2));
+				const Temp_Type X123 = D(F(-0.074658f) * AT(1, 3) + F(0.513280f) * AT(3, 3) + F(0.768178f) * AT(5, 3) + F(-0.375330f) * AT(7, 3));
+				const Temp_Type X124 = D(F(-0.074658f) * AT(1, 4) + F(0.513280f) * AT(3, 4) + F(0.768178f) * AT(5, 4) + F(-0.375330f) * AT(7, 4));
+				const Temp_Type X125 = D(F(-0.074658f) * AT(1, 5) + F(0.513280f) * AT(3, 5) + F(0.768178f) * AT(5, 5) + F(-0.375330f) * AT(7, 5));
+				const Temp_Type X126 = D(F(-0.074658f) * AT(1, 6) + F(0.513280f) * AT(3, 6) + F(0.768178f) * AT(5, 6) + F(-0.375330f) * AT(7, 6));
+				const Temp_Type X127 = D(F(-0.074658f) * AT(1, 7) + F(0.513280f) * AT(3, 7) + F(0.768178f) * AT(5, 7) + F(-0.375330f) * AT(7, 7));
+				const Temp_Type X130 = AT(6, 0);
+				const Temp_Type X131 = AT(6, 1);
+				const Temp_Type X132 = AT(6, 2);
+				const Temp_Type X133 = AT(6, 3);
+				const Temp_Type X134 = AT(6, 4);
+				const Temp_Type X135 = AT(6, 5);
+				const Temp_Type X136 = AT(6, 6);
+				const Temp_Type X137 = AT(6, 7);
+				// 80 muls 48 adds
+
+				// 4x4 = 4x8 times 8x4, matrix 1 is constant
+				R.at(0, 0) = X100;
+				R.at(0, 1) = D(X101 * F(0.415735f) + X103 * F(0.791065f) + X105 * F(-0.352443f) + X107 * F(0.277785f));
+				R.at(0, 2) = X104;
+				R.at(0, 3) = D(X101 * F(0.022887f) + X103 * F(-0.097545f) + X105 * F(0.490393f) + X107 * F(0.865723f));
+				R.at(1, 0) = X110;
+				R.at(1, 1) = D(X111 * F(0.415735f) + X113 * F(0.791065f) + X115 * F(-0.352443f) + X117 * F(0.277785f));
+				R.at(1, 2) = X114;
+				R.at(1, 3) = D(X111 * F(0.022887f) + X113 * F(-0.097545f) + X115 * F(0.490393f) + X117 * F(0.865723f));
+				R.at(2, 0) = X120;
+				R.at(2, 1) = D(X121 * F(0.415735f) + X123 * F(0.791065f) + X125 * F(-0.352443f) + X127 * F(0.277785f));
+				R.at(2, 2) = X124;
+				R.at(2, 3) = D(X121 * F(0.022887f) + X123 * F(-0.097545f) + X125 * F(0.490393f) + X127 * F(0.865723f));
+				R.at(3, 0) = X130;
+				R.at(3, 1) = D(X131 * F(0.415735f) + X133 * F(0.791065f) + X135 * F(-0.352443f) + X137 * F(0.277785f));
+				R.at(3, 2) = X134;
+				R.at(3, 3) = D(X131 * F(0.022887f) + X133 * F(-0.097545f) + X135 * F(0.490393f) + X137 * F(0.865723f));
+				// 40 muls 24 adds
+				// 4x4 = 4x8 times 8x4, matrix 1 is constant
+				S.at(0, 0) = D(X101 * F(0.906127f) + X103 * F(-0.318190f) + X105 * F(0.212608f) + X107 * F(-0.180240f));
+				S.at(0, 1) = X102;
+				S.at(0, 2) = D(X101 * F(-0.074658f) + X103 * F(0.513280f) + X105 * F(0.768178f) + X107 * F(-0.375330f));
+				S.at(0, 3) = X106;
+				S.at(1, 0) = D(X111 * F(0.906127f) + X113 * F(-0.318190f) + X115 * F(0.212608f) + X117 * F(-0.180240f));
+				S.at(1, 1) = X112;
+				S.at(1, 2) = D(X111 * F(-0.074658f) + X113 * F(0.513280f) + X115 * F(0.768178f) + X117 * F(-0.375330f));
+				S.at(1, 3) = X116;
+				S.at(2, 0) = D(X121 * F(0.906127f) + X123 * F(-0.318190f) + X125 * F(0.212608f) + X127 * F(-0.180240f));
+				S.at(2, 1) = X122;
+				S.at(2, 2) = D(X121 * F(-0.074658f) + X123 * F(0.513280f) + X125 * F(0.768178f) + X127 * F(-0.375330f));
+				S.at(2, 3) = X126;
+				S.at(3, 0) = D(X131 * F(0.906127f) + X133 * F(-0.318190f) + X135 * F(0.212608f) + X137 * F(-0.180240f));
+				S.at(3, 1) = X132;
+				S.at(3, 2) = D(X131 * F(-0.074658f) + X133 * F(0.513280f) + X135 * F(0.768178f) + X137 * F(-0.375330f));
+				S.at(3, 3) = X136;
+				// 40 muls 24 adds
+			}
+		};
+	} // end namespace DCT_Upsample
+
+	// Unconditionally frees all allocated memory blocks.
+	void jpeg_decoder::free_all_blocks()
+	{
+		// Forget the input stream; the pointer is only cleared here, nothing is done to the stream itself.
+		m_pStream = NULL;
+
+		// Walk the singly linked block list, remembering each successor before its owner is released.
+		mem_block *pCur = m_pMem_blocks;
+		while (pCur != NULL)
+		{
+			mem_block *pFollowing = pCur->m_pNext;
+			jpgd_free(pCur);
+			pCur = pFollowing;
+		}
+
+		// The list head would dangle otherwise.
+		m_pMem_blocks = NULL;
+	}
+
+	// This method handles all errors.
+	// It could easily be changed to use C++ exceptions.
+	void jpeg_decoder::stop_decoding(jpgd_status status)
+	{
+		// Record the failure code first so it is available after control has been transferred.
+		m_error_code = status;
+		// Release every block owned by the decoder (this also clears m_pStream) before unwinding.
+		free_all_blocks();
+		// Transfer control to the context saved in m_jmp_state; nothing below this line is expected to run.
+		longjmp(m_jmp_state, status);
+
+		// we shouldn't get here as longjmp shouldn't return, but we put it here to make it explicit
+		// that this function doesn't return, otherwise we get this error:
+		// 
+		// error : function declared 'noreturn' should not return
+		exit(1);
+	}
+
+	// Allocates nSize bytes from the decoder's private block list.
+	//   nSize - requested size in bytes; 0 is treated as 1 and the size is rounded up to a multiple of 4.
+	//   zero  - when true, the returned region is cleared to 0.
+	// Returns a pointer into an existing block with enough free room, or into a newly
+	// allocated block. If the underlying allocation fails, stop_decoding(JPGD_NOTENOUGHMEM)
+	// is called. Individual allocations are never released one by one; free_all_blocks()
+	// releases every block at once.
+	void *jpeg_decoder::alloc(size_t nSize, bool zero)
+	{
+		// Never hand out zero-sized regions, and keep every region 4-byte granular.
+		nSize = (JPGD_MAX(nSize, 1) + 3) & ~static_cast<size_t>(3);
+		char *rv = NULL;
+
+		// First fit: reuse the first existing block that still has room.
+		for (mem_block *b = m_pMem_blocks; b; b = b->m_pNext)
+		{
+			if ((b->m_used_count + nSize) <= b->m_size)
+			{
+				rv = b->m_data + b->m_used_count;
+				b->m_used_count += nSize;
+				break;
+			}
+		}
+
+		if (!rv)
+		{
+			// Block capacity is computed in size_t. The previous code stored it in an int,
+			// which silently truncated requests that do not fit in an int and could
+			// therefore allocate a block smaller than the region handed back.
+			const size_t min_capacity = 32768 - 256;
+			const size_t rounded_request = (nSize + 2047) & ~static_cast<size_t>(2047);
+			const size_t capacity = JPGD_MAX(min_capacity, rounded_request);
+
+			mem_block *b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity);
+			if (!b) stop_decoding(JPGD_NOTENOUGHMEM);
+
+			// Push the new block on the front of the list and carve the request from its start.
+			b->m_pNext = m_pMem_blocks; m_pMem_blocks = b;
+			b->m_used_count = nSize;
+			b->m_size = capacity;
+			rv = b->m_data;
+		}
+
+		if (zero) memset(rv, 0, nSize);
+		return rv;
+	}
+
+	// Fills n consecutive 16-bit slots starting at p with the value c,
+	// always storing the low byte of c first and the high byte second.
+	void jpeg_decoder::word_clear(void *p, uint16 c, uint n)
+	{
+		const uint8 lo_byte = c & 0xFF;
+		const uint8 hi_byte = (c >> 8) & 0xFF;
+		uint8 *pOut = (uint8*)p;
+
+		for (uint remaining = n; remaining != 0; remaining--)
+		{
+			*pOut++ = lo_byte;
+			*pOut++ = hi_byte;
+		}
+	}
+
+	// Refill the input buffer.
+	// This method will sit in a loop until (A) the buffer is full or (B)
+	// the stream's read() method reports an end-of-file condition.
+	void jpeg_decoder::prep_in_buffer()
+	{
+		// Start over at the beginning of the input buffer.
+		m_pIn_buf_ofs = m_in_buf;
+		m_in_buf_left = 0;
+
+		// Once end of file has been seen there is nothing more to fetch.
+		if (m_eof_flag)
+			return;
+
+		for ( ; ; )
+		{
+			// Ask the stream to fill whatever room is left; it sets m_eof_flag itself.
+			const int got = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
+			if (got == -1)
+				stop_decoding(JPGD_STREAM_READ);
+
+			m_in_buf_left += got;
+
+			// Done when the buffer is full or the stream reported end of file.
+			if ((m_in_buf_left >= JPGD_IN_BUF_SIZE) || (m_eof_flag))
+				break;
+		}
+
+		m_total_bytes_read += m_in_buf_left;
+
+		// Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid).
+		// (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
+		// 64 words == 128 bytes are written directly after the valid data.
+		word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);
+	}
+
+	// Read a Huffman code table.
+	void jpeg_decoder::read_dht_marker()
+	{
+		int i, index, count;
+		uint8 huff_num[17];
+		uint8 huff_val[256];
+
+		// Segment length; it counts its own two bytes.
+		uint num_left = get_bits(16);
+
+		if (num_left < 2)
+			stop_decoding(JPGD_BAD_DHT_MARKER);
+
+		num_left -= 2;
+
+		// One segment may carry several tables back to back.
+		while (num_left)
+		{
+			// High nibble: table class, low nibble: table number.
+			index = get_bits(8);
+
+			// huff_num[k] is the number of codes of length k; slot 0 is unused.
+			huff_num[0] = 0;
+
+			count = 0;
+
+			for (i = 1; i <= 16; i++)
+			{
+				huff_num[i] = static_cast<uint8>(get_bits(8));
+				count += huff_num[i];
+			}
+
+			// huff_val holds at most 256 symbols; more than 255 codes is rejected.
+			if (count > 255)
+				stop_decoding(JPGD_BAD_DHT_COUNTS);
+
+			for (i = 0; i < count; i++)
+				huff_val[i] = static_cast<uint8>(get_bits(8));
+
+			// Bytes consumed by this table: 1 index byte + 16 length counts + the symbols.
+			i = 1 + 16 + count;
+
+			if (num_left < (uint)i)
+				stop_decoding(JPGD_BAD_DHT_MARKER);
+
+			num_left -= i;
+
+			// Only table classes 0 and 1 can be mapped to a slot below. The previous test,
+			// (index & 0x10) > 0x10, could never be true, so any other class value was
+			// silently treated as class 0 or 1 depending on bit 4 alone.
+			if ((index & 0xF0) > 0x10)
+				stop_decoding(JPGD_BAD_DHT_INDEX);
+
+			// Class 0 tables go in the lower half of the slot range, class 1 tables in the upper half.
+			index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);
+
+			if (index >= JPGD_MAX_HUFF_TABLES)
+				stop_decoding(JPGD_BAD_DHT_INDEX);
+
+			// Storage for a slot is allocated once and reused if the table is redefined.
+			if (!m_huff_num[index])
+				m_huff_num[index] = (uint8 *)alloc(17);
+
+			if (!m_huff_val[index])
+				m_huff_val[index] = (uint8 *)alloc(256);
+
+			// NOTE(review): index has already been remapped to a slot number here, so this
+			// tests bit 4 of the slot rather than the class bit read from the stream. If
+			// JPGD_MAX_HUFF_TABLES is <= 16 the flag is always false - confirm what readers
+			// of m_huff_ac expect before changing it.
+			m_huff_ac[index] = (index & 0x10) != 0;
+			memcpy(m_huff_num[index], huff_num, 17);
+			// All 256 bytes are copied; only the first `count` entries were read from the stream.
+			memcpy(m_huff_val[index], huff_val, 256);
+		}
+	}
+
+	// Read a quantization table.
+	void jpeg_decoder::read_dqt_marker()
+	{
+		// Segment length; it counts its own two bytes.
+		uint remaining = get_bits(16);
+
+		if (remaining < 2)
+			stop_decoding(JPGD_BAD_DQT_MARKER);
+
+		// A segment may define several tables back to back.
+		for (remaining -= 2; remaining != 0; )
+		{
+			// High nibble: entry precision flag, low nibble: table number.
+			const int table_byte = get_bits(8);
+			const int prec = table_byte >> 4;
+			const int n = table_byte & 0x0F;
+
+			if (n >= JPGD_MAX_QUANT_TABLES)
+				stop_decoding(JPGD_BAD_DQT_TABLE);
+
+			// Table storage is allocated on first definition and reused afterwards.
+			if (!m_quant[n])
+				m_quant[n] = (jpgd_quant_t *)alloc(64 * sizeof(jpgd_quant_t));
+
+			// The 64 entries are stored in the order they appear in the stream (zag order).
+			for (int entry = 0; entry < 64; entry++)
+			{
+				uint value = get_bits(8);
+
+				// A non-zero precision flag means every entry is two bytes, high byte first.
+				if (prec)
+					value = (value << 8) + get_bits(8);
+
+				m_quant[n][entry] = static_cast<jpgd_quant_t>(value);
+			}
+
+			// Bytes consumed by this table: 1 header byte plus 64 or 128 entry bytes.
+			const uint table_bytes = prec ? (64 + 1 + 64) : (64 + 1);
+
+			if (remaining < table_bytes)
+				stop_decoding(JPGD_BAD_DQT_LENGTH);
+
+			remaining -= table_bytes;
+		}
+	}
+
+	// Read the start of frame (SOF) marker.
+	void jpeg_decoder::read_sof_marker()
+	{
+		const uint segment_len = get_bits(16);
+
+		// Sample precision: anything other than 8 bits is rejected.
+		if (get_bits(8) != 8)
+			stop_decoding(JPGD_BAD_PRECISION);
+
+		// Height comes first in the stream, then width.
+		m_image_y_size = get_bits(16);
+		if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
+			stop_decoding(JPGD_BAD_HEIGHT);
+
+		m_image_x_size = get_bits(16);
+		if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
+			stop_decoding(JPGD_BAD_WIDTH);
+
+		m_comps_in_frame = get_bits(8);
+		if (m_comps_in_frame > JPGD_MAX_COMPONENTS)
+			stop_decoding(JPGD_TOO_MANY_COMPONENTS);
+
+		// The declared length must be exactly 8 fixed bytes plus 3 bytes per component.
+		if (segment_len != (uint)(m_comps_in_frame * 3 + 8))
+			stop_decoding(JPGD_BAD_SOF_LENGTH);
+
+		// Per component: id byte, horizontal/vertical sampling nibbles, quantization table number.
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+		{
+			m_comp_ident[comp] = get_bits(8);
+			m_comp_h_samp[comp] = get_bits(4);
+			m_comp_v_samp[comp] = get_bits(4);
+			m_comp_quant[comp] = get_bits(8);
+		}
+	}
+
+	// Used to skip unrecognized markers.
+	void jpeg_decoder::skip_variable_marker()
+	{
+		// Segment length; it counts its own two bytes.
+		const uint segment_len = get_bits(16);
+
+		if (segment_len < 2)
+			stop_decoding(JPGD_BAD_VARIABLE_MARKER);
+
+		// Read and discard the payload one byte at a time.
+		for (uint payload = segment_len - 2; payload != 0; payload--)
+			get_bits(8);
+	}
+
+	// Read a define restart interval (DRI) marker.
+	void jpeg_decoder::read_dri_marker()
+	{
+		// The segment is fixed size: 2 length bytes + 2 interval bytes.
+		const uint segment_len = get_bits(16);
+		if (segment_len != 4)
+			stop_decoding(JPGD_BAD_DRI_LENGTH);
+
+		// Stored as read; decode_next_row() treats 0 as "no restart processing".
+		m_restart_interval = get_bits(16);
+	}
+
+	// Read a start of scan (SOS) marker.
+	void jpeg_decoder::read_sos_marker()
+	{
+		uint num_left;
+		int i, ci, n, c, cc;
+
+		num_left = get_bits(16);
+
+		// Number of components taking part in this scan.
+		n = get_bits(8);
+
+		m_comps_in_scan = n;
+
+		// Account for the 2 length bytes and the component-count byte already consumed.
+		// (An undersized length wraps around here and is then rejected by the check below.)
+		num_left -= 3;
+
+		if ( (num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN) )
+			stop_decoding(JPGD_BAD_SOS_LENGTH);
+
+		for (i = 0; i < n; i++)
+		{
+			// cc: component id, c: DC table selector (high nibble) / AC table selector (low nibble).
+			cc = get_bits(8);
+			c = get_bits(8);
+			num_left -= 2;
+
+			// Map the component id to its position in the frame header.
+			for (ci = 0; ci < m_comps_in_frame; ci++)
+				if (cc == m_comp_ident[ci])
+					break;
+
+			if (ci >= m_comps_in_frame)
+				stop_decoding(JPGD_BAD_SOS_COMP_ID);
+
+			// AC tables live in the upper half of the table slots (same mapping as read_dht_marker()).
+			const int dc_tab = (c >> 4) & 15;
+			const int ac_tab = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);
+
+			// The selectors come straight from the stream and are later used as array
+			// indices (e.g. into m_pHuff_tabs in decode_next_row()). Reject values that
+			// cannot name a table slot instead of storing them unchecked.
+			if ((dc_tab >= JPGD_MAX_HUFF_TABLES) || (ac_tab >= JPGD_MAX_HUFF_TABLES))
+				stop_decoding(JPGD_DECODE_ERROR);
+
+			m_comp_list[i]    = ci;
+			m_comp_dc_tab[ci] = dc_tab;
+			m_comp_ac_tab[ci] = ac_tab;
+		}
+
+		m_spectral_start  = get_bits(8);
+		m_spectral_end    = get_bits(8);
+		m_successive_high = get_bits(4);
+		m_successive_low  = get_bits(4);
+
+		// For non-progressive images the stream values are ignored and the full 0..63 range is used.
+		if (!m_progressive_flag)
+		{
+			m_spectral_start = 0;
+			m_spectral_end = 63;
+		}
+
+		// The three bytes just read (spectral start, spectral end, successive approximation).
+		num_left -= 3;
+
+		while (num_left)                  /* read past whatever is num_left */
+		{
+			get_bits(8);
+			num_left--;
+		}
+	}
+
+	// Finds the next marker.
+	int jpeg_decoder::next_marker()
+	{
+		uint code;
+
+		for ( ; ; )
+		{
+			// Discard bytes up to and including the first 0xFF.
+			while (get_bits(8) != 0xFF)
+			{
+			}
+
+			// Swallow any further 0xFF bytes; the first different byte is the candidate code.
+			do
+			{
+				code = get_bits(8);
+			} while (code == 0xFF);
+
+			// A zero byte after 0xFF is not accepted as a marker: resume scanning.
+			if (code != 0)
+				break;
+		}
+
+		// Bytes skipped before the 0xFF are not reported to the caller.
+		return code;
+	}
+
+	// Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is
+	// encountered.
+	int jpeg_decoder::process_markers()
+	{
+		for ( ; ; )
+		{
+			const int marker = next_marker();
+
+			switch (marker)
+			{
+			// Frame, image and scan delimiters are handed back to the caller unprocessed.
+			case M_SOF0:  case M_SOF1:  case M_SOF2:  case M_SOF3:
+			case M_SOF5:  case M_SOF6:  case M_SOF7:
+			case M_SOF9:  case M_SOF10: case M_SOF11:
+			case M_SOF13: case M_SOF14: case M_SOF15:
+			case M_SOI:
+			case M_EOI:
+			case M_SOS:
+				return marker;
+
+			// Table and parameter segments are consumed here and scanning continues.
+			case M_DHT:
+				read_dht_marker();
+				break;
+
+			case M_DQT:
+				read_dqt_marker();
+				break;
+
+			case M_DRI:
+				read_dri_marker();
+				break;
+
+			// Arithmetic coding conditioning tables: arithmetic coding is not supported.
+			case M_DAC:
+				stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+				break;
+
+			// Markers that must not show up between segments.
+			case M_JPG:
+			case M_RST0: case M_RST1: case M_RST2: case M_RST3:
+			case M_RST4: case M_RST5: case M_RST6: case M_RST7:
+			case M_TEM:
+				stop_decoding(JPGD_UNEXPECTED_MARKER);
+				break;
+
+			// Everything else (DNL, DHP, EXP, APPn, JPGn, COM, RESn) is skipped using its length field.
+			default:
+				skip_variable_marker();
+				break;
+			}
+		}
+	}
+
+	// Finds the start of image (SOI) marker.
+	// This code is rather defensive: it only checks the first 4096 bytes to avoid
+	// false positives.
+	void jpeg_decoder::locate_soi_marker()
+	{
+		uint prev = get_bits(8);
+		uint cur = get_bits(8);
+
+		// Fast path: the stream starts with 0xFF, SOI and needs no further checks.
+		if ((prev == 0xFF) && (cur == M_SOI))
+			return;
+
+		// Otherwise slide a two-byte window forward, giving up when the counter below reaches zero.
+		uint remaining = 4096; //512;
+		bool found = false;
+
+		while (!found)
+		{
+			if (--remaining == 0)
+				stop_decoding(JPGD_NOT_JPEG);
+
+			prev = cur;
+			cur = get_bits(8);
+
+			if (prev != 0xFF)
+				continue;
+
+			if (cur == M_SOI)
+				found = true;
+			else if (cur == M_EOI) // get_bits will keep returning M_EOI if we read past the end
+				stop_decoding(JPGD_NOT_JPEG);
+		}
+
+		// Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
+		// The byte is peeked from the top of the bit buffer rather than consumed.
+		const uint following = (m_bit_buf >> 24) & 0xFF;
+
+		if (following != 0xFF)
+			stop_decoding(JPGD_NOT_JPEG);
+	}
+
+	// Find a start of frame (SOF) marker.
+	void jpeg_decoder::locate_sof_marker()
+	{
+		locate_soi_marker();
+
+		// Consume table segments until a frame/scan/image delimiter shows up.
+		const int marker = process_markers();
+
+		if (marker == M_SOF9)
+		{
+			/* Arithmitic coding */
+			stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+		}
+		else if ((marker != M_SOF0) && (marker != M_SOF1) && (marker != M_SOF2))
+		{
+			// Any other delimiter at this point is a frame type that is not handled.
+			stop_decoding(JPGD_UNSUPPORTED_MARKER);
+		}
+		else
+		{
+			// SOF0 (baseline DCT), SOF1 (extended sequential DCT) and SOF2 share one
+			// header parser; SOF2 additionally switches the decoder to progressive mode.
+			if (marker == M_SOF2)
+				m_progressive_flag = JPGD_TRUE;
+
+			read_sof_marker();
+		}
+	}
+
+	// Find a start of scan (SOS) marker.
+	int jpeg_decoder::locate_sos_marker()
+	{
+		const int marker = process_markers();
+
+		// End of image: there is no further scan to read.
+		if (marker == M_EOI)
+			return JPGD_FALSE;
+
+		// Anything other than a scan header here is an error.
+		if (marker != M_SOS)
+			stop_decoding(JPGD_UNEXPECTED_MARKER);
+
+		read_sos_marker();
+
+		return JPGD_TRUE;
+	}
+
+	// Reset everything to default/uninitialized state.
+	void jpeg_decoder::init(jpeg_decoder_stream *pStream)
+	{
+		// NOTE(review): the block list head is overwritten without freeing it, and
+		// prep_in_buffer() below can reach stop_decoding()/longjmp - presumably callers
+		// free old blocks and arm m_jmp_state before calling init(); confirm.
+
+		// --- Decoder status and stream binding ---
+		m_pMem_blocks = NULL;
+		m_error_code = JPGD_SUCCESS;
+		m_ready_flag = false;
+		m_image_x_size = m_image_y_size = 0;
+		m_pStream = pStream;
+		m_progressive_flag = JPGD_FALSE;
+
+		// --- Huffman and quantization table slots (NULL == not defined yet) ---
+		memset(m_huff_ac, 0, sizeof(m_huff_ac));
+		memset(m_huff_num, 0, sizeof(m_huff_num));
+		memset(m_huff_val, 0, sizeof(m_huff_val));
+		memset(m_quant, 0, sizeof(m_quant));
+
+		// --- Frame description ---
+		m_scan_type = 0;
+		m_comps_in_frame = 0;
+
+		memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp));
+		memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
+		memset(m_comp_quant, 0, sizeof(m_comp_quant));
+		memset(m_comp_ident, 0, sizeof(m_comp_ident));
+		memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
+		memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
+
+		// --- Scan description ---
+		m_comps_in_scan = 0;
+		memset(m_comp_list, 0, sizeof(m_comp_list));
+		memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
+		memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
+
+		m_spectral_start = 0;
+		m_spectral_end = 0;
+		m_successive_low = 0;
+		m_successive_high = 0;
+
+		// --- MCU geometry ---
+		m_max_mcu_x_size = 0;
+		m_max_mcu_y_size = 0;
+		m_blocks_per_mcu = 0;
+		m_max_blocks_per_row = 0;
+		m_mcus_per_row = 0;
+		m_mcus_per_col = 0;
+		m_expanded_blocks_per_component = 0;
+		m_expanded_blocks_per_mcu = 0;
+		m_expanded_blocks_per_row = 0;
+		m_freq_domain_chroma_upsample = false;
+
+		memset(m_mcu_org, 0, sizeof(m_mcu_org));
+
+		// --- Output bookkeeping ---
+		m_total_lines_left = 0;
+		m_mcu_lines_left = 0;
+		m_real_dest_bytes_per_scan_line = 0;
+		m_dest_bytes_per_scan_line = 0;
+		m_dest_bytes_per_pixel = 0;
+
+		// --- Decoding tables and coefficient buffers ---
+		memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
+
+		memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
+		memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
+		// (This array used to be cleared a second time a few lines further down; once is enough.)
+		memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+
+		m_eob_run = 0;
+
+		// --- Input buffer state ---
+		m_pIn_buf_ofs = m_in_buf;
+		m_in_buf_left = 0;
+		m_eof_flag = false;
+		m_tem_flag = 0;
+
+		memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
+		memset(m_in_buf, 0, sizeof(m_in_buf));
+		memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
+
+		// --- Restart interval state ---
+		m_restart_interval = 0;
+		m_restarts_left    = 0;
+		m_next_restart_num = 0;
+
+		m_max_mcus_per_row = 0;
+		m_max_blocks_per_mcu = 0;
+		m_max_mcus_per_col = 0;
+
+		memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
+		m_pMCU_coefficients = NULL;
+		m_pSample_buf = NULL;
+
+		m_total_bytes_read = 0;
+
+		m_pScan_line_0 = NULL;
+		m_pScan_line_1 = NULL;
+
+		// Ready the input buffer.
+		prep_in_buffer();
+
+		// Prime the bit buffer.
+		m_bits_left = 16;
+		m_bit_buf = 0;
+
+		get_bits(16);
+		get_bits(16);
+
+		// Start every block slot at the maximum extent (all 64 coefficients).
+		for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++)
+			m_mcu_block_max_zag[i] = 64;
+	}
+
+// Fixed-point helpers for the YCbCr -> RGB tables built in create_look_ups().
+// SCALEBITS is the number of fractional bits.
+#define SCALEBITS 16
+// 0.5 in that fixed-point format; added before shifting right so the result is rounded.
+#define ONE_HALF  ((int) 1 << (SCALEBITS-1))
+// Converts a floating-point constant to fixed point, rounding via the +0.5 term.
+#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5f))
+
+	// Create a few tables that allow us to quickly convert YCbCr to RGB.
+	void jpeg_decoder::create_look_ups()
+	{
+		// Fixed-point weights of each chroma channel's contribution to each colour channel.
+		const int cr_to_r = FIX(1.40200f);
+		const int cb_to_b = FIX(1.77200f);
+		const int cr_to_g = -FIX(0.71414f);
+		const int cb_to_g = -FIX(0.34414f);
+
+		// Table slot = chroma sample value; the arithmetic uses the sample re-centred around zero.
+		for (int centered = -128; centered < 128; centered++)
+		{
+			const int slot = centered + 128;
+
+			// Red and blue offsets are rounded and scaled back to integers right here.
+			m_crr[slot] = (cr_to_r * centered + ONE_HALF) >> SCALEBITS;
+			m_cbb[slot] = (cb_to_b * centered + ONE_HALF) >> SCALEBITS;
+
+			// The two green terms stay in fixed point: the converters add them and shift by 16.
+			// The rounding term is folded into the Cb table only, so it is applied once per sum.
+			m_crg[slot] = cr_to_g * centered;
+			m_cbg[slot] = cb_to_g * centered + ONE_HALF;
+		}
+	}
+
+	// This method throws back into the stream any bytes that were read
+	// into the bit buffer during initial marker scanning.
+	void jpeg_decoder::fix_in_buffer()
+	{
+		// In case any 0xFF's were pulled into the buffer during marker scanning.
+		// The bit buffer must hold a whole number of bytes at this point.
+		JPGD_ASSERT((m_bits_left & 7) == 0);
+
+		// Hand the buffered bytes back through stuff_char(), lowest byte of m_bit_buf first.
+		// The two low bytes are only returned when m_bits_left says they are present.
+		if (m_bits_left == 16)
+			stuff_char( (uint8)(m_bit_buf & 0xFF));
+
+		if (m_bits_left >= 8)
+			stuff_char( (uint8)((m_bit_buf >> 8) & 0xFF));
+
+		// The two high bytes are always returned.
+		stuff_char((uint8)((m_bit_buf >> 16) & 0xFF));
+		stuff_char((uint8)((m_bit_buf >> 24) & 0xFF));
+
+		// Re-prime the bit buffer using the reader variant that does not interpret markers.
+		m_bits_left = 16;
+		get_bits_no_markers(16);
+		get_bits_no_markers(16);
+	}
+
+	// Runs the inverse DCT on every block of the current MCU.
+	// Coefficients come from m_pMCU_coefficients (64 per block); the 64 samples of each
+	// block are written into m_pSample_buf at the position of MCU number mcu_row.
+	void jpeg_decoder::transform_mcu(int mcu_row)
+	{
+		jpgd_block_t* const pCoeff_base = m_pMCU_coefficients;
+		uint8* const pSample_base = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64;
+
+		for (int blk = 0; blk < m_blocks_per_mcu; blk++)
+		{
+			// m_mcu_block_max_zag[blk] is passed through so idct can use the block's coefficient extent.
+			idct(pCoeff_base + blk * 64, pSample_base + blk * 64, m_mcu_block_max_zag[blk]);
+		}
+	}
+
+	// Lookup used by transform_mcu_expand(): indexed by (m_mcu_block_max_zag - 1), i.e. 0..63.
+	// Each entry packs two numbers as a*16 + b; the switch in transform_mcu_expand()
+	// matches them against the <a, b> template arguments of DCT_Upsample::P_Q / R_S.
+	// NOTE(review): a and b look like the row/column extent of the non-zero
+	// coefficients for a given zig-zag position - confirm against the DCT_Upsample code.
+	static const uint8 s_max_rc[64] =
+	{
+		17, 18, 34, 50, 50, 51, 52, 52, 52, 68, 84, 84, 84, 84, 85, 86, 86, 86, 86, 86,
+		102, 118, 118, 118, 118, 118, 118, 119, 120, 120, 120, 120, 120, 120, 120, 136,
+		136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
+		136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136
+	};
+
+	// Variant of transform_mcu() used when m_freq_domain_chroma_upsample is set.
+	// The first m_expanded_blocks_per_component blocks get a plain IDCT; each of the
+	// following two blocks is expanded into four output blocks via DCT_Upsample.
+	void jpeg_decoder::transform_mcu_expand(int mcu_row)
+	{
+		jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
+		// Output position of this MCU inside the (expanded) sample buffer.
+		uint8* pDst_ptr = m_pSample_buf + mcu_row * m_expanded_blocks_per_mcu * 64;
+
+		// Y IDCT
+		int mcu_block;
+		for (mcu_block = 0; mcu_block < m_expanded_blocks_per_component; mcu_block++)
+		{
+			idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
+			pSrc_ptr += 64;
+			pDst_ptr += 64;
+		}
+
+		// Chroma IDCT, with upsampling
+		// temp_block is scratch space for one synthesized coefficient block at a time.
+		jpgd_block_t temp_block[64];
+
+		// Two iterations: mcu_block keeps counting from where the loop above stopped.
+		for (int i = 0; i < 2; i++)
+		{
+			DCT_Upsample::Matrix44 P, Q, R, S;
+
+			JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] >= 1);
+			JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] <= 64);
+
+			// Pick the P_Q / R_S specialization from the block's coefficient extent.
+			// Note the post-increment: mcu_block advances to the next block here.
+			switch (s_max_rc[m_mcu_block_max_zag[mcu_block++] - 1])
+			{
+			case 1*16+1:
+				DCT_Upsample::P_Q<1, 1>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<1, 1>::calc(R, S, pSrc_ptr);
+				break;
+			case 1*16+2:
+				DCT_Upsample::P_Q<1, 2>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<1, 2>::calc(R, S, pSrc_ptr);
+				break;
+			case 2*16+2:
+				DCT_Upsample::P_Q<2, 2>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<2, 2>::calc(R, S, pSrc_ptr);
+				break;
+			case 3*16+2:
+				DCT_Upsample::P_Q<3, 2>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<3, 2>::calc(R, S, pSrc_ptr);
+				break;
+			case 3*16+3:
+				DCT_Upsample::P_Q<3, 3>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<3, 3>::calc(R, S, pSrc_ptr);
+				break;
+			case 3*16+4:
+				DCT_Upsample::P_Q<3, 4>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<3, 4>::calc(R, S, pSrc_ptr);
+				break;
+			case 4*16+4:
+				DCT_Upsample::P_Q<4, 4>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<4, 4>::calc(R, S, pSrc_ptr);
+				break;
+			case 5*16+4:
+				DCT_Upsample::P_Q<5, 4>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<5, 4>::calc(R, S, pSrc_ptr);
+				break;
+			case 5*16+5:
+				DCT_Upsample::P_Q<5, 5>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<5, 5>::calc(R, S, pSrc_ptr);
+				break;
+			case 5*16+6:
+				DCT_Upsample::P_Q<5, 6>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<5, 6>::calc(R, S, pSrc_ptr);
+				break;
+			case 6*16+6:
+				DCT_Upsample::P_Q<6, 6>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<6, 6>::calc(R, S, pSrc_ptr);
+				break;
+			case 7*16+6:
+				DCT_Upsample::P_Q<7, 6>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<7, 6>::calc(R, S, pSrc_ptr);
+				break;
+			case 7*16+7:
+				DCT_Upsample::P_Q<7, 7>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<7, 7>::calc(R, S, pSrc_ptr);
+				break;
+			case 7*16+8:
+				DCT_Upsample::P_Q<7, 8>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<7, 8>::calc(R, S, pSrc_ptr);
+				break;
+			case 8*16+8:
+				DCT_Upsample::P_Q<8, 8>::calc(P, Q, pSrc_ptr);
+				DCT_Upsample::R_S<8, 8>::calc(R, S, pSrc_ptr);
+				break;
+			default:
+				// NOTE(review): if assertions compile to nothing, execution continues
+				// below without P, Q, R, S having been filled in by any calc() call.
+				JPGD_ASSERT(false);
+			}
+
+			// a = P + Q, b = P - Q, c = R + S, d = R - S.
+			// b and d reuse P's and R's storage, so P and R must not be read afterwards.
+			DCT_Upsample::Matrix44 a(P + Q); P -= Q;
+			DCT_Upsample::Matrix44& b = P;
+			DCT_Upsample::Matrix44 c(R + S); R -= S;
+			DCT_Upsample::Matrix44& d = R;
+
+			// Four output blocks per source block: a+c, a-c, b+d, b-d, each run through idct_4x4.
+			DCT_Upsample::Matrix44::add_and_store(temp_block, a, c);
+			idct_4x4(temp_block, pDst_ptr);
+			pDst_ptr += 64;
+
+			DCT_Upsample::Matrix44::sub_and_store(temp_block, a, c);
+			idct_4x4(temp_block, pDst_ptr);
+			pDst_ptr += 64;
+
+			DCT_Upsample::Matrix44::add_and_store(temp_block, b, d);
+			idct_4x4(temp_block, pDst_ptr);
+			pDst_ptr += 64;
+
+			DCT_Upsample::Matrix44::sub_and_store(temp_block, b, d);
+			idct_4x4(temp_block, pDst_ptr);
+			pDst_ptr += 64;
+
+			pSrc_ptr += 64;
+		}
+	}
+
+	// Loads and dequantizes the next row of (already decoded) coefficients.
+	// Progressive images only.
+	void jpeg_decoder::load_next_row()
+	{
+		int i;
+		jpgd_block_t *p;
+		jpgd_quant_t *q;
+		// NOTE: row_block is incremented below but its value is never read in this function.
+		int mcu_row, mcu_block, row_block = 0;
+		int component_num, component_id;
+		// Per-component horizontal block position within the current row.
+		int block_x_mcu[JPGD_MAX_COMPONENTS];
+
+		memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int));
+
+		// Despite its name, mcu_row counts MCUs across the row (0 .. m_mcus_per_row-1).
+		for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+		{
+			// Block offsets inside the current MCU for the component being processed.
+			int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
+
+			for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+			{
+				component_id = m_mcu_org[mcu_block];
+				q = m_quant[m_comp_quant[component_id]];
+
+				// Destination: this block's 64 coefficients in the MCU work buffer.
+				p = m_pMCU_coefficients + 64 * mcu_block;
+
+				// DC and AC coefficients are kept in separate buffers; merge them:
+				// element 0 from the DC buffer, elements 1..63 from the AC buffer.
+				jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+				jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+				p[0] = pDC[0];
+				memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_t));
+
+				// Find the last non-zero coefficient in zag order (i stops at 0 if none is found).
+				for (i = 63; i > 0; i--)
+					if (p[g_ZAG[i]])
+						break;
+
+				m_mcu_block_max_zag[mcu_block] = i + 1;
+
+				// Dequantize from that position downwards; zero coefficients are left untouched.
+				for ( ; i >= 0; i--)
+					if (p[g_ZAG[i]])
+						p[g_ZAG[i]] = static_cast<jpgd_block_t>(p[g_ZAG[i]] * q[i]);
+
+				row_block++;
+
+				// Advance to the next block position of this component.
+				if (m_comps_in_scan == 1)
+					block_x_mcu[component_id]++;
+				else
+				{
+					// Step through the component's h_samp x v_samp blocks inside the MCU,
+					// then move its column position on to the next MCU.
+					if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
+					{
+						block_x_mcu_ofs = 0;
+
+						if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
+						{
+							block_y_mcu_ofs = 0;
+
+							block_x_mcu[component_id] += m_comp_h_samp[component_id];
+						}
+					}
+				}
+			}
+
+			// Convert the assembled MCU to samples.
+			if (m_freq_domain_chroma_upsample)
+				transform_mcu_expand(mcu_row);
+			else
+				transform_mcu(mcu_row);
+		}
+
+		// Advance the per-component vertical block position for the next call.
+		if (m_comps_in_scan == 1)
+			m_block_y_mcu[m_comp_list[0]]++;
+		else
+		{
+			for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+			{
+				component_id = m_comp_list[component_num];
+
+				m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+			}
+		}
+	}
+
+	// Restart interval processing.
+	void jpeg_decoder::process_restart()
+	{
+		// Let's scan a little bit to find the marker, but not _too_ far.
+		// 1536 is a "fudge factor" that determines how much to scan; the same
+		// budget is shared by both scanning loops below.
+		int budget = 1536;
+		int code = 0;
+
+		// Align to a byte boundary
+		// FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
+		//get_bits_no_markers(m_bits_left & 7);
+
+		// Phase 1: skip bytes until the first 0xFF.
+		while ((budget > 0) && (get_char() != 0xFF))
+			budget--;
+
+		if (budget == 0)
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Phase 2: skip any run of 0xFF bytes; the first other byte is the marker code.
+		while (budget > 0)
+		{
+			code = get_char();
+			if (code != 0xFF)
+				break;
+
+			budget--;
+		}
+
+		if (budget == 0)
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Is it the expected marker? If not, something bad happened.
+		if (code != (m_next_restart_num + M_RST0))
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Reset each component's DC prediction values.
+		memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+
+		m_eob_run = 0;
+
+		// Start a fresh interval; restart numbers cycle through 0..7.
+		m_restarts_left = m_restart_interval;
+		m_next_restart_num = (m_next_restart_num + 1) & 7;
+
+		// Get the bit buffer going again...
+		m_bits_left = 16;
+		get_bits_no_markers(16);
+		get_bits_no_markers(16);
+	}
+
+	// Scales a decoded coefficient c by its quantization table entry q.
+	static inline int dequantize_ac(int c, int q)
+	{
+		return c * q;
+	}
+
+	// Decodes and dequantizes the next row of coefficients.
+	void jpeg_decoder::decode_next_row()
+	{
+		// NOTE: row_block is incremented below but its value is never read in this function.
+		int row_block = 0;
+
+		// Despite its name, mcu_row counts MCUs across the row (0 .. m_mcus_per_row-1).
+		for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+		{
+			// Restart handling is only active when a non-zero interval has been set.
+			if ((m_restart_interval) && (m_restarts_left == 0))
+				process_restart();
+
+			// p walks the MCU coefficient buffer, 64 coefficients per block.
+			jpgd_block_t* p = m_pMCU_coefficients;
+			for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
+			{
+				int component_id = m_mcu_org[mcu_block];
+				jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
+
+				// DC coefficient: decode a difference and add it to the component's running value.
+				// (huff_decode presumably returns the symbol and hands back the extra bits through
+				// its second argument - confirm against its definition.)
+				int r, s;
+				s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r);
+				s = HUFF_EXTEND(r, s);
+
+				m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]);
+
+				p[0] = static_cast<jpgd_block_t>(s * q[0]);
+
+				// Extent of what was stored in this block slot by the previous MCU. The buffer
+				// is reused, so only positions below this bound can hold stale values that
+				// need explicit zeroing.
+				int prev_num_set = m_mcu_block_max_zag[mcu_block];
+
+				huff_tables *pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
+
+				// AC coefficients, k is the position in zag order.
+				int k;
+				for (k = 1; k < 64; k++)
+				{
+					int extra_bits;
+					s = huff_decode(pH, extra_bits);
+
+					// High nibble: run of zero coefficients, low nibble: size of the value.
+					r = s >> 4;
+					s &= 15;
+
+					if (s)
+					{
+						if (r)
+						{
+							// The run must leave room for the coefficient that follows it.
+							if ((k + r) > 63)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							// Clear the skipped positions, but only where stale data can exist.
+							if (k < prev_num_set)
+							{
+								int n = JPGD_MIN(r, prev_num_set - k);
+								int kt = k;
+								while (n--)
+									p[g_ZAG[kt++]] = 0;
+							}
+
+							k += r;
+						}
+
+						s = HUFF_EXTEND(extra_bits, s);
+
+						JPGD_ASSERT(k < 64);
+
+						// Store the dequantized value at its natural position (g_ZAG maps zag order to it).
+						p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k];
+					}
+					else
+					{
+						if (r == 15)
+						{
+							// Size 0 with run 15: sixteen zero coefficients.
+							if ((k + 16) > 64)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							if (k < prev_num_set)
+							{
+								int n = JPGD_MIN(16, prev_num_set - k);
+								int kt = k;
+								while (n--)
+								{
+									JPGD_ASSERT(kt <= 63);
+									p[g_ZAG[kt++]] = 0;
+								}
+							}
+
+							k += 16 - 1; // - 1 because the loop counter is k
+							// BEGIN EPIC MOD
+							JPGD_ASSERT(k < 64 && p[g_ZAG[k]] == 0);
+							// END EPIC MOD
+						}
+						else
+							// Size 0 with any other run: the rest of the block is zero.
+							break;
+					}
+				}
+
+				// Zero whatever the previous MCU left behind beyond the last position written.
+				if (k < prev_num_set)
+				{
+					int kt = k;
+					while (kt < prev_num_set)
+						p[g_ZAG[kt++]] = 0;
+				}
+
+				// Remember the extent for the IDCT and for the next MCU's cleanup.
+				m_mcu_block_max_zag[mcu_block] = k;
+
+				row_block++;
+			}
+
+			// Convert the decoded MCU to samples.
+			if (m_freq_domain_chroma_upsample)
+				transform_mcu_expand(mcu_row);
+			else
+				transform_mcu(mcu_row);
+
+			// Decremented unconditionally, even when no restart interval is set.
+			m_restarts_left--;
+		}
+	}
+
+	// YCbCr H1V1 (1x1:1:1, 3 blocks per MCU) to RGB
+	void jpeg_decoder::H1V1Convert()
+	{
+		const int row = m_max_mcu_y_size - m_mcu_lines_left;
+
+		// BGRA output swaps the byte positions of red and blue; green and alpha stay put.
+		const int red_at = (jpg_format == ERGBFormatJPG::BGRA) ? 2 : 0;
+		const int blue_at = 2 - red_at;
+
+		uint8 *pOut = m_pScan_line_0;
+		// One 8-sample line of the Y block; Cb and Cr follow at +64 and +128.
+		const uint8 *pSamples = m_pSample_buf + row * 8;
+
+		for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++)
+		{
+			for (int x = 0; x < 8; x++)
+			{
+				const int y = pSamples[x];
+				const int cb = pSamples[64 + x];
+				const int cr = pSamples[128 + x];
+
+				pOut[red_at] = clamp(y + m_crr[cr]);
+				pOut[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
+				pOut[blue_at] = clamp(y + m_cbb[cb]);
+				pOut[3] = 255;
+
+				pOut += 4;
+			}
+
+			// Next MCU: three 64-sample blocks further on.
+			pSamples += 64*3;
+		}
+	}
+
+	// YCbCr H2V1 (2x1:1:1, 4 blocks per MCU) to RGB
+	void jpeg_decoder::H2V1Convert()
+	{
+		const int row = m_max_mcu_y_size - m_mcu_lines_left;
+
+		// BGRA output swaps the byte positions of red and blue; green and alpha stay put.
+		const int red_at = (jpg_format == ERGBFormatJPG::BGRA) ? 2 : 0;
+		const int blue_at = 2 - red_at;
+
+		uint8 *pOut = m_pScan_line_0;
+		// Two Y blocks come first in each MCU, the chroma blocks start at +2*64.
+		uint8 *pLuma = m_pSample_buf + row * 8;
+		uint8 *pChroma = m_pSample_buf + 2*64 + row * 8;
+
+		for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++)
+		{
+			// One pass per Y block of the MCU.
+			for (int luma_block = 0; luma_block < 2; luma_block++)
+			{
+				// Each chroma sample is shared by two horizontally adjacent pixels.
+				for (int pair = 0; pair < 4; pair++)
+				{
+					const int cb = pChroma[0];
+					const int cr = pChroma[64];
+
+					const int rc = m_crr[cr];
+					const int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					const int bc = m_cbb[cb];
+
+					for (int px = 0; px < 2; px++)
+					{
+						const int yy = pLuma[(pair << 1) + px];
+						uint8 *pPixel = pOut + px * 4;
+
+						pPixel[red_at] = clamp(yy+rc);
+						pPixel[1] = clamp(yy+gc);
+						pPixel[blue_at] = clamp(yy+bc);
+						pPixel[3] = 255;
+					}
+
+					pOut += 8;
+					pChroma++;
+				}
+
+				pLuma += 64;
+			}
+
+			// Move both pointers to the same line of the next MCU (4 blocks per MCU).
+			pLuma += 64*4 - 64*2;
+			pChroma += 64*4 - 8;
+		}
+	}
+
+	// YCbCr H1V2 (1x2:1:1, 4 blocks per MCU) to RGB
+	void jpeg_decoder::H1V2Convert()
+	{
+		const int row = m_max_mcu_y_size - m_mcu_lines_left;
+
+		// BGRA output swaps the byte positions of red and blue; green and alpha stay put.
+		const int red_at = (jpg_format == ERGBFormatJPG::BGRA) ? 2 : 0;
+		const int blue_at = 2 - red_at;
+
+		// Two scan lines are produced per call: line 0 and the line directly below it.
+		uint8 *pOut0 = m_pScan_line_0;
+		uint8 *pOut1 = m_pScan_line_1;
+
+		// Rows 0..7 come from the first Y block, rows 8..15 from the second one.
+		uint8 *pLuma = (row < 8) ? (m_pSample_buf + row * 8)
+		                         : (m_pSample_buf + 64*1 + (row & 7) * 8);
+
+		// One chroma line covers two luma lines.
+		uint8 *pChroma = m_pSample_buf + 64*2 + (row >> 1) * 8;
+
+		for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++)
+		{
+			for (int x = 0; x < 8; x++)
+			{
+				const int cb = pChroma[0+x];
+				const int cr = pChroma[64+x];
+
+				const int rc = m_crr[cr];
+				const int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+				const int bc = m_cbb[cb];
+
+				// Upper pixel.
+				const int y_upper = pLuma[x];
+				pOut0[red_at] = clamp(y_upper+rc);
+				pOut0[1] = clamp(y_upper+gc);
+				pOut0[blue_at] = clamp(y_upper+bc);
+				pOut0[3] = 255;
+
+				// Lower pixel: same chroma, luma from the next line of the block.
+				const int y_lower = pLuma[8+x];
+				pOut1[red_at] = clamp(y_lower+rc);
+				pOut1[1] = clamp(y_lower+gc);
+				pOut1[blue_at] = clamp(y_lower+bc);
+				pOut1[3] = 255;
+
+				pOut0 += 4;
+				pOut1 += 4;
+			}
+
+			// Next MCU (4 blocks per MCU).
+			pLuma += 64*4;
+			pChroma += 64*4;
+		}
+	}
+
+	// YCbCr H2V2 (2x2:1:1, 6 blocks per MCU) to RGB
+	void jpeg_decoder::H2V2Convert()
+	{
+		const int row = m_max_mcu_y_size - m_mcu_lines_left;
+
+		// BGRA output swaps the byte positions of red and blue; green and alpha stay put.
+		const int red_at = (jpg_format == ERGBFormatJPG::BGRA) ? 2 : 0;
+		const int blue_at = 2 - red_at;
+
+		// Two scan lines are produced per call.
+		uint8 *pOut0 = m_pScan_line_0;
+		uint8 *pOut1 = m_pScan_line_1;
+
+		// Rows 0..7 come from the upper pair of Y blocks, rows 8..15 from the lower pair.
+		uint8 *pLuma = (row < 8) ? (m_pSample_buf + row * 8)
+		                         : (m_pSample_buf + 64*2 + (row & 7) * 8);
+
+		// Chroma blocks follow the four Y blocks; one chroma line covers two luma lines.
+		uint8 *pChroma = m_pSample_buf + 64*4 + (row >> 1) * 8;
+
+		for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++)
+		{
+			// One pass per horizontally adjacent Y block.
+			for (int luma_block = 0; luma_block < 2; luma_block++)
+			{
+				// Each chroma sample is shared by a 2x2 group of pixels.
+				for (int x = 0; x < 8; x += 2)
+				{
+					const int cb = pChroma[0];
+					const int cr = pChroma[64];
+
+					const int rc = m_crr[cr];
+					const int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					const int bc = m_cbb[cb];
+
+					for (int px = 0; px < 2; px++)
+					{
+						uint8 *pUpper = pOut0 + px * 4;
+						uint8 *pLower = pOut1 + px * 4;
+
+						const int y_upper = pLuma[x + px];
+						pUpper[red_at] = clamp(y_upper+rc);
+						pUpper[1] = clamp(y_upper+gc);
+						pUpper[blue_at] = clamp(y_upper+bc);
+						pUpper[3] = 255;
+
+						const int y_lower = pLuma[x + 8 + px];
+						pLower[red_at] = clamp(y_lower+rc);
+						pLower[1] = clamp(y_lower+gc);
+						pLower[blue_at] = clamp(y_lower+bc);
+						pLower[3] = 255;
+					}
+
+					pOut0 += 8;
+					pOut1 += 8;
+
+					pChroma++;
+				}
+
+				pLuma += 64;
+			}
+
+			// Move both pointers to the same line of the next MCU (6 blocks per MCU).
+			pLuma += 64*6 - 64*2;
+			pChroma += 64*6 - 8;
+		}
+	}
+
+	// Y (1 block per MCU) to 8-bit grayscale
+	void jpeg_decoder::gray_convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8 *d = m_pScan_line_0;
+		// One 8-sample line of the current row inside each 64-sample block.
+		uint8 *s = m_pSample_buf + row * 8;
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			// Copy the 8 samples of this block line. memcpy replaces the former pair of
+			// *(uint *) stores, which accessed uint8 storage through a uint lvalue
+			// (a strict-aliasing violation that also assumed suitably aligned pointers).
+			memcpy(d, s, 8);
+
+			// Next block in the sample buffer, next 8 pixels in the output line.
+			s += 64;
+			d += 8;
+		}
+	}
+
+	// Colour conversion used when m_freq_domain_chroma_upsample is set: Y, Cb and Cr
+	// are all read at the same resolution from the expanded sample buffer and written
+	// to m_pScan_line_0 as 4 bytes per pixel (alpha forced to 255).
+	void jpeg_decoder::expanded_convert()
+	{
+		const int row = m_max_mcu_y_size - m_mcu_lines_left;
+
+		// BGRA output swaps the byte positions of red and blue; green and alpha stay put.
+		const int red_at = (jpg_format == ERGBFormatJPG::BGRA) ? 2 : 0;
+		const int blue_at = 2 - red_at;
+
+		// Distance from a Y sample to the matching Cb sample; Cr is twice as far away.
+		const int cb_plane_ofs = 64 * m_expanded_blocks_per_component;
+		const int cr_plane_ofs = 64 * m_expanded_blocks_per_component * 2;
+
+		// Start of the requested line inside the first MCU's Y blocks.
+		uint8* pMcu = m_pSample_buf + (row / 8) * 64 * m_comp_h_samp[0] + (row & 7) * 8;
+
+		uint8* pOut = m_pScan_line_0;
+
+		for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++)
+		{
+			// Walk the MCU one 8-pixel-wide block at a time (k * 8 samples == one block per 8 pixels).
+			for (int x0 = 0; x0 < m_max_mcu_x_size; x0 += 8)
+			{
+				const uint8* pY = pMcu + x0 * 8;
+				const uint8* pCb = pY + cb_plane_ofs;
+				const uint8* pCr = pY + cr_plane_ofs;
+
+				for (int x = 0; x < 8; x++)
+				{
+					const int y = pY[x];
+					const int cb = pCb[x];
+					const int cr = pCr[x];
+
+					pOut[red_at] = clamp(y + m_crr[cr]);
+					pOut[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
+					pOut[blue_at] = clamp(y + m_cbb[cb]);
+					pOut[3] = 255;
+
+					pOut += 4;
+				}
+			}
+
+			pMcu += 64 * m_expanded_blocks_per_mcu;
+		}
+	}
+
+	// Find end of image (EOI) marker, so we can return to the user the exact size of the input stream.
+	void jpeg_decoder::find_eoi()
+	{
+		if (!m_progressive_flag) // progressive images skip the marker search and only get the byte-count adjustment below
+		{
+			// Attempt to read the EOI marker.
+			//get_bits_no_markers(m_bits_left & 7);
+
+			// Prime the bit buffer
+			m_bits_left = 16;
+			get_bits(16);
+			get_bits(16);
+
+			// The next marker _should_ be EOI
+			process_markers();
+		}
+		// Do not count the bytes that are still waiting in the input buffer (m_in_buf_left) as read.
+		m_total_bytes_read -= m_in_buf_left;
+	}
+
+	int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len)
+	{	// Hands out the next scan line. JPGD_SUCCESS: *pScan_line / *pScan_line_len describe it; JPGD_DONE: no lines are left; JPGD_FAILED: error or decoder not ready.
+		if ((m_error_code) || (!m_ready_flag))
+			return JPGD_FAILED;
+
+		if (m_total_lines_left == 0)
+			return JPGD_DONE;
+		// Every line of the current MCU row has been handed out: decode the next MCU row first.
+		if (m_mcu_lines_left == 0)
+		{
+			if (setjmp(m_jmp_state)) // non-zero when re-entered through a longjmp on m_jmp_state (presumably from stop_decoding() -- confirm)
+				return JPGD_FAILED;
+
+			if (m_progressive_flag)
+				load_next_row();
+			else
+				decode_next_row();
+
+			// Find the EOI marker if that was the last row.
+			if (m_total_lines_left <= m_max_mcu_y_size)
+				find_eoi();
+
+			m_mcu_lines_left = m_max_mcu_y_size;
+		}
+		// Convert one line of the MCU row to output pixels; the converter depends on the sampling layout.
+		if (m_freq_domain_chroma_upsample)
+		{
+			expanded_convert();
+			*pScan_line = m_pScan_line_0;
+		}
+		else
+		{
+			switch (m_scan_type)
+			{
+			case JPGD_YH2V2:
+				{
+					if ((m_mcu_lines_left & 1) == 0) // even count: run the converter; odd count: hand out scan line 1 (presumably filled by the preceding H2V2Convert() call)
+					{
+						H2V2Convert();
+						*pScan_line = m_pScan_line_0;
+					}
+					else
+						*pScan_line = m_pScan_line_1;
+
+					break;
+				}
+			case JPGD_YH2V1:
+				{
+					H2V1Convert();
+					*pScan_line = m_pScan_line_0;
+					break;
+				}
+			case JPGD_YH1V2:
+				{
+					if ((m_mcu_lines_left & 1) == 0) // same two-call pattern as JPGD_YH2V2 above
+					{
+						H1V2Convert();
+						*pScan_line = m_pScan_line_0;
+					}
+					else
+						*pScan_line = m_pScan_line_1;
+
+					break;
+				}
+			case JPGD_YH1V1:
+				{
+					H1V1Convert();
+					*pScan_line = m_pScan_line_0;
+					break;
+				}
+			case JPGD_GRAYSCALE:
+				{
+					gray_convert();
+					*pScan_line = m_pScan_line_0;
+
+					break;
+				}
+			}
+		}
+		// The reported length is m_real_dest_bytes_per_scan_line, not the padded size of the buffer.
+		*pScan_line_len = m_real_dest_bytes_per_scan_line;
+
+		m_mcu_lines_left--;
+		m_total_lines_left--;
+
+		return JPGD_SUCCESS;
+	}
+
+	// Creates the tables needed for efficient Huffman decoding: fills *pH from m_huff_num[index] (number of codes per length) and m_huff_val[index] (symbols in code order).
+	void jpeg_decoder::make_huff_table(int index, huff_tables *pH)
+	{
+		int p, i, l, si;
+		uint8 huffsize[257]; // length of each code, zero-terminated; NOTE(review): assumes at most 256 codes in total -- confirm the table parser enforces this
+		uint huffcode[257]; // value of each code
+		uint code;
+		uint subtree;
+		int code_size;
+		int lastp;
+		int nextfreeentry;
+		int currententry;
+
+		pH->ac_table = m_huff_ac[index] != 0;
+
+		p = 0;
+		// Expand the per-length counts into one length entry per code, shortest codes first.
+		for (l = 1; l <= 16; l++)
+		{
+			for (i = 1; i <= m_huff_num[index][l]; i++)
+				huffsize[p++] = static_cast<uint8>(l);
+		}
+
+		huffsize[p] = 0;
+
+		lastp = p;
+		// Assign code values: consecutive numbers within one length, shifted left by one bit each time the length grows.
+		code = 0;
+		si = huffsize[0];
+		p = 0;
+
+		while (huffsize[p])
+		{
+			while (huffsize[p] == si)
+			{
+				huffcode[p++] = code;
+				code++;
+			}
+
+			code <<= 1;
+			si++;
+		}
+
+		memset(pH->look_up, 0, sizeof(pH->look_up));
+		memset(pH->look_up2, 0, sizeof(pH->look_up2));
+		memset(pH->tree, 0, sizeof(pH->tree));
+		memset(pH->code_size, 0, sizeof(pH->code_size));
+		// Tree nodes are referred to by negative numbers; each new node takes two slots of pH->tree (see the -= 2 steps).
+		nextfreeentry = -1;
+
+		p = 0;
+		// Insert every code into the lookup structures.
+		while (p < lastp)
+		{
+			i = m_huff_val[index][p];
+			code = huffcode[p];
+			code_size = huffsize[p];
+
+			pH->code_size[i] = static_cast<uint8>(code_size);
+			// Codes of at most 8 bits go straight into the tables indexed by 8 bits.
+			if (code_size <= 8)
+			{
+				code <<= (8 - code_size);
+				// Fill every 8-bit index that starts with this code.
+				for (l = 1 << (8 - code_size); l > 0; l--)
+				{
+					JPGD_ASSERT(i < 256);
+
+					pH->look_up[code] = i;
+
+					bool has_extrabits = false;
+					int extra_bits = 0;
+					int num_extra_bits = i & 15;
+					// When the code plus its extra bits fit into the 8-bit index, the extra bits are stored in look_up2 as well.
+					int bits_to_fetch = code_size;
+					if (num_extra_bits)
+					{
+						int total_codesize = code_size + num_extra_bits;
+						if (total_codesize <= 8)
+						{
+							has_extrabits = true;
+							extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
+							JPGD_ASSERT(extra_bits <= 0x7FFF);
+							bits_to_fetch += num_extra_bits;
+						}
+					}
+					// look_up2 layout: bits 0-7 symbol, bits 8 and up the bit count, 0x8000 flags stored extra bits, bits 16 and up hold them.
+					if (!has_extrabits)
+						pH->look_up2[code] = i | (bits_to_fetch << 8);
+					else
+						pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
+
+					code++;
+				}
+			}
+			else
+			{
+				subtree = (code >> (code_size - 8)) & 0xFF; // the first 8 bits of the code select the table entry
+
+				currententry = pH->look_up[subtree];
+				// An entry that is still zero gets a fresh tree node.
+				if (currententry == 0)
+				{
+					pH->look_up[subtree] = currententry = nextfreeentry;
+					pH->look_up2[subtree] = currententry = nextfreeentry;
+
+					nextfreeentry -= 2;
+				}
+				// Move the bits after the first 8 to the top of a 16-bit window so that 0x8000 tests the next bit.
+				code <<= (16 - (code_size - 8));
+				// Walk or extend the tree for every remaining bit except the last one.
+				for (l = code_size; l > 9; l--)
+				{
+					if ((code & 0x8000) == 0) // a 0 bit selects the second slot of the node's pair
+						currententry--;
+					// NOTE(review): the index -currententry - 1 is not range-checked against pH->tree here -- confirm the tables are validated upstream.
+					if (pH->tree[-currententry - 1] == 0)
+					{
+						pH->tree[-currententry - 1] = nextfreeentry;
+
+						currententry = nextfreeentry;
+
+						nextfreeentry -= 2;
+					}
+					else
+						currententry = pH->tree[-currententry - 1];
+
+					code <<= 1;
+				}
+				// The last bit selects the slot that stores the symbol itself.
+				if ((code & 0x8000) == 0)
+					currententry--;
+
+				pH->tree[-currententry - 1] = i;
+			}
+
+			p++;
+		}
+	}
+
+	// Stops decoding with JPGD_UNDEFINED_QUANT_TABLE if a component of the current scan refers to a quantization table that is not set.
+	void jpeg_decoder::check_quant_tables()
+	{
+		for (int scan_idx = 0; scan_idx < m_comps_in_scan; scan_idx++)
+			if (!m_quant[m_comp_quant[m_comp_list[scan_idx]]])
+				stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
+	}
+
+	// Checks that every component of the scan has the DC / AC Huffman tables it needs, then (re)builds the decode table of every defined Huffman table.
+	void jpeg_decoder::check_huff_tables()
+	{
+		for (int n = 0; n < m_comps_in_scan; n++)
+		{
+			const int comp = m_comp_list[n];
+			if ((m_spectral_start == 0) && !m_huff_num[m_comp_dc_tab[comp]])
+				stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+			if ((m_spectral_end > 0) && !m_huff_num[m_comp_ac_tab[comp]])
+				stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+		}
+
+		for (int tab = 0; tab < JPGD_MAX_HUFF_TABLES; tab++)
+		{
+			if (!m_huff_num[tab])
+				continue;
+			if (m_pHuff_tabs[tab] == NULL)
+				m_pHuff_tabs[tab] = (huff_tables *)alloc(sizeof(huff_tables));
+			make_huff_table(tab, m_pHuff_tabs[tab]);
+		}
+	}
+
+	// Determines the component order inside each MCU.
+	// Also calcs how many MCU's are on each row, etc.
+	// Results are stored in m_comp_h_blocks / m_comp_v_blocks, m_mcus_per_row / m_mcus_per_col,
+	// m_blocks_per_mcu and m_mcu_org (the component that owns each block of an MCU).
+	void jpeg_decoder::calc_mcu_block_order()
+	{
+		int widest_h = 0, widest_v = 0;
+
+		// Largest horizontal and vertical sampling factors
+		// among the components of the frame.
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+		{
+			widest_h = JPGD_MAX(widest_h, m_comp_h_samp[comp]);
+			widest_v = JPGD_MAX(widest_v, m_comp_v_samp[comp]);
+		}
+
+		// Size of every component in 8x8 blocks: scale the image size by the component's share of
+		// the largest sampling factor (rounding up), then round up to whole blocks.
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+		{
+			m_comp_h_blocks[comp] = ((((m_image_x_size * m_comp_h_samp[comp]) + (widest_h - 1)) / widest_h) + 7) / 8;
+			m_comp_v_blocks[comp] = ((((m_image_y_size * m_comp_v_samp[comp]) + (widest_v - 1)) / widest_v) + 7) / 8;
+		}
+
+		if (m_comps_in_scan == 1)
+		{
+			// Scan with a single component: one block per MCU, so the MCU grid
+			// is that component's block grid.
+			const int only_comp = m_comp_list[0];
+
+			m_mcus_per_row = m_comp_h_blocks[only_comp];
+			m_mcus_per_col = m_comp_v_blocks[only_comp];
+
+			m_mcu_org[0] = only_comp;
+			m_blocks_per_mcu = 1;
+
+			return;
+		}
+
+		// Scan with several components: an MCU covers widest_h x widest_v blocks of the image.
+		m_mcus_per_row = (((m_image_x_size + 7) / 8) + (widest_h - 1)) / widest_h;
+		m_mcus_per_col = (((m_image_y_size + 7) / 8) + (widest_v - 1)) / widest_v;
+
+		// Every scan component contributes h_samp * v_samp consecutive blocks to the MCU,
+		// in the order the components are listed for the scan.
+		m_blocks_per_mcu = 0;
+
+		for (int scan_idx = 0; scan_idx < m_comps_in_scan; scan_idx++)
+		{
+			const int comp = m_comp_list[scan_idx];
+
+			// Counts down to exactly zero, one m_mcu_org entry per block.
+			for (int remaining = m_comp_h_samp[comp] * m_comp_v_samp[comp]; remaining != 0; remaining--)
+				m_mcu_org[m_blocks_per_mcu++] = comp;
+		}
+	}
+
+	// Starts a new scan. Returns JPGD_FALSE when locate_sos_marker() returns zero, JPGD_TRUE once the scan state is set up.
+	int jpeg_decoder::init_scan()
+	{
+		if (!locate_sos_marker())
+			return JPGD_FALSE;
+		// Derive the MCU layout from the components listed for this scan.
+		calc_mcu_block_order();
+		// Make sure the needed Huffman tables are defined and build their decode tables.
+		check_huff_tables();
+		// Make sure the needed quantization tables are defined.
+		check_quant_tables();
+		// Reset the running DC value of every frame component.
+		memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+		// No end-of-band run is pending at the start of a scan.
+		m_eob_run = 0;
+
+		if (m_restart_interval)
+		{
+			m_restarts_left = m_restart_interval;
+			m_next_restart_num = 0;
+		}
+
+		fix_in_buffer();
+
+		return JPGD_TRUE;
+	}
+
+	// Starts a frame. Determines if the number of components or sampling factors
+	// are supported.
+	void jpeg_decoder::init_frame()
+	{
+		int i;
+		// Classify the frame by component count and sampling factors; every other combination ends in stop_decoding().
+		if (m_comps_in_frame == 1)
+		{
+			if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1))
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+
+			m_scan_type = JPGD_GRAYSCALE;
+			m_max_blocks_per_mcu = 1;
+			m_max_mcu_x_size = 8;
+			m_max_mcu_y_size = 8;
+		}
+		else if (m_comps_in_frame == 3)
+		{
+			if ( ((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) ||
+				((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)) )
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+			// Components 1 and 2 must be sampled 1x1 (checked above); the factors of component 0 select the scan type and MCU size.
+			if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1))
+			{
+				m_scan_type = JPGD_YH1V1;
+
+				m_max_blocks_per_mcu = 3;
+				m_max_mcu_x_size = 8;
+				m_max_mcu_y_size = 8;
+			}
+			else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1))
+			{
+				m_scan_type = JPGD_YH2V1;
+				m_max_blocks_per_mcu = 4;
+				m_max_mcu_x_size = 16;
+				m_max_mcu_y_size = 8;
+			}
+			else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2))
+			{
+				m_scan_type = JPGD_YH1V2;
+				m_max_blocks_per_mcu = 4;
+				m_max_mcu_x_size = 8;
+				m_max_mcu_y_size = 16;
+			}
+			else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2))
+			{
+				m_scan_type = JPGD_YH2V2;
+				m_max_blocks_per_mcu = 6;
+				m_max_mcu_x_size = 16;
+				m_max_mcu_y_size = 16;
+			}
+			else
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+		}
+		else
+			stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+		// Number of MCUs needed to cover the image in each direction, rounding up.
+		m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
+		m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
+
+		// These values are for the *destination* pixels: after conversion.
+		if (m_scan_type == JPGD_GRAYSCALE)
+			m_dest_bytes_per_pixel = 1;
+		else
+			m_dest_bytes_per_pixel = 4;
+		// Buffer row size: width rounded up to a multiple of 16 pixels. NOTE(review): the 0xFFF0 mask also drops bits above bit 15 -- assumes the width stays below 65536.
+		m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;
+		// Unpadded row size; decode() reports this value as the scan line length.
+		m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
+
+		// Initialize two scan line buffers.
+		m_pScan_line_0 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
+		if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2)) // only these scan types hand out m_pScan_line_1 in decode()
+			m_pScan_line_1 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
+
+		m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
+
+		// Should never happen
+		if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
+			stop_decoding(JPGD_ASSERTION_ERROR);
+
+		// Allocate the coefficient buffer, enough for one MCU
+		m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t));
+
+		for (i = 0; i < m_max_blocks_per_mcu; i++)
+			m_mcu_block_max_zag[i] = 64;
+		// Layout for the expanded sample buffer: every component gets as many blocks per MCU as component 0.
+		m_expanded_blocks_per_component = m_comp_h_samp[0] * m_comp_v_samp[0];
+		m_expanded_blocks_per_mcu = m_expanded_blocks_per_component * m_comps_in_frame;
+		m_expanded_blocks_per_row = m_max_mcus_per_row * m_expanded_blocks_per_mcu;
+		// Freq. domain chroma upsampling is only supported for H2V2 subsampling factor.
+// BEGIN EPIC MOD
+#if JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING
+		m_freq_domain_chroma_upsample = (m_expanded_blocks_per_mcu == 4*3); // 4 blocks per component times 3 components
+#else
+		m_freq_domain_chroma_upsample = 0;
+#endif
+// END EPIC MOD
+
+		if (m_freq_domain_chroma_upsample)
+			m_pSample_buf = (uint8 *)alloc(m_expanded_blocks_per_row * 64);
+		else
+			m_pSample_buf = (uint8 *)alloc(m_max_blocks_per_row * 64);
+		// decode() counts this down by one for every line it hands out.
+		m_total_lines_left = m_image_y_size;
+		// Zero makes the first decode() call decode an MCU row before converting.
+		m_mcu_lines_left = 0;
+
+		create_look_ups();
+	}
+
+	// The coeff_buf series of methods originally stored the coefficients
+	// into a "virtual" file which was located in EMS, XMS, or a disk file. A cache
+	// was used to make this process more efficient. Now, we can store the entire
+	// thing in RAM.
+	jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
+	{	// Allocates a grid of block_num_x by block_num_y coefficient blocks, each holding block_len_x * block_len_y values.
+		coeff_buf* pBuf = (coeff_buf*)alloc(sizeof(coeff_buf));
+		pBuf->block_len_x = block_len_x;
+		pBuf->block_len_y = block_len_y;
+		pBuf->block_num_x = block_num_x;
+		pBuf->block_num_y = block_num_y;
+		pBuf->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t);
+		// The data area passes true as second alloc() argument (presumably a request for zeroed memory -- confirm).
+		pBuf->pData = (uint8 *)alloc(pBuf->block_size * block_num_x * block_num_y, true);
+		return pBuf;
+	}
+
+	inline jpgd_block_t *jpeg_decoder::coeff_buf_getp(coeff_buf *cb, int block_x, int block_y)
+	{	// Address of block (block_x, block_y): rows of block_num_x blocks, each block block_size bytes long.
+		JPGD_ASSERT((block_x < cb->block_num_x) && (block_y < cb->block_num_y));
+		return (jpgd_block_t *)(cb->pData + block_y * (cb->block_size * cb->block_num_x) + block_x * cb->block_size);
+	}
+
+	// The following methods decode the various types of blocks encountered
+	// in progressively encoded images.
+	void jpeg_decoder::decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+	{	// First DC scan of a progressive image: decodes one block's DC value and stores it scaled by 2^m_successive_low.
+		int s, r;
+		jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+		// The Huffman symbol is the number of bits of the DC difference; 0 means the difference is zero.
+		if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0)
+		{
+			r = pD->get_bits_no_markers(s);
+			s = HUFF_EXTEND(r, s);
+		}
+		// The difference is added to the component's running DC value.
+		pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
+		// Multiply instead of shifting: s may be negative, and left-shifting a negative value is undefined before C++20.
+		p[0] = static_cast<jpgd_block_t>(s * (1 << pD->m_successive_low));
+	}
+
+	void jpeg_decoder::decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+	{	// DC refinement scan: one bit per block; a 1 sets bit m_successive_low of the stored DC coefficient.
+		if (!pD->get_bits_no_markers(1))
+			return;
+
+		coeff_buf *dc_buf = pD->m_dc_coeffs[component_id];
+		jpgd_block_t *pCoeff = pD->coeff_buf_getp(dc_buf, block_x, block_y);
+		pCoeff[0] |= (1 << pD->m_successive_low);
+	}
+
+	void jpeg_decoder::decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+	{	// First AC scan of a progressive image: decodes coefficients m_spectral_start..m_spectral_end of one block.
+		int k, s, r;
+		// A pending end-of-band run covers this block: nothing is coded for it.
+		if (pD->m_eob_run)
+		{
+			pD->m_eob_run--;
+			return;
+		}
+
+		jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+		for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++)
+		{
+			s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
+			// High nibble: run length r; low nibble: number of bits s of the coefficient.
+			r = s >> 4;
+			s &= 15;
+
+			if (s)
+			{
+				if ((k += r) > 63)
+					pD->stop_decoding(JPGD_DECODE_ERROR);
+
+				r = pD->get_bits_no_markers(s);
+				s = HUFF_EXTEND(r, s);
+				// Multiply instead of shifting: s may be negative, and left-shifting a negative value is undefined before C++20.
+				p[g_ZAG[k]] = static_cast<jpgd_block_t>(s * (1 << pD->m_successive_low));
+			}
+			else
+			{
+				if (r == 15)
+				{
+					if ((k += 15) > 63)
+						pD->stop_decoding(JPGD_DECODE_ERROR);
+				}
+				else
+				{
+					pD->m_eob_run = 1 << r;
+
+					if (r)
+						pD->m_eob_run += pD->get_bits_no_markers(r);
+					// The run includes the current block, which ends here.
+					pD->m_eob_run--;
+
+					break;
+				}
+			}
+		}
+	}
+
+	void jpeg_decoder::decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+	{	// AC refinement scan of a progressive image: refines coefficients m_spectral_start..m_spectral_end of one block at bit m_successive_low.
+		int s, k, r;
+		int p1 = 1 << pD->m_successive_low; // +1 in the bit position being refined
+		int m1 = -p1; // -1 in that position; written as a negation because (-1) << n is undefined before C++20
+		jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+		k = pD->m_spectral_start;
+		// No end-of-band run is pending: decode symbols for this block.
+		if (pD->m_eob_run == 0)
+		{
+			for ( ; k <= pD->m_spectral_end; k++)
+			{
+				s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
+				// High nibble: number of zero coefficients to pass over; low nibble: 0, or 1 for a new coefficient.
+				r = s >> 4;
+				s &= 15;
+
+				if (s)
+				{
+					if (s != 1)
+						pD->stop_decoding(JPGD_DECODE_ERROR);
+					// One sign bit: 1 gives +p1, 0 gives m1.
+					if (pD->get_bits_no_markers(1))
+						s = p1;
+					else
+						s = m1;
+				}
+				else
+				{
+					if (r != 15) // r == 15 starts no run; the loop below then passes over zero coefficients
+					{
+						pD->m_eob_run = 1 << r;
+
+						if (r)
+							pD->m_eob_run += pD->get_bits_no_markers(r);
+
+						break;
+					}
+				}
+				// Walk forward: refine coefficients that are already non-zero and count off r zero ones.
+				do
+				{
+					// BEGIN EPIC MOD
+					JPGD_ASSERT(k < 64);
+					// END EPIC MOD
+
+					jpgd_block_t *this_coef = p + g_ZAG[k];
+					// Already non-zero: a correction bit of 1 moves the value one step away from zero unless bit p1 is already set.
+					if (*this_coef != 0)
+					{
+						if (pD->get_bits_no_markers(1))
+						{
+							if ((*this_coef & p1) == 0)
+							{
+								if (*this_coef >= 0)
+									*this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+								else
+									*this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+							}
+						}
+					}
+					else
+					{
+						if (--r < 0)
+							break;
+					}
+
+					k++;
+
+				} while (k <= pD->m_spectral_end);
+				// Store the new coefficient at the position where the walk stopped.
+				if ((s) && (k < 64))
+				{
+					p[g_ZAG[k]] = static_cast<jpgd_block_t>(s);
+				}
+			}
+		}
+		// Inside an end-of-band run only correction bits for already non-zero coefficients are read.
+		if (pD->m_eob_run > 0)
+		{
+			for ( ; k <= pD->m_spectral_end; k++)
+			{
+				// BEGIN EPIC MOD
+				JPGD_ASSERT(k < 64);
+				// END EPIC MOD
+
+				jpgd_block_t *this_coef = p + g_ZAG[k];
+
+				if (*this_coef != 0)
+				{
+					if (pD->get_bits_no_markers(1))
+					{
+						if ((*this_coef & p1) == 0)
+						{
+							if (*this_coef >= 0)
+								*this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+							else
+								*this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+						}
+					}
+				}
+			}
+
+			pD->m_eob_run--;
+		}
+	}
+
+	// Decode a scan in a progressively encoded image: calls decode_block_func once for every block of every MCU of the scan.
+	void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
+	{
+		int mcu_row, mcu_col, mcu_block;
+		int block_x_mcu[JPGD_MAX_COMPONENTS], m_block_y_mcu[JPGD_MAX_COMPONENTS]; // block coordinates of the current MCU, per component
+		// NOTE: despite its m_ prefix, m_block_y_mcu is a local array.
+		memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+		// mcu_col walks the MCU rows (m_mcus_per_col of them); mcu_row walks the MCUs inside one row.
+		for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++)
+		{
+			int component_num, component_id;
+
+			memset(block_x_mcu, 0, sizeof(block_x_mcu));
+
+			for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+			{
+				int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0; // position of the current block inside the MCU
+
+				if ((m_restart_interval) && (m_restarts_left == 0))
+					process_restart();
+
+				for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+				{
+					component_id = m_mcu_org[mcu_block];
+
+					decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+					// Advance to the next block: one step right for a single-component scan, otherwise through the component's h_samp x v_samp blocks row by row.
+					if (m_comps_in_scan == 1)
+						block_x_mcu[component_id]++;
+					else
+					{
+						if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
+						{
+							block_x_mcu_ofs = 0;
+
+							if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
+							{
+								block_y_mcu_ofs = 0;
+								block_x_mcu[component_id] += m_comp_h_samp[component_id];
+							}
+						}
+					}
+				}
+
+				m_restarts_left--;
+			}
+			// Move each scanned component down by the number of block rows one MCU row covers.
+			if (m_comps_in_scan == 1)
+				m_block_y_mcu[m_comp_list[0]]++;
+			else
+			{
+				for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+				{
+					component_id = m_comp_list[component_num];
+					m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+				}
+			}
+		}
+	}
+
+	// Decode a progressively encoded image: runs every scan and collects the results in the per-component DC and AC coefficient buffers.
+	void jpeg_decoder::init_progressive()
+	{
+		int i;
+
+		if (m_comps_in_frame == 4)
+			stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+		// Allocate the coefficient buffers.
+		for (i = 0; i < m_comps_in_frame; i++)
+		{
+			m_dc_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 1, 1);
+			m_ac_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 8, 8);
+		}
+		// One pass per scan, until init_scan() returns zero.
+		for ( ; ; )
+		{
+			int dc_only_scan, refinement_scan;
+			pDecode_block_func decode_block_func;
+
+			if (!init_scan())
+				break;
+
+			dc_only_scan = (m_spectral_start == 0); // a spectral selection that starts at 0 is treated as a DC scan
+			refinement_scan = (m_successive_high != 0); // a non-zero m_successive_high is treated as a refinement scan
+			// Validate the spectral selection and successive approximation parameters of the scan.
+			if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63))
+				stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+			if (dc_only_scan)
+			{
+				if (m_spectral_end)
+					stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+			}
+			else if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
+				stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+			if ((refinement_scan) && (m_successive_low != m_successive_high - 1))
+				stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
+			// Pick the block decoder that matches the kind of scan.
+			if (dc_only_scan)
+			{
+				if (refinement_scan)
+					decode_block_func = decode_block_dc_refine;
+				else
+					decode_block_func = decode_block_dc_first;
+			}
+			else
+			{
+				if (refinement_scan)
+					decode_block_func = decode_block_ac_refine;
+				else
+					decode_block_func = decode_block_ac_first;
+			}
+
+			decode_scan(decode_block_func);
+			// Same bit buffer priming sequence as in find_eoi().
+			m_bits_left = 16;
+			get_bits(16);
+			get_bits(16);
+		}
+		// From here on all frame components are treated as one scan, and the MCU layout is recomputed for it.
+		m_comps_in_scan = m_comps_in_frame;
+
+		for (i = 0; i < m_comps_in_frame; i++)
+			m_comp_list[i] = i;
+
+		calc_mcu_block_order();
+	}
+
+	void jpeg_decoder::init_sequential()
+	{	// A sequential image needs its scan right away; init_scan() finding none is reported as JPGD_UNEXPECTED_MARKER.
+		if (init_scan() == JPGD_FALSE)
+			stop_decoding(JPGD_UNEXPECTED_MARKER);
+	}
+
+	void jpeg_decoder::decode_start()
+	{	// Sets up the frame, then runs the setup that matches the coding mode.
+		init_frame();
+
+		if (!m_progressive_flag)
+			init_sequential();
+		else
+			init_progressive();
+	}
+
+	void jpeg_decoder::decode_init(jpeg_decoder_stream *pStream)
+	{	// Hands pStream to init(), then looks for the frame header with locate_sof_marker().
+		init(pStream);
+		locate_sof_marker();
+	}
+
+	jpeg_decoder::jpeg_decoder(jpeg_decoder_stream *pStream)
+	{	// A failure during decode_init() returns here through m_jmp_state (presumably via stop_decoding() -- confirm); callers then inspect get_error_code().
+		if (setjmp(m_jmp_state))
+			return;
+		decode_init(pStream);
+	}
+
+	int jpeg_decoder::begin_decoding()
+	{	// Prepares the decoder for decode(). Returns JPGD_SUCCESS (also when already prepared) or JPGD_FAILED.
+		if (m_ready_flag)
+			return JPGD_SUCCESS;
+
+		if (m_error_code)
+			return JPGD_FAILED;
+		// Errors inside decode_start() come back here through m_jmp_state (presumably via stop_decoding() -- confirm).
+		if (setjmp(m_jmp_state))
+			return JPGD_FAILED;
+
+		decode_start();
+
+		m_ready_flag = true;
+
+		return JPGD_SUCCESS;
+	}
+
+	jpeg_decoder::~jpeg_decoder()
+	{	// Releases the decoder's memory blocks through free_all_blocks().
+		free_all_blocks();
+	}
+
+	jpeg_decoder_file_stream::jpeg_decoder_file_stream()
+	{	// A new stream has no file attached and no pending end-of-file or error condition.
+		m_error_flag = false;
+		m_eof_flag = false;
+		m_pFile = NULL;
+	}
+
+	void jpeg_decoder_file_stream::close()
+	{	// Closes the attached file, if any, and clears both status flags.
+		m_eof_flag = false;
+		m_error_flag = false;
+
+		if (m_pFile == NULL)
+			return;
+
+		fclose(m_pFile);
+		m_pFile = NULL;
+	}
+
+	jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
+	{	// Makes sure an open file is closed when the stream goes away.
+		close();
+	}
+
+	bool jpeg_decoder_file_stream::open(const char *Pfilename)
+	{
+		// Opens Pfilename for binary reading after closing any previously attached file.
+		// Returns true when the file could be opened.
+		close();
+		m_error_flag = false;
+		m_eof_flag = false;
+		m_pFile = NULL;
+#if defined(_MSC_VER)
+		fopen_s(&m_pFile, Pfilename, "rb");
+#else
+		m_pFile = fopen(Pfilename, "rb");
+#endif
+		return (m_pFile != NULL);
+	}
+
+	int jpeg_decoder_file_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
+	{	// Reads up to max_bytes_to_read bytes into pBuf. Returns the byte count, or -1 if no file is open or a read error occurred.
+		if (m_pFile == NULL)
+			return -1;
+
+		// End of file was already reported: keep reporting it without touching the file.
+		if (m_eof_flag)
+		{
+			*pEOF_flag = true;
+			return 0;
+		}
+
+		if (m_error_flag)
+			return -1;
+
+		const int got = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
+		if (got >= max_bytes_to_read)
+			return got;
+
+		// Short read: either a read error or the end of the file.
+		if (ferror(m_pFile))
+		{
+			m_error_flag = true;
+			return -1;
+		}
+		m_eof_flag = true;
+		*pEOF_flag = true;
+		return got;
+	}
+
+	bool jpeg_decoder_mem_stream::open(const uint8 *pSrc_data, uint size)
+	{	// Points the stream at the caller's buffer (the data is not copied) and rewinds it; always returns true.
+		close();
+		m_size = size;
+		m_ofs = 0;
+		m_pSrc_data = pSrc_data;
+		return true;
+	}
+
+	int jpeg_decoder_mem_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
+	{	// Copies up to max_bytes_to_read of the remaining bytes into pBuf; returns the count, or -1 when no buffer is attached.
+		*pEOF_flag = false;
+
+		if (m_pSrc_data == NULL)
+			return -1;
+		// A request larger than what remains is cut short and reported as end of data.
+		const uint bytes_left = m_size - m_ofs;
+		if ((uint)max_bytes_to_read > bytes_left)
+		{
+			*pEOF_flag = true;
+			max_bytes_to_read = bytes_left;
+		}
+
+		const uint8 *pFrom = m_pSrc_data + m_ofs;
+		m_ofs += max_bytes_to_read;
+		memcpy(pBuf, pFrom, max_bytes_to_read);
+		return max_bytes_to_read;
+	}
+
+	unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps)
+	{
+		// Decodes a whole image from pStream into a buffer obtained from jpgd_malloc(), with req_comps (1, 3 or 4)
+		// bytes per pixel and no row padding. *width, *height and *actual_comps receive the image properties.
+		// Returns NULL on invalid arguments, decode errors or allocation failure; otherwise the caller owns the buffer.
+		// Colour output follows jpg_format: R,G,B or, in BGRA mode, B,G,R byte order.
+		if (!actual_comps)
+			return NULL;
+		*actual_comps = 0;
+
+		if ((!pStream) || (!width) || (!height) || (!req_comps))
+			return NULL;
+
+		if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4))
+			return NULL;
+
+		jpeg_decoder decoder(pStream);
+		if (decoder.get_error_code() != JPGD_SUCCESS)
+			return NULL;
+		const int image_width = decoder.get_width(), image_height = decoder.get_height();
+		*width = image_width;
+		*height = image_height;
+		*actual_comps = decoder.get_num_components();
+		if (decoder.begin_decoding() != JPGD_SUCCESS)
+			return NULL;
+
+		const int dst_bpl = image_width * req_comps;
+		const bool bgra_order = (jpg_format == ERGBFormatJPG::BGRA); // decoded colour pixels are B,G,R,A instead of R,G,B,A
+		uint8 *pImage_data = (uint8*)jpgd_malloc((size_t)dst_bpl * (size_t)image_height); // size computed in size_t, not int
+		if (!pImage_data)
+			return NULL;
+
+		for (int y = 0; y < image_height; y++)
+		{
+			const uint8* pScan_line = 0;
+			uint scan_line_len;
+			if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS)
+			{
+				jpgd_free(pImage_data);
+				return NULL;
+			}
+			uint8 *pDst = pImage_data + (size_t)y * dst_bpl;
+			// The decoder's own format (4 bytes for colour, 1 byte for gray) can be copied as is.
+			if (((req_comps == 4) && (decoder.get_num_components() == 3)) ||
+				((req_comps == 1) && (decoder.get_num_components() == 1)))
+			{
+				memcpy(pDst, pScan_line, dst_bpl);
+			}
+			else if (decoder.get_num_components() == 1)
+			{
+				if (req_comps == 3)
+				{
+					for (int x = 0; x < image_width; x++)
+					{
+						uint8 luma = pScan_line[x];
+						pDst[0] = luma;
+						pDst[1] = luma;
+						pDst[2] = luma;
+						pDst += 3;
+					}
+				}
+				else
+				{
+					for (int x = 0; x < image_width; x++)
+					{
+						uint8 luma = pScan_line[x];
+						pDst[0] = luma;
+						pDst[1] = luma;
+						pDst[2] = luma;
+						pDst[3] = 255;
+						pDst += 4;
+					}
+				}
+			}
+			else if (decoder.get_num_components() == 3)
+			{
+				if (req_comps == 1)
+				{
+					const int YR = 19595, YG = 38470, YB = 7471;
+					for (int x = 0; x < image_width; x++)
+					{
+						int r = pScan_line[x*4 + (bgra_order ? 2 : 0)]; // red is the third byte in BGRA mode
+						int g = pScan_line[x*4+1];
+						int b = pScan_line[x*4 + (bgra_order ? 0 : 2)]; // blue is the first byte in BGRA mode
+						*pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16);
+					}
+				}
+				else
+				{
+					for (int x = 0; x < image_width; x++)
+					{
+						pDst[0] = pScan_line[x*4+0];
+						pDst[1] = pScan_line[x*4+1];
+						pDst[2] = pScan_line[x*4+2];
+						pDst += 3;
+					}
+				}
+			}
+		}
+
+		return pImage_data;
+	}
+
+// BEGIN EPIC MOD
+	unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps, int format)
+	{
+		jpg_format = (ERGBFormatJPG)format; // byte order used by the colour converters (see their BGRA checks); NOTE(review): not a local, so concurrent decodes with different formats would interfere -- confirm
+// END EPIC MOD
+		jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size);
+		return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps);
+	}
+
+	unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps)
+	{	// Decodes the JPEG file pSrc_filename; returns NULL when the file cannot be opened or decoding fails.
+		jpgd::jpeg_decoder_file_stream file_stream;
+		const bool opened = file_stream.open(pSrc_filename);
+
+		return opened ? decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps) : NULL;
+	}
+
+} // namespace jpgd
diff --git a/crazy_functions/test_project/cpp/longcode/jpge.cpp b/crazy_functions/test_project/cpp/longcode/jpge.cpp
new file mode 100644
index 0000000..2e26b71
--- /dev/null
+++ b/crazy_functions/test_project/cpp/longcode/jpge.cpp
@@ -0,0 +1,1049 @@
+// jpge.cpp - C++ class for JPEG compression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+// v1.01, Dec. 18, 2010 - Initial release
+// v1.02, Apr. 6, 2011 - Removed 2x2 ordered dither in H2V1 chroma subsampling method load_block_16_8_8(). (The rounding factor was 2, when it should have been 1. Either way, it wasn't helping.)
+// v1.03, Apr. 16, 2011 - Added support for optimized Huffman code tables, optimized dynamic memory allocation down to only 1 alloc.
+//                        Also from Alex Evans: Added RGBA support, linear memory allocator (no longer needed in v1.03).
+// v1.04, May. 19, 2012: Forgot to set m_pFile ptr to NULL in cfile_stream::close(). Thanks to Owen Kaluza for reporting this bug.
+//                       Code tweaks to fix VS2008 static code analysis warnings (all looked harmless).
+//                       Code review revealed method load_block_16_8_8() (used for the non-default H2V1 sampling mode to downsample chroma) somehow didn't get the rounding factor fix from v1.02.
+
+#include "jpge.h"
+
+#include <stdlib.h>
+#include <string.h>
+#if PLATFORM_WINDOWS
+#include <malloc.h>
+#endif
+
+#define JPGE_MAX(a,b) (((a)>(b))?(a):(b))
+#define JPGE_MIN(a,b) (((a)<(b))?(a):(b))
+
+namespace jpge {
+
+static inline void *jpge_malloc(size_t nSize) { return FMemory::Malloc(nSize); } // encoder allocations are routed through FMemory
+static inline void jpge_free(void *p) { FMemory::Free(p); } // counterpart of jpge_malloc(); the stray empty statement (";;") was removed
+
+// Various JPEG enums and tables.
+enum { M_SOF0 = 0xC0, M_DHT = 0xC4, M_SOI = 0xD8, M_EOI = 0xD9, M_SOS = 0xDA, M_DQT = 0xDB, M_APP0 = 0xE0 };
+enum { DC_LUM_CODES = 12, AC_LUM_CODES = 256, DC_CHROMA_CODES = 12, AC_CHROMA_CODES = 256, MAX_HUFF_SYMBOLS = 257, MAX_HUFF_CODESIZE = 32 };
+
+static uint8 s_zag[64] = { 0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
+static int16 s_std_lum_quant[64] = { 16,11,12,14,12,10,16,14,13,14,18,17,16,19,24,40,26,24,22,22,24,49,35,37,29,40,58,51,61,60,57,51,56,55,64,72,92,78,64,68,87,69,55,56,80,109,81,87,95,98,103,104,103,62,77,113,121,112,100,120,92,101,103,99 };
+static int16 s_std_croma_quant[64] = { 17,18,18,24,21,24,47,26,26,47,99,66,56,66,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99 };
+static uint8 s_dc_lum_bits[17] = { 0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0 };
+static uint8 s_dc_lum_val[DC_LUM_CODES] = { 0,1,2,3,4,5,6,7,8,9,10,11 };
+static uint8 s_ac_lum_bits[17] = { 0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d };
+static uint8 s_ac_lum_val[AC_LUM_CODES]  =
+{
+  0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,
+  0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,
+  0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+  0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,
+  0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,
+  0xf9,0xfa
+};
+static uint8 s_dc_chroma_bits[17] = { 0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0 };
+static uint8 s_dc_chroma_val[DC_CHROMA_CODES]  = { 0,1,2,3,4,5,6,7,8,9,10,11 };
+static uint8 s_ac_chroma_bits[17] = { 0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77 };
+static uint8 s_ac_chroma_val[AC_CHROMA_CODES] =
+{
+  0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,
+  0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,
+  0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+  0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,
+  0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,
+  0xf9,0xfa
+};
+
+// Low-level helper functions.
+template <class T> inline void clear_obj(T &obj) { memset(&obj, 0, sizeof(obj)); } // zero-fills every byte of obj (works on whole arrays too, since T is deduced as the array type)
+
+const int YR = 19595, YG = 38470, YB = 7471, CB_R = -11059, CB_G = -21709, CB_B = 32768, CR_R = 32768, CR_G = -27439, CR_B = -5329; // RGB -> Y/Cb/Cr weights scaled by 65536 (the sums using them are shifted right by 16)
+static inline uint8 clamp(int i) { return static_cast<uint8>((i < 0) ? 0 : ((i > 255) ? 255 : i)); } // saturates i to the range [0, 255]
+
+static void RGB_to_YCC(uint8* pDst, const uint8 *pSrc, int num_pixels) // converts num_pixels packed 3-byte R,G,B pixels into packed 3-byte Y,Cb,Cr pixels
+{
+  for ( ; num_pixels; pDst += 3, pSrc += 3, num_pixels--)
+  {
+    const int r = pSrc[0], g = pSrc[1], b = pSrc[2];
+    pDst[0] = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16); // luma; +32768 rounds to nearest before the 16-bit shift
+    pDst[1] = clamp(128 + ((r * CB_R + g * CB_G + b * CB_B + 32768) >> 16)); // Cb, offset by 128 and saturated to a byte
+    pDst[2] = clamp(128 + ((r * CR_R + g * CR_G + b * CR_B + 32768) >> 16)); // Cr, offset by 128 and saturated to a byte
+  }
+}
+
+static void RGB_to_Y(uint8* pDst, const uint8 *pSrc, int num_pixels)
+{
+  // Luma only: one output byte per packed 3-byte RGB pixel, using the YR/YG/YB fixed-point weights.
+  while (num_pixels) { const int r = pSrc[0], g = pSrc[1], b = pSrc[2]; *pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16); pSrc += 3; num_pixels--; }
+}
+
+static void RGBA_to_YCC(uint8* pDst, const uint8 *pSrc, int num_pixels) // like RGB_to_YCC but the source advances 4 bytes per pixel; the 4th source byte is never read
+{
+  for ( ; num_pixels; pDst += 3, pSrc += 4, num_pixels--)
+  {
+    const int r = pSrc[0], g = pSrc[1], b = pSrc[2];
+    pDst[0] = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16); // luma; +32768 rounds to nearest before the 16-bit shift
+    pDst[1] = clamp(128 + ((r * CB_R + g * CB_G + b * CB_B + 32768) >> 16)); // Cb, offset by 128 and saturated to a byte
+    pDst[2] = clamp(128 + ((r * CR_R + g * CR_G + b * CR_B + 32768) >> 16)); // Cr, offset by 128 and saturated to a byte
+  }
+}
+
+static void RGBA_to_Y(uint8* pDst, const uint8 *pSrc, int num_pixels)
+{
+  // Luma only: one output byte per 4-byte source pixel; the 4th source byte is skipped.
+  while (num_pixels) { const int r = pSrc[0], g = pSrc[1], b = pSrc[2]; *pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16); pSrc += 4; num_pixels--; }
+}
+
+static void Y_to_YCC(uint8* pDst, const uint8* pSrc, int num_pixels)
+{
+  while (num_pixels) { *pDst++ = *pSrc++; *pDst++ = 128; *pDst++ = 128; num_pixels--; } // grey byte in; Y plus two constant 128 chroma bytes out
+}
+
+// Forward DCT - DCT derived from jfdctint. DCT1D is a statement macro: it declares locals (t0..t13, u1..u4, z5) and overwrites its eight arguments in place, so each expansion must sit in its own scope.
+#define CONST_BITS  13 // scale (in bits) of the integer multiplier constants used inside DCT1D
+#define ROW_BITS    2 // extra precision bits kept between the two passes of DCT2D
+#define DCT_DESCALE(x, n) (((x) + (((int32)1) << ((n) - 1))) >> (n)) // right shift by n after adding half of 2^n (rounding)
+#define DCT_MUL(var, c) (static_cast<int16>(var) * static_cast<int32>(c)) // note: var is narrowed to int16 before the multiply
+#define DCT1D(s0, s1, s2, s3, s4, s5, s6, s7) \
+  int32 t0 = s0 + s7, t7 = s0 - s7, t1 = s1 + s6, t6 = s1 - s6, t2 = s2 + s5, t5 = s2 - s5, t3 = s3 + s4, t4 = s3 - s4; \
+  int32 t10 = t0 + t3, t13 = t0 - t3, t11 = t1 + t2, t12 = t1 - t2; \
+  int32 u1 = DCT_MUL(t12 + t13, 4433); \
+  s2 = u1 + DCT_MUL(t13, 6270); \
+  s6 = u1 + DCT_MUL(t12, -15137); \
+  u1 = t4 + t7; \
+  int32 u2 = t5 + t6, u3 = t4 + t6, u4 = t5 + t7; \
+  int32 z5 = DCT_MUL(u3 + u4, 9633); \
+  t4 = DCT_MUL(t4, 2446); t5 = DCT_MUL(t5, 16819); \
+  t6 = DCT_MUL(t6, 25172); t7 = DCT_MUL(t7, 12299); \
+  u1 = DCT_MUL(u1, -7373); u2 = DCT_MUL(u2, -20995); \
+  u3 = DCT_MUL(u3, -16069); u4 = DCT_MUL(u4, -3196); \
+  u3 += z5; u4 += z5; \
+  s0 = t10 + t11; s1 = t7 + u1 + u4; s3 = t6 + u2 + u3; s4 = t10 - t11; s5 = t5 + u2 + u4; s7 = t4 + u1 + u3;
+
+static void DCT2D(int32 *p) // in-place forward DCT over the 64 values at p, treated as 8 rows of 8
+{
+  int32 c, *q = p;
+  for (c = 7; c >= 0; c--, q += 8) // pass 1: each group of eight consecutive values (one row)
+  {
+    int32 s0 = q[0], s1 = q[1], s2 = q[2], s3 = q[3], s4 = q[4], s5 = q[5], s6 = q[6], s7 = q[7];
+    DCT1D(s0, s1, s2, s3, s4, s5, s6, s7);
+    q[0] = s0 << ROW_BITS; q[1] = DCT_DESCALE(s1, CONST_BITS-ROW_BITS); q[2] = DCT_DESCALE(s2, CONST_BITS-ROW_BITS); q[3] = DCT_DESCALE(s3, CONST_BITS-ROW_BITS);
+    q[4] = s4 << ROW_BITS; q[5] = DCT_DESCALE(s5, CONST_BITS-ROW_BITS); q[6] = DCT_DESCALE(s6, CONST_BITS-ROW_BITS); q[7] = DCT_DESCALE(s7, CONST_BITS-ROW_BITS); // every output leaves pass 1 scaled up by 2^ROW_BITS
+  }
+  for (q = p, c = 7; c >= 0; c--, q++) // pass 2: each column (elements 8 apart)
+  {
+    int32 s0 = q[0*8], s1 = q[1*8], s2 = q[2*8], s3 = q[3*8], s4 = q[4*8], s5 = q[5*8], s6 = q[6*8], s7 = q[7*8];
+    DCT1D(s0, s1, s2, s3, s4, s5, s6, s7);
+    q[0*8] = DCT_DESCALE(s0, ROW_BITS+3); q[1*8] = DCT_DESCALE(s1, CONST_BITS+ROW_BITS+3); q[2*8] = DCT_DESCALE(s2, CONST_BITS+ROW_BITS+3); q[3*8] = DCT_DESCALE(s3, CONST_BITS+ROW_BITS+3);
+    q[4*8] = DCT_DESCALE(s4, ROW_BITS+3); q[5*8] = DCT_DESCALE(s5, CONST_BITS+ROW_BITS+3); q[6*8] = DCT_DESCALE(s6, CONST_BITS+ROW_BITS+3); q[7*8] = DCT_DESCALE(s7, CONST_BITS+ROW_BITS+3); // removes the pass-1 ROW_BITS scale plus 3 further bits
+  }
+}
+
+struct sym_freq { uint m_key, m_sym_index; }; // m_key: sort key (a symbol count when filled by optimize_huffman_table, later overwritten with a code length); m_sym_index: the symbol it belongs to
+
+// Radix sorts sym_freq[] array by 32-bit key m_key (ascending). Returns ptr to sorted values, which is pSyms0 or pSyms1 depending on how many passes ran; pSyms1 is scratch space for num_syms entries.
+static inline sym_freq* radix_sort_syms(uint num_syms, sym_freq* pSyms0, sym_freq* pSyms1)
+{
+  const uint cMaxPasses = 4; // one pass per byte of the 32-bit key
+  uint32 hist[256 * cMaxPasses]; clear_obj(hist);
+  for (uint i = 0; i < num_syms; i++) { uint freq = pSyms0[i].m_key; hist[freq & 0xFF]++; hist[256 + ((freq >> 8) & 0xFF)]++; hist[256*2 + ((freq >> 16) & 0xFF)]++; hist[256*3 + ((freq >> 24) & 0xFF)]++; } // one 256-bin histogram per key byte, built in a single sweep
+  sym_freq* pCur_syms = pSyms0, *pNew_syms = pSyms1;
+  uint total_passes = cMaxPasses; while ((total_passes > 1) && (num_syms == hist[(total_passes - 1) * 256])) total_passes--; // drop high-byte passes in which every key has that byte equal to zero
+  for (uint pass_shift = 0, pass = 0; pass < total_passes; pass++, pass_shift += 8)
+  {
+    const uint32* pHist = &hist[pass << 8];
+    uint offsets[256], cur_ofs = 0;
+    for (uint i = 0; i < 256; i++) { offsets[i] = cur_ofs; cur_ofs += pHist[i]; } // prefix sums: first output slot of each bucket
+    for (uint i = 0; i < num_syms; i++)
+      pNew_syms[offsets[(pCur_syms[i].m_key >> pass_shift) & 0xFF]++] = pCur_syms[i]; // scatter in input order, so equal digits keep their relative order
+    sym_freq* t = pCur_syms; pCur_syms = pNew_syms; pNew_syms = t; // swap source and destination for the next pass
+  }
+  return pCur_syms;
+}
+
+// calculate_minimum_redundancy() originally written by: Alistair Moffat, alistair@cs.mu.oz.au, Jyrki Katajainen, jyrki@diku.dk, November 1996. Works in place on A[0..n-1].m_key; the caller reads the resulting keys as code lengths. NOTE(review): appears to expect keys sorted ascending (the only caller in this file sorts first) - confirm.
+static void calculate_minimum_redundancy(sym_freq *A, int n)
+{
+  int root, leaf, next, avbl, used, dpth;
+  if (n==0) return; else if (n==1) { A[0].m_key = 1; return; } // a lone symbol gets key 1
+  A[0].m_key += A[1].m_key; root = 0; leaf = 2;
+  for (next=1; next < n-1; next++) // phase 1: combine the two smallest remaining weights; a consumed internal node's key is replaced by the index of its parent
+  {
+    if (leaf>=n || A[root].m_key<A[leaf].m_key) { A[next].m_key = A[root].m_key; A[root++].m_key = next; } else A[next].m_key = A[leaf++].m_key;
+    if (leaf>=n || (root<next && A[root].m_key<A[leaf].m_key)) { A[next].m_key += A[root].m_key; A[root++].m_key = next; } else A[next].m_key += A[leaf++].m_key;
+  }
+  A[n-2].m_key = 0; // the last internal node is given depth 0
+  for (next=n-3; next>=0; next--) A[next].m_key = A[A[next].m_key].m_key+1; // phase 2: parent index -> depth (parent's depth + 1)
+  avbl = 1; used = dpth = 0; root = n-2; next = n-1;
+  while (avbl>0) // phase 3: walk depth by depth, writing dpth into the keys from A[n-1] downwards
+  {
+    while (root>=0 && (int)A[root].m_key==dpth) { used++; root--; } // count internal nodes recorded at this depth
+    while (avbl>used) { A[next--].m_key = dpth; avbl--; } // slots not used by internal nodes receive this depth
+    avbl = 2*used; dpth++; used = 0;
+  }
+}
+
+// Limits canonical Huffman code table's max code size to max_code_size. pNum_codes[i] is the number of codes of length i and is adjusted in place; nothing is done when code_list_len <= 1.
+static void huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size)
+{
+  if (code_list_len <= 1) return;
+
+  for (int i = max_code_size + 1; i <= MAX_HUFF_CODESIZE; i++) pNum_codes[max_code_size] += pNum_codes[i]; // fold every over-long code into the longest allowed length
+
+  uint32 total = 0;
+  for (int i = max_code_size; i > 0; i--)
+    total += (((uint32)pNum_codes[i]) << (max_code_size - i)); // code space used, in units of one max_code_size-bit code
+
+  while (total != (1UL << max_code_size)) // loop until exactly 2^max_code_size units are used
+  {
+    pNum_codes[max_code_size]--;
+    for (int i = max_code_size - 1; i > 0; i--)
+    {
+      if (pNum_codes[i]) { pNum_codes[i]--; pNum_codes[i + 1] += 2; break; } // take one code from the longest non-empty shorter length and add two at the next length
+    }
+    total--;
+  }
+}
+
+// Generates an optimized Huffman table: fills m_huff_bits[table_num] and m_huff_val[table_num] from the symbol counts in m_huff_count[table_num][0..table_len-1].
+void jpeg_encoder::optimize_huffman_table(int table_num, int table_len)
+{
+  sym_freq syms0[MAX_HUFF_SYMBOLS], syms1[MAX_HUFF_SYMBOLS];
+  syms0[0].m_key = 1; syms0[0].m_sym_index = 0;  // dummy symbol, assures that no valid code contains all 1's
+  int num_used_syms = 1;
+  const uint32 *pSym_count = &m_huff_count[table_num][0];
+  for (int i = 0; i < table_len; i++)
+    if (pSym_count[i]) { syms0[num_used_syms].m_key = pSym_count[i]; syms0[num_used_syms++].m_sym_index = i + 1; } // only symbols that occurred; indices are stored +1 because 0 is the dummy
+  sym_freq* pSyms = radix_sort_syms(num_used_syms, syms0, syms1);
+  calculate_minimum_redundancy(pSyms, num_used_syms); // from here on pSyms[i].m_key is used as a code size
+
+  // Count the # of symbols of each code size.
+  int num_codes[1 + MAX_HUFF_CODESIZE]; clear_obj(num_codes);
+  for (int i = 0; i < num_used_syms; i++)
+    num_codes[pSyms[i].m_key]++;
+
+  const uint JPGE_CODE_SIZE_LIMIT = 16; // the maximum possible size of a JPEG Huffman code (valid range is [9,16] - 9 vs. 8 because of the dummy symbol)
+  huffman_enforce_max_code_size(num_codes, num_used_syms, JPGE_CODE_SIZE_LIMIT);
+
+  // Compute m_huff_bits array, which contains the # of symbols per code size.
+  clear_obj(m_huff_bits[table_num]);
+  for (int i = 1; i <= (int)JPGE_CODE_SIZE_LIMIT; i++)
+    m_huff_bits[table_num][i] = static_cast<uint8>(num_codes[i]);
+
+  // Remove the dummy symbol added above, which must be in largest bucket.
+  for (int i = JPGE_CODE_SIZE_LIMIT; i >= 1; i--)
+  {
+    if (m_huff_bits[table_num][i]) { m_huff_bits[table_num][i]--; break; }
+  }
+
+  // Compute the m_huff_val array, which contains the symbol indices sorted by code size (smallest to largest).
+  for (int i = num_used_syms - 1; i >= 1; i--)
+    m_huff_val[table_num][num_used_syms - 1 - i] = static_cast<uint8>(pSyms[i].m_sym_index - 1); // pSyms[0] is skipped; the -1 undoes the +1 applied when the symbols were collected
+}
+
+// JPEG marker generation.
+void jpeg_encoder::emit_byte(uint8 i)
+{
+  if (m_all_stream_writes_succeeded) m_all_stream_writes_succeeded = m_pStream->put_obj(i); // once a write has failed, nothing more is sent to the stream
+}
+
+void jpeg_encoder::emit_word(uint i)
+{
+  const uint8 hi = uint8(i >> 8), lo = uint8(i & 0xFF); emit_byte(hi); emit_byte(lo); // 16-bit value, high byte first
+}
+
+void jpeg_encoder::emit_marker(int marker)
+{
+  const uint8 prefix = uint8(0xFF), code = uint8(marker); emit_byte(prefix); emit_byte(code); // a marker is 0xFF followed by the marker code byte
+}
+
+// Emit JFIF marker: an APP0 segment with a fixed 16-byte payload (the length word counts itself).
+void jpeg_encoder::emit_jfif_app0()
+{
+  emit_marker(M_APP0);
+  emit_word(2 + 4 + 1 + 2 + 1 + 2 + 2 + 1 + 1); // one term per field written below, starting with the 2-byte length itself
+  emit_byte(0x4A); emit_byte(0x46); emit_byte(0x49); emit_byte(0x46); /* Identifier: ASCII "JFIF" */
+  emit_byte(0);      // terminating NUL of the identifier
+  emit_byte(1);      /* Major version */
+  emit_byte(1);      /* Minor version */
+  emit_byte(0);      /* Density unit */
+  emit_word(1);      // X density in the JFIF APP0 layout
+  emit_word(1);      // Y density in the JFIF APP0 layout
+  emit_byte(0);      /* No thumbnail image */
+  emit_byte(0);      // second thumbnail dimension, also zero
+}
+
+// Emit quantization tables: one DQT segment per table - two tables when there are 3 components, otherwise one.
+void jpeg_encoder::emit_dqt()
+{
+  for (int i = 0; i < ((m_num_components == 3) ? 2 : 1); i++)
+  {
+    emit_marker(M_DQT);
+    emit_word(64 + 1 + 2); // 64 entries + 1 table-id byte + the 2-byte length itself
+    emit_byte(static_cast<uint8>(i)); // table id
+    for (int j = 0; j < 64; j++)
+      emit_byte(static_cast<uint8>(m_quantization_tables[i][j])); // entries are written as single bytes, in stored order
+  }
+}
+
+// Emit start of frame marker (SOF0): sample precision, image size and one 3-byte entry per component.
+void jpeg_encoder::emit_sof()
+{
+  emit_marker(M_SOF0);                           /* baseline */
+  emit_word(3 * m_num_components + 2 + 5 + 1); // segment length: 3 bytes per component + length(2) + precision/height/width(5) + component count(1)
+  emit_byte(8);                                  /* precision */
+  emit_word(m_image_y); // height is written before width, each as a 16-bit word
+  emit_word(m_image_x);
+  emit_byte(m_num_components);
+  for (int i = 0; i < m_num_components; i++)
+  {
+    emit_byte(static_cast<uint8>(i + 1));                                   /* component ID     */
+    emit_byte((m_comp_h_samp[i] << 4) + m_comp_v_samp[i]);  /* h and v sampling */
+    emit_byte(i > 0);                                   /* quant. table num */
+  }
+}
+
+// Emit Huffman table. bits[1..16] holds the number of codes of each length, val holds the symbols; index and ac_flag are combined into the table id byte.
+void jpeg_encoder::emit_dht(uint8 *bits, uint8 *val, int index, bool ac_flag)
+{
+  emit_marker(M_DHT);
+
+  int length = 0; // total number of symbols = sum of the per-length counts
+  for (int i = 1; i <= 16; i++)
+    length += bits[i];
+
+  emit_word(length + 2 + 1 + 16); // symbols + length word(2) + table id(1) + 16 count bytes
+  emit_byte(static_cast<uint8>(index + (ac_flag << 4))); // low nibble: table index; bit 4 set for an AC table
+
+  for (int i = 1; i <= 16; i++)
+    emit_byte(bits[i]); // bits[0] is not written
+
+  for (int i = 0; i < length; i++)
+    emit_byte(val[i]);
+}
+
+// Emit all Huffman tables. Table slots: [0] and [1] are emitted as DC tables 0 and 1, [2] and [3] as AC tables 0 and 1.
+void jpeg_encoder::emit_dhts()
+{
+  emit_dht(m_huff_bits[0+0], m_huff_val[0+0], 0, false);
+  emit_dht(m_huff_bits[2+0], m_huff_val[2+0], 0, true);
+  if (m_num_components == 3) // the second DC/AC pair is only written for 3-component images
+  {
+    emit_dht(m_huff_bits[0+1], m_huff_val[0+1], 1, false);
+    emit_dht(m_huff_bits[2+1], m_huff_val[2+1], 1, true);
+  }
+}
+
+// emit start of scan: one (component id, table selector) pair per component followed by three fixed bytes.
+void jpeg_encoder::emit_sos()
+{
+  emit_marker(M_SOS);
+  emit_word(2 * m_num_components + 2 + 1 + 3); // 2 bytes per component + length(2) + component count(1) + 3 trailing bytes
+  emit_byte(m_num_components);
+  for (int i = 0; i < m_num_components; i++)
+  {
+    emit_byte(static_cast<uint8>(i + 1)); // same component ids as written by emit_sof()
+    if (i == 0)
+      emit_byte((0 << 4) + 0); // first component: table 0 in both nibbles
+    else
+      emit_byte((1 << 4) + 1); // other components: table 1 in both nibbles
+  }
+  emit_byte(0);     /* spectral selection */
+  emit_byte(63);    // end of spectral selection in the SOS layout
+  emit_byte(0);     // final SOS byte (successive approximation in the JPEG spec), always zero here
+}
+
+// Emit all markers at beginning of image file, in this fixed order: SOI, JFIF APP0, DQT, SOF0, DHT, SOS.
+void jpeg_encoder::emit_markers()
+{
+  emit_marker(M_SOI);
+  emit_jfif_app0();
+  emit_dqt();
+  emit_sof();
+  emit_dhts(); // reads m_huff_bits / m_huff_val, so those must already be filled in
+  emit_sos();
+}
+
+// Compute the actual canonical Huffman codes/code sizes given the JPEG huff bits and val arrays. codes and code_sizes are 256-entry outputs indexed by symbol value; symbols absent from val end up with code size 0.
+void jpeg_encoder::compute_huffman_table(uint *codes, uint8 *code_sizes, uint8 *bits, uint8 *val)
+{
+  int i, l, last_p, si;
+  uint8 huff_size[257];
+  uint huff_code[257];
+  uint code;
+
+  int p = 0;
+  for (l = 1; l <= 16; l++) // expand the per-length counts into one size entry per code, shortest first
+    for (i = 1; i <= bits[l]; i++)
+      huff_size[p++] = (char)l;
+
+  huff_size[p] = 0; last_p = p; // write sentinel
+
+  code = 0; si = huff_size[0]; p = 0;
+
+  while (huff_size[p]) // assign consecutive code values within each size
+  {
+    while (huff_size[p] == si)
+      huff_code[p++] = code++;
+    code <<= 1; // moving to the next size appends one bit
+    si++;
+  }
+
+  memset(codes, 0, sizeof(codes[0])*256);
+  memset(code_sizes, 0, sizeof(code_sizes[0])*256);
+  for (p = 0; p < last_p; p++) // scatter from code order into tables indexed by symbol value
+  {
+    codes[val[p]]      = huff_code[p];
+    code_sizes[val[p]] = huff_size[p];
+  }
+}
+
+// Quantization table generation: scales the 64-entry base table pSrc by a factor derived from m_params.m_quality and writes the result, limited to [1, 255], to pDst.
+void jpeg_encoder::compute_quant_table(int32 *pDst, int16 *pSrc)
+{
+  int32 q; // scale factor in percent
+  if (m_params.m_quality < 50)
+    q = 5000 / m_params.m_quality; // NOTE(review): divides by m_quality - presumably params::check_valid() rules out 0; confirm
+  else
+    q = 200 - m_params.m_quality * 2;
+  for (int i = 0; i < 64; i++)
+  {
+    int32 j = *pSrc++; j = (j * q + 50L) / 100L; // apply the percentage; +50 rounds the division by 100
+    *pDst++ = JPGE_MIN(JPGE_MAX(j, 1), 255);
+  }
+}
+
+// Higher-level methods.
+void jpeg_encoder::first_pass_init() // resets the per-pass coding state and marks pass 1 as active
+{
+  m_bit_buffer = 0; m_bits_in = 0; // empty bit accumulator used by put_bits()
+  memset(m_last_dc_val, 0, 3 * sizeof(m_last_dc_val[0])); // DC predictors, one per component (3 entries cleared)
+  m_mcu_y_ofs = 0; // next scanline goes into m_mcu_lines[0]
+  m_pass_num = 1;
+}
+
+bool jpeg_encoder::second_pass_init() // builds the code tables from m_huff_bits/m_huff_val, writes the file header and switches to pass 2; always returns true
+{
+  compute_huffman_table(&m_huff_codes[0+0][0], &m_huff_code_sizes[0+0][0], m_huff_bits[0+0], m_huff_val[0+0]);
+  compute_huffman_table(&m_huff_codes[2+0][0], &m_huff_code_sizes[2+0][0], m_huff_bits[2+0], m_huff_val[2+0]);
+  if (m_num_components > 1) // table slots 1 and 3 are only needed when there is more than one component
+  {
+    compute_huffman_table(&m_huff_codes[0+1][0], &m_huff_code_sizes[0+1][0], m_huff_bits[0+1], m_huff_val[0+1]);
+    compute_huffman_table(&m_huff_codes[2+1][0], &m_huff_code_sizes[2+1][0], m_huff_bits[2+1], m_huff_val[2+1]);
+  }
+  first_pass_init(); // reuse the common state reset; it sets m_pass_num to 1, corrected below
+  emit_markers();
+  m_pass_num = 2;
+  return true;
+}
+
+bool jpeg_encoder::jpg_open(int p_x_res, int p_y_res, int src_channels) // derives the image/MCU geometry from m_params, allocates the MCU line buffer, builds the quantization tables and starts pass 1 or pass 2; returns false on allocation failure or a failed stream write
+{
+  m_num_components = 3; // default; only Y_ONLY overrides it
+  switch (m_params.m_subsampling) // NOTE(review): no default case - presumably params::check_valid() limits m_subsampling to these four values; confirm
+  {
+    case Y_ONLY:
+    {
+      m_num_components = 1;
+      m_comp_h_samp[0] = 1; m_comp_v_samp[0] = 1;
+      m_mcu_x          = 8; m_mcu_y          = 8;
+      break;
+    }
+    case H1V1:
+    {
+      m_comp_h_samp[0] = 1; m_comp_v_samp[0] = 1;
+      m_comp_h_samp[1] = 1; m_comp_v_samp[1] = 1;
+      m_comp_h_samp[2] = 1; m_comp_v_samp[2] = 1;
+      m_mcu_x          = 8; m_mcu_y          = 8;
+      break;
+    }
+    case H2V1:
+    {
+      m_comp_h_samp[0] = 2; m_comp_v_samp[0] = 1;
+      m_comp_h_samp[1] = 1; m_comp_v_samp[1] = 1;
+      m_comp_h_samp[2] = 1; m_comp_v_samp[2] = 1;
+      m_mcu_x          = 16; m_mcu_y         = 8;
+      break;
+    }
+    case H2V2:
+    {
+      m_comp_h_samp[0] = 2; m_comp_v_samp[0] = 2;
+      m_comp_h_samp[1] = 1; m_comp_v_samp[1] = 1;
+      m_comp_h_samp[2] = 1; m_comp_v_samp[2] = 1;
+      m_mcu_x          = 16; m_mcu_y         = 16;
+    }
+  }
+
+  m_image_x        = p_x_res; m_image_y = p_y_res;
+  m_image_bpp      = src_channels; // used by load_mcu() to choose the source pixel conversion
+  m_image_bpl      = m_image_x * src_channels;
+  m_image_x_mcu    = (m_image_x + m_mcu_x - 1) & (~(m_mcu_x - 1)); // width rounded up to a multiple of the MCU width (8 or 16)
+  m_image_y_mcu    = (m_image_y + m_mcu_y - 1) & (~(m_mcu_y - 1)); // height rounded up to a multiple of the MCU height
+  m_image_bpl_xlt  = m_image_x * m_num_components; // bytes per converted line, without padding
+  m_image_bpl_mcu  = m_image_x_mcu * m_num_components; // bytes per converted line, including padding
+  m_mcus_per_row   = m_image_x_mcu / m_mcu_x;
+
+  if ((m_mcu_lines[0] = static_cast<uint8*>(jpge_malloc(m_image_bpl_mcu * m_mcu_y))) == NULL) return false; // one allocation holds all m_mcu_y lines
+  for (int i = 1; i < m_mcu_y; i++)
+    m_mcu_lines[i] = m_mcu_lines[i-1] + m_image_bpl_mcu; // remaining line pointers index into that single allocation
+
+  compute_quant_table(m_quantization_tables[0], s_std_lum_quant);
+  compute_quant_table(m_quantization_tables[1], m_params.m_no_chroma_discrim_flag ? s_std_lum_quant : s_std_croma_quant); // with the flag set, table 1 is built from the luma base table as well
+
+  m_out_buf_left = JPGE_OUT_BUF_SIZE;
+  m_pOut_buf = m_out_buf;
+
+  if (m_params.m_two_pass_flag)
+  {
+    clear_obj(m_huff_count); // pass 1 accumulates symbol counts here
+    first_pass_init();
+  }
+  else
+  {
+    memcpy(m_huff_bits[0+0], s_dc_lum_bits, 17);    memcpy(m_huff_val [0+0], s_dc_lum_val, DC_LUM_CODES);
+    memcpy(m_huff_bits[2+0], s_ac_lum_bits, 17);    memcpy(m_huff_val [2+0], s_ac_lum_val, AC_LUM_CODES);
+    memcpy(m_huff_bits[0+1], s_dc_chroma_bits, 17); memcpy(m_huff_val [0+1], s_dc_chroma_val, DC_CHROMA_CODES);
+    memcpy(m_huff_bits[2+1], s_ac_chroma_bits, 17); memcpy(m_huff_val [2+1], s_ac_chroma_val, AC_CHROMA_CODES);
+    if (!second_pass_init()) return false;   // in effect, skip over the first pass
+  }
+  return m_all_stream_writes_succeeded;
+}
+
+void jpeg_encoder::load_block_8_8_grey(int x) // copies 8x8 block number x of the single-component MCU lines into m_sample_array, subtracting 128 from each sample
+{
+  uint8 *pSrc;
+  sample_array_t *pDst = m_sample_array;
+  x <<= 3; // block index -> byte offset (8 one-byte samples per block)
+  for (int i = 0; i < 8; i++, pDst += 8)
+  {
+    pSrc = m_mcu_lines[i] + x;
+    pDst[0] = pSrc[0] - 128; pDst[1] = pSrc[1] - 128; pDst[2] = pSrc[2] - 128; pDst[3] = pSrc[3] - 128;
+    pDst[4] = pSrc[4] - 128; pDst[5] = pSrc[5] - 128; pDst[6] = pSrc[6] - 128; pDst[7] = pSrc[7] - 128;
+  }
+}
+
+void jpeg_encoder::load_block_8_8(int x, int y, int c) // copies channel c of 8x8 block (x, y) from the 3-byte-per-pixel MCU lines into m_sample_array, subtracting 128
+{
+  uint8 *pSrc;
+  sample_array_t *pDst = m_sample_array;
+  x = (x * (8 * 3)) + c; // byte offset of channel c in the block's first pixel
+  y <<= 3; // block row -> first line index
+  for (int i = 0; i < 8; i++, pDst += 8)
+  {
+    pSrc = m_mcu_lines[y + i] + x;
+    pDst[0] = pSrc[0 * 3] - 128; pDst[1] = pSrc[1 * 3] - 128; pDst[2] = pSrc[2 * 3] - 128; pDst[3] = pSrc[3 * 3] - 128;
+    pDst[4] = pSrc[4 * 3] - 128; pDst[5] = pSrc[5 * 3] - 128; pDst[6] = pSrc[6 * 3] - 128; pDst[7] = pSrc[7 * 3] - 128;
+  }
+}
+
+void jpeg_encoder::load_block_16_8(int x, int c) // builds one 8x8 block of channel c from a 16x16 pixel area by averaging each 2x2 group of samples, then subtracting 128
+{
+  uint8 *pSrc1, *pSrc2;
+  sample_array_t *pDst = m_sample_array;
+  x = (x * (16 * 3)) + c; // byte offset of channel c in the first pixel of the 16-pixel-wide MCU
+  int a = 0, b = 2; // rounding terms added before the >> 2; they alternate across columns and are swapped after every output row
+  for (int i = 0; i < 16; i += 2, pDst += 8)
+  {
+    pSrc1 = m_mcu_lines[i + 0] + x;
+    pSrc2 = m_mcu_lines[i + 1] + x;
+    pDst[0] = ((pSrc1[ 0 * 3] + pSrc1[ 1 * 3] + pSrc2[ 0 * 3] + pSrc2[ 1 * 3] + a) >> 2) - 128; pDst[1] = ((pSrc1[ 2 * 3] + pSrc1[ 3 * 3] + pSrc2[ 2 * 3] + pSrc2[ 3 * 3] + b) >> 2) - 128;
+    pDst[2] = ((pSrc1[ 4 * 3] + pSrc1[ 5 * 3] + pSrc2[ 4 * 3] + pSrc2[ 5 * 3] + a) >> 2) - 128; pDst[3] = ((pSrc1[ 6 * 3] + pSrc1[ 7 * 3] + pSrc2[ 6 * 3] + pSrc2[ 7 * 3] + b) >> 2) - 128;
+    pDst[4] = ((pSrc1[ 8 * 3] + pSrc1[ 9 * 3] + pSrc2[ 8 * 3] + pSrc2[ 9 * 3] + a) >> 2) - 128; pDst[5] = ((pSrc1[10 * 3] + pSrc1[11 * 3] + pSrc2[10 * 3] + pSrc2[11 * 3] + b) >> 2) - 128;
+    pDst[6] = ((pSrc1[12 * 3] + pSrc1[13 * 3] + pSrc2[12 * 3] + pSrc2[13 * 3] + a) >> 2) - 128; pDst[7] = ((pSrc1[14 * 3] + pSrc1[15 * 3] + pSrc2[14 * 3] + pSrc2[15 * 3] + b) >> 2) - 128;
+    int temp = a; a = b; b = temp;
+  }
+}
+
+void jpeg_encoder::load_block_16_8_8(int x, int c) // builds one 8x8 block of channel c from a 16x8 pixel area by averaging horizontal pairs of samples, then subtracting 128
+{
+  uint8 *pSrc1;
+  sample_array_t *pDst = m_sample_array;
+  x = (x * (16 * 3)) + c; // byte offset of channel c in the first pixel of the 16-pixel-wide MCU
+  for (int i = 0; i < 8; i++, pDst += 8)
+  {
+    pSrc1 = m_mcu_lines[i + 0] + x;
+    pDst[0] = ((pSrc1[ 0 * 3] + pSrc1[ 1 * 3]) >> 1) - 128; pDst[1] = ((pSrc1[ 2 * 3] + pSrc1[ 3 * 3]) >> 1) - 128; // the pair sum is halved with no rounding term added
+    pDst[2] = ((pSrc1[ 4 * 3] + pSrc1[ 5 * 3]) >> 1) - 128; pDst[3] = ((pSrc1[ 6 * 3] + pSrc1[ 7 * 3]) >> 1) - 128;
+    pDst[4] = ((pSrc1[ 8 * 3] + pSrc1[ 9 * 3]) >> 1) - 128; pDst[5] = ((pSrc1[10 * 3] + pSrc1[11 * 3]) >> 1) - 128;
+    pDst[6] = ((pSrc1[12 * 3] + pSrc1[13 * 3]) >> 1) - 128; pDst[7] = ((pSrc1[14 * 3] + pSrc1[15 * 3]) >> 1) - 128;
+  }
+}
+
+void jpeg_encoder::load_quantized_coefficients(int component_num) // quantizes m_sample_array into m_coefficient_array, reading the samples in s_zag order
+{
+  int32 *q = m_quantization_tables[component_num > 0]; // table 0 for component 0, table 1 for every other component
+  int16 *pDst = m_coefficient_array;
+  for (int i = 0; i < 64; i++)
+  {
+    sample_array_t j = m_sample_array[s_zag[i]];
+    if (j < 0)
+    {
+      if ((j = -j + (*q >> 1)) < *q) // work on the magnitude; adding half the divisor rounds to nearest
+        *pDst++ = 0;
+      else
+        *pDst++ = static_cast<int16>(-(j / *q)); // restore the sign after dividing
+    }
+    else
+    {
+      if ((j = j + (*q >> 1)) < *q)
+        *pDst++ = 0;
+      else
+        *pDst++ = static_cast<int16>((j / *q));
+    }
+    q++; // the quantization table is walked linearly alongside the output
+  }
+}
+
+void jpeg_encoder::flush_output_buffer() // writes any buffered bytes to m_pStream and resets the buffer to empty
+{
+  if (m_out_buf_left != JPGE_OUT_BUF_SIZE) // only touch the stream when the buffer holds data
+    m_all_stream_writes_succeeded = m_all_stream_writes_succeeded && m_pStream->put_buf(m_out_buf, JPGE_OUT_BUF_SIZE - m_out_buf_left); // after an earlier failure the write is skipped
+  m_pOut_buf = m_out_buf;
+  m_out_buf_left = JPGE_OUT_BUF_SIZE;
+}
+
+void jpeg_encoder::put_bits(uint bits, uint len) // appends the low len bits of `bits` to the bit accumulator and moves every completed byte to the output buffer
+{
+  m_bit_buffer |= ((uint32)bits << (24 - (m_bits_in += len))); // pending bits are kept left-aligned in a 24-bit window (bits 23..0)
+  while (m_bits_in >= 8)
+  {
+    uint8 c;
+    #define JPGE_PUT_BYTE(c) { *m_pOut_buf++ = (c); if (--m_out_buf_left == 0) flush_output_buffer(); }
+    JPGE_PUT_BYTE(c = (uint8)((m_bit_buffer >> 16) & 0xFF)); // emit the top byte of the window
+    if (c == 0xFF) JPGE_PUT_BYTE(0); // a data byte of 0xFF is followed by a zero byte
+    m_bit_buffer <<= 8;
+    m_bits_in -= 8;
+  }
+}
+
+void jpeg_encoder::code_coefficients_pass_one(int component_num) // statistics pass: counts in m_huff_count the DC and AC symbols that m_coefficient_array would produce; writes no output
+{
+  if (component_num >= 3) return; // just to shut up static analysis
+  int i, run_len, nbits, temp1;
+  int16 *src = m_coefficient_array;
+  uint32 *dc_count = component_num ? m_huff_count[0 + 1] : m_huff_count[0 + 0], *ac_count = component_num ? m_huff_count[2 + 1] : m_huff_count[2 + 0]; // component 0 uses count slots 0/2, all others 1/3
+
+  temp1 = src[0] - m_last_dc_val[component_num]; // DC is coded as the difference from the previous block of the same component
+  m_last_dc_val[component_num] = src[0];
+  if (temp1 < 0) temp1 = -temp1;
+
+  nbits = 0; // number of bits needed for the magnitude (0 when the difference is 0)
+  while (temp1)
+  {
+    nbits++; temp1 >>= 1;
+  }
+
+  dc_count[nbits]++;
+  for (run_len = 0, i = 1; i < 64; i++)
+  {
+    if ((temp1 = m_coefficient_array[i]) == 0)
+      run_len++;
+    else
+    {
+      while (run_len >= 16) // a run of 16 zeros is counted as symbol 0xF0
+      {
+        ac_count[0xF0]++;
+        run_len -= 16;
+      }
+      if (temp1 < 0) temp1 = -temp1;
+      nbits = 1;
+      while (temp1 >>= 1) nbits++;
+      ac_count[(run_len << 4) + nbits]++; // symbol = zero run in the high nibble, magnitude bit count in the low nibble
+      run_len = 0;
+    }
+  }
+  if (run_len) ac_count[0]++; // trailing zeros are counted as symbol 0
+}
+
+void jpeg_encoder::code_coefficients_pass_two(int component_num) // output pass: Huffman-codes m_coefficient_array through put_bits(), producing the same symbols that pass one counts
+{
+  int i, j, run_len, nbits, temp1, temp2;
+  int16 *pSrc = m_coefficient_array;
+  uint *codes[2]; // [0] = DC table, [1] = AC table
+  uint8 *code_sizes[2];
+
+  if (component_num == 0)
+  {
+    codes[0] = m_huff_codes[0 + 0]; codes[1] = m_huff_codes[2 + 0];
+    code_sizes[0] = m_huff_code_sizes[0 + 0]; code_sizes[1] = m_huff_code_sizes[2 + 0];
+  }
+  else
+  {
+    codes[0] = m_huff_codes[0 + 1]; codes[1] = m_huff_codes[2 + 1];
+    code_sizes[0] = m_huff_code_sizes[0 + 1]; code_sizes[1] = m_huff_code_sizes[2 + 1];
+  }
+
+  temp1 = temp2 = pSrc[0] - m_last_dc_val[component_num]; // DC difference from the previous block of this component
+  m_last_dc_val[component_num] = pSrc[0];
+
+  if (temp1 < 0)
+  {
+    temp1 = -temp1; temp2--; // temp1 = magnitude; temp2 - 1 so that its low nbits form the pattern emitted for a negative value
+  }
+
+  nbits = 0;
+  while (temp1)
+  {
+    nbits++; temp1 >>= 1;
+  }
+
+  put_bits(codes[0][nbits], code_sizes[0][nbits]); // Huffman code for the bit count...
+  if (nbits) put_bits(temp2 & ((1 << nbits) - 1), nbits); // ...followed by the nbits value bits
+
+  for (run_len = 0, i = 1; i < 64; i++)
+  {
+    if ((temp1 = m_coefficient_array[i]) == 0)
+      run_len++;
+    else
+    {
+      while (run_len >= 16) // each full run of 16 zeros is sent as symbol 0xF0
+      {
+        put_bits(codes[1][0xF0], code_sizes[1][0xF0]);
+        run_len -= 16;
+      }
+      if ((temp2 = temp1) < 0)
+      {
+        temp1 = -temp1;
+        temp2--;
+      }
+      nbits = 1;
+      while (temp1 >>= 1)
+        nbits++;
+      j = (run_len << 4) + nbits; // symbol = zero run in the high nibble, bit count in the low nibble
+      put_bits(codes[1][j], code_sizes[1][j]);
+      put_bits(temp2 & ((1 << nbits) - 1), nbits);
+      run_len = 0;
+    }
+  }
+  if (run_len)
+    put_bits(codes[1][0], code_sizes[1][0]); // trailing zeros are sent as symbol 0
+}
+
+void jpeg_encoder::code_block(int component_num)
+{
+  // Transform the block currently held in m_sample_array, quantize it into
+  // m_coefficient_array, then hand it to the coder for the active pass:
+  // pass 1 goes to the pass-one coder, any other pass number to the pass-two coder.
+  DCT2D(m_sample_array);
+  load_quantized_coefficients(component_num);
+  if (m_pass_num != 1) code_coefficients_pass_two(component_num); else code_coefficients_pass_one(component_num);
+}
+
+void jpeg_encoder::process_mcu_row() // codes every MCU of the row currently held in m_mcu_lines; the block layout depends on the component count and component 0's sampling factors
+{
+  if (m_num_components == 1) // one 8x8 block per MCU
+  {
+    for (int i = 0; i < m_mcus_per_row; i++)
+    {
+      load_block_8_8_grey(i); code_block(0);
+    }
+  }
+  else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1)) // 1x1: one block per component
+  {
+    for (int i = 0; i < m_mcus_per_row; i++)
+    {
+      load_block_8_8(i, 0, 0); code_block(0); load_block_8_8(i, 0, 1); code_block(1); load_block_8_8(i, 0, 2); code_block(2);
+    }
+  }
+  else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1)) // 2x1: two component-0 blocks side by side, then one horizontally averaged block each for components 1 and 2
+  {
+    for (int i = 0; i < m_mcus_per_row; i++)
+    {
+      load_block_8_8(i * 2 + 0, 0, 0); code_block(0); load_block_8_8(i * 2 + 1, 0, 0); code_block(0);
+      load_block_16_8_8(i, 1); code_block(1); load_block_16_8_8(i, 2); code_block(2);
+    }
+  }
+  else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2)) // 2x2: four component-0 blocks, then one 2x2-averaged block each for components 1 and 2
+  {
+    for (int i = 0; i < m_mcus_per_row; i++)
+    {
+      load_block_8_8(i * 2 + 0, 0, 0); code_block(0); load_block_8_8(i * 2 + 1, 0, 0); code_block(0);
+      load_block_8_8(i * 2 + 0, 1, 0); code_block(0); load_block_8_8(i * 2 + 1, 1, 0); code_block(0);
+      load_block_16_8(i, 1); code_block(1); load_block_16_8(i, 2); code_block(2);
+    }
+  }
+}
+
+bool jpeg_encoder::terminate_pass_one() // turns the counts gathered in pass 1 into optimized tables, then starts pass 2
+{
+  optimize_huffman_table(0+0, DC_LUM_CODES); optimize_huffman_table(2+0, AC_LUM_CODES);
+  if (m_num_components > 1) // table slots 1 and 3 are only built when there is more than one component
+  {
+    optimize_huffman_table(0+1, DC_CHROMA_CODES); optimize_huffman_table(2+1, AC_CHROMA_CODES);
+  }
+  return second_pass_init();
+}
+
+bool jpeg_encoder::terminate_pass_two() // finishes the entropy-coded data and writes the end-of-image marker; always returns true
+{
+  put_bits(0x7F, 7); // seven 1 bits: forces out any partially filled byte (at most 7 bits can be pending after put_bits)
+  flush_output_buffer(); // buffered data must reach the stream before EOI, which emit_marker writes directly
+  emit_marker(M_EOI);
+  m_pass_num++; // purposely bump up m_pass_num, for debugging
+  return true;
+}
+
+bool jpeg_encoder::process_end_of_image() // codes a final, partially filled MCU row (if any) and then ends the current pass
+{
+  if (m_mcu_y_ofs) // some lines were loaded since the last complete MCU row
+  {
+    if (m_mcu_y_ofs < 16) // check here just to shut up static analysis
+    {
+      for (int i = m_mcu_y_ofs; i < m_mcu_y; i++)
+        memcpy(m_mcu_lines[i], m_mcu_lines[m_mcu_y_ofs - 1], m_image_bpl_mcu); // fill the missing lines with copies of the last loaded line
+    }
+
+    process_mcu_row();
+  }
+
+  if (m_pass_num == 1)
+    return terminate_pass_one();
+  else
+    return terminate_pass_two();
+}
+
+void jpeg_encoder::load_mcu(const void *pSrc) // converts one source scanline into the next MCU line, pads it to the MCU width and codes the MCU row once all of its lines are present
+{
+  const uint8* Psrc = reinterpret_cast<const uint8*>(pSrc);
+
+  uint8* pDst = m_mcu_lines[m_mcu_y_ofs]; // OK to write up to m_image_bpl_xlt bytes to pDst
+
+  if (m_num_components == 1) // single-component output: reduce the source to Y
+  {
+    if (m_image_bpp == 4)
+      RGBA_to_Y(pDst, Psrc, m_image_x);
+    else if (m_image_bpp == 3)
+      RGB_to_Y(pDst, Psrc, m_image_x);
+    else
+      memcpy(pDst, Psrc, m_image_x); // any other source format is copied as-is, one byte per pixel
+  }
+  else // three-component output: convert to Y/Cb/Cr
+  {
+    if (m_image_bpp == 4)
+      RGBA_to_YCC(pDst, Psrc, m_image_x);
+    else if (m_image_bpp == 3)
+      RGB_to_YCC(pDst, Psrc, m_image_x);
+    else
+      Y_to_YCC(pDst, Psrc, m_image_x);
+  }
+
+  // Possibly duplicate pixels at end of scanline if not a multiple of 8 or 16
+  if (m_num_components == 1)
+    memset(m_mcu_lines[m_mcu_y_ofs] + m_image_bpl_xlt, pDst[m_image_bpl_xlt - 1], m_image_x_mcu - m_image_x); // repeat the last sample over the padding
+  else
+  {
+    const uint8 y = pDst[m_image_bpl_xlt - 3 + 0], cb = pDst[m_image_bpl_xlt - 3 + 1], cr = pDst[m_image_bpl_xlt - 3 + 2]; // last converted pixel of the line
+    uint8 *q = m_mcu_lines[m_mcu_y_ofs] + m_image_bpl_xlt;
+    for (int i = m_image_x; i < m_image_x_mcu; i++)
+    {
+      *q++ = y; *q++ = cb; *q++ = cr;
+    }
+  }
+
+  if (++m_mcu_y_ofs == m_mcu_y) // a full MCU row has been collected
+  {
+    process_mcu_row();
+    m_mcu_y_ofs = 0;
+  }
+}
+
+void jpeg_encoder::clear() // puts the encoder into its idle state; does not free anything
+{
+  m_mcu_lines[0] = NULL; // no line buffer owned
+  m_pass_num = 0; // process_scanline() rejects calls while the pass number is outside 1..2
+  m_all_stream_writes_succeeded = true;
+}
+
+jpeg_encoder::jpeg_encoder() // constructs an idle encoder; init() must succeed before scanlines are accepted
+{
+  clear();
+}
+
+jpeg_encoder::~jpeg_encoder() // releases the MCU line buffer, if one is allocated
+{
+  deinit();
+}
+
+bool jpeg_encoder::init(output_stream *pStream, int64_t width, int64_t height, int64_t src_channels, const params &comp_params) // validates the arguments, stores the stream and parameters, and opens the encoder via jpg_open(); returns false on invalid input or setup failure
+{
+  deinit(); // drop any state left over from a previous image
+  if (((!pStream) || (width < 1) || (height < 1) || (width > 65535) || (height > 65535)) || ((src_channels != 1) && (src_channels != 3) && (src_channels != 4)) || (!comp_params.check_valid())) return false; // emit_sof() writes each dimension as a 16-bit word, so larger images cannot be represented and are rejected
+  m_pStream = pStream;
+  m_params = comp_params;
+  return jpg_open(static_cast<int>(width), static_cast<int>(height), static_cast<int>(src_channels)); // values are range-checked above, so the narrowing is lossless
+}
+
+void jpeg_encoder::deinit() // Frees the buffer held in m_mcu_lines[0] and returns the encoder to the cleared state.
+{
+  jpge_free(m_mcu_lines[0]);
+  clear(); // sets m_mcu_lines[0] back to NULL, so the freed pointer is not kept
+}
+
+bool jpeg_encoder::process_scanline(const void* pScanline) // Feeds one scanline, or finishes the current pass when pScanline is NULL; returns false on failure.
+{
+  if ((m_pass_num < 1) || (m_pass_num > 2)) return false; // no pass in progress
+  if (m_all_stream_writes_succeeded) // after a failed stream write all further input is ignored
+  {
+    if (!pScanline)
+    {
+      if (!process_end_of_image()) return false;
+    }
+    else
+    {
+      load_mcu(pScanline);
+    }
+  }
+  return m_all_stream_writes_succeeded;
+}
+
+// Higher level wrappers/examples (optional).
+#include <stdio.h>
+
+class cfile_stream : public output_stream // output_stream that writes to a C stdio FILE opened in "wb" mode
+{
+   cfile_stream(const cfile_stream &); // copy operations are private, so outside code cannot copy the stream
+   cfile_stream &operator= (const cfile_stream &);
+
+   FILE* m_pFile; // NULL while no file is open
+   bool m_bStatus; // false until a file is opened, and after any failed write or close
+
+public:
+   cfile_stream() : m_pFile(NULL), m_bStatus(false) { }
+
+   virtual ~cfile_stream()
+   {
+      close();
+   }
+   // Closes any file already open, then opens pFilename for binary writing. Returns true on success.
+   bool open(const char *pFilename)
+   {
+      close();
+#if defined(_MSC_VER)
+      if (fopen_s(&m_pFile, pFilename, "wb") != 0)
+	  {
+		  m_pFile = NULL; // do not return here: fall through so m_bStatus is reset instead of keeping a stale 'true'
+	  }
+#else
+      m_pFile = fopen(pFilename, "wb");
+#endif
+      m_bStatus = (m_pFile != NULL);
+      return m_bStatus;
+   }
+   // Closes the file if one is open and returns the stream status; a failed fclose() clears the status.
+   bool close()
+   {
+      if (m_pFile)
+      {
+         if (fclose(m_pFile) == EOF)
+         {
+            m_bStatus = false;
+         }
+         m_pFile = NULL;
+      }
+      return m_bStatus;
+   }
+   // Writes len bytes as a single fwrite item; once the status is false no further writes are attempted.
+   virtual bool put_buf(const void* pBuf, int64_t len)
+   {
+      m_bStatus = m_bStatus && (fwrite(pBuf, len, 1, m_pFile) == 1);
+      return m_bStatus;
+   }
+   // Returns the current ftell() position of the open file, or 0 when no file is open.
+   uint get_size() const
+   {
+      return m_pFile ? ftell(m_pFile) : 0;
+   }
+};
+
+// Writes JPEG image to file. pImage_data is read as height rows of width * num_channels bytes each; returns false on any failure.
+bool compress_image_to_jpeg_file(const char *pFilename, int64_t width, int64_t height, int64_t num_channels, const uint8 *pImage_data, const params &comp_params)
+{
+  cfile_stream dst_stream;
+  if (!dst_stream.open(pFilename))
+    return false;
+
+  jpge::jpeg_encoder dst_image;
+  if (!dst_image.init(&dst_stream, width, height, num_channels, comp_params))
+    return false;
+  // Every pass is fed the complete image, followed by a NULL scanline that ends the pass.
+  for (uint pass_index = 0; pass_index < dst_image.get_total_passes(); pass_index++)
+  {
+    for (int64_t i = 0; i < height; i++)
+    {
+		// i, width, and num_channels are all 64bit, so the row offset below is computed in 64-bit arithmetic
+       const uint8* pBuf = pImage_data + i * width * num_channels;
+       if (!dst_image.process_scanline(pBuf))
+          return false;
+    }
+    if (!dst_image.process_scanline(NULL))
+       return false;
+  }
+
+  dst_image.deinit();
+
+  return dst_stream.close(); // close() returns the stream status, so a failed write or fclose() is reported here
+}
+
+class memory_stream : public output_stream // output_stream that appends into a caller-supplied, fixed-size buffer
+{
+   memory_stream(const memory_stream &); // copy operations are private, so outside code cannot copy the stream
+   memory_stream &operator= (const memory_stream &);
+
+   uint8 *m_pBuf; // destination buffer; this class never allocates or frees it
+   uint64_t m_buf_size, m_buf_ofs; // capacity in bytes, and number of bytes written so far
+
+public:
+   memory_stream(void *pBuf, uint64_t buf_size) : m_pBuf(static_cast<uint8*>(pBuf)), m_buf_size(buf_size), m_buf_ofs(0) { }
+
+   virtual ~memory_stream() { }
+   // Appends len bytes; returns false and writes nothing when they do not fit in the remaining space.
+   virtual bool put_buf(const void* pBuf, int64_t len)
+   {
+      uint64_t buf_remaining = m_buf_size - m_buf_ofs;
+      if ((uint64_t)len > buf_remaining) // a negative len converts to a very large value and is rejected here too
+         return false;
+      memcpy(m_pBuf + m_buf_ofs, pBuf, len);
+      m_buf_ofs += len;
+      return true;
+   }
+   // Returns the number of bytes written so far.
+   uint64_t get_size() const
+   {
+      return m_buf_ofs;
+   }
+};
+
+bool compress_image_to_jpeg_file_in_memory(void *pDstBuf, int64_t &buf_size, int64_t width, int64_t height, int64_t num_channels, const uint8 *pImage_data, const params &comp_params) // Encodes into pDstBuf. buf_size: in = capacity in bytes, out = bytes written.
+{
+   if ((!pDstBuf) || (buf_size < 1)) // also reject negative sizes: converted to uint64_t they would look like a huge capacity
+      return false;
+
+   memory_stream dst_stream(pDstBuf, buf_size);
+
+   buf_size = 0; // stays 0 if encoding fails from here on
+
+   jpge::jpeg_encoder dst_image;
+   if (!dst_image.init(&dst_stream, width, height, num_channels, comp_params))
+      return false;
+   // Every pass is fed the complete image, followed by a NULL scanline that ends the pass.
+   for (uint pass_index = 0; pass_index < dst_image.get_total_passes(); pass_index++)
+   {
+     for (int64_t i = 0; i < height; i++)
+     {
+        const uint8* pScanline = pImage_data + i * width * num_channels;
+        if (!dst_image.process_scanline(pScanline))
+           return false;
+     }
+     if (!dst_image.process_scanline(NULL))
+        return false;
+   }
+
+   dst_image.deinit();
+
+   buf_size = dst_stream.get_size();
+   return true;
+}
+
+} // namespace jpge
\ No newline at end of file
diff --git a/crazy_functions/test_project/cpp/longcode/prod_cons.h b/crazy_functions/test_project/cpp/longcode/prod_cons.h
new file mode 100644
index 0000000..28d99bd
--- /dev/null
+++ b/crazy_functions/test_project/cpp/longcode/prod_cons.h
@@ -0,0 +1,433 @@
+#pragma once
+
+#include <atomic>
+#include <utility>
+#include <cstring>
+#include <type_traits>
+#include <cstdint>
+
+#include "libipc/def.h"
+
+#include "libipc/platform/detail.h"
+#include "libipc/circ/elem_def.h"
+#include "libipc/utility/log.h"
+#include "libipc/utility/utility.h"
+
+namespace ipc {
+
+////////////////////////////////////////////////////////////////
+/// producer-consumer implementation
+////////////////////////////////////////////////////////////////
+
+template <typename Flag>
+struct prod_cons_impl;
+
+template <>
+struct prod_cons_impl<wr<relat::single, relat::single, trans::unicast>> {
+    // Queue policy for the single-single-unicast flag: push() advances wt_, pop() advances rd_.
+    template <std::size_t DataSize, std::size_t AlignSize>
+    struct elem_t {
+        std::aligned_storage_t<DataSize, AlignSize> data_ {};
+    };
+
+    alignas(cache_line_size) std::atomic<circ::u2_t> rd_; // read index
+    alignas(cache_line_size) std::atomic<circ::u2_t> wt_; // write index
+    // pop() ignores its cursor argument in this policy, so a constant is returned.
+    constexpr circ::u2_t cursor() const noexcept {
+        return 0;
+    }
+    // Invokes f on the slot at the write index and then advances wt_; returns false when the queue is full.
+    template <typename W, typename F, typename E>
+    bool push(W* /*wrapper*/, F&& f, E* elems) {
+        auto cur_wt = circ::index_of(wt_.load(std::memory_order_relaxed));
+        if (cur_wt == circ::index_of(rd_.load(std::memory_order_acquire) - 1)) {
+            return false; // full
+        }
+        std::forward<F>(f)(&(elems[cur_wt].data_));
+        wt_.fetch_add(1, std::memory_order_release); // advance only after f has filled the slot
+        return true;
+    }
+
+    /**
+     * In single-single-unicast, 'force_push' means 'no reader' or 'the only one reader is dead'.
+     * So we could just disconnect all connections of receiver, and return false.
+    */
+    template <typename W, typename F, typename E>
+    bool force_push(W* wrapper, F&&, E*) {
+        wrapper->elems()->disconnect_receiver(~static_cast<circ::cc_t>(0u)); // all bits set: every receiver
+        return false;
+    }
+    // Invokes f on the slot at the read index, calls out(true), then advances rd_; returns false when empty.
+    template <typename W, typename F, typename R, typename E>
+    bool pop(W* /*wrapper*/, circ::u2_t& /*cur*/, F&& f, R&& out, E* elems) {
+        auto cur_rd = circ::index_of(rd_.load(std::memory_order_relaxed));
+        if (cur_rd == circ::index_of(wt_.load(std::memory_order_acquire))) {
+            return false; // empty
+        }
+        std::forward<F>(f)(&(elems[cur_rd].data_));
+        std::forward<R>(out)(true);
+        rd_.fetch_add(1, std::memory_order_release); // advance only after f has consumed the slot
+        return true;
+    }
+};
+
+template <>
+struct prod_cons_impl<wr<relat::single, relat::multi , trans::unicast>>
+     : prod_cons_impl<wr<relat::single, relat::single, trans::unicast>> {
+    // Inherits push() from the single-single policy; pop() claims a slot with a compare-exchange on rd_.
+    template <typename W, typename F, typename E>
+    bool force_push(W* wrapper, F&&, E*) {
+        wrapper->elems()->disconnect_receiver(1); // NOTE(review): argument 1 presumably selects one receiver bit; confirm against disconnect_receiver
+        return false;
+    }
+    // Copies the slot at rd_ into a local buffer, then tries to claim it by advancing rd_; f sees the copy only if the claim succeeds.
+    template <typename W, typename F, typename R, 
+              template <std::size_t, std::size_t> class E, std::size_t DS, std::size_t AS>
+    bool pop(W* /*wrapper*/, circ::u2_t& /*cur*/, F&& f, R&& out, E<DS, AS>* elems) {
+        byte_t buff[DS];
+        for (unsigned k = 0;;) {
+            auto cur_rd = rd_.load(std::memory_order_relaxed);
+            if (circ::index_of(cur_rd) ==
+                circ::index_of(wt_.load(std::memory_order_acquire))) {
+                return false; // empty
+            }
+            std::memcpy(buff, &(elems[circ::index_of(cur_rd)].data_), sizeof(buff)); // copy first: the copy is discarded if the exchange below fails
+            if (rd_.compare_exchange_weak(cur_rd, cur_rd + 1, std::memory_order_release)) {
+                std::forward<F>(f)(buff);
+                std::forward<R>(out)(true);
+                return true;
+            }
+            ipc::yield(k); // exchange failed: back off and retry from the current rd_
+        }
+    }
+};
+
+template <>
+struct prod_cons_impl<wr<relat::multi , relat::multi, trans::unicast>>
+     : prod_cons_impl<wr<relat::single, relat::multi, trans::unicast>> {
+    // Producers reserve slots through ct_, mark them committed through f_ct_, and wt_ is advanced over committed slots.
+    using flag_t = std::uint64_t;
+
+    template <std::size_t DataSize, std::size_t AlignSize>
+    struct elem_t {
+        std::aligned_storage_t<DataSize, AlignSize> data_ {};
+        std::atomic<flag_t> f_ct_ { 0 }; // commit flag
+    };
+
+    alignas(cache_line_size) std::atomic<circ::u2_t> ct_; // commit index
+    // Reserves a slot by advancing ct_, fills it with f, publishes ~cur_ct in its flag, then tries to advance wt_.
+    template <typename W, typename F, typename E>
+    bool push(W* /*wrapper*/, F&& f, E* elems) {
+        circ::u2_t cur_ct, nxt_ct;
+        for (unsigned k = 0;;) {
+            cur_ct = ct_.load(std::memory_order_relaxed);
+            if (circ::index_of(nxt_ct = cur_ct + 1) ==
+                circ::index_of(rd_.load(std::memory_order_acquire))) {
+                return false; // full
+            }
+            if (ct_.compare_exchange_weak(cur_ct, nxt_ct, std::memory_order_acq_rel)) {
+                break; // slot cur_ct is now reserved for this call
+            }
+            ipc::yield(k);
+        }
+        auto* el = elems + circ::index_of(cur_ct);
+        std::forward<F>(f)(&(el->data_));
+        // set flag & try update wt
+        el->f_ct_.store(~static_cast<flag_t>(cur_ct), std::memory_order_release);
+        while (1) {
+            auto cac_ct = el->f_ct_.load(std::memory_order_acquire);
+            if (cur_ct != wt_.load(std::memory_order_relaxed)) {
+                return true; // wt_ is not at this slot, so this call does not advance it
+            }
+            if ((~cac_ct) != cur_ct) {
+                return true; // the flag does not mark slot cur_ct as committed
+            }
+            if (!el->f_ct_.compare_exchange_strong(cac_ct, 0, std::memory_order_relaxed)) {
+                return true; // the flag changed concurrently
+            }
+            wt_.store(nxt_ct, std::memory_order_release);
+            cur_ct = nxt_ct; // continue with the following slot and repeat the same checks
+            nxt_ct = cur_ct + 1;
+            el = elems + circ::index_of(cur_ct);
+        }
+        return true; // not reached: every exit from the loop above is a return
+    }
+
+    template <typename W, typename F, typename E>
+    bool force_push(W* wrapper, F&&, E*) {
+        wrapper->elems()->disconnect_receiver(1); // NOTE(review): argument 1 presumably selects one receiver bit; confirm against disconnect_receiver
+        return false;
+    }
+    // Like the single-multi pop(), but when rd_ has caught up with wt_ it first tries to advance wt_ over a committed slot.
+    template <typename W, typename F, typename R, 
+              template <std::size_t, std::size_t> class E, std::size_t DS, std::size_t AS>
+    bool pop(W* /*wrapper*/, circ::u2_t& /*cur*/, F&& f, R&& out, E<DS, AS>* elems) {
+        byte_t buff[DS];
+        for (unsigned k = 0;;) {
+            auto cur_rd = rd_.load(std::memory_order_relaxed);
+            auto cur_wt = wt_.load(std::memory_order_acquire);
+            auto id_rd  = circ::index_of(cur_rd);
+            auto id_wt  = circ::index_of(cur_wt);
+            if (id_rd == id_wt) {
+                auto* el = elems + id_wt;
+                auto cac_ct = el->f_ct_.load(std::memory_order_acquire);
+                if ((~cac_ct) != cur_wt) {
+                    return false; // empty
+                }
+                if (el->f_ct_.compare_exchange_weak(cac_ct, 0, std::memory_order_relaxed)) {
+                    wt_.store(cur_wt + 1, std::memory_order_release); // slot at wt_ was committed: advance wt_ over it
+                }
+                k = 0; // reset the yield counter and re-read both indexes
+            }
+            else {
+                std::memcpy(buff, &(elems[circ::index_of(cur_rd)].data_), sizeof(buff)); // copy first: the copy is discarded if the exchange below fails
+                if (rd_.compare_exchange_weak(cur_rd, cur_rd + 1, std::memory_order_release)) {
+                    std::forward<F>(f)(buff);
+                    std::forward<R>(out)(true);
+                    return true;
+                }
+                ipc::yield(k);
+            }
+        }
+    }
+};
+
+template <>
+struct prod_cons_impl<wr<relat::single, relat::multi, trans::broadcast>> {
+    // Each element's rc_ packs the not-yet-read connection bits (low 32 bits) together with the writer's epoch (high 32 bits).
+    using rc_t = std::uint64_t;
+
+    enum : rc_t {
+        ep_mask = 0x00000000ffffffffull, // selects the connection bits; ~ep_mask selects the epoch
+        ep_incr = 0x0000000100000000ull  // one epoch step
+    };
+
+    template <std::size_t DataSize, std::size_t AlignSize>
+    struct elem_t {
+        std::aligned_storage_t<DataSize, AlignSize> data_ {};
+        std::atomic<rc_t> rc_ { 0 }; // read-counter
+    };
+
+    alignas(cache_line_size) std::atomic<circ::u2_t> wt_;   // write index
+    alignas(cache_line_size) rc_t epoch_ { 0 };             // only one writer
+    // Returns the current write index; pop() compares a reader's own cursor against it.
+    circ::u2_t cursor() const noexcept {
+        return wt_.load(std::memory_order_acquire);
+    }
+    // Stamps the slot at wt_ with the current connections and epoch, fills it with f, then advances wt_.
+    template <typename W, typename F, typename E>
+    bool push(W* wrapper, F&& f, E* elems) {
+        E* el;
+        for (unsigned k = 0;;) {
+            circ::cc_t cc = wrapper->elems()->connections(std::memory_order_relaxed);
+            if (cc == 0) return false; // no reader
+            el = elems + circ::index_of(wt_.load(std::memory_order_relaxed));
+            // check all consumers have finished reading this element
+            auto cur_rc = el->rc_.load(std::memory_order_acquire);
+            circ::cc_t rem_cc = cur_rc & ep_mask;
+            if ((cc & rem_cc) && ((cur_rc & ~ep_mask) == epoch_)) {
+                return false; // has not finished yet
+            }
+            // consider rem_cc to be 0 here
+            if (el->rc_.compare_exchange_weak(
+                        cur_rc, epoch_ | static_cast<rc_t>(cc), std::memory_order_release)) {
+                break;
+            }
+            ipc::yield(k);
+        }
+        std::forward<F>(f)(&(el->data_));
+        wt_.fetch_add(1, std::memory_order_release);
+        return true;
+    }
+    // Same as push(), but starts a new epoch and disconnects readers still marked on the slot instead of failing.
+    template <typename W, typename F, typename E>
+    bool force_push(W* wrapper, F&& f, E* elems) {
+        E* el;
+        epoch_ += ep_incr;
+        for (unsigned k = 0;;) {
+            circ::cc_t cc = wrapper->elems()->connections(std::memory_order_relaxed);
+            if (cc == 0) return false; // no reader
+            el = elems + circ::index_of(wt_.load(std::memory_order_relaxed));
+            // check all consumers have finished reading this element
+            auto cur_rc = el->rc_.load(std::memory_order_acquire);
+            circ::cc_t rem_cc = cur_rc & ep_mask;
+            if (cc & rem_cc) {
+                ipc::log("force_push: k = %u, cc = %u, rem_cc = %u\n", k, cc, rem_cc);
+                cc = wrapper->elems()->disconnect_receiver(rem_cc); // disconnect all invalid readers
+                if (cc == 0) return false; // no reader
+            }
+            // just compare & exchange
+            if (el->rc_.compare_exchange_weak(
+                        cur_rc, epoch_ | static_cast<rc_t>(cc), std::memory_order_release)) {
+                break;
+            }
+            ipc::yield(k);
+        }
+        std::forward<F>(f)(&(el->data_));
+        wt_.fetch_add(1, std::memory_order_release);
+        return true;
+    }
+    // Reads the slot at the caller's cursor, advances the cursor, then clears this reader's bits from rc_; out() is told whether no reader bits remain.
+    template <typename W, typename F, typename R, typename E>
+    bool pop(W* wrapper, circ::u2_t& cur, F&& f, R&& out, E* elems) {
+        if (cur == cursor()) return false; // acquire
+        auto* el = elems + circ::index_of(cur++);
+        std::forward<F>(f)(&(el->data_));
+        for (unsigned k = 0;;) {
+            auto cur_rc = el->rc_.load(std::memory_order_acquire);
+            if ((cur_rc & ep_mask) == 0) {
+                std::forward<R>(out)(true); // no reader bits are set on this slot any more
+                return true;
+            }
+            auto nxt_rc = cur_rc & ~static_cast<rc_t>(wrapper->connected_id());
+            if (el->rc_.compare_exchange_weak(cur_rc, nxt_rc, std::memory_order_release)) {
+                std::forward<R>(out)((nxt_rc & ep_mask) == 0);
+                return true;
+            }
+            ipc::yield(k);
+        }
+    }
+};
+
+template <>
+struct prod_cons_impl<wr<relat::multi, relat::multi, trans::broadcast>> {
+    // Each element's rc_ packs three fields: connection bits (low 32), a counter (bits 32..55) and the epoch (top 8 bits).
+    using rc_t   = std::uint64_t;
+    using flag_t = std::uint64_t;
+
+    enum : rc_t {
+        rc_mask = 0x00000000ffffffffull, // connection bits
+        ep_mask = 0x00ffffffffffffffull, // everything except the epoch; ~ep_mask selects the epoch
+        ep_incr = 0x0100000000000000ull, // one epoch step
+        ic_mask = 0xff000000ffffffffull, // everything except the counter; ~ic_mask selects the counter
+        ic_incr = 0x0000000100000000ull  // one counter step
+    };
+
+    template <std::size_t DataSize, std::size_t AlignSize>
+    struct elem_t {
+        std::aligned_storage_t<DataSize, AlignSize> data_ {};
+        std::atomic<rc_t  > rc_   { 0 }; // read-counter
+        std::atomic<flag_t> f_ct_ { 0 }; // commit flag
+    };
+
+    alignas(cache_line_size) std::atomic<circ::u2_t> ct_;   // commit index
+    alignas(cache_line_size) std::atomic<rc_t> epoch_ { 0 };
+    // Returns the current commit index.
+    circ::u2_t cursor() const noexcept {
+        return ct_.load(std::memory_order_acquire);
+    }
+    // Returns rc with its counter field incremented (wrapping inside the field) and all other bits unchanged.
+    constexpr static rc_t inc_rc(rc_t rc) noexcept {
+        return (rc & ic_mask) | ((rc + ic_incr) & ~ic_mask);
+    }
+    // Same as inc_rc(), but with the connection bits cleared.
+    constexpr static rc_t inc_mask(rc_t rc) noexcept {
+        return inc_rc(rc) & ~rc_mask;
+    }
+    // Claims the slot at ct_ by exchanging its rc_, advances ct_, fills the slot with f and publishes its commit flag.
+    template <typename W, typename F, typename E>
+    bool push(W* wrapper, F&& f, E* elems) {
+        E* el;
+        circ::u2_t cur_ct;
+        rc_t epoch = epoch_.load(std::memory_order_acquire);
+        for (unsigned k = 0;;) {
+            circ::cc_t cc = wrapper->elems()->connections(std::memory_order_relaxed);
+            if (cc == 0) return false; // no reader
+            el = elems + circ::index_of(cur_ct = ct_.load(std::memory_order_relaxed));
+            // check all consumers have finished reading this element
+            auto cur_rc = el->rc_.load(std::memory_order_relaxed);
+            circ::cc_t rem_cc = cur_rc & rc_mask;
+            if ((cc & rem_cc) && ((cur_rc & ~ep_mask) == epoch)) {
+                return false; // has not finished yet
+            }
+            else if (!rem_cc) {
+                auto cur_fl = el->f_ct_.load(std::memory_order_acquire);
+                if ((cur_fl != cur_ct) && cur_fl) {
+                    return false; // full
+                }
+            }
+            // consider rem_cc to be 0 here
+            if (el->rc_.compare_exchange_weak(
+                        cur_rc, inc_mask(epoch | (cur_rc & ep_mask)) | static_cast<rc_t>(cc), std::memory_order_relaxed) &&
+                epoch_.compare_exchange_weak(epoch, epoch, std::memory_order_acq_rel)) { // second exchange succeeds only while epoch_ still equals the local epoch; on failure 'epoch' is refreshed
+                break;
+            }
+            ipc::yield(k);
+        }
+        // only one thread/process would touch here at one time
+        ct_.store(cur_ct + 1, std::memory_order_release);
+        std::forward<F>(f)(&(el->data_));
+        // publish the commit flag (~cur_ct) that pop() compares against
+        el->f_ct_.store(~static_cast<flag_t>(cur_ct), std::memory_order_release);
+        return true;
+    }
+    // Like push(), but bumps the epoch and disconnects readers still marked on the slot; falls back to push() if the epoch moved meanwhile.
+    template <typename W, typename F, typename E>
+    bool force_push(W* wrapper, F&& f, E* elems) {
+        E* el;
+        circ::u2_t cur_ct;
+        rc_t epoch = epoch_.fetch_add(ep_incr, std::memory_order_release) + ep_incr;
+        for (unsigned k = 0;;) {
+            circ::cc_t cc = wrapper->elems()->connections(std::memory_order_relaxed);
+            if (cc == 0) return false; // no reader
+            el = elems + circ::index_of(cur_ct = ct_.load(std::memory_order_relaxed));
+            // check all consumers have finished reading this element
+            auto cur_rc = el->rc_.load(std::memory_order_acquire);
+            circ::cc_t rem_cc = cur_rc & rc_mask;
+            if (cc & rem_cc) {
+                ipc::log("force_push: k = %u, cc = %u, rem_cc = %u\n", k, cc, rem_cc);
+                cc = wrapper->elems()->disconnect_receiver(rem_cc); // disconnect all invalid readers
+                if (cc == 0) return false; // no reader
+            }
+            // just compare & exchange
+            if (el->rc_.compare_exchange_weak(
+                        cur_rc, inc_mask(epoch | (cur_rc & ep_mask)) | static_cast<rc_t>(cc), std::memory_order_relaxed)) {
+                if (epoch == epoch_.load(std::memory_order_acquire)) {
+                    break;
+                }
+                else if (push(wrapper, std::forward<F>(f), elems)) { // epoch_ changed since it was read: try an ordinary push
+                    return true;
+                }
+                epoch = epoch_.fetch_add(ep_incr, std::memory_order_release) + ep_incr;
+            }
+            ipc::yield(k);
+        }
+        // only one thread/process would touch here at one time
+        ct_.store(cur_ct + 1, std::memory_order_release);
+        std::forward<F>(f)(&(el->data_));
+        // publish the commit flag (~cur_ct) that pop() compares against
+        el->f_ct_.store(~static_cast<flag_t>(cur_ct), std::memory_order_release);
+        return true;
+    }
+    // Reads the slot at the caller's cursor if its commit flag equals ~cur, then clears this reader's bits from rc_; the last reader rewrites the flag.
+    template <typename W, typename F, typename R, typename E, std::size_t N>
+    bool pop(W* wrapper, circ::u2_t& cur, F&& f, R&& out, E(& elems)[N]) {
+        auto* el = elems + circ::index_of(cur);
+        auto cur_fl = el->f_ct_.load(std::memory_order_acquire);
+        if (cur_fl != ~static_cast<flag_t>(cur)) {
+            return false; // empty
+        }
+        ++cur;
+        std::forward<F>(f)(&(el->data_));
+        for (unsigned k = 0;;) {
+            auto cur_rc = el->rc_.load(std::memory_order_acquire);
+            if ((cur_rc & rc_mask) == 0) {
+                std::forward<R>(out)(true);
+                el->f_ct_.store(cur + N - 1, std::memory_order_release); // cur was already incremented, so this stores (old cur) + N
+                return true;
+            }
+            auto nxt_rc = inc_rc(cur_rc) & ~static_cast<rc_t>(wrapper->connected_id());
+            bool last_one = false;
+            if ((last_one = (nxt_rc & rc_mask) == 0)) {
+                el->f_ct_.store(cur + N - 1, std::memory_order_release); // stored before the exchange below, so it also happens when that exchange fails and the loop retries
+            }
+            if (el->rc_.compare_exchange_weak(cur_rc, nxt_rc, std::memory_order_release)) {
+                std::forward<R>(out)(last_one);
+                return true;
+            }
+            ipc::yield(k);
+        }
+    }
+};
+
+} // namespace ipc
diff --git a/crazy_functions/高级功能函数模板.py b/crazy_functions/高级功能函数模板.py
index ec14e74..b5c84b9 100644
--- a/crazy_functions/高级功能函数模板.py
+++ b/crazy_functions/高级功能函数模板.py
@@ -11,7 +11,7 @@ def 高阶功能模板函数(txt, top_p, temperature, chatbot, history, systemPr
     for i in range(5):
         currentMonth = (datetime.date.today() + datetime.timedelta(days=i)).month
         currentDay = (datetime.date.today() + datetime.timedelta(days=i)).day
-        i_say = f'历史中哪些事件发生在{currentMonth}月{currentDay}日？列举两条并发送相关图片。发送图片时，请使用Markdown，将Unsplash API中的PUT_YOUR_QUERY_HERE替换成描述改事件的三个最重要的单词。'
+        i_say = f'历史中哪些事件发生在{currentMonth}月{currentDay}日？列举两条并发送相关图片。发送图片时，请使用Markdown，将Unsplash API中的PUT_YOUR_QUERY_HERE替换成描述该事件的一个最重要的单词。'
         chatbot.append((i_say, "[Local Message] waiting gpt response."))
         yield chatbot, history, '正常'  # 由于请求gpt需要一段时间，我们先及时地做一次状态显示
 
diff --git a/functional_crazy.py b/functional_crazy.py
index af4c83e..2f91a32 100644
--- a/functional_crazy.py
+++ b/functional_crazy.py
@@ -19,10 +19,10 @@ def get_crazy_functionals():
 
     function_plugins = {
         "请解析并解构此项目本身": {
-            # HotReload 的意思是热更新，修改函数插件后，不需要重启程序，代码直接生效
+            "AsButton": False,  # 加入下拉菜单中
             "Function": 解析项目本身
         },
-        "解析整个py项目": {
+        "解析整个Py项目": {
             "Color": "stop",    # 按钮颜色
             "Function": 解析一个Python项目
         },
@@ -32,9 +32,10 @@ def get_crazy_functionals():
         },
         "解析整个C++项目": {
             "Color": "stop",    # 按钮颜色
+            "AsButton": False,  # 加入下拉菜单中
             "Function": 解析一个C项目
         },
-        "读tex论文写摘要": {
+        "读Tex论文写摘要": {
             "Color": "stop",    # 按钮颜色
             "Function": 读文章写摘要
         },
@@ -52,7 +53,7 @@ def get_crazy_functionals():
         },
     }
 
-    # VisibleLevel=1 经过测试，但功能未达到理想状态
+    # VisibleLevel=1 经过测试，但功能上距离达到完美状态还差一点点
     if UserVisibleLevel >= 1:
         from crazy_functions.批量总结PDF文档 import 批量总结PDF文档
         from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
@@ -60,11 +61,11 @@ def get_crazy_functionals():
         function_plugins.update({
             "[仅供开发调试] 批量总结PDF文档": {
                 "Color": "stop",
-                # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
-                "Function": HotReload(批量总结PDF文档)
+                "Function": HotReload(批量总结PDF文档) # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
             },
             "[仅供开发调试] 批量总结PDF文档pdfminer": {
                 "Color": "stop",
+                "AsButton": False,  # 加入下拉菜单中
                 "Function": HotReload(批量总结PDF文档pdfminer)
             },
             "[仅供开发调试] 批量总结Word文档": {
diff --git a/main.py b/main.py
index 10bbddc..cdcfaa4 100644
--- a/main.py
+++ b/main.py
@@ -4,9 +4,8 @@ from predict import predict
 from toolbox import format_io, find_free_port, on_file_uploaded, on_report_generated, get_conf
 
 # 建议您复制一个config_private.py放自己的秘密, 如API和代理网址, 避免不小心传github被别人看到
-proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION = \
-    get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION')
-
+proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT = \
+    get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT')
 
 # 如果WEB_PORT是-1, 则随机选取WEB端口
 PORT = find_free_port() if WEB_PORT <= 0 else WEB_PORT
@@ -17,18 +16,18 @@ title_html = """<h1 align="center">ChatGPT 学术优化</h1>"""
 
 # 问询记录, python 版本建议3.9+（越新越好）
 import logging
-os.makedirs('gpt_log', exist_ok=True)
-try:logging.basicConfig(filename='gpt_log/chat_secrets.log', level=logging.INFO, encoding='utf-8')
-except:logging.basicConfig(filename='gpt_log/chat_secrets.log', level=logging.INFO)
-print('所有问询记录将自动保存在本地目录./gpt_log/chat_secrets.log, 请注意自我隐私保护哦！')
+os.makedirs("gpt_log", exist_ok=True)
+try:logging.basicConfig(filename="gpt_log/chat_secrets.log", level=logging.INFO, encoding="utf-8")
+except:logging.basicConfig(filename="gpt_log/chat_secrets.log", level=logging.INFO)
+print("所有问询记录将自动保存在本地目录./gpt_log/chat_secrets.log, 请注意自我隐私保护哦！")
 
 # 一些普通功能模块
 from functional import get_functionals
 functional = get_functionals()
 
-# 对一些丧心病狂的实验性功能模块进行测试
+# 高级函数插件
 from functional_crazy import get_crazy_functionals
-crazy_functional = get_crazy_functionals()
+crazy_fns = get_crazy_functionals()
 
 # 处理markdown文本格式的转变
 gr.Chatbot.postprocess = format_io
@@ -40,11 +39,10 @@ set_theme = adjust_theme()
 cancel_handles = []
 with gr.Blocks(theme=set_theme, analytics_enabled=False) as demo:
     gr.HTML(title_html)
-    with gr.Row():
+    with gr.Row().style(equal_height=True):
         with gr.Column(scale=2):
             chatbot = gr.Chatbot()
-            chatbot.style(height=1150)
-            chatbot.style()
+            chatbot.style(height=CHATBOT_HEIGHT)
             history = gr.State([])
         with gr.Column(scale=1):
             with gr.Row():
@@ -66,49 +64,70 @@ with gr.Blocks(theme=set_theme, analytics_enabled=False) as demo:
                 with gr.Row():
                     gr.Markdown("注意：以下“红颜色”标识的函数插件需从input区读取路径作为参数.")
                 with gr.Row():
-                    for k in crazy_functional:
-                        variant = crazy_functional[k]["Color"] if "Color" in crazy_functional[k] else "secondary"
-                        crazy_functional[k]["Button"] = gr.Button(k, variant=variant)
+                    for k in crazy_fns:
+                        if not crazy_fns[k].get("AsButton", True): continue
+                        variant = crazy_fns[k]["Color"] if "Color" in crazy_fns[k] else "secondary"
+                        crazy_fns[k]["Button"] = gr.Button(k, variant=variant)
                 with gr.Row():
-                    with gr.Accordion("展开“文件上传区”。上传本地文件供“红颜色”的函数插件调用。", open=False):
-                        file_upload = gr.Files(label='任何文件, 但推荐上传压缩文件(zip, tar)', file_count="multiple")
+                    with gr.Accordion("更多函数插件", open=True):
+                        dropdown_fn_list = [k for k in crazy_fns.keys() if not crazy_fns[k].get("AsButton", True)]
+                        with gr.Column(scale=1):
+                            dropdown = gr.Dropdown(dropdown_fn_list, value=r"打开插件列表", label="").style(container=False)
+                        with gr.Column(scale=1):
+                            switchy_bt = gr.Button(r"请先从插件列表中选择", variant="secondary")
+                with gr.Row():
+                    with gr.Accordion("点击展开“文件上传区”。上传本地文件可供红色函数插件调用。", open=False) as area_file_up:
+                        file_upload = gr.Files(label="任何文件, 但推荐上传压缩文件(zip, tar)", file_count="multiple")
             with gr.Accordion("展开SysPrompt & GPT参数 & 交互界面布局", open=False):
                 system_prompt = gr.Textbox(show_label=True, placeholder=f"System Prompt", label="System prompt", value=initial_prompt)
                 top_p = gr.Slider(minimum=-0, maximum=1.0, value=1.0, step=0.01,interactive=True, label="Top-p (nucleus sampling)",)
                 temperature = gr.Slider(minimum=-0, maximum=2.0, value=1.0, step=0.01, interactive=True, label="Temperature",)
-                checkboxes = gr.CheckboxGroup(["基础功能区", "函数插件区"], 
-                                        value=["基础功能区", "函数插件区"], label="显示哪些功能区")
+                checkboxes = gr.CheckboxGroup(["基础功能区", "函数插件区"], value=["基础功能区", "函数插件区"], label="显示/隐藏功能区")
 
-    def what_is_this(a):
+    # 功能区显示开关与功能区的互动
+    def fn_area_visibility(a):
         ret = {}
-        # if area_basic_fn.visible != ("基础功能区" in a): 
-        ret.update({area_basic_fn: gr.update(visible=("基础功能区" in a))}) 
-        # if area_crazy_fn.visible != ("函数插件区" in a): 
-        ret.update({area_crazy_fn: gr.update(visible=("函数插件区" in a))}) 
+        ret.update({area_basic_fn: gr.update(visible=("基础功能区" in a))})
+        ret.update({area_crazy_fn: gr.update(visible=("函数插件区" in a))})
         return ret
-
-    checkboxes.select(what_is_this, [checkboxes], [area_basic_fn, area_crazy_fn] )
-
-    predict_args = dict(fn=predict, inputs=[txt, top_p, temperature, chatbot, history, system_prompt], outputs=[chatbot, history, statusDisplay], show_progress=True)
+    checkboxes.select(fn_area_visibility, [checkboxes], [area_basic_fn, area_crazy_fn] )
+    # 整理反复出现的控件句柄组合
+    input_combo = [txt, top_p, temperature, chatbot, history, system_prompt]
+    output_combo = [chatbot, history, statusDisplay]
+    predict_args = dict(fn=predict, inputs=input_combo, outputs=output_combo, show_progress=True)
     empty_txt_args = dict(fn=lambda: "", inputs=[], outputs=[txt]) # 用于在提交后清空输入栏
-
-    cancel_handles.append(txt.submit(**predict_args))
-    # txt.submit(**empty_txt_args) 在提交后清空输入栏
-    cancel_handles.append(submitBtn.click(**predict_args))
-    # submitBtn.click(**empty_txt_args) 在提交后清空输入栏
-    resetBtn.click(lambda: ([], [], "已重置"), None, [chatbot, history, statusDisplay])
+    # 提交按钮、重置按钮
+    cancel_handles.append(txt.submit(**predict_args)) #; txt.submit(**empty_txt_args) 在提交后清空输入栏
+    cancel_handles.append(submitBtn.click(**predict_args)) #; submitBtn.click(**empty_txt_args) 在提交后清空输入栏
+    resetBtn.click(lambda: ([], [], "已重置"), None, output_combo)
+    # 基础功能区的回调函数注册
     for k in functional:
-        click_handle = functional[k]["Button"].click(predict,
-            [txt, top_p, temperature, chatbot, history, system_prompt, gr.State(True), gr.State(k)], [chatbot, history, statusDisplay], show_progress=True)
+        click_handle = functional[k]["Button"].click(predict, [*input_combo, gr.State(True), gr.State(k)], output_combo, show_progress=True)
         cancel_handles.append(click_handle)
+    # 文件上传区，接收文件后与chatbot的互动
     file_upload.upload(on_file_uploaded, [file_upload, chatbot, txt], [chatbot, txt])
-    for k in crazy_functional:
-        click_handle = crazy_functional[k]["Button"].click(crazy_functional[k]["Function"],
-            [txt, top_p, temperature, chatbot, history, system_prompt, gr.State(PORT)], [chatbot, history, statusDisplay]
-        )
-        try: click_handle.then(on_report_generated, [file_upload, chatbot], [file_upload, chatbot])
-        except: pass
+    # 函数插件-固定按钮区
+    for k in crazy_fns:
+        if not crazy_fns[k].get("AsButton", True): continue
+        click_handle = crazy_fns[k]["Button"].click(crazy_fns[k]["Function"], [*input_combo, gr.State(PORT)], output_combo)
+        click_handle.then(on_report_generated, [file_upload, chatbot], [file_upload, chatbot])
         cancel_handles.append(click_handle)
+    # Plugin dropdown -> "switchy" button: show the selected plugin's name and color on the button
+    def on_dropdown_changed(k):  # k: the selected dropdown entry, used as a key of crazy_fns
+        variant = crazy_fns[k]["Color"] if "Color" in crazy_fns[k] else "secondary"  # plugins without a "Color" entry use "secondary"
+        return {switchy_bt: gr.update(value=k, variant=variant)}
+    dropdown.select(on_dropdown_changed, [dropdown], [switchy_bt] )
+    # 随变按钮的回调函数注册
+    def route(k, *args, **kwargs):
+        if k in [r"打开插件列表", r"先从插件列表中选择"]: return 
+        yield from crazy_fns[k]["Function"](*args, **kwargs)
+    click_handle = switchy_bt.click(route,[switchy_bt, *input_combo, gr.State(PORT)], output_combo)
+    click_handle.then(on_report_generated, [file_upload, chatbot], [file_upload, chatbot])
+    def expand_file_area(file_upload, area_file_up):  # opens the upload accordion when files are present; returns None otherwise
+        if len(file_upload)>0: return {area_file_up: gr.update(open=True)}  # NOTE(review): the parameter shadows the outer area_file_up, so the dict key is whatever value is passed in; len() also raises on None - confirm both
+    click_handle.then(expand_file_area, [file_upload, area_file_up], [area_file_up])
+    cancel_handles.append(click_handle)
+    # 终止按钮的回调函数注册
     stopBtn.click(fn=None, inputs=None, outputs=None, cancels=cancel_handles)
 
 # gradio的inbrowser触发不太稳定，回滚代码到原始的浏览器打开函数
@@ -117,7 +136,7 @@ def auto_opentab_delay():
     print(f"如果浏览器没有自动打开，请复制并转到以下URL: http://localhost:{PORT}")
     def open(): 
         time.sleep(2)
-        webbrowser.open_new_tab(f'http://localhost:{PORT}')
+        webbrowser.open_new_tab(f"http://localhost:{PORT}")
     threading.Thread(target=open, name="open-browser", daemon=True).start()
 
 auto_opentab_delay()
diff --git a/predict.py b/predict.py
index 84036bc..31a5861 100644
--- a/predict.py
+++ b/predict.py
@@ -96,13 +96,19 @@ def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_pr
         except StopIteration: break
         if len(chunk)==0: continue
         if not chunk.startswith('data:'): 
-            chunk = get_full_error(chunk.encode('utf8'), stream_response)
-            raise ConnectionAbortedError("OpenAI拒绝了请求:" + chunk.decode())
-        delta = json.loads(chunk.lstrip('data:'))['choices'][0]["delta"]
+            error_msg = get_full_error(chunk.encode('utf8'), stream_response).decode()
+            if "reduce the length" in error_msg:
+                raise ConnectionAbortedError("OpenAI拒绝了请求:" + error_msg)
+            else:
+                raise RuntimeError("OpenAI拒绝了请求：" + error_msg)
+        json_data = json.loads(chunk[len('data:'):])['choices'][0]  # slice off the prefix; str.lstrip removes a character set, not a prefix
+        delta = json_data["delta"]
+        if json_data.get('finish_reason') == 'length':  # checked per chunk, so json_data is never read while unbound
+            raise ConnectionAbortedError("正常结束，但显示Token不足。")
         if len(delta) == 0: break
         if "role" in delta: continue
         if "content" in delta: result += delta["content"]; print(delta["content"], end='')
-        else: raise RuntimeError("意外Json结构："+delta)
+        else: raise RuntimeError("意外Json结构："+str(delta))
     return result
 
 
diff --git a/toolbox.py b/toolbox.py
index b78a513..bf88760 100644
--- a/toolbox.py
+++ b/toolbox.py
@@ -2,21 +2,21 @@ import markdown, mdtex2html, threading, importlib, traceback, importlib, inspect
 from show_math import convert as convert_math
 from functools import wraps
 
-def get_reduce_token_percent(e):
+def get_reduce_token_percent(text):
     try:
         # text = "maximum context length is 4097 tokens. However, your messages resulted in 4870 tokens"
         pattern = r"(\d+)\s+tokens\b"
         match = re.findall(pattern, text)
-        eps = 50 # 稍微留一点余地, 确保下次别再超过token
-        max_limit = float(match[0]) - eps
+        EXCEED_ALLO = 500  # headroom kept below the limit so the reply still has tokens available
+        max_limit = float(match[0]) - EXCEED_ALLO
         current_tokens = float(match[1])
         ratio = max_limit/current_tokens
-        assert ratio > 0 and ratio < 1
-        return ratio
+        if not 0 < ratio < 1: raise ValueError(ratio)  # explicit range check; an assert would be stripped under python -O
+        return ratio, str(int(current_tokens-max_limit))  # (share of text to keep, tokens above the reduced limit)
-    except:
-        return 0.5
+    except Exception:  # best-effort parse: any failure falls back to keeping half, without trapping KeyboardInterrupt
+        return 0.5, '不详'
 
-def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt='', long_connection=False):
+def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt='', long_connection=True):
     """
         调用简单的predict_no_ui接口，但是依然保留了些许界面心跳功能，当对话太长时，会自动采用二分法截断
         i_say: 当前输入
@@ -45,19 +45,18 @@ def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temp
                 break
             except ConnectionAbortedError as token_exceeded_error:
                 # 尝试计算比例，尽可能多地保留文本
-                p_ratio = get_reduce_token_percent(str(token_exceeded_error))
+                p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
                 if len(history) > 0:
                     history = [his[     int(len(his)    *p_ratio):      ] for his in history if his is not None]
-                    mutable[1] = 'Warning! History conversation is too long, cut into half. '
                 else:
                     i_say = i_say[:     int(len(i_say)  *p_ratio)     ]
-                    mutable[1] = 'Warning! Input file is too long, cut into half. '
+                mutable[1] = f'警告，文本过长将进行截断，Token溢出数：{n_exceed}，截断比例：{(1-p_ratio):.0%}。'
             except TimeoutError as e:
-                mutable[0] = '[Local Message] Failed with timeout.'
+                mutable[0] = '[Local Message] 请求超时。'
                 raise TimeoutError
             except Exception as e:
-                mutable[0] = f'[Local Message] Failed with {str(e)}.'
-                raise RuntimeError(f'[Local Message] Failed with {str(e)}.')
+                mutable[0] = f'[Local Message] 异常：{str(e)}.'
+                raise RuntimeError(f'[Local Message] 异常：{str(e)}.')
     # 创建新线程发出http请求
     thread_name = threading.Thread(target=mt, args=(i_say, history)); thread_name.start()
     # 原来的线程则负责持续更新UI，实现一个超时倒计时，并等待新线程的任务完成
@@ -286,7 +285,7 @@ def on_report_generated(files, chatbot):
     report_files = find_recent_files('gpt_log')
     if len(report_files) == 0: return report_files, chatbot
     # files.extend(report_files)
-    chatbot.append(['汇总报告如何远程获取？', '汇总报告已经添加到右侧文件上传区，请查收。'])
+    chatbot.append(['汇总报告如何远程获取？', '汇总报告已经添加到右侧“文件上传区”（可能处于折叠状态），请查收。'])
     return report_files, chatbot
 
 def get_conf(*args):