 /*
  * <LIC_AMD_STD>
  * Copyright (C) <years> Advanced Micro Devices, Inc.  All Rights Reserved.
  * </LIC_AMD_STD>
  * 
  * <CTL_AMD_STD>
  * </CTL_AMD_STD>
  * 
  * <DOC_AMD_STD>
  * GX2 Text Acceleration.
  * </DOC_AMD_STD>
  * 
  */

#include "precomp.h"
#include "gfx_regs.h"
#include "gfx_defs.h"

/* Globals owned by other modules and consumed by the routines in this file. */

/* Base pointer that the MGP_* register offsets are added to in the _asm blocks. */
extern unsigned char *gfx_virt_gpptr;
/* Base pointer used for direct CPU reads/writes of frame buffer memory. */
extern unsigned char *gfx_virt_fbptr;
/* OR'd into every MGP_RASTER_MODE value written by DrvTextOut. */
extern unsigned long gu2_bpp;
/* Used by the WriteTextToBuffer routines to locate the scratch area (size - 0x4000). */
extern unsigned long gfx_fb_size;
/* DrvTextOut treats a value of 1 as 16 BPP and anything else as 32 BPP for gray text. */
extern unsigned char mode_shift;

/* Forward declarations of the helpers defined later in this file. */
void GX2TextBlt (unsigned long dstoffset, long width, long height, unsigned char *dataptr);
void GX2MonoBlt (unsigned long dstoffset, long width, long height, 
	long pitch, long srcx, unsigned char *dataptr);
void FillBlendTable16 (unsigned short fgcolor, unsigned short bgcolor);
void FillBlendTable32 (unsigned long fgcolor, unsigned long bgcolor);
void WriteTextToBuffer16 (int width, int height, long dstoffset, unsigned char *data_ptr, 
						  long xIndex, long delta, long surface_delta);
void WriteTextToBuffer32 (int width, int height, long dstoffset, unsigned char *data_ptr, 
						  long xIndex, long delta, long surface_delta);

/* Spin until the BLT status register no longer reports a pending (queued) BLT. */
#define GU2_WAIT_PENDING while(READ_GP32(MGP_BLT_STATUS) & MGP_BS_BLT_PENDING)
/* Spin until the BLT status register no longer reports the engine as busy. */
#define GU2_WAIT_BUSY while(READ_GP32(MGP_BLT_STATUS) & MGP_BS_BLT_BUSY)

/* Blend lookup table for antialiased text.  Filled by FillBlendTable16/32 and */
/* indexed by 4-bit glyph values in WriteTextToBuffer16/32.  The 16 BPP form   */
/* packs sixteen 16-bit entries into the first 8 dwords; the 32 BPP form uses  */
/* 16 dwords.  The 16 BPP lookup loads a full dword at byte offset 30, so the  */
/* 17th dword keeps that load inside the array.                                */
unsigned long ColorTable[17];

/*-------------------------------------------------------------------------
 * DrvTextOut
 *
 * Render a text string with an optional opaque background rectangle.
 *
 * The destination surface is looked up through dhsurf_array/bitmap_heap.
 * Calls are handed to EngTextOut ("punted") while a mode change is in
 * flight (old_mode_count != 0) or when the surface is flagged
 * CACHE_FLAG_SYSTEM.  Otherwise the opaque rectangle (if any) is filled
 * with a solid pattern BLT and each glyph is drawn either as a
 * monochrome host-source BLT or, for FO_GRAY16 fonts, through the
 * blend-table path (FillBlendTable16/32 + WriteTextToBuffer16/32).
 *
 * NOTE(review): both brushes are used through iSolidColor without
 * checking for a non-solid brush; this routine assumes GDI only passes
 * solid brushes here - confirm against the driver's capability flags.
 *
 * extraPtr, destPntPtr and mix are only forwarded to EngTextOut.
 *
 * Returns TRUE when the text was rendered by the hardware path, or the
 * result of EngTextOut when the call was punted.
 -------------------------------------------------------------------------*/

BOOL DrvTextOut(
	SURFOBJ *surfPtr,
	STROBJ *stringPtr,
	FONTOBJ *fontPtr,
	CLIPOBJ *clipPtr,
	RECTL *extraPtr,
	RECTL *opaqueRectPtr,
	BRUSHOBJ *fgBrushPtr,
	BRUSHOBJ *opaqueBrushPtr,
	POINTL *destPntPtr,
	MIX mix)
{
	BOOL        more, moreGlyphs;
	PDEV       *ppdev = NULL;
	long        i, rectCount;
	BYTE        complexity;
	RECTL       clippedTarget, destRegion;
	CLIPENUM    clipRectEnum;
	int         xClipped;
	ULONG       glyphCount;
	GLYPHBITS  *glyphBitPtr;
	GLYPHPOS   *glyphPosPtr;
	UCHAR      *glyphDataPtr;
	int         blendTableInitialized = 0;
	ULONG       dstOffset;
	ULONG       dstPitch;
	ULONG       index;
	BOOL        ret;

	/* NOTE(review): 'i' and 'ret' are not referenced by any visible statement */
	/* in the hardware path; they are kept in case one of the project macros   */
	/* used below expands to code that names them.                             */

	ppdev = (PDEV *)surfPtr->dhpdev;

	DISPDBG ((ppdev, 1, "DrvTextOut Entry\n"));

	// PUNT ALL BLTS DURING A MODECHANGE
	// In very rare cases, the mode can be changed before Windows has finished rendering
	// to all surfaces from the previous mode.  We will thus punt all rendering calls until
	// all such surfaces have been deleted.

	if (old_mode_count)
	{
		DISPDBG ((ppdev, 3000, "Punting DrvTextOut during modechange\n"));
		goto puntIt;
	}

	// READ SURFACE INFORMATION
	// Surfaces flagged CACHE_FLAG_SYSTEM are not rendered by the hardware path.

	index = dhsurf_array[(USHORT)surfPtr->dhsurf];
	if (index & CACHE_FLAG_SYSTEM)
		goto puntIt;

	dstOffset = bitmap_heap[(USHORT)index].heap_offset;
	dstPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;

	ACQUIRE_PRIMITIVE_SEMAPHORE;

	/* SET DESTINATION STRIDE AND FOREGROUND COLOR */

	GU2_WAIT_PENDING;
	WRITE_GP32 (MGP_STRIDE, dstPitch);
	WRITE_GP32 (MGP_SRC_COLOR_FG, fgBrushPtr->iSolidColor);

	/* HANDLE CLIPPING */
	/* We are using the dreaded "goto" statement to avoid calling a function */
	/* and/or duplicating code.  For trivial and single-rectangle clipping   */
	/* we jump straight into the body of the rectangle loop below with       */
	/* rectCount == 0 and more == FALSE, so that body runs exactly once.     */

	complexity = (clipPtr == NULL) ? DC_TRIVIAL : clipPtr->iDComplexity;

	if (complexity != DC_COMPLEX)
	{
		if (complexity == DC_TRIVIAL)
			clippedTarget = ppdev->scrnBounds;
		else
			clippedTarget = clipPtr->rclBounds;

		rectCount = 0;
		more = FALSE;

		goto SingleFont;
	}

	/* COMPLEX CLIPPING */

	CLIPOBJ_cEnumStart(clipPtr, FALSE, CT_RECTANGLES, CD_ANY, 0);

	DISPDBG((ppdev, 0, "Complex clipping\n"));

	for(;;)
	{
		more = CLIPOBJ_bEnum(clipPtr, sizeof(clipRectEnum), (ULONG *) &clipRectEnum);

		/* SET RECTANGLE COUNT               */
		/* Count can also be set from above. */

		rectCount = clipRectEnum.c;
		DISPDBG((ppdev, 0, "Clip rects %d\n", rectCount));

		while (rectCount)
		{
			rectCount--;
			if(!FindIntersection(&ppdev->scrnBounds, &clipRectEnum.arcl[rectCount], &clippedTarget))
				continue;

			/* SINGLE FONT CODE */
			/* The code to execute a single block is the same as the code to execute */
			/* multiple complex rectangles.                                          */

SingleFont:

			/* HANDLE OPAQUE RECTANGLES */
			/* Solid pattern fill (raster mode 0xF0) of the part of the opaque */
			/* rectangle that lies inside the current clip rectangle.          */

			if (opaqueRectPtr != NULL)
			{
				if (FindIntersection (opaqueRectPtr, &clippedTarget, &destRegion))
				{
					GU2_WAIT_PENDING;
					WRITE_GP32 (MGP_RASTER_MODE, (gu2_bpp | 0xF0));
					WRITE_GP32 (MGP_PAT_COLOR_0, opaqueBrushPtr->iSolidColor);
					WRITE_GP32 (MGP_DST_OFFSET, (destRegion.left << (ppdev->cPelSize)) +
						(destRegion.top * dstPitch) + dstOffset);
					WRITE_GP32 (MGP_WID_HEIGHT,
						((destRegion.right  - destRegion.left) << 16) |
						 (destRegion.bottom - destRegion.top));
					WRITE_GP32 (MGP_BLT_MODE, 0);
				}
			}

			/* RETRIEVE THE GLYPH DATA */
			/* Either enumerate the glyphs in batches, or use the single */
			/* array the string object already exposes.                  */

			if (stringPtr->pgp == NULL)
			{
				STROBJ_vEnumStart(stringPtr);
				moreGlyphs = STROBJ_bEnum(stringPtr, &glyphCount, &glyphPosPtr);
			}
			else
			{
				glyphPosPtr = stringPtr->pgp;
				glyphCount  = stringPtr->cGlyphs;
				moreGlyphs  = FALSE;
			}

			/* SET MONOCHROME PARAMETERS */
			/* Source copy (0xCC) with source transparency enabled. */

			GU2_WAIT_PENDING;
			WRITE_GP32 (MGP_RASTER_MODE, (gu2_bpp | MGP_RM_SRC_TRANS | 0xCC));

			/* LOOP THROUGH EACH STRING */

			for(;;)
			{
				ULONG g;

				/* HANDLE MONO SPACED FONTS */

				if (stringPtr->ulCharInc != 0)
				{
					ULONG i;
					long x, y;

					/* SET COORDINATES IN ADVANCE                               */
					/* Since this is a mono font, set the y coordinate of all   */
					/* glyphs to the same as the first, and space out the x     */
					/* values equally.                                          */

					x = glyphPosPtr[0].ptl.x;
					y = glyphPosPtr[0].ptl.y;

					for (i = 1; i < glyphCount; i++)
					{
						x += stringPtr->ulCharInc;
						glyphPosPtr[i].ptl.x = x;
						glyphPosPtr[i].ptl.y = y;
					}
				}

				/* LOOP THROUGH FOR EACH CHARACTER */

				for (g = 0; g < glyphCount; g++, glyphPosPtr++)
				{
					long glyphWidth, glyphHeight;
					long glyphDelta;
					long glyphXOffset = 0;
					long dx, dy;
					POINTL destPnt;

					/* GRAB A POINTER TO THE GLYPH */

					glyphBitPtr = glyphPosPtr->pgdf->pgb;

					/* GLYPH DIMENSIONS */

					glyphWidth  = glyphBitPtr->sizlBitmap.cx;
					glyphHeight = glyphBitPtr->sizlBitmap.cy;

					/* CALCULATE DESTINATION COORDINATES */

					destPnt.x = glyphPosPtr->ptl.x + glyphBitPtr->ptlOrigin.x;
					destPnt.y = glyphPosPtr->ptl.y + glyphBitPtr->ptlOrigin.y;

					/* GRAB A POINTER TO THE GLYPH DATA */

					glyphDataPtr = glyphBitPtr->aj;

					/* CALCULATE THE BITMAP PITCH */
					/* Grayscale fonts are 4BPP (two pixels per byte), */
					/* monochrome fonts are 1BPP (eight per byte).     */

					if (fontPtr->flFontType & FO_GRAY16)
						glyphDelta = (glyphWidth + 1) >> 1;
					else
						glyphDelta = (glyphWidth + 7) >> 3;

					/* HORIZONTAL CLIPPING */
					/* Horizontal clipping is done before vertical clipping for */
					/* the following reason:  If x-clipping is not needed, the  */
					/* font data is contiguous, (byte-packed).  The interface   */
					/* to Durango is thus simpler (no srcx, srcy, pitch, etc.)  */

					xClipped = 0;
					dx = clippedTarget.left - destPnt.x;

					if (dx > 0)
					{
						xClipped = 1;
						destPnt.x = clippedTarget.left;

						glyphXOffset = dx;
						glyphWidth  -= dx;
					}

					/* RIGHT CLIPPING */

					dx = (destPnt.x + glyphWidth) - clippedTarget.right;

					if(dx > 0)
					{
						xClipped    = 1;
						glyphWidth -= dx;
					}

					/* CHECK FOR TRIVIAL CLIPPING */

					if(glyphWidth <= 0)
						continue;

					/* BOTTOM CLIPPING */

					dy = (destPnt.y + glyphHeight) - clippedTarget.bottom;

					if(dy > 0)
						glyphHeight -= dy;

					/* TOP CLIPPING */
					/* Skip whole source rows that fall above the clip rectangle. */

					dy = clippedTarget.top - destPnt.y;

					if(dy > 0)
					{
						destPnt.y     = clippedTarget.top;
						glyphHeight  -= dy;
						glyphDataPtr += dy * glyphDelta;
					}

					/* CHECK TRIVIAL CLIP */

					if (glyphHeight <= 0)
						continue;

					/* ANTIALIASED (4BPP) GLYPHS */
					/* The blend table is built once per call from the foreground */
					/* color and the destination pixel under the first visible    */
					/* glyph.                                                     */

					if (fontPtr->flFontType & FO_GRAY16)
					{
						/* 16 BPP */

						if (mode_shift == 1)
						{
							if (!blendTableInitialized)
							{
								blendTableInitialized = 1;

								/* The CPU reads the destination pixel directly from the */
								/* frame buffer.  The opaque fill queued above has only  */
								/* been latched, not necessarily completed, so wait for  */
								/* the engine to go idle before sampling the background. */

								GU2_WAIT_BUSY;
								FillBlendTable16 ((unsigned short)fgBrushPtr->iSolidColor,
									*((unsigned short *)(gfx_virt_fbptr + dstOffset + (destPnt.x << 1) +
									(destPnt.y * dstPitch))));
							}
							WriteTextToBuffer16 (glyphWidth, glyphHeight,
								(destPnt.x << 1) + (destPnt.y * dstPitch) + dstOffset,
								glyphDataPtr, glyphXOffset, glyphDelta,
								dstPitch);
						}

						/* 32 BPP - 8BPP GRAY SCALE IS NOT SUPPORTED */

						else
						{
							if (!blendTableInitialized)
							{
								blendTableInitialized = 1;

								/* Same hazard as the 16 BPP case: let any queued fill */
								/* finish before the CPU samples the background pixel. */

								GU2_WAIT_BUSY;
								FillBlendTable32 (fgBrushPtr->iSolidColor,
									*((unsigned long *)(gfx_virt_fbptr + dstOffset + (destPnt.x << 2) +
									(destPnt.y * dstPitch))));
							}
							WriteTextToBuffer32 (glyphWidth, glyphHeight,
								(destPnt.x << 2) + (destPnt.y * dstPitch) + dstOffset,
								glyphDataPtr, glyphXOffset, glyphDelta,
								dstPitch);
						}
						continue;
					}

					/* MONOCHROME GLYPHS */
					/* X-clipped glyphs need a source x index and a per-line pitch; */
					/* unclipped glyphs are sent as one contiguous byte-packed run. */

					if (xClipped)
					{
						GU2_WAIT_PENDING;
						GX2MonoBlt (
							(destPnt.x << (ppdev->cPelSize)) +
							(destPnt.y * dstPitch) +
							dstOffset,
							glyphWidth,
							glyphHeight,
							glyphDelta,
							glyphXOffset,
							glyphDataPtr);
					}
					else
					{
						GU2_WAIT_PENDING;
						GX2TextBlt (
							(destPnt.x << (ppdev->cPelSize)) +
							(destPnt.y * dstPitch) +
							 dstOffset,
							 glyphWidth,
							 glyphHeight,
							 glyphDataPtr);

					}
				}

				/* LAST STRING? */

				if(!moreGlyphs)
					break;

				moreGlyphs = STROBJ_bEnum(stringPtr, &glyphCount, &glyphPosPtr);
			}
		} /* while (rectCount) */

		/* LAST RECTANGLE */

		if (!more)
			break;

	}

returnTrue:

	RELEASE_PRIMITIVE_SEMAPHORE;

	DISPDBG ((ppdev, 1, "DrvTextOut Exit\n"));
	return (TRUE);

puntIt:

	/* Both jumps to this label happen before the semaphore is acquired above. */

	if (fontPtr->flFontType & FO_GRAY16)
	{
		return EngTextOut (surfPtr, stringPtr, fontPtr, clipPtr, extraPtr, opaqueRectPtr,
			fgBrushPtr, opaqueBrushPtr, destPntPtr, mix);
	}
	else
	{
		/* NOTE(review): 'index' is not referenced by any visible statement in */
		/* this scope; presumably MAKE_PUNTABLE expands to code that uses it.  */

		ULONG index;
		BOOL  ret;

		ACQUIRE_PRIMITIVE_SEMAPHORE;

		MAKE_PUNTABLE(surfPtr);

		/* Let the engine go idle before GDI touches the surface. */

		GU2_WAIT_BUSY;

		ret = EngTextOut (surfPtr, stringPtr, fontPtr, clipPtr, extraPtr, opaqueRectPtr,
			fgBrushPtr, opaqueBrushPtr, destPntPtr, mix);

		RELEASE_PRIMITIVE_SEMAPHORE;

		return ret;
	}
}

/*-------------------------------------------------------------------------
 * GX2MonoBlt
 *
 * Host-source monochrome BLT of an x-clipped glyph.  Each source line is
 * 'pitch' bytes apart and starts 'srcx' bits into the line, so lines are
 * sent one at a time.  The stride, raster mode and foreground color are
 * expected to have been programmed by the caller.
 *
 *   dstoffset - value written to MGP_DST_OFFSET
 *   width     - pixels per line (upper half of MGP_WID_HEIGHT)
 *   height    - number of lines (lower half of MGP_WID_HEIGHT); must be
 *               non-zero, the line loop is do/while shaped
 *   pitch     - bytes between the starts of consecutive source lines
 *   srcx      - bit index of the first pixel within a source line
 *   dataptr   - first byte of the first source line
 *
 * Host data is throttled with a countdown in DL: the first 16 dwords are
 * written without polling, after which MGP_BS_HALF_EMPTY is polled before
 * every 8th write.
 -------------------------------------------------------------------------*/

void GX2MonoBlt (unsigned long dstoffset, long width, long height, 
	long pitch, long srcx, unsigned char *dataptr)
{
	_asm {

		mov     edi, gfx_virt_gpptr
		mov     esi, dataptr

		mov     eax, dstoffset
		mov     [edi + MGP_DST_OFFSET], eax

		/* WRITE THE SOURCE INDEX AND CALCULATE THE BYTE WIDTH       */
		/* The bit index within the first byte (srcx & 7) is written */
		/* to MGP_SRC_OFFSET shifted left by 26.                     */
		/* bytes = ((srcx & 7) + width + 7) >> 3;                    */

		mov     eax, srcx
		mov     edx, width
		movzx   ebx, al
		and     bl, 7
		lea     ecx, [ebx + 7]
		shl     ebx, 26
		mov     [edi + MGP_SRC_OFFSET], ebx
		add     ecx, edx
		shr     ecx, 3

		/* CALCULATE THE DATA ADDRESS */
		/* Skip the whole bytes to the left of srcx. */

		shr     eax, 3
		add     esi, eax

		/* WRITE THE WIDTH AND HEIGHT */

		shl     edx, 16
		mov     dx, WORD PTR height
		mov     [edi + MGP_WID_HEIGHT], edx

		/* START THE BLT */

		mov     DWORD PTR [edi + MGP_BLT_MODE], MGP_BM_SRC_HOST | MGP_BM_SRC_MONO

		/* CALCULATE BYTE DIFFERENCE FOR EACH LINE                     */
		/* At the end of each line the data pointer sits at the start  */
		/* of the trailing partial dword (or just past the data when   */
		/* the line is a whole number of dwords), so the step to the   */
		/* next line is pitch - bytes + (bytes & 3).                   */
		/* ECX = Bytes per line                                        */
		/* ESI = First source byte                                     */

		mov     ebx, pitch
		sub     ebx, ecx
		movzx   eax, cl
		and     al, 3
		add     ebx, eax

		/* WAIT FOR BLT TO BE LATCHED */

GX2MonoBltWaitBltLatched:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     GX2MonoBltWaitBltLatched

		/* AT THIS POINT                                         */
		/* ECX = Bytes per line (kept on the stack for the loop) */
		/* EBX = Byte addition for each line                     */
		/* ESI = First Byte                                      */
		/* DL  = Writes remaining before the next FIFO poll      */

		push    ecx
		mov     dl, 17

GX2MonoBltLineLoop:

		/* WHOLE DWORDS OF THIS LINE */

		mov     ecx, [esp]
		shr     ecx, 2
		jz      GX2MonoBltLineDwordLoopDone

GX2MonoBltLineDwordLoop:

		dec     dl
		jnz     GX2MonoBltLineDwordNoWait

GX2MonoBltLineDwordWaitLoop:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      GX2MonoBltLineDwordWaitLoop
		mov     dl, 8

GX2MonoBltLineDwordNoWait:

		lodsd
		mov     [edi + MGP_HST_SOURCE], eax
		loop    GX2MonoBltLineDwordLoop

GX2MonoBltLineDwordLoopDone:

		/* TRAILING 1-3 BYTES OF THIS LINE */

		movzx   ecx, BYTE PTR [esp]
		and     cl, 3
		or      cl, cl
		jz      GX2MonoNonContiguousBLTLineDone

		/* SET POINTER JUST PAST THE LAST BYTE                        */
		/* The bytes are gathered last-to-first so the first byte     */
		/* ends up in the low byte of EAX, and ESI finishes back at   */
		/* the start of the partial dword.  Reading byte by byte      */
		/* avoids touching memory beyond the end of the line.         */

		lea     esi, [esi + ecx]

GX2MonoBltLineByteLoop:

		shl     eax, 8
		dec     esi
		mov     al, [esi]
		loop    GX2MonoBltLineByteLoop

		dec     dl
		jnz     GX2MonoBltLineByteLoopNoWait

GX2MonoBltLineByteWaitLoop:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      GX2MonoBltLineByteWaitLoop
		mov     dl, 8

GX2MonoBltLineByteLoopNoWait:

		mov     [edi + MGP_HST_SOURCE], eax

GX2MonoNonContiguousBLTLineDone:

		/* ADVANCE TO THE NEXT SOURCE LINE */

		add     esi, ebx
		dec     height
		jnz     GX2MonoBltLineLoop

		/* DISCARD THE SAVED BYTE COUNT */

		pop     eax
	}
}

/*-------------------------------------------------------------------------
 * GX2TextBlt
 *
 * Host-source BLT of an unclipped, byte-packed monochrome glyph.  The
 * glyph rows are contiguous in memory, so the whole bitmap is streamed
 * as one run of ((width + 7) >> 3) * height bytes.
 *
 *   dstoffset - value written to MGP_DST_OFFSET
 *   width     - glyph width in pixels
 *   height    - glyph height in lines
 *   dataptr   - first byte of the glyph bitmap
 *
 * Host data is throttled with a countdown in DL: the first 16 dwords are
 * written without polling, after which MGP_BS_HALF_EMPTY is polled before
 * every 8th write.
 -------------------------------------------------------------------------*/

void GX2TextBlt (unsigned long dstoffset, long width, long height, unsigned char *dataptr)
{
	/* BYTE-PACKED TEXT GLYPH                            */
	/* The stride and raster mode have already been set. */

	_asm {

		mov     edi, gfx_virt_gpptr
		mov     esi, dataptr

		mov     eax, dstoffset
		mov     [edi + MGP_DST_OFFSET], eax
		mov     DWORD PTR [edi + MGP_SRC_OFFSET], 0
		mov     eax, width
		mov     ebx, height

		/* CALCULATE THE TOTAL BYTES */
		/* ECX = ((width + 7) >> 3) * height */

		lea     ecx, [eax + 7]
		shr     ecx, 3
		imul    ecx, ebx

		/* WRITE THE WIDTH AND HEIGHT */

		shl     eax, 16
		mov     ax, bx
		mov     [edi + MGP_WID_HEIGHT], eax

		/* START THE BLT */

		mov     DWORD PTR [edi + MGP_BLT_MODE], MGP_BM_SRC_HOST | MGP_BM_SRC_BP_MONO

		/* WAIT FOR BLT TO BE LATCHED */

GX2TextBltWaitBltLatched:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     GX2TextBltWaitBltLatched

		/* WRITE THE DATA                                   */
		/* EBX = Total bytes (kept for the remainder below) */
		/* ECX = Whole dwords to write                      */
		/* DL  = Writes remaining before the next FIFO poll */

		mov     dl, 17

		mov     ebx, ecx
		shr     ecx, 2
		jz      GX2TextBltDwordLoopDone

GX2TextBltDwordLoop:

		dec     dl
		jnz     GX2TextBltDwordLoopNoWait

GX2TextBltDwordWaitLoop:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      GX2TextBltDwordWaitLoop
		mov     dl, 8

GX2TextBltDwordLoopNoWait:

		lodsd
		mov     [edi + MGP_HST_SOURCE], eax
		loop    GX2TextBltDwordLoop

GX2TextBltDwordLoopDone:

		movzx   ecx, bl
		and     cl, 3
		jz      GX2TextBLTDone

		/* LOAD THE REMAINING BYTES */
		/* We cannot simply read a dword as we might inadvertently  */
		/* create a page fault.  The 1-3 bytes are gathered         */
		/* last-to-first so the first byte lands in the low byte.   */

		lea     esi, [esi + ecx - 1]

GX2TextBltByteLoop:

		shl     eax, 8
		mov     al, [esi]
		dec     esi
		loop    GX2TextBltByteLoop

		dec     dl
		jnz     GX2TextBltByteLoopNoWait

GX2TextBltByteWaitLoop:

		/* This is the final write, so the countdown is not reloaded. */

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      GX2TextBltByteWaitLoop

GX2TextBltByteLoopNoWait:

		mov     [edi + MGP_HST_SOURCE], eax

GX2TextBLTDone:

	}
}

/* WELCOME TO ANTIALIASED TEXT 101                                      */
/* Tragically, the GX2 does not have any true alpha blending support.   */
/* However, to improve performance, a modified version of the algorithm */
/* is being implemented.  The algorithm is as follows:                  */
/*                                                                      */
/* Windows provides data in byte-packed 4BPP format.  So, each byte     */
/* contains two nibbles of blend information.  The nibble value is      */
/* interpreted as follows:                                              */
/*    b  = 0 -> Transparent                                             */
/*    b != 0 -> Blend is ((b+1)/16) * Src + ((15-b)/16) * Dst           */
/*                                                                      */
/* We are implementing a slightly simpler version.  In our version,     */
/* there are only five blend states.  We reach these states through     */
/* the following conversion: our_b = (b + 2) >> 2                       */
/* So, states 2-5 map to 1/4 blend, states 6-9 map to 1/2 blend, states */
/* 10-13 map to 3/4 blend, 14-15 map to full source and 0-1 map to      */
/* transparency.                                                        */
/*                                                                      */
/* The last piece of the puzzle is how this is implemented.  Rather     */
/* than reading a destination and source pixel, comparing the source    */
/* value, blending, writing and then repeating, we are using the        */
/* GX1 transparency hardware and doing a table lookup.  Each value is   */
/* an index into a blend table.  This blend table is pre-generated      */
/* based on the blend values for the constant source color and          */
/* the first pixel of the destination region.  This algorithm will thus */
/* break down if the font data crosses a very contrasting color region. */

/* Masks applied by FillBlendTable16 after shifting a pair of packed 16-bit   */
/* pixels right by one or two bits.  In each 16-bit half they clear the top   */
/* one (or two) bits of a 5-bit, a 6-bit and a 5-bit field, discarding the    */
/* bits that the shift moved in from the neighbouring field.                  */
#define SHIFT_ONE_MASK 0x7BEF7BEF 
#define SHIFT_TWO_MASK 0x39E739E7 

void FillBlendTable16 (unsigned short fgcolor, unsigned short bgcolor)
{
	_asm {
		
		mov     edi, OFFSET ColorTable

		/* SET TRANSPARENT COLOR */
		/* Normally use 0xFEED, unless the primary foreground color is      */
		/* 0xFEED.                                                          */

		mov     edx, 0xFEEDFEED
		cmp     dx, fgcolor
		
		jne     FillTableColorDone
		mov     edx, 0xBEEFBEEF

FillTableColorDone:	

		mov     [edi], edx

		movzx   eax, fgcolor
		shl     eax, 16
		mov     ax,  fgcolor
		movzx   ecx, bgcolor
		shl     ecx, 16
		mov     cx,  bgcolor   

		/* INITIALIZE TABLE */
		
		/* 4/4 - SOLID COLOR */
		/* Entries 14-15     */

		mov     [edi + 28], eax
		
		/* CALCULATE BLEND VALUES */
		/* EAX = 1/4 source       */
		/* EBX = 1/2 source       */
		/* ECX = 1/4 dest         */
		/* EDX = 1/2 dest         */
		
		mov     ebx, eax
		mov     edx, ecx
		shr     ax, 2
		ror     eax, 16
		shr     ax, 2
		and     eax, SHIFT_TWO_MASK
		shr     bx, 1
		ror     ebx, 16
		shr     bx, 1
		and     ebx, SHIFT_ONE_MASK
		shr     cx, 2
		ror     ecx, 16
		shr     cx, 2
		and     ecx, SHIFT_TWO_MASK
		shr     dx, 1
		ror     edx, 16
		shr     dx, 1
		and     edx, SHIFT_ONE_MASK

		/* 1/4 BLEND */
		/* 1/4 Source + 1/4 Dest + 1/2 Dest */
		/* Entries 2-5                      */

		lea     esi, [eax + ecx]
		add     esi, edx

		mov     [edi + 4], esi
		mov     [edi + 8], esi

		/* 1/2 BLEND */
		/* 1/2 Source + 1/2 Dest */
		/* Entries 6-9           */

		lea     esi, [ebx + edx]
		
		mov     [edi + 12],  esi
		mov     [edi + 16],  esi

		/* 3/4 BLEND */
		/* 1/2 Source + 1/4 source + 1/4 dest */
		/* Entries 10-13                      */
		
		lea     esi, [eax + ebx]
		add     esi, ecx
		
		mov     [edi + 20], esi
		mov     [edi + 24], esi
	}
}

/* Masks applied by FillBlendTable32 after shifting a 32-bit pixel right by   */
/* one or two bits.  They clear the top byte and the top one (or two) bits of */
/* each of the three low bytes, discarding bits shifted in from the byte      */
/* above.                                                                     */
#define SHIFT_ONE_MASK_32 0x007F7F7F 
#define SHIFT_TWO_MASK_32 0x003F3F3F 

void FillBlendTable32 (unsigned long fgcolor, unsigned long bgcolor)
{
	_asm {
		
		mov     edi, OFFSET ColorTable

		/* SET TRANSPARENT COLOR */
		/* Normally use 0xFEEDFEED, unless the primary foreground color is  */
		/* 0xFEEDFEED.                                                      */

		mov     edx, 0xFEEDFEED
		cmp     edx, fgcolor
		
		jne     FillTableColorDone32
		mov     edx, 0xBEEFBEEF

FillTableColorDone32:	

		mov     [edi], edx
		mov     [edi + 4], edx

		/* REPLICATE DATA TO SIMPLIFY CALCULATION */

		mov     eax, fgcolor
		mov     ecx, bgcolor
		
		/* INITIALIZE TABLE */
		
		/* 4/4 - SOLID COLOR */
		/* Entries 14-15     */

		mov     [edi + 56], eax
		mov     [edi + 60], eax
		
		/* CALCULATE BLEND VALUES */
		/* EAX = 1/4 source       */
		/* EBX = 1/2 source       */
		/* ECX = 1/4 dest         */
		/* EDX = 1/2 dest         */
		
		mov     ebx, eax
		mov     edx, ecx
		shr     eax, 2
		and     eax, SHIFT_TWO_MASK_32
		shr     ebx, 1
		and     ebx, SHIFT_ONE_MASK_32
		shr     ecx, 2
		and     ecx, SHIFT_TWO_MASK_32
		shr     edx, 1
		and     edx, SHIFT_ONE_MASK_32

		/* 1/4 BLEND                        */
		/* 1/4 Source + 1/4 Dest + 1/2 Dest */
		/* Entries 2-5                      */

		lea     esi, [eax + ecx]
		add     esi, edx

		mov     [edi + 8], esi
		mov     [edi + 12], esi
		mov     [edi + 16], esi
		mov     [edi + 20], esi

		/* 1/2 BLEND             */
		/* 1/2 Source + 1/2 Dest */
		/* Entries 6-9           */

		lea     esi, [ebx + edx]
		
		mov     [edi + 24],  esi
		mov     [edi + 28],  esi
		mov     [edi + 32],  esi
		mov     [edi + 36],  esi

		/* 3/4 BLEND */
		/* 1/2 Source + 1/4 source + 1/4 dest */
		/* Entries 10-13                      */
		
		lea     esi, [eax + ebx]
		add     esi, ecx
		
		mov     [edi + 40], esi
		mov     [edi + 44], esi
		mov     [edi + 48], esi
		mov     [edi + 52], esi
	}
}

/*-------------------------------------------------------------------------
 * WriteTextToBuffer16
 *
 * Draw one 4BPP (antialiased) glyph on a 16 BPP surface.  For every glyph
 * line the 4-bit values are expanded through ColorTable into a scratch
 * line in frame buffer memory, and a one-line frame-buffer-source BLT
 * copies that scratch line to the destination.  ColorTable[0] is
 * programmed as the source color key.
 *
 * The scratch area is the last 16K of frame buffer memory, used as two 8K
 * halves that alternate from line to line.
 *
 *   width         - pixels per line (after clipping)
 *   height        - number of lines; must be non-zero
 *   dstoffset     - destination offset of the first line
 *   data_ptr      - first byte of the first glyph line
 *   xIndex        - pixel index of the first visible pixel in a line
 *   delta         - bytes between consecutive glyph lines
 *   surface_delta - bytes between consecutive destination lines
 *
 * FillBlendTable16 must have filled ColorTable beforehand.
 -------------------------------------------------------------------------*/

void WriteTextToBuffer16 (int width, int height, long dstoffset, unsigned char *data_ptr, 
						  long xIndex, long delta, long surface_delta)
{
	/* NOTE: despite its name, qword_count is the number of 8-pixel groups */
	/* per line; each group is one source dword.                           */

	unsigned long qword_count, leftover_count, gfx_gp_scratch_base, line_toggle = 0;

	/* ONE BIG ASM ROUTINE */

	_asm {

		/* SET SCRATCH BASE      */
		/* Last 16K of FB memory */

		mov     eax, gfx_fb_size
		sub     eax, 0x4000
		mov     gfx_gp_scratch_base, eax

		/* CHECK FOR X CLIPPING */
		/* When the font is xclipped, we cannot go simply trudging through the data. */
		/* The reason is that the hardware does not support an x index for color     */
		/* data.  So, we do two things.   We add the byte offset of the x clipping   */
		/* to the pointer, and we set a flag if we have an odd index.  This flag     */
		/* causes the width to be reduced by one and a special case to happen at the */
		/* beginning of the line.                                                    */

		mov     eax, xIndex
		mov     ebx, eax
		shr     ebx, 1
		and     eax, 1
		add     data_ptr, ebx
		mov     xIndex, eax

		/* CALCULATE GROUPS AND LEFTOVER PIXELS PER LINE          */
		/* Use width - (odd flag) for the group calculation       */
		/* Use width when writing the width register              */

		mov     ecx, width
		mov     edx, ecx
		sub     ecx, eax
		mov     ebx, ecx
		and     ebx, 7
		shr     ecx, 3

		mov     qword_count, ecx
		mov     leftover_count, ebx

		mov     esi, gfx_virt_gpptr
		mov     ebx, OFFSET ColorTable

		/* WRITE THE REGISTERS THAT DO NOT CHANGE           */
		/* EDX = (width << 16) | 1 : one line per BLT       */
		/* ECX = ColorTable[0]     : the transparent key    */

		shl     edx, 16
		inc     edx
		mov     ecx, [ebx]

WriteText16WaitPending:

		test    DWORD PTR [esi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     WriteText16WaitPending

		mov     DWORD PTR [esi + MGP_WID_HEIGHT], edx
		mov     DWORD PTR [esi + MGP_SRC_COLOR_FG], ecx
		mov     DWORD PTR [esi + MGP_SRC_COLOR_BG], 0xFFFFFFFF

WriteText16LineLoop:

		test    DWORD PTR [esi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     WriteText16LineLoop

		/* WRITE THE NEW DEST OFFSET */

		mov     eax, dstoffset
		mov     DWORD PTR [esi + MGP_DST_OFFSET], eax

		/* CALCULATE THE SOURCE OFFSET                                   */
		/* line_toggle (0 or 1) selects one of the two 8K scratch halves */
		/* and is flipped for the next line.                             */
		/* EDI = CPU address of the scratch line                         */

		mov     edi, line_toggle
		mov     eax, edi
		shl     edi, 13
		add     edi, gfx_gp_scratch_base
		mov     [esi + MGP_SRC_OFFSET], edi
		add     edi, gfx_virt_fbptr
		dec     al
		setnz   BYTE PTR line_toggle
		mov     ebx, data_ptr
		mov     esi, OFFSET ColorTable

		/* XCLIPPED PIXEL */
		/* With an odd start index the first visible pixel is the low */
		/* nibble of the first byte.                                  */

		cmp     xIndex, 0
		je      WriteTextLine16NoClipping

		movzx   eax, BYTE PTR [ebx]
		inc     ebx
		and     al, 0x0F
		shl     al, 1
		mov     eax, [esi + eax]
		stosw

WriteTextLine16NoClipping:

		mov     ecx, qword_count
		or      ecx, ecx
		jz      WriteText16DwordLoopDone

WriteText16DwordLoop:

		mov     eax, [ebx]
		add     ebx, 4

		/* 8 4-BIT EXPANSIONS */
		/* Within each source byte the high nibble is the earlier pixel. */
		/* Each nibble is scaled by 2 to index the 16-bit table entries. */

		mov     edx, eax
		and     edx, 0x000000F0
		shr     edx, 3
		mov     edx, [esi + edx]
		mov     WORD PTR [edi], dx

		mov     edx, eax
		and     edx, 0x0000000F
		shl     edx, 1
		mov     edx, [esi + edx]
		mov     WORD PTR [edi + 2], dx

		mov     edx, eax
		and     edx, 0x0000F000
		shr     edx, 11
		mov     edx, [esi + edx]
		mov     WORD PTR [edi + 4], dx

		mov     edx, eax
		and     edx, 0x00000F00
		shr     edx, 7
		mov     edx, [esi + edx]
		mov     WORD PTR [edi + 6], dx

		mov     edx, eax
		and     edx, 0x00F00000
		shr     edx, 19
		mov     edx, [esi + edx]
		mov     WORD PTR [edi + 8], dx

		mov     edx, eax
		and     edx, 0x000F0000
		shr     edx, 15
		mov     edx, [esi + edx]
		mov     WORD PTR [edi + 10], dx

		mov     edx, eax
		and     edx, 0xF0000000
		shr     edx, 27
		mov     edx, [esi + edx]
		mov     WORD PTR [edi + 12], dx

		and     eax, 0x0F000000
		shr     eax, 23
		mov     edx, [esi + eax]
		mov     WORD PTR [edi + 14], dx

		add     edi, 16
		dec     ecx
		jnz     WriteText16DwordLoop

WriteText16DwordLoopDone:

		mov     ecx, leftover_count
		or      ecx, ecx
		jz      WriteText16PixelLoopDone

WriteText16PixelLoop:

		/* LEFTOVER PIXELS, TWO PER SOURCE BYTE                        */
		/* An odd leftover count writes one extra pixel to the scratch */
		/* line; the BLT width keeps it from reaching the destination. */

		xor     edx, edx
		mov     al, [ebx]
		inc     ebx
		mov     dl, al
		and     dl, 0xF0
		shr     dl, 3
		mov     edx, [esi + edx]
		mov     WORD PTR [edi], dx

		and     eax, 0x0000000F
		shl     al, 1
		mov     edx, [esi + eax]
		mov     WORD PTR [edi + 2], dx

		add     edi, 4
		sub     ecx, 2
		jg      WriteText16PixelLoop

WriteText16PixelLoopDone:

		/* ONE LINE IS DONE */
		/* Kick off BLT and prepare for next line */

		mov     ebx, delta
		add     data_ptr, ebx
		mov     ebx, surface_delta
		add     dstoffset, ebx

		mov     esi, gfx_virt_gpptr
		mov     DWORD PTR [esi + MGP_BLT_MODE], MGP_BM_SRC_FB

		dec     height
		jnz     WriteText16LineLoop
	}
}

/*-------------------------------------------------------------------------
 * WriteTextToBuffer32
 *
 * Draw one 4BPP (antialiased) glyph on a 32 BPP surface.  For every glyph
 * line the 4-bit values are expanded through ColorTable into a scratch
 * line in frame buffer memory, and a one-line frame-buffer-source BLT
 * copies that scratch line to the destination.  ColorTable[0] is
 * programmed as the source color key.
 *
 * The scratch area is the last 16K of frame buffer memory, used as two 8K
 * halves that alternate from line to line.
 *
 *   width         - pixels per line (after clipping)
 *   height        - number of lines; must be non-zero
 *   dstoffset     - destination offset of the first line
 *   data_ptr      - first byte of the first glyph line
 *   xIndex        - pixel index of the first visible pixel in a line
 *   delta         - bytes between consecutive glyph lines
 *   surface_delta - bytes between consecutive destination lines
 *
 * FillBlendTable32 must have filled ColorTable beforehand.
 -------------------------------------------------------------------------*/

void WriteTextToBuffer32 (int width, int height, long dstoffset, unsigned char *data_ptr, 
						  long xIndex, long delta, long surface_delta)
{
	/* NOTE: despite its name, qword_count is the number of 8-pixel groups */
	/* per line; each group is one source dword.                           */

	unsigned long qword_count, leftover_count, gfx_gp_scratch_base, line_toggle = 0;

	/* ONE BIG ASM ROUTINE */

	_asm {

		/* SET SCRATCH BASE      */
		/* Last 16K of FB memory */

		mov     eax, gfx_fb_size
		sub     eax, 0x4000
		mov     gfx_gp_scratch_base, eax

		/* CHECK FOR X CLIPPING */
		/* When the font is xclipped, we cannot go simply trudging through the data. */
		/* The reason is that the hardware does not support an x index for color     */
		/* data.  So, we do two things.   We add the byte offset of the x clipping   */
		/* to the pointer, and we set a flag if we have an odd index.  This flag     */
		/* causes the width to be reduced by one and a special case to happen at the */
		/* beginning of the line.                                                    */

		mov     eax, xIndex
		mov     ebx, eax
		shr     ebx, 1
		and     eax, 1
		add     data_ptr, ebx
		mov     xIndex, eax

		/* CALCULATE GROUPS AND LEFTOVER PIXELS PER LINE          */
		/* Use width - (odd flag) for the group calculation       */
		/* Use width when writing the width register              */

		mov     ecx, width
		mov     edx, ecx
		sub     ecx, eax
		mov     ebx, ecx
		and     ebx, 7
		shr     ecx, 3

		mov     qword_count, ecx
		mov     leftover_count, ebx

		mov     esi, gfx_virt_gpptr
		mov     ebx, OFFSET ColorTable

		/* WRITE THE REGISTERS THAT DO NOT CHANGE           */
		/* EDX = (width << 16) | 1 : one line per BLT       */
		/* ECX = ColorTable[0]     : the transparent key    */

		shl     edx, 16
		inc     edx
		mov     ecx, [ebx]

WriteText32WaitPending:

		test    DWORD PTR [esi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     WriteText32WaitPending

		mov     DWORD PTR [esi + MGP_WID_HEIGHT], edx
		mov     DWORD PTR [esi + MGP_SRC_COLOR_FG], ecx
		mov     DWORD PTR [esi + MGP_SRC_COLOR_BG], 0xFFFFFFFF

WriteText32LineLoop:

		test    DWORD PTR [esi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     WriteText32LineLoop

		/* WRITE THE NEW DEST OFFSET */

		mov     eax, dstoffset
		mov     DWORD PTR [esi + MGP_DST_OFFSET], eax

		/* CALCULATE THE SOURCE OFFSET                                   */
		/* line_toggle (0 or 1) selects one of the two 8K scratch halves */
		/* and is flipped for the next line.                             */
		/* EDI = CPU address of the scratch line                         */

		mov     edi, line_toggle
		mov     eax, edi
		shl     edi, 13
		add     edi, gfx_gp_scratch_base
		mov     [esi + MGP_SRC_OFFSET], edi
		add     edi, gfx_virt_fbptr
		dec     al
		setnz   BYTE PTR line_toggle
		mov     ebx, data_ptr
		mov     esi, OFFSET ColorTable

		/* XCLIPPED PIXEL */
		/* With an odd start index the first visible pixel is the low */
		/* nibble of the first byte.                                  */

		cmp     xIndex, 0
		je      WriteTextLine32NoClipping

		movzx   eax, BYTE PTR [ebx]
		inc     ebx
		and     al, 0x0F
		shl     al, 2
		mov     eax, [esi + eax]
		stosd

WriteTextLine32NoClipping:

		mov     ecx, qword_count
		or      ecx, ecx
		jz      WriteText32DwordLoopDone

WriteText32DwordLoop:

		mov     eax, [ebx]
		add     ebx, 4

		/* 8 4-BIT EXPANSIONS */
		/* Within each source byte the high nibble is the earlier pixel. */
		/* Each nibble is scaled by 4 to index the 32-bit table entries. */

		mov     edx, eax
		and     edx, 0x000000F0
		shr     edx, 2
		mov     edx, [esi + edx]
		mov     [edi], edx

		mov     edx, eax
		and     edx, 0x0000000F
		shl     edx, 2
		mov     edx, [esi + edx]
		mov     [edi + 4], edx

		mov     edx, eax
		and     edx, 0x0000F000
		shr     edx, 10
		mov     edx, [esi + edx]
		mov     [edi + 8], edx

		mov     edx, eax
		and     edx, 0x00000F00
		shr     edx, 6
		mov     edx, [esi + edx]
		mov     [edi + 12], edx

		mov     edx, eax
		and     edx, 0x00F00000
		shr     edx, 18
		mov     edx, [esi + edx]
		mov     [edi + 16], edx

		mov     edx, eax
		and     edx, 0x000F0000
		shr     edx, 14
		mov     edx, [esi + edx]
		mov     [edi + 20], edx

		mov     edx, eax
		and     edx, 0xF0000000
		shr     edx, 26
		mov     edx, [esi + edx]
		mov     [edi + 24], edx

		and     eax, 0x0F000000
		shr     eax, 22
		mov     edx, [esi + eax]
		mov     [edi + 28], edx

		add     edi, 32
		dec     ecx
		jnz     WriteText32DwordLoop

WriteText32DwordLoopDone:

		mov     ecx, leftover_count
		or      ecx, ecx
		jz      WriteText32PixelLoopDone

WriteText32PixelLoop:

		/* LEFTOVER PIXELS, TWO PER SOURCE BYTE                        */
		/* An odd leftover count writes one extra pixel to the scratch */
		/* line; the BLT width keeps it from reaching the destination. */

		xor     edx, edx
		mov     al, [ebx]
		inc     ebx
		mov     dl, al
		and     dl, 0xF0
		shr     dl, 2
		mov     edx, [esi + edx]
		mov     [edi], edx

		and     eax, 0x0000000F
		shl     al, 2
		mov     edx, [esi + eax]
		mov     [edi + 4], edx

		add     edi, 8
		sub     ecx, 2
		jg      WriteText32PixelLoop

WriteText32PixelLoopDone:

		/* ONE LINE IS DONE */
		/* Kick off BLT and prepare for next line */

		mov     ebx, delta
		add     data_ptr, ebx
		mov     ebx, surface_delta
		add     dstoffset, ebx

		mov     esi, gfx_virt_gpptr
		mov     DWORD PTR [esi + MGP_BLT_MODE], MGP_BM_SRC_FB

		dec     height
		jnz     WriteText32LineLoop
	}
}

