 /*
  * <LIC_AMD_STD>
  * Copyright (C) <years> Advanced Micro Devices, Inc.  All Rights Reserved.
  * </LIC_AMD_STD>
  * 
  * <CTL_AMD_STD>
  * </CTL_AMD_STD>
  * 
  * <DOC_AMD_STD>
  * Code to perform alpha blending and transparent BLTs for GX2.
  * </DOC_AMD_STD>
  * 
  */

#include "precomp.h"
#include "gfx_regs.h"
#include "gfx_defs.h"

extern unsigned char *gfx_virt_gpptr;
extern unsigned char *gfx_virt_fbptr;
extern unsigned long  gfx_fb_size;
extern unsigned char mode_shift;
extern unsigned long gu2_bpp;
extern unsigned long patOrigin;

// FBBlt: copies a CPU-addressable source bitmap to the destination one scan
// line at a time, staging each line in a scratch area at the end of frame
// buffer memory and then starting a GP BLT from that scratch line.
void FBBlt (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);

// FBBlt24: same staging scheme as FBBlt, but the source is packed 3-byte
// pixels that are expanded to one DWORD per pixel while being copied.
void FBBlt24 (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);

// ScreenToScreenBLT: programs a single GP BLT whose source is addressed by a
// frame buffer offset (srcOffset is written to MGP_SRC_OFFSET, never
// dereferenced by the CPU).
void ScreenToScreenBLT (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
                    ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);


// MonoToScreen: not defined in this part of the file; DrvAlphaBlend calls it
// for BMF_1BPP sources after loading MGP_SRC_COLOR_BG / MGP_SRC_COLOR_FG.
void MonoToScreen (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);

// Busy-wait until the GP reports no pending BLT in MGP_BLT_STATUS.  Expands to
// a bare while(...) with no body, so it must always be followed by ';'.
#define GU2_WAIT_PENDING while(READ_GP32(MGP_BLT_STATUS) & MGP_BS_BLT_PENDING)

/////////////////////////////////////////////////////////////////////////
// DrvAlphaBlend
//
// Blend source and destination data using a constant or per-pixel alpha values.
//
BOOL DrvAlphaBlend(
	SURFOBJ  *psoDst,
	SURFOBJ  *psoSrc,
	CLIPOBJ  *pco,
	XLATEOBJ *pxlo, 
	RECTL    *prclDst,
	RECTL    *prclSrc,
	BLENDOBJ *pBlendObj)
{
	PDEV *ppdev = NULL;
	RECTL destRegion;
	RECTL *regionPtr;
	CLIPENUM  EnumRects8;
	int more, i, mono = 0;
    ULONG dstSurfOffset, dstSurfPitch;
    ULONG srcSurfOffset, srcSurfPitch;
    ULONG index, fromFB;

    DISPDBG ((ppdev, 1, "DrvAlphaBlend Entry\n"));

    ACQUIRE_PRIMITIVE_SEMAPHORE;

    // PUNT ALL BLTS DURING A MODECHANGE
    // In very rare cases, the mode can be changed before Windows has finished rendering
    // to all surfaces from the previous mode.  We will thus punt all rendering calls until
    // all such surfaces have been deleted.
    //
    if (old_mode_count)
    {
        DISPDBG ((ppdev, 3000, "Punting blend during modechange\n"));
        goto puntIt;
    }

	/* CHECK FOR UNSUPPORTED FORMATS */
	/* We are only supporting a premultiplied BGRA with no other alpha  */
	/* or a constant alpha.                                             */

	if (!psoDst->dhsurf || 
		(pBlendObj->BlendFunction.AlphaFormat &&
		 pBlendObj->BlendFunction.SourceConstantAlpha != 0xFF))
		 goto puntIt;

	if ((prclDst->right  - prclDst->left) != (prclSrc->right  - prclSrc->left) ||
		(prclDst->bottom - prclDst->top)  != (prclSrc->bottom - prclSrc->top))
	{
		goto puntIt;
	}

	/* CHECK FOR NON-32BPP DATA */
	/* Apparently Windows occasionally sends data in non-32BPP formats, */
	/* such as monochrome.                                              */

	if (psoSrc->iBitmapFormat == BMF_1BPP)
	{
		/* MONOCHROME DATA */
		/* Assume the expanded source data is 8:8:8:8 data with alpha data. */
		
		mono = 1;

		GU2_WAIT_PENDING;
		WRITE_GP32 (MGP_SRC_COLOR_BG, pxlo->pulXlate[0]);
		WRITE_GP32 (MGP_SRC_COLOR_FG, pxlo->pulXlate[1]);
	}
	else if (psoSrc->iBitmapFormat != BMF_32BPP)
	{
		DISPDBG ((ppdev, 3000, "Punting Color Conversion Alpha Blend.\n"));
		goto puntIt;
	}

	/* PROGRAM RASTER OPERATION */

	GU2_WAIT_PENDING;
	if (pBlendObj->BlendFunction.SourceConstantAlpha != 0xFF)
	{
		WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (BYTE)pBlendObj->BlendFunction.SourceConstantAlpha |
			MGP_RM_ALPHA_EN_MASK | MGP_RM_ALPHA_A_PLUS_BETA_B | MGP_RM_SELECT_ALPHA_R);
	}
	else
	{
		WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | MGP_RM_A_PLUS_BETA_B | MGP_RM_ALPHA_TO_RGB | MGP_RM_ALPHA_TO_ALPHA);
	}

    // READ DESTINATION AND SOURCE SURFACES 
	// Needed because all surfaces are opaque.
    //
    index = dhsurf_array[(USHORT)psoDst->dhsurf];
    if (index & CACHE_FLAG_SYSTEM)
        goto puntIt;

    dstSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
    dstSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;
	
    if (psoSrc->dhsurf)
    {
        index = dhsurf_array[(USHORT)psoSrc->dhsurf];
        if (index & CACHE_FLAG_SYSTEM)
        {
            fromFB = 0;
            srcSurfOffset = system_heap[(USHORT)index].heap_offset;
            srcSurfPitch  = system_heap[(USHORT)index].flags_and_pitch;
        }
        else
        {
            fromFB = 1;
            srcSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
            srcSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;
        }
    }
    else
    {
        fromFB = 0;
        srcSurfOffset = (ULONG)psoSrc->pvScan0;
        srcSurfPitch  = (ULONG)psoSrc->lDelta;
    }

	if (!pco || pco->iDComplexity == DC_TRIVIAL)
	{
		if (mono)
			MonoToScreen (prclDst, dstSurfOffset, dstSurfPitch,
                srcSurfOffset, srcSurfPitch, prclSrc->left, 
				prclSrc->top,  MGP_BM_DST_REQ);
		else if (fromFB)
			ScreenToScreenBLT (prclDst, dstSurfOffset, dstSurfPitch,
                srcSurfOffset, srcSurfPitch, prclSrc->left, 
				prclSrc->top,  MGP_BM_SRC_FB | MGP_BM_DST_REQ);
		else
			FBBlt (prclDst, dstSurfOffset, dstSurfPitch,
                srcSurfOffset, srcSurfPitch, prclSrc->left, 
				prclSrc->top,  MGP_BM_SRC_FB | MGP_BM_DST_REQ);
	}
	else if (pco->iDComplexity == DC_RECT)
	{
		if (!FindIntersection (prclDst, &pco->rclBounds, &destRegion))
			goto returnTrue;
		
		if (mono)
			MonoToScreen (&destRegion, dstSurfOffset, dstSurfPitch,
                srcSurfOffset, srcSurfPitch,
				prclSrc->left + destRegion.left - prclDst->left, 
				prclSrc->top  + destRegion.top  - prclDst->top,
				MGP_BM_DST_REQ);
		else if (fromFB)
			ScreenToScreenBLT (&destRegion, dstSurfOffset, dstSurfPitch,
                srcSurfOffset, srcSurfPitch,
				prclSrc->left + destRegion.left - prclDst->left, 
				prclSrc->top  + destRegion.top  - prclDst->top,
				MGP_BM_SRC_FB | MGP_BM_DST_REQ);
		else
			FBBlt (&destRegion, dstSurfOffset, dstSurfPitch,
                srcSurfOffset, srcSurfPitch,
				prclSrc->left + destRegion.left - prclDst->left, 
				prclSrc->top  + destRegion.top  - prclDst->top,
				MGP_BM_SRC_FB | MGP_BM_DST_REQ);
	}
	else
	{
		/* COMPLEX CLIPPED */

		CLIPOBJ_cEnumStart(pco, FALSE, CT_RECTANGLES, CD_LEFTDOWN, 0);

		while (1)
		{			
			RECTL region;

			more = CLIPOBJ_bEnum(pco, sizeof (CLIPENUM), (PULONG) &EnumRects8);
														
			/* ITERATE THROUGH ALL RECTANGLES IN THE CURRENT BATCH */

			for (i = 0; i < EnumRects8.c; i++) 
			{
				/* INTERSECT WITH TARGET BOUNDS */

				if (!FindIntersection (prclDst, &EnumRects8.arcl[i], &destRegion))
					continue;

				GU2_WAIT_PENDING;
				if (mono)
					MonoToScreen (&destRegion, dstSurfOffset, dstSurfPitch,
                        srcSurfOffset, srcSurfPitch,
						prclSrc->left + destRegion.left - prclDst->left, 
						prclSrc->top  + destRegion.top  - prclDst->top,
						MGP_BM_DST_REQ);				
				if (fromFB)
					ScreenToScreenBLT (&destRegion, dstSurfOffset, dstSurfPitch,
                        srcSurfOffset, srcSurfPitch,
						prclSrc->left + destRegion.left - prclDst->left, 
						prclSrc->top  + destRegion.top  - prclDst->top,
						MGP_BM_SRC_FB | MGP_BM_DST_REQ);				
				else
					FBBlt (&destRegion, dstSurfOffset, dstSurfPitch,
                        srcSurfOffset, srcSurfPitch,
						prclSrc->left + destRegion.left - prclDst->left, 
						prclSrc->top  + destRegion.top  - prclDst->top,
						MGP_BM_SRC_FB | MGP_BM_DST_REQ);				
			}

			if (!more)
				break;
		}
	}

returnTrue:

    RELEASE_PRIMITIVE_SEMAPHORE;

    DISPDBG ((ppdev, 1, "DrvAlphaBlend Exit\n"));
	return TRUE;

puntIt:
    
    RELEASE_PRIMITIVE_SEMAPHORE;
        
    DISPDBG ((ppdev, 1, "Punting DrvAlphaBlend\n"));
	return EngAlphaBlend(psoDst, psoSrc, pco, pxlo, prclDst, prclSrc, pBlendObj);
}

/////////////////////////////////////////////////////////////////////////
// TransparentBltRegion
//
// Issues the transparent source copy for one destination rectangle that has
// already been clipped.  The source origin is shifted by the same amount the
// clipped rectangle moved relative to the unclipped destination (prclDst), so
// pass region == prclDst for the unclipped case.
//
// Exactly one source path is taken per region:
//   ColorConversion - packed 24BPP source in CPU-addressable memory (FBBlt24)
//   fromFB          - source lives in the frame buffer bitmap heap
//   else            - CPU-addressable source staged through FBBlt
//
static void TransparentBltRegion (RECTL *region, RECTL *prclDst, RECTL *prclSrc,
                                  ULONG dstSurfOffset, ULONG dstSurfPitch,
                                  ULONG srcSurfOffset, ULONG srcSurfPitch,
                                  int ColorConversion, ULONG fromFB)
{
    ULONG srcLeft = (ULONG)(prclSrc->left + region->left - prclDst->left);
    ULONG srcTop  = (ULONG)(prclSrc->top  + region->top  - prclDst->top);

    if (ColorConversion)
        FBBlt24 (region, dstSurfOffset, dstSurfPitch,
            srcSurfOffset, srcSurfPitch, srcLeft, srcTop,
            MGP_BM_SRC_FB);
    else if (fromFB)
        ScreenToScreenBLT (region, dstSurfOffset, dstSurfPitch,
            srcSurfOffset, srcSurfPitch, srcLeft, srcTop,
            MGP_BM_SRC_FB);
    else
        FBBlt (region, dstSurfOffset, dstSurfPitch,
            srcSurfOffset, srcSurfPitch, srcLeft, srcTop,
            MGP_BM_SRC_FB);
}

/////////////////////////////////////////////////////////////////////////
// DrvTransparentBlt
//
// Source copy using transparency
//
// Handled in hardware only when all of the following hold; everything else is
// punted to EngTransparentBlt:
//   - no mode change is in progress (old_mode_count == 0)
//   - the destination is a device surface that is not in the system heap
//   - the source has the device format with a trivial (or no) translation,
//     or it is a 24BPP bitmap in CPU-addressable memory going to a 32BPP device
//   - source and destination rectangles have identical size (no stretching)
//
// Returns TRUE when the BLT was issued to the hardware, otherwise whatever
// EngTransparentBlt returns.
//
BOOL DrvTransparentBlt(
    SURFOBJ  *psoDst,
    SURFOBJ  *psoSrc,
    CLIPOBJ  *pco,
    XLATEOBJ *pxlo,
    RECTL    *prclDst,
    RECTL    *prclSrc,
    ULONG     iTransColor,
    ULONG     ulReserved)
{
    PDEV *ppdev = NULL;
    RECTL destRegion;
    CLIPENUM  EnumRects8;
    int more, i, ColorConversion = 0;
    ULONG dstSurfOffset, dstSurfPitch;
    ULONG srcSurfOffset, srcSurfPitch;
    ULONG index, fromFB;

    DISPDBG ((ppdev, 1, "DrvTransparentBlt Entry\n"));

    ACQUIRE_PRIMITIVE_SEMAPHORE;

    // PUNT ALL BLTS DURING A MODECHANGE
    // In very rare cases, the mode can be changed before Windows has finished rendering
    // to all surfaces from the previous mode.  We will thus punt all rendering calls until
    // all such surfaces have been deleted.

    if (old_mode_count)
    {
        DISPDBG ((ppdev, 3000, "Punting transparent BLT during modechange\n"));
        goto puntIt;
    }

    if (!psoDst->dhsurf)
    {
        DISPDBG ((ppdev, 3000, "Transparent BLT to non-device surface.\n"));
        goto puntIt;
    }

    ppdev = (PDEV *)psoDst->dhpdev;

    if (psoSrc->iBitmapFormat != ppdev->iBitmapFormat ||
        (pxlo && !(pxlo->flXlate & XO_TRIVIAL)) )
    {
        /* CHECK FOR SINGLE SUPPORTED CASE */
        /* We handle 24BPP to 32BPP conversion as it is          */
        /* a relatively simple conversion.  We assume that 24BPP */
        /* bitmaps are never stored in the frame buffer          */
        /* (enforced below once the source surface is known).    */

        if (psoSrc->iBitmapFormat == BMF_24BPP && ppdev->iBitmapFormat == BMF_32BPP)
        {
            ColorConversion = 1;
        }
        else
        {
            DISPDBG ((ppdev, 300, "Punting Color conversion Transparent BLT\n"));
            goto puntIt;
        }
    }

    /* STRETCHING IS NOT SUPPORTED */

    if(((prclDst->right  - prclDst->left) != (prclSrc->right  - prclSrc->left)) ||
       ((prclDst->bottom - prclDst->top)  != (prclSrc->bottom - prclSrc->top)))
    {
        DISPDBG ((ppdev, 3000, "Punting Transparent BLT\n"));
        goto puntIt;
    }

    // READ DESTINATION AND SOURCE SURFACES
    // Needed because all surfaces are opaque.
    // This is done before any GP register is written so that every punt
    // decision is made while the hardware state is still untouched.
    //
    index = dhsurf_array[(USHORT)psoDst->dhsurf];
    if (index & CACHE_FLAG_SYSTEM)
        goto puntIt;

    dstSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
    dstSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;

    if (psoSrc->dhsurf)
    {
        index = dhsurf_array[(USHORT)psoSrc->dhsurf];
        if (index & CACHE_FLAG_SYSTEM)
        {
            fromFB = 0;
            srcSurfOffset = system_heap[(USHORT)index].heap_offset;
            srcSurfPitch  = system_heap[(USHORT)index].flags_and_pitch;
        }
        else
        {
            fromFB = 1;
            srcSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
            srcSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;
        }
    }
    else
    {
        /* GDI-managed bitmap: address it directly through its scan pointer */
        fromFB = 0;
        srcSurfOffset = (ULONG)psoSrc->pvScan0;
        srcSurfPitch  = (ULONG)psoSrc->lDelta;
    }

    /* FBBlt24 reads the source through a CPU pointer.  A source that lives  */
    /* in the frame buffer heap only has a heap offset, which must never be  */
    /* dereferenced, so let GDI handle that (unexpected) combination.        */

    if (ColorConversion && fromFB)
    {
        DISPDBG ((ppdev, 3000, "Punting Transparent BLT\n"));
        goto puntIt;
    }

    /* SET UP RASTER MODE AND TRANSPARENCY */
    /* 0xCC is the source-copy ROP.  The color to skip goes in SRC_COLOR_FG; */
    /* SRC_COLOR_BG is loaded with 0x00FFFFFF for converted 24BPP data       */
    /* (whose top byte is not meaningful) and 0xFFFFFFFF otherwise.          */

    GU2_WAIT_PENDING;
    WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | 0xCC | MGP_RM_SRC_TRANS);
    WRITE_GP32 (MGP_SRC_COLOR_FG, iTransColor);

    if (ColorConversion)
        WRITE_GP32 (MGP_SRC_COLOR_BG, 0x00FFFFFF);
    else
        WRITE_GP32 (MGP_SRC_COLOR_BG, 0xFFFFFFFF);

    /* HANDLE CLIPPING */

    if (!pco || pco->iDComplexity == DC_TRIVIAL)
    {
        TransparentBltRegion (prclDst, prclDst, prclSrc,
            dstSurfOffset, dstSurfPitch, srcSurfOffset, srcSurfPitch,
            ColorConversion, fromFB);
    }
    else if (pco->iDComplexity == DC_RECT)
    {
        /* An empty intersection means there is nothing to draw; still a success */
        if (FindIntersection (prclDst, &pco->rclBounds, &destRegion))
            TransparentBltRegion (&destRegion, prclDst, prclSrc,
                dstSurfOffset, dstSurfPitch, srcSurfOffset, srcSurfPitch,
                ColorConversion, fromFB);
    }
    else
    {
        /* COMPLEX CLIPPED */

        CLIPOBJ_cEnumStart(pco, FALSE, CT_RECTANGLES, CD_LEFTDOWN, 0);

        do
        {
            more = CLIPOBJ_bEnum(pco, sizeof (CLIPENUM), (PULONG) &EnumRects8);

            /* ITERATE THROUGH ALL RECTANGLES IN THE CURRENT BATCH */

            for (i = 0; i < EnumRects8.c; i++)
            {
                /* INTERSECT WITH TARGET BOUNDS */

                if (!FindIntersection (prclDst, &EnumRects8.arcl[i], &destRegion))
                    continue;

                /* The previous rectangle's BLT must have left the pending */
                /* slot before the registers are written again.            */
                GU2_WAIT_PENDING;
                TransparentBltRegion (&destRegion, prclDst, prclSrc,
                    dstSurfOffset, dstSurfPitch, srcSurfOffset, srcSurfPitch,
                    ColorConversion, fromFB);
            }
        } while (more);
    }

    RELEASE_PRIMITIVE_SEMAPHORE;

    DISPDBG ((ppdev, 1, "DrvTransparentBlt Exit\n"));
    return TRUE;

puntIt:

    RELEASE_PRIMITIVE_SEMAPHORE;

    DISPDBG ((ppdev, 1, "Punting DrvTransparentBlt\n"));
    return EngTransparentBlt (psoDst, psoSrc, pco, pxlo, prclDst,
        prclSrc, iTransColor, ulReserved);
}

/////////////////////////////////////////////////////////////////////////
// FBBlt24
//
// Copies a packed 24BPP bitmap in CPU-addressable memory to a 32BPP
// destination.  Each source line is expanded to one DWORD per pixel into a
// scratch line in the last 16K of frame buffer memory, and a one-line GP BLT
// is then started from that scratch line.  Two 8K scratch lines are used
// alternately so the CPU can fill one while the GP consumes the other.
//
//   dstRect    destination rectangle in pixels (must be non-empty)
//   dstOffset  frame buffer offset of the destination surface
//   dstPitch   destination pitch in bytes
//   srcOffset  CPU address of the source bitmap (it is dereferenced)
//   srcPitch   source pitch in bytes
//   srcLeft    first source pixel column
//   srcTop     first source line
//   blt_mode   value written to MGP_BLT_MODE to start each line's BLT
//
// The raster mode and colors must already be programmed, and no BLT may be
// pending on entry: MGP_DST_OFFSET / MGP_WID_HEIGHT are written before the
// first status wait.
//
// NOTE(review): a scratch line is 8K, i.e. 2048 expanded pixels; nothing here
// limits the width to that - confirm the callers guarantee it.
//
void FBBlt24 (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode)
{
	unsigned long gfx_gp_scratch_base, width, height, src_data, src_delta;

	/* REJECT EMPTY OR INVERTED RECTANGLES                                  */
	/* The loops below count the width and height down to exactly zero; a   */
	/* zero or negative dimension would wrap the counters and run far past  */
	/* the scratch area.                                                    */

	if (dstRect->right <= dstRect->left || dstRect->bottom <= dstRect->top)
		return;

	_asm {

		cld

		/* SET SCRATCH BASE      */
		/* Last 16K of FB memory */

		mov     eax, gfx_fb_size
		sub     eax, 0x4000
		mov     gfx_gp_scratch_base, eax

		/* INITIALIZE REGISTER POINTER */

		mov		edi, gfx_virt_gpptr

		/* SAVE DESTINATION DELTA */
		/* Kept at [esp] for the whole routine; popped at FBBlt24Finished */
        
		push    dstPitch
		
		/* SET DEST OFFSET, WIDTH AND HEIGHT */
		/* Dest offset = dstOffset + left * 4 + top * dstPitch */

		mov     eax, dstOffset
		mov     ebx, dstRect
		mov     esi, [ebx]RECTL.left
		mov     edx, [ebx]RECTL.right
		sub     edx, esi
		lea     eax, [eax + esi*4]
		mov     esi, [ebx]RECTL.top
		mov     ecx, [ebx]RECTL.bottom
		sub     ecx, esi
		imul    esi, [esp]
		add     eax, esi
		
		/* AT THIS POINT:    */
		/* EAX = Dest offset */
		/* ECX = Height      */
		/* EDX = Width       */
		
		/* SAVE THE WIDTH AND HEIGHT */

		mov     height, ecx
		mov     width, edx
		
		/* PROGRAM THE WIDTH, HEIGHT AND DEST OFFSET */
		/* The height is programmed to 1.            */
		/* (width in the upper word, 1 in the lower) */

		mov     [edi + MGP_DST_OFFSET], eax
		shl     edx, 16
		inc     edx
		mov     [edi + MGP_WID_HEIGHT], edx

		/* READ DATA POINTER */
		/* Source pointer = srcOffset + srcTop * srcPitch + srcLeft * 3 */

		mov     edx, srcLeft
		imul    edx, 3
		
		mov     esi, srcTop
		mov     ecx, srcPitch
		imul    esi, ecx
		
		add     esi, edx
		add     esi, srcOffset

		/* AT THIS POINT        */
		/* ECX = Source Delta   */
		/* ESI = Source pointer */
		
		mov     src_data, esi
		mov     src_delta, ecx

		/* ZERO LINE INDEX */
		/* Index is 0 or 1 based on current source line */

		xor     ebx, ebx

		/* READY TO WRITE THE DATA */
		/* We must wait for BLT busy as another driver routine may */
		/* have also used the offscreen frame buffer area.         */

FBBlt24WaitBusyLoop:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_BUSY
		jnz     FBBlt24WaitBusyLoop

FBBlt24LineLoop:

		/* SELECT THE SCRATCH LINE (index * 8K) AND POINT THE BLT AT IT */

		mov     edx, ebx
		shl     edx, 13
		add     edx, gfx_gp_scratch_base
		mov     [edi + MGP_SRC_OFFSET], edx
		mov     [edi + MGP_DST_OFFSET], eax
		mov     edi, edx
		add     edi, gfx_virt_fbptr

		/* WRITE ALL DWORDS EXCEPT THE LAST */
		/* We read the 3-byte pixels as a DWORD.  However, you never know when */
		/* reading a dword might push you over the edge.                       */

		mov     ecx, width
		dec     ecx
		jz      FBBlt24LastPixel

FBBlt24PixelLoop:

		/* Copy 4 bytes but advance the source by only 3: the top byte of */
		/* each stored DWORD is the first byte of the following pixel.    */

		movsd
		dec     esi
		loop    FBBlt24PixelLoop

FBBlt24LastPixel:

		/* READ THE LAST PIXEL */
		/* Read the DWORD starting one byte early and shift the extra byte */
		/* out, so nothing beyond the final pixel is touched.              */

		dec     esi
		mov     edx, [esi]
		shr     edx, 8
		mov     [edi], edx

		/* START THE BLT */

		mov     edi, gfx_virt_gpptr
		mov     ecx, blt_mode
		mov     DWORD PTR [edi + MGP_BLT_MODE], ecx

		dec     height
		jz      FBBlt24Finished

		/* ADVANCE TO THE NEXT SOURCE AND DESTINATION LINE */

		mov     esi, src_data
		add     esi, src_delta
		mov     src_data, esi
		add     eax, [esp]
		
		/* ADJUST LINE INDEX */
		/* bl = 1 - bl       */

		dec     bl
		setnz   bl

		/* WAIT FOR BLT PENDING */

FBBlt24WaitPendingLoop2:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     FBBlt24WaitPendingLoop2
		jmp     FBBlt24LineLoop

FBBlt24Finished:		

		/* DISCARD THE SAVED DESTINATION DELTA */

		pop     eax
	}
}

/////////////////////////////////////////////////////////////////////////
// FBBlt
//
// Copies a bitmap in CPU-addressable memory to the destination.  Each source
// line is copied into a scratch line in the last 16K of frame buffer memory
// and a one-line GP BLT is then started from that scratch line.  Two scratch
// lines 8K apart are used alternately so the CPU can fill one while the GP
// consumes the other.
//
//   dstRect    destination rectangle in pixels (must be non-empty)
//   dstOffset  frame buffer offset of the destination surface
//   dstPitch   destination pitch in bytes
//   srcOffset  CPU address of the source bitmap (it is dereferenced)
//   srcPitch   source pitch in bytes
//   srcLeft    first source pixel column
//   srcTop     first source line
//   blt_mode   value written to MGP_BLT_MODE to start each line's BLT
//
// Pixel columns are converted to bytes by shifting with mode_shift.  The
// raster mode must already be programmed, and no BLT may be pending on entry:
// MGP_DST_OFFSET / MGP_WID_HEIGHT are written before the first status wait.
//
// NOTE(review): a scratch line is 8K; nothing here limits the byte width
// (plus up to 3 alignment bytes) to that - confirm the callers guarantee it.
//
void FBBlt (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode)
{
	unsigned long gfx_gp_scratch_base;

	/* REJECT EMPTY OR INVERTED RECTANGLES                                  */
	/* The line counter is decremented to exactly zero and the byte width   */
	/* feeds rep movs directly; a zero or negative dimension would wrap     */
	/* them and run far past the scratch area.                              */

	if (dstRect->right <= dstRect->left || dstRect->bottom <= dstRect->top)
		return;

	_asm {

		cld

		/* SET SCRATCH BASE      */
		/* Last 16K of FB memory */

		mov     eax, gfx_fb_size
		sub     eax, 0x4000
		mov     gfx_gp_scratch_base, eax

		/* INITIALIZE REGISTER POINTER */

		mov		edi, gfx_virt_gpptr

		/* SAVE DESTINATION DELTA */

		push    dstPitch
		
		/* SET DEST OFFSET, WIDTH AND HEIGHT */
		/* Dest offset = dstOffset + (left << mode_shift) + top * dstPitch */

		mov     eax, dstOffset
		mov     ebx, dstRect
		mov     esi, [ebx]RECTL.left
		mov     edx, [ebx]RECTL.right
		sub     edx, esi
		mov     cl, mode_shift
		shl     esi, cl
		add     eax, esi
		mov     esi, [ebx]RECTL.top
		mov     ecx, [ebx]RECTL.bottom
		sub     ecx, esi
		imul    esi, [esp]
		add     eax, esi
		
		/* AT THIS POINT:    */
		/* EAX = Dest offset */
		/* ECX = Height      */
		/* EDX = Width       */
		
		/* SAVE THE WIDTH AND HEIGHT */

		push    ecx
		push    edx

		/* PROGRAM THE WIDTH, HEIGHT AND DEST OFFSET */
		/* The height is programmed to 1.            */
		/* patOrigin is merged into the dest offset  */
		/* value before it is written.               */

		or      eax, patOrigin
		mov     [edi + MGP_DST_OFFSET], eax
		shl     edx, 16
		mov     dx, 1
		mov     [edi + MGP_WID_HEIGHT], edx

		/* READ DATA POINTER */
		/* Source pointer = srcOffset + srcTop * srcPitch + (srcLeft << mode_shift) */

		mov     cl, mode_shift
		mov     edx, srcLeft
		shl     edx, cl

		mov     esi, srcTop
		mov     ecx, srcPitch
		imul    esi, ecx
		
		add     esi, edx
		add     esi, srcOffset
		mov     edx, ecx

		/* PLAY THE ALIGNMENT GAME */
		/* Force DWORD alignment on both source */
		/* and dest for rep movsd               */
		/* The scratch base is given the same low two address bits as the  */
		/* source; the value pushed is the byte count (0-3) needed to      */
		/* reach the next DWORD boundary.                                  */

		mov     ecx, esi
		and     ecx, 3
		add     gfx_gp_scratch_base, ecx
		sub     cl, 4
		neg     cl
		and     cl, 3
		push    ecx

		/* CALCULATE THE DELTA BETWEEN SOURCE LINES                      */
		/* Save the byte count on the stack.  We also try really hard    */
		/* to align the source and destination data on DWORD boundaries. */
		/* We accomplish this by first writing bytes to a DWORD boundary */
		/* and then doing a rep movsd.  This implies that we must start  */
		/* with the source and destination at the same alignment         */
		/* (accomplished above) and ensure that we do handle cases where */
		/* the bytes needed to pad out to a dword are more than the      */
		/* actual byte width (accomplished below)                        */

		mov     ebx, [esp + 4]
		mov     cl, mode_shift
		shl     ebx, cl

		/* COMPARE BYTE WIDTH AGAINST PAD WIDTH */

		mov     ecx, [esp]
		cmp     ebx, ecx
		
		/* SET PADDING TO BYTE WIDTH IF NECESSARY */
		
		cmovl   ecx, ebx
		mov     [esp], ecx
		
		/* EDX = srcPitch - byte width: what must be added to ESI after a */
		/* full line has been copied to reach the start of the next line. */
		/* The width slot is reused for the bytes left after the lead-in. */

		sub     edx, ebx
		sub     ebx, [esp]
		mov     [esp + 4], ebx

		/* AT THIS POINT:                                        */
		/* EAX = Dest offset                                     */
		/* EDX = Source Line Delta                               */
		/* ESI = Source data pointer                             */
		/* [ESP]      = Lead-in byte count (first alignment)     */
		/* [ESP + 4]  = Byte width remaining after the lead-in   */
		/* [ESP + 8]  = Height                                   */
		/* [ESP + 12] = Dest Delta                               */

		/* ZERO LINE INDEX */
		/* Index is 0 or 1 based on current source line */

		xor     ebx, ebx

		/* READY TO WRITE THE DATA */
		/* We must wait for BLT busy as another driver routine may */
		/* have also used the offscreen frame buffer area.         */

FBBltWaitBusyLoop:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_BUSY
		jnz     FBBltWaitBusyLoop

FBBltLineLoop:

		/* WRITE THE REGISTERS SPECIFIC TO EACH LINE */
		/* Scratch line = scratch base + index * 8K  */

		mov     [edi + MGP_DST_OFFSET], eax
		mov     ecx, ebx
		shl     ecx, 13
		add     ecx, gfx_gp_scratch_base
		mov     [edi + MGP_SRC_OFFSET], ecx
		mov     edi, ecx
		add     edi, gfx_virt_fbptr

		/* WRITE THE DATA TO THE FRAME BUFFER                  */
		/* Lead-in bytes, then whole DWORDs, then the 0-3 tail */

		mov     ecx, [esp]
		rep     movsb
		mov     ecx, [esp + 4]
		shr     ecx, 2
		rep     movsd
		movzx   ecx, BYTE PTR [esp + 4]
		and     cl, 3
		rep     movsb

		/* START THE BLT */

		mov     edi, gfx_virt_gpptr
		mov     ecx, blt_mode
		mov     DWORD PTR [edi + MGP_BLT_MODE], ecx

		dec     DWORD PTR [esp + 8]
		jz      FBBltFinished

		/* ADVANCE TO THE NEXT SOURCE AND DESTINATION LINE             */
		/* NOTE(review): the 0x20000000 added to the dest offset value */
		/* presumably steps a pattern Y origin held in its upper bits  */
		/* - confirm against the GP register description.              */

		add     esi, edx
		add     eax, [esp + 12]
		add     eax, 0x20000000
		
		/* ADJUST LINE INDEX */
		/* bl = 1 - bl       */

		dec     bl
		setnz   bl

		/* WAIT FOR BLT PENDING */

FBBltWaitPendingLoop2:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     FBBltWaitPendingLoop2
		jmp     FBBltLineLoop

FBBltFinished:

		/* DROP THE FOUR SAVED DWORDS (lead-in, width, height, dest delta) */

		lea     esp, [esp + 16]
	}
}

/////////////////////////////////////////////////////////////////////////
// ScreenToScreenBLT
//
// Programs and starts one GP BLT whose source and destination are both
// addressed by frame buffer offsets.
//
//   dstRect    destination rectangle in pixels
//   dstOffset  frame buffer offset of the destination surface
//   dstPitch   destination pitch in bytes
//   srcOffset  frame buffer offset of the source surface
//   srcPitch   source pitch in bytes
//   srcLeft    first source pixel column
//   srcTop     first source line
//   blt_mode   value written to MGP_BLT_MODE; MGP_BM_NEG_XDIR /
//              MGP_BM_NEG_YDIR select the offset calculation used
//
// Pixel columns are converted to bytes by shifting with mode_shift.  The
// routine writes MGP_WID_HEIGHT, MGP_DST_OFFSET, MGP_STRIDE, MGP_SRC_OFFSET
// and finally MGP_BLT_MODE.  It does not poll MGP_BLT_STATUS itself, so the
// caller is expected to have waited for the pending BLT to clear.
//
void ScreenToScreenBLT (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode)
{
	_asm {

		/* SET REGISTER POINTER */

		mov		edi, gfx_virt_gpptr

		/* SET WIDTH AND HEIGHT */
		
		mov     eax, dstRect
		mov     ebx, [eax]RECTL.left
		mov     ecx, [eax]RECTL.right
		mov     esi, [eax]RECTL.top
		mov     edx, [eax]RECTL.bottom
		sub     ecx, ebx
		sub     edx, esi

		/* AT THIS POINT:  */
		/* EDX = Height    */
		/* EBX = Dest Left */
		/* ECX = Width     */
		/* ESI = Dest Top  */
		/* Hang on to the width and height.  Negative x and y BLTs */
		/* require the src and dest offsets to point to the first  */
		/* pixel.  To save a multiply, we will simply adjust the   */
		/* x and y coordinates before the multiply.                */

		/* Pack width into the upper word and height into the lower */
		/* word: shrd moves the low 16 bits of ECX into EAX[31:16]. */

		shrd    eax, ecx, 16
		mov     ax, dx
		mov     [edi + MGP_WID_HEIGHT], eax

		/* CALCULATE SOURCE AND DEST OFFSETS                     */
		/* BLTs with no negative x or y have a shorter code path */

		test    WORD PTR blt_mode, MGP_BM_NEG_YDIR | MGP_BM_NEG_XDIR
		jnz     CalculateOffsetsWithNegativeDirections

		/* Only CL is replaced; the shifts below use CL alone */

		mov     cl, mode_shift

		/* Dest offset = top * dstPitch + (left << mode_shift) + dstOffset, */
		/* merged with patOrigin before it is written.                      */

		mov     eax, dstPitch
		imul    esi, eax
		shl     ebx, cl
		add     esi, ebx
		add     esi, dstOffset
		or      esi, patOrigin
		mov     [edi + MGP_DST_OFFSET], esi

		/* AT THIS POINT:   */
		/* EAX = Dest Delta */
		/* ECX = Mode Shift */

		/* Stride register = source pitch in the upper word, dest pitch in */
		/* the lower word.  Source offset = srcTop * srcPitch +            */
		/* (srcLeft << mode_shift) + srcOffset.                            */

		mov     esi, srcTop
		mov     ebx, srcPitch
		imul    esi, ebx
		shl     ebx, 16
		mov     bx, ax
		mov     [edi + MGP_STRIDE], ebx
		mov     eax, srcLeft
		shl     eax, cl
		add     esi, eax
		add     esi, srcOffset
		mov     [edi + MGP_SRC_OFFSET], esi
		jmp     ScreenBltCalculationDone

CalculateOffsetsWithNegativeDirections:

		/* PREPARE FOR POSSIBLE ADJUSTMENTS FOR NEGATIVE BLTS */
		/* EDX = Height                                       */
		/* EBX = Dest Left                                    */
		/* ECX = Width                                        */
		/* ESI = Dest Top                                     */

		/* Branch-free select: EAX becomes an all-ones mask when NEG_YDIR */
		/* is set and zero otherwise, so EDX ends up (height - 1) or 0.   */
				
		xor     eax, eax
		dec     edx
		test    WORD PTR blt_mode, MGP_BM_NEG_YDIR
		setz    al
		dec     eax
		and     edx, eax

		/* Same trick for X.  lea does not alter the flags, so setnz still */
		/* sees the result of the test: EAX = 1 when NEG_XDIR is set.      */
		/* EDI is used as a scratch register from here on; the register    */
		/* pointer is reloaded into EAX further down.                      */

		xor     eax, eax
		test    WORD PTR blt_mode, MGP_BM_NEG_XDIR
		setz    al
		lea     edi, [eax - 1]
		setnz   al
		and     edi, ecx
		mov     cl, mode_shift
		shl     edi, cl
		shl     ebx, cl
		sub     edi, eax

		/* AT THIS POINT:                           */
		/* EDX = (0 or height - 1)                  */
		/* EBX = Dest Left * BytesPerPixel          */
		/* EDI = (0 or (width * BytesPerPixel) - 1) */
		/* ESI = Dest Top                           */
		/* ECX = mode_shift                         */

		/* SET DEST OFFSET */

		add     esi, edx
		imul    esi, dstPitch
		add     esi, ebx
		add     esi, edi
		add     esi, dstOffset
		mov     ebx, dstPitch
		mov     eax, gfx_virt_gpptr
		or      esi, patOrigin
		mov     [eax + MGP_DST_OFFSET], esi

		/* EBX = Dest Pitch                         */
		/* EDX = (0 or height - 1)                  */
		/* EDI = (0 or (width * BytesPerPixel) - 1) */
		/* ECX = mode_shift                         */

		/* SET SOURCE OFFSET AND STRIDE                                  */
		/* The same X and Y adjustments are applied to the source start. */

		mov     esi, srcLeft
		shl     esi, cl
		add     edi, esi
		mov     ecx, srcTop
		add     ecx, edx
		mov     edx, srcPitch
		imul    ecx, edx
		add     ecx, edi
		shl     edx, 16
		mov     dx, bx
		add     ecx, srcOffset

		mov     [eax + MGP_STRIDE], edx
		mov     [eax + MGP_SRC_OFFSET], ecx

		/* Restore the register pointer expected by the common tail */

		mov     edi, eax

ScreenBltCalculationDone:

		/* START THE BLT */

		mov     eax, blt_mode
		mov     DWORD PTR [edi + MGP_BLT_MODE], eax
		
	}
}
