 /*
  * <LIC_AMD_STD>
  * Copyright (C) <years> Advanced Micro Devices, Inc.  All Rights Reserved.
  * </LIC_AMD_STD>
  * 
  * <CTL_AMD_STD>
  * </CTL_AMD_STD>
  * 
  * <DOC_AMD_STD>
  *     DrvBitBlt
  *     DrvCopyBits
  *     DrvSynchronize
  * </DOC_AMD_STD>
  * 
  */

#include "precomp.h"
#include "rop4.h"
#include "gfx_regs.h"
#include "gfx_defs.h"

/* HARDWARE POLLING MACROS                                                  */
/* Each macro expands to a bare `while (cond)` with no body, so it must be  */
/* followed by a `;` at the point of use (the `;` becomes the empty loop    */
/* body).  Both spin on the MGP_BLT_STATUS register.                        */
/*   GU2_WAIT_PENDING - spins while MGP_BS_BLT_PENDING is set.              */
/*   GU2_WAIT_BUSY    - spins while MGP_BS_BLT_BUSY is set.                 */

#define GU2_WAIT_PENDING while(READ_GP32(MGP_BLT_STATUS) & MGP_BS_BLT_PENDING)
#define GU2_WAIT_BUSY while(READ_GP32(MGP_BLT_STATUS) & MGP_BS_BLT_BUSY)

/* FUNCTION POINTER ROUTINES */
/* All four renderers share one signature so that DrvBitBlt and DrvCopyBits */
/* can select one through a function pointer when enumerating a complex     */
/* clip list.  MonoToScreen and Blt15To16 are defined later in this file;   */
/* ScreenToScreenBLT and FBBlt are not defined in the visible portion.      */

void ScreenToScreenBLT (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);
void FBBlt (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);
void MonoToScreen (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);
void Blt15To16 (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, ULONG srcOffset, 
            ULONG srcPitch, ULONG srcLeft, ULONG srcTop, unsigned long blt_mode);

/* GLOBALS DEFINED ELSEWHERE */
/* As used in this file: gfx_virt_gpptr is the base for the MGP_* register  */
/* accesses in the inline assembly, gfx_virt_fbptr is added to a frame      */
/* buffer offset to form a CPU pointer, gu2_bpp is OR'd into the value      */
/* written to MGP_RASTER_MODE, mode_shift is the left-shift that converts   */
/* an x coordinate into a byte offset, and gfx_fb_size is used to locate    */
/* the scratch area at the end of the frame buffer.                         */

extern unsigned char *gfx_virt_fbptr;
extern unsigned char *gfx_virt_gpptr;
extern unsigned long  gu2_bpp;
extern unsigned char  mode_shift;
extern unsigned long  gfx_fb_size;

/* Pattern origin bits (x origin at bit 26, y origin at bit 29) that are    */
/* combined into the value written to MGP_DST_OFFSET.  It is only           */
/* recomputed when a monochrome brush is in use; otherwise the value left   */
/* by the previous operation is reused.                                     */

ULONG patOrigin = 0;

/*-------------------------------------------------------------------------
 * DrvBitBlt
 *
 * BitBlt is responsible for a wide variety of rectangular fills,
 * memory to screen blits, and screen to memory blits. Screen to screen
 * blits that carry a raster operation come here as well. Note that in
 * this file DrvCopyBits is a separate implementation, not a wrapper
 * around this routine.
 -------------------------------------------------------------------------*/

BOOL DrvBitBlt(
	SURFOBJ *psoTrg,		/* target surface */
	SURFOBJ *psoSrc,		/* source surface */
	SURFOBJ *psoMask,		/* 1 bpp mask for masking destination */
	CLIPOBJ *pco,			/* clip object */
	XLATEOBJ *pxlo,			/* translation object - color translation tables */
	RECTL *prclDst,			/* target rectangle */
	POINTL *pptlSrc,		/* origin of source */
	POINTL *pptlMask,		/* origin of mask */
	BRUSHOBJ *pbo,			/* brush object */
	POINTL *pptlBrush,		/* brush origin */
	ROP4 rop4)				/* raster operation code */
{
	/* OVERVIEW                                                              */
	/* Accelerates ROP3-style blits (foreground ROP == background ROP) to a  */
	/* device surface that lives in the frame buffer heap.  Everything else  */
	/* is handed to EngBitBlt through one of three punt exits:               */
	/*   reversePuntIt - target is not a device surface (source is).         */
	/*   puntSystem    - target is a device bitmap kept in system memory.    */
	/*   puntIt        - unsupported ROP, format, translation or brush.      */
	/* Every exit releases the primitive semaphore and returns TRUE; the     */
	/* return value of EngBitBlt is not propagated.                          */

	PDEV*       ppdev          = NULL;
	PBRUSH*     gBrushPtr;                 /* Pointer to resolved brush */
	RECTL       destRegion;                /* Clipped destination rectangle */
	int         more, i, monobrush = 0;
	int         blt15 = 0;                 /* Non-trivial xlate at 16 BPP */
	void       (*renderFunc)(RECTL *, ULONG, ULONG, ULONG, ULONG, ULONG, ULONG, ULONG);
	CLIPENUM    EnumRects8;
	ULONG       blt_mode;
	RECTL*      clippedTargetPtr;
	ULONG       destoffset;
	long        x, y;                      /* Source origin, adjusted for clipping */
	ULONG       dstSurfOffset, dstSurfPitch;
	ULONG       srcSurfOffset, srcSurfPitch;
	ULONG       index, fromFB;

	DISPDBG ((ppdev, 1, "DrvBitBlt Entry\n"));

	ACQUIRE_PRIMITIVE_SEMAPHORE;

	if (!psoTrg->dhsurf)
	{
		ppdev = (PDEV *)psoSrc->dhpdev;
		goto reversePuntIt;
	}
	ppdev = (PDEV *)psoTrg->dhpdev;

	// PUNT ALL BLTS DURING A MODECHANGE
	// In very rare cases, the mode can be changed before Windows has finished rendering
	// to all surfaces from the previous mode.  We will thus punt all rendering calls until
	// all such surfaces have been deleted.

	if (old_mode_count)
	{
		DISPDBG ((ppdev, 3000, "Punting BLT during modechange\n"));
		goto puntIt;
	}

	/* PUNT TRUE ROP4s                                                   */
	/* The low and high bytes must match, i.e. the mask has no effect.   */

	if ((rop4 & 0xFF) != ((USHORT)rop4 >> 8))
		goto puntIt;

	/* SET DESTINATION INVOLVED                             */
	/* DST_REQ is bit 2.  GX_ROP_DST is bit 2.  Convenient! */

	blt_mode = (ULONG)(rop4Flags[(UCHAR)rop4] & GX_ROP_DST);

	/* CHECK FOR PATTERN FILL CASE                                           */
	/* Pattern fills are much simpler than anything that involves source, as */
	/* well as more common...                                                */

	if (!(rop4Flags[(UCHAR)rop4] & GX_ROP_SRC))
	{
		index = dhsurf_array[(USHORT)psoTrg->dhsurf];
		if (index & CACHE_FLAG_SYSTEM)
			goto puntSystem;

		dstSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
		dstSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;

		/* FIX FOR ISSUE 134.8 */

		blt_mode |= 0x40;

		GU2_WAIT_PENDING;
		if (rop4Flags[(UCHAR)rop4] & GX_ROP_PAT)
		{
			/* CHECK FOR SOLID COLOR                                       */
			/* Color != 0xFFFFFFFF implies solid color instead of pattern. */

			if (pbo->iSolidColor != 0xffffffff)
			{
				/* SET SOLID COLOR                                     */
				/* Write raster mode first so GX2 replicates the data. */

				WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (UCHAR)rop4);
				WRITE_GP32 (MGP_PAT_COLOR_0, pbo->iSolidColor);
			}
			else
			{
				/* CHECK BRUSH TYPE */
				/* At this time, we will only support monochrome brushes.  This may change. */

				if (pbo->pvRbrush != NULL)
					gBrushPtr = pbo->pvRbrush;
				else
				{
					gBrushPtr = BRUSHOBJ_pvGetRbrush (pbo);

					DISPDBG((ppdev, 1, "gBrushPtr returned 0x%x\n", gBrushPtr));

					if (gBrushPtr == NULL)
					{
						goto puntIt;
					}
				}

				/* BRUSH MUST BE MONO     */
				/* For now, we know that our brush is mono, as our realize brush routine */
				/* fails anything else.  Furthermore, we can rotate the brush using the  */
				/* handy dandy pattern origin bits in the GX2.                           */
				/* The y term is not masked: shifting left by 29 keeps only its low      */
				/* three bits in a 32-bit value.                                         */

				monobrush = 1;
				patOrigin = (((4000 - pptlBrush->x + prclDst->left) & 7) << 26) |
					         ((4000 - pptlBrush->y + prclDst->top) << 29);

				/* WRITE THE PATTERN DATA */
				/* Write the raster mode register first such that color data is */
				/* replicated by the GP.                                        */

				WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (UCHAR)rop4 | MGP_RM_PAT_MONO);
				WRITE_GP32 (MGP_PAT_COLOR_0, gBrushPtr->Color0);
				WRITE_GP32 (MGP_PAT_COLOR_1, gBrushPtr->Color1);
				WRITE_GP32 (MGP_PAT_DATA_0,  (*((unsigned long *)gBrushPtr->Pattern)));
				WRITE_GP32 (MGP_PAT_DATA_1,  (*((unsigned long *)(gBrushPtr->Pattern + 4))));
			}
		}
		else
		{
			/* DESTINATION ONLY, WHITENESS OR BLACKNESS */
			/* Just write ROP.                          */

			WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (UCHAR)rop4);
		}

		/* PROGRAM THE DESTINATION STRIDE */

		WRITE_GP32 (MGP_STRIDE, dstSurfPitch);

		/* BRUSH AND ROP SETUP COMPLETE */
		/* Handle clipping              */
		/* NOTE(review): when no mono brush is involved, patOrigin still holds */
		/* whatever the previous operation left in it.                         */

		if (!pco || pco->iDComplexity == DC_TRIVIAL)
		{
			destoffset  = (dstSurfPitch * prclDst->top +
				          (prclDst->left << mode_shift)) + patOrigin;
			destoffset += dstSurfOffset;

			WRITE_GP32 (MGP_DST_OFFSET, destoffset);
			WRITE_GP32 (MGP_WID_HEIGHT,
				(((prclDst->right - prclDst->left) << 16) |
				  (prclDst->bottom - prclDst->top)));
			WRITE_GP32 (MGP_BLT_MODE, blt_mode);
		}
		else if (pco->iDComplexity == DC_RECT)
		{
			if (!FindIntersection (prclDst, &pco->rclBounds, &destRegion))
				goto returnTrue;

			/* ADJUST PATTERN ORIGIN */

			if (monobrush)
			{
				patOrigin = (((4000 - pptlBrush->x + destRegion.left) & 7) << 26) |
							 ((4000 - pptlBrush->y + destRegion.top) << 29);
			}
			destoffset = (dstSurfPitch * destRegion.top +
				         (destRegion.left << mode_shift)) + patOrigin;
			destoffset += dstSurfOffset;

			WRITE_GP32 (MGP_DST_OFFSET, destoffset);
			WRITE_GP32 (MGP_WID_HEIGHT,
				(((destRegion.right - destRegion.left) << 16) |
				  (destRegion.bottom - destRegion.top)));
			WRITE_GP32 (MGP_BLT_MODE, blt_mode);
		}
		else
		{
			/* COMPLEX CLIPPING */

			CLIPOBJ_cEnumStart(pco, FALSE, CT_RECTANGLES, CD_ANY, 0);

			while (1)
			{
				more = CLIPOBJ_bEnum(pco, sizeof (CLIPENUM), (PULONG) &EnumRects8);

				/* ITERATE THROUGH ALL RECTANGLES IN THE CURRENT BATCH */

				for (i = 0; i < EnumRects8.c; i++)
				{
					/* INTERSECT WITH TARGET BOUNDS */

					if (!FindIntersection (prclDst, &EnumRects8.arcl[i], &destRegion))
						continue;

					/* ADJUST PATTERN ORIGIN */

					if (monobrush)
					{
						patOrigin = (((4000 - pptlBrush->x + destRegion.left) & 7) << 26) |
							         ((4000 - pptlBrush->y + destRegion.top) << 29);
					}
					destoffset = (dstSurfPitch * destRegion.top +
				         (destRegion.left << mode_shift)) + patOrigin;
					destoffset += dstSurfOffset;

					/* Wait for the previous rectangle to be latched before */
					/* overwriting the per-BLT registers.                   */

					GU2_WAIT_PENDING;
					WRITE_GP32 (MGP_DST_OFFSET, destoffset);
					WRITE_GP32 (MGP_WID_HEIGHT,
						(((destRegion.right - destRegion.left) << 16) |
						  (destRegion.bottom - destRegion.top)));
					WRITE_GP32 (MGP_BLT_MODE, blt_mode);
				}

				if (!more)
					break;
			}
		}
		goto returnTrue;
	}

	/* CALCULATE PUNT CASES                                             */
	/* The following punt cases are only relevant if source is present. */

	if (psoSrc->iBitmapFormat != BMF_1BPP)
	{
		if (psoSrc->iBitmapFormat != psoTrg->iBitmapFormat)
			goto puntIt;

		if (pxlo && !(pxlo->flXlate & XO_TRIVIAL))
		{
			if (psoTrg->iBitmapFormat != BMF_16BPP)
				goto puntIt;
			else
				blt15 = 1;
		}
	}
	else if (!pxlo || !pxlo->pulXlate)
	{
		/* The monochrome expansion below reads pxlo->pulXlate[0] and [1] */
		/* for the background and foreground colors.  Without a          */
		/* translation vector we cannot program them, so let GDI render. */

		goto puntIt;
	}

	/* CHECK FOR PATTERN */
	/* Also writes the MGP_RASTER_MODE register, so wait for the GP */

	GU2_WAIT_PENDING;
	if (rop4Flags[(UCHAR)rop4] & GX_ROP_PAT)
	{
		/* CHECK FOR SOLID COLOR                                       */
		/* Color != 0xFFFFFFFF implies solid color instead of pattern. */

		if (pbo->iSolidColor != 0xffffffff)
		{
			/* SET SOLID COLOR                                     */
			/* Write raster mode first so GX2 replicates the data. */

			WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (UCHAR)rop4);
			WRITE_GP32 (MGP_PAT_COLOR_0, pbo->iSolidColor);
		}
		else
		{
			/* CHECK BRUSH TYPE */
			/* At this time, we will only support monochrome brushes.  This may change. */

			if (pbo->pvRbrush != NULL)
				gBrushPtr = pbo->pvRbrush;
			else
			{
				gBrushPtr = BRUSHOBJ_pvGetRbrush (pbo);

				DISPDBG((ppdev, 1, "gBrushPtr returned 0x%x\n", gBrushPtr));

				if (gBrushPtr == NULL)
				{
					goto puntIt;
				}
			}

			/* BRUSH MUST BE MONO     */
			/* For now, we know that our brush is mono, as our realize brush routine */
			/* fails anything else.  Furthermore, we can rotate the brush using the  */
			/* handy dandy pattern origin bits in the GX2.                           */

			monobrush = 1;
			patOrigin = (((4000 - pptlBrush->x + prclDst->left) & 7) << 26) |
					     ((4000 - pptlBrush->y + prclDst->top) << 29);

			/* WRITE THE PATTERN DATA */
			/* Write the raster mode register first such that color data is */
			/* replicated by the GP.                                        */

			WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (UCHAR)rop4 | MGP_RM_PAT_MONO);
			WRITE_GP32 (MGP_PAT_COLOR_0, gBrushPtr->Color0);
			WRITE_GP32 (MGP_PAT_COLOR_1, gBrushPtr->Color1);
			WRITE_GP32 (MGP_PAT_DATA_0,  (*((unsigned long *)gBrushPtr->Pattern)));
			WRITE_GP32 (MGP_PAT_DATA_1,  (*((unsigned long *)(gBrushPtr->Pattern + 4))));
		}
	}
	else
	{
		/* DESTINATION ONLY, WHITENESS OR BLACKNESS */
		/* Just write ROP.                          */

		WRITE_GP32 (MGP_RASTER_MODE, gu2_bpp | (UCHAR)rop4);
	}

	// READ DESTINATION AND SOURCE SURFACES
	// Needed because all surfaces are opaque.
	//
	index = dhsurf_array[(USHORT)psoTrg->dhsurf];
	if (index & CACHE_FLAG_SYSTEM)
		goto puntSystem;

	dstSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
	dstSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;

	if (psoSrc->dhsurf)
	{
		/* Device-managed source: either a system-memory bitmap or one */
		/* that lives in the frame buffer heap.                        */

		index = dhsurf_array[(USHORT)psoSrc->dhsurf];
		if (index & CACHE_FLAG_SYSTEM)
		{
			fromFB = 0;
			srcSurfOffset = system_heap[(USHORT)index].heap_offset;
			srcSurfPitch  = system_heap[(USHORT)index].flags_and_pitch;
		}
		else
		{
			fromFB = 1;
			srcSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
			srcSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;
		}
	}
	else
	{
		/* Engine-managed source: use the bits pointer and delta directly. */

		fromFB = 0;
		srcSurfOffset = (ULONG)psoSrc->pvScan0;
		srcSurfPitch  = (ULONG)psoSrc->lDelta;
	}

	/* CLIPPING */
	/* Complex-clipped source BLTs default to function pointers.  */

	x = pptlSrc->x;
	y = pptlSrc->y;

	if (!pco || pco->iDComplexity == DC_TRIVIAL || pco->iDComplexity == DC_RECT)
	{
		if (pco && pco->iDComplexity == DC_RECT)
		{
			clippedTargetPtr = &destRegion;
			if (!FindIntersection (prclDst, &pco->rclBounds, &destRegion))
				goto returnTrue;

			/* ADJUST PATTERN ORIGIN */

			if (monobrush)
			{
				patOrigin = (((4000 - pptlBrush->x + destRegion.left) & 7) << 26) |
							 ((4000 - pptlBrush->y + destRegion.top) << 29);
			}

			/* Move the source origin by the same amount the clip moved */
			/* the destination origin.                                  */

			x += (destRegion.left - prclDst->left);
			y += (destRegion.top  - prclDst->top);
		}
		else
			clippedTargetPtr = prclDst;

		if (blt15)
		{
			Blt15To16 (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, blt_mode | MGP_BM_SRC_FB);
		}
		else if (psoSrc->dhsurf == psoTrg->dhsurf)
		{
			/* SET DIRECTION FOR A POSSIBLY OVERLAPPING COPY                   */
			/* x and y have already been moved by the clip offset, so they     */
			/* must be compared against the clipped destination origin.        */
			/* Comparing against the unclipped prclDst could pick the wrong    */
			/* direction whenever the clip rectangle trims the left/top edge.  */

			if (clippedTargetPtr->left > x)
				blt_mode |= MGP_BM_NEG_XDIR;
			if (clippedTargetPtr->top > y)
				blt_mode |= MGP_BM_NEG_YDIR;

			ScreenToScreenBLT (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, blt_mode | MGP_BM_SRC_FB);
		}
		else if (fromFB)
		{
			ScreenToScreenBLT (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, blt_mode | MGP_BM_SRC_FB);

		}
		else if (psoSrc->iBitmapFormat == BMF_1BPP)
		{
			/* WRITE MONO COLORS */
			/* We will NOT set the monochrome flag in the BLT mode register.  */
			/* The mono BLT routine will attempt to treat the data as         */
			/* byte-packed data.                                              */

			WRITE_GP32 (MGP_SRC_COLOR_BG, pxlo->pulXlate[0]);
			WRITE_GP32 (MGP_SRC_COLOR_FG, pxlo->pulXlate[1]);

			/* MONOCHROME BLT */

			MonoToScreen (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, blt_mode);
		}
		else
		{
			FBBlt (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, blt_mode | MGP_BM_SRC_FB);
		}
	}
	else
	{
		/* COMPLEX CLIPPING */

		ULONG enumOrder = CD_LEFTDOWN;

		/* COMPLEX CLIPPING - RESORT TO FUNCTION POINTERS */
		/* My opinion is that function pointers are often slow.  However, this */
		/* implementation only uses function pointers for complex clipping.    */

		if (blt15)
		{
			renderFunc = Blt15To16;
			blt_mode |= MGP_BM_SRC_FB;
		}
		else if (psoSrc->dhsurf == psoTrg->dhsurf)
		{
			renderFunc = ScreenToScreenBLT;

			/* The clip rectangles must be visited in the same direction */
			/* the hardware copies, so build both from the same tests.   */

			enumOrder = 0;
			if (prclDst->left > x)
			{
				enumOrder |= CD_LEFTWARDS;
				blt_mode  |= MGP_BM_NEG_XDIR;
			}
			if (prclDst->top > y)
			{
				enumOrder |= CD_UPWARDS;
				blt_mode  |= MGP_BM_NEG_YDIR;
			}
			blt_mode |= MGP_BM_SRC_FB;
		}
		else if (fromFB)
		{
			renderFunc = ScreenToScreenBLT;

			blt_mode |= MGP_BM_SRC_FB;
		}
		else if (psoSrc->iBitmapFormat == BMF_1BPP)
		{
			/* WRITE MONO COLORS */
			/* We will NOT set the monochrome flag in the BLT mode register.  */
			/* The mono BLT routine will attempt to treat the data as         */
			/* byte-packed data.                                              */

			WRITE_GP32 (MGP_SRC_COLOR_BG, pxlo->pulXlate[0]);
			WRITE_GP32 (MGP_SRC_COLOR_FG, pxlo->pulXlate[1]);

			renderFunc = MonoToScreen;
		}
		else
		{
			renderFunc = FBBlt;
			blt_mode |= MGP_BM_SRC_FB;
		}

		CLIPOBJ_cEnumStart(pco, FALSE, CT_RECTANGLES, enumOrder, 0);

		while (1)
		{
			more = CLIPOBJ_bEnum(pco, sizeof (CLIPENUM), (PULONG) &EnumRects8);

			/* ITERATE THROUGH ALL RECTANGLES IN THE CURRENT BATCH */

			for (i = 0; i < EnumRects8.c; i++)
			{
				/* INTERSECT WITH TARGET BOUNDS */

				if (!FindIntersection (prclDst, &EnumRects8.arcl[i], &destRegion))
					continue;

				/* ADJUST PATTERN ORIGIN */

				if (monobrush)
				{
					patOrigin = (((4000 - pptlBrush->x + destRegion.left) & 7) << 26) |
								 ((4000 - pptlBrush->y + destRegion.top) << 29);
				}

				GU2_WAIT_PENDING;
				renderFunc (&destRegion, dstSurfOffset, dstSurfPitch,
					srcSurfOffset, srcSurfPitch,
					x + destRegion.left - prclDst->left,
					y + destRegion.top  - prclDst->top,
					blt_mode);
			}

			if (!more)
				break;
		}
	}

returnTrue:

	RELEASE_PRIMITIVE_SEMAPHORE;

	DISPDBG ((ppdev, 1, "DrvBitBlt Exit\n"));
	return TRUE;

puntSystem:

	DISPDBG ((ppdev, 1, "Punting System Memory BLT\n"));

	// This code assumes 'index' has been setup immediately before jumping to
	// this location.
	//
	index &= 0xFFFF;
	if (!system_heap[index].lockedSurf)
	{
		if (!AddLockedSurfForSystemBitmap (ppdev, index))
			goto returnTrue;
	}
	psoTrg = system_heap[index].lockedSurf;

	if (psoSrc && psoSrc->dhsurf)
	{
		MAKE_PUNTABLE(psoSrc);
	}

	/* Let the hardware finish before GDI touches the surfaces. */

	GU2_WAIT_BUSY;

	EngBitBlt (psoTrg, psoSrc, psoMask, pco, pxlo, prclDst, pptlSrc,
		pptlMask, pbo, pptlBrush, rop4);

	RELEASE_PRIMITIVE_SEMAPHORE;

	return TRUE;

reversePuntIt:

	DISPDBG ((ppdev, 1, "DrvBitBlt Reverse Punt\n"));

	// RETRIEVE LOCKED SURFACE
	//
	MAKE_PUNTABLE(psoSrc);

	GU2_WAIT_BUSY;

	EngBitBlt (psoTrg, psoSrc, psoMask, pco, pxlo, prclDst, pptlSrc,
		pptlMask, pbo, pptlBrush, rop4);

	RELEASE_PRIMITIVE_SEMAPHORE;

	return TRUE;

puntIt:

	DISPDBG ((ppdev, 1, "DrvBitBlt Punt\n"));

	MAKE_PUNTABLE(psoTrg);

	if (psoSrc && psoSrc->dhsurf)
	{
		MAKE_PUNTABLE(psoSrc);
	}

	GU2_WAIT_BUSY;

	// CALL ENGBITBLT

	EngBitBlt (psoTrg, psoSrc, psoMask, pco, pxlo, prclDst, pptlSrc,
		pptlMask, pbo, pptlBrush, rop4);

	RELEASE_PRIMITIVE_SEMAPHORE;

	return TRUE;
}

/*---------------------------------------------------------------------------
 * DrvCopyBits
 *
 * This function handles a variety of data copies from memory to screen,
 * screen to memory, and screen to screen. 
 ---------------------------------------------------------------------------*/

BOOL DrvCopyBits(
	SURFOBJ *psoTrg,  /* target surface                                */
	SURFOBJ *psoSrc,   /* source surface                                */
	CLIPOBJ *pco,   /* clip object                                   */
	XLATEOBJ *pxlo, /* translation object - color translation tables */
	RECTL *prclDst, /* target rectangle                              */
	POINTL *pptlSrc)  /* origin of source                              */
{
	/* OVERVIEW                                                              */
	/* Straight source copy (raster mode is programmed with ROP 0xCC) to a   */
	/* device surface in the frame buffer heap.  Unsupported cases are       */
	/* handed to EngCopyBits through one of three punt exits:                */
	/*   reversePuntIt - target is not a device surface (source is).         */
	/*   puntSystem    - target is a device bitmap kept in system memory.    */
	/*   puntIt        - mode change pending, or unsupported format or       */
	/*                   color translation.                                  */
	/* Every exit releases the primitive semaphore and returns TRUE; the     */
	/* return value of EngCopyBits is not propagated.                        */

	PDEV        *ppdev = NULL;
	RECTL       *clippedTargetPtr;
	CLIPENUM     EnumRects8;
	ULONG        blt_mode = 0;
	RECTL        destRegion;          /* Clipped destination rectangle */
	void        (*renderFunc)(RECTL *, ULONG, ULONG, ULONG, ULONG, ULONG, ULONG, ULONG);
	ULONG       dstSurfOffset, dstSurfPitch;
	ULONG       srcSurfOffset, srcSurfPitch;
	int          i, more, blt15 = 0;
	long        x, y;                 /* Source origin, adjusted for clipping */
	ULONG       index, fromFB;

	DISPDBG ((ppdev, 1, "DrvCopyBits Entry\n"));

	ACQUIRE_PRIMITIVE_SEMAPHORE;

	if (!psoTrg->dhsurf)
	{
		ppdev = (PDEV *)psoSrc->dhpdev;
		goto reversePuntIt;
	}
	ppdev = (PDEV *)psoTrg->dhpdev;

	// PUNT ALL BLTS DURING A MODECHANGE
	// In very rare cases, the mode can be changed before Windows has finished rendering
	// to all surfaces from the previous mode.  We will thus punt all rendering calls until
	// all such surfaces have been deleted.

	if (old_mode_count)
	{
		DISPDBG ((ppdev, 3000, "Punting CopyBits during modechange\n"));
		goto puntIt;
	}

	/* CALCULATE PUNT CASES                     */
	/* We do not punt 15-16 color translations. */

	if (psoSrc->iBitmapFormat != BMF_1BPP)
	{
		if (psoSrc->iBitmapFormat != psoTrg->iBitmapFormat)
			goto puntIt;

		if (pxlo && !(pxlo->flXlate & XO_TRIVIAL))
		{
			if (psoTrg->iBitmapFormat != BMF_16BPP)
				goto puntIt;

			else
				blt15 = 1;
		}
	}
	else if (!pxlo || !pxlo->pulXlate)
	{
		/* The monochrome expansion below reads pxlo->pulXlate[0] and [1] */
		/* for the background and foreground colors.  Without a          */
		/* translation vector we cannot program them, so let GDI render. */

		goto puntIt;
	}

	// READ DESTINATION AND SOURCE INFORMATION
	// Needed because all device surfaces are opaque.
	//
	index = dhsurf_array[(USHORT)psoTrg->dhsurf];
	if (index & CACHE_FLAG_SYSTEM)
		goto puntSystem;

	dstSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
	dstSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;

	if (psoSrc->dhsurf)
	{
		/* Device-managed source: either a system-memory bitmap or one */
		/* that lives in the frame buffer heap.                        */

		index = dhsurf_array[(USHORT)psoSrc->dhsurf];
		if (index & CACHE_FLAG_SYSTEM)
		{
			fromFB = 0;
			srcSurfOffset = system_heap[(USHORT)index].heap_offset;
			srcSurfPitch  = system_heap[(USHORT)index].flags_and_pitch;
		}
		else
		{
			fromFB = 1;
			srcSurfOffset = bitmap_heap[(USHORT)index].heap_offset;
			srcSurfPitch  = bitmap_heap[(USHORT)index].flags_and_pitch;
		}
	}
	else
	{
		/* Engine-managed source: use the bits pointer and delta directly. */

		fromFB = 0;
		srcSurfOffset = (ULONG)psoSrc->pvScan0;
		srcSurfPitch  = (ULONG)psoSrc->lDelta;
	}

	/* WRITE THE RASTER MODE REGISTER */
	/* 0xCC is the source-copy ROP.   */

	GU2_WAIT_PENDING;
	WRITE_GP32 (MGP_RASTER_MODE, (gu2_bpp | 0xCC));

	x = pptlSrc->x;
	y = pptlSrc->y;

	/* CLIPPING */
	/* Complex-clipped source BLTs default to function pointers.  */

	if (!pco || pco->iDComplexity == DC_TRIVIAL || pco->iDComplexity == DC_RECT)
	{
		if (pco && pco->iDComplexity == DC_RECT)
		{
			clippedTargetPtr = &destRegion;
			if (!FindIntersection (prclDst, &pco->rclBounds, &destRegion))
				goto returnTrue;

			/* Move the source origin by the same amount the clip moved */
			/* the destination origin.                                  */

			x += (destRegion.left - prclDst->left);
			y += (destRegion.top  - prclDst->top);
		}
		else
			clippedTargetPtr = prclDst;

		if (blt15)
		{
			Blt15To16 (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, MGP_BM_SRC_FB);
		}
		else if (psoSrc->dhsurf == psoTrg->dhsurf)
		{
			/* SET DIRECTION FOR A POSSIBLY OVERLAPPING COPY                   */
			/* x and y have already been moved by the clip offset, so they     */
			/* must be compared against the clipped destination origin.        */
			/* Comparing against the unclipped prclDst could pick the wrong    */
			/* direction whenever the clip rectangle trims the left/top edge.  */

			if (clippedTargetPtr->left > x)
				blt_mode |= MGP_BM_NEG_XDIR;
			if (clippedTargetPtr->top > y)
				blt_mode |= MGP_BM_NEG_YDIR;

			ScreenToScreenBLT (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, blt_mode | MGP_BM_SRC_FB);
		}
		else if (fromFB)
		{
			ScreenToScreenBLT (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, MGP_BM_SRC_FB);
		}
		else if (psoSrc->iBitmapFormat == BMF_1BPP)
		{
			/* WRITE MONO COLORS */
			/* We will NOT set the monochrome flag in the BLT mode register.  */
			/* The mono BLT routine will attempt to treat the data as         */
			/* byte-packed data.                                              */

			WRITE_GP32 (MGP_SRC_COLOR_BG, pxlo->pulXlate[0]);
			WRITE_GP32 (MGP_SRC_COLOR_FG, pxlo->pulXlate[1]);

			/* MONOCHROME BLT */

			MonoToScreen (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, 0);
		}
		else
		{
			FBBlt (clippedTargetPtr, dstSurfOffset, dstSurfPitch,
				srcSurfOffset, srcSurfPitch, x, y, MGP_BM_SRC_FB);
		}
	}
	else
	{
		/* COMPLEX CLIPPING */

		ULONG enumOrder = CD_LEFTDOWN;

		/* COMPLEX CLIPPING - RESORT TO FUNCTION POINTERS */
		/* My opinion is that function pointers are often slow.  However, this */
		/* implementation only uses function pointers for complex clipping.    */

		if (blt15)
		{
			renderFunc = Blt15To16;
			blt_mode |= MGP_BM_SRC_FB;
		}
		else if (psoSrc->dhsurf == psoTrg->dhsurf)
		{
			renderFunc = ScreenToScreenBLT;

			/* The clip rectangles must be visited in the same direction */
			/* the hardware copies, so build both from the same tests.   */

			enumOrder = 0;
			if (prclDst->left > x)
			{
				enumOrder |= CD_LEFTWARDS;
				blt_mode  |= MGP_BM_NEG_XDIR;
			}
			if (prclDst->top > y)
			{
				enumOrder |= CD_UPWARDS;
				blt_mode  |= MGP_BM_NEG_YDIR;
			}
			blt_mode |= MGP_BM_SRC_FB;
		}
		else if (fromFB)
		{
			renderFunc = ScreenToScreenBLT;

			blt_mode |= MGP_BM_SRC_FB;
		}
		else if (psoSrc->iBitmapFormat == BMF_1BPP)
		{
			/* WRITE MONO COLORS */
			/* We will NOT set the monochrome flag in the BLT mode register.  */
			/* The mono BLT routine will attempt to treat the data as         */
			/* byte-packed data.                                              */

			WRITE_GP32 (MGP_SRC_COLOR_BG, pxlo->pulXlate[0]);
			WRITE_GP32 (MGP_SRC_COLOR_FG, pxlo->pulXlate[1]);

			renderFunc = MonoToScreen;
		}
		else
		{
			renderFunc = FBBlt;
			blt_mode |= MGP_BM_SRC_FB;
		}

		CLIPOBJ_cEnumStart(pco, FALSE, CT_RECTANGLES, enumOrder, 0);

		while (1)
		{
			more = CLIPOBJ_bEnum(pco, sizeof (CLIPENUM), (PULONG) &EnumRects8);

			/* ITERATE THROUGH ALL RECTANGLES IN THE CURRENT BATCH */

			for (i = 0; i < EnumRects8.c; i++)
			{
				/* INTERSECT WITH TARGET BOUNDS */

				if (!FindIntersection (prclDst, &EnumRects8.arcl[i], &destRegion))
					continue;

				GU2_WAIT_PENDING;
				renderFunc (&destRegion, dstSurfOffset, dstSurfPitch,
					srcSurfOffset, srcSurfPitch,
					x + destRegion.left - prclDst->left,
					y + destRegion.top  - prclDst->top,
					blt_mode);
			}

			if (!more)
				break;
		}
	}

returnTrue:

	RELEASE_PRIMITIVE_SEMAPHORE;

	DISPDBG ((ppdev, 1, "DrvCopyBits Exit\n"));
	return TRUE;

puntSystem:

	DISPDBG ((ppdev, 1, "Punting System Memory BLT\n"));

	// This code assumes 'index' has been setup immediately before jumping to
	// this location.
	//
	index &= 0xFFFF;
	if (!system_heap[index].lockedSurf)
	{
		if (!AddLockedSurfForSystemBitmap (ppdev, index))
			goto returnTrue;
	}
	psoTrg = system_heap[index].lockedSurf;

	if (psoSrc->dhsurf)
	{
		MAKE_PUNTABLE(psoSrc);
	}

	/* Let the hardware finish before GDI touches the surfaces. */

	GU2_WAIT_BUSY;

	EngCopyBits (psoTrg, psoSrc, pco, pxlo,
		prclDst, pptlSrc);

	RELEASE_PRIMITIVE_SEMAPHORE;

	return TRUE;

puntIt:

	DISPDBG ((ppdev, 1, "DrvCopyBits Punt\n"));

	MAKE_PUNTABLE(psoTrg);

	if (psoSrc->dhsurf)
	{
		MAKE_PUNTABLE(psoSrc);
	}

	GU2_WAIT_BUSY;

	// CALL ENGCOPYBITS

	EngCopyBits (psoTrg, psoSrc, pco, pxlo,
		prclDst, pptlSrc);

	RELEASE_PRIMITIVE_SEMAPHORE;

	return TRUE;

reversePuntIt:

	DISPDBG ((ppdev, 1, "DrvCopyBits Reverse Punt\n"));

	// RETRIEVE LOCKED SURFACE
	//
	MAKE_PUNTABLE(psoSrc);

	GU2_WAIT_BUSY;

	EngCopyBits(psoTrg, psoSrc, pco, pxlo, prclDst, pptlSrc);

	RELEASE_PRIMITIVE_SEMAPHORE;

	return TRUE;
}

/*---------------------------------------------------------------------------
 * MonoToScreen
 *
 * Expands a 1 BPP source bitmap in CPU-addressable memory onto the
 * destination by feeding the source bytes to the graphics processor through
 * the MGP_HST_SOURCE register.
 *
 *   destRect  - destination rectangle (left/top/right/bottom are read).
 *   dstOffset - offset of the destination surface, added to the computed
 *               destination address.
 *   dstPitch  - destination pitch; written to MGP_STRIDE.
 *   srcOffset - used as a CPU address: it is added to the computed byte
 *               offset and the result is dereferenced directly.
 *   srcPitch  - bytes between successive source lines.
 *   srcLeft   - x coordinate of the first source pixel.  The low three bits
 *               select the starting bit and are written (shifted to bit 26)
 *               to MGP_SRC_OFFSET; the rest selects the starting byte.
 *   srcTop    - y coordinate of the first source line.
 *   blt_mode  - extra bits OR'd into the value written to MGP_BLT_MODE.
 *
 * The global patOrigin is OR'd into the destination offset.  The caller is
 * expected to have programmed the raster mode and the mono source colors
 * (DrvBitBlt and DrvCopyBits do so before calling).
 *
 * Two data paths exist.  When the byte width of a line equals srcPitch the
 * bitmap is sent as one byte-packed stream (MGP_BM_SRC_BP_MONO); otherwise
 * each line is sent separately (MGP_BM_SRC_MONO) and a trailing partial
 * dword is assembled byte by byte so that no bytes beyond the line are read.
 *
 * FIFO pacing: DL counts dword writes.  It starts at 17 so the first 16
 * writes go out unchecked; after that the code polls MGP_BS_HALF_EMPTY and
 * then allows 8 more writes.  (Presumably this matches the depth of the
 * host source FIFO - confirm against the hardware documentation.)
 ---------------------------------------------------------------------------*/

void MonoToScreen (RECTL *destRect, ULONG dstOffset, ULONG dstPitch, 
                   ULONG srcOffset, ULONG srcPitch, ULONG srcLeft, 
                   ULONG srcTop, ULONG blt_mode)
{
	_asm {

		/* EDI = graphics processor register base for the whole routine */

		mov		edi, gfx_virt_gpptr

		/* WRITE DESTINATION OFFSET, STRIDE, WIDTH AND HEIGHT */
		/* ESI accumulates top * pitch + dstOffset + (left << mode_shift) */

		mov     ebx, dstPitch
		mov     edx, destRect
		mov     esi, [edx]RECTL.top
		imul    esi, ebx
		add     esi, dstOffset
		mov     [edi + MGP_STRIDE], ebx
		mov     ebx, [edx]RECTL.left
		mov     eax, [edx]RECTL.right
		sub     eax, ebx
		mov     cl, mode_shift
		shl     ebx, cl
		add     esi, ebx
		mov     ebx, [edx]RECTL.bottom
		sub     ebx, [edx]RECTL.top
		or      esi, patOrigin
		mov     [edi + MGP_DST_OFFSET], esi

		/* Pack (width << 16) | height: SHRD moves the low 16 bits of the */
		/* width into the high half of ECX, then CX receives the height.  */

		shrd    ecx, eax, 16
		mov     cx, bx
		mov     [edi + MGP_WID_HEIGHT], ecx

		/* AT THIS POINT */
		/* EAX = Width   */
		/* EBX = Height  */

		/* CALCULATE BYTE WIDTH FOR THE LINE */
		/* ECX = starting bit within the first byte (srcLeft & 7)   */
		/* EDX = starting byte within the line (srcLeft >> 3)       */
		/* EAX = (width + starting bit + 7) >> 3 = bytes to send    */

		mov     edx, srcLeft
		movzx   ecx, dl
		and     cl, 7
		shr     edx, 3
		
		lea     eax, [eax + ecx + 7]
		shr     eax, 3
		shl     ecx, 26

		mov     [edi + MGP_SRC_OFFSET], ecx

		/* AT THIS POINT                     */
		/* EAX = Bytes per line              */
		/* EBX = Height                      */
		/* EDX = Byte offset into first line */

		/* CHECK FOR CONTIGUOUS DATA */
		/* We are more efficient if the data is byte-packed, i.e */
		/* the pitch is equal to the byte width.                 */

		cmp     eax, srcPitch
		jne     MonoBltNotContiguous

		/* START BLT */

		mov     ecx, blt_mode
		or      ecx, MGP_BM_SRC_BP_MONO | MGP_BM_SRC_HOST
		mov     [edi + MGP_BLT_MODE], ecx

		/* CONTIGUOUS - EASY     */
		/* Calculate total bytes */

		imul    ebx, eax

		/* CALCULATE SOURCE POINTER */
		/* Pitch equals bytes per line here, so EAX doubles as the pitch. */

		imul    eax, srcTop
		add     eax, srcOffset
		lea     esi, [eax + edx]
		
		/* WAIT FOR THE BLT TO BE LATCHED */

MonoBltWaitBltLatched:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     MonoBltWaitBltLatched

		/* WRITE THE DATA */
		/* DL = FIFO pacing counter (see function header) */

		mov     dl, 17 

		mov     ecx, ebx
		shr     ecx, 2
		jz      MonoDwordLoopDone

MonoDwordLoop:

		dec     dl
		jnz     MonoDwordNoWait

MonoDwordWaitLoop:

		test    [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      MonoDwordWaitLoop
		mov     dl, 8

MonoDwordNoWait:

		lodsd 
		mov     [edi + MGP_HST_SOURCE], eax
		loop    MonoDwordLoop

MonoDwordLoopDone:

		/* ECX = total bytes & 3 = bytes left over after the last full dword */

		movzx   ecx, bl
		and     cl, 3
		or      cl, cl
		jz      MonoContiguousBLTDone

		/* SET POINTER TO LAST BYTE */
		/* The remaining bytes are read from last to first so that the */
		/* first byte ends up in the low byte of EAX.                  */

		lea     esi, [esi + ecx - 1]
		
MonoByteLoop:

		shl     eax, 8
		mov     al, [esi]
		dec     esi
		loop    MonoByteLoop

		dec     dl
		jnz     MonoByteLoopNoWait

MonoByteWaitLoop:

		test    [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      MonoByteWaitLoop

MonoByteLoopNoWait:

		/* Final write of the BLT, so DL does not need to be reloaded. */

		mov     [edi + MGP_HST_SOURCE], eax
		jmp     MonoContiguousBLTDone

MonoBltNotContiguous:

		/* NON-CONTIGUOUS - AT THIS POINT:   */
		/* EAX = Bytes per line              */
		/* EBX = Height                      */
		/* EDX = Byte offset into first line */
		/* ESI is not yet the source pointer; it is computed below. */

		/* START BLT */

		mov     ecx, blt_mode
		or      ecx, MGP_BM_SRC_MONO | MGP_BM_SRC_HOST
		mov     [edi + MGP_BLT_MODE], ecx

		/* SAVE HEIGHT */

		push    ebx

		/* CALCULATE SOURCE POINTER */

		mov     ebx, srcPitch
		mov     ecx, srcTop
		imul    ecx, ebx
		add     ecx, srcOffset
		lea     esi, [ecx + edx]

		/* CALCULATE BYTE DIFFERENCE FOR EACH LINE           */
		/* At the end of each loop, the data pointer will be */
		/* at the beginning of the last dword.               */
		/* EAX = Bytes per line                              */
		/* EBX = Source Delta                                */
		/* ESI = First source byte                           */
		/* Result: EBX = pitch - bytes per line + (bytes per line & 3), */
		/* the step from that position to the start of the next line.   */

		sub     ebx, eax
		movzx   ecx, al
		and     cl, 3
		add     ebx, ecx

		/* WAIT FOR BLT TO BE LATCHED */

MonoBltNonContiguousWaitBltLatched:

		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     MonoBltNonContiguousWaitBltLatched

		/* AT THIS POINT                     */
		/* EAX = Bytes per line              */
		/* EBX = Byte addition for each line */
		/* ESI = First Byte                  */
		/* [esp] = Height                    */

		/* EAX is clobbered by LODSD below, so keep the byte count on the */
		/* stack.  After this push: [esp] = bytes per line,               */
		/* [esp + 4] = remaining height.                                  */

		push    eax
		mov     dl, 17

MonoBltLineLoop:

		mov     ecx, [esp]
		shr     ecx, 2
		jz      MonoBltLineDwordLoopDone

MonoBltLineDwordLoop:

		dec     dl
		jnz     MonoBltLineDwordNoWait

MonoBltLineDwordWaitLoop:

		test    [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      MonoBltLineDwordWaitLoop
		mov     dl, 8

MonoBltLineDwordNoWait:

		lodsd 
		mov     [edi + MGP_HST_SOURCE], eax
		loop    MonoBltLineDwordLoop

MonoBltLineDwordLoopDone:

		/* ECX = bytes per line & 3 = trailing bytes of this line */

		movzx   ecx, BYTE PTR [esp]
		and     cl, 3
		or      cl, cl
		jz      MonoNonContiguousBLTLineDone

		/* SET POINTER TO LAST BYTE */
		/* Point one past the last byte; the loop pre-decrements, so ESI */
		/* ends at the start of the partial dword when the loop exits.   */

		lea     esi, [esi + ecx]
		
MonoBltLineByteLoop:

		shl     eax, 8
		dec     esi
		mov     al, [esi]
		loop    MonoBltLineByteLoop

		dec     dl
		jnz     MonoBltLineByteLoopNoWait

MonoBltLineByteWaitLoop:

		test    [edi + MGP_BLT_STATUS], MGP_BS_HALF_EMPTY
		jz      MonoBltLineByteWaitLoop
		mov     dl, 8

MonoBltLineByteLoopNoWait:

		mov     [edi + MGP_HST_SOURCE], eax

MonoNonContiguousBLTLineDone:

		/* Advance to the next source line and count down the height. */

		add     esi, ebx
		dec     DWORD PTR [esp + 4]
		jnz     MonoBltLineLoop

		/* Discard the two saved dwords (bytes per line, height). */

		lea     esp, [esp + 8]

MonoContiguousBLTDone:	

	}
}

/*---------------------------------------------------------------------------
 * Blt15To16
 *
 * Copies 16-bit source pixels to the destination while converting each
 * pixel from a 5-5-5 layout to a 5-6-5 layout.  The conversion is done by
 * the CPU one line at a time into a scratch area at the end of the frame
 * buffer; each converted line is then BLTed from the scratch area to the
 * destination with a height of one.
 *
 *   dstRect   - destination rectangle (left/top/right/bottom are read).
 *   dstOffset - offset of the destination surface.
 *   dstPitch  - destination pitch in bytes; used to step the destination
 *               offset from line to line.
 *   srcOffset - used as a CPU address: it is added to the computed byte
 *               offset and the result is read with LODSD/LODSW.
 *   srcPitch  - bytes between successive source lines.
 *   srcLeft   - x coordinate of the first source pixel (2 bytes per pixel).
 *   srcTop    - y coordinate of the first source line.
 *   blt_mode  - value written to MGP_BLT_MODE to start each line.
 *
 * The global patOrigin is OR'd into the destination offset.  This routine
 * does not write MGP_STRIDE or MGP_RASTER_MODE.
 *
 * Two scratch lines are used alternately (selected by EBX = 0 or 1, 8K
 * apart) so that the CPU can convert the next line while the previous BLT
 * is still reading the other one.
 *
 * Pixel conversion, per 16-bit pixel p:
 *   (p << 1) & 0xFFC0   moves bits 5-14 up one position,
 *   p & 0x001F          keeps bits 0-4 in place,
 *   bit 10 of the shifted value is copied down to bit 5, filling the new
 *   low bit of the six-bit middle field with that field's top bit.
 ---------------------------------------------------------------------------*/

void Blt15To16 (RECTL *dstRect, ULONG dstOffset, ULONG dstPitch, 
                ULONG srcOffset, ULONG srcPitch,
                ULONG srcLeft, ULONG srcTop, unsigned long blt_mode)
{
	unsigned long gfx_gp_scratch_base;

	_asm {

		/* SET SCRATCH BASE      */
		/* Last 16K of FB memory */

		mov     eax, gfx_fb_size
		sub     eax, 0x4000
		mov     gfx_gp_scratch_base, eax

		/* INITIALIZE REGISTER POINTER */

		mov		edi, gfx_virt_gpptr

		/* SAVE DESTINATION DELTA */

		push    dstPitch
		
		/* SET DEST OFFSET, WIDTH AND HEIGHT */
		/* EAX = dstOffset + left * 2 + top * pitch ([esp] is the pitch) */

		mov     eax, dstOffset
		mov     ebx, dstRect
		mov     esi, [ebx]RECTL.left
		mov     edx, [ebx]RECTL.right
		sub     edx, esi
		lea     eax, [eax + esi*2]
		mov     esi, [ebx]RECTL.top
		mov     ecx, [ebx]RECTL.bottom
		sub     ecx, esi
		imul    esi, [esp]
		add     eax, esi
		
		/* AT THIS POINT:    */
		/* EAX = Dest offset */
		/* ECX = Height      */
		/* EDX = Width       */
		
		/* SAVE THE WIDTH AND HEIGHT */

		push    ecx
		push    edx

		/* PROGRAM THE WIDTH, HEIGHT AND DEST OFFSET */
		/* The height is programmed to 1.            */
		/* (width << 16) has a zero low half, so INC sets the height to 1. */

		or      eax, patOrigin
		mov     [edi + MGP_DST_OFFSET], eax
		shl     edx, 16
		inc     edx
		mov     [edi + MGP_WID_HEIGHT], edx

		/* READ DATA POINTER */
		/* ESI = srcOffset + srcTop * srcPitch + srcLeft * 2 */

		mov     edx, srcLeft

		mov     esi, srcTop
		mov     ecx, srcPitch
		imul    esi, ecx
		
		lea     esi, [esi + edx*2]
		add     esi, srcOffset
		
		/* CALCULATE THE DELTA BETWEEN SOURCE LINES */
		/* Save the byte count on the stack.        */
		/* [esp] is the saved width, so the value pushed is */
		/* srcPitch - 2 * width.                            */

		mov     ebx, [esp]
		sub     ecx, ebx
		sub     ecx, ebx
		push    ecx

		/* SAVE DESTINATION OFFSET */

		push    eax

		/* AT THIS POINT:                   */
		/* EAX        = Dest offset         */
		/* ESI        = Source data pointer */
		/* [ESP]      = Saved dest offset   */
		/* [ESP + 4]  = Delta               */
		/* [ESP + 8]  = Width               */
		/* [ESP + 12] = Height              */
		/* [ESP + 16] = Dest Delta          */
		
		/* ZERO LINE INDEX */
		/* Index is 0 or 1 based on current source line */

		xor     ebx, ebx

		/* READY TO WRITE THE DATA */
		/* We must wait for BLT busy as another driver routine may */
		/* have also used the offscreen frame buffer area.         */

Blt15To16WaitBusyLoop:

		test    [edi + MGP_BLT_STATUS], MGP_BS_BLT_BUSY
		jnz     Blt15To16WaitBusyLoop

Blt15To16LineLoop:

		/* WRITE THE REGISTERS SPECIFIC TO EACH LINE */
		/* Source offset = scratch base + (line index << 13).  EDI is then */
		/* repointed at the same scratch line through the CPU mapping of   */
		/* the frame buffer, so it no longer addresses the registers.      */

		mov     [edi + MGP_DST_OFFSET], eax
		mov     ecx, ebx
		shl     ecx, 13
		add     ecx, gfx_gp_scratch_base
		mov     [edi + MGP_SRC_OFFSET], ecx
		mov     edi, ecx
		add     edi, gfx_virt_fbptr

		/* WRITE THE DATA TO THE FRAME BUFFER */
		/* Two pixels are converted per iteration. */

		mov     ecx, [esp + 8]
		shr     ecx, 1
		jz      Blt15To16OnlyOnePixel

Blt15To16DwordLoop:

		lodsd  
		lea     edx, [eax * 2]
		and     edx, 0xFFC0FFC0
		and     eax, 0x001F001F
		or      eax, edx
		and     edx, 0x04000400
		shr     edx, 5
		or      eax, edx		
		stosd
		loop    Blt15To16DwordLoop

		/* An odd width leaves one more pixel to convert. */

		mov     ecx, [esp + 8]
		and     cl, 1
		jz      Blt15To16StartBlt

Blt15To16OnlyOnePixel:

		/* Same conversion on a single pixel.  Only AX is stored, so the */
		/* upper half of EAX does not matter.                            */

		lodsw
		lea     edx, [eax * 2]
		and     dx, 0xFFC0
		and     ax, 0x001F
		or      eax, edx
		and     dx, 0x0400
		shr     dx, 5
		or      eax, edx
		stosw

Blt15To16StartBlt:  
		
		/* Restore the register base and start the one-line BLT. */

		mov     eax, blt_mode
		mov     edi, gfx_virt_gpptr
		mov     [edi + MGP_BLT_MODE], eax

		dec     DWORD PTR [esp + 12]
		jz      Blt15To16Finished

		/* UPDATE OFFSETS */
		/* Source pointer moves to the next line; the destination offset  */
		/* advances by the pitch, and adding 0x20000000 steps the pattern */
		/* y origin field (bit 29 and up) by one line.                    */

		add     esi, [esp + 4]
		mov     eax, [esp]
		add     eax, [esp + 16]
		add     eax, 0x20000000
		mov     [esp], eax

		/* UPDATE LINE */
		/* Toggles BL between 0 and 1: 0 -> 0xFF -> 1, 1 -> 0 -> 0. */

		dec     bl
		setnz   bl

		/* Wait until the BLT just issued is no longer pending before */
		/* reprogramming the per-line registers.                      */

Blt15To16WaitPendingLoop:
		test    DWORD PTR [edi + MGP_BLT_STATUS], MGP_BS_BLT_PENDING
		jnz     Blt15To16WaitPendingLoop
		jmp     Blt15To16LineLoop

Blt15To16Finished:

		/* Discard the five dwords pushed above. */

		lea     esp, [esp + 20]
	}
}

