/*
 * Copyright 2008 Alex Deucher
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *
 * Based on radeon_exa_render.c and kdrive ati_video.c by Eric Anholt, et al.
 *
 */

#if defined(ACCEL_MMIO) && defined(ACCEL_CP)
#error Cannot define both MMIO and CP acceleration!
#endif

#if !defined(UNIXCPP) || defined(ANSICPP)
#define FUNC_NAME_CAT(prefix,suffix) prefix##suffix
#else
#define FUNC_NAME_CAT(prefix,suffix) prefix/**/suffix
#endif

#ifdef ACCEL_MMIO
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
#else
#ifdef ACCEL_CP
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
#else
#error No accel type defined!
#endif
#endif

#ifdef ACCEL_CP

#define VTX_OUT_FILTER(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
do {									\
    OUT_RING_F(_dstX);						\
    OUT_RING_F(_dstY);						\
    OUT_RING_F(_srcX);						\
    OUT_RING_F(_srcY);						\
    OUT_RING_F(_maskX);						\
    OUT_RING_F(_maskY);						\
} while (0)

#define VTX_OUT(_dstX, _dstY, _srcX, _srcY)	\
do {								\
    OUT_RING_F(_dstX);						\
    OUT_RING_F(_dstY);						\
    OUT_RING_F(_srcX);						\
    OUT_RING_F(_srcY);						\
} while (0)

#else /* ACCEL_CP */

#define VTX_OUT_FILTER(_dstX, _dstY, _srcX, _srcY, _maskX, _maskY)	\
do {									\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskX);			\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _maskY);			\
} while (0)

#define VTX_OUT(_dstX, _dstY, _srcX, _srcY)	\
do {								\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstX);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _dstY);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcX);		\
    OUT_ACCEL_REG_F(RADEON_SE_PORT_DATA0, _srcY);		\
} while (0)

#endif /* !ACCEL_CP */

static void
FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    PixmapPtr pPixmap = pPriv->pPixmap;
    uint32_t txformat;
    uint32_t txfilter, txformat0, txformat1, txoffset, txpitch;
    uint32_t dst_offset, dst_pitch, dst_format;
    uint32_t txenable, colorpitch;
    uint32_t blendcntl;
    Bool isplanar = FALSE;
    int dstxoff, dstyoff, pixel_shift, vtx_count;
    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
    int nBox = REGION_NUM_RECTS(&pPriv->clip);
    ACCEL_PREAMBLE();

    pixel_shift = pPixmap->drawable.bitsPerPixel >> 4;

#ifdef USE_EXA
    if (info->useEXA) {
	dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
	dst_pitch = exaGetPixmapPitch(pPixmap);
    } else
#endif
	{
	    dst_offset = (pPixmap->devPrivate.ptr - info->FB) +
		info->fbLocation + pScrn->fbOffset;
	    dst_pitch = pPixmap->devKind;
	}

#ifdef COMPOSITE
    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
#else
    dstxoff = 0;
    dstyoff = 0;
#endif

#ifdef USE_EXA
    if (info->useEXA) {
	RADEON_SWITCH_TO_3D();
    } else
#endif
	{
	    BEGIN_ACCEL(2);
	    if (IS_R300_3D || IS_R500_3D)
		OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);
	    else
		OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH);
	    /* We must wait for 3d to idle, in case source was just written as a dest. */
	    OUT_ACCEL_REG(RADEON_WAIT_UNTIL,
			  RADEON_WAIT_HOST_IDLECLEAN |
			  RADEON_WAIT_2D_IDLECLEAN |
			  RADEON_WAIT_3D_IDLECLEAN |
			  RADEON_WAIT_DMA_GUI_IDLE);
	    FINISH_ACCEL();

	    if (!info->accel_state->XInited3D)
		RADEONInit3DEngine(pScrn);
	}

    if (pPriv->bicubic_enabled)
	vtx_count = 6;
    else
	vtx_count = 4;

    if (IS_R300_3D || IS_R500_3D) {
	uint32_t output_fmt;

	switch (pPixmap->drawable.bitsPerPixel) {
	case 16:
	    if (pPixmap->drawable.depth == 15)
		dst_format = R300_COLORFORMAT_ARGB1555;
	    else
		dst_format = R300_COLORFORMAT_RGB565;
	    break;
	case 32:
	    dst_format = R300_COLORFORMAT_ARGB8888;
	    break;
	default:
	    return;
	}

	output_fmt = (R300_OUT_FMT_C4_8 |
		      R300_OUT_FMT_C0_SEL_BLUE |
		      R300_OUT_FMT_C1_SEL_GREEN |
		      R300_OUT_FMT_C2_SEL_RED |
		      R300_OUT_FMT_C3_SEL_ALPHA);

	colorpitch = dst_pitch >> pixel_shift;
	colorpitch |= dst_format;

	if (RADEONTilingEnabled(pScrn, pPixmap))
	    colorpitch |= R300_COLORTILE;

	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
	    isplanar = TRUE;
	}

	if (isplanar) {
	    txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
	    txpitch = pPriv->src_pitch;
	} else {
	    if (pPriv->id == FOURCC_UYVY)
		txformat1 = R300_TX_FORMAT_YVYU422;
	    else
		txformat1 = R300_TX_FORMAT_VYUY422;

	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;

	    /* pitch is in pixels */
	    txpitch = pPriv->src_pitch / 2;
	}
	txpitch -= 1;

	txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
		    (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
		    R300_TXPITCH_EN);

	info->accel_state->texW[0] = pPriv->w;
	info->accel_state->texH[0] = pPriv->h;

	txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
		    R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
		    R300_TX_MAG_FILTER_LINEAR |
		    R300_TX_MIN_FILTER_LINEAR |
		    (0 << R300_TX_ID_SHIFT));


	if (IS_R500_3D && ((pPriv->w - 1) & 0x800))
	    txpitch |= R500_TXWIDTH_11;

	if (IS_R500_3D && ((pPriv->h - 1) & 0x800))
	    txpitch |= R500_TXHEIGHT_11;

	txoffset = pPriv->src_offset;

	BEGIN_ACCEL(6);
	OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter);
	OUT_ACCEL_REG(R300_TX_FILTER1_0, 0);
	OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0);
	OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1);
	OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch);
	OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset);
	FINISH_ACCEL();

	txenable = R300_TEX_0_ENABLE;

	if (isplanar) {
	    txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
			(((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
			R300_TXPITCH_EN);
	    txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
	    txpitch -= 1;
	    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
		        R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
			R300_TX_MIN_FILTER_LINEAR |
			R300_TX_MAG_FILTER_LINEAR);

		BEGIN_ACCEL(12);
		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
		OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
		OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
		OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
		OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
		OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
		OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
		OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
		OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
		FINISH_ACCEL();
		txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
	}

	if (pPriv->bicubic_enabled) {
		/* Size is 128x1 */
		txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
			     (0x0 << R300_TXHEIGHT_SHIFT) |
			     R300_TXPITCH_EN);
		/* Format is 32-bit floats, 4bpp */
		txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
		/* Pitch is 127 (128-1) */
		txpitch = 0x7f;
		/* Tex filter */
		txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) |
			    R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) |
			    R300_TX_MIN_FILTER_NEAREST |
			    R300_TX_MAG_FILTER_NEAREST |
			    (1 << R300_TX_ID_SHIFT));

		BEGIN_ACCEL(6);
		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter);
		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
		OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1);
		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
		OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset);
		FINISH_ACCEL();

		/* Enable tex 1 */
		txenable |= R300_TEX_1_ENABLE;
	}

	/* setup the VAP */
	if (info->accel_state->has_tcl) {
	    if (pPriv->bicubic_enabled)
		BEGIN_ACCEL(7);
	    else
		BEGIN_ACCEL(6);
	} else {
	    if (pPriv->bicubic_enabled)
		BEGIN_ACCEL(5);
	    else
		BEGIN_ACCEL(4);
	}

	/* These registers define the number, type, and location of data submitted
	 * to the PVS unit of GA input (when PVS is disabled)
	 * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is
	 * enabled.  This memory provides the imputs to the vertex shader program
	 * and ordering is not important.  When PVS/TCL is disabled, this field maps
	 * directly to the GA input memory and the order is signifigant.  In
	 * PVS_BYPASS mode the order is as follows:
	 * Position
	 * Point Size
	 * Color 0-3
	 * Textures 0-7
	 * Fog
	 */
	if (pPriv->bicubic_enabled) {
	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
			   (0 << R300_SKIP_DWORDS_0_SHIFT) |
			   (0 << R300_DST_VEC_LOC_0_SHIFT) |
			   R300_SIGNED_0 |
			   (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
			   (0 << R300_SKIP_DWORDS_1_SHIFT) |
			   (6 << R300_DST_VEC_LOC_1_SHIFT) |
			   R300_SIGNED_1));
	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1,
			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) |
			   (0 << R300_SKIP_DWORDS_2_SHIFT) |
			   (7 << R300_DST_VEC_LOC_2_SHIFT) |
			   R300_LAST_VEC_2 |
			   R300_SIGNED_2));
	} else {
	    OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0,
			  ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
			   (0 << R300_SKIP_DWORDS_0_SHIFT) |
			   (0 << R300_DST_VEC_LOC_0_SHIFT) |
			   R300_SIGNED_0 |
			   (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) |
			   (0 << R300_SKIP_DWORDS_1_SHIFT) |
			   (6 << R300_DST_VEC_LOC_1_SHIFT) |
			   R300_LAST_VEC_1 |
			   R300_SIGNED_1));
	}

	/* load the vertex shader
	 * We pre-load vertex programs in RADEONInit3DEngine():
	 * - exa mask/Xv bicubic
	 * - exa no mask
	 * - Xv
	 * Here we select the offset of the vertex program we want to use
	 */
	if (info->accel_state->has_tcl) {
	    if (pPriv->bicubic_enabled) {
		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
			      ((0 << R300_PVS_FIRST_INST_SHIFT) |
			       (2 << R300_PVS_XYZW_VALID_INST_SHIFT) |
			       (2 << R300_PVS_LAST_INST_SHIFT)));
		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
			      (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
	    } else {
		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0,
			      ((5 << R300_PVS_FIRST_INST_SHIFT) |
			       (6 << R300_PVS_XYZW_VALID_INST_SHIFT) |
			       (6 << R300_PVS_LAST_INST_SHIFT)));
		OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1,
			      (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT));
	    }
	}

	/* Position and one set of 2 texture coordinates */
	OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT);
	if (pPriv->bicubic_enabled)
	    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) |
						   (2 << R300_TEX_1_COMP_CNT_SHIFT)));
	else
	    OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT));

	OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt);
	FINISH_ACCEL();

	/* setup pixel shader */
	if (IS_R300_3D) {
	    if (pPriv->bicubic_enabled) {
		BEGIN_ACCEL(79);

		/* 4 components: 2 for tex0 and 2 for tex1 */
		OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
						   R300_RS_COUNT_HIRES_EN));

		/* R300_INST_COUNT_RS - highest RS instruction used */
		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));

		/* Pixel stack frame size. */
		OUT_ACCEL_REG(R300_US_PIXSIZE, 5);

		/* Indirection levels */
		OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) |
							R300_FIRST_TEX));

		/* Set nodes. */
		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
							R300_ALU_CODE_SIZE(14) |
							R300_TEX_CODE_OFFSET(0) |
							R300_TEX_CODE_SIZE(6)));

		/* Nodes are allocated highest first, but executed lowest first */
		OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0);
		OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) |
							R300_ALU_SIZE(0) |
							R300_TEX_START(0) |
							R300_TEX_SIZE(0)));
		OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) |
							R300_ALU_SIZE(9) |
							R300_TEX_START(1) |
							R300_TEX_SIZE(0)));
		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) |
							R300_ALU_SIZE(2) |
							R300_TEX_START(2) |
							R300_TEX_SIZE(3) |
							R300_RGBA_OUT));

		/* ** BICUBIC FP ** */

		/* texcoord0 => temp0
		 * texcoord1 => temp1 */

		// first node
		/* TEX temp2, temp1.rrr0, tex1, 1D */
		OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) |
						   R300_TEX_ID(1) |
						   R300_TEX_SRC_ADDR(1) |
						   R300_TEX_DST_ADDR(2)));

		/* MOV temp1.r, temp1.ggg0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) |
						   R300_ALU_RGB_ADDRD(1) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));


		// second node
		/* TEX temp1, temp1, tex1, 1D */
		OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) |
						   R300_TEX_ID(1) |
						   R300_TEX_SRC_ADDR(1) |
						   R300_TEX_DST_ADDR(1)));

		/* MUL temp3.rg, temp2.ggg0, const0.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) |
						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
						   R300_ALU_RGB_ADDRD(3) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));


		/* MUL temp2.rg, temp2.rrr0, const0.rgb */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) |
						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) |
						   R300_ALU_RGB_ADDRD(2) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) |
						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
						   R300_ALU_RGB_ADDR2(3) |
						   R300_ALU_RGB_ADDRD(4) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) |
						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
						   R300_ALU_RGB_ADDR2(2) |
						   R300_ALU_RGB_ADDRD(5) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) |
						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
						   R300_ALU_RGB_ADDR2(3) |
						   R300_ALU_RGB_ADDRD(3) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) |
						   R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) |
						   R300_ALU_RGB_ADDR2(2) |
						   R300_ALU_RGB_ADDRD(1) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR2(1) |
						   R300_ALU_RGB_ADDRD(1) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR2(3) |
						   R300_ALU_RGB_ADDRD(2) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR2(5) |
						   R300_ALU_RGB_ADDRD(3) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));

		/* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR2(4) |
						   R300_ALU_RGB_ADDRD(0) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));


		// third node
		/* TEX temp4, temp1.rg--, tex0, 1D */
		OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) |
						   R300_TEX_ID(0) |
						   R300_TEX_SRC_ADDR(1) |
						   R300_TEX_DST_ADDR(4)));

		/* TEX temp3, temp3.rg--, tex0, 1D */
		OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) |
						   R300_TEX_ID(0) |
						   R300_TEX_SRC_ADDR(3) |
						   R300_TEX_DST_ADDR(3)));

		/* TEX temp5, temp2.rg--, tex0, 1D */
		OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) |
						   R300_TEX_ID(0) |
						   R300_TEX_SRC_ADDR(2) |
						   R300_TEX_DST_ADDR(5)));

		/* TEX temp0, temp0.rg--, tex0, 1D */
		OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) |
						   R300_TEX_ID(0) |
						   R300_TEX_SRC_ADDR(0) |
						   R300_TEX_DST_ADDR(0)));

		/* LRP temp3, temp1.bbbb, temp4, temp3 ->
		 * - PRESUB temps, temp4 - temp3
		 * - MAD temp3, temp1.bbbb, temps, temp3 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) |
						   R300_ALU_RGB_ADDR1(4) |
						   R300_ALU_RGB_ADDR2(1) |
						   R300_ALU_RGB_ADDRD(3) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) |
						   R300_ALU_ALPHA_ADDR1(4) |
						   R300_ALU_ALPHA_ADDR2(1) |
						   R300_ALU_ALPHA_ADDRD(3) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));

		/* LRP temp0, temp1.bbbb, temp5, temp0 ->
		 * - PRESUB temps, temp5 - temp0
		 * - MAD temp0, temp1.bbbb, temps, temp0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) |
						   R300_ALU_RGB_INSERT_NOP));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR1(5) |
						   R300_ALU_RGB_ADDR2(1) |
						   R300_ALU_RGB_ADDRD(0) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) |
						   R300_ALU_ALPHA_ADDR1(5) |
						   R300_ALU_ALPHA_ADDR2(1) |
						   R300_ALU_ALPHA_ADDRD(0) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A)));

		/* LRP output, temp2.bbbb, temp3, temp0 ->
		 * - PRESUB temps, temp3 - temp0
		 * - MAD output, temp2.bbbb, temps, temp0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR1(3) |
						   R300_ALU_RGB_ADDR2(2) |
						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) |
						   R300_ALU_ALPHA_ADDR1(3) |
						   R300_ALU_ALPHA_ADDR2(2) |
						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A)));

		/* Shader constants. */
		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w));
		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0);
		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0);
		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0);

		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0);
		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h));
		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0);
		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);

		FINISH_ACCEL();
	    } else if (isplanar) {
	    /*
	     * y' = y - .0625
	     * u' = u - .5
	     * v' = v - .5;
	     *
	     * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
	     * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
	     * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
	     *
	     * DP3 might look like the straightforward solution
	     * but we'd need to move the texture yuv values in
	     * the same reg for this to work. Therefore use MADs.
	     * Without changing the shader at all (only the constants)
	     * could also provide hue/saturation/brightness/contrast control.
	     *
	     * yco = 1.1643
	     * uco = 0, -0.39173, 2.017
	     * vco = 1.5958, -0.8129, 0
	     * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
	     *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
	     *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
	     *
	     * temp = MAD(yco, yuv.yyyy, off)
	     * temp = MAD(uco, yuv.uuuu, temp)
	     * result = MAD(vco, yuv.vvvv, temp)
	     */
		float yco = 1.1643;
		float uco[3] = {0.0, -0.39173, 2.018};
		float vco[3] = {1.5958, -0.8129, 0.0};
		float off[3] = {-0.0625 * yco + -0.5 * uco[0] + -0.5 * vco[0],
				-0.0625 * yco + -0.5 * uco[1] + -0.5 * vco[1],
				-0.0625 * yco + -0.5 * uco[2] + -0.5 * vco[2]};

		BEGIN_ACCEL(33);
		/* 2 components: same 2 for tex0/1/2 */
		OUT_ACCEL_REG(R300_RS_COUNT,
			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
			   R300_RS_COUNT_HIRES_EN));
		/* R300_INST_COUNT_RS - highest RS instruction used */
		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));

		OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */

		/* Indirection levels */
		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
							R300_FIRST_TEX));

		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
						   R300_ALU_CODE_SIZE(3) |
						   R300_TEX_CODE_OFFSET(0) |
						   R300_TEX_CODE_SIZE(3)));

		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
						   R300_ALU_SIZE(2) |
						   R300_TEX_START(0) |
						   R300_TEX_SIZE(2) |
						   R300_RGBA_OUT));

		/* tex inst */
		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
						  R300_TEX_DST_ADDR(0) |
						  R300_TEX_ID(0) |
						  R300_TEX_INST(R300_TEX_INST_LD)));
		OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
						  R300_TEX_DST_ADDR(1) |
						  R300_TEX_ID(1) |
						  R300_TEX_INST(R300_TEX_INST_LD)));
		OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
						  R300_TEX_DST_ADDR(2) |
						  R300_TEX_ID(2) |
						  R300_TEX_INST(R300_TEX_INST_LD)));

		/* ALU inst */
		/* MAD temp0, const0.a, temp0, const0.rgb */
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
						   R300_ALU_RGB_ADDR1(0) |
						   R300_ALU_RGB_ADDR2(0) |
						   R300_ALU_RGB_ADDRD(0) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
		/* alpha nop, but need to set up alpha source for rgb usage */
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
						   R300_ALU_ALPHA_ADDR1(0) |
						   R300_ALU_ALPHA_ADDR2(0) |
						   R300_ALU_ALPHA_ADDRD(0) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));

		/* MAD const1, temp1, temp0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
						   R300_ALU_RGB_ADDR1(1) |
						   R300_ALU_RGB_ADDR2(0) |
						   R300_ALU_RGB_ADDRD(0) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
		/* alpha nop */
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));

		/* MAD result, const2, temp2, temp0 */
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
						   R300_ALU_RGB_ADDR1(2) |
						   R300_ALU_RGB_ADDR2(0) |
						   R300_ALU_RGB_ADDRD(0) |
						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
						   R300_ALU_RGB_CLAMP));
		/* write alpha 1 */
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
						   R300_ALU_ALPHA_TARGET_A));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));

		/* Shader constants. */
		/* constant 0: off, yco */
		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
		/* constant 1: uco */
		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(0.0));
		/* constant 2: vco */
		OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
		OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));

		FINISH_ACCEL();

	    } else {
		BEGIN_ACCEL(11);
		/* 2 components: 2 for tex0 */
		OUT_ACCEL_REG(R300_RS_COUNT,
			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
			   R300_RS_COUNT_HIRES_EN));
		/* R300_INST_COUNT_RS - highest RS instruction used */
		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));

		OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */

		/* Indirection levels */
		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
							R300_FIRST_TEX));

		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
						   R300_ALU_CODE_SIZE(1) |
						   R300_TEX_CODE_OFFSET(0) |
						   R300_TEX_CODE_SIZE(1)));

		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
						   R300_ALU_SIZE(0) |
						   R300_TEX_START(0) |
						   R300_TEX_SIZE(0) |
						   R300_RGBA_OUT));

		/* tex inst */
		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
						  R300_TEX_DST_ADDR(0) |
						  R300_TEX_ID(0) |
						  R300_TEX_INST(R300_TEX_INST_LD)));

		/* ALU inst */
		/* RGB */
		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) |
						   R300_ALU_RGB_ADDR1(0) |
						   R300_ALU_RGB_ADDR2(0) |
						   R300_ALU_RGB_ADDRD(0) |
						   R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R |
						   R300_ALU_RGB_MASK_G |
						   R300_ALU_RGB_MASK_B)) |
						   R300_ALU_RGB_TARGET_A));
		OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) |
						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) |
						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
						   R300_ALU_RGB_CLAMP));
		/* Alpha */
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) |
						   R300_ALU_ALPHA_ADDR1(0) |
						   R300_ALU_ALPHA_ADDR2(0) |
						   R300_ALU_ALPHA_ADDRD(0) |
						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
						   R300_ALU_ALPHA_TARGET_A |
						   R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE)));
		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) |
						   R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) |
						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) |
						   R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) |
						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) |
						   R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) |
						   R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
						   R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
						   R300_ALU_ALPHA_CLAMP));
		FINISH_ACCEL();
	    }
	} else {
	    if (pPriv->bicubic_enabled) {
		BEGIN_ACCEL(7);

		/* 4 components: 2 for tex0 and 2 for tex1 */
		OUT_ACCEL_REG(R300_RS_COUNT,
			      ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) |
			       R300_RS_COUNT_HIRES_EN));

		/* R300_INST_COUNT_RS - highest RS instruction used */
		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6));

		/* Pixel stack frame size. */
		OUT_ACCEL_REG(R300_US_PIXSIZE, 5);

		/* FP length. */
		OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
						  R500_US_CODE_END_ADDR(13)));
		OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
						   R500_US_CODE_RANGE_SIZE(13)));

		/* Prepare for FP emission. */
		OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));
		FINISH_ACCEL();

		BEGIN_ACCEL(89);
		/* Pixel shader.
		 * I've gone ahead and annotated each instruction, since this
		 * thing is MASSIVE. :3
		 * Note: In order to avoid buggies with temps and multiple
		 * inputs, all temps are offset by 2. temp0 -> register2. */

		/* TEX temp2, input1.xxxx, tex1, 1D */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
						       R500_TEX_INST_LD |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
						       R500_TEX_SRC_S_SWIZ_R |
						       R500_TEX_SRC_T_SWIZ_R |
						       R500_TEX_SRC_R_SWIZ_R |
						       R500_TEX_SRC_Q_SWIZ_R |
						       R500_TEX_DST_ADDR(2) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* TEX temp5, input1.yyyy, tex1, 1D */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) |
						       R500_TEX_INST_LD |
						       R500_TEX_SEM_ACQUIRE |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) |
						       R500_TEX_SRC_S_SWIZ_G |
						       R500_TEX_SRC_T_SWIZ_G |
						       R500_TEX_SRC_R_SWIZ_G |
						       R500_TEX_SRC_Q_SWIZ_G |
						       R500_TEX_DST_ADDR(5) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* MUL temp4, const0.x0x0, temp2.yyxx */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
						       R500_RGB_ADDR0_CONST |
						       R500_RGB_ADDR1(2)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
						       R500_ALPHA_ADDR0_CONST |
						       R500_ALPHA_ADDR1(2)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
						       R500_ALU_RGB_R_SWIZ_A_R |
						       R500_ALU_RGB_G_SWIZ_A_0 |
						       R500_ALU_RGB_B_SWIZ_A_R |
						       R500_ALU_RGB_SEL_B_SRC1 |
						       R500_ALU_RGB_R_SWIZ_B_G |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_R));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SEL_A_SRC0 |
						       R500_ALPHA_SWIZ_A_0 |
						       R500_ALPHA_SEL_B_SRC1 |
						       R500_ALPHA_SWIZ_B_R));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_R_SWIZ_0 |
						       R500_ALU_RGBA_G_SWIZ_0 |
						       R500_ALU_RGBA_B_SWIZ_0 |
						       R500_ALU_RGBA_A_SWIZ_0));

		/* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
						       R500_RGB_ADDR0_CONST |
						       R500_RGB_ADDR1(5) |
						       R500_RGB_ADDR2(4)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
						       R500_ALPHA_ADDR0_CONST |
						       R500_ALPHA_ADDR1(5) |
						       R500_ALPHA_ADDR2(4)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
						       R500_ALU_RGB_R_SWIZ_A_0 |
						       R500_ALU_RGB_G_SWIZ_A_G |
						       R500_ALU_RGB_B_SWIZ_A_0 |
						       R500_ALU_RGB_SEL_B_SRC1 |
						       R500_ALU_RGB_R_SWIZ_B_R |
						       R500_ALU_RGB_G_SWIZ_B_R |
						       R500_ALU_RGB_B_SWIZ_B_R));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SEL_A_SRC0 |
						       R500_ALPHA_SWIZ_A_G |
						       R500_ALPHA_SEL_B_SRC1 |
						       R500_ALPHA_SWIZ_B_R));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC2 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_B |
						       R500_ALU_RGBA_A_SWIZ_A));

		/* ADD temp3, temp3, input0.xyxy */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) |
						       R500_RGB_ADDR2(0)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) |
						       R500_ALPHA_ADDR2(0)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
						       R500_ALU_RGB_G_SWIZ_A_1 |
						       R500_ALU_RGB_B_SWIZ_A_1 |
						       R500_ALU_RGB_SEL_B_SRC1 |
						       R500_ALU_RGB_R_SWIZ_B_R |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SWIZ_A_1 |
						       R500_ALPHA_SEL_B_SRC1 |
						       R500_ALPHA_SWIZ_B_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC2 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_R |
						       R500_ALU_RGBA_A_SWIZ_G));

		/* TEX temp1, temp3.zwxy, tex0, 2D */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
						       R500_TEX_INST_LD |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
						       R500_TEX_SRC_S_SWIZ_B |
						       R500_TEX_SRC_T_SWIZ_A |
						       R500_TEX_SRC_R_SWIZ_R |
						       R500_TEX_SRC_Q_SWIZ_G |
						       R500_TEX_DST_ADDR(1) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* TEX temp3, temp3.xyzw, tex0, 2D */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
						       R500_TEX_INST_LD |
						       R500_TEX_SEM_ACQUIRE |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) |
						       R500_TEX_SRC_S_SWIZ_R |
						       R500_TEX_SRC_T_SWIZ_G |
						       R500_TEX_SRC_R_SWIZ_B |
						       R500_TEX_SRC_Q_SWIZ_A |
						       R500_TEX_DST_ADDR(3) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
						       R500_RGB_ADDR0_CONST |
						       R500_RGB_ADDR1(5) |
						       R500_RGB_ADDR2(4)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
						       R500_ALPHA_ADDR0_CONST |
						       R500_ALPHA_ADDR1(5) |
						       R500_ALPHA_ADDR2(4)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
						       R500_ALU_RGB_R_SWIZ_A_0 |
						       R500_ALU_RGB_G_SWIZ_A_G |
						       R500_ALU_RGB_B_SWIZ_A_0 |
						       R500_ALU_RGB_SEL_B_SRC1 |
						       R500_ALU_RGB_R_SWIZ_B_G |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_G));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SEL_A_SRC0 |
						       R500_ALPHA_SWIZ_A_G |
						       R500_ALPHA_SEL_B_SRC1 |
						       R500_ALPHA_SWIZ_B_G));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC2 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_B |
						       R500_ALU_RGBA_A_SWIZ_A));

		/* ADD temp0, temp4, input0.xyxy */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) |
						       R500_RGB_ADDR2(0)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) |
						       R500_ALPHA_ADDR2(0)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 |
						       R500_ALU_RGB_G_SWIZ_A_1 |
						       R500_ALU_RGB_B_SWIZ_A_1 |
						       R500_ALU_RGB_SEL_B_SRC1 |
						       R500_ALU_RGB_R_SWIZ_B_R |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SWIZ_A_1 |
						       R500_ALPHA_SEL_B_SRC1 |
						       R500_ALPHA_SWIZ_B_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC2 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_R |
						       R500_ALU_RGBA_A_SWIZ_G));

		/* TEX temp4, temp0.zwzw, tex0, 2D */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
						       R500_TEX_INST_LD |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
						       R500_TEX_SRC_S_SWIZ_B |
						       R500_TEX_SRC_T_SWIZ_A |
						       R500_TEX_SRC_R_SWIZ_B |
						       R500_TEX_SRC_Q_SWIZ_A |
						       R500_TEX_DST_ADDR(4) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* TEX temp0, temp0.xyzw, tex0, 2D */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						   R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
						       R500_TEX_INST_LD |
						       R500_TEX_SEM_ACQUIRE |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
						       R500_TEX_SRC_S_SWIZ_R |
						       R500_TEX_SRC_T_SWIZ_G |
						       R500_TEX_SRC_R_SWIZ_B |
						       R500_TEX_SRC_Q_SWIZ_A |
						       R500_TEX_DST_ADDR(0) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* LRP temp3, temp2.zzzz, temp1, temp3 ->
		 * - PRESUB temps, temp1 - temp3
		 * - MAD temp2.zzzz, temps, temp3 */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) |
						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
						       R500_RGB_ADDR1(1) |
						       R500_RGB_ADDR2(2)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) |
						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
						       R500_ALPHA_ADDR1(1) |
						       R500_ALPHA_ADDR2(2)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
						       R500_ALU_RGB_R_SWIZ_A_B |
						       R500_ALU_RGB_G_SWIZ_A_B |
						       R500_ALU_RGB_B_SWIZ_A_B |
						       R500_ALU_RGB_SEL_B_SRCP |
						       R500_ALU_RGB_R_SWIZ_B_R |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SEL_A_SRC2 |
						       R500_ALPHA_SWIZ_A_B |
						       R500_ALPHA_SEL_B_SRCP |
						       R500_ALPHA_SWIZ_B_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC0 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_B |
						       R500_ALU_RGBA_A_SWIZ_A));

		/* LRP temp0, temp2.zzzz, temp4, temp0 ->
		 * - PRESUB temps, temp4 - temp1
		 * - MAD temp2.zzzz, temps, temp0 */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
						       R500_RGB_ADDR1(4) |
						       R500_RGB_ADDR2(2)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
						       R500_ALPHA_ADDR1(4) |
						       R500_ALPHA_ADDR2(2)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
						       R500_ALU_RGB_R_SWIZ_A_B |
						       R500_ALU_RGB_G_SWIZ_A_B |
						       R500_ALU_RGB_B_SWIZ_A_B |
						       R500_ALU_RGB_SEL_B_SRCP |
						       R500_ALU_RGB_R_SWIZ_B_R |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SEL_A_SRC2 |
						       R500_ALPHA_SWIZ_A_B |
						       R500_ALPHA_SEL_B_SRCP |
						       R500_ALPHA_SWIZ_B_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC0 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_B |
						       R500_ALU_RGBA_A_SWIZ_A));

		/* LRP output, temp5.zzzz, temp3, temp0 ->
		 * - PRESUB temps, temp3 - temp0
		 * - MAD temp5.zzzz, temps, temp0 */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
						       R500_INST_LAST |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK |
						       R500_INST_RGB_OMASK_R |
						       R500_INST_RGB_OMASK_G |
						       R500_INST_RGB_OMASK_B |
						       R500_INST_ALPHA_OMASK));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
						       R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 |
						       R500_RGB_ADDR1(3) |
						       R500_RGB_ADDR2(5)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
						       R500_ALPHA_SRCP_OP_A1_MINUS_A0 |
						       R500_ALPHA_ADDR1(3) |
						       R500_ALPHA_ADDR2(5)));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 |
						       R500_ALU_RGB_R_SWIZ_A_B |
						       R500_ALU_RGB_G_SWIZ_A_B |
						       R500_ALU_RGB_B_SWIZ_A_B |
						       R500_ALU_RGB_SEL_B_SRCP |
						       R500_ALU_RGB_R_SWIZ_B_R |
						       R500_ALU_RGB_G_SWIZ_B_G |
						       R500_ALU_RGB_B_SWIZ_B_B));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) |
						       R500_ALPHA_OP_MAD |
						       R500_ALPHA_SEL_A_SRC2 |
						       R500_ALPHA_SWIZ_A_B |
						       R500_ALPHA_SEL_B_SRCP |
						       R500_ALPHA_SWIZ_B_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) |
						       R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_SEL_C_SRC0 |
						       R500_ALU_RGBA_R_SWIZ_R |
						       R500_ALU_RGBA_G_SWIZ_G |
						       R500_ALU_RGBA_B_SWIZ_B |
						       R500_ALU_RGBA_A_SWIZ_A));

		/* Shader constants. */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0));

		/* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */
		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w));
		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h));
		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);
		OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0);

		FINISH_ACCEL();

	    } else {
		BEGIN_ACCEL(19);
		/* 2 components: 2 for tex0 */
		OUT_ACCEL_REG(R300_RS_COUNT,
			      ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
			       R300_RS_COUNT_HIRES_EN));

		/* R300_INST_COUNT_RS - highest RS instruction used */
		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));

		/* Pixel stack frame size. */
		OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */

		/* FP length. */
		OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) |
						  R500_US_CODE_END_ADDR(1)));
		OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) |
						   R500_US_CODE_RANGE_SIZE(1)));

		/* Prepare for FP emission. */
		OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0));

		/* tex inst */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_RGB_WMASK_R |
						       R500_INST_RGB_WMASK_G |
						       R500_INST_RGB_WMASK_B |
						       R500_INST_ALPHA_WMASK |
						       R500_INST_RGB_CLAMP |
						       R500_INST_ALPHA_CLAMP));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) |
						       R500_TEX_INST_LD |
						       R500_TEX_SEM_ACQUIRE |
						       R500_TEX_IGNORE_UNCOVERED));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) |
						       R500_TEX_SRC_S_SWIZ_R |
						       R500_TEX_SRC_T_SWIZ_G |
						       R500_TEX_DST_ADDR(0) |
						       R500_TEX_DST_R_SWIZ_R |
						       R500_TEX_DST_G_SWIZ_G |
						       R500_TEX_DST_B_SWIZ_B |
						       R500_TEX_DST_A_SWIZ_A));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) |
						       R500_DX_S_SWIZ_R |
						       R500_DX_T_SWIZ_R |
						       R500_DX_R_SWIZ_R |
						       R500_DX_Q_SWIZ_R |
						       R500_DY_ADDR(0) |
						       R500_DY_S_SWIZ_R |
						       R500_DY_T_SWIZ_R |
						       R500_DY_R_SWIZ_R |
						       R500_DY_Q_SWIZ_R));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000);

		/* ALU inst */
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT |
						       R500_INST_TEX_SEM_WAIT |
						       R500_INST_LAST |
						       R500_INST_RGB_OMASK_R |
						       R500_INST_RGB_OMASK_G |
						       R500_INST_RGB_OMASK_B |
						       R500_INST_ALPHA_OMASK |
						       R500_INST_RGB_CLAMP |
						       R500_INST_ALPHA_CLAMP));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) |
						       R500_RGB_ADDR1(0) |
						       R500_RGB_ADDR1_CONST |
						       R500_RGB_ADDR2(0) |
						       R500_RGB_ADDR2_CONST));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) |
						       R500_ALPHA_ADDR1(0) |
						       R500_ALPHA_ADDR1_CONST |
						       R500_ALPHA_ADDR2(0) |
						       R500_ALPHA_ADDR2_CONST));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 |
						       R500_ALU_RGB_R_SWIZ_A_R |
						       R500_ALU_RGB_G_SWIZ_A_G |
						       R500_ALU_RGB_B_SWIZ_A_B |
						       R500_ALU_RGB_SEL_B_SRC0 |
						       R500_ALU_RGB_R_SWIZ_B_1 |
						       R500_ALU_RGB_B_SWIZ_B_1 |
						       R500_ALU_RGB_G_SWIZ_B_1));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD |
						       R500_ALPHA_SWIZ_A_A |
						       R500_ALPHA_SWIZ_B_1));
		OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD |
						       R500_ALU_RGBA_R_SWIZ_0 |
						       R500_ALU_RGBA_G_SWIZ_0 |
						       R500_ALU_RGBA_B_SWIZ_0 |
						       R500_ALU_RGBA_A_SWIZ_0));
		FINISH_ACCEL();
	    }
	}

	BEGIN_ACCEL(6);
	OUT_ACCEL_REG(R300_TX_INVALTAGS, 0);
	OUT_ACCEL_REG(R300_TX_ENABLE, txenable);

	OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset);
	OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch);

	blendcntl = RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO;
	/* no need to enable blending */
	OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, blendcntl);

	OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count);
	FINISH_ACCEL();

    } else {

	/* Same for R100/R200 */
	switch (pPixmap->drawable.bitsPerPixel) {
	case 16:
	    if (pPixmap->drawable.depth == 15)
		dst_format = RADEON_COLOR_FORMAT_ARGB1555;
	    else
		dst_format = RADEON_COLOR_FORMAT_RGB565;
	    break;
	case 32:
	    dst_format = RADEON_COLOR_FORMAT_ARGB8888;
	    break;
	default:
	    return;
	}

	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
	    isplanar = TRUE;
	}

	if (isplanar) {
	    txformat = RADEON_TXFORMAT_I8;
	} else {
	    if (pPriv->id == FOURCC_UYVY)
		txformat = RADEON_TXFORMAT_YVYU422;
	    else
		txformat = RADEON_TXFORMAT_VYUY422;
	}

	txformat |= RADEON_TXFORMAT_NON_POWER2;

	colorpitch = dst_pitch >> pixel_shift;

	if (RADEONTilingEnabled(pScrn, pPixmap))
	    colorpitch |= RADEON_COLOR_TILE_ENABLE;

	BEGIN_ACCEL(4);

	OUT_ACCEL_REG(RADEON_RB3D_CNTL,
		      dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
	OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);

	OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);

	OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL,
		      RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO);

	FINISH_ACCEL();


	if ((info->ChipFamily == CHIP_FAMILY_RV250) ||
	    (info->ChipFamily == CHIP_FAMILY_RV280) ||
	    (info->ChipFamily == CHIP_FAMILY_RS300) ||
	    (info->ChipFamily == CHIP_FAMILY_R200)) {

	    info->accel_state->texW[0] = pPriv->w;
	    info->accel_state->texH[0] = pPriv->h;

	    if (isplanar) {
		/* note: in contrast to r300, use input biasing on uv components */
		float yco = 1.1643;
		float yoff = -0.0625 * yco;
		float uco[3] = {0.0, -0.39173, 2.018};
		float vco[3] = {1.5958, -0.8129, 0.0};

		/* need 2 texcoord sets (even though they are identical) due
		   to denormalization! hw apparently can't premultiply
		   same coord set by different texture size */
		vtx_count = 6;

		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
			    (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
		txpitch -= 32;
		txfilter =  R200_MAG_FILTER_LINEAR |
			    R200_MIN_FILTER_LINEAR |
			    R200_CLAMP_S_CLAMP_LAST |
			    R200_CLAMP_T_CLAMP_LAST;

		BEGIN_ACCEL(36);

		OUT_ACCEL_REG(RADEON_PP_CNTL,
			      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
			      RADEON_TEX_BLEND_2_ENABLE);

		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
			      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));

		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
			      (pPriv->w - 1) |
			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);

		OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
		OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
		OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
		OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);

		OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
		OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
		OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
		OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);

		/* similar to r300 code. Note the big problem is that hardware constants
		 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
		 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
		 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
		 * the constants not. To get larger range can use output scale, but for
		 * that 2.018 value we need a total scale by 8, which means the constants
		 * really have no accuracy whatsoever (5 fractional bits only).
		 * The only direct way to get high  precision "constants" into the fragment
		 * pipe I know of is to use the texcoord interpolator (not color, this one
		 * is 8 bit only too), which seems a bit expensive. We're lucky though it
		 * seems the values we need seem to fit better than worst case (get about
		 * 6 fractional bits for this instead of 5, at least when not correcting for
		 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
		 * yoff get 8 fractional bits).
		 *
		 * A higher precision (8 fractional bits) version might just put uco into
		 * a texcoord, and calculate a new vcoconst in the shader, like so:
		 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
		 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
		 * vcocalc = ADD temp, bias/scale(cohelper), vco
		 * would in total use 4 tex units, 4 instructions which seems fairly
		 * balanced for this architecture (instead of 3 + 3 for the solution here)
		 *
		 * temp = MAD(yco, yuv.yyyy, yoff)
		 * temp = MAD(uco, yuv.uuuu, temp)
		 * result = MAD(vco, yuv.vvvv, temp)
		 *
		 * note first mad produces actually scalar, hence we transform
		 * it into a dp2a to get 8 bit precision of yco instead of 7 -
		 * That's assuming hw correctly expands consts to internal precision.
		 * (y * 1 + y * (yco - 1) + yoff)
		 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
		 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
		 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
		 *
		 * vco, uco need bias (and hence scale too)
		 *
		 */

		/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
			      R200_TXC_ARG_A_TFACTOR_COLOR |
			      R200_TXC_ARG_B_R0_COLOR |
			      R200_TXC_ARG_C_TFACTOR_COLOR |
			      R200_TXC_NEG_ARG_C |
			      R200_TXC_OP_DOT2_ADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
			      R200_TXC_SCALE_INV2 |
			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_ZERO |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
			      R200_TXA_OUTPUT_REG_NONE);

		/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
			      R200_TXC_ARG_A_TFACTOR_COLOR |
			      R200_TXC_BIAS_ARG_A |
			      R200_TXC_SCALE_ARG_A |
			      R200_TXC_ARG_B_R1_COLOR |
			      R200_TXC_BIAS_ARG_B |
			      R200_TXC_SCALE_ARG_B |
			      R200_TXC_ARG_C_R0_COLOR |
			      R200_TXC_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_ZERO |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
			      R200_TXA_OUTPUT_REG_NONE);

		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
			      R200_TXC_ARG_A_TFACTOR_COLOR |
			      R200_TXC_BIAS_ARG_A |
			      R200_TXC_SCALE_ARG_A |
			      R200_TXC_ARG_B_R2_COLOR |
			      R200_TXC_BIAS_ARG_B |
			      R200_TXC_ARG_C_R0_COLOR |
			      R200_TXC_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
			      R200_TXC_SCALE_2X |
			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_ZERO |
			      R200_TXA_COMP_ARG_C |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);

		/* shader constants */
		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
							      yco - 1.0,
							      -yoff, /* range [-1, 0] */
							      0.0));
		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
							      uco[1] * 0.125 + 0.5,
							      uco[2] * 0.125 + 0.5,
							      0.0));
		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
							      vco[1] * 0.25 + 0.5,
							      vco[2] * 0.25 + 0.5,
							      0.0));

		FINISH_ACCEL();
	    }
	    else if (info->ChipFamily == CHIP_FAMILY_RV250) {
		/* fix up broken packed yuv - shader same as above except
		   yuv compoents are all in same reg */
		float yco = 1.1643;
		float yoff = -0.0625 * yco;
		float uco[3] = {0.0, -0.39173, 2.018};
		float vco[3] = {1.5958, -0.8129, 0.0};

		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
			    (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
		txpitch -= 32;
		txfilter =  R200_MAG_FILTER_LINEAR |
			    R200_MIN_FILTER_LINEAR |
			    R200_CLAMP_S_CLAMP_LAST |
			    R200_CLAMP_T_CLAMP_LAST;

		BEGIN_ACCEL(24);

		OUT_ACCEL_REG(RADEON_PP_CNTL,
			      RADEON_TEX_0_ENABLE |
			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
			      RADEON_TEX_BLEND_2_ENABLE);

		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));

		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
			      (pPriv->w - 1) |
			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);

		/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
			      R200_TXC_ARG_A_TFACTOR_COLOR |
			      R200_TXC_ARG_B_R0_COLOR |
			      R200_TXC_ARG_C_TFACTOR_COLOR |
			      R200_TXC_NEG_ARG_C |
			      R200_TXC_OP_DOT2_ADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
			      R200_TXC_SCALE_INV2 |
			      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_ZERO |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
			      R200_TXA_OUTPUT_REG_NONE);

		/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
			      R200_TXC_ARG_A_TFACTOR_COLOR |
			      R200_TXC_BIAS_ARG_A |
			      R200_TXC_SCALE_ARG_A |
			      R200_TXC_ARG_B_R0_COLOR |
			      R200_TXC_BIAS_ARG_B |
			      R200_TXC_SCALE_ARG_B |
			      R200_TXC_ARG_C_R1_COLOR |
			      R200_TXC_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
			      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_ZERO |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
			      R200_TXA_OUTPUT_REG_NONE);

		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
			      R200_TXC_ARG_A_TFACTOR_COLOR |
			      R200_TXC_BIAS_ARG_A |
			      R200_TXC_SCALE_ARG_A |
			      R200_TXC_ARG_B_R0_COLOR |
			      R200_TXC_BIAS_ARG_B |
			      R200_TXC_ARG_C_R1_COLOR |
			      R200_TXC_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
			      R200_TXC_SCALE_2X |
			      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_ZERO |
			      R200_TXA_COMP_ARG_C |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);

		/* shader constants */
		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
							      yco - 1.0,
							      -yoff, /* range [-1, 0] */
							      0.0));
		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
							      uco[1] * 0.125 + 0.5,
							      uco[2] * 0.125 + 0.5,
							      0.0));
		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
							      vco[1] * 0.25 + 0.5,
							      vco[2] * 0.25 + 0.5,
							      0.0));

		FINISH_ACCEL();
	    }
	    else {
		BEGIN_ACCEL(13);
		OUT_ACCEL_REG(RADEON_PP_CNTL,
			      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);

		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));

		OUT_ACCEL_REG(R200_PP_TXFILTER_0,
			      R200_MAG_FILTER_LINEAR |
			      R200_MIN_FILTER_LINEAR |
			      R200_CLAMP_S_CLAMP_LAST |
			      R200_CLAMP_T_CLAMP_LAST |
			      R200_YUV_TO_RGB);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
			      (pPriv->w - 1) |
			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);

		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);

		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
			      R200_TXC_ARG_A_ZERO |
			      R200_TXC_ARG_B_ZERO |
			      R200_TXC_ARG_C_R0_COLOR |
			      R200_TXC_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
			      R200_TXA_ARG_A_ZERO |
			      R200_TXA_ARG_B_ZERO |
			      R200_TXA_ARG_C_R0_ALPHA |
			      R200_TXA_OP_MADD);
		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
		FINISH_ACCEL();
	    }
	} else {

	    info->accel_state->texW[0] = 1;
	    info->accel_state->texH[0] = 1;

	    BEGIN_ACCEL(9);

	    OUT_ACCEL_REG(RADEON_PP_CNTL,
			  RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);

	    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
					      RADEON_SE_VTX_FMT_ST0));

	    OUT_ACCEL_REG(RADEON_PP_TXFILTER_0,
			  RADEON_MAG_FILTER_LINEAR |
			  RADEON_MIN_FILTER_LINEAR |
			  RADEON_CLAMP_S_CLAMP_LAST |
			  RADEON_CLAMP_T_CLAMP_LAST |
			  RADEON_YUV_TO_RGB);
	    OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
	    OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset);
	    OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0,
			  RADEON_COLOR_ARG_A_ZERO |
			  RADEON_COLOR_ARG_B_ZERO |
			  RADEON_COLOR_ARG_C_T0_COLOR |
			  RADEON_BLEND_CTL_ADD |
			  RADEON_CLAMP_TX);
	    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0,
			  RADEON_ALPHA_ARG_A_ZERO |
			  RADEON_ALPHA_ARG_B_ZERO |
			  RADEON_ALPHA_ARG_C_T0_ALPHA |
			  RADEON_BLEND_CTL_ADD |
			  RADEON_CLAMP_TX);

	    OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0,
			  (pPriv->w - 1) |
			  ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
	    OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0,
			  pPriv->src_pitch - 32);
	    FINISH_ACCEL();
	}
    }

    if (pPriv->vsync) {
	xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn,
						    pPriv->drw_x,
						    pPriv->drw_x + pPriv->dst_w,
						    pPriv->drw_y,
						    pPriv->drw_y + pPriv->dst_h);
	if (crtc) {
	    RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private;

	    FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap,
					  radeon_crtc->crtc_id,
					  pPriv->drw_y - crtc->y,
					  (pPriv->drw_y - crtc->y) + pPriv->dst_h);
	}
    }
    /*
     * Rendering of the actual polygon is done in two different
     * ways depending on chip generation:
     *
     * < R300:
     *
     *     These chips can render a rectangle in one pass, so
     *     handling is pretty straight-forward.
     *
     * >= R300:
     *
     *     These chips can accept a quad, but will render it as
     *     two triangles which results in a diagonal tear. Instead
     *     We render a single, large triangle and use the scissor
     *     functionality to restrict it to the desired rectangle.
     *     Due to guardband limits on r3xx/r4xx, we can only use
     *     the single triangle up to 2880 pixels; above that we
     *     render as a quad.
     */

    while (nBox--) {
	int srcX, srcY, srcw, srch;
	int dstX, dstY, dstw, dsth;
	Bool use_quad = FALSE;
	dstX = pBox->x1 + dstxoff;
	dstY = pBox->y1 + dstyoff;
	dstw = pBox->x2 - pBox->x1;
	dsth = pBox->y2 - pBox->y1;

	srcX = ((pBox->x1 - pPriv->drw_x) *
		pPriv->src_w) / pPriv->dst_w;
	srcY = ((pBox->y1 - pPriv->drw_y) *
		pPriv->src_h) / pPriv->dst_h;

	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
	srch = (pPriv->src_h * dsth) / pPriv->dst_h;

#if 0
	ErrorF("dst: %d, %d, %d, %d\n", dstX, dstY, dstw, dsth);
	ErrorF("src: %d, %d, %d, %d\n", srcX, srcY, srcw, srch);
#endif

	if (IS_R300_3D || IS_R500_3D) {
	    if (IS_R300_3D && ((dstw+dsth) > 2880))
		use_quad = TRUE;
	    /*
	     * Set up the scissor area to that of the output size.
	     */
	    BEGIN_ACCEL(2);
	    if (IS_R300_3D) {
		/* R300 has an offset */
		OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) |
						 ((dstY + 1088) << R300_SCISSOR_Y_SHIFT)));
		OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) |
						 ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT)));
	    } else {
		OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) |
						 ((dstY) << R300_SCISSOR_Y_SHIFT)));
		OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) |
						 ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT)));
	    }
	    FINISH_ACCEL();
	}

#ifdef ACCEL_CP
	if (info->ChipFamily < CHIP_FAMILY_R200) {
	    BEGIN_RING(3 * vtx_count + 3);
	    OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD,
				3 * vtx_count + 1));
	    OUT_RING(RADEON_CP_VC_FRMT_XY |
		     RADEON_CP_VC_FRMT_ST0);
	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
		     RADEON_CP_VC_CNTL_MAOS_ENABLE |
		     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
	} else if (IS_R300_3D || IS_R500_3D) {
	    if (use_quad) {
		BEGIN_RING(4 * vtx_count + 4);
		OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
				    4 * vtx_count));
		OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST |
			 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
			 (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));
	    } else {
		BEGIN_RING(3 * vtx_count + 4);
		OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
				    3 * vtx_count));
		OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST |
			 RADEON_CP_VC_CNTL_PRIM_WALK_RING |
			 (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
	    }
	} else {
	    BEGIN_RING(3 * vtx_count + 2);
	    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2,
				3 * vtx_count));
	    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST |
		     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
		     (3 << RADEON_CP_VC_CNTL_NUM_SHIFT));
	}
#else /* ACCEL_CP */
	if (IS_R300_3D || IS_R500_3D) {
	    if (use_quad)
		BEGIN_ACCEL(2 + vtx_count * 4);
	    else
		BEGIN_ACCEL(2 + vtx_count * 3);
	} else
	    BEGIN_ACCEL(1 + vtx_count * 3);

	if (info->ChipFamily < CHIP_FAMILY_R200)
	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
					      RADEON_VF_PRIM_WALK_DATA |
					      RADEON_VF_RADEON_MODE |
					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
	else if (IS_R300_3D || IS_R500_3D) {
	    if (use_quad)
		OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
						  RADEON_VF_PRIM_WALK_DATA |
						  (4 << RADEON_VF_NUM_VERTICES_SHIFT)));
	    else
		OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST |
						  RADEON_VF_PRIM_WALK_DATA |
						  (3 << RADEON_VF_NUM_VERTICES_SHIFT)));
	} else
	    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST |
					      RADEON_VF_PRIM_WALK_DATA |
					      (3 << RADEON_VF_NUM_VERTICES_SHIFT)));

#endif
	if (pPriv->bicubic_enabled) {
		/*
		 * This code is only executed on >= R300, so we don't
		 * have to deal with the legacy handling.
		 */
	    if (use_quad) {
		VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
			       (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
			       (float)srcX + 0.5,                                 (float)srcY + 0.5);
		VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dsth),
			       (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
			       (float)srcX + 0.5,                                 (float)(srcY + srch) + 0.5);
		VTX_OUT_FILTER((float)(dstX + dstw),                              (float)(dstY + dsth),
			       (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
			       (float)(srcX + srcw) + 0.5,                        (float)(srcY + srch) + 0.5);
		VTX_OUT_FILTER((float)(dstX + dstw),                              (float)dstY,
			       (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
			       (float)(srcX + srcw) + 0.5,                        (float)srcY + 0.5);
	    } else {
		VTX_OUT_FILTER((float)dstX,                                       (float)dstY,
			       (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0],
			       (float)srcX + 0.5,                                 (float)srcY + 0.5);
		VTX_OUT_FILTER((float)dstX,                                       (float)(dstY + dstw + dsth),
			       (float)srcX / info->accel_state->texW[0],          ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0],
			       (float)srcX + 0.5,                                 (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5);
		VTX_OUT_FILTER((float)(dstX + dstw + dsth),                       (float)dstY,
			       ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
			                                                          (float)srcY / info->accel_state->texH[0],
			       (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5,
			                                                          (float)srcY + 0.5);
	    }
	} else {
	    if (IS_R300_3D || IS_R500_3D) {
		if (use_quad) {
		    VTX_OUT((float)dstX,                                       (float)dstY,
			    (float)srcX / info->accel_state->texW[0],          (float)srcY / info->accel_state->texH[0]);
		    VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
			    (float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
		    VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
			    (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
		    VTX_OUT((float)(dstX + dstw),                              (float)dstY,
			    (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
		} else {
		    /*
		     * Render a big, scissored triangle. This means
		     * increasing the triangle size and adjusting
		     * texture coordinates.
		     */
		    VTX_OUT((float)dstX,                              (float)dstY,
			    (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
		    VTX_OUT((float)dstX,                              (float)(dstY + dsth + dstw),
			    (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]);
			    
		    VTX_OUT((float)(dstX + dstw + dsth),              (float)dstY,
			    ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
			                                              (float)srcY / info->accel_state->texH[0]);
		}
	    } else if (isplanar) {
		/*
		 * Just render a rect (using three coords).
		 * Filter is a bit a misnomer, it's just texcoords...
		 */
		VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
	    } else {
		/*
		 * Just render a rect (using three coords).
		 */
		VTX_OUT((float)dstX,                                       (float)(dstY + dsth),
			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
		VTX_OUT((float)(dstX + dstw),                              (float)(dstY + dsth),
			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
		VTX_OUT((float)(dstX + dstw),                              (float)dstY,
			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
	    }
	}

	if (IS_R300_3D || IS_R500_3D)
	    /* flushing is pipelined, free/finish is not */
	    OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D);

#ifdef ACCEL_CP
	ADVANCE_RING();
#else
	FINISH_ACCEL();
#endif /* !ACCEL_CP */

	pBox++;
    }

    if (IS_R300_3D || IS_R500_3D) {
	BEGIN_ACCEL(3);
	OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA);
	OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL);
    } else
	BEGIN_ACCEL(1);
    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
    FINISH_ACCEL();

    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
}

#undef VTX_OUT
#undef VTX_OUT_FILTER
#undef FUNC_NAME