diff options
author | Alex Deucher <alexdeucher@gmail.com> | 2009-04-13 17:04:31 -0400 |
---|---|---|
committer | Alex Deucher <alexdeucher@gmail.com> | 2009-04-17 10:24:58 -0400 |
commit | 12839fc17a2cca4ac14b9757bdaa63ba4679f96f (patch) | |
tree | 8914fdb9b745d8653b0bf122653c0fac98d2b8c6 /src | |
parent | a30737b337edb31528174b483c9094941a5d41bb (diff) |
Tex vid: split by family
Diffstat (limited to 'src')
-rw-r--r-- | src/radeon_textured_video.c | 28 | ||||
-rw-r--r-- | src/radeon_textured_videofuncs.c | 4874 |
2 files changed, 2804 insertions, 2098 deletions
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c index bf8a2763..f64da025 100644 --- a/src/radeon_textured_video.c +++ b/src/radeon_textured_video.c @@ -598,13 +598,29 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn, pPriv->h = height; #ifdef XF86DRI - if (IS_R600_3D) - R600DisplayTexturedVideo(pScrn, pPriv); - else if (info->directRenderingEnabled) - RADEONDisplayTexturedVideoCP(pScrn, pPriv); - else + if (info->directRenderingEnabled) { + if (IS_R600_3D) + R600DisplayTexturedVideo(pScrn, pPriv); + else if (IS_R500_3D) + R500DisplayTexturedVideoCP(pScrn, pPriv); + else if (IS_R300_3D) + R300DisplayTexturedVideoCP(pScrn, pPriv); + else if (IS_R200_3D) + R200DisplayTexturedVideoCP(pScrn, pPriv); + else + RADEONDisplayTexturedVideoCP(pScrn, pPriv); + } else #endif - RADEONDisplayTexturedVideoMMIO(pScrn, pPriv); + { + if (IS_R500_3D) + R500DisplayTexturedVideoMMIO(pScrn, pPriv); + else if (IS_R300_3D) + R300DisplayTexturedVideoMMIO(pScrn, pPriv); + else if (IS_R200_3D) + R200DisplayTexturedVideoMMIO(pScrn, pPriv); + else + RADEONDisplayTexturedVideoMMIO(pScrn, pPriv); + } return Success; } diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c index 3c4289f0..9361f07f 100644 --- a/src/radeon_textured_videofuncs.c +++ b/src/radeon_textured_videofuncs.c @@ -93,10 +93,8 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv RADEONInfoPtr info = RADEONPTR(pScrn); PixmapPtr pPixmap = pPriv->pPixmap; uint32_t txformat; - uint32_t txfilter, txformat0, txformat1, txoffset, txpitch; uint32_t dst_offset, dst_pitch, dst_format; - uint32_t txenable, colorpitch; - uint32_t blendcntl; + uint32_t colorpitch; Bool isplanar = FALSE; int dstxoff, dstyoff, pixel_shift, vtx_count; BoxPtr pBox = REGION_RECTS(&pPriv->clip); @@ -132,10 +130,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv #endif { BEGIN_ACCEL(2); - if (IS_R300_3D || IS_R500_3D) - OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D); - else - OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH); + OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH); /* We must wait for 3d to idle, in case source was just written as a dest. */ OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_HOST_IDLECLEAN | @@ -148,2057 +143,1835 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv RADEONInit3DEngine(pScrn); } - if (pPriv->bicubic_enabled) - vtx_count = 6; - else - vtx_count = 4; + vtx_count = 4; - if (IS_R300_3D || IS_R500_3D) { - uint32_t output_fmt; - - switch (pPixmap->drawable.bitsPerPixel) { - case 16: - if (pPixmap->drawable.depth == 15) - dst_format = R300_COLORFORMAT_ARGB1555; - else - dst_format = R300_COLORFORMAT_RGB565; - break; - case 32: - dst_format = R300_COLORFORMAT_ARGB8888; - break; - default: - return; + /* Same for R100/R200 */ + switch (pPixmap->drawable.bitsPerPixel) { + case 16: + if (pPixmap->drawable.depth == 15) + dst_format = RADEON_COLOR_FORMAT_ARGB1555; + else + dst_format = RADEON_COLOR_FORMAT_RGB565; + break; + case 32: + dst_format = RADEON_COLOR_FORMAT_ARGB8888; + break; + default: + return; + } + + if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) { + isplanar = TRUE; + } + + if (isplanar) { + txformat = RADEON_TXFORMAT_I8; + } else { + if (pPriv->id == FOURCC_UYVY) + txformat = RADEON_TXFORMAT_YVYU422; + else + txformat = RADEON_TXFORMAT_VYUY422; + } + + txformat |= RADEON_TXFORMAT_NON_POWER2; + + colorpitch = dst_pitch >> pixel_shift; + + if (RADEONTilingEnabled(pScrn, pPixmap)) + colorpitch |= RADEON_COLOR_TILE_ENABLE; + + BEGIN_ACCEL(4); + + OUT_ACCEL_REG(RADEON_RB3D_CNTL, + dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/); + OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset); + + OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch); + + OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, + RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO); + + FINISH_ACCEL(); + + + info->accel_state->texW[0] = 1; + info->accel_state->texH[0] = 1; + + BEGIN_ACCEL(9); + + OUT_ACCEL_REG(RADEON_PP_CNTL, + RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE); + + OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY | + RADEON_SE_VTX_FMT_ST0)); + + OUT_ACCEL_REG(RADEON_PP_TXFILTER_0, + RADEON_MAG_FILTER_LINEAR | + RADEON_MIN_FILTER_LINEAR | + RADEON_CLAMP_S_CLAMP_LAST | + RADEON_CLAMP_T_CLAMP_LAST | + RADEON_YUV_TO_RGB); + OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat); + OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset); + OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0, + RADEON_COLOR_ARG_A_ZERO | + RADEON_COLOR_ARG_B_ZERO | + RADEON_COLOR_ARG_C_T0_COLOR | + RADEON_BLEND_CTL_ADD | + RADEON_CLAMP_TX); + OUT_ACCEL_REG(RADEON_PP_TXABLEND_0, + RADEON_ALPHA_ARG_A_ZERO | + RADEON_ALPHA_ARG_B_ZERO | + RADEON_ALPHA_ARG_C_T0_ALPHA | + RADEON_BLEND_CTL_ADD | + RADEON_CLAMP_TX); + + OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0, + (pPriv->w - 1) | + ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); + OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0, + pPriv->src_pitch - 32); + FINISH_ACCEL(); + + if (pPriv->vsync) { + xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn, + pPriv->drw_x, + pPriv->drw_x + pPriv->dst_w, + pPriv->drw_y, + pPriv->drw_y + pPriv->dst_h); + if (crtc) { + RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private; + + FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap, + radeon_crtc->crtc_id, + pPriv->drw_y - crtc->y, + (pPriv->drw_y - crtc->y) + pPriv->dst_h); } + } + /* + * Rendering of the actual polygon is done in two different + * ways depending on chip generation: + * + * < R300: + * + * These chips can render a rectangle in one pass, so + * handling is pretty straight-forward. + * + * >= R300: + * + * These chips can accept a quad, but will render it as + * two triangles which results in a diagonal tear. Instead + * We render a single, large triangle and use the scissor + * functionality to restrict it to the desired rectangle. + * Due to guardband limits on r3xx/r4xx, we can only use + * the single triangle up to 2880 pixels; above that we + * render as a quad. + */ + + while (nBox--) { + int srcX, srcY, srcw, srch; + int dstX, dstY, dstw, dsth; + dstX = pBox->x1 + dstxoff; + dstY = pBox->y1 + dstyoff; + dstw = pBox->x2 - pBox->x1; + dsth = pBox->y2 - pBox->y1; + + srcX = ((pBox->x1 - pPriv->drw_x) * + pPriv->src_w) / pPriv->dst_w; + srcY = ((pBox->y1 - pPriv->drw_y) * + pPriv->src_h) / pPriv->dst_h; + + srcw = (pPriv->src_w * dstw) / pPriv->dst_w; + srch = (pPriv->src_h * dsth) / pPriv->dst_h; + +#ifdef ACCEL_CP + BEGIN_RING(3 * vtx_count + 3); + OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD, + 3 * vtx_count + 1)); + OUT_RING(RADEON_CP_VC_FRMT_XY | + RADEON_CP_VC_FRMT_ST0); + OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST | + RADEON_CP_VC_CNTL_PRIM_WALK_RING | + RADEON_CP_VC_CNTL_MAOS_ENABLE | + RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE | + (3 << RADEON_CP_VC_CNTL_NUM_SHIFT)); +#else /* ACCEL_CP */ + BEGIN_ACCEL(1 + vtx_count * 3); + OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST | + RADEON_VF_PRIM_WALK_DATA | + RADEON_VF_RADEON_MODE | + (3 << RADEON_VF_NUM_VERTICES_SHIFT))); +#endif + /* + * Just render a rect (using three coords). + */ + VTX_OUT((float)dstX, (float)(dstY + dsth), + (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); + VTX_OUT((float)(dstX + dstw), (float)(dstY + dsth), + (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); + VTX_OUT((float)(dstX + dstw), (float)dstY, + (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); + +#ifdef ACCEL_CP + ADVANCE_RING(); +#else + FINISH_ACCEL(); +#endif /* !ACCEL_CP */ + + pBox++; + } - output_fmt = (R300_OUT_FMT_C4_8 | - R300_OUT_FMT_C0_SEL_BLUE | - R300_OUT_FMT_C1_SEL_GREEN | - R300_OUT_FMT_C2_SEL_RED | - R300_OUT_FMT_C3_SEL_ALPHA); + BEGIN_ACCEL(1); + OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN); + FINISH_ACCEL(); - colorpitch = dst_pitch >> pixel_shift; - colorpitch |= dst_format; + DamageDamageRegion(pPriv->pDraw, &pPriv->clip); +} - if (RADEONTilingEnabled(pScrn, pPixmap)) - colorpitch |= R300_COLORTILE; +static void +FUNC_NAME(R200DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv) +{ + RADEONInfoPtr info = RADEONPTR(pScrn); + PixmapPtr pPixmap = pPriv->pPixmap; + uint32_t txformat; + uint32_t txfilter, txformat0, txpitch; + uint32_t dst_offset, dst_pitch, dst_format; + uint32_t colorpitch; + Bool isplanar = FALSE; + int dstxoff, dstyoff, pixel_shift, vtx_count; + BoxPtr pBox = REGION_RECTS(&pPriv->clip); + int nBox = REGION_NUM_RECTS(&pPriv->clip); + ACCEL_PREAMBLE(); - if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) { - isplanar = TRUE; + pixel_shift = pPixmap->drawable.bitsPerPixel >> 4; + +#ifdef USE_EXA + if (info->useEXA) { + dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset; + dst_pitch = exaGetPixmapPitch(pPixmap); + } else +#endif + { + dst_offset = (pPixmap->devPrivate.ptr - info->FB) + + info->fbLocation + pScrn->fbOffset; + dst_pitch = pPixmap->devKind; } - if (isplanar) { - txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0; - txpitch = pPriv->src_pitch; - } else { - if (pPriv->id == FOURCC_UYVY) - txformat1 = R300_TX_FORMAT_YVYU422; - else - txformat1 = R300_TX_FORMAT_VYUY422; +#ifdef COMPOSITE + dstxoff = -pPixmap->screen_x + pPixmap->drawable.x; + dstyoff = -pPixmap->screen_y + pPixmap->drawable.y; +#else + dstxoff = 0; + dstyoff = 0; +#endif - txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP; +#ifdef USE_EXA + if (info->useEXA) { + RADEON_SWITCH_TO_3D(); + } else +#endif + { + BEGIN_ACCEL(2); + OUT_ACCEL_REG(RADEON_RB3D_DSTCACHE_CTLSTAT, RADEON_RB3D_DC_FLUSH); + /* We must wait for 3d to idle, in case source was just written as a dest. */ + OUT_ACCEL_REG(RADEON_WAIT_UNTIL, + RADEON_WAIT_HOST_IDLECLEAN | + RADEON_WAIT_2D_IDLECLEAN | + RADEON_WAIT_3D_IDLECLEAN | + RADEON_WAIT_DMA_GUI_IDLE); + FINISH_ACCEL(); - /* pitch is in pixels */ - txpitch = pPriv->src_pitch / 2; + if (!info->accel_state->XInited3D) + RADEONInit3DEngine(pScrn); } - txpitch -= 1; - txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) | - (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) | - R300_TXPITCH_EN); + vtx_count = 4; - info->accel_state->texW[0] = pPriv->w; - info->accel_state->texH[0] = pPriv->h; + /* Same for R100/R200 */ + switch (pPixmap->drawable.bitsPerPixel) { + case 16: + if (pPixmap->drawable.depth == 15) + dst_format = RADEON_COLOR_FORMAT_ARGB1555; + else + dst_format = RADEON_COLOR_FORMAT_RGB565; + break; + case 32: + dst_format = RADEON_COLOR_FORMAT_ARGB8888; + break; + default: + return; + } - txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) | - R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) | - R300_TX_MAG_FILTER_LINEAR | - R300_TX_MIN_FILTER_LINEAR | - (0 << R300_TX_ID_SHIFT)); + if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) { + isplanar = TRUE; + } + if (isplanar) { + txformat = RADEON_TXFORMAT_I8; + } else { + if (pPriv->id == FOURCC_UYVY) + txformat = RADEON_TXFORMAT_YVYU422; + else + txformat = RADEON_TXFORMAT_VYUY422; + } - if (IS_R500_3D && ((pPriv->w - 1) & 0x800)) - txpitch |= R500_TXWIDTH_11; + txformat |= RADEON_TXFORMAT_NON_POWER2; - if (IS_R500_3D && ((pPriv->h - 1) & 0x800)) - txpitch |= R500_TXHEIGHT_11; + colorpitch = dst_pitch >> pixel_shift; - txoffset = pPriv->src_offset; + if (RADEONTilingEnabled(pScrn, pPixmap)) + colorpitch |= RADEON_COLOR_TILE_ENABLE; - BEGIN_ACCEL(6); - OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter); - OUT_ACCEL_REG(R300_TX_FILTER1_0, 0); - OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0); - OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1); - OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch); - OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset); - FINISH_ACCEL(); + BEGIN_ACCEL(4); - txenable = R300_TEX_0_ENABLE; + OUT_ACCEL_REG(RADEON_RB3D_CNTL, + dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/); + OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset); - if (isplanar) { - txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) | - (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) | - R300_TXPITCH_EN); - txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; - txpitch -= 1; - txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) | - R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) | - R300_TX_MIN_FILTER_LINEAR | - R300_TX_MAG_FILTER_LINEAR); - - BEGIN_ACCEL(12); - OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT)); - OUT_ACCEL_REG(R300_TX_FILTER1_1, 0); - OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0); - OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2); - OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch); - OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset); - OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT)); - OUT_ACCEL_REG(R300_TX_FILTER1_2, 0); - OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0); - OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3); - OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch); - OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset); - FINISH_ACCEL(); - txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE; + OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch); + + OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, + RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO); + + FINISH_ACCEL(); + + info->accel_state->texW[0] = pPriv->w; + info->accel_state->texH[0] = pPriv->h; + + if (isplanar) { + /* note: in contrast to r300, use input biasing on uv components */ + const float Loff = -0.0627; + float uvcosf, uvsinf; + float yco, yoff; + float uco[3], vco[3]; + float bright, cont, sat; + int ref = pPriv->transform_index; + float ucscale = 0.25, vcscale = 0.25; + Bool needux8 = FALSE, needvx8 = FALSE; + + /* contrast can cause constant overflow, clamp */ + cont = RTFContrast(pPriv->contrast); + if (cont * trans[ref].RefLuma > 2.0) + cont = 2.0 / trans[ref].RefLuma; + /* brightness is only from -0.5 to 0.5 should be safe */ + bright = RTFBrightness(pPriv->brightness); + /* saturation can also cause overflow, clamp */ + sat = RTFSaturation(pPriv->saturation); + if (sat * trans[ref].RefBCb > 4.0) + sat = 4.0 / trans[ref].RefBCb; + uvcosf = sat * cos(RTFHue(pPriv->hue)); + uvsinf = sat * sin(RTFHue(pPriv->hue)); + + yco = trans[ref].RefLuma * cont; + uco[0] = -trans[ref].RefRCr * uvsinf; + uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf; + uco[2] = trans[ref].RefBCb * uvcosf; + vco[0] = trans[ref].RefRCr * uvcosf; + vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf; + vco[2] = trans[ref].RefBCb * uvsinf; + yoff = Loff * yco + bright; + + if ((uco[0] > 2.0) || (uco[2] > 2.0)) { + needux8 = TRUE; + ucscale = 0.125; + } + if ((vco[0] > 2.0) || (vco[2] > 2.0)) { + needvx8 = TRUE; + vcscale = 0.125; } - if (pPriv->bicubic_enabled) { - /* Size is 128x1 */ - txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) | - (0x0 << R300_TXHEIGHT_SHIFT) | - R300_TXPITCH_EN); - /* Format is 32-bit floats, 4bpp */ - txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16); - /* Pitch is 127 (128-1) */ - txpitch = 0x7f; - /* Tex filter */ - txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) | - R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) | - R300_TX_MIN_FILTER_NEAREST | - R300_TX_MAG_FILTER_NEAREST | - (1 << R300_TX_ID_SHIFT)); - - BEGIN_ACCEL(6); - OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter); - OUT_ACCEL_REG(R300_TX_FILTER1_1, 0); - OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0); - OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1); - OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch); - OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset); - FINISH_ACCEL(); - - /* Enable tex 1 */ - txenable |= R300_TEX_1_ENABLE; + /* need 2 texcoord sets (even though they are identical) due + to denormalization! hw apparently can't premultiply + same coord set by different texture size */ + vtx_count = 6; + + txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) | + (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT)); + txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; + txpitch -= 32; + txfilter = R200_MAG_FILTER_LINEAR | + R200_MIN_FILTER_LINEAR | + R200_CLAMP_S_CLAMP_LAST | + R200_CLAMP_T_CLAMP_LAST; + + BEGIN_ACCEL(36); + + OUT_ACCEL_REG(RADEON_PP_CNTL, + RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE | + RADEON_TEX_BLEND_0_ENABLE | + RADEON_TEX_BLEND_1_ENABLE | + RADEON_TEX_BLEND_2_ENABLE); + + OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY); + OUT_ACCEL_REG(R200_SE_VTX_FMT_1, + (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) | + (2 << R200_VTX_TEX1_COMP_CNT_SHIFT)); + + OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter); + OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat); + OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0); + OUT_ACCEL_REG(R200_PP_TXSIZE_0, + (pPriv->w - 1) | + ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); + OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32); + OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset); + + OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter); + OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1); + OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0); + OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0); + OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch); + OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset); + + OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter); + OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1); + OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0); + OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0); + OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch); + OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset); + + /* similar to r300 code. Note the big problem is that hardware constants + * are 8 bits only, representing 0.0-1.0. We can get that up (using bias + * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually + * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but + * the constants not. To get larger range can use output scale, but for + * that 2.018 value we need a total scale by 8, which means the constants + * really have no accuracy whatsoever (5 fractional bits only). + * The only direct way to get high precision "constants" into the fragment + * pipe I know of is to use the texcoord interpolator (not color, this one + * is 8 bit only too), which seems a bit expensive. We're lucky though it + * seems the values we need seem to fit better than worst case (get about + * 6 fractional bits for this instead of 5, at least when not correcting for + * hue/saturation/contrast/brightness, which is the same as for vco - yco and + * yoff get 8 fractional bits). Try to preserve as much accuracy as possible + * even with non-default saturation/hue/contrast/brightness adjustments, + * it gets a little crazy and ultimately precision might still be lacking. + * + * A higher precision (8 fractional bits) version might just put uco into + * a texcoord, and calculate a new vcoconst in the shader, like so: + * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable + * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0} + * vcocalc = ADD temp, bias/scale(cohelper), vco + * would in total use 4 tex units, 4 instructions which seems fairly + * balanced for this architecture (instead of 3 + 3 for the solution here) + * + * temp = MAD(yco, yuv.yyyy, yoff) + * temp = MAD(uco, yuv.uuuu, temp) + * result = MAD(vco, yuv.vvvv, temp) + * + * note first mad produces actually scalar, hence we transform + * it into a dp2a to get 8 bit precision of yco instead of 7 - + * That's assuming hw correctly expands consts to internal precision. + * (y * 1 + y * (yco - 1) + yoff) + * temp = DP2A / 2 (yco, yuv.yyyy, yoff) + * temp = MAD (uco / 4, yuv.uuuu * 2, temp) + * result = MAD x2 (vco / 2, yuv.vvvv, temp) + * + * vco, uco need bias (and hence scale too) + * + */ + + /* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */ + OUT_ACCEL_REG(R200_PP_TXCBLEND_0, + R200_TXC_ARG_A_TFACTOR_COLOR | + R200_TXC_ARG_B_R0_COLOR | + R200_TXC_ARG_C_TFACTOR_COLOR | + (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) | + R200_TXC_OP_DOT2_ADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, + (0 << R200_TXC_TFACTOR_SEL_SHIFT) | + R200_TXC_SCALE_INV2 | + R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0); + OUT_ACCEL_REG(R200_PP_TXABLEND_0, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_ZERO | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_0, + R200_TXA_OUTPUT_REG_NONE); + + /* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */ + OUT_ACCEL_REG(R200_PP_TXCBLEND_1, + R200_TXC_ARG_A_TFACTOR_COLOR | + R200_TXC_BIAS_ARG_A | + R200_TXC_SCALE_ARG_A | + R200_TXC_ARG_B_R1_COLOR | + R200_TXC_BIAS_ARG_B | + (needux8 ? R200_TXC_SCALE_ARG_B : 0) | + R200_TXC_ARG_C_R0_COLOR | + R200_TXC_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_1, + (1 << R200_TXC_TFACTOR_SEL_SHIFT) | + R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0); + OUT_ACCEL_REG(R200_PP_TXABLEND_1, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_ZERO | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_1, + R200_TXA_OUTPUT_REG_NONE); + + /* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */ + OUT_ACCEL_REG(R200_PP_TXCBLEND_2, + R200_TXC_ARG_A_TFACTOR_COLOR | + R200_TXC_BIAS_ARG_A | + R200_TXC_SCALE_ARG_A | + R200_TXC_ARG_B_R2_COLOR | + R200_TXC_BIAS_ARG_B | + (needvx8 ? R200_TXC_SCALE_ARG_B : 0) | + R200_TXC_ARG_C_R0_COLOR | + R200_TXC_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_2, + (2 << R200_TXC_TFACTOR_SEL_SHIFT) | + R200_TXC_SCALE_2X | + R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0); + OUT_ACCEL_REG(R200_PP_TXABLEND_2, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_ZERO | + R200_TXA_COMP_ARG_C | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_2, + R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0); + + /* shader constants */ + OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */ + yco > 1.0 ? yco - 1.0: yco, + yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */ + 0.0)); + OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */ + uco[1] * ucscale + 0.5, /* or [-2, 2] */ + uco[2] * ucscale + 0.5, + 0.0)); + OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */ + vco[1] * vcscale + 0.5, /* or [-4, 4] */ + vco[2] * vcscale + 0.5, + 0.0)); + + FINISH_ACCEL(); + } else if (info->ChipFamily == CHIP_FAMILY_RV250) { + /* fix up broken packed yuv - shader same as above except + yuv components are all in same reg */ + /* note: in contrast to r300, use input biasing on uv components */ + const float Loff = -0.0627; + float uvcosf, uvsinf; + float yco, yoff; + float uco[3], vco[3]; + float bright, cont, sat; + int ref = pPriv->transform_index; + float ucscale = 0.25, vcscale = 0.25; + Bool needux8 = FALSE, needvx8 = FALSE; + + /* contrast can cause constant overflow, clamp */ + cont = RTFContrast(pPriv->contrast); + if (cont * trans[ref].RefLuma > 2.0) + cont = 2.0 / trans[ref].RefLuma; + /* brightness is only from -0.5 to 0.5 should be safe */ + bright = RTFBrightness(pPriv->brightness); + /* saturation can also cause overflow, clamp */ + sat = RTFSaturation(pPriv->saturation); + if (sat * trans[ref].RefBCb > 4.0) + sat = 4.0 / trans[ref].RefBCb; + uvcosf = sat * cos(RTFHue(pPriv->hue)); + uvsinf = sat * sin(RTFHue(pPriv->hue)); + + yco = trans[ref].RefLuma * cont; + uco[0] = -trans[ref].RefRCr * uvsinf; + uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf; + uco[2] = trans[ref].RefBCb * uvcosf; + vco[0] = trans[ref].RefRCr * uvcosf; + vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf; + vco[2] = trans[ref].RefBCb * uvsinf; + yoff = Loff * yco + bright; + + if ((uco[0] > 2.0) || (uco[2] > 2.0)) { + needux8 = TRUE; + ucscale = 0.125; + } + if ((vco[0] > 2.0) || (vco[2] > 2.0)) { + needvx8 = TRUE; + vcscale = 0.125; } - /* setup the VAP */ - if (info->accel_state->has_tcl) { - if (pPriv->bicubic_enabled) - BEGIN_ACCEL(7); - else - BEGIN_ACCEL(6); - } else { - if (pPriv->bicubic_enabled) - BEGIN_ACCEL(5); - else - BEGIN_ACCEL(4); + txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) | + (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT)); + txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; + txpitch -= 32; + txfilter = R200_MAG_FILTER_LINEAR | + R200_MIN_FILTER_LINEAR | + R200_CLAMP_S_CLAMP_LAST | + R200_CLAMP_T_CLAMP_LAST; + + BEGIN_ACCEL(24); + + OUT_ACCEL_REG(RADEON_PP_CNTL, + RADEON_TEX_0_ENABLE | + RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE | + RADEON_TEX_BLEND_2_ENABLE); + + OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY); + OUT_ACCEL_REG(R200_SE_VTX_FMT_1, + (2 << R200_VTX_TEX0_COMP_CNT_SHIFT)); + + OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter); + OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat); + OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0); + OUT_ACCEL_REG(R200_PP_TXSIZE_0, + (pPriv->w - 1) | + ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); + OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32); + OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset); + + /* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */ + OUT_ACCEL_REG(R200_PP_TXCBLEND_0, + R200_TXC_ARG_A_TFACTOR_COLOR | + R200_TXC_ARG_B_R0_COLOR | + R200_TXC_ARG_C_TFACTOR_COLOR | + (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) | + R200_TXC_OP_DOT2_ADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, + (0 << R200_TXC_TFACTOR_SEL_SHIFT) | + R200_TXC_SCALE_INV2 | + (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) | + R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1); + OUT_ACCEL_REG(R200_PP_TXABLEND_0, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_ZERO | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_0, + R200_TXA_OUTPUT_REG_NONE); + + /* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */ + OUT_ACCEL_REG(R200_PP_TXCBLEND_1, + R200_TXC_ARG_A_TFACTOR_COLOR | + R200_TXC_BIAS_ARG_A | + R200_TXC_SCALE_ARG_A | + R200_TXC_ARG_B_R0_COLOR | + R200_TXC_BIAS_ARG_B | + (needux8 ? R200_TXC_SCALE_ARG_B : 0) | + R200_TXC_ARG_C_R1_COLOR | + R200_TXC_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_1, + (1 << R200_TXC_TFACTOR_SEL_SHIFT) | + (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) | + R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1); + OUT_ACCEL_REG(R200_PP_TXABLEND_1, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_ZERO | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_1, + R200_TXA_OUTPUT_REG_NONE); + + /* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */ + OUT_ACCEL_REG(R200_PP_TXCBLEND_2, + R200_TXC_ARG_A_TFACTOR_COLOR | + R200_TXC_BIAS_ARG_A | + R200_TXC_SCALE_ARG_A | + R200_TXC_ARG_B_R0_COLOR | + R200_TXC_BIAS_ARG_B | + (needvx8 ? R200_TXC_SCALE_ARG_B : 0) | + R200_TXC_ARG_C_R1_COLOR | + R200_TXC_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_2, + (2 << R200_TXC_TFACTOR_SEL_SHIFT) | + R200_TXC_SCALE_2X | + (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) | + R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0); + OUT_ACCEL_REG(R200_PP_TXABLEND_2, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_ZERO | + R200_TXA_COMP_ARG_C | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_2, + R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0); + + /* shader constants */ + OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */ + yco > 1.0 ? yco - 1.0: yco, + yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */ + 0.0)); + OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */ + uco[1] * ucscale + 0.5, /* or [-2, 2] */ + uco[2] * ucscale + 0.5, + 0.0)); + OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */ + vco[1] * vcscale + 0.5, /* or [-4, 4] */ + vco[2] * vcscale + 0.5, + 0.0)); + + FINISH_ACCEL(); + } else { + BEGIN_ACCEL(13); + OUT_ACCEL_REG(RADEON_PP_CNTL, + RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE); + + OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY); + OUT_ACCEL_REG(R200_SE_VTX_FMT_1, + (2 << R200_VTX_TEX0_COMP_CNT_SHIFT)); + + OUT_ACCEL_REG(R200_PP_TXFILTER_0, + R200_MAG_FILTER_LINEAR | + R200_MIN_FILTER_LINEAR | + R200_CLAMP_S_CLAMP_LAST | + R200_CLAMP_T_CLAMP_LAST | + R200_YUV_TO_RGB); + OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat); + OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0); + OUT_ACCEL_REG(R200_PP_TXSIZE_0, + (pPriv->w - 1) | + ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); + OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32); + + OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset); + + OUT_ACCEL_REG(R200_PP_TXCBLEND_0, + R200_TXC_ARG_A_ZERO | + R200_TXC_ARG_B_ZERO | + R200_TXC_ARG_C_R0_COLOR | + R200_TXC_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, + R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0); + OUT_ACCEL_REG(R200_PP_TXABLEND_0, + R200_TXA_ARG_A_ZERO | + R200_TXA_ARG_B_ZERO | + R200_TXA_ARG_C_R0_ALPHA | + R200_TXA_OP_MADD); + OUT_ACCEL_REG(R200_PP_TXABLEND2_0, + R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0); + FINISH_ACCEL(); + } + + if (pPriv->vsync) { + xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn, + pPriv->drw_x, + pPriv->drw_x + pPriv->dst_w, + pPriv->drw_y, + pPriv->drw_y + pPriv->dst_h); + if (crtc) { + RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private; + + FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap, + radeon_crtc->crtc_id, + pPriv->drw_y - crtc->y, + (pPriv->drw_y - crtc->y) + pPriv->dst_h); } + } + /* + * Rendering of the actual polygon is done in two different + * ways depending on chip generation: + * + * < R300: + * + * These chips can render a rectangle in one pass, so + * handling is pretty straight-forward. + * + * >= R300: + * + * These chips can accept a quad, but will render it as + * two triangles which results in a diagonal tear. Instead + * We render a single, large triangle and use the scissor + * functionality to restrict it to the desired rectangle. + * Due to guardband limits on r3xx/r4xx, we can only use + * the single triangle up to 2880 pixels; above that we + * render as a quad. + */ - /* These registers define the number, type, and location of data submitted - * to the PVS unit of GA input (when PVS is disabled) - * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is - * enabled. This memory provides the imputs to the vertex shader program - * and ordering is not important. When PVS/TCL is disabled, this field maps - * directly to the GA input memory and the order is signifigant. In - * PVS_BYPASS mode the order is as follows: - * Position - * Point Size - * Color 0-3 - * Textures 0-7 - * Fog - */ - if (pPriv->bicubic_enabled) { - OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0, - ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) | - (0 << R300_SKIP_DWORDS_0_SHIFT) | - (0 << R300_DST_VEC_LOC_0_SHIFT) | - R300_SIGNED_0 | - (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) | - (0 << R300_SKIP_DWORDS_1_SHIFT) | - (6 << R300_DST_VEC_LOC_1_SHIFT) | - R300_SIGNED_1)); - OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1, - ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) | - (0 << R300_SKIP_DWORDS_2_SHIFT) | - (7 << R300_DST_VEC_LOC_2_SHIFT) | - R300_LAST_VEC_2 | - R300_SIGNED_2)); + while (nBox--) { + int srcX, srcY, srcw, srch; + int dstX, dstY, dstw, dsth; + dstX = pBox->x1 + dstxoff; + dstY = pBox->y1 + dstyoff; + dstw = pBox->x2 - pBox->x1; + dsth = pBox->y2 - pBox->y1; + + srcX = ((pBox->x1 - pPriv->drw_x) * + pPriv->src_w) / pPriv->dst_w; + srcY = ((pBox->y1 - pPriv->drw_y) * + pPriv->src_h) / pPriv->dst_h; + + srcw = (pPriv->src_w * dstw) / pPriv->dst_w; + srch = (pPriv->src_h * dsth) / pPriv->dst_h; + +#ifdef ACCEL_CP + BEGIN_RING(3 * vtx_count + 2); + OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, + 3 * vtx_count)); + OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST | + RADEON_CP_VC_CNTL_PRIM_WALK_RING | + (3 << RADEON_CP_VC_CNTL_NUM_SHIFT)); +#else /* ACCEL_CP */ + BEGIN_ACCEL(1 + vtx_count * 3); + OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST | + RADEON_VF_PRIM_WALK_DATA | + (3 << RADEON_VF_NUM_VERTICES_SHIFT))); + +#endif + if (isplanar) { + /* + * Just render a rect (using three coords). + * Filter is a bit a misnomer, it's just texcoords... + */ + VTX_OUT_FILTER((float)dstX, (float)(dstY + dsth), + (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0], + (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); + VTX_OUT_FILTER((float)(dstX + dstw), (float)(dstY + dsth), + (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0], + (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); + VTX_OUT_FILTER((float)(dstX + dstw), (float)dstY, + (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0], + (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); } else { - OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0, - ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) | - (0 << R300_SKIP_DWORDS_0_SHIFT) | - (0 << R300_DST_VEC_LOC_0_SHIFT) | - R300_SIGNED_0 | - (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) | - (0 << R300_SKIP_DWORDS_1_SHIFT) | - (6 << R300_DST_VEC_LOC_1_SHIFT) | - R300_LAST_VEC_1 | - R300_SIGNED_1)); + /* + * Just render a rect (using three coords). + */ + VTX_OUT((float)dstX, (float)(dstY + dsth), + (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); + VTX_OUT((float)(dstX + dstw), (float)(dstY + dsth), + (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); + VTX_OUT((float)(dstX + dstw), (float)dstY, + (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); } - /* load the vertex shader - * We pre-load vertex programs in RADEONInit3DEngine(): - * - exa mask/Xv bicubic - * - exa no mask - * - Xv - * Here we select the offset of the vertex program we want to use - */ - if (info->accel_state->has_tcl) { - if (pPriv->bicubic_enabled) { - OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0, - ((0 << R300_PVS_FIRST_INST_SHIFT) | - (2 << R300_PVS_XYZW_VALID_INST_SHIFT) | - (2 << R300_PVS_LAST_INST_SHIFT))); - OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1, - (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT)); - } else { - OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0, - ((5 << R300_PVS_FIRST_INST_SHIFT) | - (6 << R300_PVS_XYZW_VALID_INST_SHIFT) | - (6 << R300_PVS_LAST_INST_SHIFT))); - OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1, - (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT)); - } +#ifdef ACCEL_CP + ADVANCE_RING(); +#else + FINISH_ACCEL(); +#endif /* !ACCEL_CP */ + + pBox++; + } + + BEGIN_ACCEL(1); + OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN); + FINISH_ACCEL(); + + DamageDamageRegion(pPriv->pDraw, &pPriv->clip); +} + +static void +FUNC_NAME(R300DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv) +{ + RADEONInfoPtr info = RADEONPTR(pScrn); + PixmapPtr pPixmap = pPriv->pPixmap; + uint32_t txfilter, txformat0, txformat1, txoffset, txpitch; + uint32_t dst_offset, dst_pitch, dst_format; + uint32_t txenable, colorpitch; + uint32_t output_fmt; + Bool isplanar = FALSE; + int dstxoff, dstyoff, pixel_shift, vtx_count; + BoxPtr pBox = REGION_RECTS(&pPriv->clip); + int nBox = REGION_NUM_RECTS(&pPriv->clip); + ACCEL_PREAMBLE(); + + pixel_shift = pPixmap->drawable.bitsPerPixel >> 4; + +#ifdef USE_EXA + if (info->useEXA) { + dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset; + dst_pitch = exaGetPixmapPitch(pPixmap); + } else +#endif + { + dst_offset = (pPixmap->devPrivate.ptr - info->FB) + + info->fbLocation + pScrn->fbOffset; + dst_pitch = pPixmap->devKind; } - /* Position and one set of 2 texture coordinates */ - OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT); - if (pPriv->bicubic_enabled) - OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) | - (2 << R300_TEX_1_COMP_CNT_SHIFT))); +#ifdef COMPOSITE + dstxoff = -pPixmap->screen_x + pPixmap->drawable.x; + dstyoff = -pPixmap->screen_y + pPixmap->drawable.y; +#else + dstxoff = 0; + dstyoff = 0; +#endif + +#ifdef USE_EXA + if (info->useEXA) { + RADEON_SWITCH_TO_3D(); + } else +#endif + { + BEGIN_ACCEL(2); + OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D); + /* We must wait for 3d to idle, in case source was just written as a dest. */ + OUT_ACCEL_REG(RADEON_WAIT_UNTIL, + RADEON_WAIT_HOST_IDLECLEAN | + RADEON_WAIT_2D_IDLECLEAN | + RADEON_WAIT_3D_IDLECLEAN | + RADEON_WAIT_DMA_GUI_IDLE); + FINISH_ACCEL(); + + if (!info->accel_state->XInited3D) + RADEONInit3DEngine(pScrn); + } + + if (pPriv->bicubic_enabled) + vtx_count = 6; + else + vtx_count = 4; + + switch (pPixmap->drawable.bitsPerPixel) { + case 16: + if (pPixmap->drawable.depth == 15) + dst_format = R300_COLORFORMAT_ARGB1555; else - OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT)); + dst_format = R300_COLORFORMAT_RGB565; + break; + case 32: + dst_format = R300_COLORFORMAT_ARGB8888; + break; + default: + return; + } - OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt); - FINISH_ACCEL(); + output_fmt = (R300_OUT_FMT_C4_8 | + R300_OUT_FMT_C0_SEL_BLUE | + R300_OUT_FMT_C1_SEL_GREEN | + R300_OUT_FMT_C2_SEL_RED | + R300_OUT_FMT_C3_SEL_ALPHA); - /* setup pixel shader */ - if (IS_R300_3D) { - if (pPriv->bicubic_enabled) { - BEGIN_ACCEL(79); - - /* 4 components: 2 for tex0 and 2 for tex1 */ - OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) | - R300_RS_COUNT_HIRES_EN)); - - /* R300_INST_COUNT_RS - highest RS instruction used */ - OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6)); - - /* Pixel stack frame size. */ - OUT_ACCEL_REG(R300_US_PIXSIZE, 5); - - /* Indirection levels */ - OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) | - R300_FIRST_TEX)); - - /* Set nodes. */ - OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) | - R300_ALU_CODE_SIZE(14) | - R300_TEX_CODE_OFFSET(0) | - R300_TEX_CODE_SIZE(6))); - - /* Nodes are allocated highest first, but executed lowest first */ - OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0); - OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) | - R300_ALU_SIZE(0) | - R300_TEX_START(0) | - R300_TEX_SIZE(0))); - OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) | - R300_ALU_SIZE(9) | - R300_TEX_START(1) | - R300_TEX_SIZE(0))); - OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) | - R300_ALU_SIZE(2) | - R300_TEX_START(2) | - R300_TEX_SIZE(3) | - R300_RGBA_OUT)); - - /* ** BICUBIC FP ** */ - - /* texcoord0 => temp0 - * texcoord1 => temp1 */ - - // first node - /* TEX temp2, temp1.rrr0, tex1, 1D */ - OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) | - R300_TEX_ID(1) | - R300_TEX_SRC_ADDR(1) | - R300_TEX_DST_ADDR(2))); - - /* MOV temp1.r, temp1.ggg0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) | - R300_ALU_RGB_ADDRD(1) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + colorpitch = dst_pitch >> pixel_shift; + colorpitch |= dst_format; + if (RADEONTilingEnabled(pScrn, pPixmap)) + colorpitch |= R300_COLORTILE; - // second node - /* TEX temp1, temp1, tex1, 1D */ - OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) | - R300_TEX_ID(1) | - R300_TEX_SRC_ADDR(1) | - R300_TEX_DST_ADDR(1))); - - /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) | - R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) | - R300_ALU_RGB_ADDRD(3) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) { + isplanar = TRUE; + } + if (isplanar) { + txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0; + txpitch = pPriv->src_pitch; + } else { + if (pPriv->id == FOURCC_UYVY) + txformat1 = R300_TX_FORMAT_YVYU422; + else + txformat1 = R300_TX_FORMAT_VYUY422; - /* MUL temp2.rg, temp2.rrr0, const0.rgb */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) | - R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) | - R300_ALU_RGB_ADDRD(2) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP; - /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) | - R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | - R300_ALU_RGB_ADDR2(3) | - R300_ALU_RGB_ADDRD(4) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + /* pitch is in pixels */ + txpitch = pPriv->src_pitch / 2; + } + txpitch -= 1; + + txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) | + (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) | + R300_TXPITCH_EN); + + info->accel_state->texW[0] = pPriv->w; + info->accel_state->texH[0] = pPriv->h; + + txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_MAG_FILTER_LINEAR | + R300_TX_MIN_FILTER_LINEAR | + (0 << R300_TX_ID_SHIFT)); + + txoffset = pPriv->src_offset; + + BEGIN_ACCEL(6); + OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter); + OUT_ACCEL_REG(R300_TX_FILTER1_0, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1); + OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset); + FINISH_ACCEL(); - /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) | - R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | - R300_ALU_RGB_ADDR2(2) | - R300_ALU_RGB_ADDRD(5) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + txenable = R300_TEX_0_ENABLE; - /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) | - R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | - R300_ALU_RGB_ADDR2(3) | - R300_ALU_RGB_ADDRD(3) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + if (isplanar) { + txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) | + (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) | + R300_TXPITCH_EN); + txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; + txpitch -= 1; + txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_MIN_FILTER_LINEAR | + R300_TX_MAG_FILTER_LINEAR); + + BEGIN_ACCEL(12); + OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT)); + OUT_ACCEL_REG(R300_TX_FILTER1_1, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2); + OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset); + OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT)); + OUT_ACCEL_REG(R300_TX_FILTER1_2, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3); + OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset); + FINISH_ACCEL(); + txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE; + } - /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) | - R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | - R300_ALU_RGB_ADDR2(2) | - R300_ALU_RGB_ADDRD(1) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + if (pPriv->bicubic_enabled) { + /* Size is 128x1 */ + txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) | + (0x0 << R300_TXHEIGHT_SHIFT) | + R300_TXPITCH_EN); + /* Format is 32-bit floats, 4bpp */ + txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16); + /* Pitch is 127 (128-1) */ + txpitch = 0x7f; + /* Tex filter */ + txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) | + R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) | + R300_TX_MIN_FILTER_NEAREST | + R300_TX_MAG_FILTER_NEAREST | + (1 << R300_TX_ID_SHIFT)); - /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR2(1) | - R300_ALU_RGB_ADDRD(1) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + BEGIN_ACCEL(6); + OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter); + OUT_ACCEL_REG(R300_TX_FILTER1_1, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1); + OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset); + FINISH_ACCEL(); - /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR2(3) | - R300_ALU_RGB_ADDRD(2) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + /* Enable tex 1 */ + txenable |= R300_TEX_1_ENABLE; + } - /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR2(5) | - R300_ALU_RGB_ADDRD(3) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + /* setup the VAP */ + if (info->accel_state->has_tcl) { + if (pPriv->bicubic_enabled) + BEGIN_ACCEL(7); + else + BEGIN_ACCEL(6); + } else { + if (pPriv->bicubic_enabled) + BEGIN_ACCEL(5); + else + BEGIN_ACCEL(4); + } - /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR2(4) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + /* These registers define the number, type, and location of data submitted + * to the PVS unit of GA input (when PVS is disabled) + * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is + * enabled. This memory provides the imputs to the vertex shader program + * and ordering is not important. When PVS/TCL is disabled, this field maps + * directly to the GA input memory and the order is signifigant. In + * PVS_BYPASS mode the order is as follows: + * Position + * Point Size + * Color 0-3 + * Textures 0-7 + * Fog + */ + if (pPriv->bicubic_enabled) { + OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0, + ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) | + (0 << R300_SKIP_DWORDS_0_SHIFT) | + (0 << R300_DST_VEC_LOC_0_SHIFT) | + R300_SIGNED_0 | + (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) | + (0 << R300_SKIP_DWORDS_1_SHIFT) | + (6 << R300_DST_VEC_LOC_1_SHIFT) | + R300_SIGNED_1)); + OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1, + ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) | + (0 << R300_SKIP_DWORDS_2_SHIFT) | + (7 << R300_DST_VEC_LOC_2_SHIFT) | + R300_LAST_VEC_2 | + R300_SIGNED_2)); + } else { + OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0, + ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) | + (0 << R300_SKIP_DWORDS_0_SHIFT) | + (0 << R300_DST_VEC_LOC_0_SHIFT) | + R300_SIGNED_0 | + (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) | + (0 << R300_SKIP_DWORDS_1_SHIFT) | + (6 << R300_DST_VEC_LOC_1_SHIFT) | + R300_LAST_VEC_1 | + R300_SIGNED_1)); + } + + /* load the vertex shader + * We pre-load vertex programs in RADEONInit3DEngine(): + * - exa mask/Xv bicubic + * - exa no mask + * - Xv + * Here we select the offset of the vertex program we want to use + */ + if (info->accel_state->has_tcl) { + if (pPriv->bicubic_enabled) { + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0, + ((0 << R300_PVS_FIRST_INST_SHIFT) | + (2 << R300_PVS_XYZW_VALID_INST_SHIFT) | + (2 << R300_PVS_LAST_INST_SHIFT))); + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1, + (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT)); + } else { + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0, + ((5 << R300_PVS_FIRST_INST_SHIFT) | + (6 << R300_PVS_XYZW_VALID_INST_SHIFT) | + (6 << R300_PVS_LAST_INST_SHIFT))); + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1, + (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT)); + } + } + + /* Position and one set of 2 texture coordinates */ + OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT); + if (pPriv->bicubic_enabled) + OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) | + (2 << R300_TEX_1_COMP_CNT_SHIFT))); + else + OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT)); + + OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt); + FINISH_ACCEL(); + + /* setup pixel shader */ + if (pPriv->bicubic_enabled) { + BEGIN_ACCEL(79); + + /* 4 components: 2 for tex0 and 2 for tex1 */ + OUT_ACCEL_REG(R300_RS_COUNT, ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) | + R300_RS_COUNT_HIRES_EN)); + + /* R300_INST_COUNT_RS - highest RS instruction used */ + OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6)); + + /* Pixel stack frame size. */ + OUT_ACCEL_REG(R300_US_PIXSIZE, 5); + + /* Indirection levels */ + OUT_ACCEL_REG(R300_US_CONFIG, ((2 << R300_NLEVEL_SHIFT) | + R300_FIRST_TEX)); + + /* Set nodes. */ + OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) | + R300_ALU_CODE_SIZE(14) | + R300_TEX_CODE_OFFSET(0) | + R300_TEX_CODE_SIZE(6))); + + /* Nodes are allocated highest first, but executed lowest first */ + OUT_ACCEL_REG(R300_US_CODE_ADDR_0, 0); + OUT_ACCEL_REG(R300_US_CODE_ADDR_1, (R300_ALU_START(0) | + R300_ALU_SIZE(0) | + R300_TEX_START(0) | + R300_TEX_SIZE(0))); + OUT_ACCEL_REG(R300_US_CODE_ADDR_2, (R300_ALU_START(1) | + R300_ALU_SIZE(9) | + R300_TEX_START(1) | + R300_TEX_SIZE(0))); + OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(11) | + R300_ALU_SIZE(2) | + R300_TEX_START(2) | + R300_TEX_SIZE(3) | + R300_RGBA_OUT)); + + /* ** BICUBIC FP ** */ + + /* texcoord0 => temp0 + * texcoord1 => temp1 */ + + // first node + /* TEX temp2, temp1.rrr0, tex1, 1D */ + OUT_ACCEL_REG(R300_US_TEX_INST(0), (R300_TEX_INST(R300_TEX_INST_LD) | + R300_TEX_ID(1) | + R300_TEX_SRC_ADDR(1) | + R300_TEX_DST_ADDR(2))); + + /* MOV temp1.r, temp1.ggg0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(1) | + R300_ALU_RGB_ADDRD(1) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDRD(1) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + + // second node + /* TEX temp1, temp1, tex1, 1D */ + OUT_ACCEL_REG(R300_US_TEX_INST(1), (R300_TEX_INST(R300_TEX_INST_LD) | + R300_TEX_ID(1) | + R300_TEX_SRC_ADDR(1) | + R300_TEX_DST_ADDR(1))); + + /* MUL temp3.rg, temp2.ggg0, const0.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(2) | + R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) | + R300_ALU_RGB_ADDRD(3) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(3) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + + /* MUL temp2.rg, temp2.rrr0, const0.rgb */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(2) | + R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(0)) | + R300_ALU_RGB_ADDRD(2) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(2) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* MAD temp4.rg, temp1.ggg0, const1.rgb, temp3.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(1) | + R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | + R300_ALU_RGB_ADDR2(3) | + R300_ALU_RGB_ADDRD(4) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(4) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* MAD temp5.rg, temp1.ggg0, const1.rgb, temp2.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_GGG) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(1) | + R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | + R300_ALU_RGB_ADDR2(2) | + R300_ALU_RGB_ADDRD(5) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(5) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* MAD temp3.rg, temp1.rrr0, const1.rgb, temp3.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(1) | + R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | + R300_ALU_RGB_ADDR2(3) | + R300_ALU_RGB_ADDRD(3) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(3) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* MAD temp1.rg, temp1.rrr0, const1.rgb, temp2.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RRR) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(1) | + R300_ALU_RGB_ADDR1(R300_ALU_RGB_CONST(1)) | + R300_ALU_RGB_ADDR2(2) | + R300_ALU_RGB_ADDRD(1) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(1) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* ADD temp1.rg, temp0.rgb0, temp1.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR2(1) | + R300_ALU_RGB_ADDRD(1) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(1) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* ADD temp2.rg, temp0.rgb0, temp3.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR2(3) | + R300_ALU_RGB_ADDRD(2) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(2) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* ADD temp3.rg, temp0.rgb0, temp5.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR2(5) | + R300_ALU_RGB_ADDRD(3) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(3) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + + /* ADD temp0.rg, temp0.rgb0, temp4.rgb0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(10), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(10), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR2(4) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R | R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(10), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) | + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(10), (R300_ALU_ALPHA_ADDRD(0) | R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - // third node - /* TEX temp4, temp1.rg--, tex0, 1D */ - OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) | - R300_TEX_ID(0) | - R300_TEX_SRC_ADDR(1) | - R300_TEX_DST_ADDR(4))); - - /* TEX temp3, temp3.rg--, tex0, 1D */ - OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) | - R300_TEX_ID(0) | - R300_TEX_SRC_ADDR(3) | - R300_TEX_DST_ADDR(3))); - - /* TEX temp5, temp2.rg--, tex0, 1D */ - OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) | - R300_TEX_ID(0) | - R300_TEX_SRC_ADDR(2) | - R300_TEX_DST_ADDR(5))); - - /* TEX temp0, temp0.rg--, tex0, 1D */ - OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) | - R300_TEX_ID(0) | - R300_TEX_SRC_ADDR(0) | - R300_TEX_DST_ADDR(0))); - - /* LRP temp3, temp1.bbbb, temp4, temp3 -> - * - PRESUB temps, temp4 - temp3 - * - MAD temp3, temp1.bbbb, temps, temp3 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) | - R300_ALU_RGB_ADDR1(4) | - R300_ALU_RGB_ADDR2(1) | - R300_ALU_RGB_ADDRD(3) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + // third node + /* TEX temp4, temp1.rg--, tex0, 1D */ + OUT_ACCEL_REG(R300_US_TEX_INST(2), (R300_TEX_INST(R300_TEX_INST_LD) | + R300_TEX_ID(0) | + R300_TEX_SRC_ADDR(1) | + R300_TEX_DST_ADDR(4))); + + /* TEX temp3, temp3.rg--, tex0, 1D */ + OUT_ACCEL_REG(R300_US_TEX_INST(3), (R300_TEX_INST(R300_TEX_INST_LD) | + R300_TEX_ID(0) | + R300_TEX_SRC_ADDR(3) | + R300_TEX_DST_ADDR(3))); + + /* TEX temp5, temp2.rg--, tex0, 1D */ + OUT_ACCEL_REG(R300_US_TEX_INST(4), (R300_TEX_INST(R300_TEX_INST_LD) | + R300_TEX_ID(0) | + R300_TEX_SRC_ADDR(2) | + R300_TEX_DST_ADDR(5))); + + /* TEX temp0, temp0.rg--, tex0, 1D */ + OUT_ACCEL_REG(R300_US_TEX_INST(5), (R300_TEX_INST(R300_TEX_INST_LD) | + R300_TEX_ID(0) | + R300_TEX_SRC_ADDR(0) | + R300_TEX_DST_ADDR(0))); + + /* LRP temp3, temp1.bbbb, temp4, temp3 -> + * - PRESUB temps, temp4 - temp3 + * - MAD temp3, temp1.bbbb, temps, temp3 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(11), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(11), (R300_ALU_RGB_ADDR0(3) | + R300_ALU_RGB_ADDR1(4) | + R300_ALU_RGB_ADDR2(1) | + R300_ALU_RGB_ADDRD(3) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(11), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) | R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) | R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) | + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(11), (R300_ALU_ALPHA_ADDR0(3) | R300_ALU_ALPHA_ADDR1(4) | R300_ALU_ALPHA_ADDR2(1) | R300_ALU_ALPHA_ADDRD(3) | R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A))); - /* LRP temp0, temp1.bbbb, temp5, temp0 -> - * - PRESUB temps, temp5 - temp0 - * - MAD temp0, temp1.bbbb, temps, temp0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) | - R300_ALU_RGB_INSERT_NOP)); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR1(5) | - R300_ALU_RGB_ADDR2(1) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + /* LRP temp0, temp1.bbbb, temp5, temp0 -> + * - PRESUB temps, temp5 - temp0 + * - MAD temp0, temp1.bbbb, temps, temp0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(12), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0) | + R300_ALU_RGB_INSERT_NOP)); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(12), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR1(5) | + R300_ALU_RGB_ADDR2(1) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(12), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) | R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) | R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) | + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(12), (R300_ALU_ALPHA_ADDR0(0) | R300_ALU_ALPHA_ADDR1(5) | R300_ALU_ALPHA_ADDR2(1) | R300_ALU_ALPHA_ADDRD(0) | R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_A))); - /* LRP output, temp2.bbbb, temp3, temp0 -> - * - PRESUB temps, temp3 - temp0 - * - MAD output, temp2.bbbb, temps, temp0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0))); - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR1(3) | - R300_ALU_RGB_ADDR2(2) | - R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + /* LRP output, temp2.bbbb, temp3, temp0 -> + * - PRESUB temps, temp3 - temp0 + * - MAD output, temp2.bbbb, temps, temp0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(13), (R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC2_BBB) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRCP_RGB) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_SRCP_OP(R300_ALU_RGB_SRCP_OP_RGB1_MINUS_RGB0))); + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(13), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR1(3) | + R300_ALU_RGB_ADDR2(2) | + R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(13), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC2_B) | R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_SRCP_A) | R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_SRC0_A))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) | + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(13), (R300_ALU_ALPHA_ADDR0(0) | R300_ALU_ALPHA_ADDR1(3) | R300_ALU_ALPHA_ADDR2(2) | R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A))); - /* Shader constants. */ - OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w)); - OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0); - OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0); - OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0); - - OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0); - OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h)); - OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0); - OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0); - - FINISH_ACCEL(); - } else if (isplanar) { - /* - * y' = y - .0625 - * u' = u - .5 - * v' = v - .5; - * - * r = 1.1643 * y' + 0.0 * u' + 1.5958 * v' - * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v' - * b = 1.1643 * y' + 2.017 * u' + 0.0 * v' - * - * DP3 might look like the straightforward solution - * but we'd need to move the texture yuv values in - * the same reg for this to work. Therefore use MADs. - * Brightness just adds to the off constant. - * Contrast is multiplication of luminance. - * Saturation and hue change the u and v coeffs. - * Default values (before adjustments - depend on colorspace): - * yco = 1.1643 - * uco = 0, -0.39173, 2.017 - * vco = 1.5958, -0.8129, 0 - * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r], - * -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g], - * -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b], - * - * temp = MAD(yco, yuv.yyyy, off) - * temp = MAD(uco, yuv.uuuu, temp) - * result = MAD(vco, yuv.vvvv, temp) - */ - /* TODO: don't recalc consts always */ - const float Loff = -0.0627; - const float Coff = -0.502; - float uvcosf, uvsinf; - float yco; - float uco[3], vco[3], off[3]; - float bright, cont, gamma; - int ref = pPriv->transform_index; - Bool needgamma = FALSE; - - cont = RTFContrast(pPriv->contrast); - bright = RTFBrightness(pPriv->brightness); - gamma = (float)pPriv->gamma / 1000.0; - uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue)); - uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue)); - /* overlay video also does pre-gamma contrast/sat adjust, should we? */ - - yco = trans[ref].RefLuma * cont; - uco[0] = -trans[ref].RefRCr * uvsinf; - uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf; - uco[2] = trans[ref].RefBCb * uvcosf; - vco[0] = trans[ref].RefRCr * uvcosf; - vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf; - vco[2] = trans[ref].RefBCb * uvsinf; - off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright; - off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright; - off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright; - - if (gamma != 1.0) { - needgamma = TRUE; - /* note: gamma correction is out = in ^ gamma; - gpu can only do LG2/EX2 therefore we transform into - in ^ gamma = 2 ^ (log2(in) * gamma). - Lots of scalar ops, unfortunately (better solution?) - - without gamma that's 3 inst, with gamma it's 10... - could use different gamma factors per channel, - if that's of any use. */ - } - - BEGIN_ACCEL(needgamma ? 28 + 33 : 33); - /* 2 components: same 2 for tex0/1/2 */ - OUT_ACCEL_REG(R300_RS_COUNT, - ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) | - R300_RS_COUNT_HIRES_EN)); - /* R300_INST_COUNT_RS - highest RS instruction used */ - OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6)); - - OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */ - - /* Indirection levels */ - OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) | - R300_FIRST_TEX)); - - OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) | - R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) | - R300_TEX_CODE_OFFSET(0) | - R300_TEX_CODE_SIZE(3))); - - OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) | - R300_ALU_SIZE(needgamma ? 7 + 2 : 2) | - R300_TEX_START(0) | - R300_TEX_SIZE(2) | - R300_RGBA_OUT)); - - /* tex inst */ - OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) | - R300_TEX_DST_ADDR(0) | - R300_TEX_ID(0) | - R300_TEX_INST(R300_TEX_INST_LD))); - OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) | - R300_TEX_DST_ADDR(1) | - R300_TEX_ID(1) | - R300_TEX_INST(R300_TEX_INST_LD))); - OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) | - R300_TEX_DST_ADDR(2) | - R300_TEX_ID(2) | - R300_TEX_INST(R300_TEX_INST_LD))); - - /* ALU inst */ - /* MAD temp0, const0.a, temp0, const0.rgb */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) | - R300_ALU_RGB_ADDR1(0) | - R300_ALU_RGB_ADDR2(0) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) | - R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE))); - /* alpha nop, but need to set up alpha source for rgb usage */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) | - R300_ALU_ALPHA_ADDR1(0) | - R300_ALU_ALPHA_ADDR2(0) | - R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - - /* MAD const1, temp1, temp0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) | - R300_ALU_RGB_ADDR1(1) | - R300_ALU_RGB_ADDR2(0) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) | - R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE))); - /* alpha nop */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - - /* MAD result, const2, temp2, temp0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) | - R300_ALU_RGB_ADDR1(2) | - R300_ALU_RGB_ADDR2(0) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) | - (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | - R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) | - R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) | - R300_ALU_RGB_CLAMP)); - /* write alpha 1 */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) | - R300_ALU_ALPHA_TARGET_A)); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0))); - - if (needgamma) { - /* rgb temp0.r = op_sop, set up src0 reg */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), - R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); - /* alpha lg2 temp0, temp0.r */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - - /* rgb temp0.g = op_sop, set up src0 reg */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), - R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); - /* alpha lg2 temp0, temp0.g */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + /* Shader constants. */ + OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(1.0/(float)pPriv->w)); + OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), 0); + OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), 0); + OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), 0); - /* rgb temp0.b = op_sop, set up src0 reg */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), - R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); - /* alpha lg2 temp0, temp0.b */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), 0); + OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(1.0/(float)pPriv->h)); + OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), 0); + OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0); - /* MUL const1, temp1, temp0 */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR1(0) | - R300_ALU_RGB_ADDR2(0) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) | - R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) | - R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE))); - /* alpha nop, but set up const1 */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - - /* rgb out0.r = op_sop, set up src0 reg */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) | - R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), - R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); - /* alpha ex2 temp0, temp0.r */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - - /* rgb out0.g = op_sop, set up src0 reg */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) | - R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), - R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); - /* alpha ex2 temp0, temp0.g */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - - /* rgb out0.b = op_sop, set up src0 reg */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) | - R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B))); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), - R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); - /* alpha ex2 temp0, temp0.b */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) | - R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); - } - - /* Shader constants. */ - /* constant 0: off, yco */ - OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0])); - OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1])); - OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2])); - OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco)); - /* constant 1: uco */ - OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0])); - OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1])); - OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2])); - OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma)); - /* constant 2: vco */ - OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0])); - OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1])); - OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2])); - OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0)); - - FINISH_ACCEL(); - - } else { - BEGIN_ACCEL(11); - /* 2 components: 2 for tex0 */ - OUT_ACCEL_REG(R300_RS_COUNT, - ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) | - R300_RS_COUNT_HIRES_EN)); - /* R300_INST_COUNT_RS - highest RS instruction used */ - OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6)); - - OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */ - - /* Indirection levels */ - OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) | - R300_FIRST_TEX)); - - OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) | - R300_ALU_CODE_SIZE(1) | - R300_TEX_CODE_OFFSET(0) | - R300_TEX_CODE_SIZE(1))); - - OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) | - R300_ALU_SIZE(0) | - R300_TEX_START(0) | - R300_TEX_SIZE(0) | - R300_RGBA_OUT)); - - /* tex inst */ - OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) | - R300_TEX_DST_ADDR(0) | - R300_TEX_ID(0) | - R300_TEX_INST(R300_TEX_INST_LD))); - - /* ALU inst */ - /* RGB */ - OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) | - R300_ALU_RGB_ADDR1(0) | - R300_ALU_RGB_ADDR2(0) | - R300_ALU_RGB_ADDRD(0) | - R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R | - R300_ALU_RGB_MASK_G | - R300_ALU_RGB_MASK_B)) | - R300_ALU_RGB_TARGET_A)); - OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | - R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | - R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) | - R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | - R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | - R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) | - R300_ALU_RGB_CLAMP)); - /* Alpha */ - OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) | - R300_ALU_ALPHA_ADDR1(0) | - R300_ALU_ALPHA_ADDR2(0) | - R300_ALU_ALPHA_ADDRD(0) | - R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) | - R300_ALU_ALPHA_TARGET_A | - R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE))); - OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) | - R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) | - R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) | - R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) | - R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) | - R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) | - R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | - R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) | - R300_ALU_ALPHA_CLAMP)); - FINISH_ACCEL(); - } - } else { - if (pPriv->bicubic_enabled) { - BEGIN_ACCEL(7); - - /* 4 components: 2 for tex0 and 2 for tex1 */ - OUT_ACCEL_REG(R300_RS_COUNT, - ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) | - R300_RS_COUNT_HIRES_EN)); - - /* R300_INST_COUNT_RS - highest RS instruction used */ - OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6)); - - /* Pixel stack frame size. */ - OUT_ACCEL_REG(R300_US_PIXSIZE, 5); - - /* FP length. */ - OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) | - R500_US_CODE_END_ADDR(13))); - OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) | - R500_US_CODE_RANGE_SIZE(13))); - - /* Prepare for FP emission. */ - OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0); - OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0)); - FINISH_ACCEL(); - - BEGIN_ACCEL(89); - /* Pixel shader. - * I've gone ahead and annotated each instruction, since this - * thing is MASSIVE. :3 - * Note: In order to avoid buggies with temps and multiple - * inputs, all temps are offset by 2. temp0 -> register2. */ - - /* TEX temp2, input1.xxxx, tex1, 1D */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) | - R500_TEX_INST_LD | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) | - R500_TEX_SRC_S_SWIZ_R | - R500_TEX_SRC_T_SWIZ_R | - R500_TEX_SRC_R_SWIZ_R | - R500_TEX_SRC_Q_SWIZ_R | - R500_TEX_DST_ADDR(2) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* TEX temp5, input1.yyyy, tex1, 1D */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) | - R500_TEX_INST_LD | - R500_TEX_SEM_ACQUIRE | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) | - R500_TEX_SRC_S_SWIZ_G | - R500_TEX_SRC_T_SWIZ_G | - R500_TEX_SRC_R_SWIZ_G | - R500_TEX_SRC_Q_SWIZ_G | - R500_TEX_DST_ADDR(5) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* MUL temp4, const0.x0x0, temp2.yyxx */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | - R500_RGB_ADDR0_CONST | - R500_RGB_ADDR1(2))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | - R500_ALPHA_ADDR0_CONST | - R500_ALPHA_ADDR1(2))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | - R500_ALU_RGB_R_SWIZ_A_R | - R500_ALU_RGB_G_SWIZ_A_0 | - R500_ALU_RGB_B_SWIZ_A_R | - R500_ALU_RGB_SEL_B_SRC1 | - R500_ALU_RGB_R_SWIZ_B_G | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_R)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SEL_A_SRC0 | - R500_ALPHA_SWIZ_A_0 | - R500_ALPHA_SEL_B_SRC1 | - R500_ALPHA_SWIZ_B_R)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_R_SWIZ_0 | - R500_ALU_RGBA_G_SWIZ_0 | - R500_ALU_RGBA_B_SWIZ_0 | - R500_ALU_RGBA_A_SWIZ_0)); - - /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | - R500_RGB_ADDR0_CONST | - R500_RGB_ADDR1(5) | - R500_RGB_ADDR2(4))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | - R500_ALPHA_ADDR0_CONST | - R500_ALPHA_ADDR1(5) | - R500_ALPHA_ADDR2(4))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | - R500_ALU_RGB_R_SWIZ_A_0 | - R500_ALU_RGB_G_SWIZ_A_G | - R500_ALU_RGB_B_SWIZ_A_0 | - R500_ALU_RGB_SEL_B_SRC1 | - R500_ALU_RGB_R_SWIZ_B_R | - R500_ALU_RGB_G_SWIZ_B_R | - R500_ALU_RGB_B_SWIZ_B_R)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SEL_A_SRC0 | - R500_ALPHA_SWIZ_A_G | - R500_ALPHA_SEL_B_SRC1 | - R500_ALPHA_SWIZ_B_R)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC2 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_B | - R500_ALU_RGBA_A_SWIZ_A)); - - /* ADD temp3, temp3, input0.xyxy */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) | - R500_RGB_ADDR2(0))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) | - R500_ALPHA_ADDR2(0))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 | - R500_ALU_RGB_G_SWIZ_A_1 | - R500_ALU_RGB_B_SWIZ_A_1 | - R500_ALU_RGB_SEL_B_SRC1 | - R500_ALU_RGB_R_SWIZ_B_R | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SWIZ_A_1 | - R500_ALPHA_SEL_B_SRC1 | - R500_ALPHA_SWIZ_B_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC2 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_R | - R500_ALU_RGBA_A_SWIZ_G)); - - /* TEX temp1, temp3.zwxy, tex0, 2D */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | - R500_TEX_INST_LD | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) | - R500_TEX_SRC_S_SWIZ_B | - R500_TEX_SRC_T_SWIZ_A | - R500_TEX_SRC_R_SWIZ_R | - R500_TEX_SRC_Q_SWIZ_G | - R500_TEX_DST_ADDR(1) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* TEX temp3, temp3.xyzw, tex0, 2D */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | - R500_TEX_INST_LD | - R500_TEX_SEM_ACQUIRE | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) | - R500_TEX_SRC_S_SWIZ_R | - R500_TEX_SRC_T_SWIZ_G | - R500_TEX_SRC_R_SWIZ_B | - R500_TEX_SRC_Q_SWIZ_A | - R500_TEX_DST_ADDR(3) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | - R500_RGB_ADDR0_CONST | - R500_RGB_ADDR1(5) | - R500_RGB_ADDR2(4))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | - R500_ALPHA_ADDR0_CONST | - R500_ALPHA_ADDR1(5) | - R500_ALPHA_ADDR2(4))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | - R500_ALU_RGB_R_SWIZ_A_0 | - R500_ALU_RGB_G_SWIZ_A_G | - R500_ALU_RGB_B_SWIZ_A_0 | - R500_ALU_RGB_SEL_B_SRC1 | - R500_ALU_RGB_R_SWIZ_B_G | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_G)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SEL_A_SRC0 | - R500_ALPHA_SWIZ_A_G | - R500_ALPHA_SEL_B_SRC1 | - R500_ALPHA_SWIZ_B_G)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC2 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_B | - R500_ALU_RGBA_A_SWIZ_A)); - - /* ADD temp0, temp4, input0.xyxy */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) | - R500_RGB_ADDR2(0))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) | - R500_ALPHA_ADDR2(0))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 | - R500_ALU_RGB_G_SWIZ_A_1 | - R500_ALU_RGB_B_SWIZ_A_1 | - R500_ALU_RGB_SEL_B_SRC1 | - R500_ALU_RGB_R_SWIZ_B_R | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SWIZ_A_1 | - R500_ALPHA_SEL_B_SRC1 | - R500_ALPHA_SWIZ_B_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC2 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_R | - R500_ALU_RGBA_A_SWIZ_G)); - - /* TEX temp4, temp0.zwzw, tex0, 2D */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | - R500_TEX_INST_LD | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) | - R500_TEX_SRC_S_SWIZ_B | - R500_TEX_SRC_T_SWIZ_A | - R500_TEX_SRC_R_SWIZ_B | - R500_TEX_SRC_Q_SWIZ_A | - R500_TEX_DST_ADDR(4) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* TEX temp0, temp0.xyzw, tex0, 2D */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | - R500_TEX_INST_LD | - R500_TEX_SEM_ACQUIRE | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) | - R500_TEX_SRC_S_SWIZ_R | - R500_TEX_SRC_T_SWIZ_G | - R500_TEX_SRC_R_SWIZ_B | - R500_TEX_SRC_Q_SWIZ_A | - R500_TEX_DST_ADDR(0) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* LRP temp3, temp2.zzzz, temp1, temp3 -> - * - PRESUB temps, temp1 - temp3 - * - MAD temp2.zzzz, temps, temp3 */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) | - R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 | - R500_RGB_ADDR1(1) | - R500_RGB_ADDR2(2))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) | - R500_ALPHA_SRCP_OP_A1_MINUS_A0 | - R500_ALPHA_ADDR1(1) | - R500_ALPHA_ADDR2(2))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 | - R500_ALU_RGB_R_SWIZ_A_B | - R500_ALU_RGB_G_SWIZ_A_B | - R500_ALU_RGB_B_SWIZ_A_B | - R500_ALU_RGB_SEL_B_SRCP | - R500_ALU_RGB_R_SWIZ_B_R | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SEL_A_SRC2 | - R500_ALPHA_SWIZ_A_B | - R500_ALPHA_SEL_B_SRCP | - R500_ALPHA_SWIZ_B_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC0 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_B | - R500_ALU_RGBA_A_SWIZ_A)); - - /* LRP temp0, temp2.zzzz, temp4, temp0 -> - * - PRESUB temps, temp4 - temp1 - * - MAD temp2.zzzz, temps, temp0 */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | - R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 | - R500_RGB_ADDR1(4) | - R500_RGB_ADDR2(2))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | - R500_ALPHA_SRCP_OP_A1_MINUS_A0 | - R500_ALPHA_ADDR1(4) | - R500_ALPHA_ADDR2(2))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 | - R500_ALU_RGB_R_SWIZ_A_B | - R500_ALU_RGB_G_SWIZ_A_B | - R500_ALU_RGB_B_SWIZ_A_B | - R500_ALU_RGB_SEL_B_SRCP | - R500_ALU_RGB_R_SWIZ_B_R | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SEL_A_SRC2 | - R500_ALPHA_SWIZ_A_B | - R500_ALPHA_SEL_B_SRCP | - R500_ALPHA_SWIZ_B_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC0 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_B | - R500_ALU_RGBA_A_SWIZ_A)); - - /* LRP output, temp5.zzzz, temp3, temp0 -> - * - PRESUB temps, temp3 - temp0 - * - MAD temp5.zzzz, temps, temp0 */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT | - R500_INST_LAST | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK | - R500_INST_RGB_OMASK_R | - R500_INST_RGB_OMASK_G | - R500_INST_RGB_OMASK_B | - R500_INST_ALPHA_OMASK)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | - R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 | - R500_RGB_ADDR1(3) | - R500_RGB_ADDR2(5))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | - R500_ALPHA_SRCP_OP_A1_MINUS_A0 | - R500_ALPHA_ADDR1(3) | - R500_ALPHA_ADDR2(5))); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 | - R500_ALU_RGB_R_SWIZ_A_B | - R500_ALU_RGB_G_SWIZ_A_B | - R500_ALU_RGB_B_SWIZ_A_B | - R500_ALU_RGB_SEL_B_SRCP | - R500_ALU_RGB_R_SWIZ_B_R | - R500_ALU_RGB_G_SWIZ_B_G | - R500_ALU_RGB_B_SWIZ_B_B)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) | - R500_ALPHA_OP_MAD | - R500_ALPHA_SEL_A_SRC2 | - R500_ALPHA_SWIZ_A_B | - R500_ALPHA_SEL_B_SRCP | - R500_ALPHA_SWIZ_B_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) | - R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_SEL_C_SRC0 | - R500_ALU_RGBA_R_SWIZ_R | - R500_ALU_RGBA_G_SWIZ_G | - R500_ALU_RGBA_B_SWIZ_B | - R500_ALU_RGBA_A_SWIZ_A)); - - /* Shader constants. */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0)); - - /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */ - OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w)); - OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h)); - OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0); - OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0); - - FINISH_ACCEL(); - - } else { - BEGIN_ACCEL(19); - /* 2 components: 2 for tex0 */ - OUT_ACCEL_REG(R300_RS_COUNT, - ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) | - R300_RS_COUNT_HIRES_EN)); - - /* R300_INST_COUNT_RS - highest RS instruction used */ - OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6)); - - /* Pixel stack frame size. */ - OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */ - - /* FP length. */ - OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) | - R500_US_CODE_END_ADDR(1))); - OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) | - R500_US_CODE_RANGE_SIZE(1))); - - /* Prepare for FP emission. */ - OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0); - OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0)); - - /* tex inst */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | - R500_INST_TEX_SEM_WAIT | - R500_INST_RGB_WMASK_R | - R500_INST_RGB_WMASK_G | - R500_INST_RGB_WMASK_B | - R500_INST_ALPHA_WMASK | - R500_INST_RGB_CLAMP | - R500_INST_ALPHA_CLAMP)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | - R500_TEX_INST_LD | - R500_TEX_SEM_ACQUIRE | - R500_TEX_IGNORE_UNCOVERED)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) | - R500_TEX_SRC_S_SWIZ_R | - R500_TEX_SRC_T_SWIZ_G | - R500_TEX_DST_ADDR(0) | - R500_TEX_DST_R_SWIZ_R | - R500_TEX_DST_G_SWIZ_G | - R500_TEX_DST_B_SWIZ_B | - R500_TEX_DST_A_SWIZ_A)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) | - R500_DX_S_SWIZ_R | - R500_DX_T_SWIZ_R | - R500_DX_R_SWIZ_R | - R500_DX_Q_SWIZ_R | - R500_DY_ADDR(0) | - R500_DY_S_SWIZ_R | - R500_DY_T_SWIZ_R | - R500_DY_R_SWIZ_R | - R500_DY_Q_SWIZ_R)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); - - /* ALU inst */ - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT | - R500_INST_TEX_SEM_WAIT | - R500_INST_LAST | - R500_INST_RGB_OMASK_R | - R500_INST_RGB_OMASK_G | - R500_INST_RGB_OMASK_B | - R500_INST_ALPHA_OMASK | - R500_INST_RGB_CLAMP | - R500_INST_ALPHA_CLAMP)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | - R500_RGB_ADDR1(0) | - R500_RGB_ADDR1_CONST | - R500_RGB_ADDR2(0) | - R500_RGB_ADDR2_CONST)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | - R500_ALPHA_ADDR1(0) | - R500_ALPHA_ADDR1_CONST | - R500_ALPHA_ADDR2(0) | - R500_ALPHA_ADDR2_CONST)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | - R500_ALU_RGB_R_SWIZ_A_R | - R500_ALU_RGB_G_SWIZ_A_G | - R500_ALU_RGB_B_SWIZ_A_B | - R500_ALU_RGB_SEL_B_SRC0 | - R500_ALU_RGB_R_SWIZ_B_1 | - R500_ALU_RGB_B_SWIZ_B_1 | - R500_ALU_RGB_G_SWIZ_B_1)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD | - R500_ALPHA_SWIZ_A_A | - R500_ALPHA_SWIZ_B_1)); - OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD | - R500_ALU_RGBA_R_SWIZ_0 | - R500_ALU_RGBA_G_SWIZ_0 | - R500_ALU_RGBA_B_SWIZ_0 | - R500_ALU_RGBA_A_SWIZ_0)); - FINISH_ACCEL(); - } - } - - BEGIN_ACCEL(6); - OUT_ACCEL_REG(R300_TX_INVALTAGS, 0); - OUT_ACCEL_REG(R300_TX_ENABLE, txenable); - - OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset); - OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch); - - blendcntl = RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO; - /* no need to enable blending */ - OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, blendcntl); - - OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count); FINISH_ACCEL(); - - } else { - - /* Same for R100/R200 */ - switch (pPixmap->drawable.bitsPerPixel) { - case 16: - if (pPixmap->drawable.depth == 15) - dst_format = RADEON_COLOR_FORMAT_ARGB1555; - else - dst_format = RADEON_COLOR_FORMAT_RGB565; - break; - case 32: - dst_format = RADEON_COLOR_FORMAT_ARGB8888; - break; - default: - return; - } - - if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) { - isplanar = TRUE; + } else if (isplanar) { + /* + * y' = y - .0625 + * u' = u - .5 + * v' = v - .5; + * + * r = 1.1643 * y' + 0.0 * u' + 1.5958 * v' + * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v' + * b = 1.1643 * y' + 2.017 * u' + 0.0 * v' + * + * DP3 might look like the straightforward solution + * but we'd need to move the texture yuv values in + * the same reg for this to work. Therefore use MADs. + * Brightness just adds to the off constant. + * Contrast is multiplication of luminance. + * Saturation and hue change the u and v coeffs. + * Default values (before adjustments - depend on colorspace): + * yco = 1.1643 + * uco = 0, -0.39173, 2.017 + * vco = 1.5958, -0.8129, 0 + * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r], + * -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g], + * -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b], + * + * temp = MAD(yco, yuv.yyyy, off) + * temp = MAD(uco, yuv.uuuu, temp) + * result = MAD(vco, yuv.vvvv, temp) + */ + /* TODO: don't recalc consts always */ + const float Loff = -0.0627; + const float Coff = -0.502; + float uvcosf, uvsinf; + float yco; + float uco[3], vco[3], off[3]; + float bright, cont, gamma; + int ref = pPriv->transform_index; + Bool needgamma = FALSE; + + cont = RTFContrast(pPriv->contrast); + bright = RTFBrightness(pPriv->brightness); + gamma = (float)pPriv->gamma / 1000.0; + uvcosf = RTFSaturation(pPriv->saturation) * cos(RTFHue(pPriv->hue)); + uvsinf = RTFSaturation(pPriv->saturation) * sin(RTFHue(pPriv->hue)); + /* overlay video also does pre-gamma contrast/sat adjust, should we? */ + + yco = trans[ref].RefLuma * cont; + uco[0] = -trans[ref].RefRCr * uvsinf; + uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf; + uco[2] = trans[ref].RefBCb * uvcosf; + vco[0] = trans[ref].RefRCr * uvcosf; + vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf; + vco[2] = trans[ref].RefBCb * uvsinf; + off[0] = Loff * yco + Coff * (uco[0] + vco[0]) + bright; + off[1] = Loff * yco + Coff * (uco[1] + vco[1]) + bright; + off[2] = Loff * yco + Coff * (uco[2] + vco[2]) + bright; + + if (gamma != 1.0) { + needgamma = TRUE; + /* note: gamma correction is out = in ^ gamma; + gpu can only do LG2/EX2 therefore we transform into + in ^ gamma = 2 ^ (log2(in) * gamma). + Lots of scalar ops, unfortunately (better solution?) - + without gamma that's 3 inst, with gamma it's 10... + could use different gamma factors per channel, + if that's of any use. */ } - if (isplanar) { - txformat = RADEON_TXFORMAT_I8; - } else { - if (pPriv->id == FOURCC_UYVY) - txformat = RADEON_TXFORMAT_YVYU422; - else - txformat = RADEON_TXFORMAT_VYUY422; + BEGIN_ACCEL(needgamma ? 28 + 33 : 33); + /* 2 components: same 2 for tex0/1/2 */ + OUT_ACCEL_REG(R300_RS_COUNT, + ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) | + R300_RS_COUNT_HIRES_EN)); + /* R300_INST_COUNT_RS - highest RS instruction used */ + OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6)); + + OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */ + + /* Indirection levels */ + OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) | + R300_FIRST_TEX)); + + OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) | + R300_ALU_CODE_SIZE(needgamma ? 7 + 3 : 3) | + R300_TEX_CODE_OFFSET(0) | + R300_TEX_CODE_SIZE(3))); + + OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) | + R300_ALU_SIZE(needgamma ? 7 + 2 : 2) | + R300_TEX_START(0) | + R300_TEX_SIZE(2) | + R300_RGBA_OUT)); + + /* tex inst */ + OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) | + R300_TEX_DST_ADDR(0) | + R300_TEX_ID(0) | + R300_TEX_INST(R300_TEX_INST_LD))); + OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) | + R300_TEX_DST_ADDR(1) | + R300_TEX_ID(1) | + R300_TEX_INST(R300_TEX_INST_LD))); + OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) | + R300_TEX_DST_ADDR(2) | + R300_TEX_ID(2) | + R300_TEX_INST(R300_TEX_INST_LD))); + + /* ALU inst */ + /* MAD temp0, const0.a, temp0, const0.rgb */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) | + R300_ALU_RGB_ADDR1(0) | + R300_ALU_RGB_ADDR2(0) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) | + R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE))); + /* alpha nop, but need to set up alpha source for rgb usage */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) | + R300_ALU_ALPHA_ADDR1(0) | + R300_ALU_ALPHA_ADDR2(0) | + R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* MAD const1, temp1, temp0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) | + R300_ALU_RGB_ADDR1(1) | + R300_ALU_RGB_ADDR2(0) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) | + R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE))); + /* alpha nop */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* MAD result, const2, temp2, temp0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) | + R300_ALU_RGB_ADDR1(2) | + R300_ALU_RGB_ADDR2(0) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) | + (needgamma ? 0 : R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) | + R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) | + R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) | + R300_ALU_RGB_CLAMP)); + /* write alpha 1 */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(2), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) | + R300_ALU_ALPHA_TARGET_A)); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(2), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0))); + + if (needgamma) { + /* rgb temp0.r = op_sop, set up src0 reg */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(3), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(3), + R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); + /* alpha lg2 temp0, temp0.r */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(3), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(3), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* rgb temp0.g = op_sop, set up src0 reg */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(4), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(4), + R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); + /* alpha lg2 temp0, temp0.g */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* rgb temp0.b = op_sop, set up src0 reg */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(5), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(5), + R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); + /* alpha lg2 temp0, temp0.b */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(5), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(5), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_LN2) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* MUL const1, temp1, temp0 */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(6), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR1(0) | + R300_ALU_RGB_ADDR2(0) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(6), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC0_AAA) | + R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) | + R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE))); + /* alpha nop, but set up const1 */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(6), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(1)) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(6), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* rgb out0.r = op_sop, set up src0 reg */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(7), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_R) | + R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_R))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(7), + R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); + /* alpha ex2 temp0, temp0.r */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(7), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(7), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_R) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* rgb out0.g = op_sop, set up src0 reg */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(8), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_G) | + R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_G))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(8), + R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); + /* alpha ex2 temp0, temp0.g */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(8), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(8), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_G) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); + + /* rgb out0.b = op_sop, set up src0 reg */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(9), (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_B) | + R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_B))); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST(9), + R300_ALU_RGB_OP(R300_ALU_RGB_OP_SOP) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)); + /* alpha ex2 temp0, temp0.b */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(9), (R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(9), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_EX2) | + R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_B) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0))); } - txformat |= RADEON_TXFORMAT_NON_POWER2; - - colorpitch = dst_pitch >> pixel_shift; + /* Shader constants. */ + /* constant 0: off, yco */ + OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0])); + OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1])); + OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2])); + OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco)); + /* constant 1: uco */ + OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0])); + OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1])); + OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2])); + OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(gamma)); + /* constant 2: vco */ + OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0])); + OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1])); + OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2])); + OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0)); - if (RADEONTilingEnabled(pScrn, pPixmap)) - colorpitch |= RADEON_COLOR_TILE_ENABLE; - - BEGIN_ACCEL(4); - - OUT_ACCEL_REG(RADEON_RB3D_CNTL, - dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/); - OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset); - - OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch); - - OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, - RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO); + FINISH_ACCEL(); + } else { + BEGIN_ACCEL(11); + /* 2 components: 2 for tex0 */ + OUT_ACCEL_REG(R300_RS_COUNT, + ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) | + R300_RS_COUNT_HIRES_EN)); + /* R300_INST_COUNT_RS - highest RS instruction used */ + OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6)); + + OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */ + + /* Indirection levels */ + OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) | + R300_FIRST_TEX)); + + OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) | + R300_ALU_CODE_SIZE(1) | + R300_TEX_CODE_OFFSET(0) | + R300_TEX_CODE_SIZE(1))); + + OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) | + R300_ALU_SIZE(0) | + R300_TEX_START(0) | + R300_TEX_SIZE(0) | + R300_RGBA_OUT)); + + /* tex inst */ + OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) | + R300_TEX_DST_ADDR(0) | + R300_TEX_ID(0) | + R300_TEX_INST(R300_TEX_INST_LD))); + + /* ALU inst */ + /* RGB */ + OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR_0, (R300_ALU_RGB_ADDR0(0) | + R300_ALU_RGB_ADDR1(0) | + R300_ALU_RGB_ADDR2(0) | + R300_ALU_RGB_ADDRD(0) | + R300_ALU_RGB_OMASK((R300_ALU_RGB_MASK_R | + R300_ALU_RGB_MASK_G | + R300_ALU_RGB_MASK_B)) | + R300_ALU_RGB_TARGET_A)); + OUT_ACCEL_REG(R300_US_ALU_RGB_INST_0, (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) | + R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_B(R300_ALU_RGB_1_0) | + R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_SEL_C(R300_ALU_RGB_0_0) | + R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) | + R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) | + R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) | + R300_ALU_RGB_CLAMP)); + /* Alpha */ + OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR_0, (R300_ALU_ALPHA_ADDR0(0) | + R300_ALU_ALPHA_ADDR1(0) | + R300_ALU_ALPHA_ADDR2(0) | + R300_ALU_ALPHA_ADDRD(0) | + R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) | + R300_ALU_ALPHA_TARGET_A | + R300_ALU_ALPHA_OMASK_W(R300_ALU_ALPHA_MASK_NONE))); + OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST_0, (R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_SRC0_A) | + R300_ALU_ALPHA_MOD_A(R300_ALU_ALPHA_MOD_NOP) | + R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_1_0) | + R300_ALU_ALPHA_MOD_B(R300_ALU_ALPHA_MOD_NOP) | + R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0) | + R300_ALU_ALPHA_MOD_C(R300_ALU_ALPHA_MOD_NOP) | + R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) | + R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) | + R300_ALU_ALPHA_CLAMP)); FINISH_ACCEL(); + } + BEGIN_ACCEL(6); + OUT_ACCEL_REG(R300_TX_INVALTAGS, 0); + OUT_ACCEL_REG(R300_TX_ENABLE, txenable); - if (IS_R200_3D) { - - info->accel_state->texW[0] = pPriv->w; - info->accel_state->texH[0] = pPriv->h; - - if (isplanar) { - /* note: in contrast to r300, use input biasing on uv components */ - const float Loff = -0.0627; - float uvcosf, uvsinf; - float yco, yoff; - float uco[3], vco[3]; - float bright, cont, sat; - int ref = pPriv->transform_index; - float ucscale = 0.25, vcscale = 0.25; - Bool needux8 = FALSE, needvx8 = FALSE; - - /* contrast can cause constant overflow, clamp */ - cont = RTFContrast(pPriv->contrast); - if (cont * trans[ref].RefLuma > 2.0) - cont = 2.0 / trans[ref].RefLuma; - /* brightness is only from -0.5 to 0.5 should be safe */ - bright = RTFBrightness(pPriv->brightness); - /* saturation can also cause overflow, clamp */ - sat = RTFSaturation(pPriv->saturation); - if (sat * trans[ref].RefBCb > 4.0) - sat = 4.0 / trans[ref].RefBCb; - uvcosf = sat * cos(RTFHue(pPriv->hue)); - uvsinf = sat * sin(RTFHue(pPriv->hue)); - - yco = trans[ref].RefLuma * cont; - uco[0] = -trans[ref].RefRCr * uvsinf; - uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf; - uco[2] = trans[ref].RefBCb * uvcosf; - vco[0] = trans[ref].RefRCr * uvcosf; - vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf; - vco[2] = trans[ref].RefBCb * uvsinf; - yoff = Loff * yco + bright; - - if ((uco[0] > 2.0) || (uco[2] > 2.0)) { - needux8 = TRUE; - ucscale = 0.125; - } - if ((vco[0] > 2.0) || (vco[2] > 2.0)) { - needvx8 = TRUE; - vcscale = 0.125; - } - - /* need 2 texcoord sets (even though they are identical) due - to denormalization! hw apparently can't premultiply - same coord set by different texture size */ - vtx_count = 6; - - txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) | - (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT)); - txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; - txpitch -= 32; - txfilter = R200_MAG_FILTER_LINEAR | - R200_MIN_FILTER_LINEAR | - R200_CLAMP_S_CLAMP_LAST | - R200_CLAMP_T_CLAMP_LAST; - - BEGIN_ACCEL(36); - - OUT_ACCEL_REG(RADEON_PP_CNTL, - RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE | - RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE | - RADEON_TEX_BLEND_2_ENABLE); - - OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY); - OUT_ACCEL_REG(R200_SE_VTX_FMT_1, - (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) | - (2 << R200_VTX_TEX1_COMP_CNT_SHIFT)); - - OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter); - OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat); - OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0); - OUT_ACCEL_REG(R200_PP_TXSIZE_0, - (pPriv->w - 1) | - ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); - OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32); - OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset); - - OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter); - OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1); - OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0); - OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0); - OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch); - OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset); - - OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter); - OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1); - OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0); - OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0); - OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch); - OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset); - - /* similar to r300 code. Note the big problem is that hardware constants - * are 8 bits only, representing 0.0-1.0. We can get that up (using bias - * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually - * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but - * the constants not. To get larger range can use output scale, but for - * that 2.018 value we need a total scale by 8, which means the constants - * really have no accuracy whatsoever (5 fractional bits only). - * The only direct way to get high precision "constants" into the fragment - * pipe I know of is to use the texcoord interpolator (not color, this one - * is 8 bit only too), which seems a bit expensive. We're lucky though it - * seems the values we need seem to fit better than worst case (get about - * 6 fractional bits for this instead of 5, at least when not correcting for - * hue/saturation/contrast/brightness, which is the same as for vco - yco and - * yoff get 8 fractional bits). Try to preserve as much accuracy as possible - * even with non-default saturation/hue/contrast/brightness adjustments, - * it gets a little crazy and ultimately precision might still be lacking. - * - * A higher precision (8 fractional bits) version might just put uco into - * a texcoord, and calculate a new vcoconst in the shader, like so: - * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable - * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0} - * vcocalc = ADD temp, bias/scale(cohelper), vco - * would in total use 4 tex units, 4 instructions which seems fairly - * balanced for this architecture (instead of 3 + 3 for the solution here) - * - * temp = MAD(yco, yuv.yyyy, yoff) - * temp = MAD(uco, yuv.uuuu, temp) - * result = MAD(vco, yuv.vvvv, temp) - * - * note first mad produces actually scalar, hence we transform - * it into a dp2a to get 8 bit precision of yco instead of 7 - - * That's assuming hw correctly expands consts to internal precision. - * (y * 1 + y * (yco - 1) + yoff) - * temp = DP2A / 2 (yco, yuv.yyyy, yoff) - * temp = MAD (uco / 4, yuv.uuuu * 2, temp) - * result = MAD x2 (vco / 2, yuv.vvvv, temp) - * - * vco, uco need bias (and hence scale too) - * - */ + OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset); + OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch); - /* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */ - OUT_ACCEL_REG(R200_PP_TXCBLEND_0, - R200_TXC_ARG_A_TFACTOR_COLOR | - R200_TXC_ARG_B_R0_COLOR | - R200_TXC_ARG_C_TFACTOR_COLOR | - (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) | - R200_TXC_OP_DOT2_ADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, - (0 << R200_TXC_TFACTOR_SEL_SHIFT) | - R200_TXC_SCALE_INV2 | - R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0); - OUT_ACCEL_REG(R200_PP_TXABLEND_0, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_ZERO | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_0, - R200_TXA_OUTPUT_REG_NONE); - - /* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */ - OUT_ACCEL_REG(R200_PP_TXCBLEND_1, - R200_TXC_ARG_A_TFACTOR_COLOR | - R200_TXC_BIAS_ARG_A | - R200_TXC_SCALE_ARG_A | - R200_TXC_ARG_B_R1_COLOR | - R200_TXC_BIAS_ARG_B | - (needux8 ? R200_TXC_SCALE_ARG_B : 0) | - R200_TXC_ARG_C_R0_COLOR | - R200_TXC_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_1, - (1 << R200_TXC_TFACTOR_SEL_SHIFT) | - R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0); - OUT_ACCEL_REG(R200_PP_TXABLEND_1, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_ZERO | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_1, - R200_TXA_OUTPUT_REG_NONE); - - /* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */ - OUT_ACCEL_REG(R200_PP_TXCBLEND_2, - R200_TXC_ARG_A_TFACTOR_COLOR | - R200_TXC_BIAS_ARG_A | - R200_TXC_SCALE_ARG_A | - R200_TXC_ARG_B_R2_COLOR | - R200_TXC_BIAS_ARG_B | - (needvx8 ? R200_TXC_SCALE_ARG_B : 0) | - R200_TXC_ARG_C_R0_COLOR | - R200_TXC_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_2, - (2 << R200_TXC_TFACTOR_SEL_SHIFT) | - R200_TXC_SCALE_2X | - R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0); - OUT_ACCEL_REG(R200_PP_TXABLEND_2, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_ZERO | - R200_TXA_COMP_ARG_C | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_2, - R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0); - - /* shader constants */ - OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */ - yco > 1.0 ? yco - 1.0: yco, - yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */ - 0.0)); - OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */ - uco[1] * ucscale + 0.5, /* or [-2, 2] */ - uco[2] * ucscale + 0.5, - 0.0)); - OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */ - vco[1] * vcscale + 0.5, /* or [-4, 4] */ - vco[2] * vcscale + 0.5, - 0.0)); - - FINISH_ACCEL(); - } - else if (info->ChipFamily == CHIP_FAMILY_RV250) { - /* fix up broken packed yuv - shader same as above except - yuv components are all in same reg */ - /* note: in contrast to r300, use input biasing on uv components */ - const float Loff = -0.0627; - float uvcosf, uvsinf; - float yco, yoff; - float uco[3], vco[3]; - float bright, cont, sat; - int ref = pPriv->transform_index; - float ucscale = 0.25, vcscale = 0.25; - Bool needux8 = FALSE, needvx8 = FALSE; - - /* contrast can cause constant overflow, clamp */ - cont = RTFContrast(pPriv->contrast); - if (cont * trans[ref].RefLuma > 2.0) - cont = 2.0 / trans[ref].RefLuma; - /* brightness is only from -0.5 to 0.5 should be safe */ - bright = RTFBrightness(pPriv->brightness); - /* saturation can also cause overflow, clamp */ - sat = RTFSaturation(pPriv->saturation); - if (sat * trans[ref].RefBCb > 4.0) - sat = 4.0 / trans[ref].RefBCb; - uvcosf = sat * cos(RTFHue(pPriv->hue)); - uvsinf = sat * sin(RTFHue(pPriv->hue)); - - yco = trans[ref].RefLuma * cont; - uco[0] = -trans[ref].RefRCr * uvsinf; - uco[1] = trans[ref].RefGCb * uvcosf - trans[ref].RefGCr * uvsinf; - uco[2] = trans[ref].RefBCb * uvcosf; - vco[0] = trans[ref].RefRCr * uvcosf; - vco[1] = trans[ref].RefGCb * uvsinf + trans[ref].RefGCr * uvcosf; - vco[2] = trans[ref].RefBCb * uvsinf; - yoff = Loff * yco + bright; - - if ((uco[0] > 2.0) || (uco[2] > 2.0)) { - needux8 = TRUE; - ucscale = 0.125; - } - if ((vco[0] > 2.0) || (vco[2] > 2.0)) { - needvx8 = TRUE; - vcscale = 0.125; - } - - txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) | - (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT)); - txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; - txpitch -= 32; - txfilter = R200_MAG_FILTER_LINEAR | - R200_MIN_FILTER_LINEAR | - R200_CLAMP_S_CLAMP_LAST | - R200_CLAMP_T_CLAMP_LAST; - - BEGIN_ACCEL(24); - - OUT_ACCEL_REG(RADEON_PP_CNTL, - RADEON_TEX_0_ENABLE | - RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE | - RADEON_TEX_BLEND_2_ENABLE); - - OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY); - OUT_ACCEL_REG(R200_SE_VTX_FMT_1, - (2 << R200_VTX_TEX0_COMP_CNT_SHIFT)); - - OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter); - OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat); - OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0); - OUT_ACCEL_REG(R200_PP_TXSIZE_0, - (pPriv->w - 1) | - ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); - OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32); - OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset); - - /* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */ - OUT_ACCEL_REG(R200_PP_TXCBLEND_0, - R200_TXC_ARG_A_TFACTOR_COLOR | - R200_TXC_ARG_B_R0_COLOR | - R200_TXC_ARG_C_TFACTOR_COLOR | - (yoff < 0 ? R200_TXC_NEG_ARG_C : 0) | - R200_TXC_OP_DOT2_ADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, - (0 << R200_TXC_TFACTOR_SEL_SHIFT) | - R200_TXC_SCALE_INV2 | - (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) | - R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1); - OUT_ACCEL_REG(R200_PP_TXABLEND_0, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_ZERO | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_0, - R200_TXA_OUTPUT_REG_NONE); - - /* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */ - OUT_ACCEL_REG(R200_PP_TXCBLEND_1, - R200_TXC_ARG_A_TFACTOR_COLOR | - R200_TXC_BIAS_ARG_A | - R200_TXC_SCALE_ARG_A | - R200_TXC_ARG_B_R0_COLOR | - R200_TXC_BIAS_ARG_B | - (needux8 ? R200_TXC_SCALE_ARG_B : 0) | - R200_TXC_ARG_C_R1_COLOR | - R200_TXC_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_1, - (1 << R200_TXC_TFACTOR_SEL_SHIFT) | - (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) | - R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1); - OUT_ACCEL_REG(R200_PP_TXABLEND_1, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_ZERO | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_1, - R200_TXA_OUTPUT_REG_NONE); - - /* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */ - OUT_ACCEL_REG(R200_PP_TXCBLEND_2, - R200_TXC_ARG_A_TFACTOR_COLOR | - R200_TXC_BIAS_ARG_A | - R200_TXC_SCALE_ARG_A | - R200_TXC_ARG_B_R0_COLOR | - R200_TXC_BIAS_ARG_B | - (needvx8 ? R200_TXC_SCALE_ARG_B : 0) | - R200_TXC_ARG_C_R1_COLOR | - R200_TXC_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_2, - (2 << R200_TXC_TFACTOR_SEL_SHIFT) | - R200_TXC_SCALE_2X | - (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) | - R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0); - OUT_ACCEL_REG(R200_PP_TXABLEND_2, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_ZERO | - R200_TXA_COMP_ARG_C | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_2, - R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0); - - /* shader constants */ - OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(yco > 1.0 ? 1.0 : 0.0, /* range special [0, 2] */ - yco > 1.0 ? yco - 1.0: yco, - yoff < 0 ? -yoff : yoff, /* range special [-1, 1] */ - 0.0)); - OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * ucscale + 0.5, /* range [-4, 4] */ - uco[1] * ucscale + 0.5, /* or [-2, 2] */ - uco[2] * ucscale + 0.5, - 0.0)); - OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * vcscale + 0.5, /* range [-2, 2] */ - vco[1] * vcscale + 0.5, /* or [-4, 4] */ - vco[2] * vcscale + 0.5, - 0.0)); - - FINISH_ACCEL(); - } - else { - BEGIN_ACCEL(13); - OUT_ACCEL_REG(RADEON_PP_CNTL, - RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE); - - OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY); - OUT_ACCEL_REG(R200_SE_VTX_FMT_1, - (2 << R200_VTX_TEX0_COMP_CNT_SHIFT)); - - OUT_ACCEL_REG(R200_PP_TXFILTER_0, - R200_MAG_FILTER_LINEAR | - R200_MIN_FILTER_LINEAR | - R200_CLAMP_S_CLAMP_LAST | - R200_CLAMP_T_CLAMP_LAST | - R200_YUV_TO_RGB); - OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat); - OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0); - OUT_ACCEL_REG(R200_PP_TXSIZE_0, - (pPriv->w - 1) | - ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); - OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32); - - OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset); - - OUT_ACCEL_REG(R200_PP_TXCBLEND_0, - R200_TXC_ARG_A_ZERO | - R200_TXC_ARG_B_ZERO | - R200_TXC_ARG_C_R0_COLOR | - R200_TXC_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, - R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0); - OUT_ACCEL_REG(R200_PP_TXABLEND_0, - R200_TXA_ARG_A_ZERO | - R200_TXA_ARG_B_ZERO | - R200_TXA_ARG_C_R0_ALPHA | - R200_TXA_OP_MADD); - OUT_ACCEL_REG(R200_PP_TXABLEND2_0, - R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0); - FINISH_ACCEL(); - } - } else { + /* no need to enable blending */ + OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO); - info->accel_state->texW[0] = 1; - info->accel_state->texH[0] = 1; - - BEGIN_ACCEL(9); - - OUT_ACCEL_REG(RADEON_PP_CNTL, - RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE); - - OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY | - RADEON_SE_VTX_FMT_ST0)); - - OUT_ACCEL_REG(RADEON_PP_TXFILTER_0, - RADEON_MAG_FILTER_LINEAR | - RADEON_MIN_FILTER_LINEAR | - RADEON_CLAMP_S_CLAMP_LAST | - RADEON_CLAMP_T_CLAMP_LAST | - RADEON_YUV_TO_RGB); - OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat); - OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, pPriv->src_offset); - OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0, - RADEON_COLOR_ARG_A_ZERO | - RADEON_COLOR_ARG_B_ZERO | - RADEON_COLOR_ARG_C_T0_COLOR | - RADEON_BLEND_CTL_ADD | - RADEON_CLAMP_TX); - OUT_ACCEL_REG(RADEON_PP_TXABLEND_0, - RADEON_ALPHA_ARG_A_ZERO | - RADEON_ALPHA_ARG_B_ZERO | - RADEON_ALPHA_ARG_C_T0_ALPHA | - RADEON_BLEND_CTL_ADD | - RADEON_CLAMP_TX); - - OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0, - (pPriv->w - 1) | - ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT)); - OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0, - pPriv->src_pitch - 32); - FINISH_ACCEL(); - } - } + OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count); + FINISH_ACCEL(); if (pPriv->vsync) { xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn, @@ -2257,92 +2030,49 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv ErrorF("src: %d, %d, %d, %d\n", srcX, srcY, srcw, srch); #endif - if (IS_R300_3D || IS_R500_3D) { - if (IS_R300_3D && ((dstw+dsth) > 2880)) - use_quad = TRUE; - /* - * Set up the scissor area to that of the output size. - */ - BEGIN_ACCEL(2); - if (IS_R300_3D) { - /* R300 has an offset */ - OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) | - ((dstY + 1088) << R300_SCISSOR_Y_SHIFT))); - OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) | - ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT))); - } else { - OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) | - ((dstY) << R300_SCISSOR_Y_SHIFT))); - OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) | - ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT))); - } - FINISH_ACCEL(); - } + if (IS_R300_3D && ((dstw+dsth) > 2880)) + use_quad = TRUE; + /* + * Set up the scissor area to that of the output size. + */ + BEGIN_ACCEL(2); + /* R300 has an offset */ + OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX + 1088) << R300_SCISSOR_X_SHIFT) | + ((dstY + 1088) << R300_SCISSOR_Y_SHIFT))); + OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw + 1088 - 1) << R300_SCISSOR_X_SHIFT) | + ((dstY + dsth + 1088 - 1) << R300_SCISSOR_Y_SHIFT))); + FINISH_ACCEL(); #ifdef ACCEL_CP - if (info->ChipFamily < CHIP_FAMILY_R200) { - BEGIN_RING(3 * vtx_count + 3); - OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD, - 3 * vtx_count + 1)); - OUT_RING(RADEON_CP_VC_FRMT_XY | - RADEON_CP_VC_FRMT_ST0); - OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST | + if (use_quad) { + BEGIN_RING(4 * vtx_count + 4); + OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, + 4 * vtx_count)); + OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST | RADEON_CP_VC_CNTL_PRIM_WALK_RING | - RADEON_CP_VC_CNTL_MAOS_ENABLE | - RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE | - (3 << RADEON_CP_VC_CNTL_NUM_SHIFT)); - } else if (IS_R300_3D || IS_R500_3D) { - if (use_quad) { - BEGIN_RING(4 * vtx_count + 4); - OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, - 4 * vtx_count)); - OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_QUAD_LIST | - RADEON_CP_VC_CNTL_PRIM_WALK_RING | - (4 << RADEON_CP_VC_CNTL_NUM_SHIFT)); - } else { - BEGIN_RING(3 * vtx_count + 4); - OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, - 3 * vtx_count)); - OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST | - RADEON_CP_VC_CNTL_PRIM_WALK_RING | - (3 << RADEON_CP_VC_CNTL_NUM_SHIFT)); - } + (4 << RADEON_CP_VC_CNTL_NUM_SHIFT)); } else { - BEGIN_RING(3 * vtx_count + 2); + BEGIN_RING(3 * vtx_count + 4); OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, 3 * vtx_count)); - OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST | + OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST | RADEON_CP_VC_CNTL_PRIM_WALK_RING | (3 << RADEON_CP_VC_CNTL_NUM_SHIFT)); } #else /* ACCEL_CP */ - if (IS_R300_3D || IS_R500_3D) { - if (use_quad) - BEGIN_ACCEL(2 + vtx_count * 4); - else - BEGIN_ACCEL(2 + vtx_count * 3); - } else - BEGIN_ACCEL(1 + vtx_count * 3); - - if (info->ChipFamily < CHIP_FAMILY_R200) - OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST | + if (use_quad) + BEGIN_ACCEL(2 + vtx_count * 4); + else + BEGIN_ACCEL(2 + vtx_count * 3); + + if (use_quad) + OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST | RADEON_VF_PRIM_WALK_DATA | - RADEON_VF_RADEON_MODE | - (3 << RADEON_VF_NUM_VERTICES_SHIFT))); - else if (IS_R300_3D || IS_R500_3D) { - if (use_quad) - OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST | - RADEON_VF_PRIM_WALK_DATA | - (4 << RADEON_VF_NUM_VERTICES_SHIFT))); - else - OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST | - RADEON_VF_PRIM_WALK_DATA | - (3 << RADEON_VF_NUM_VERTICES_SHIFT))); - } else - OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_RECTANGLE_LIST | + (4 << RADEON_VF_NUM_VERTICES_SHIFT))); + else + OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST | RADEON_VF_PRIM_WALK_DATA | (3 << RADEON_VF_NUM_VERTICES_SHIFT))); - #endif if (pPriv->bicubic_enabled) { /* @@ -2376,61 +2106,33 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv (float)srcY + 0.5); } } else { - if (IS_R300_3D || IS_R500_3D) { - if (use_quad) { - VTX_OUT((float)dstX, (float)dstY, - (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); - VTX_OUT((float)dstX, (float)(dstY + dsth), - (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); - VTX_OUT((float)(dstX + dstw), (float)(dstY + dsth), - (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); - VTX_OUT((float)(dstX + dstw), (float)dstY, - (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); - } else { - /* - * Render a big, scissored triangle. This means - * increasing the triangle size and adjusting - * texture coordinates. - */ - VTX_OUT((float)dstX, (float)dstY, - (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); - VTX_OUT((float)dstX, (float)(dstY + dsth + dstw), - (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]); - - VTX_OUT((float)(dstX + dstw + dsth), (float)dstY, - ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0], - (float)srcY / info->accel_state->texH[0]); - } - } else if (isplanar) { - /* - * Just render a rect (using three coords). - * Filter is a bit a misnomer, it's just texcoords... - */ - VTX_OUT_FILTER((float)dstX, (float)(dstY + dsth), - (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0], - (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); - VTX_OUT_FILTER((float)(dstX + dstw), (float)(dstY + dsth), - (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0], - (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); - VTX_OUT_FILTER((float)(dstX + dstw), (float)dstY, - (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0], - (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); - } else { - /* - * Just render a rect (using three coords). - */ + if (use_quad) { + VTX_OUT((float)dstX, (float)dstY, + (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); VTX_OUT((float)dstX, (float)(dstY + dsth), (float)srcX / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); VTX_OUT((float)(dstX + dstw), (float)(dstY + dsth), (float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]); VTX_OUT((float)(dstX + dstw), (float)dstY, (float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); + } else { + /* + * Render a big, scissored triangle. This means + * increasing the triangle size and adjusting + * texture coordinates. + */ + VTX_OUT((float)dstX, (float)dstY, + (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); + VTX_OUT((float)dstX, (float)(dstY + dsth + dstw), + (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]); + VTX_OUT((float)(dstX + dstw + dsth), (float)dstY, + ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0], + (float)srcY / info->accel_state->texH[0]); } } - if (IS_R300_3D || IS_R500_3D) - /* flushing is pipelined, free/finish is not */ - OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D); + /* flushing is pipelined, free/finish is not */ + OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D); #ifdef ACCEL_CP ADVANCE_RING(); @@ -2441,12 +2143,1000 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv pBox++; } - if (IS_R300_3D || IS_R500_3D) { - BEGIN_ACCEL(3); - OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA); - OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL); + BEGIN_ACCEL(3); + OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA); + OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL); + OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN); + FINISH_ACCEL(); + + DamageDamageRegion(pPriv->pDraw, &pPriv->clip); +} + +static void +FUNC_NAME(R500DisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv) +{ + RADEONInfoPtr info = RADEONPTR(pScrn); + PixmapPtr pPixmap = pPriv->pPixmap; + uint32_t txfilter, txformat0, txformat1, txoffset, txpitch; + uint32_t dst_offset, dst_pitch, dst_format; + uint32_t txenable, colorpitch; + uint32_t output_fmt; + Bool isplanar = FALSE; + int dstxoff, dstyoff, pixel_shift, vtx_count; + BoxPtr pBox = REGION_RECTS(&pPriv->clip); + int nBox = REGION_NUM_RECTS(&pPriv->clip); + ACCEL_PREAMBLE(); + + pixel_shift = pPixmap->drawable.bitsPerPixel >> 4; + +#ifdef USE_EXA + if (info->useEXA) { + dst_offset = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset; + dst_pitch = exaGetPixmapPitch(pPixmap); } else - BEGIN_ACCEL(1); +#endif + { + dst_offset = (pPixmap->devPrivate.ptr - info->FB) + + info->fbLocation + pScrn->fbOffset; + dst_pitch = pPixmap->devKind; + } + +#ifdef COMPOSITE + dstxoff = -pPixmap->screen_x + pPixmap->drawable.x; + dstyoff = -pPixmap->screen_y + pPixmap->drawable.y; +#else + dstxoff = 0; + dstyoff = 0; +#endif + +#ifdef USE_EXA + if (info->useEXA) { + RADEON_SWITCH_TO_3D(); + } else +#endif + { + BEGIN_ACCEL(2); + OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D); + /* We must wait for 3d to idle, in case source was just written as a dest. */ + OUT_ACCEL_REG(RADEON_WAIT_UNTIL, + RADEON_WAIT_HOST_IDLECLEAN | + RADEON_WAIT_2D_IDLECLEAN | + RADEON_WAIT_3D_IDLECLEAN | + RADEON_WAIT_DMA_GUI_IDLE); + FINISH_ACCEL(); + + if (!info->accel_state->XInited3D) + RADEONInit3DEngine(pScrn); + } + + if (pPriv->bicubic_enabled) + vtx_count = 6; + else + vtx_count = 4; + + switch (pPixmap->drawable.bitsPerPixel) { + case 16: + if (pPixmap->drawable.depth == 15) + dst_format = R300_COLORFORMAT_ARGB1555; + else + dst_format = R300_COLORFORMAT_RGB565; + break; + case 32: + dst_format = R300_COLORFORMAT_ARGB8888; + break; + default: + return; + } + + output_fmt = (R300_OUT_FMT_C4_8 | + R300_OUT_FMT_C0_SEL_BLUE | + R300_OUT_FMT_C1_SEL_GREEN | + R300_OUT_FMT_C2_SEL_RED | + R300_OUT_FMT_C3_SEL_ALPHA); + + colorpitch = dst_pitch >> pixel_shift; + colorpitch |= dst_format; + + if (RADEONTilingEnabled(pScrn, pPixmap)) + colorpitch |= R300_COLORTILE; + + if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) { + isplanar = TRUE; + } + + if (isplanar) { + txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0; + txpitch = pPriv->src_pitch; + } else { + if (pPriv->id == FOURCC_UYVY) + txformat1 = R300_TX_FORMAT_YVYU422; + else + txformat1 = R300_TX_FORMAT_VYUY422; + + txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP; + + /* pitch is in pixels */ + txpitch = pPriv->src_pitch / 2; + } + txpitch -= 1; + + txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) | + (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) | + R300_TXPITCH_EN); + + info->accel_state->texW[0] = pPriv->w; + info->accel_state->texH[0] = pPriv->h; + + txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_MAG_FILTER_LINEAR | + R300_TX_MIN_FILTER_LINEAR | + (0 << R300_TX_ID_SHIFT)); + + + if ((pPriv->w - 1) & 0x800) + txpitch |= R500_TXWIDTH_11; + + if ((pPriv->h - 1) & 0x800) + txpitch |= R500_TXHEIGHT_11; + + txoffset = pPriv->src_offset; + + BEGIN_ACCEL(6); + OUT_ACCEL_REG(R300_TX_FILTER0_0, txfilter); + OUT_ACCEL_REG(R300_TX_FILTER1_0, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_0, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_0, txformat1); + OUT_ACCEL_REG(R300_TX_FORMAT2_0, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_0, txoffset); + FINISH_ACCEL(); + + txenable = R300_TEX_0_ENABLE; + + if (isplanar) { + txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) | + (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) | + R300_TXPITCH_EN); + txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63; + txpitch -= 1; + txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) | + R300_TX_MIN_FILTER_LINEAR | + R300_TX_MAG_FILTER_LINEAR); + + BEGIN_ACCEL(12); + OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT)); + OUT_ACCEL_REG(R300_TX_FILTER1_1, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2); + OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset); + OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT)); + OUT_ACCEL_REG(R300_TX_FILTER1_2, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3); + OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset); + FINISH_ACCEL(); + txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE; + } + + if (pPriv->bicubic_enabled) { + /* Size is 128x1 */ + txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) | + (0x0 << R300_TXHEIGHT_SHIFT) | + R300_TXPITCH_EN); + /* Format is 32-bit floats, 4bpp */ + txformat1 = R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16); + /* Pitch is 127 (128-1) */ + txpitch = 0x7f; + /* Tex filter */ + txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_WRAP) | + R300_TX_CLAMP_T(R300_TX_CLAMP_WRAP) | + R300_TX_MIN_FILTER_NEAREST | + R300_TX_MAG_FILTER_NEAREST | + (1 << R300_TX_ID_SHIFT)); + + BEGIN_ACCEL(6); + OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter); + OUT_ACCEL_REG(R300_TX_FILTER1_1, 0); + OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0); + OUT_ACCEL_REG(R300_TX_FORMAT1_1, txformat1); + OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch); + OUT_ACCEL_REG(R300_TX_OFFSET_1, pPriv->bicubic_src_offset); + FINISH_ACCEL(); + + /* Enable tex 1 */ + txenable |= R300_TEX_1_ENABLE; + } + + /* setup the VAP */ + if (info->accel_state->has_tcl) { + if (pPriv->bicubic_enabled) + BEGIN_ACCEL(7); + else + BEGIN_ACCEL(6); + } else { + if (pPriv->bicubic_enabled) + BEGIN_ACCEL(5); + else + BEGIN_ACCEL(4); + } + + /* These registers define the number, type, and location of data submitted + * to the PVS unit of GA input (when PVS is disabled) + * DST_VEC_LOC is the slot in the PVS input vector memory when PVS/TCL is + * enabled. This memory provides the imputs to the vertex shader program + * and ordering is not important. When PVS/TCL is disabled, this field maps + * directly to the GA input memory and the order is signifigant. In + * PVS_BYPASS mode the order is as follows: + * Position + * Point Size + * Color 0-3 + * Textures 0-7 + * Fog + */ + if (pPriv->bicubic_enabled) { + OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0, + ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) | + (0 << R300_SKIP_DWORDS_0_SHIFT) | + (0 << R300_DST_VEC_LOC_0_SHIFT) | + R300_SIGNED_0 | + (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) | + (0 << R300_SKIP_DWORDS_1_SHIFT) | + (6 << R300_DST_VEC_LOC_1_SHIFT) | + R300_SIGNED_1)); + OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_1, + ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_2_SHIFT) | + (0 << R300_SKIP_DWORDS_2_SHIFT) | + (7 << R300_DST_VEC_LOC_2_SHIFT) | + R300_LAST_VEC_2 | + R300_SIGNED_2)); + } else { + OUT_ACCEL_REG(R300_VAP_PROG_STREAM_CNTL_0, + ((R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) | + (0 << R300_SKIP_DWORDS_0_SHIFT) | + (0 << R300_DST_VEC_LOC_0_SHIFT) | + R300_SIGNED_0 | + (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_1_SHIFT) | + (0 << R300_SKIP_DWORDS_1_SHIFT) | + (6 << R300_DST_VEC_LOC_1_SHIFT) | + R300_LAST_VEC_1 | + R300_SIGNED_1)); + } + + /* load the vertex shader + * We pre-load vertex programs in RADEONInit3DEngine(): + * - exa mask/Xv bicubic + * - exa no mask + * - Xv + * Here we select the offset of the vertex program we want to use + */ + if (info->accel_state->has_tcl) { + if (pPriv->bicubic_enabled) { + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0, + ((0 << R300_PVS_FIRST_INST_SHIFT) | + (2 << R300_PVS_XYZW_VALID_INST_SHIFT) | + (2 << R300_PVS_LAST_INST_SHIFT))); + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1, + (2 << R300_PVS_LAST_VTX_SRC_INST_SHIFT)); + } else { + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_0, + ((5 << R300_PVS_FIRST_INST_SHIFT) | + (6 << R300_PVS_XYZW_VALID_INST_SHIFT) | + (6 << R300_PVS_LAST_INST_SHIFT))); + OUT_ACCEL_REG(R300_VAP_PVS_CODE_CNTL_1, + (6 << R300_PVS_LAST_VTX_SRC_INST_SHIFT)); + } + } + + /* Position and one set of 2 texture coordinates */ + OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_0, R300_VTX_POS_PRESENT); + if (pPriv->bicubic_enabled) + OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, ((2 << R300_TEX_0_COMP_CNT_SHIFT) | + (2 << R300_TEX_1_COMP_CNT_SHIFT))); + else + OUT_ACCEL_REG(R300_VAP_OUT_VTX_FMT_1, (2 << R300_TEX_0_COMP_CNT_SHIFT)); + + OUT_ACCEL_REG(R300_US_OUT_FMT_0, output_fmt); + FINISH_ACCEL(); + + /* setup pixel shader */ + if (pPriv->bicubic_enabled) { + BEGIN_ACCEL(7); + + /* 4 components: 2 for tex0 and 2 for tex1 */ + OUT_ACCEL_REG(R300_RS_COUNT, + ((4 << R300_RS_COUNT_IT_COUNT_SHIFT) | + R300_RS_COUNT_HIRES_EN)); + + /* R300_INST_COUNT_RS - highest RS instruction used */ + OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(1) | R300_TX_OFFSET_RS(6)); + + /* Pixel stack frame size. */ + OUT_ACCEL_REG(R300_US_PIXSIZE, 5); + + /* FP length. */ + OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) | + R500_US_CODE_END_ADDR(13))); + OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) | + R500_US_CODE_RANGE_SIZE(13))); + + /* Prepare for FP emission. */ + OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0); + OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0)); + FINISH_ACCEL(); + + BEGIN_ACCEL(89); + /* Pixel shader. + * I've gone ahead and annotated each instruction, since this + * thing is MASSIVE. :3 + * Note: In order to avoid buggies with temps and multiple + * inputs, all temps are offset by 2. temp0 -> register2. */ + + /* TEX temp2, input1.xxxx, tex1, 1D */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) | + R500_TEX_INST_LD | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) | + R500_TEX_SRC_S_SWIZ_R | + R500_TEX_SRC_T_SWIZ_R | + R500_TEX_SRC_R_SWIZ_R | + R500_TEX_SRC_Q_SWIZ_R | + R500_TEX_DST_ADDR(2) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* TEX temp5, input1.yyyy, tex1, 1D */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(1) | + R500_TEX_INST_LD | + R500_TEX_SEM_ACQUIRE | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(1) | + R500_TEX_SRC_S_SWIZ_G | + R500_TEX_SRC_T_SWIZ_G | + R500_TEX_SRC_R_SWIZ_G | + R500_TEX_SRC_Q_SWIZ_G | + R500_TEX_DST_ADDR(5) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* MUL temp4, const0.x0x0, temp2.yyxx */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | + R500_RGB_ADDR0_CONST | + R500_RGB_ADDR1(2))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | + R500_ALPHA_ADDR0_CONST | + R500_ALPHA_ADDR1(2))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | + R500_ALU_RGB_R_SWIZ_A_R | + R500_ALU_RGB_G_SWIZ_A_0 | + R500_ALU_RGB_B_SWIZ_A_R | + R500_ALU_RGB_SEL_B_SRC1 | + R500_ALU_RGB_R_SWIZ_B_G | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_R)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SEL_A_SRC0 | + R500_ALPHA_SWIZ_A_0 | + R500_ALPHA_SEL_B_SRC1 | + R500_ALPHA_SWIZ_B_R)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_R_SWIZ_0 | + R500_ALU_RGBA_G_SWIZ_0 | + R500_ALU_RGBA_B_SWIZ_0 | + R500_ALU_RGBA_A_SWIZ_0)); + + /* MAD temp3, const0.0y0y, temp5.xxxx, temp4 */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | + R500_RGB_ADDR0_CONST | + R500_RGB_ADDR1(5) | + R500_RGB_ADDR2(4))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | + R500_ALPHA_ADDR0_CONST | + R500_ALPHA_ADDR1(5) | + R500_ALPHA_ADDR2(4))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | + R500_ALU_RGB_R_SWIZ_A_0 | + R500_ALU_RGB_G_SWIZ_A_G | + R500_ALU_RGB_B_SWIZ_A_0 | + R500_ALU_RGB_SEL_B_SRC1 | + R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_G_SWIZ_B_R | + R500_ALU_RGB_B_SWIZ_B_R)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SEL_A_SRC0 | + R500_ALPHA_SWIZ_A_G | + R500_ALPHA_SEL_B_SRC1 | + R500_ALPHA_SWIZ_B_R)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC2 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_B | + R500_ALU_RGBA_A_SWIZ_A)); + + /* ADD temp3, temp3, input0.xyxy */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(3) | + R500_RGB_ADDR2(0))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(3) | + R500_ALPHA_ADDR2(0))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 | + R500_ALU_RGB_G_SWIZ_A_1 | + R500_ALU_RGB_B_SWIZ_A_1 | + R500_ALU_RGB_SEL_B_SRC1 | + R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SWIZ_A_1 | + R500_ALPHA_SEL_B_SRC1 | + R500_ALPHA_SWIZ_B_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC2 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_R | + R500_ALU_RGBA_A_SWIZ_G)); + + /* TEX temp1, temp3.zwxy, tex0, 2D */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | + R500_TEX_INST_LD | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) | + R500_TEX_SRC_S_SWIZ_B | + R500_TEX_SRC_T_SWIZ_A | + R500_TEX_SRC_R_SWIZ_R | + R500_TEX_SRC_Q_SWIZ_G | + R500_TEX_DST_ADDR(1) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* TEX temp3, temp3.xyzw, tex0, 2D */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | + R500_TEX_INST_LD | + R500_TEX_SEM_ACQUIRE | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(3) | + R500_TEX_SRC_S_SWIZ_R | + R500_TEX_SRC_T_SWIZ_G | + R500_TEX_SRC_R_SWIZ_B | + R500_TEX_SRC_Q_SWIZ_A | + R500_TEX_DST_ADDR(3) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* MAD temp4, const0.0y0y, temp5.yyyy, temp4 */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | + R500_RGB_ADDR0_CONST | + R500_RGB_ADDR1(5) | + R500_RGB_ADDR2(4))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | + R500_ALPHA_ADDR0_CONST | + R500_ALPHA_ADDR1(5) | + R500_ALPHA_ADDR2(4))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | + R500_ALU_RGB_R_SWIZ_A_0 | + R500_ALU_RGB_G_SWIZ_A_G | + R500_ALU_RGB_B_SWIZ_A_0 | + R500_ALU_RGB_SEL_B_SRC1 | + R500_ALU_RGB_R_SWIZ_B_G | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_G)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(4) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SEL_A_SRC0 | + R500_ALPHA_SWIZ_A_G | + R500_ALPHA_SEL_B_SRC1 | + R500_ALPHA_SWIZ_B_G)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(4) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC2 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_B | + R500_ALU_RGBA_A_SWIZ_A)); + + /* ADD temp0, temp4, input0.xyxy */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR1(4) | + R500_RGB_ADDR2(0))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR1(4) | + R500_ALPHA_ADDR2(0))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_R_SWIZ_A_1 | + R500_ALU_RGB_G_SWIZ_A_1 | + R500_ALU_RGB_B_SWIZ_A_1 | + R500_ALU_RGB_SEL_B_SRC1 | + R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SWIZ_A_1 | + R500_ALPHA_SEL_B_SRC1 | + R500_ALPHA_SWIZ_B_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC2 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_R | + R500_ALU_RGBA_A_SWIZ_G)); + + /* TEX temp4, temp0.zwzw, tex0, 2D */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | + R500_TEX_INST_LD | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) | + R500_TEX_SRC_S_SWIZ_B | + R500_TEX_SRC_T_SWIZ_A | + R500_TEX_SRC_R_SWIZ_B | + R500_TEX_SRC_Q_SWIZ_A | + R500_TEX_DST_ADDR(4) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* TEX temp0, temp0.xyzw, tex0, 2D */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | + R500_TEX_INST_LD | + R500_TEX_SEM_ACQUIRE | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) | + R500_TEX_SRC_S_SWIZ_R | + R500_TEX_SRC_T_SWIZ_G | + R500_TEX_SRC_R_SWIZ_B | + R500_TEX_SRC_Q_SWIZ_A | + R500_TEX_DST_ADDR(0) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* LRP temp3, temp2.zzzz, temp1, temp3 -> + * - PRESUB temps, temp1 - temp3 + * - MAD temp2.zzzz, temps, temp3 */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(3) | + R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 | + R500_RGB_ADDR1(1) | + R500_RGB_ADDR2(2))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(3) | + R500_ALPHA_SRCP_OP_A1_MINUS_A0 | + R500_ALPHA_ADDR1(1) | + R500_ALPHA_ADDR2(2))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 | + R500_ALU_RGB_R_SWIZ_A_B | + R500_ALU_RGB_G_SWIZ_A_B | + R500_ALU_RGB_B_SWIZ_A_B | + R500_ALU_RGB_SEL_B_SRCP | + R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(3) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SEL_A_SRC2 | + R500_ALPHA_SWIZ_A_B | + R500_ALPHA_SEL_B_SRCP | + R500_ALPHA_SWIZ_B_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(3) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC0 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_B | + R500_ALU_RGBA_A_SWIZ_A)); + + /* LRP temp0, temp2.zzzz, temp4, temp0 -> + * - PRESUB temps, temp4 - temp1 + * - MAD temp2.zzzz, temps, temp0 */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_ALU | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | + R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 | + R500_RGB_ADDR1(4) | + R500_RGB_ADDR2(2))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | + R500_ALPHA_SRCP_OP_A1_MINUS_A0 | + R500_ALPHA_ADDR1(4) | + R500_ALPHA_ADDR2(2))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 | + R500_ALU_RGB_R_SWIZ_A_B | + R500_ALU_RGB_G_SWIZ_A_B | + R500_ALU_RGB_B_SWIZ_A_B | + R500_ALU_RGB_SEL_B_SRCP | + R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SEL_A_SRC2 | + R500_ALPHA_SWIZ_A_B | + R500_ALPHA_SEL_B_SRCP | + R500_ALPHA_SWIZ_B_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC0 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_B | + R500_ALU_RGBA_A_SWIZ_A)); + + /* LRP output, temp5.zzzz, temp3, temp0 -> + * - PRESUB temps, temp3 - temp0 + * - MAD temp5.zzzz, temps, temp0 */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT | + R500_INST_LAST | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK | + R500_INST_RGB_OMASK_R | + R500_INST_RGB_OMASK_G | + R500_INST_RGB_OMASK_B | + R500_INST_ALPHA_OMASK)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | + R500_RGB_SRCP_OP_RGB1_MINUS_RGB0 | + R500_RGB_ADDR1(3) | + R500_RGB_ADDR2(5))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | + R500_ALPHA_SRCP_OP_A1_MINUS_A0 | + R500_ALPHA_ADDR1(3) | + R500_ALPHA_ADDR2(5))); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC2 | + R500_ALU_RGB_R_SWIZ_A_B | + R500_ALU_RGB_G_SWIZ_A_B | + R500_ALU_RGB_B_SWIZ_A_B | + R500_ALU_RGB_SEL_B_SRCP | + R500_ALU_RGB_R_SWIZ_B_R | + R500_ALU_RGB_G_SWIZ_B_G | + R500_ALU_RGB_B_SWIZ_B_B)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDRD(0) | + R500_ALPHA_OP_MAD | + R500_ALPHA_SEL_A_SRC2 | + R500_ALPHA_SWIZ_A_B | + R500_ALPHA_SEL_B_SRCP | + R500_ALPHA_SWIZ_B_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_ADDRD(0) | + R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_SEL_C_SRC0 | + R500_ALU_RGBA_R_SWIZ_R | + R500_ALU_RGBA_G_SWIZ_G | + R500_ALU_RGBA_B_SWIZ_B | + R500_ALU_RGBA_A_SWIZ_A)); + + /* Shader constants. */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_CONST_INDEX(0)); + + /* const0 = {1 / texture[0].width, 1 / texture[0].height, 0, 0} */ + OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->w)); + OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, (1.0/(float)pPriv->h)); + OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0); + OUT_ACCEL_REG_F(R500_GA_US_VECTOR_DATA, 0x0); + + FINISH_ACCEL(); + + } else { + BEGIN_ACCEL(19); + /* 2 components: 2 for tex0 */ + OUT_ACCEL_REG(R300_RS_COUNT, + ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) | + R300_RS_COUNT_HIRES_EN)); + + /* R300_INST_COUNT_RS - highest RS instruction used */ + OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6)); + + /* Pixel stack frame size. */ + OUT_ACCEL_REG(R300_US_PIXSIZE, 0); /* highest temp used */ + + /* FP length. */ + OUT_ACCEL_REG(R500_US_CODE_ADDR, (R500_US_CODE_START_ADDR(0) | + R500_US_CODE_END_ADDR(1))); + OUT_ACCEL_REG(R500_US_CODE_RANGE, (R500_US_CODE_RANGE_ADDR(0) | + R500_US_CODE_RANGE_SIZE(1))); + + /* Prepare for FP emission. */ + OUT_ACCEL_REG(R500_US_CODE_OFFSET, 0); + OUT_ACCEL_REG(R500_GA_US_VECTOR_INDEX, R500_US_VECTOR_INST_INDEX(0)); + + /* tex inst */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_TEX | + R500_INST_TEX_SEM_WAIT | + R500_INST_RGB_WMASK_R | + R500_INST_RGB_WMASK_G | + R500_INST_RGB_WMASK_B | + R500_INST_ALPHA_WMASK | + R500_INST_RGB_CLAMP | + R500_INST_ALPHA_CLAMP)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_ID(0) | + R500_TEX_INST_LD | + R500_TEX_SEM_ACQUIRE | + R500_TEX_IGNORE_UNCOVERED)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_TEX_SRC_ADDR(0) | + R500_TEX_SRC_S_SWIZ_R | + R500_TEX_SRC_T_SWIZ_G | + R500_TEX_DST_ADDR(0) | + R500_TEX_DST_R_SWIZ_R | + R500_TEX_DST_G_SWIZ_G | + R500_TEX_DST_B_SWIZ_B | + R500_TEX_DST_A_SWIZ_A)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_DX_ADDR(0) | + R500_DX_S_SWIZ_R | + R500_DX_T_SWIZ_R | + R500_DX_R_SWIZ_R | + R500_DX_Q_SWIZ_R | + R500_DY_ADDR(0) | + R500_DY_S_SWIZ_R | + R500_DY_T_SWIZ_R | + R500_DY_R_SWIZ_R | + R500_DY_Q_SWIZ_R)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, 0x00000000); + + /* ALU inst */ + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_INST_TYPE_OUT | + R500_INST_TEX_SEM_WAIT | + R500_INST_LAST | + R500_INST_RGB_OMASK_R | + R500_INST_RGB_OMASK_G | + R500_INST_RGB_OMASK_B | + R500_INST_ALPHA_OMASK | + R500_INST_RGB_CLAMP | + R500_INST_ALPHA_CLAMP)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_RGB_ADDR0(0) | + R500_RGB_ADDR1(0) | + R500_RGB_ADDR1_CONST | + R500_RGB_ADDR2(0) | + R500_RGB_ADDR2_CONST)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_ADDR0(0) | + R500_ALPHA_ADDR1(0) | + R500_ALPHA_ADDR1_CONST | + R500_ALPHA_ADDR2(0) | + R500_ALPHA_ADDR2_CONST)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGB_SEL_A_SRC0 | + R500_ALU_RGB_R_SWIZ_A_R | + R500_ALU_RGB_G_SWIZ_A_G | + R500_ALU_RGB_B_SWIZ_A_B | + R500_ALU_RGB_SEL_B_SRC0 | + R500_ALU_RGB_R_SWIZ_B_1 | + R500_ALU_RGB_B_SWIZ_B_1 | + R500_ALU_RGB_G_SWIZ_B_1)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALPHA_OP_MAD | + R500_ALPHA_SWIZ_A_A | + R500_ALPHA_SWIZ_B_1)); + OUT_ACCEL_REG(R500_GA_US_VECTOR_DATA, (R500_ALU_RGBA_OP_MAD | + R500_ALU_RGBA_R_SWIZ_0 | + R500_ALU_RGBA_G_SWIZ_0 | + R500_ALU_RGBA_B_SWIZ_0 | + R500_ALU_RGBA_A_SWIZ_0)); + FINISH_ACCEL(); + } + + BEGIN_ACCEL(6); + OUT_ACCEL_REG(R300_TX_INVALTAGS, 0); + OUT_ACCEL_REG(R300_TX_ENABLE, txenable); + + OUT_ACCEL_REG(R300_RB3D_COLOROFFSET0, dst_offset); + OUT_ACCEL_REG(R300_RB3D_COLORPITCH0, colorpitch); + + /* no need to enable blending */ + OUT_ACCEL_REG(R300_RB3D_BLENDCNTL, RADEON_SRC_BLEND_GL_ONE | RADEON_DST_BLEND_GL_ZERO); + + OUT_ACCEL_REG(R300_VAP_VTX_SIZE, vtx_count); + FINISH_ACCEL(); + + if (pPriv->vsync) { + xf86CrtcPtr crtc = radeon_xv_pick_best_crtc(pScrn, + pPriv->drw_x, + pPriv->drw_x + pPriv->dst_w, + pPriv->drw_y, + pPriv->drw_y + pPriv->dst_h); + if (crtc) { + RADEONCrtcPrivatePtr radeon_crtc = crtc->driver_private; + + FUNC_NAME(RADEONWaitForVLine)(pScrn, pPixmap, + radeon_crtc->crtc_id, + pPriv->drw_y - crtc->y, + (pPriv->drw_y - crtc->y) + pPriv->dst_h); + } + } + /* + * Rendering of the actual polygon is done in two different + * ways depending on chip generation: + * + * < R300: + * + * These chips can render a rectangle in one pass, so + * handling is pretty straight-forward. + * + * >= R300: + * + * These chips can accept a quad, but will render it as + * two triangles which results in a diagonal tear. Instead + * We render a single, large triangle and use the scissor + * functionality to restrict it to the desired rectangle. + * Due to guardband limits on r3xx/r4xx, we can only use + * the single triangle up to 2880 pixels; above that we + * render as a quad. + */ + + while (nBox--) { + int srcX, srcY, srcw, srch; + int dstX, dstY, dstw, dsth; + dstX = pBox->x1 + dstxoff; + dstY = pBox->y1 + dstyoff; + dstw = pBox->x2 - pBox->x1; + dsth = pBox->y2 - pBox->y1; + + srcX = ((pBox->x1 - pPriv->drw_x) * + pPriv->src_w) / pPriv->dst_w; + srcY = ((pBox->y1 - pPriv->drw_y) * + pPriv->src_h) / pPriv->dst_h; + + srcw = (pPriv->src_w * dstw) / pPriv->dst_w; + srch = (pPriv->src_h * dsth) / pPriv->dst_h; + + BEGIN_ACCEL(2); + OUT_ACCEL_REG(R300_SC_SCISSOR0, (((dstX) << R300_SCISSOR_X_SHIFT) | + ((dstY) << R300_SCISSOR_Y_SHIFT))); + OUT_ACCEL_REG(R300_SC_SCISSOR1, (((dstX + dstw - 1) << R300_SCISSOR_X_SHIFT) | + ((dstY + dsth - 1) << R300_SCISSOR_Y_SHIFT))); + FINISH_ACCEL(); + +#ifdef ACCEL_CP + BEGIN_RING(3 * vtx_count + 4); + OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, + 3 * vtx_count)); + OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST | + RADEON_CP_VC_CNTL_PRIM_WALK_RING | + (3 << RADEON_CP_VC_CNTL_NUM_SHIFT)); +#else /* ACCEL_CP */ + BEGIN_ACCEL(2 + vtx_count * 3); + OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_TRIANGLE_LIST | + RADEON_VF_PRIM_WALK_DATA | + (3 << RADEON_VF_NUM_VERTICES_SHIFT))); +#endif + if (pPriv->bicubic_enabled) { + VTX_OUT_FILTER((float)dstX, (float)dstY, + (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0], + (float)srcX + 0.5, (float)srcY + 0.5); + VTX_OUT_FILTER((float)dstX, (float)(dstY + dstw + dsth), + (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0], + (float)srcX + 0.5, (float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0) + 0.5); + VTX_OUT_FILTER((float)(dstX + dstw + dsth), (float)dstY, + ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0], + (float)srcY / info->accel_state->texH[0], + (float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0) + 0.5, + (float)srcY + 0.5); + } else { + /* + * Render a big, scissored triangle. This means + * increasing the triangle size and adjusting + * texture coordinates. + */ + VTX_OUT((float)dstX, (float)dstY, + (float)srcX / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]); + VTX_OUT((float)dstX, (float)(dstY + dsth + dstw), + (float)srcX / info->accel_state->texW[0], ((float)srcY + (float)srch * (((float)dstw / (float)dsth) + 1.0)) / info->accel_state->texH[0]); + VTX_OUT((float)(dstX + dstw + dsth), (float)dstY, + ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0], + (float)srcY / info->accel_state->texH[0]); + } + + /* flushing is pipelined, free/finish is not */ + OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_DC_FLUSH_3D); + +#ifdef ACCEL_CP + ADVANCE_RING(); +#else + FINISH_ACCEL(); +#endif /* !ACCEL_CP */ + + pBox++; + } + + BEGIN_ACCEL(3); + OUT_ACCEL_REG(R300_SC_CLIP_RULE, 0xAAAA); + OUT_ACCEL_REG(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DC_FLUSH_ALL); OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN); FINISH_ACCEL(); |