diff options
Diffstat (limited to 'lib')
44 files changed, 472 insertions, 130 deletions
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 7bda1184e..3ee708f4f 100644 --- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -536,6 +536,15 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, #if defined(PIPE_ARCH_PPC) MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec"); +#if HAVE_LLVM >= 0x0304 + /* + * Make sure VSX instructions are disabled + * See LLVM bug https://llvm.org/bugs/show_bug.cgi?id=25503#c7 + */ + if (util_cpu_caps.has_altivec) { + MAttrs.push_back("-vsx"); + } +#endif #endif builder.setMAttrs(MAttrs); diff --git a/lib/mesa/src/gallium/auxiliary/nir/tgsi_to_nir.c b/lib/mesa/src/gallium/auxiliary/nir/tgsi_to_nir.c index 93dfb8033..14370e26d 100644 --- a/lib/mesa/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/lib/mesa/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -1087,6 +1087,11 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src) op = nir_texop_tex; num_srcs = 1; break; + case TGSI_OPCODE_TEX2: + op = nir_texop_tex; + num_srcs = 1; + samp = 2; + break; case TGSI_OPCODE_TXP: op = nir_texop_tex; num_srcs = 2; @@ -1242,10 +1247,12 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src) } if (instr->is_shadow) { - if (instr->coord_components < 3) - instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z)); - else + if (instr->coord_components == 4) + instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[1], X)); + else if (instr->coord_components == 3) instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], W)); + else + instr->src[src_number].src = nir_src_for_ssa(ttn_channel(b, src[0], Z)); instr->src[src_number].src_type = nir_tex_src_comparitor; src_number++; @@ -1651,6 +1658,7 @@ ttn_emit_instruction(struct ttn_compile *c) case TGSI_OPCODE_TXL: case TGSI_OPCODE_TXB: case TGSI_OPCODE_TXD: + case TGSI_OPCODE_TEX2: case TGSI_OPCODE_TXL2: case TGSI_OPCODE_TXB2: case TGSI_OPCODE_TXQ_LZ: diff --git a/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.c b/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.c index 7523baf4c..98a5fec77 100644 --- a/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -258,6 +258,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens, info->output_semantic_index[reg] = (ubyte) semIndex; info->num_outputs++; + if (semName == TGSI_SEMANTIC_COLOR) + info->colors_written |= 1 << semIndex; + if (procType == TGSI_PROCESSOR_VERTEX || procType == TGSI_PROCESSOR_GEOMETRY || procType == TGSI_PROCESSOR_TESS_CTRL || diff --git a/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.h b/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.h index b81bdd71f..6301f91c1 100644 --- a/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/lib/mesa/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -76,6 +76,7 @@ struct tgsi_shader_info uint opcode_count[TGSI_OPCODE_LAST]; /**< opcode histogram */ + ubyte colors_written; boolean reads_position; /**< does fragment shader read position? */ boolean reads_z; /**< does fragment shader read depth? */ boolean writes_z; /**< does fragment shader write Z value? */ diff --git a/lib/mesa/src/gallium/auxiliary/util/u_helpers.c b/lib/mesa/src/gallium/auxiliary/util/u_helpers.c index ac1edcdbb..117a51b3b 100644 --- a/lib/mesa/src/gallium/auxiliary/util/u_helpers.c +++ b/lib/mesa/src/gallium/auxiliary/util/u_helpers.c @@ -81,7 +81,13 @@ void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst, const struct pipe_vertex_buffer *src, unsigned start_slot, unsigned count) { - uint32_t enabled_buffers = (1ull << *dst_count) - 1; + unsigned i; + uint32_t enabled_buffers = 0; + + for (i = 0; i < *dst_count; i++) { + if (dst[i].buffer || dst[i].user_buffer) + enabled_buffers |= (1ull << i); + } util_set_vertex_buffers_mask(dst, &enabled_buffers, src, start_slot, count); diff --git a/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index 2e1d712a2..149a197ae 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -153,7 +153,7 @@ enum a4xx_vtx_fmt { enum a4xx_tex_fmt { TFMT4_5_6_5_UNORM = 11, - TFMT4_5_5_5_1_UNORM = 10, + TFMT4_5_5_5_1_UNORM = 9, TFMT4_4_4_4_4_UNORM = 8, TFMT4_X8Z24_UNORM = 71, TFMT4_10_10_10_2_UNORM = 33, @@ -2718,6 +2718,12 @@ static inline uint32_t A4XX_TEX_SAMP_0_ANISO(enum a4xx_tex_aniso val) { return ((val) << A4XX_TEX_SAMP_0_ANISO__SHIFT) & A4XX_TEX_SAMP_0_ANISO__MASK; } +#define A4XX_TEX_SAMP_0_LOD_BIAS__MASK 0xfff80000 +#define A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT 19 +static inline uint32_t A4XX_TEX_SAMP_0_LOD_BIAS(float val) +{ + return ((((int32_t)(val * 256.0))) << A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT) & A4XX_TEX_SAMP_0_LOD_BIAS__MASK; +} #define REG_A4XX_TEX_SAMP_1 0x00000001 #define A4XX_TEX_SAMP_1_COMPARE_FUNC__MASK 0x0000000e diff --git a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c index a3d7123cc..47e8855e7 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -250,14 +250,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, } } - /* adjust regids for alpha output formats. there is no alpha render - * format, so it's just treated like red - */ - for (i = 0; i < nr; i++) - if (util_format_is_alpha(pipe_surface_format(bufs[i]))) - color_regid[i] += 3; - - /* TODO get these dynamically: */ face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0); coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0); diff --git a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c index 213b29c91..7a28e09b3 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c +++ b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c @@ -111,6 +111,7 @@ fd4_sampler_state_create(struct pipe_context *pctx, COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS); if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); so->texsamp1 |= A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp index fa8ee072a..9f0e07333 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp @@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b) if (b->prev) b->prev->next = b; - if (a->prev) + if (a->next) a->next->prev = a; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index 8f1542959..fa9336283 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -575,8 +575,8 @@ CodeEmitterGK110::emitIMUL(const Instruction *i) if (isLIMM(i->src(1), TYPE_S32)) { emitForm_L(i, 0x280, 2, Modifier(0)); - assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH); - + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[1] |= 1 << 24; if (i->sType == TYPE_S32) code[1] |= 3 << 25; } else { @@ -695,14 +695,9 @@ CodeEmitterGK110::emitIMAD(const Instruction *i) if (i->sType == TYPE_S32) code[1] |= (1 << 19) | (1 << 24); - if (code[0] & 0x1) { - assert(!i->subOp); - SAT_(39); - } else { - if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) - code[1] |= 1 << 25; - SAT_(35); - } + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[1] |= 1 << 25; + SAT_(35); } void diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 6bf5219d3..f88f4c2e9 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -2322,6 +2322,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn) case OP_PFETCH: emitPFETCH(insn); break; + case OP_AFETCH: + emitAFETCH(insn); + break; case OP_EMIT: case OP_RESTART: emitOUT(insn); diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index f153674e9..f4ff33164 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -2870,6 +2870,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); } setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true); + + // If the loop never breaks (e.g. only has RET's inside), then there + // will be no way to get to the break bb. However BGNLOOP will have + // already made a PREBREAK to it, so it must be in the CFG. + if (getBB()->cfg.incidentCount() == 0) + loopBB->cfg.attach(&getBB()->cfg, Graph::Edge::TREE); } break; case TGSI_OPCODE_BRK: diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index d87cdfff8..4f4320b4d 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -202,7 +202,8 @@ NV50LegalizePostRA::visit(Function *fn) Program *prog = fn->getProgram(); r63 = new_LValue(fn, FILE_GPR); - if (prog->maxGPR < 63) + // GPR units on nv50 are in half-regs + if (prog->maxGPR < 126) r63->reg.data.id = 63; else r63->reg.data.id = 127; @@ -831,7 +832,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i) } Value *flags = bld.getScratch(1, FILE_FLAGS); bld.setPosition(cond, true); - bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0)); + bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0; Instruction *tex[4]; for (l = 0; l < 4; ++l) { diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index b1f406585..0f575f2ee 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -686,7 +686,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) i->tex.s = 0x1f; i->setIndirectR(hnd); i->setIndirectS(NULL); - } else if (i->tex.r == i->tex.s) { + } else if (i->tex.r == i->tex.s || i->op == OP_TXF) { i->tex.r += prog->driver->io.texBindBase / 4; i->tex.s = 0; // only a single cX[] value possible here } else { @@ -962,11 +962,14 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd) bool NVC0LoweringPass::handleTXQ(TexInstruction *txq) { + const int chipset = prog->getTarget()->getChipset(); + if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0) + txq->tex.r += prog->driver->io.texBindBase / 4; + if (txq->tex.rIndirectSrc < 0) return true; Value *ticRel = txq->getIndirectR(); - const int chipset = prog->getTarget()->getChipset(); txq->setIndirectS(NULL); txq->tex.sIndirectSrc = -1; diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 44f74c613..13f36d0cf 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -842,6 +842,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) i->src(0).mod = i->src(t).mod; i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32)); i->src(1).mod = 0; + } else + if (i->postFactor && i->sType == TYPE_F32) { + /* Can't emit a postfactor with an immediate, have to fold it in */ + i->setSrc(s, new_ImmediateValue( + prog, imm0.reg.data.f32 * exp2f(i->postFactor))); + i->postFactor = 0; } break; case OP_MAD: @@ -2606,8 +2612,11 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb) i->getSrc(0)->reg.data.id >= 64) break; + if (i->getPredicate()) + break; + def = i->getSrc(1)->getInsn(); - if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { + if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { vtmp = i->getSrc(1); i->setSrc(1, def->getSrc(0)); @@ -2909,6 +2918,16 @@ DeadCodeElim::visit(BasicBlock *bb) return true; } +// Each load can go into up to 4 destinations, any of which might potentially +// be dead (i.e. a hole). These can always be split into 2 loads, independent +// of where the holes are. We find the first contiguous region, put it into +// the first load, and then put the second contiguous region into the second +// load. There can be at most 2 contiguous regions. +// +// Note that there are some restrictions, for example it's not possible to do +// a 64-bit load that's not 64-bit aligned, so such a load has to be split +// up. Also hardware doesn't support 96-bit loads, so those also have to be +// split into a 64-bit and 32-bit load. void DeadCodeElim::checkSplitLoad(Instruction *ld1) { @@ -2929,6 +2948,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1) addr1 = ld1->getSrc(0)->reg.data.offset; n1 = n2 = 0; size1 = size2 = 0; + + // Compute address/width for first load for (d = 0; ld1->defExists(d); ++d) { if (mask & (1 << d)) { if (size1 && (addr1 & 0x7)) @@ -2942,16 +2963,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1) break; } } + + // Scale back the size of the first load until it can be loaded. This + // typically happens for TYPE_B96 loads. + while (n1 && + !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file, + typeOfSize(size1))) { + size1 -= def1[--n1]->reg.size; + d--; + } + + // Compute address/width for second load for (addr2 = addr1 + size1; ld1->defExists(d); ++d) { if (mask & (1 << d)) { + assert(!size2 || !(addr2 & 0x7)); def2[n2] = ld1->getDef(d); size2 += def2[n2++]->reg.size; - } else { + } else if (!n2) { assert(!n2); addr2 += ld1->getDef(d)->reg.size; + } else { + break; } } + // Make sure that we've processed all the values + for (; ld1->defExists(d); ++d) + assert(!(mask & (1 << d))); + updateLdStOffset(ld1, addr1, func); ld1->setType(typeOfSize(size1)); for (d = 0; d < 4; ++d) diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 7859c8e79..41d2cc916 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -1573,10 +1573,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval) Instruction *st; if (slot->reg.file == FILE_MEMORY_LOCAL) { - st = new_Instruction(func, OP_STORE, ty); - st->setSrc(0, slot); - st->setSrc(1, lval); lval->noSpill = 1; + if (ty != TYPE_B96) { + st = new_Instruction(func, OP_STORE, ty); + st->setSrc(0, slot); + st->setSrc(1, lval); + } else { + st = new_Instruction(func, OP_SPLIT, ty); + st->setSrc(0, lval); + for (int d = 0; d < lval->reg.size / 4; ++d) + st->setDef(d, new_LValue(func, FILE_GPR)); + + for (int d = lval->reg.size / 4 - 1; d >= 0; --d) { + Value *tmp = cloneShallow(func, slot); + tmp->reg.size = 4; + tmp->reg.data.offset += 4 * d; + + Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32); + s->setSrc(0, tmp); + s->setSrc(1, st->getDef(d)); + defi->bb->insertAfter(defi, s); + } + } } else { st = new_Instruction(func, OP_CVT, ty); st->setDef(0, slot); @@ -1596,7 +1614,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot) Instruction *ld; if (slot->reg.file == FILE_MEMORY_LOCAL) { lval->noSpill = 1; - ld = new_Instruction(func, OP_LOAD, ty); + if (ty != TYPE_B96) { + ld = new_Instruction(func, OP_LOAD, ty); + } else { + ld = new_Instruction(func, OP_MERGE, ty); + for (int d = 0; d < lval->reg.size / 4; ++d) { + Value *tmp = cloneShallow(func, slot); + LValue *val; + tmp->reg.size = 4; + tmp->reg.data.offset += 4 * d; + + Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32); + l->setDef(0, (val = new_LValue(func, FILE_GPR))); + l->setSrc(0, tmp); + usei->bb->insertBefore(usei, l); + ld->setSrc(d, val); + val->noSpill = 1; + } + ld->setDef(0, lval); + usei->bb->insertBefore(usei, ld); + return lval; + } } else { ld = new_Instruction(func, OP_CVT, ty); } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index f3ddcaa51..76a76545c 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -454,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const return false; } } - if (s >= 3) + if (s >= opInfo[insn->op].srcNr || s >= 3) return false; return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 27df0eba6..3b4e8025b 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -426,7 +426,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const return false; } } - if (s >= 3) + if (s >= opInfo[insn->op].srcNr || s >= 3) return false; return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; } diff --git a/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c b/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c index 72e070b5f..371e1ec76 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -656,8 +656,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen, if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | PIPE_RESOURCE_FLAG_MAP_COHERENT)) { buffer->domain = NOUVEAU_BO_GART; - } else if (buffer->base.bind & - (screen->vidmem_bindings & screen->sysmem_bindings)) { + } else if (buffer->base.bind == 0 || (buffer->base.bind & + (screen->vidmem_bindings & screen->sysmem_bindings))) { switch (buffer->base.usage) { case PIPE_USAGE_DEFAULT: case PIPE_USAGE_IMMUTABLE: @@ -684,6 +684,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen, if (buffer->base.bind & screen->sysmem_bindings) buffer->domain = NOUVEAU_BO_GART; } + /* There can be very special situations where we want non-gpu-mapped + * buffers, but never through this interface. + */ + assert(buffer->domain); ret = nouveau_buffer_allocate(screen, buffer, buffer->domain); if (ret == false) diff --git a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c index 152c2ce13..2710b7524 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -159,9 +159,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, int ref) { struct nv50_context *nv50 = nv50_context(&ctx->pipe); + unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER; unsigned s, i; - if (res->bind & PIPE_BIND_RENDER_TARGET) { + if (bind & PIPE_BIND_RENDER_TARGET) { assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS); for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) { if (nv50->framebuffer.cbufs[i] && @@ -173,7 +174,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, } } } - if (res->bind & PIPE_BIND_DEPTH_STENCIL) { + if (bind & PIPE_BIND_DEPTH_STENCIL) { if (nv50->framebuffer.zsbuf && nv50->framebuffer.zsbuf->texture == res) { nv50->dirty |= NV50_NEW_FRAMEBUFFER; @@ -183,11 +184,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, } } - if (res->bind & (PIPE_BIND_VERTEX_BUFFER | - PIPE_BIND_INDEX_BUFFER | - PIPE_BIND_CONSTANT_BUFFER | - PIPE_BIND_STREAM_OUTPUT | - PIPE_BIND_SAMPLER_VIEW)) { + if (bind & (PIPE_BIND_VERTEX_BUFFER | + PIPE_BIND_INDEX_BUFFER | + PIPE_BIND_CONSTANT_BUFFER | + PIPE_BIND_STREAM_OUTPUT | + PIPE_BIND_SAMPLER_VIEW)) { assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS); for (i = 0; i < nv50->num_vtxbufs; ++i) { diff --git a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c index 410e6311e..335d95259 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -960,6 +960,9 @@ nv50_set_vertex_buffers(struct pipe_context *pipe, struct nv50_context *nv50 = nv50_context(pipe); unsigned i; + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX); + nv50->dirty |= NV50_NEW_ARRAYS; + util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb, start_slot, count); @@ -983,10 +986,6 @@ nv50_set_vertex_buffers(struct pipe_context *pipe, nv50->vbo_constant &= ~(1 << dst_index); } } - - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX); - - nv50->dirty |= NV50_NEW_ARRAYS; } static void diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index 7a15a11f5..a0b9a54d9 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx, int ref) { struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe); + unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER; unsigned s, i; - if (res->bind & PIPE_BIND_RENDER_TARGET) { + if (bind & PIPE_BIND_RENDER_TARGET) { for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) { if (nvc0->framebuffer.cbufs[i] && nvc0->framebuffer.cbufs[i]->texture == res) { @@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx, } } } - if (res->bind & PIPE_BIND_DEPTH_STENCIL) { + if (bind & PIPE_BIND_DEPTH_STENCIL) { if (nvc0->framebuffer.zsbuf && nvc0->framebuffer.zsbuf->texture == res) { nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; @@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx, } } - if (res->bind & (PIPE_BIND_VERTEX_BUFFER | - PIPE_BIND_INDEX_BUFFER | - PIPE_BIND_CONSTANT_BUFFER | - PIPE_BIND_STREAM_OUTPUT | - PIPE_BIND_COMMAND_ARGS_BUFFER | - PIPE_BIND_SAMPLER_VIEW)) { + if (bind & (PIPE_BIND_VERTEX_BUFFER | + PIPE_BIND_INDEX_BUFFER | + PIPE_BIND_CONSTANT_BUFFER | + PIPE_BIND_STREAM_OUTPUT | + PIPE_BIND_COMMAND_ARGS_BUFFER | + PIPE_BIND_SAMPLER_VIEW)) { for (i = 0; i < nvc0->num_vtxbufs; ++i) { if (nvc0->vtxbuf[i].buffer == res) { nvc0->dirty |= NVC0_NEW_ARRAYS; diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 191e3b727..139a5039b 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -417,6 +417,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) if (screen->pm.prog) { screen->pm.prog->code = NULL; /* hardcoded, don't FREE */ nvc0_program_destroy(NULL, screen->pm.prog); + FREE(screen->pm.prog); } nouveau_bo_ref(NULL, &screen->text); diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index c5bfd0395..310e30f52 100644 --- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -998,6 +998,9 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe, struct nvc0_context *nvc0 = nvc0_context(pipe); unsigned i; + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); + nvc0->dirty |= NVC0_NEW_ARRAYS; + util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb, start_slot, count); @@ -1021,9 +1024,6 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe, nvc0->constant_vbos &= ~(1 << dst_index); } } - - nvc0->dirty |= NVC0_NEW_ARRAYS; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX); } static void diff --git a/lib/mesa/src/gallium/drivers/r600/evergreen_state.c b/lib/mesa/src/gallium/drivers/r600/evergreen_state.c index 2dc381178..1976d873f 100644 --- a/lib/mesa/src/gallium/drivers/r600/evergreen_state.c +++ b/lib/mesa/src/gallium/drivers/r600/evergreen_state.c @@ -1527,12 +1527,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */ radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) | S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */ - r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1)); + r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) | + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1)); } else { r600_write_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2); radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */ radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */ - r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0); + r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1)); } } diff --git a/lib/mesa/src/gallium/drivers/r600/r600_pipe.h b/lib/mesa/src/gallium/drivers/r600/r600_pipe.h index bb91f8308..896e3f71e 100644 --- a/lib/mesa/src/gallium/drivers/r600/r600_pipe.h +++ b/lib/mesa/src/gallium/drivers/r600/r600_pipe.h @@ -57,7 +57,7 @@ /* the number of CS dwords for flushing and drawing */ #define R600_MAX_FLUSH_CS_DWORDS 16 -#define R600_MAX_DRAW_CS_DWORDS 47 +#define R600_MAX_DRAW_CS_DWORDS 52 #define R600_TRACE_CS_DWORDS 7 #define R600_MAX_USER_CONST_BUFFERS 13 diff --git a/lib/mesa/src/gallium/drivers/r600/r600_shader.c b/lib/mesa/src/gallium/drivers/r600/r600_shader.c index 819f9a798..911e81fed 100644 --- a/lib/mesa/src/gallium/drivers/r600/r600_shader.c +++ b/lib/mesa/src/gallium/drivers/r600/r600_shader.c @@ -598,6 +598,106 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back return 0; } +/* execute a single slot ALU calculation */ +static int single_alu_op2(struct r600_shader_ctx *ctx, int op, + int dst_sel, int dst_chan, + int src0_sel, unsigned src0_chan_val, + int src1_sel, unsigned src1_chan_val) +{ + struct r600_bytecode_alu alu; + int r, i; + + if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) { + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.src[0].sel = src0_sel; + if (src0_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[0].value = src0_chan_val; + else + alu.src[0].chan = src0_chan_val; + alu.src[1].sel = src1_sel; + if (src1_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[1].value = src1_chan_val; + else + alu.src[1].chan = src1_chan_val; + alu.dst.sel = dst_sel; + alu.dst.chan = i; + alu.dst.write = i == dst_chan; + alu.last = (i == 3); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; + } + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.src[0].sel = src0_sel; + if (src0_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[0].value = src0_chan_val; + else + alu.src[0].chan = src0_chan_val; + alu.src[1].sel = src1_sel; + if (src1_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[1].value = src1_chan_val; + else + alu.src[1].chan = src1_chan_val; + alu.dst.sel = dst_sel; + alu.dst.chan = dst_chan; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +/* execute a single slot ALU calculation */ +static int single_alu_op3(struct r600_shader_ctx *ctx, int op, + int dst_sel, int dst_chan, + int src0_sel, unsigned src0_chan_val, + int src1_sel, unsigned src1_chan_val, + int src2_sel, unsigned src2_chan_val) +{ + struct r600_bytecode_alu alu; + int r; + + /* validate this for other ops */ + assert(op == ALU_OP3_MULADD_UINT24); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.src[0].sel = src0_sel; + if (src0_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[0].value = src0_chan_val; + else + alu.src[0].chan = src0_chan_val; + alu.src[1].sel = src1_sel; + if (src1_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[1].value = src1_chan_val; + else + alu.src[1].chan = src1_chan_val; + alu.src[2].sel = src2_sel; + if (src2_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[2].value = src2_chan_val; + else + alu.src[2].chan = src2_chan_val; + alu.dst.sel = dst_sel; + alu.dst.chan = dst_chan; + alu.is_op3 = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index) +{ + return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg; +} + static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid) { int i; @@ -1129,6 +1229,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi unsigned vtx_id = src->Dimension.Index; int offset_reg = vtx_id / 3; int offset_chan = vtx_id % 3; + int t2 = 0; /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y, * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */ @@ -1136,13 +1237,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi if (offset_reg == 0 && offset_chan == 2) offset_chan = 3; + if (src->Dimension.Indirect || src->Register.Indirect) + t2 = r600_get_temp(ctx); + if (src->Dimension.Indirect) { int treg[3]; - int t2; struct r600_bytecode_alu alu; int r, i; - - /* you have got to be shitting me - + unsigned addr_reg; + addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index); + if (src->DimIndirect.Index > 0) { + r = single_alu_op2(ctx, ALU_OP1_MOV, + ctx->bc->ar_reg, 0, + addr_reg, 0, + 0, 0); + if (r) + return r; + } + /* we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt. at least this is what fglrx seems to do. */ for (i = 0; i < 3; i++) { @@ -1150,7 +1262,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi } r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F); - t2 = r600_get_temp(ctx); for (i = 0; i < 3; i++) { memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; @@ -1175,8 +1286,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi if (r) return r; offset_reg = t2; + offset_chan = 0; } + if (src->Register.Indirect) { + int addr_reg; + unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID]; + + addr_reg = get_address_file_reg(ctx, src->Indirect.Index); + + /* pull the value from index_reg */ + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + t2, 1, + addr_reg, 0, + V_SQ_ALU_SRC_LITERAL, first); + if (r) + return r; + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + t2, 0, + t2, 1, + V_SQ_ALU_SRC_LITERAL, 4, + offset_reg, offset_chan); + if (r) + return r; + offset_reg = t2; + offset_chan = 0; + index = src->Register.Index - first; + } memset(&vtx, 0, sizeof(vtx)); vtx.buffer_id = R600_GS_RING_CONST_BUFFER; @@ -1222,6 +1358,7 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) fetch_gs_input(ctx, src, treg); ctx->src[i].sel = treg; + ctx->src[i].rel = 0; } } return 0; @@ -1972,7 +2109,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.nliterals = 0; ctx.literals = NULL; - shader->fs_write_all = FALSE; + + shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && + ctx.info.colors_written == 1; if (shader->vs_as_gs_a) vs_add_primid_output(&ctx, key.vs.prim_id_out); @@ -2003,10 +2142,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, case TGSI_TOKEN_TYPE_PROPERTY: property = &ctx.parse.FullToken.FullProperty; switch (property->Property.PropertyName) { - case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: - if (property->u[0].Data == 1) - shader->fs_write_all = TRUE; - break; case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION: if (property->u[0].Data == 1) shader->vs_position_window_space = TRUE; @@ -2159,6 +2294,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, struct r600_bytecode_alu alu; int r; + /* GS thread with no output workaround - emit a cut at start of GS */ + if (ctx.bc->chip_class == R600) + r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; @@ -6671,7 +6810,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx) struct r600_bytecode_alu alu; int r; int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); - unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg; + unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index); assert(inst->Dst[0].Register.Index < 3); memset(&alu, 0, sizeof(struct r600_bytecode_alu)); diff --git a/lib/mesa/src/gallium/drivers/r600/r600_state.c b/lib/mesa/src/gallium/drivers/r600/r600_state.c index 2c727815d..a588e16e5 100644 --- a/lib/mesa/src/gallium/drivers/r600/r600_state.c +++ b/lib/mesa/src/gallium/drivers/r600/r600_state.c @@ -2181,10 +2181,11 @@ void r600_init_atom_start_cs(struct r600_context *rctx) num_temp_gprs = 4; num_gs_gprs = 0; num_es_gprs = 0; - num_ps_threads = 136; - num_vs_threads = 48; - num_gs_threads = 4; - num_es_threads = 4; + /* use limits 40 VS and at least 16 ES/GS */ + num_ps_threads = 120; + num_vs_threads = 40; + num_gs_threads = 16; + num_es_threads = 16; num_ps_stack_entries = 40; num_vs_stack_entries = 40; num_gs_stack_entries = 32; @@ -2643,6 +2644,9 @@ void r600_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport); } +#define RV610_GSVS_ALIGN 32 +#define R600_GSVS_ALIGN 16 + void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *shader) { struct r600_context *rctx = (struct r600_context *)ctx; @@ -2652,6 +2656,23 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha unsigned gsvs_itemsize = (cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2; + /* some r600s needs gsvs itemsize aligned to cacheline size + this was fixed in rs780 and above. */ + switch (rctx->b.family) { + case CHIP_RV610: + gsvs_itemsize = align(gsvs_itemsize, RV610_GSVS_ALIGN); + break; + case CHIP_R600: + case CHIP_RV630: + case CHIP_RV670: + case CHIP_RV620: + case CHIP_RV635: + gsvs_itemsize = align(gsvs_itemsize, R600_GSVS_ALIGN); + break; + default: + break; + } + r600_init_command_buffer(cb, 64); /* VGT_GS_MODE is written by r600_emit_shader_stages */ diff --git a/lib/mesa/src/gallium/drivers/r600/r600_state_common.c b/lib/mesa/src/gallium/drivers/r600/r600_state_common.c index bdd9337ec..2eebb5792 100644 --- a/lib/mesa/src/gallium/drivers/r600/r600_state_common.c +++ b/lib/mesa/src/gallium/drivers/r600/r600_state_common.c @@ -1691,6 +1691,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info (info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0); } + /* SMX returns CONTEXT_DONE too early workaround */ + if (rctx->b.family == CHIP_R600 || + rctx->b.family == CHIP_RV610 || + rctx->b.family == CHIP_RV630 || + rctx->b.family == CHIP_RV635) { + /* if we have gs shader or streamout + we need to do a wait idle after every draw */ + if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) { + r600_write_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1)); + } + } + + /* ES ring rolling over at EOP - workaround */ + if (rctx->b.chip_class == R600) { + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT); + } + if (rctx->screen->b.trace_bo) { r600_trace_emit(rctx); } diff --git a/lib/mesa/src/gallium/drivers/r600/r600d.h b/lib/mesa/src/gallium/drivers/r600/r600d.h index bce8b4ea0..4b44546b6 100644 --- a/lib/mesa/src/gallium/drivers/r600/r600d.h +++ b/lib/mesa/src/gallium/drivers/r600/r600d.h @@ -130,6 +130,7 @@ #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20 #define EVENT_TYPE_FLUSH_AND_INV_DB_META 0x2c /* supported on r700+ */ #define EVENT_TYPE_VGT_FLUSH 0x24 +#define EVENT_TYPE_SQ_NON_EVENT 0x26 #define EVENT_TYPE_FLUSH_AND_INV_CB_META 46 /* supported on r700+ */ #define EVENT_TYPE(x) ((x) << 0) #define EVENT_INDEX(x) ((x) << 8) diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am index 13d8976de..a6fc145cb 100644 --- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am +++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am @@ -16,7 +16,8 @@ libradeon_la_SOURCES = \ if NEED_RADEON_LLVM AM_CFLAGS += \ - $(LLVM_CFLAGS) + $(LLVM_CFLAGS) \ + $(LIBELF_CFLAGS) libradeon_la_SOURCES += \ $(LLVM_C_FILES) @@ -24,7 +25,7 @@ libradeon_la_SOURCES += \ libradeon_la_LIBADD = \ $(CLOCK_LIB) \ $(LLVM_LIBS) \ - $(ELF_LIB) + $(LIBELF_LIBS) libradeon_la_LDFLAGS = \ $(LLVM_LDFLAGS) diff --git a/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c b/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c index 12a5f6047..60cf2db5d 100644 --- a/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c +++ b/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c @@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1)); r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, - EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1)); + EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) | + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1)); } else if (overrast_samples > 1) { r600_write_context_reg(cs, CM_R_028804_DB_EQAA, S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) | S_028804_OVERRASTERIZATION_AMOUNT(log_samples)); - r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0); + r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1)); } } else { r600_write_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2); @@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, r600_write_context_reg(cs, CM_R_028804_DB_EQAA, S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1)); - r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0); + r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1)); } } diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c index 495fda0a8..62f7647d3 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c @@ -226,8 +226,8 @@ bool r600_common_context_init(struct r600_common_context *rctx, rctx->family = rscreen->family; rctx->chip_class = rscreen->chip_class; - if (rscreen->family == CHIP_HAWAII) - rctx->max_db = 16; + if (rscreen->chip_class >= CIK) + rctx->max_db = MAX2(8, rscreen->info.r600_num_backends); else if (rscreen->chip_class >= EVERGREEN) rctx->max_db = 8; else @@ -543,10 +543,11 @@ const char *r600_get_llvm_processor_name(enum radeon_family family) case CHIP_TONGA: return "tonga"; case CHIP_ICELAND: return "iceland"; case CHIP_CARRIZO: return "carrizo"; - case CHIP_FIJI: return "fiji"; #if HAVE_LLVM <= 0x0307 + case CHIP_FIJI: return "tonga"; case CHIP_STONEY: return "carrizo"; #else + case CHIP_FIJI: return "fiji"; case CHIP_STONEY: return "stoney"; #endif default: return ""; diff --git a/lib/mesa/src/gallium/drivers/radeon/r600d_common.h b/lib/mesa/src/gallium/drivers/radeon/r600d_common.h index 115042d15..7512c7232 100644 --- a/lib/mesa/src/gallium/drivers/radeon/r600d_common.h +++ b/lib/mesa/src/gallium/drivers/radeon/r600d_common.h @@ -168,6 +168,8 @@ #define EG_R_028A4C_PA_SC_MODE_CNTL_1 0x028A4C #define EG_S_028A4C_PS_ITER_SAMPLE(x) (((x) & 0x1) << 16) +#define EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) & 0x1) << 25) +#define EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x) (((x) & 0x1) << 26) #define CM_R_028804_DB_EQAA 0x00028804 #define S_028804_MAX_ANCHOR_SAMPLES(x) (((x) & 0x7) << 0) diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c index 55c216aa5..3a5d9f444 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c @@ -951,6 +951,8 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.db_pitch = dec->base.width; dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); + if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY) + dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2; switch (u_reduce_video_profile(picture->profile)) { case PIPE_VIDEO_FORMAT_MPEG4_AVC: diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h index 452fbd608..756f69828 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h @@ -385,7 +385,10 @@ struct ruvd_msg { uint32_t dt_chroma_top_offset; uint32_t dt_chroma_bottom_offset; uint32_t dt_surf_tile_config; - uint32_t dt_reserved[3]; + uint32_t dt_uv_surf_tile_config; + // re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney + uint32_t dt_wa_chroma_top_offset; + uint32_t dt_wa_chroma_bottom_offset; uint32_t reserved[16]; diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c index 7eab974a3..bb1f9de50 100644 --- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c +++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c @@ -388,6 +388,11 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, struct radeon_surf *tmp_surf; unsigned cpb_size; + if (rscreen->info.family == CHIP_STONEY) { + RVID_ERR("Stoney VCE is not supported!\n"); + return NULL; + } + if (!rscreen->info.vce_fw_version) { RVID_ERR("Kernel doesn't supports VCE!\n"); return NULL; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c index d4fe56536..748bf5acb 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c @@ -33,14 +33,6 @@ #include "sid.h" #define MAX_GLOBAL_BUFFERS 20 -#if HAVE_LLVM < 0x0305 -#define NUM_USER_SGPRS 2 -#else -/* XXX: Even though we don't pass the scratch buffer via user sgprs any more - * LLVM still expects that we specify 4 USER_SGPRS so it can remain compatible - * with older mesa. */ -#define NUM_USER_SGPRS 4 -#endif struct si_compute { struct si_context *ctx; @@ -241,7 +233,6 @@ static void si_launch_grid( uint64_t kernel_args_va; uint64_t scratch_buffer_va = 0; uint64_t shader_va; - unsigned arg_user_sgpr_count = NUM_USER_SGPRS; unsigned i; struct si_shader *shader = &program->shader; unsigned lds_blocks; @@ -365,20 +356,7 @@ static void si_launch_grid( si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff); si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40); - si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, - /* We always use at least 3 VGPRS, these come from - * TIDIG_COMP_CNT. - * XXX: The compiler should account for this. - */ - S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4) - /* We always use at least 4 + arg_user_sgpr_count. The 4 extra - * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN - * XXX: The compiler should account for this. - */ - | S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count, - shader->num_sgprs)) - 1) / 8) - | S_00B028_FLOAT_MODE(shader->float_mode)) - ; + si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1); lds_blocks = shader->lds_size; /* XXX: We are over allocating LDS. For SI, the shader reports LDS in @@ -394,17 +372,10 @@ static void si_launch_grid( assert(lds_blocks <= 0xFF); - si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, - S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0) - | S_00B84C_USER_SGPR(arg_user_sgpr_count) - | S_00B84C_TGID_X_EN(1) - | S_00B84C_TGID_Y_EN(1) - | S_00B84C_TGID_Z_EN(1) - | S_00B84C_TG_SIZE_EN(1) - | S_00B84C_TIDIG_COMP_CNT(2) - | S_00B84C_LDS_SIZE(lds_blocks) - | S_00B84C_EXCP_EN(0)) - ; + shader->rsrc2 &= C_00B84C_LDS_SIZE; + shader->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); + + si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2); si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0); si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c index ef986bd2e..2f9e00975 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c @@ -637,6 +637,14 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base, lp_build_const_int32(gallivm, swizzle)); value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr); + if (type == TGSI_TYPE_DOUBLE) { + LLVMValueRef value2; + dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr, + lp_build_const_int32(gallivm, swizzle + 1)); + value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr); + return radeon_llvm_emit_fetch_double(bld_base, value, value2); + } + return LLVMBuildBitCast(gallivm->builder, value, tgsi2llvmtype(bld_base, type), ""); } @@ -3752,12 +3760,14 @@ void si_shader_binary_read_config(const struct si_screen *sscreen, shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8); shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4); shader->float_mode = G_00B028_FLOAT_MODE(value); + shader->rsrc1 = value; break; case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value)); break; case R_00B84C_COMPUTE_PGM_RSRC2: shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value)); + shader->rsrc2 = value; break; case R_0286CC_SPI_PS_INPUT_ENA: shader->spi_ps_input_ena = value; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h index 511ee7333..5f4944a96 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h @@ -268,8 +268,8 @@ struct si_shader { bool is_gs_copy_shader; bool dx10_clamp_mode; /* convert NaNs to 0 */ - unsigned ls_rsrc1; - unsigned ls_rsrc2; + unsigned rsrc1; + unsigned rsrc2; }; static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx) diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c index 81e138233..af96d8680 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c @@ -2979,6 +2979,28 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw, si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo); } +static void si_init_border_color_buffer(struct si_context *sctx) +{ + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + if (!pm4) + return; + + assert(sctx->scratch_buffer == NULL); + r600_resource_reference(&sctx->scratch_buffer, NULL); + sctx->scratch_buffer = si_resource_create_custom(&sctx->screen->b.b, + PIPE_USAGE_DEFAULT, + 4096 * 16); + + uint64_t va = sctx->scratch_buffer->gpu_address; + + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, va >> 8); + if (sctx->b.chip_class >= CIK) + si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, va >> 40); + si_pm4_add_bo(pm4, sctx->scratch_buffer, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_DATA); + si_pm4_set_state(sctx, ta_bordercolor_base, pm4); +} + static void si_init_config(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx) @@ -3045,6 +3067,7 @@ void si_init_state_functions(struct si_context *sctx) } si_init_config(sctx); + si_init_border_color_buffer(sctx); } static void diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c index e0394cf00..a15981de3 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c @@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; lds_size = output_patch0_offset + output_patch_size * *num_patches; - ls_rsrc2 = ls->current->ls_rsrc2; + ls_rsrc2 = ls->current->rsrc2; if (sctx->b.chip_class >= CIK) { assert(lds_size <= 65536); @@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII) si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(cs, ls->current->ls_rsrc1); + radeon_emit(cs, ls->current->rsrc1); radeon_emit(cs, ls_rsrc2); /* Compute userdata SGPRs. */ @@ -216,6 +216,18 @@ static void si_emit_derived_tess_state(struct si_context *sctx, radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26)); } +static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info) +{ + switch (info->mode) { + case PIPE_PRIM_PATCHES: + return info->count / info->vertices_per_patch; + case R600_PRIM_RECTANGLE_LIST: + return info->count / 3; + default: + return u_prims_for_vertices(info->mode, info->count); + } +} + static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info, unsigned num_patches) @@ -305,7 +317,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi && (info->indirect || (info->instance_count > 1 && - u_prims_for_vertices(info->mode, info->count) <= 1))) + si_num_prims_for_vertices(info) <= 1))) sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; /* Instancing bug on 2 SE chips. */ @@ -849,7 +861,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* Workaround for a VGT hang when streamout is enabled. * It must be done after drawing. */ - if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) && + if ((sctx->b.family == CHIP_HAWAII || + sctx->b.family == CHIP_TONGA || + sctx->b.family == CHIP_FIJI) && (sctx->b.streamout.streamout_enabled || sctx->b.streamout.prims_gen_query_enabled)) { sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c index f9b38ed53..1ba9c8595 100644 --- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -119,10 +119,10 @@ static void si_shader_ls(struct si_shader *shader) si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40); - shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | + shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) | S_00B528_SGPRS((num_sgprs - 1) / 8) | S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt); - shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | + shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) | S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0); } diff --git a/lib/mesa/src/gallium/targets/opencl/Makefile.am b/lib/mesa/src/gallium/targets/opencl/Makefile.am index c78b26832..93b4d3a83 100644 --- a/lib/mesa/src/gallium/targets/opencl/Makefile.am +++ b/lib/mesa/src/gallium/targets/opencl/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/src/gallium/Automake.inc lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la +AM_CPPFLAGS = \ + $(LIBELF_CFLAGS) + lib@OPENCL_LIBNAME@_la_LDFLAGS = \ $(LLVM_LDFLAGS) \ -no-undefined \ @@ -20,8 +23,8 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/util/libmesautil.la \ $(GALLIUM_PIPE_LOADER_WINSYS_LIBS) \ - $(ELF_LIB) \ - -ldl \ + $(LIBELF_LIBS) \ + $(DLOPEN_LIBS) \ -lclangCodeGen \ -lclangFrontendTool \ -lclangFrontend \ |