38 files changed, 436 insertions, 124 deletions
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 2e1d712a2..149a197ae 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -153,7 +153,7 @@ enum a4xx_vtx_fmt {
 
 enum a4xx_tex_fmt {
 	TFMT4_5_6_5_UNORM = 11,
-	TFMT4_5_5_5_1_UNORM = 10,
+	TFMT4_5_5_5_1_UNORM = 9,
 	TFMT4_4_4_4_4_UNORM = 8,
 	TFMT4_X8Z24_UNORM = 71,
 	TFMT4_10_10_10_2_UNORM = 33,
@@ -2718,6 +2718,12 @@ static inline uint32_t A4XX_TEX_SAMP_0_ANISO(enum a4xx_tex_aniso val)
 {
 	return ((val) << A4XX_TEX_SAMP_0_ANISO__SHIFT) & A4XX_TEX_SAMP_0_ANISO__MASK;
 }
+#define A4XX_TEX_SAMP_0_LOD_BIAS__MASK				0xfff80000
+#define A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT				19
+static inline uint32_t A4XX_TEX_SAMP_0_LOD_BIAS(float val)
+{
+	return ((((int32_t)(val * 256.0))) << A4XX_TEX_SAMP_0_LOD_BIAS__SHIFT) & A4XX_TEX_SAMP_0_LOD_BIAS__MASK;
+}
 
 #define REG_A4XX_TEX_SAMP_1					0x00000001
 #define A4XX_TEX_SAMP_1_COMPARE_FUNC__MASK			0x0000000e
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index a3d7123cc..47e8855e7 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -250,14 +250,6 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 		}
 	}
 
-	/* adjust regids for alpha output formats. there is no alpha render
-	 * format, so it's just treated like red
-	 */
-	for (i = 0; i < nr; i++)
-		if (util_format_is_alpha(pipe_surface_format(bufs[i])))
-			color_regid[i] += 3;
-
-
 	/* TODO get these dynamically: */
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index 213b29c91..7a28e09b3 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -111,6 +111,7 @@ fd4_sampler_state_create(struct pipe_context *pctx,
 		COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS);
 
 	if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+		so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias);
 		so->texsamp1 |=
 			A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) |
 			A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod);
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
index fa8ee072a..9f0e07333 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -291,7 +291,7 @@ void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)
 
    if (b->prev)
       b->prev->next = b;
-   if (a->prev)
+   if (a->next)
       a->next->prev = a;
 }
 
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 8f1542959..fa9336283 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -575,8 +575,8 @@ CodeEmitterGK110::emitIMUL(const Instruction *i)
    if (isLIMM(i->src(1), TYPE_S32)) {
       emitForm_L(i, 0x280, 2, Modifier(0));
 
-      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
-
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 24;
       if (i->sType == TYPE_S32)
          code[1] |= 3 << 25;
    } else {
@@ -695,14 +695,9 @@ CodeEmitterGK110::emitIMAD(const Instruction *i)
    if (i->sType == TYPE_S32)
       code[1] |= (1 << 19) | (1 << 24);
 
-   if (code[0] & 0x1) {
-      assert(!i->subOp);
-      SAT_(39);
-   } else {
-      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-         code[1] |= 1 << 25;
-      SAT_(35);
-   }
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[1] |= 1 << 25;
+   SAT_(35);
 }
 
 void
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 6bf5219d3..f88f4c2e9 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2322,6 +2322,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
    case OP_PFETCH:
       emitPFETCH(insn);
       break;
+   case OP_AFETCH:
+      emitAFETCH(insn);
+      break;
    case OP_EMIT:
    case OP_RESTART:
       emitOUT(insn);
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index f153674e9..f4ff33164 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2870,6 +2870,12 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
          bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
       }
       setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+
+      // If the loop never breaks (e.g. only has RETs inside), then there
+      // will be no way to get to the break bb. However BGNLOOP will have
+      // already made a PREBREAK to it, so it must be in the CFG.
+      if (getBB()->cfg.incidentCount() == 0)
+         loopBB->cfg.attach(&getBB()->cfg, Graph::Edge::TREE);
    }
       break;
    case TGSI_OPCODE_BRK:
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index d87cdfff8..4f4320b4d 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -202,7 +202,8 @@ NV50LegalizePostRA::visit(Function *fn)
    Program *prog = fn->getProgram();
 
    r63 = new_LValue(fn, FILE_GPR);
-   if (prog->maxGPR < 63)
+   // GPR units on nv50 are in half-regs
+   if (prog->maxGPR < 126)
       r63->reg.data.id = 63;
    else
       r63->reg.data.id = 127;
@@ -831,7 +832,7 @@ NV50LoweringPreSSA::handleTXB(TexInstruction *i)
    }
    Value *flags = bld.getScratch(1, FILE_FLAGS);
    bld.setPosition(cond, true);
-   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
 
    Instruction *tex[4];
    for (l = 0; l < 4; ++l) {
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index b1f406585..0f575f2ee 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -686,7 +686,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
          i->tex.s = 0x1f;
          i->setIndirectR(hnd);
          i->setIndirectS(NULL);
-      } else if (i->tex.r == i->tex.s) {
+      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
          i->tex.r += prog->driver->io.texBindBase / 4;
          i->tex.s  = 0; // only a single cX[] value possible here
       } else {
@@ -962,11 +962,14 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 bool
 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
 {
+   const int chipset = prog->getTarget()->getChipset();
+   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
+      txq->tex.r += prog->driver->io.texBindBase / 4;
+
    if (txq->tex.rIndirectSrc < 0)
       return true;
 
    Value *ticRel = txq->getIndirectR();
-   const int chipset = prog->getTarget()->getChipset();
 
    txq->setIndirectS(NULL);
    txq->tex.sIndirectSrc = -1;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 44f74c613..13f36d0cf 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -842,6 +842,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
          i->src(0).mod = i->src(t).mod;
          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
          i->src(1).mod = 0;
+      } else
+      if (i->postFactor && i->sType == TYPE_F32) {
+         /* Can't emit a postfactor with an immediate, have to fold it in */
+         i->setSrc(s, new_ImmediateValue(
+                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+         i->postFactor = 0;
       }
       break;
    case OP_MAD:
@@ -2606,8 +2612,11 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
              i->getSrc(0)->reg.data.id >= 64)
             break;
 
+         if (i->getPredicate())
+            break;
+
          def = i->getSrc(1)->getInsn();
-         if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
             vtmp = i->getSrc(1);
             i->setSrc(1, def->getSrc(0));
 
@@ -2909,6 +2918,16 @@ DeadCodeElim::visit(BasicBlock *bb)
    return true;
 }
 
+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
 void
 DeadCodeElim::checkSplitLoad(Instruction *ld1)
 {
@@ -2929,6 +2948,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
    addr1 = ld1->getSrc(0)->reg.data.offset;
    n1 = n2 = 0;
    size1 = size2 = 0;
+
+   // Compute address/width for first load
    for (d = 0; ld1->defExists(d); ++d) {
       if (mask & (1 << d)) {
          if (size1 && (addr1 & 0x7))
@@ -2942,16 +2963,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
          break;
       }
    }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
       if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
          def2[n2] = ld1->getDef(d);
          size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
          assert(!n2);
          addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
       }
    }
 
+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
    updateLdStOffset(ld1, addr1, func);
    ld1->setType(typeOfSize(size1));
    for (d = 0; d < 4; ++d)
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 7859c8e79..41d2cc916 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1573,10 +1573,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
 
    Instruction *st;
    if (slot->reg.file == FILE_MEMORY_LOCAL) {
-      st = new_Instruction(func, OP_STORE, ty);
-      st->setSrc(0, slot);
-      st->setSrc(1, lval);
       lval->noSpill = 1;
+      if (ty != TYPE_B96) {
+         st = new_Instruction(func, OP_STORE, ty);
+         st->setSrc(0, slot);
+         st->setSrc(1, lval);
+      } else {
+         st = new_Instruction(func, OP_SPLIT, ty);
+         st->setSrc(0, lval);
+         for (int d = 0; d < lval->reg.size / 4; ++d)
+            st->setDef(d, new_LValue(func, FILE_GPR));
+
+         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+            Value *tmp = cloneShallow(func, slot);
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+            s->setSrc(0, tmp);
+            s->setSrc(1, st->getDef(d));
+            defi->bb->insertAfter(defi, s);
+         }
+      }
    } else {
       st = new_Instruction(func, OP_CVT, ty);
       st->setDef(0, slot);
@@ -1596,7 +1614,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
    Instruction *ld;
    if (slot->reg.file == FILE_MEMORY_LOCAL) {
       lval->noSpill = 1;
-      ld = new_Instruction(func, OP_LOAD, ty);
+      if (ty != TYPE_B96) {
+         ld = new_Instruction(func, OP_LOAD, ty);
+      } else {
+         ld = new_Instruction(func, OP_MERGE, ty);
+         for (int d = 0; d < lval->reg.size / 4; ++d) {
+            Value *tmp = cloneShallow(func, slot);
+            LValue *val;
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+            l->setSrc(0, tmp);
+            usei->bb->insertBefore(usei, l);
+            ld->setSrc(d, val);
+            val->noSpill = 1;
+         }
+         ld->setDef(0, lval);
+         usei->bb->insertBefore(usei, ld);
+         return lval;
+      }
    } else {
       ld = new_Instruction(func, OP_CVT, ty);
    }
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index f3ddcaa51..76a76545c 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -454,7 +454,7 @@ TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
          return false;
       }
    }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
       return false;
    return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
diff --git a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 27df0eba6..3b4e8025b 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/lib/mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -426,7 +426,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
          return false;
       }
    }
-   if (s >= 3)
+   if (s >= opInfo[insn->op].srcNr || s >= 3)
       return false;
    return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
 }
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c b/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c
index 72e070b5f..371e1ec76 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -656,8 +656,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
    if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
                              PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
       buffer->domain = NOUVEAU_BO_GART;
-   } else if (buffer->base.bind &
-              (screen->vidmem_bindings & screen->sysmem_bindings)) {
+   } else if (buffer->base.bind == 0 || (buffer->base.bind &
+              (screen->vidmem_bindings & screen->sysmem_bindings))) {
       switch (buffer->base.usage) {
       case PIPE_USAGE_DEFAULT:
       case PIPE_USAGE_IMMUTABLE:
@@ -684,6 +684,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
       if (buffer->base.bind & screen->sysmem_bindings)
          buffer->domain = NOUVEAU_BO_GART;
    }
+   /* There can be very special situations where we want non-gpu-mapped
+    * buffers, but never through this interface.
+    */
+   assert(buffer->domain);
    ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);
 
    if (ret == false)
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 152c2ce13..2710b7524 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -159,9 +159,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
                                  int ref)
 {
    struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
    unsigned s, i;
 
-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
       assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
       for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
          if (nv50->framebuffer.cbufs[i] &&
@@ -173,7 +174,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
          }
       }
    }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nv50->framebuffer.zsbuf &&
           nv50->framebuffer.zsbuf->texture == res) {
          nv50->dirty |= NV50_NEW_FRAMEBUFFER;
@@ -183,11 +184,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       }
    }
 
-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_SAMPLER_VIEW)) {
 
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 410e6311e..335d95259 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -960,6 +960,9 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
    unsigned i;
 
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+   nv50->dirty |= NV50_NEW_ARRAYS;
+
    util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                  start_slot, count);
 
@@ -983,10 +986,6 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
          nv50->vbo_constant &= ~(1 << dst_index);
       }
    }
-
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-
-   nv50->dirty |= NV50_NEW_ARRAYS;
 }
 
 static void
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 7a15a11f5..a0b9a54d9 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
                                  int ref)
 {
    struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
    unsigned s, i;
 
-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
       for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
          if (nvc0->framebuffer.cbufs[i] &&
              nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
          }
       }
    }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nvc0->framebuffer.zsbuf &&
           nvc0->framebuffer.zsbuf->texture == res) {
          nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
       }
    }
 
-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_COMMAND_ARGS_BUFFER |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_COMMAND_ARGS_BUFFER |
+               PIPE_BIND_SAMPLER_VIEW)) {
       for (i = 0; i < nvc0->num_vtxbufs; ++i) {
          if (nvc0->vtxbuf[i].buffer == res) {
             nvc0->dirty |= NVC0_NEW_ARRAYS;
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 191e3b727..139a5039b 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -417,6 +417,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
    if (screen->pm.prog) {
       screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
       nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
    }
 
    nouveau_bo_ref(NULL, &screen->text);
diff --git a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index c5bfd0395..310e30f52 100644
--- a/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/lib/mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -998,6 +998,9 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
     struct nvc0_context *nvc0 = nvc0_context(pipe);
     unsigned i;
 
+    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+    nvc0->dirty |= NVC0_NEW_ARRAYS;
+
     util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb,
                                   start_slot, count);
 
@@ -1021,9 +1024,6 @@ nvc0_set_vertex_buffers(struct pipe_context *pipe,
           nvc0->constant_vbos &= ~(1 << dst_index);
        }
     }
-
-    nvc0->dirty |= NVC0_NEW_ARRAYS;
-    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
 }
 
 static void
diff --git a/lib/mesa/src/gallium/drivers/r600/evergreen_state.c b/lib/mesa/src/gallium/drivers/r600/evergreen_state.c
index 2dc381178..1976d873f 100644
--- a/lib/mesa/src/gallium/drivers/r600/evergreen_state.c
+++ b/lib/mesa/src/gallium/drivers/r600/evergreen_state.c
@@ -1527,12 +1527,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 				     S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) |
 				     S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else {
 		r600_write_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
 
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_pipe.h b/lib/mesa/src/gallium/drivers/r600/r600_pipe.h
index bb91f8308..896e3f71e 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_pipe.h
+++ b/lib/mesa/src/gallium/drivers/r600/r600_pipe.h
@@ -57,7 +57,7 @@
 
 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
-#define R600_MAX_DRAW_CS_DWORDS		47
+#define R600_MAX_DRAW_CS_DWORDS		52
 #define R600_TRACE_CS_DWORDS		7
 
 #define R600_MAX_USER_CONST_BUFFERS 13
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_shader.c b/lib/mesa/src/gallium/drivers/r600/r600_shader.c
index 819f9a798..911e81fed 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_shader.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_shader.c
@@ -598,6 +598,106 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back
 	return 0;
 }
 
+/* emit one two-source ALU op; on Cayman, MULLO_INT is issued in all four slots with only dst_chan written */
+static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r, i;
+
+	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
+		for (i = 0; i < 4; i++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = op;
+			alu.src[0].sel = src0_sel;
+			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[0].value = src0_chan_val;
+			else
+				alu.src[0].chan = src0_chan_val;
+			alu.src[1].sel = src1_sel;
+			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+				alu.src[1].value = src1_chan_val;
+			else
+				alu.src[1].chan = src1_chan_val;
+			alu.dst.sel = dst_sel;
+			alu.dst.chan = i;
+			alu.dst.write = i == dst_chan;
+			alu.last = (i == 3);
+			r = r600_bytecode_add_alu(ctx->bc, &alu);
+			if (r)
+				return r;
+		}
+		return 0;
+	}
+
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+/* emit a single three-source (is_op3) ALU instruction with 'last' set */
+static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
+			  int dst_sel, int dst_chan,
+			  int src0_sel, unsigned src0_chan_val,
+			  int src1_sel, unsigned src1_chan_val,
+			  int src2_sel, unsigned src2_chan_val)
+{
+	struct r600_bytecode_alu alu;
+	int r;
+
+	/* only MULADD_UINT24 is accepted for now; validate any other op3 before allowing it */
+	assert(op == ALU_OP3_MULADD_UINT24);
+	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+	alu.op = op;
+	alu.src[0].sel = src0_sel;
+	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[0].value = src0_chan_val;
+	else
+		alu.src[0].chan = src0_chan_val;
+	alu.src[1].sel = src1_sel;
+	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[1].value = src1_chan_val;
+	else
+		alu.src[1].chan = src1_chan_val;
+	alu.src[2].sel = src2_sel;
+	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
+		alu.src[2].value = src2_chan_val;
+	else
+		alu.src[2].chan = src2_chan_val;
+	alu.dst.sel = dst_sel;
+	alu.dst.chan = dst_chan;
+	alu.is_op3 = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx->bc, &alu);
+	if (r)
+		return r;
+	return 0;
+}
+
+static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
+{
+	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
+}
+
 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
 {
 	int i;
@@ -1129,6 +1229,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	unsigned vtx_id = src->Dimension.Index;
 	int offset_reg = vtx_id / 3;
 	int offset_chan = vtx_id % 3;
+	int t2 = 0;
 
 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
@@ -1136,13 +1237,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 	if (offset_reg == 0 && offset_chan == 2)
 		offset_chan = 3;
 
+	if (src->Dimension.Indirect || src->Register.Indirect)
+		t2 = r600_get_temp(ctx);
+
 	if (src->Dimension.Indirect) {
 		int treg[3];
-		int t2;
 		struct r600_bytecode_alu alu;
 		int r, i;
-
-		/* you have got to be shitting me -
+		unsigned addr_reg;
+		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
+		if (src->DimIndirect.Index > 0) {
+			r = single_alu_op2(ctx, ALU_OP1_MOV,
+					   ctx->bc->ar_reg, 0,
+					   addr_reg, 0,
+					   0, 0);
+			if (r)
+				return r;
+		}
+		/*
 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
 		   at least this is what fglrx seems to do. */
 		for (i = 0; i < 3; i++) {
@@ -1150,7 +1262,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		}
 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
 
-		t2 = r600_get_temp(ctx);
 		for (i = 0; i < 3; i++) {
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
@@ -1175,8 +1286,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi
 		if (r)
 			return r;
 		offset_reg = t2;
+		offset_chan = 0;
 	}
 
+	if (src->Register.Indirect) {
+		int addr_reg;
+		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
+
+		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
+
+		/* pull the value from index_reg */
+		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+				   t2, 1,
+				   addr_reg, 0,
+				   V_SQ_ALU_SRC_LITERAL, first);
+		if (r)
+			return r;
+		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+				   t2, 0,
+				   t2, 1,
+				   V_SQ_ALU_SRC_LITERAL, 4,
+				   offset_reg, offset_chan);
+		if (r)
+			return r;
+		offset_reg = t2;
+		offset_chan = 0;
+		index = src->Register.Index - first;
+	}
 
 	memset(&vtx, 0, sizeof(vtx));
 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
@@ -1222,6 +1358,7 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
 
 			fetch_gs_input(ctx, src, treg);
 			ctx->src[i].sel = treg;
+			ctx->src[i].rel = 0;
 		}
 	}
 	return 0;
@@ -1972,7 +2109,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 
 	ctx.nliterals = 0;
 	ctx.literals = NULL;
-	shader->fs_write_all = FALSE;
+
+	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+			       ctx.info.colors_written == 1;
 
 	if (shader->vs_as_gs_a)
 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
@@ -2003,10 +2142,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		case TGSI_TOKEN_TYPE_PROPERTY:
 			property = &ctx.parse.FullToken.FullProperty;
 			switch (property->Property.PropertyName) {
-			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-				if (property->u[0].Data == 1)
-					shader->fs_write_all = TRUE;
-				break;
 			case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
 				if (property->u[0].Data == 1)
 					shader->vs_position_window_space = TRUE;
@@ -2159,6 +2294,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			struct r600_bytecode_alu alu;
 			int r;
 
+			/* GS thread with no output workaround - emit a cut at start of GS */
+			if (ctx.bc->chip_class == R600)
+				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
+
 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 			alu.op = ALU_OP1_MOV;
 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
@@ -6671,7 +6810,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_alu alu;
 	int r;
 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
-	unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;
+	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
 
 	assert(inst->Dst[0].Register.Index < 3);
 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_state.c b/lib/mesa/src/gallium/drivers/r600/r600_state.c
index 2c727815d..a588e16e5 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_state.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_state.c
@@ -2181,10 +2181,11 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 		num_temp_gprs = 4;
 		num_gs_gprs = 0;
 		num_es_gprs = 0;
-		num_ps_threads = 136;
-		num_vs_threads = 48;
-		num_gs_threads = 4;
-		num_es_threads = 4;
+		/* use limits 40 VS and at least 16 ES/GS */
+		num_ps_threads = 120;
+		num_vs_threads = 40;
+		num_gs_threads = 16;
+		num_es_threads = 16;
 		num_ps_stack_entries = 40;
 		num_vs_stack_entries = 40;
 		num_gs_stack_entries = 32;
@@ -2643,6 +2644,9 @@ void r600_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 		S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport);
 }
 
+#define RV610_GSVS_ALIGN 32
+#define R600_GSVS_ALIGN 16
+
 void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *shader)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
@@ -2652,6 +2656,23 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 	unsigned gsvs_itemsize =
 			(cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2;
 
+	/* Some R600-family chips need the GSVS item size aligned to the
+	   cache line size; this was fixed in RS780 and above. */
+	switch (rctx->b.family) {
+	case CHIP_RV610:
+		gsvs_itemsize = align(gsvs_itemsize, RV610_GSVS_ALIGN);
+		break;
+	case CHIP_R600:
+	case CHIP_RV630:
+	case CHIP_RV670:
+	case CHIP_RV620:
+	case CHIP_RV635:
+		gsvs_itemsize = align(gsvs_itemsize, R600_GSVS_ALIGN);
+		break;
+	default:
+		break;
+	}
+
 	r600_init_command_buffer(cb, 64);
 
 	/* VGT_GS_MODE is written by r600_emit_shader_stages */
diff --git a/lib/mesa/src/gallium/drivers/r600/r600_state_common.c b/lib/mesa/src/gallium/drivers/r600/r600_state_common.c
index bdd9337ec..2eebb5792 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600_state_common.c
+++ b/lib/mesa/src/gallium/drivers/r600/r600_state_common.c
@@ -1691,6 +1691,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 					(info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
 	}
 
+	/* SMX returns CONTEXT_DONE too early workaround */
+	if (rctx->b.family == CHIP_R600 ||
+	    rctx->b.family == CHIP_RV610 ||
+	    rctx->b.family == CHIP_RV630 ||
+	    rctx->b.family == CHIP_RV635) {
+		/* if we have gs shader or streamout
+		   we need to do a wait idle after every draw */
+		if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+			r600_write_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
+		}
+	}
+
+	/* ES ring rolling over at EOP - workaround */
+	if (rctx->b.chip_class == R600) {
+		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
+	}
+
 	if (rctx->screen->b.trace_bo) {
 		r600_trace_emit(rctx);
 	}
diff --git a/lib/mesa/src/gallium/drivers/r600/r600d.h b/lib/mesa/src/gallium/drivers/r600/r600d.h
index bce8b4ea0..4b44546b6 100644
--- a/lib/mesa/src/gallium/drivers/r600/r600d.h
+++ b/lib/mesa/src/gallium/drivers/r600/r600d.h
@@ -130,6 +130,7 @@
 #define EVENT_TYPE_SAMPLE_STREAMOUTSTATS	0x20
 #define EVENT_TYPE_FLUSH_AND_INV_DB_META       0x2c /* supported on r700+ */
 #define EVENT_TYPE_VGT_FLUSH                   0x24
+#define EVENT_TYPE_SQ_NON_EVENT                0x26
 #define EVENT_TYPE_FLUSH_AND_INV_CB_META	46 /* supported on r700+ */
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
diff --git a/lib/mesa/src/gallium/drivers/radeon/Makefile.am b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
index 13d8976de..a6fc145cb 100644
--- a/lib/mesa/src/gallium/drivers/radeon/Makefile.am
+++ b/lib/mesa/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM
 
 AM_CFLAGS += \
-	$(LLVM_CFLAGS)
+	$(LLVM_CFLAGS) \
+	$(LIBELF_CFLAGS)
 
 libradeon_la_SOURCES += \
 	$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
 	$(CLOCK_LIB) \
 	$(LLVM_LIBS) \
-	$(ELF_LIB)
+	$(LIBELF_LIBS)
 
 libradeon_la_LDFLAGS = \
 	$(LLVM_LDFLAGS)
diff --git a/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c b/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c
index 12a5f6047..60cf2db5d 100644
--- a/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/lib/mesa/src/gallium/drivers/radeon/cayman_msaa.c
@@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
-					     EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+					       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		} else if (overrast_samples > 1) {
 			r600_write_context_reg(cs, CM_R_028804_DB_EQAA,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+			r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		}
 	} else {
 		r600_write_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
@@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 		r600_write_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		r600_write_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
index 495fda0a8..62f7647d3 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/lib/mesa/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -226,8 +226,8 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	rctx->family = rscreen->family;
 	rctx->chip_class = rscreen->chip_class;
 
-	if (rscreen->family == CHIP_HAWAII)
-		rctx->max_db = 16;
+	if (rscreen->chip_class >= CIK)
+		rctx->max_db = MAX2(8, rscreen->info.r600_num_backends);
 	else if (rscreen->chip_class >= EVERGREEN)
 		rctx->max_db = 8;
 	else
@@ -543,10 +543,11 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_TONGA: return "tonga";
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
-	case CHIP_FIJI: return "fiji";
 #if HAVE_LLVM <= 0x0307
+	case CHIP_FIJI: return "tonga";
 	case CHIP_STONEY: return "carrizo";
 #else
+	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
 #endif
 	default: return "";
diff --git a/lib/mesa/src/gallium/drivers/radeon/r600d_common.h b/lib/mesa/src/gallium/drivers/radeon/r600d_common.h
index 115042d15..7512c7232 100644
--- a/lib/mesa/src/gallium/drivers/radeon/r600d_common.h
+++ b/lib/mesa/src/gallium/drivers/radeon/r600d_common.h
@@ -168,6 +168,8 @@
 
 #define EG_R_028A4C_PA_SC_MODE_CNTL_1                0x028A4C
 #define   EG_S_028A4C_PS_ITER_SAMPLE(x)                 (((x) & 0x1) << 16)
+#define   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)        (((x) & 0x1) << 25)
+#define   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x)           (((x) & 0x1) << 26)
 
 #define CM_R_028804_DB_EQAA                          0x00028804
 #define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
index 55c216aa5..3a5d9f444 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.c
@@ -951,6 +951,8 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.db_pitch = dec->base.width;
 
 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
+	if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
+		dec->msg->body.decode.dt_wa_chroma_top_offset = dec->msg->body.decode.dt_pitch / 2;
 
 	switch (u_reduce_video_profile(picture->profile)) {
 	case PIPE_VIDEO_FORMAT_MPEG4_AVC:
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
index 452fbd608..756f69828 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_uvd.h
@@ -385,7 +385,10 @@ struct ruvd_msg {
 			uint32_t	dt_chroma_top_offset;
 			uint32_t	dt_chroma_bottom_offset;
 			uint32_t	dt_surf_tile_config;
-			uint32_t	dt_reserved[3];
+			uint32_t	dt_uv_surf_tile_config;
+			// re-use dt_wa_chroma_top_offset as dt_ext_info for UV pitch in stoney
+			uint32_t	dt_wa_chroma_top_offset;
+			uint32_t	dt_wa_chroma_bottom_offset;
 
 			uint32_t	reserved[16];
 
diff --git a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
index 7eab974a3..bb1f9de50 100644
--- a/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
+++ b/lib/mesa/src/gallium/drivers/radeon/radeon_vce.c
@@ -388,6 +388,11 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 	struct radeon_surf *tmp_surf;
 	unsigned cpb_size;
 
+	if (rscreen->info.family == CHIP_STONEY) {
+		RVID_ERR("Stoney VCE is not supported!\n");
+		return NULL;
+	}
+
 	if (!rscreen->info.vce_fw_version) {
 		RVID_ERR("Kernel doesn't supports VCE!\n");
 		return NULL;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c
index d4fe56536..748bf5acb 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_compute.c
@@ -33,14 +33,6 @@
 #include "sid.h"
 
 #define MAX_GLOBAL_BUFFERS 20
-#if HAVE_LLVM < 0x0305
-#define NUM_USER_SGPRS 2
-#else
-/* XXX: Even though we don't pass the scratch buffer via user sgprs any more
- * LLVM still expects that we specify 4 USER_SGPRS so it can remain compatible
- * with older mesa. */
-#define NUM_USER_SGPRS 4
-#endif
 
 struct si_compute {
 	struct si_context *ctx;
@@ -241,7 +233,6 @@ static void si_launch_grid(
 	uint64_t kernel_args_va;
 	uint64_t scratch_buffer_va = 0;
 	uint64_t shader_va;
-	unsigned arg_user_sgpr_count = NUM_USER_SGPRS;
 	unsigned i;
 	struct si_shader *shader = &program->shader;
 	unsigned lds_blocks;
@@ -365,20 +356,7 @@ static void si_launch_grid(
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
 
-	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
-		/* We always use at least 3 VGPRS, these come from
-		 * TIDIG_COMP_CNT.
-		 * XXX: The compiler should account for this.
-		 */
-		S_00B848_VGPRS((MAX2(3, shader->num_vgprs) - 1) / 4)
-		/* We always use at least 4 + arg_user_sgpr_count.  The 4 extra
-		 * sgprs are from TGID_X_EN, TGID_Y_EN, TGID_Z_EN, TG_SIZE_EN
-		 * XXX: The compiler should account for this.
-		 */
-		|  S_00B848_SGPRS(((MAX2(4 + arg_user_sgpr_count,
-		                        shader->num_sgprs)) - 1) / 8)
-		|  S_00B028_FLOAT_MODE(shader->float_mode))
-		;
+	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, shader->rsrc1);
 
 	lds_blocks = shader->lds_size;
 	/* XXX: We are over allocating LDS.  For SI, the shader reports LDS in
@@ -394,17 +372,10 @@ static void si_launch_grid(
 
 	assert(lds_blocks <= 0xFF);
 
-	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2,
-		S_00B84C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)
-		| S_00B84C_USER_SGPR(arg_user_sgpr_count)
-		| S_00B84C_TGID_X_EN(1)
-		| S_00B84C_TGID_Y_EN(1)
-		| S_00B84C_TGID_Z_EN(1)
-		| S_00B84C_TG_SIZE_EN(1)
-		| S_00B84C_TIDIG_COMP_CNT(2)
-		| S_00B84C_LDS_SIZE(lds_blocks)
-		| S_00B84C_EXCP_EN(0))
-		;
+	shader->rsrc2 &= C_00B84C_LDS_SIZE;
+	shader->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
+
+	si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, shader->rsrc2);
 	si_pm4_set_reg(pm4, R_00B854_COMPUTE_RESOURCE_LIMITS, 0);
 
 	si_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0,
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c
index ef986bd2e..2f9e00975 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.c
@@ -637,6 +637,14 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 			    lp_build_const_int32(gallivm, swizzle));
 
 	value = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+	if (type == TGSI_TYPE_DOUBLE) {
+		LLVMValueRef value2;
+		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
+				       lp_build_const_int32(gallivm, swizzle + 1));
+		value2 = build_indexed_load(si_shader_ctx, si_shader_ctx->lds, dw_addr);
+		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
+	}
+
 	return LLVMBuildBitCast(gallivm->builder, value,
 				tgsi2llvmtype(bld_base, type), "");
 }
@@ -3752,12 +3760,14 @@ void si_shader_binary_read_config(const struct si_screen *sscreen,
 			shader->num_sgprs = MAX2(shader->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
 			shader->num_vgprs = MAX2(shader->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 			shader->float_mode =  G_00B028_FLOAT_MODE(value);
+			shader->rsrc1 = value;
 			break;
 		case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
 			shader->lds_size = MAX2(shader->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
 			break;
 		case R_00B84C_COMPUTE_PGM_RSRC2:
 			shader->lds_size = MAX2(shader->lds_size, G_00B84C_LDS_SIZE(value));
+			shader->rsrc2 = value;
 			break;
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h
index 511ee7333..5f4944a96 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_shader.h
@@ -268,8 +268,8 @@ struct si_shader {
 	bool			is_gs_copy_shader;
 	bool			dx10_clamp_mode; /* convert NaNs to 0 */
 
-	unsigned		ls_rsrc1;
-	unsigned		ls_rsrc2;
+	unsigned		rsrc1;
+	unsigned		rsrc2;
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c
index 81e138233..af96d8680 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state.c
@@ -2979,6 +2979,41 @@ static void si_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
 	si_need_cs_space((struct si_context*)ctx, num_dw, include_draw_vbo);
 }
 
+/**
+ * Allocate a 64 KiB (4096 * 16 bytes) buffer and program its GPU address
+ * into TA_BC_BASE_ADDR (plus TA_BC_BASE_ADDR_HI on CIK and newer) through
+ * the ta_bordercolor_base pm4 state.
+ *
+ * The buffer is stored in sctx->scratch_buffer, which is expected to still
+ * be NULL. If either allocation fails, no state is set.
+ */
+static void si_init_border_color_buffer(struct si_context *sctx)
+{
+	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (!pm4)
+		return;
+
+	assert(sctx->scratch_buffer == NULL);
+	r600_resource_reference(&sctx->scratch_buffer, NULL);
+	sctx->scratch_buffer = si_resource_create_custom(&sctx->screen->b.b,
+							 PIPE_USAGE_DEFAULT,
+							 4096 * 16);
+	if (!sctx->scratch_buffer) {
+		/* Don't dereference a failed allocation, and don't leak pm4. */
+		FREE(pm4);
+		return;
+	}
+
+	uint64_t va = sctx->scratch_buffer->gpu_address;
+
+	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, va >> 8);
+	if (sctx->b.chip_class >= CIK)
+		si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, va >> 40);
+	si_pm4_add_bo(pm4, sctx->scratch_buffer, RADEON_USAGE_READ,
+		      RADEON_PRIO_SHADER_DATA);
+	si_pm4_set_state(sctx, ta_bordercolor_base, pm4);
+}
+
 static void si_init_config(struct si_context *sctx);
 
 void si_init_state_functions(struct si_context *sctx)
@@ -3045,6 +3067,7 @@ void si_init_state_functions(struct si_context *sctx)
 	}
 
 	si_init_config(sctx);
+	si_init_border_color_buffer(sctx);
 }
 
 static void
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c
index e0394cf00..a15981de3 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	lds_size = output_patch0_offset + output_patch_size * *num_patches;
-	ls_rsrc2 = ls->current->ls_rsrc2;
+	ls_rsrc2 = ls->current->rsrc2;
 
 	if (sctx->b.chip_class >= CIK) {
 		assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
 		si_write_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
 	si_write_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-	radeon_emit(cs, ls->current->ls_rsrc1);
+	radeon_emit(cs, ls->current->rsrc1);
 	radeon_emit(cs, ls_rsrc2);
 
 	/* Compute userdata SGPRs. */
@@ -216,6 +216,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
 }
 
+/* Derive a primitive count from info->count for the draw's mode.
+ * Patches and rectangle lists are computed here; every other mode is
+ * forwarded to u_prims_for_vertices().
+ */
+static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
+{
+	if (info->mode == PIPE_PRIM_PATCHES)
+		return info->count / info->vertices_per_patch;
+
+	if (info->mode == R600_PRIM_RECTANGLE_LIST)
+		return info->count / 3;
+
+	return u_prims_for_vertices(info->mode, info->count);
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 					  const struct pipe_draw_info *info,
 					  unsigned num_patches)
@@ -305,7 +317,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
 	    (info->indirect ||
 	     (info->instance_count > 1 &&
-	      u_prims_for_vertices(info->mode, info->count) <= 1)))
+	      si_num_prims_for_vertices(info) <= 1)))
 		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 
 	/* Instancing bug on 2 SE chips. */
@@ -849,7 +861,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-	if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
+	if ((sctx->b.family == CHIP_HAWAII ||
+	     sctx->b.family == CHIP_TONGA ||
+	     sctx->b.family == CHIP_FIJI) &&
 	    (sctx->b.streamout.streamout_enabled ||
 	     sctx->b.streamout.prims_gen_query_enabled)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
diff --git a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
index f9b38ed53..1ba9c8595 100644
--- a/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/lib/mesa/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -119,10 +119,10 @@ static void si_shader_ls(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40);
 
-	shader->ls_rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
+	shader->rsrc1 = S_00B528_VGPRS((shader->num_vgprs - 1) / 4) |
 			   S_00B528_SGPRS((num_sgprs - 1) / 8) |
 		           S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt);
-	shader->ls_rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
+	shader->rsrc2 = S_00B52C_USER_SGPR(num_user_sgprs) |
 			   S_00B52C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0);
 }