5 files changed, 550 insertions, 979 deletions
diff --git a/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h b/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h index f26bb2ffc..6b1e9f7e8 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h +++ b/lib/mesa/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h @@ -1,3 +1,5 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + /* * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org> * diff --git a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c index af9811864..2b62b3ae2 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c +++ b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c @@ -35,13 +35,19 @@ #define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__) #define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__) +#define REG_MASK 0x3f + +static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr); + static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, uint32_t idx, struct ir2_shader_info *info); -static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n); -static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg); -static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg); -static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); +static void reg_update_stats(struct ir2_register *reg, + struct ir2_shader_info *info, bool dest); +static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n); +static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg); +static uint32_t reg_alu_dst_swiz(struct ir2_register *reg); +static uint32_t reg_alu_src_swiz(struct ir2_register *reg); /* simple allocator to carve allocations out of an up-front allocated heap, * so that we can free everything easily in one shot. @@ -49,7 +55,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg); static void * ir2_alloc(struct ir2_shader *shader, int sz) { void *ptr = &shader->heap[shader->heap_idx]; - shader->heap_idx += align(sz, 4) / 4; + shader->heap_idx += align(sz, 4); return ptr; } @@ -68,9 +74,7 @@ static char * ir2_strdup(struct ir2_shader *shader, const char *str) struct ir2_shader * ir2_shader_create(void) { DEBUG_MSG(""); - struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader)); - shader->max_reg = -1; - return shader; + return calloc(1, sizeof(struct ir2_shader)); } void ir2_shader_destroy(struct ir2_shader *shader) @@ -79,344 +83,189 @@ void ir2_shader_destroy(struct ir2_shader *shader) free(shader); } -/* check if an instruction is a simple MOV - */ -static struct ir2_instruction * simple_mov(struct ir2_instruction *instr, - bool output) +/* resolve addr/cnt/sequence fields in the individual CF's */ +static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info *info) { - struct ir2_src_register *src_reg = instr->src_reg; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - struct ir2_register *reg; - unsigned i; - - /* MAXv used for MOV */ - if (instr->instr_type != IR2_ALU_VECTOR || - instr->alu_vector.opc != MAXv) - return NULL; - - /* non identical srcs */ - if (src_reg[0].num != src_reg[1].num) - return NULL; - - /* flags */ - int flags = IR2_REG_NEGATE | IR2_REG_ABS; - if (output) - flags |= IR2_REG_INPUT | IR2_REG_CONST; - if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags)) - return NULL; - - /* clamping */ - if (instr->alu_vector.clamp) - return NULL; - - /* swizzling */ - for (i = 0; i < 4; i++) { - char swiz = (dst_reg->swizzle ? 
dst_reg->swizzle : "xyzw")[i]; - if (swiz == '_') - continue; - - if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] || - swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i]) - return NULL; - } - - if (output) - reg = &instr->shader->reg[src_reg[0].num]; - else - reg = &instr->shader->reg[dst_reg->num]; - - assert(reg->write_idx >= 0); - if (reg->write_idx != reg->write_idx2) - return NULL; - - if (!output) - return instr; - - instr = instr->shader->instr[reg->write_idx]; - return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr; -} + uint32_t addr; + unsigned i; + int j; + + addr = shader->cfs_count / 2; + for (i = 0; i < shader->cfs_count; i++) { + struct ir2_cf *cf = shader->cfs[i]; + if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { + uint32_t sequence = 0; + + if (cf->exec.addr && (cf->exec.addr != addr)) + WARN_MSG("invalid addr '%d' at CF %d", cf->exec.addr, i); + if (cf->exec.cnt && (cf->exec.cnt != cf->exec.instrs_count)) + WARN_MSG("invalid cnt '%d' at CF %d", cf->exec.cnt, i); + + for (j = cf->exec.instrs_count - 1; j >= 0; j--) { + struct ir2_instruction *instr = cf->exec.instrs[j]; + sequence <<= 2; + if (instr->instr_type == IR2_FETCH) + sequence |= 0x1; + if (instr->sync) + sequence |= 0x2; + } -static int src_to_reg(struct ir2_instruction *instr, - struct ir2_src_register *reg) -{ - if (reg->flags & IR2_REG_CONST) - return reg->num; + cf->exec.addr = addr; + cf->exec.cnt = cf->exec.instrs_count; + cf->exec.sequence = sequence; - return instr->shader->reg[reg->num].reg; -} - -static int dst_to_reg(struct ir2_instruction *instr, - struct ir2_dst_register *reg) -{ - if (reg->flags & IR2_REG_EXPORT) - return reg->num; + addr += cf->exec.instrs_count; + } + } - return instr->shader->reg[reg->num].reg; -} + info->sizedwords = 3 * addr; -static bool mask_get(uint32_t *mask, unsigned index) -{ - return !!(mask[index / 32] & 1 << index % 32); + return 0; } -static void mask_set(uint32_t *mask, struct ir2_register *reg, int index) +void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info) { - if (reg) { - unsigned i; - for (i = 0; i < ARRAY_SIZE(reg->regmask); i++) - mask[i] |= reg->regmask[i]; + uint32_t i, j; + uint32_t *ptr, *dwords = NULL; + uint32_t idx = 0; + int ret; + + info->sizedwords = 0; + info->max_reg = -1; + info->max_input_reg = 0; + info->regs_written = 0; + + /* we need an even # of CF's.. insert a NOP if needed */ + if (shader->cfs_count != align(shader->cfs_count, 2)) + ir2_cf_create(shader, NOP); + + /* first pass, resolve sizes and addresses: */ + ret = shader_resolve(shader, info); + if (ret) { + ERROR_MSG("resolve failed: %d", ret); + goto fail; } - if (index >= 0) - mask[index / 32] |= 1 << index % 32; -} -static bool sets_pred(struct ir2_instruction *instr) -{ - return instr->instr_type == IR2_ALU_SCALAR && - instr->alu_scalar.opc >= PRED_SETEs && - instr->alu_scalar.opc <= PRED_SET_RESTOREs; -} + ptr = dwords = calloc(4, info->sizedwords); - - -void* ir2_shader_assemble(struct ir2_shader *shader, - struct ir2_shader_info *info) -{ - /* NOTES - * blob compiler seems to always puts PRED_* instrs in a CF by - * themselves, and wont combine EQ/NE in the same CF - * (not doing this - doesn't seem to make a difference) - * - * TODO: implement scheduling for combining vector+scalar instructions - * -some vector instructions can be replaced by scalar - */ - - /* first step: - * 1. remove "NOP" MOV instructions generated by TGSI for input/output: - * 2. 
track information for register allocation, and to remove - * the dead code when some exports are not needed - * 3. add additional instructions for a20x hw binning if needed - * NOTE: modifies the shader instrs - * this step could be done as instructions are added by compiler instead - */ - - /* mask of exports that must be generated - * used to avoid calculating ps exports with hw binning - */ - uint64_t export = ~0ull; - /* bitmask of variables required for exports defined by "export" */ - uint32_t export_mask[REG_MASK/32+1] = {}; - - unsigned idx, reg_idx; - unsigned max_input = 0; - int export_size = -1; - - for (idx = 0; idx < shader->instr_count; idx++) { - struct ir2_instruction *instr = shader->instr[idx], *prev; - struct ir2_dst_register dst_reg = instr->dst_reg; - - if (dst_reg.flags & IR2_REG_EXPORT) { - if (dst_reg.num < 32) - export_size++; - - if ((prev = simple_mov(instr, true))) { - /* copy instruction but keep dst */ - *instr = *prev; - instr->dst_reg = dst_reg; - } + /* second pass, emit CF program in pairs: */ + for (i = 0; i < shader->cfs_count; i += 2) { + instr_cf_t *cfs = (instr_cf_t *)ptr; + ret = cf_emit(shader->cfs[i], &cfs[0]); + if (ret) { + ERROR_MSG("CF emit failed: %d\n", ret); + goto fail; } - - for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) { - struct ir2_src_register *src_reg = &instr->src_reg[reg_idx]; - struct ir2_register *reg; - int num; - - if (src_reg->flags & IR2_REG_CONST) - continue; - - num = src_reg->num; - reg = &shader->reg[num]; - reg->read_idx = idx; - - if (src_reg->flags & IR2_REG_INPUT) { - max_input = MAX2(max_input, num); - } else { - /* bypass simple mov used to set src_reg */ - assert(reg->write_idx >= 0); - prev = shader->instr[reg->write_idx]; - if (simple_mov(prev, false)) { - *src_reg = prev->src_reg[0]; - /* process same src_reg again */ - reg_idx -= 1; - continue; - } - } - - /* update dependencies */ - uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ? 
- export_mask : shader->reg[dst_reg.num].regmask; - mask_set(mask, reg, num); - if (sets_pred(instr)) - mask_set(export_mask, reg, num); + ret = cf_emit(shader->cfs[i+1], &cfs[1]); + if (ret) { + ERROR_MSG("CF emit failed: %d\n", ret); + goto fail; } + ptr += 3; + assert((ptr - dwords) <= info->sizedwords); } - /* second step: - * emit instructions (with CFs) + RA - */ - instr_cf_t cfs[128], *cf = cfs; - uint32_t alufetch[3*256], *af = alufetch; - - /* RA is done on write, so inputs must be allocated here */ - for (reg_idx = 0; reg_idx <= max_input; reg_idx++) - shader->reg[reg_idx].reg = reg_idx; - info->max_reg = max_input; - - /* CF instr state */ - instr_cf_exec_t exec = { .opc = EXEC }; - instr_cf_alloc_t alloc = { .opc = ALLOC }; - bool need_alloc = 0; - bool pos_export = 0; - - export_size = MAX2(export_size, 0); - - for (idx = 0; idx < shader->instr_count; idx++) { - struct ir2_instruction *instr = shader->instr[idx]; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - unsigned num = dst_reg->num; - struct ir2_register *reg; - - /* a2xx only has 64 registers, so we can use a single 64-bit mask */ - uint64_t regmask = 0ull; - - /* compute the current regmask */ - for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) { - reg = &shader->reg[reg_idx]; - if ((int) idx > reg->write_idx && idx < reg->read_idx) - regmask |= (1ull << reg->reg); - } - - if (dst_reg->flags & IR2_REG_EXPORT) { - /* skip if export is not needed */ - if (!(export & (1ull << num))) - continue; - - /* ALLOC CF: - * want to alloc all < 32 at once - * 32/33 and 62/63 come in pairs - * XXX assuming all 3 types are never interleaved - */ - if (num < 32) { - alloc.size = export_size; - alloc.buffer_select = SQ_PARAMETER_PIXEL; - need_alloc = export_size >= 0; - export_size = -1; - } else if (num == 32 || num == 33) { - alloc.size = 0; - alloc.buffer_select = SQ_MEMORY; - need_alloc = num != 33; - } else { - alloc.size = 0; - alloc.buffer_select = SQ_POSITION; - need_alloc = !pos_export; - pos_export = true; - } - - } else { - /* skip if dst register not needed to compute exports */ - if (!mask_get(export_mask, num)) - continue; - - /* RA on first write */ - reg = &shader->reg[num]; - if (reg->write_idx == idx) { - reg->reg = ffsll(~regmask) - 1; - info->max_reg = MAX2(info->max_reg, reg->reg); + /* third pass, emit ALU/FETCH: */ + for (i = 0; i < shader->cfs_count; i++) { + struct ir2_cf *cf = shader->cfs[i]; + if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) { + for (j = 0; j < cf->exec.instrs_count; j++) { + ret = instr_emit(cf->exec.instrs[j], ptr, idx++, info); + if (ret) { + ERROR_MSG("instruction emit failed: %d", ret); + goto fail; + } + ptr += 3; + assert((ptr - dwords) <= info->sizedwords); } } - - if (exec.count == 6 || (exec.count && need_alloc)) { - *cf++ = *(instr_cf_t*) &exec; - exec.address += exec.count; - exec.serialize = 0; - exec.count = 0; - } - - if (need_alloc) { - *cf++ = *(instr_cf_t*) &alloc; - need_alloc = false; - } - - int ret = instr_emit(instr, af, idx, info); af += 3; - assert(!ret); - - if (instr->instr_type == IR2_FETCH) - exec.serialize |= 0x1 << exec.count * 2; - if (instr->sync) - exec.serialize |= 0x2 << exec.count * 2; - exec.count += 1; } + return dwords; - exec.opc = !export_size ? 
EXEC : EXEC_END; - *cf++ = *(instr_cf_t*) &exec; - exec.address += exec.count; - exec.serialize = 0; - exec.count = 0; +fail: + free(dwords); + return NULL; +} - /* GPU will hang without at least one pixel alloc */ - if (!export_size) { - alloc.size = 0; - alloc.buffer_select = SQ_PARAMETER_PIXEL; - *cf++ = *(instr_cf_t*) &alloc; - exec.opc = EXEC_END; - *cf++ = *(instr_cf_t*) &exec; - } +struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type) +{ + struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf)); + DEBUG_MSG("%d", cf_type); + cf->shader = shader; + cf->cf_type = cf_type; + assert(shader->cfs_count < ARRAY_SIZE(shader->cfs)); + shader->cfs[shader->cfs_count++] = cf; + return cf; +} - unsigned num_cfs = cf - cfs; - /* insert nop to get an even # of CFs */ - if (num_cfs % 2) { - *cf++ = (instr_cf_t) { .opc = NOP }; - num_cfs++; - } +/* + * CF instructions: + */ - /* offset cf addrs */ - for (idx = 0; idx < num_cfs; idx++) { - switch (cfs[idx].opc) { - case EXEC: - case EXEC_END: - cfs[idx].exec.address += num_cfs / 2; +static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr) +{ + memset(instr, 0, sizeof(*instr)); + + instr->opc = cf->cf_type; + + switch (cf->cf_type) { + case NOP: + break; + case EXEC: + case EXEC_END: + assert(cf->exec.addr <= 0x1ff); + assert(cf->exec.cnt <= 0x6); + assert(cf->exec.sequence <= 0xfff); + instr->exec.address = cf->exec.addr; + instr->exec.count = cf->exec.cnt; + instr->exec.serialize = cf->exec.sequence; + break; + case ALLOC: + assert(cf->alloc.size <= 0xf); + instr->alloc.size = cf->alloc.size; + switch (cf->alloc.type) { + case SQ_POSITION: + case SQ_PARAMETER_PIXEL: + instr->alloc.buffer_select = cf->alloc.type; break; default: - break; - /* XXX and any other address using cf that gets implemented */ + ERROR_MSG("invalid alloc type: %d", cf->alloc.type); + return -1; } + break; + case COND_EXEC: + case COND_EXEC_END: + case COND_PRED_EXEC: + case COND_PRED_EXEC_END: + case LOOP_START: + case LOOP_END: + case COND_CALL: + case RETURN: + case COND_JMP: + case COND_EXEC_PRED_CLEAN: + case COND_EXEC_PRED_CLEAN_END: + case MARK_VS_FETCH_DONE: + ERROR_MSG("TODO"); + return -1; } - /* concatenate cfs+alufetchs */ - uint32_t cfdwords = num_cfs / 2 * 3; - uint32_t alufetchdwords = exec.address * 3; - info->sizedwords = cfdwords + alufetchdwords; - uint32_t *dwords = malloc(info->sizedwords * 4); - assert(dwords); - memcpy(dwords, cfs, cfdwords * 4); - memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4); - return dwords; + return 0; } -struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, - int instr_type) + +struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type) { struct ir2_instruction *instr = - ir2_alloc(shader, sizeof(struct ir2_instruction)); + ir2_alloc(cf->shader, sizeof(struct ir2_instruction)); DEBUG_MSG("%d", instr_type); - instr->shader = shader; - instr->idx = shader->instr_count; - instr->pred = shader->pred; + instr->shader = cf->shader; + instr->pred = cf->shader->pred; instr->instr_type = instr_type; - shader->instr[shader->instr_count++] = instr; + assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs)); + cf->exec.instrs[cf->exec.instrs_count++] = instr; return instr; } @@ -430,11 +279,15 @@ static int instr_emit_fetch(struct ir2_instruction *instr, struct ir2_shader_info *info) { instr_fetch_t *fetch = (instr_fetch_t *)dwords; - struct ir2_dst_register *dst_reg = &instr->dst_reg; - struct ir2_src_register *src_reg = &instr->src_reg[0]; + int reg = 0; + struct 
ir2_register *dst_reg = instr->regs[reg++]; + struct ir2_register *src_reg = instr->regs[reg++]; memset(fetch, 0, sizeof(*fetch)); + reg_update_stats(dst_reg, info, true); + reg_update_stats(src_reg, info, false); + fetch->opc = instr->fetch.opc; if (instr->fetch.opc == VTX_FETCH) { @@ -445,9 +298,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, assert(instr->fetch.const_idx <= 0x1f); assert(instr->fetch.const_idx_sel <= 0x3); - vtx->src_reg = src_to_reg(instr, src_reg); + vtx->src_reg = src_reg->num; vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1); - vtx->dst_reg = dst_to_reg(instr, dst_reg); + vtx->dst_reg = dst_reg->num; vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg); vtx->must_be_one = 1; vtx->const_index = instr->fetch.const_idx; @@ -473,9 +326,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr, assert(instr->fetch.const_idx <= 0x1f); - tex->src_reg = src_to_reg(instr, src_reg); + tex->src_reg = src_reg->num; tex->src_swiz = reg_fetch_src_swiz(src_reg, 3); - tex->dst_reg = dst_to_reg(instr, dst_reg); + tex->dst_reg = dst_reg->num; tex->dst_swiz = reg_fetch_dst_swiz(dst_reg); tex->const_idx = instr->fetch.const_idx; tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; @@ -488,7 +341,6 @@ static int instr_emit_fetch(struct ir2_instruction *instr, tex->use_comp_lod = 1; tex->use_reg_lod = !instr->fetch.is_cube; tex->sample_location = SAMPLE_CENTER; - tex->tx_coord_denorm = instr->fetch.is_rect; if (instr->pred != IR2_PRED_NONE) { tex->pred_select = 1; @@ -507,62 +359,95 @@ static int instr_emit_fetch(struct ir2_instruction *instr, * ALU instructions: */ -static int instr_emit_alu(struct ir2_instruction *instr_v, - struct ir2_instruction *instr_s, uint32_t *dwords, +static int instr_emit_alu(struct ir2_instruction *instr, uint32_t *dwords, struct ir2_shader_info *info) { + int reg = 0; instr_alu_t *alu = (instr_alu_t *)dwords; - struct ir2_dst_register *vdst_reg, *sdst_reg; - struct ir2_src_register *src1_reg, *src2_reg, *src3_reg; - struct ir2_shader *shader = instr_v ? instr_v->shader : instr_s->shader; - enum ir2_pred pred = IR2_PRED_NONE; + struct ir2_register *dst_reg = instr->regs[reg++]; + struct ir2_register *src1_reg; + struct ir2_register *src2_reg; + struct ir2_register *src3_reg; memset(alu, 0, sizeof(*alu)); - vdst_reg = NULL; - sdst_reg = NULL; - src1_reg = NULL; - src2_reg = NULL; - src3_reg = NULL; - - if (instr_v) { - vdst_reg = &instr_v->dst_reg; - assert(instr_v->src_reg_count >= 2); - src1_reg = &instr_v->src_reg[0]; - src2_reg = &instr_v->src_reg[1]; - if (instr_v->src_reg_count > 2) - src3_reg = &instr_v->src_reg[2]; - pred = instr_v->pred; + /* handle instructions w/ 3 src operands: */ + switch (instr->alu.vector_opc) { + case MULADDv: + case CNDEv: + case CNDGTEv: + case CNDGTv: + case DOT2ADDv: + /* note: disassembler lists 3rd src first, ie: + * MULADDv Rdst = Rsrc3 + (Rsrc1 * Rsrc2) + * which is the reason for this strange ordering. 
+ */ + src3_reg = instr->regs[reg++]; + break; + default: + src3_reg = NULL; + break; } - if (instr_s) { - sdst_reg = &instr_s->dst_reg; - assert(instr_s->src_reg_count == 1); - assert(!instr_v || vdst_reg->flags == sdst_reg->flags); - assert(!instr_v || pred == instr_s->pred); - if (src3_reg) { - assert(src3_reg->flags == instr_s->src_reg[0].flags); - assert(src3_reg->num == instr_s->src_reg[0].num); - assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle)); - } - src3_reg = &instr_s->src_reg[0]; - pred = instr_s->pred; - } + src1_reg = instr->regs[reg++]; + src2_reg = instr->regs[reg++]; - if (vdst_reg) { - assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0); - assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4)); - alu->vector_opc = instr_v->alu_vector.opc; - alu->vector_write_mask = reg_alu_dst_swiz(vdst_reg); - alu->vector_dest = dst_to_reg(instr_v, vdst_reg); - } else { + reg_update_stats(dst_reg, info, true); + reg_update_stats(src1_reg, info, false); + reg_update_stats(src2_reg, info, false); + + assert((dst_reg->flags & ~IR2_REG_EXPORT) == 0); + assert(!dst_reg->swizzle || (strlen(dst_reg->swizzle) == 4)); + assert((src1_reg->flags & IR2_REG_EXPORT) == 0); + assert(!src1_reg->swizzle || (strlen(src1_reg->swizzle) == 4)); + assert((src2_reg->flags & IR2_REG_EXPORT) == 0); + assert(!src2_reg->swizzle || (strlen(src2_reg->swizzle) == 4)); + + if (instr->alu.vector_opc == ~0) { alu->vector_opc = MAXv; + alu->vector_write_mask = 0; + } else { + alu->vector_opc = instr->alu.vector_opc; + alu->vector_write_mask = reg_alu_dst_swiz(dst_reg); } - if (sdst_reg) { - alu->scalar_opc = instr_s->alu_scalar.opc; + alu->vector_dest = dst_reg->num; + alu->export_data = !!(dst_reg->flags & IR2_REG_EXPORT); + + // TODO predicate case/condition.. need to add to parser + + alu->src2_reg = src2_reg->num; + alu->src2_swiz = reg_alu_src_swiz(src2_reg); + alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); + alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); + alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); + + alu->src1_reg = src1_reg->num; + alu->src1_swiz = reg_alu_src_swiz(src1_reg); + alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); + alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); + alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); + + alu->vector_clamp = instr->alu.vector_clamp; + alu->scalar_clamp = instr->alu.scalar_clamp; + + if (instr->alu.scalar_opc != ~0) { + struct ir2_register *sdst_reg = instr->regs[reg++]; + + reg_update_stats(sdst_reg, info, true); + + assert(sdst_reg->flags == dst_reg->flags); + + if (src3_reg) { + assert(src3_reg == instr->regs[reg]); + reg++; + } else { + src3_reg = instr->regs[reg++]; + } + + alu->scalar_dest = sdst_reg->num; alu->scalar_write_mask = reg_alu_dst_swiz(sdst_reg); - alu->scalar_dest = dst_to_reg(instr_s, sdst_reg); + alu->scalar_opc = instr->alu.scalar_opc; } else { /* not sure if this is required, but adreno compiler seems * to always set scalar opc to MAXs if it is not used: @@ -570,58 +455,13 @@ static int instr_emit_alu(struct ir2_instruction *instr_v, alu->scalar_opc = MAXs; } - alu->export_data = - !!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT); + if (src3_reg) { + reg_update_stats(src3_reg, info, false); - /* export32 has this bit set.. 
it seems to do more than just set - * the base address of the constants used to zero - * TODO make this less of a hack - */ - if (alu->export_data && alu->vector_dest == 32) { - assert(!instr_s); - alu->relative_addr = 1; - } - - if (src1_reg) { - if (src1_reg->flags & IR2_REG_CONST) { - assert(!(src1_reg->flags & IR2_REG_ABS)); - alu->src1_reg_const = src1_reg->num; - } else { - alu->src1_reg = shader->reg[src1_reg->num].reg; - alu->src1_reg_abs = !!(src1_reg->flags & IR2_REG_ABS); - } - alu->src1_swiz = reg_alu_src_swiz(src1_reg); - alu->src1_reg_negate = !!(src1_reg->flags & IR2_REG_NEGATE); - alu->src1_sel = !(src1_reg->flags & IR2_REG_CONST); - } else { - alu->src1_sel = 1; - } - - if (src2_reg) { - if (src2_reg->flags & IR2_REG_CONST) { - assert(!(src2_reg->flags & IR2_REG_ABS)); - alu->src2_reg_const = src2_reg->num; - } else { - alu->src2_reg = shader->reg[src2_reg->num].reg; - alu->src2_reg_abs = !!(src2_reg->flags & IR2_REG_ABS); - } - alu->src2_swiz = reg_alu_src_swiz(src2_reg); - alu->src2_reg_negate = !!(src2_reg->flags & IR2_REG_NEGATE); - alu->src2_sel = !(src2_reg->flags & IR2_REG_CONST); - } else { - alu->src2_sel = 1; - } - - if (src3_reg) { - if (src3_reg->flags & IR2_REG_CONST) { - assert(!(src3_reg->flags & IR2_REG_ABS)); - alu->src3_reg_const = src3_reg->num; - } else { - alu->src3_reg = shader->reg[src3_reg->num].reg; - alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); - } + alu->src3_reg = src3_reg->num; alu->src3_swiz = reg_alu_src_swiz(src3_reg); alu->src3_reg_negate = !!(src3_reg->flags & IR2_REG_NEGATE); + alu->src3_reg_abs = !!(src3_reg->flags & IR2_REG_ABS); alu->src3_sel = !(src3_reg->flags & IR2_REG_CONST); } else { /* not sure if this is required, but adreno compiler seems @@ -630,11 +470,9 @@ static int instr_emit_alu(struct ir2_instruction *instr_v, alu->src3_sel = 1; } - alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0; - alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0; - - if (pred != IR2_PRED_NONE) - alu->pred_select = (pred == IR2_PRED_EQ) ? 3 : 2; + if (instr->pred != IR2_PRED_NONE) { + alu->pred_select = (instr->pred == IR2_PRED_EQ) ? 
3 : 2; + } return 0; } @@ -644,63 +482,51 @@ static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords, { switch (instr->instr_type) { case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info); - case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info); - case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info); + case IR2_ALU: return instr_emit_alu(instr, dwords, info); } return -1; } -struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, + +struct ir2_register * ir2_reg_create(struct ir2_instruction *instr, int num, const char *swizzle, int flags) { - if (!(flags & IR2_REG_EXPORT)) { - struct ir2_register *reg = &instr->shader->reg[num]; - - unsigned i; - for (i = instr->shader->max_reg + 1; i <= num; i++) - instr->shader->reg[i].write_idx = -1; - instr->shader->max_reg = i - 1; - - if (reg->write_idx < 0) - reg->write_idx = instr->idx; - reg->write_idx2 = instr->idx; - } - - struct ir2_dst_register *reg = &instr->dst_reg; + struct ir2_register *reg = + ir2_alloc(instr->shader, sizeof(struct ir2_register)); + DEBUG_MSG("%x, %d, %s", flags, num, swizzle); + assert(num <= REG_MASK); reg->flags = flags; reg->num = num; reg->swizzle = ir2_strdup(instr->shader, swizzle); + assert(instr->regs_count < ARRAY_SIZE(instr->regs)); + instr->regs[instr->regs_count++] = reg; return reg; } -struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags) +static void reg_update_stats(struct ir2_register *reg, + struct ir2_shader_info *info, bool dest) { - assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg)); - if (!(flags & IR2_REG_CONST)) { - struct ir2_register *reg = &instr->shader->reg[num]; - - reg->read_idx = instr->idx; - - unsigned i; - for (i = instr->shader->max_reg + 1; i <= num; i++) - instr->shader->reg[i].write_idx = -1; - instr->shader->max_reg = i - 1; + if (!(reg->flags & (IR2_REG_CONST|IR2_REG_EXPORT))) { + info->max_reg = MAX2(info->max_reg, reg->num); + + if (dest) { + info->regs_written |= (1 << reg->num); + } else if (!(info->regs_written & (1 << reg->num))) { + /* for registers that haven't been written, they must be an + * input register that the thread scheduler (presumably?) 
+ * needs to know about: + */ + info->max_input_reg = MAX2(info->max_input_reg, reg->num); + } } - - struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++]; - reg->flags = flags; - reg->num = num; - reg->swizzle = ir2_strdup(instr->shader, swizzle); - return reg; } -static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n) +static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n) { uint32_t swiz = 0; int i; - assert((reg->flags & ~IR2_REG_INPUT) == 0); + assert(reg->flags == 0); assert(reg->swizzle); DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle); @@ -720,7 +546,7 @@ static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n) return swiz; } -static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg) +static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg) { uint32_t swiz = 0; int i; @@ -753,7 +579,7 @@ static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg) } /* actually, a write-mask */ -static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg) +static uint32_t reg_alu_dst_swiz(struct ir2_register *reg) { uint32_t swiz = 0; int i; @@ -780,11 +606,12 @@ static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg) return swiz; } -static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg) +static uint32_t reg_alu_src_swiz(struct ir2_register *reg) { uint32_t swiz = 0; int i; + assert((reg->flags & IR2_REG_EXPORT) == 0); assert(!reg->swizzle || (strlen(reg->swizzle) == 4)); DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle); diff --git a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h index ac2931266..822e5ec4c 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h +++ b/lib/mesa/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h @@ -33,25 +33,17 @@ struct ir2_shader; -#define REG_MASK 0xff - struct ir2_shader_info { uint16_t sizedwords; int8_t max_reg; /* highest GPR # used by shader */ + uint8_t max_input_reg; + uint64_t regs_written; }; struct ir2_register { - int16_t write_idx, write_idx2, read_idx, reg; - /* bitmask of variables on which this one depends - * XXX: use bitmask util? 
- */ - uint32_t regmask[REG_MASK/32+1]; -}; - -struct ir2_src_register { enum { - IR2_REG_INPUT = 0x1, - IR2_REG_CONST = 0x2, + IR2_REG_CONST = 0x1, + IR2_REG_EXPORT = 0x2, IR2_REG_NEGATE = 0x4, IR2_REG_ABS = 0x8, } flags; @@ -59,14 +51,6 @@ struct ir2_src_register { char *swizzle; }; -struct ir2_dst_register { - enum { - IR2_REG_EXPORT = 0x1, - } flags; - int num; - char *swizzle; -}; - enum ir2_pred { IR2_PRED_NONE = 0, IR2_PRED_EQ = 1, @@ -75,17 +59,14 @@ enum ir2_pred { struct ir2_instruction { struct ir2_shader *shader; - unsigned idx; enum { IR2_FETCH, - IR2_ALU_VECTOR, - IR2_ALU_SCALAR, + IR2_ALU, } instr_type; enum ir2_pred pred; int sync; - unsigned src_reg_count; - struct ir2_dst_register dst_reg; - struct ir2_src_register src_reg[3]; + unsigned regs_count; + struct ir2_register *regs[5]; union { /* FETCH specific: */ struct { @@ -93,7 +74,6 @@ struct ir2_instruction { unsigned const_idx; /* texture fetch specific: */ bool is_cube : 1; - bool is_rect : 1; /* vertex fetch specific: */ unsigned const_idx_sel; enum a2xx_sq_surfaceformat fmt; @@ -102,25 +82,38 @@ struct ir2_instruction { uint32_t stride; uint32_t offset; } fetch; - /* ALU-Vector specific: */ + /* ALU specific: */ struct { - instr_vector_opc_t opc; - bool clamp; - } alu_vector; - /* ALU-Scalar specific: */ + instr_vector_opc_t vector_opc; + instr_scalar_opc_t scalar_opc; + bool vector_clamp : 1; + bool scalar_clamp : 1; + } alu; + }; +}; + +struct ir2_cf { + struct ir2_shader *shader; + instr_cf_opc_t cf_type; + + union { + /* EXEC/EXEC_END specific: */ + struct { + unsigned instrs_count; + struct ir2_instruction *instrs[6]; + uint32_t addr, cnt, sequence; + } exec; + /* ALLOC specific: */ struct { - instr_scalar_opc_t opc; - bool clamp; - } alu_scalar; + instr_alloc_type_t type; /* SQ_POSITION or SQ_PARAMETER_PIXEL */ + int size; + } alloc; }; }; struct ir2_shader { - unsigned instr_count; - int max_reg; - struct ir2_register reg[REG_MASK+1]; - - struct ir2_instruction *instr[0x200]; + unsigned cfs_count; + struct ir2_cf *cfs[0x56]; uint32_t heap[100 * 4096]; unsigned heap_idx; @@ -132,41 +125,40 @@ void ir2_shader_destroy(struct ir2_shader *shader); void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info); -struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader, - int instr_type); +struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type); -struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr, - int num, const char *swizzle, int flags); -struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr, +struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type); + +struct ir2_register * ir2_reg_create(struct ir2_instruction *instr, int num, const char *swizzle, int flags); /* some helper fxns: */ -static inline struct ir2_instruction * -ir2_instr_create_alu_v(struct ir2_shader *shader, instr_vector_opc_t vop) +static inline struct ir2_cf * +ir2_cf_create_alloc(struct ir2_shader *shader, instr_alloc_type_t type, int size) { - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_VECTOR); - if (!instr) - return instr; - instr->alu_vector.opc = vop; - return instr; + struct ir2_cf *cf = ir2_cf_create(shader, ALLOC); + if (!cf) + return cf; + cf->alloc.type = type; + cf->alloc.size = size; + return cf; } - static inline struct ir2_instruction * -ir2_instr_create_alu_s(struct ir2_shader *shader, instr_scalar_opc_t sop) +ir2_instr_create_alu(struct ir2_cf *cf, instr_vector_opc_t vop, instr_scalar_opc_t sop) 
{ - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_SCALAR); + struct ir2_instruction *instr = ir2_instr_create(cf, IR2_ALU); if (!instr) return instr; - instr->alu_scalar.opc = sop; + instr->alu.vector_opc = vop; + instr->alu.scalar_opc = sop; return instr; } - static inline struct ir2_instruction * -ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis, +ir2_instr_create_vtx_fetch(struct ir2_cf *cf, int ci, int cis, enum a2xx_sq_surfaceformat fmt, bool is_signed, int stride) { - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH); + struct ir2_instruction *instr = instr = ir2_instr_create(cf, IR2_FETCH); instr->fetch.opc = VTX_FETCH; instr->fetch.const_idx = ci; instr->fetch.const_idx_sel = cis; @@ -176,9 +168,9 @@ ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis, return instr; } static inline struct ir2_instruction * -ir2_instr_create_tex_fetch(struct ir2_shader *shader, int ci) +ir2_instr_create_tex_fetch(struct ir2_cf *cf, int ci) { - struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH); + struct ir2_instruction *instr = instr = ir2_instr_create(cf, IR2_FETCH); instr->fetch.opc = TEX_FETCH; instr->fetch.const_idx = ci; return instr; diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c index b6ef6e4b5..07e03d269 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -1,3 +1,5 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + /* * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> * @@ -33,13 +35,9 @@ static void print_instr_name(struct ir3_instruction *instr) { - if (!instr) - return; #ifdef DEBUG printf("%04u:", instr->serialno); #endif - printf("%04u:", instr->name); - printf("%04u:", instr->ip); printf("%03u: ", instr->depth); if (instr->flags & IR3_INSTR_SY) @@ -48,15 +46,22 @@ static void print_instr_name(struct ir3_instruction *instr) printf("(ss)"); if (is_meta(instr)) { - switch (instr->opc) { - case OPC_META_INPUT: printf("_meta:in"); break; - case OPC_META_FO: printf("_meta:fo"); break; - case OPC_META_FI: printf("_meta:fi"); break; - - /* shouldn't hit here.. just for debugging: */ - default: printf("_meta:%d", instr->opc); break; + switch(instr->opc) { + case OPC_META_PHI: + printf("Φ"); + break; + default: + /* shouldn't hit here.. 
just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: printf("_meta:in"); break; + case OPC_META_FO: printf("_meta:fo"); break; + case OPC_META_FI: printf("_meta:fi"); break; + + default: printf("_meta:%d", instr->opc); break; + } + break; } - } else if (instr->opc == OPC_MOV) { + } else if (instr->category == 1) { static const char *type[] = { [TYPE_F16] = "f16", [TYPE_F32] = "f32", @@ -89,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr) } } -static void print_reg_name(struct ir3_register *reg) +static void print_reg_name(struct ir3_register *reg, bool followssa) { if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) && (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))) @@ -101,29 +106,20 @@ static void print_reg_name(struct ir3_register *reg) if (reg->flags & IR3_REG_IMMED) { printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); - } else if (reg->flags & IR3_REG_ARRAY) { - printf("arr[id=%u, offset=%d, size=%u", reg->array.id, - reg->array.offset, reg->size); - /* for ARRAY we could have null src, for example first write - * instruction.. - */ - if (reg->instr) { - printf(", _["); + } else if (reg->flags & IR3_REG_SSA) { + printf("_"); + if (followssa) { + printf("["); print_instr_name(reg->instr); printf("]"); } - printf("]"); - } else if (reg->flags & IR3_REG_SSA) { - printf("_["); - print_instr_name(reg->instr); - printf("]"); } else if (reg->flags & IR3_REG_RELATIV) { if (reg->flags & IR3_REG_HALF) printf("h"); if (reg->flags & IR3_REG_CONST) - printf("c<a0.x + %d>", reg->array.offset); + printf("c<a0.x + %u>", reg->num); else - printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size); + printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size); } else { if (reg->flags & IR3_REG_HALF) printf("h"); @@ -141,6 +137,16 @@ tab(int lvl) printf("\t"); } +static uint32_t +block_id(struct ir3_block *block) +{ +#ifdef DEBUG + return block->serialno; +#else + return (uint32_t)(uint64_t)block; +#endif +} + static void print_instr(struct ir3_instruction *instr, int lvl) { @@ -152,7 +158,7 @@ print_instr(struct ir3_instruction *instr, int lvl) for (i = 0; i < instr->regs_count; i++) { struct ir3_register *reg = instr->regs[i]; printf(i ? 
", " : " "); - print_reg_name(reg); + print_reg_name(reg, !!i); } if (instr->address) { @@ -162,6 +168,13 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } + if (instr->fanin) { + printf(", fanin=_"); + printf("["); + print_instr_name(instr->fanin); + printf("]"); + } + if (instr->cp.left) { printf(", left=_"); printf("["); @@ -176,8 +189,12 @@ print_instr(struct ir3_instruction *instr, int lvl) printf("]"); } - if (instr->opc == OPC_META_FO) { - printf(", off=%d", instr->fo.off); + if (is_meta(instr)) { + if (instr->opc == OPC_META_FO) { + printf(", off=%d", instr->fo.off); + } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { + printf(", aid=%d", instr->fi.aid); + } } if (is_flow(instr) && instr->cat0.target) { @@ -188,17 +205,6 @@ print_instr(struct ir3_instruction *instr, int lvl) printf(", target=block%u", block_id(instr->cat0.target)); } - if (instr->deps_count) { - printf(", false-deps:"); - for (unsigned i = 0; i < instr->deps_count; i++) { - if (i > 0) - printf(", "); - printf("_["); - print_instr_name(instr->deps[i]); - printf("]"); - } - } - printf("\n"); } @@ -211,28 +217,9 @@ static void print_block(struct ir3_block *block, int lvl) { tab(lvl); printf("block%u {\n", block_id(block)); - - if (block->predecessors_count > 0) { - tab(lvl+1); - printf("pred: "); - for (unsigned i = 0; i < block->predecessors_count; i++) { - if (i) - printf(", "); - printf("block%u", block_id(block->predecessors[i])); - } - printf("\n"); - } - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { print_instr(instr, lvl+1); } - - tab(lvl+1); printf("/* keeps:\n"); - for (unsigned i = 0; i < block->keeps_count; i++) { - print_instr(block->keeps[i], lvl+2); - } - tab(lvl+1); printf(" */\n"); - if (block->successors[1]) { /* leading into if/else: */ tab(lvl+1); diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c index 6552980d9..2ee325518 100644 --- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c +++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c @@ -1,3 +1,5 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + /* * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> * @@ -32,12 +34,11 @@ /* * Instruction Scheduling: * - * A recursive depth based scheduling algo. Recursively find an eligible - * instruction to schedule from the deepest instruction (recursing through - * it's unscheduled src instructions). Normally this would result in a - * lot of re-traversal of the same instructions, so we cache results in - * instr->data (and clear cached results that would be no longer valid - * after scheduling an instruction). + * A priority-queue based scheduling algo. Add eligible instructions, + * ie. ones with all their dependencies scheduled, to the priority + * (depth) sorted queue (list). Pop highest priority instruction off + * the queue and schedule it, add newly eligible instructions to the + * priority queue, rinse, repeat. * * There are a few special cases that need to be handled, since sched * is currently independent of register allocation. 
Usages of address @@ -51,7 +52,6 @@ struct ir3_sched_ctx { struct ir3_block *block; /* the current block */ - struct list_head depth_list; /* depth sorted unscheduled instrs */ struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/ struct ir3_instruction *addr; /* current a0.x user, if any */ struct ir3_instruction *pred; /* current p0.x user, if any */ @@ -63,17 +63,6 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr) return is_sfu(instr) || is_mem(instr); } -#define NULL_INSTR ((void *)~0) - -static void -clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) -{ - list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) { - if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr) - instr2->data = NULL; - } -} - static void schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { @@ -104,103 +93,30 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) list_addtail(&instr->node, &instr->block->instr_list); ctx->scheduled = instr; - - if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) { - clear_cache(ctx, NULL); - } else { - /* invalidate only the necessary entries.. */ - clear_cache(ctx, instr); - } -} - -static struct ir3_instruction * -deepest(struct ir3_instruction **srcs, unsigned nsrcs) -{ - struct ir3_instruction *d = NULL; - unsigned i = 0, id = 0; - - while ((i < nsrcs) && !(d = srcs[id = i])) - i++; - - if (!d) - return NULL; - - for (; i < nsrcs; i++) - if (srcs[i] && (srcs[i]->depth > d->depth)) - d = srcs[id = i]; - - srcs[id] = NULL; - - return d; } -/** - * @block: the block to search in, starting from end; in first pass, - * this will be the block the instruction would be inserted into - * (but has not yet, ie. it only contains already scheduled - * instructions). For intra-block scheduling (second pass), this - * would be one of the predecessor blocks. - * @instr: the instruction to search for - * @maxd: max distance, bail after searching this # of instruction - * slots, since it means the instruction we are looking for is - * far enough away - * @pred: if true, recursively search into predecessor blocks to - * find the worst case (shortest) distance (only possible after - * individual blocks are all scheduled - */ static unsigned -distance(struct ir3_block *block, struct ir3_instruction *instr, - unsigned maxd, bool pred) +distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr, + unsigned maxd) { + struct list_head *instr_list = &ctx->block->instr_list; unsigned d = 0; - list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) { + list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) { if ((n == instr) || (d >= maxd)) - return d; - /* NOTE: don't count branch/jump since we don't know yet if they will - * be eliminated later in resolve_jumps().. really should do that - * earlier so we don't have this constraint. - */ - if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR))) + break; + if (is_alu(n) || is_flow(n)) d++; } - /* if coming from a predecessor block, assume it is assigned far - * enough away.. we'll fix up later. 
- */ - if (!pred) - return maxd; - - if (pred && (block->data != block)) { - /* Search into predecessor blocks, finding the one with the - * shortest distance, since that will be the worst case - */ - unsigned min = maxd - d; - - /* (ab)use block->data to prevent recursion: */ - block->data = block; - - for (unsigned i = 0; i < block->predecessors_count; i++) { - unsigned n; - - n = distance(block->predecessors[i], instr, min, pred); - - min = MIN2(min, n); - } - - block->data = NULL; - d += min; - } - return d; } /* calculate delay for specified src: */ static unsigned -delay_calc_srcn(struct ir3_block *block, +delay_calc_srcn(struct ir3_sched_ctx *ctx, struct ir3_instruction *assigner, - struct ir3_instruction *consumer, - unsigned srcn, bool soft, bool pred) + struct ir3_instruction *consumer, unsigned srcn) { unsigned delay = 0; @@ -208,20 +124,14 @@ delay_calc_srcn(struct ir3_block *block, struct ir3_instruction *src; foreach_ssa_src(src, assigner) { unsigned d; - d = delay_calc_srcn(block, src, consumer, srcn, soft, pred); + if (src->block != assigner->block) + break; + d = delay_calc_srcn(ctx, src, consumer, srcn); delay = MAX2(delay, d); } } else { - if (soft) { - if (is_sfu(assigner)) { - delay = 4; - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - } else { - delay = ir3_delayslots(assigner, consumer, srcn); - } - delay -= distance(block, assigner, delay, pred); + delay = ir3_delayslots(assigner, consumer, srcn); + delay -= distance(ctx, assigner, delay); } return delay; @@ -229,15 +139,16 @@ delay_calc_srcn(struct ir3_block *block, /* calculate delay for instruction (maximum of delay for all srcs): */ static unsigned -delay_calc(struct ir3_block *block, struct ir3_instruction *instr, - bool soft, bool pred) +delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) { unsigned delay = 0; struct ir3_instruction *src; foreach_ssa_src_n(src, i, instr) { unsigned d; - d = delay_calc_srcn(block, src, instr, i, soft, pred); + if (src->block != instr->block) + continue; + d = delay_calc_srcn(ctx, src, instr, i); delay = MAX2(delay, d); } @@ -260,51 +171,10 @@ static bool is_scheduled(struct ir3_instruction *instr) return !!(instr->flags & IR3_INSTR_MARK); } -/* could an instruction be scheduled if specified ssa src was scheduled? */ -static bool -could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) -{ - struct ir3_instruction *other_src; - foreach_ssa_src(other_src, instr) { - /* if dependency not scheduled, we aren't ready yet: */ - if ((src != other_src) && !is_scheduled(other_src)) { - return false; - } - } - return true; -} - -/* Check if instruction is ok to schedule. Make sure it is not blocked - * by use of addr/predicate register, etc. - */ static bool -check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, +check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, struct ir3_instruction *instr) { - /* For instructions that write address register we need to - * make sure there is at least one instruction that uses the - * addr value which is otherwise ready. - * - * TODO if any instructions use pred register and have other - * src args, we would need to do the same for writes_pred().. 
- */ - if (writes_addr(instr)) { - struct ir3 *ir = instr->block->shader; - bool ready = false; - for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { - struct ir3_instruction *indirect = ir->indirects[i]; - if (!indirect) - continue; - if (indirect->address != instr) - continue; - ready = could_sched(indirect, instr); - } - - /* nothing could be scheduled, so keep looking: */ - if (!ready) - return false; - } - /* if this is a write to address/predicate register, and that * register is currently in use, we need to defer until it is * free: @@ -312,15 +182,52 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, if (writes_addr(instr) && ctx->addr) { debug_assert(ctx->addr != instr); notes->addr_conflict = true; - return false; + return true; } if (writes_pred(instr) && ctx->pred) { debug_assert(ctx->pred != instr); notes->pred_conflict = true; - return false; + return true; + } + + return false; +} + +/* is this instruction ready to be scheduled? Return negative for not + * ready (updating notes if needed), or >= 0 to indicate number of + * delay slots needed. + */ +static int +instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + unsigned delay = 0; + + /* Phi instructions can have a dependency on something not + * scheduled yet (for ex, loops). But OTOH we don't really + * care. By definition phi's should appear at the top of + * the block, and it's sources should be values from the + * previously executing block, so they are always ready to + * be scheduled: + */ + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) + return 0; + + foreach_ssa_src(src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if (!is_scheduled(src)) + return -1; } + /* all our dependents are scheduled, figure out if + * we have enough delay slots to schedule ourself: + */ + delay = delay_calc(ctx, instr); + if (delay) + return delay; + /* if the instruction is a kill, we need to ensure *every* * bary.f is scheduled. The hw seems unhappy if the thread * gets killed before the end-input (ei) flag is hit. @@ -339,110 +246,80 @@ check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, for (unsigned i = 0; i < ir->baryfs_count; i++) { struct ir3_instruction *baryf = ir->baryfs[i]; - if (baryf->flags & IR3_INSTR_UNUSED) + if (baryf->depth == DEPTH_UNUSED) continue; if (!is_scheduled(baryf)) { notes->blocked_kill = true; - return false; + return -1; } } } - return true; + if (check_conflict(ctx, notes, instr)) + return -1; + + return 0; } -/* Find the best instruction to schedule from specified instruction or - * recursively it's ssa sources. - */ -static struct ir3_instruction * -find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - struct ir3_instruction *instr) +/* could an instruction be scheduled if specified ssa src was scheduled? */ +static bool +could_sched(struct ir3_instruction *instr, struct ir3_instruction *src) { - struct ir3_instruction *srcs[__ssa_src_cnt(instr)]; - struct ir3_instruction *src; - unsigned nsrcs = 0; - - if (is_scheduled(instr)) - return NULL; - - /* use instr->data to cache the results of recursing up the - * instr src's. Otherwise the recursive algo can scale quite - * badly w/ shader size. But this takes some care to clear - * the cache appropriately when instructions are scheduled. 
- */ - if (instr->data) { - if (instr->data == NULL_INSTR) - return NULL; - return instr->data; - } - - /* find unscheduled srcs: */ - foreach_ssa_src(src, instr) { - if (!is_scheduled(src)) { - debug_assert(nsrcs < ARRAY_SIZE(srcs)); - srcs[nsrcs++] = src; - } - } - - /* if all our src's are already scheduled: */ - if (nsrcs == 0) { - if (check_instr(ctx, notes, instr)) { - instr->data = instr; - return instr; - } - return NULL; - } - - while ((src = deepest(srcs, nsrcs))) { - struct ir3_instruction *candidate; - - candidate = find_instr_recursive(ctx, notes, src); - if (!candidate) - continue; - - if (check_instr(ctx, notes, candidate)) { - instr->data = candidate; - return candidate; + struct ir3_instruction *other_src; + foreach_ssa_src(other_src, instr) { + /* if dependency not scheduled, we aren't ready yet: */ + if ((src != other_src) && !is_scheduled(other_src)) { + return false; } } - - instr->data = NULL_INSTR; - return NULL; + return true; } -/* find instruction to schedule: */ -static struct ir3_instruction * -find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, - bool soft) +/* move eligible instructions to the priority list: */ +static unsigned +add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes, + struct list_head *prio_queue, struct list_head *unscheduled_list) { - struct ir3_instruction *best_instr = NULL; unsigned min_delay = ~0; - /* TODO we'd really rather use the list/array of block outputs. But we - * don't have such a thing. Recursing *every* instruction in the list - * will result in a lot of repeated traversal, since instructions will - * get traversed both when they appear as ssa src to a later instruction - * as well as where they appear in the depth_list. - */ - list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) { - struct ir3_instruction *candidate; - unsigned delay; - - candidate = find_instr_recursive(ctx, notes, instr); - if (!candidate) + list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) { + int e = instr_eligibility(ctx, notes, instr); + if (e < 0) continue; - delay = delay_calc(ctx->block, candidate, soft, false); - if (delay < min_delay) { - best_instr = candidate; - min_delay = delay; + /* For instructions that write address register we need to + * make sure there is at least one instruction that uses the + * addr value which is otherwise ready. + * + * TODO if any instructions use pred register and have other + * src args, we would need to do the same for writes_pred().. 
+ */ + if (unlikely(writes_addr(instr))) { + struct ir3 *ir = instr->block->shader; + bool ready = false; + for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) { + struct ir3_instruction *indirect = ir->indirects[i]; + if (!indirect) + continue; + if (indirect->address != instr) + continue; + ready = could_sched(indirect, instr); + } + + /* nothing could be scheduled, so keep looking: */ + if (!ready) + continue; } - if (min_delay == 0) - break; + min_delay = MIN2(min_delay, e); + if (e == 0) { + /* remove from unscheduled list and into priority queue: */ + list_delinit(&instr->node); + ir3_insert_by_depth(instr, prio_queue); + } } - return best_instr; + return min_delay; } /* "spill" the address register by remapping any unscheduled @@ -536,56 +413,50 @@ split_pred(struct ir3_sched_ctx *ctx) static void sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) { - struct list_head unscheduled_list; + struct list_head unscheduled_list, prio_queue; ctx->block = block; - /* addr/pred writes are per-block: */ - ctx->addr = NULL; - ctx->pred = NULL; - /* move all instructions to the unscheduled list, and * empty the block's instruction list (to which we will - * be inserting). + * be inserting. */ list_replace(&block->instr_list, &unscheduled_list); list_inithead(&block->instr_list); - list_inithead(&ctx->depth_list); + list_inithead(&prio_queue); - /* first a pre-pass to schedule all meta:input instructions + /* first a pre-pass to schedule all meta:input/phi instructions * (which need to appear first so that RA knows the register is - * occupied), and move remaining to depth sorted list: + * occupied: */ list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) { - if (instr->opc == OPC_META_INPUT) { + if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) || + (instr->opc == OPC_META_PHI))) schedule(ctx, instr); - } else { - ir3_insert_by_depth(instr, &ctx->depth_list); - } } - while (!list_empty(&ctx->depth_list)) { + while (!(list_empty(&unscheduled_list) && + list_empty(&prio_queue))) { struct ir3_sched_notes notes = {0}; - struct ir3_instruction *instr; - - instr = find_eligible_instr(ctx, ¬es, true); - if (!instr) - instr = find_eligible_instr(ctx, ¬es, false); + unsigned delay; - if (instr) { - unsigned delay = delay_calc(ctx->block, instr, false, false); + delay = add_eligible_instrs(ctx, ¬es, &prio_queue, &unscheduled_list); - /* and if we run out of instructions that can be scheduled, - * then it is time for nop's: + if (!list_empty(&prio_queue)) { + struct ir3_instruction *instr = list_last_entry(&prio_queue, + struct ir3_instruction, node); + /* ugg, this is a bit ugly, but between the time when + * the instruction became eligible and now, a new + * conflict may have arose.. */ - debug_assert(delay <= 6); - while (delay > 0) { - ir3_NOP(block); - delay--; + if (check_conflict(ctx, ¬es, instr)) { + list_del(&instr->node); + list_addtail(&instr->node, &unscheduled_list); + continue; } schedule(ctx, instr); - } else { + } else if (delay == ~0) { struct ir3_instruction *new_instr = NULL; /* nothing available to schedule.. if we are blocked on @@ -604,17 +475,23 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) } if (new_instr) { - /* clearing current addr/pred can change what is - * available to schedule, so clear cache.. 
- */ - clear_cache(ctx, NULL); - - ir3_insert_by_depth(new_instr, &ctx->depth_list); + list_del(&new_instr->node); + list_addtail(&new_instr->node, &unscheduled_list); /* the original instr that wrote addr/pred may have * originated from a different block: */ new_instr->block = block; } + + } else { + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + debug_assert(delay <= 6); + while (delay > 0) { + ir3_NOP(block); + delay--; + } } } @@ -630,7 +507,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) debug_assert(ctx->pred); debug_assert(block->condition); - delay -= distance(ctx->block, ctx->pred, delay, false); + delay -= distance(ctx, ctx->pred, delay); while (delay > 0) { ir3_NOP(block); @@ -669,150 +546,36 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) */ } -/* After scheduling individual blocks, we still could have cases where - * one (or more) paths into a block, a value produced by a previous - * has too few delay slots to be legal. We can't deal with this in the - * first pass, because loops (ie. we can't ensure all predecessor blocks - * are already scheduled in the first pass). All we can really do at - * this point is stuff in extra nop's until things are legal. - */ +/* this is needed to ensure later RA stage succeeds: */ static void -sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) +sched_insert_parallel_copies(struct ir3_block *block) { - unsigned n = 0; - - ctx->block = block; - - list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) { - unsigned delay = 0; - - for (unsigned i = 0; i < block->predecessors_count; i++) { - unsigned d = delay_calc(block->predecessors[i], instr, false, true); - delay = MAX2(d, delay); - } - - while (delay > n) { - struct ir3_instruction *nop = ir3_NOP(block); - - /* move to before instr: */ - list_delinit(&nop->node); - list_addtail(&nop->node, &instr->node); - - n++; + list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { + if (is_meta(instr) && (instr->opc == OPC_META_PHI)) { + struct ir3_register *reg; + foreach_src(reg, instr) { + struct ir3_instruction *src = reg->instr; + struct ir3_instruction *mov = + ir3_MOV(src->block, src, TYPE_U32); + mov->regs[0]->flags |= IR3_REG_PHI_SRC; + mov->regs[0]->instr = instr; + reg->instr = mov; + } } - - /* we can bail once we hit worst case delay: */ - if (++n > 6) - break; } } int ir3_sched(struct ir3 *ir) { struct ir3_sched_ctx ctx = {0}; - - ir3_clear_mark(ir); - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - sched_block(&ctx, block); + sched_insert_parallel_copies(block); } - + ir3_clear_mark(ir); list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - sched_intra_block(&ctx, block); + sched_block(&ctx, block); } - if (ctx.error) return -1; return 0; } - -/* does instruction 'prior' need to be scheduled before 'instr'? 
*/ -static bool -depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior) -{ - /* TODO for dependencies that are related to a specific object, ie - * a specific SSBO/image/array, we could relax this constraint to - * make accesses to unrelated objects not depend on each other (at - * least as long as not declared coherent) - */ - if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) || - ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class)) - return true; - return !!(instr->barrier_class & prior->barrier_conflict); -} - -static void -add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr) -{ - struct list_head *prev = instr->node.prev; - struct list_head *next = instr->node.next; - - /* add dependencies on previous instructions that must be scheduled - * prior to the current instruction - */ - while (prev != &block->instr_list) { - struct ir3_instruction *pi = - LIST_ENTRY(struct ir3_instruction, prev, node); - - prev = prev->prev; - - if (is_meta(pi)) - continue; - - if (instr->barrier_class == pi->barrier_class) { - ir3_instr_add_dep(instr, pi); - break; - } - - if (depends_on(instr, pi)) - ir3_instr_add_dep(instr, pi); - } - - /* add dependencies on this instruction to following instructions - * that must be scheduled after the current instruction: - */ - while (next != &block->instr_list) { - struct ir3_instruction *ni = - LIST_ENTRY(struct ir3_instruction, next, node); - - next = next->next; - - if (is_meta(ni)) - continue; - - if (instr->barrier_class == ni->barrier_class) { - ir3_instr_add_dep(ni, instr); - break; - } - - if (depends_on(ni, instr)) - ir3_instr_add_dep(ni, instr); - } -} - -/* before scheduling a block, we need to add any necessary false-dependencies - * to ensure that: - * - * (1) barriers are scheduled in the right order wrt instructions related - * to the barrier - * - * (2) reads that come before a write actually get scheduled before the - * write - */ -static void -calculate_deps(struct ir3_block *block) -{ - list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) { - if (instr->barrier_class) { - add_barrier_deps(block, instr); - } - } -} - -void -ir3_sched_add_deps(struct ir3 *ir) -{ - list_for_each_entry (struct ir3_block, block, &ir->block_list, node) { - calculate_deps(block); - } -}
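
For reference, the sequence word built in shader_resolve() packs two control bits per EXEC instruction, scanned in reverse so that instruction 0 lands in the low-order bits. A minimal standalone sketch of that packing, where is_fetch and needs_sync are hypothetical stand-ins for instr->instr_type == IR2_FETCH and instr->sync:

#include <stdbool.h>
#include <stdint.h>

/* Each instruction in an EXEC clause contributes two bits to the CF
 * "sequence" word: bit 0 set means a FETCH (vs ALU) instruction,
 * bit 1 requests a sync.  Scanning backwards shifts instruction 0
 * into the low bits, matching the loop in shader_resolve().  With at
 * most six instructions the result fits the 12-bit field asserted in
 * cf_emit() (sequence <= 0xfff).
 */
static uint32_t pack_sequence(const bool *is_fetch, const bool *needs_sync,
		int count)
{
	uint32_t sequence = 0;
	for (int j = count - 1; j >= 0; j--) {
		sequence <<= 2;
		if (is_fetch[j])
			sequence |= 0x1;
		if (needs_sync[j])
			sequence |= 0x2;
	}
	return sequence;
}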
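
The NOP padding in ir2_shader_assemble() exists because CF words are emitted in pairs, two 48-bit CFs per three dwords; the ALU/FETCH section then begins right after the CF section, which is the "addr" base that shader_resolve() starts counting from. A small sketch of that layout math, assuming ALIGN behaves like Mesa's align():

#include <stdint.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

/* CF count is first padded to an even number (hence the NOP insert),
 * then every pair of 48-bit CF words occupies three 32-bit dwords.
 * The value returned here is the dword offset at which ALU/FETCH
 * instruction slots begin.
 */
static uint32_t cf_section_dwords(uint32_t cfs_count)
{
	return ALIGN(cfs_count, 2) / 2 * 3;
}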
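
reg_update_stats() infers input registers from write order: a GPR that is read before the shader ever writes it must have been pre-loaded by the thread scheduler. A sketch of that rule, assuming 64 GPRs so a single 64-bit mask suffices; the hypothetical struct shader_stats mirrors the fields added to ir2_shader_info:

#include <stdbool.h>
#include <stdint.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

struct shader_stats {
	int8_t   max_reg;       /* highest GPR number used */
	uint8_t  max_input_reg; /* highest GPR read before any write */
	uint64_t regs_written;  /* one bit per GPR */
};

/* Called once per non-const, non-export register operand: writes set
 * the corresponding bit, and a read whose bit is still clear is
 * treated as an input register.
 */
static void track_reg(struct shader_stats *s, unsigned num, bool dest)
{
	s->max_reg = MAX2(s->max_reg, (int8_t)num);
	if (dest)
		s->regs_written |= (1ull << num);
	else if (!(s->regs_written & (1ull << num)))
		s->max_input_reg = MAX2(s->max_input_reg, (uint8_t)num);
}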
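
The new header comment in ir3_sched.c describes a priority-queue list scheduler: instructions whose ssa sources are all scheduled become eligible, the deepest eligible instruction is emitted, and the process repeats. A sketch of that rinse-repeat selection, using simplified stand-in types (struct insn and a flat array instead of ir3's depth-sorted list):

#include <stdbool.h>
#include <stddef.h>

struct insn {
	int depth;              /* priority: depth from the end of the program */
	int nsrcs;
	struct insn *src[4];    /* ssa sources */
	bool scheduled;
};

/* eligible once every ssa source has already been emitted */
static bool ready(const struct insn *in)
{
	for (int i = 0; i < in->nsrcs; i++)
		if (!in->src[i]->scheduled)
			return false;
	return true;
}

/* One round of the loop: pick the deepest eligible instruction, or
 * return NULL so the caller emits a nop (or resolves an addr/pred
 * conflict) instead.
 */
static struct insn *pick_next(struct insn **unscheduled, size_t n)
{
	struct insn *best = NULL;
	for (size_t i = 0; i < n; i++) {
		struct insn *in = unscheduled[i];
		if (!in || in->scheduled || !ready(in))
			continue;
		if (!best || in->depth > best->depth)
			best = in;
	}
	if (best)
		best->scheduled = true;
	return best;
}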
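
Finally, the per-source delay computed by delay_calc_srcn() reduces to: the producer's required delay slots, minus how far back it already sits in the emitted instruction stream, floored at zero because distance() is capped at the required delay. delay_calc() takes the maximum over all sources, and any remainder is filled with nops (at most 6, per the debug_assert in sched_block()). As a one-function sketch, with required_slots standing in for ir3_delayslots() and distance for distance():

/* delay still owed to a producer: its hardware delay-slot requirement
 * minus the alu/flow instructions already emitted since it.
 */
static unsigned src_delay(unsigned required_slots, unsigned distance)
{
	return (distance >= required_slots) ? 0 : required_slots - distance;
}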