author     Jonathan Gray <jsg@cvs.openbsd.org>  2016-05-29 10:22:51 +0000
committer  Jonathan Gray <jsg@cvs.openbsd.org>  2016-05-29 10:22:51 +0000
commit     c9223eed3c16cd3e98a8f56dda953d8f299de0e3 (patch)
tree       53e2a1c3f13bcf6b4ed201d7bc135e7213c94ebe /lib/mesa/src/gallium/drivers/freedreno/ir3
parent     6e8f2d062ab9c198239b9283b2b7ed12f4ea17d8 (diff)
Import Mesa 11.2.2
Diffstat (limited to 'lib/mesa/src/gallium/drivers/freedreno/ir3')
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c            |    4
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h             |    1
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c                    |   35
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h                    |  104
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c            |   36
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c       |  932
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c                 |  201
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c              |   12
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c                |  153
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h                |   10
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c  |   14
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c              |   34
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c                 |  340
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c              |  332
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c             |  140
-rw-r--r--  lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h             |  111
16 files changed, 1431 insertions, 1028 deletions
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 83ed5ffdc..599872470 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -220,7 +220,7 @@ static void print_instr_cat1(instr_t *instr)
else if (cat1->off > 0)
printf("%c<a0.x + %d>", type, cat1->off);
else
- printf("c<a0.x>");
+ printf("%c<a0.x>", type);
} else {
print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
@@ -650,7 +650,7 @@ static void print_instr_cat6(instr_t *instr)
/* size of largest OPC field of all the instruction categories: */
#define NOPC_BITS 6
-struct opc_info {
+static const struct opc_info {
uint16_t cat;
uint16_t opc;
const char *name;
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index c3fb68d51..1b1f1f0a7 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -261,6 +261,7 @@ typedef union PACKED {
/* to make compiler happy: */
uint32_t dummy32;
uint32_t dummy10 : 10;
+ int32_t idummy10 : 10;
uint32_t dummy11 : 11;
uint32_t dummy12 : 12;
uint32_t dummy13 : 13;
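
The new idummy10 member exists because relative register offsets packed through this union can be negative. A minimal sketch (mine, not part of the diff) of why the signed 10-bit view is needed:

/* Not from the diff: a negative relative offset such as a0.x - 2 must
 * sign-extend when read back out of the 10-bit field, which the
 * unsigned dummy10 view cannot do.
 */
#include <assert.h>
#include <stdint.h>

union pack10 {
	uint32_t dummy10  : 10;
	int32_t  idummy10 : 10;
};

int main(void)
{
	union pack10 p;
	p.idummy10 = -2;               /* stored as 0x3fe in 10 bits */
	assert(p.idummy10 == -2);      /* signed view sign-extends */
	assert(p.dummy10 == 0x3fe);    /* unsigned view reads 1022 */
	return 0;
}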
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c
index b24825cff..7d89142d7 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -81,6 +81,7 @@ struct ir3 * ir3_create(struct ir3_compiler *compiler,
shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
list_inithead(&shader->block_list);
+ list_inithead(&shader->array_list);
return shader;
}
@@ -121,18 +122,19 @@ static uint32_t reg(struct ir3_register *reg, struct ir3_info *info,
val.iim_val = reg->iim_val;
} else {
unsigned components;
+ int16_t max;
if (reg->flags & IR3_REG_RELATIV) {
components = reg->size;
- val.dummy10 = reg->offset;
+ val.idummy10 = reg->array.offset;
+ max = (reg->array.offset + repeat + components - 1) >> 2;
} else {
components = util_last_bit(reg->wrmask);
val.comp = reg->num & 0x3;
val.num = reg->num >> 2;
+ max = (reg->num + repeat + components - 1) >> 2;
}
- int16_t max = (reg->num + repeat + components - 1) >> 2;
-
if (reg->flags & IR3_REG_CONST) {
info->max_const = MAX2(info->max_const, max);
} else if (val.num == 63) {
@@ -233,7 +235,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
iassert((instr->regs_count == 2) || (instr->regs_count == 3));
if (src1->flags & IR3_REG_RELATIV) {
- iassert(src1->num < (1 << 10));
+ iassert(src1->array.offset < (1 << 10));
cat2->rel1.src1 = reg(src1, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@@ -260,7 +262,7 @@ static int emit_cat2(struct ir3_instruction *instr, void *ptr,
!((src1->flags ^ src2->flags) & IR3_REG_HALF));
if (src2->flags & IR3_REG_RELATIV) {
- iassert(src2->num < (1 << 10));
+ iassert(src2->array.offset < (1 << 10));
cat2->rel2.src2 = reg(src2, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@@ -333,7 +335,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
iassert(!((src3->flags ^ src_flags) & IR3_REG_HALF));
if (src1->flags & IR3_REG_RELATIV) {
- iassert(src1->num < (1 << 10));
+ iassert(src1->array.offset < (1 << 10));
cat3->rel1.src1 = reg(src1, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@@ -361,7 +363,7 @@ static int emit_cat3(struct ir3_instruction *instr, void *ptr,
if (src3->flags & IR3_REG_RELATIV) {
- iassert(src3->num < (1 << 10));
+ iassert(src3->array.offset < (1 << 10));
cat3->rel2.src3 = reg(src3, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_R |
IR3_REG_HALF | absneg);
@@ -404,7 +406,7 @@ static int emit_cat4(struct ir3_instruction *instr, void *ptr,
iassert(instr->regs_count == 2);
if (src->flags & IR3_REG_RELATIV) {
- iassert(src->num < (1 << 10));
+ iassert(src->array.offset < (1 << 10));
cat4->rel.src = reg(src, info, instr->repeat,
IR3_REG_RELATIV | IR3_REG_CONST | IR3_REG_FNEG |
IR3_REG_FABS | IR3_REG_R | IR3_REG_HALF);
@@ -737,6 +739,14 @@ struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
return reg;
}
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+ struct ir3_register *reg)
+{
+ struct ir3_register *new_reg = reg_create(shader, 0, 0);
+ *new_reg = *reg;
+ return new_reg;
+}
+
void
ir3_instr_set_address(struct ir3_instruction *instr,
struct ir3_instruction *addr)
@@ -777,3 +787,12 @@ ir3_count_instructions(struct ir3 *ir)
}
return cnt;
}
+
+struct ir3_array *
+ir3_lookup_array(struct ir3 *ir, unsigned id)
+{
+ list_for_each_entry (struct ir3_array, arr, &ir->array_list, node)
+ if (arr->id == id)
+ return arr;
+ return NULL;
+}
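
Since shaders declare at most a handful of arrays, a linear walk of array_list is cheap here. A hedged usage sketch; the names ir and reg are assumed, not from the diff:

/* Resolve the ir3_array behind a register carrying IR3_REG_ARRAY. */
if (reg->flags & IR3_REG_ARRAY) {
	struct ir3_array *arr = ir3_lookup_array(ir, reg->array.id);
	assert(arr);                              /* id assigned in declare_var() */
	assert(reg->array.offset < (int)arr->length);
}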
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h
index 12f2ebe18..1a109d880 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -83,7 +83,8 @@ struct ir3_register {
* before register assignment is done:
*/
IR3_REG_SSA = 0x2000, /* 'instr' is ptr to assigning instr */
- IR3_REG_PHI_SRC= 0x4000, /* phi src, regs[0]->instr points to phi */
+ IR3_REG_ARRAY = 0x4000,
+ IR3_REG_PHI_SRC= 0x8000, /* phi src, regs[0]->instr points to phi */
} flags;
union {
@@ -97,11 +98,18 @@ struct ir3_register {
uint32_t uim_val;
float fim_val;
/* relative: */
- int offset;
+ struct {
+ uint16_t id;
+ int16_t offset;
+ } array;
};
- /* for IR3_REG_SSA, src registers contain ptr back to
- * assigning instruction.
+ /* For IR3_REG_SSA, src registers contain ptr back to assigning
+ * instruction.
+ *
+ * For IR3_REG_ARRAY, the pointer is back to the last dependent
+ * array access (although the net effect is the same, it points
+ * back to a previous instruction that we depend on).
*/
struct ir3_instruction *instr;
@@ -177,6 +185,7 @@ struct ir3_instruction {
* before register assignment is done:
*/
IR3_INSTR_MARK = 0x1000,
+ IR3_INSTR_UNUSED= 0x2000,
} flags;
int repeat;
#ifdef DEBUG
@@ -221,9 +230,6 @@ struct ir3_instruction {
int off; /* component/offset */
} fo;
struct {
- int aid;
- } fi;
- struct {
/* used to temporarily hold reference to nir_phi_instr
* until we resolve the phi srcs
*/
@@ -243,11 +249,7 @@ struct ir3_instruction {
* result of moving a const to a reg would have a low cost, so to
* it could make sense to duplicate the instruction at various
* points where the result is needed to reduce register footprint.
- *
- * DEPTH_UNUSED used to mark unused instructions after depth
- * calculation pass.
*/
-#define DEPTH_UNUSED ~0
unsigned depth;
/* When we get to the RA stage, we no longer need depth, but
* we do need instruction's position/name:
@@ -258,6 +260,10 @@ struct ir3_instruction {
};
};
+ /* used for per-pass extra instruction data.
+ */
+ void *data;
+
/* Used during CP and RA stages. For fanin and shader inputs/
* outputs where we need a sequence of consecutive registers,
* keep track of each src instructions left (ie 'n-1') and right
@@ -292,19 +298,6 @@ struct ir3_instruction {
*/
struct ir3_instruction *address;
- /* in case of a instruction with relative dst instruction, we need to
- * capture the dependency on the fanin for the previous values of
- * the array elements. Since we don't know at compile time actually
- * which array elements are written, this serves to preserve the
- * unconditional write to array elements prior to the conditional
- * write.
- *
- * TODO only cat1 can do indirect write.. we could maybe move this
- * into instr->cat1.fanin (but would require the frontend to insert
- * the extra mov)
- */
- struct ir3_instruction *fanin;
-
/* Entry in ir3_block's instruction list: */
struct list_head node;
@@ -378,10 +371,41 @@ struct ir3 {
/* List of blocks: */
struct list_head block_list;
+ /* List of ir3_array's: */
+ struct list_head array_list;
+
unsigned heap_idx;
struct ir3_heap_chunk *chunk;
};
+typedef struct nir_variable nir_variable;
+
+struct ir3_array {
+ struct list_head node;
+ unsigned length;
+ unsigned id;
+
+ nir_variable *var;
+
+ /* We track the last write and last access (read or write) to
+ * setup dependencies on instructions that read or write the
+ * array. Reads can be re-ordered wrt. other reads, but should
+ * not be re-ordered wrt. to writes. Writes cannot be reordered
+ * wrt. any other access to the array.
+ *
+ * So array reads depend on last write, and array writes depend
+ * on the last access.
+ */
+ struct ir3_instruction *last_write, *last_access;
+
+ /* extra stuff used in RA pass: */
+ unsigned base; /* base vreg name */
+ unsigned reg; /* base physical reg */
+ uint16_t start_ip, end_ip;
+};
+
+struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
+
typedef struct nir_block nir_block;
struct ir3_block {
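
To make the last_write/last_access comment in struct ir3_array concrete, an illustration (mine, not from the diff) of the constraints for one array in program order:

/* Ordering illustration, accesses to one array in program order:
 *
 *   st a[i]    W1
 *   ld a[j]    R1: src->instr = W1 (last_write)
 *   ld a[k]    R2: src->instr = W1; R1 and R2 may reorder freely
 *   st a[l]    W2: dst->instr = R2 (last_access), so W2 stays below
 *              both reads and, transitively, below W1
 */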
@@ -404,7 +428,7 @@ struct ir3_block {
/* used for per-pass extra block data. Mainly used right
* now in RA step to track livein/liveout.
*/
- void *bd;
+ void *data;
#ifdef DEBUG
uint32_t serialno;
@@ -429,6 +453,8 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
int num, int flags);
+struct ir3_register * ir3_reg_clone(struct ir3 *shader,
+ struct ir3_register *reg);
void ir3_instr_set_address(struct ir3_instruction *instr,
struct ir3_instruction *addr);
@@ -509,6 +535,9 @@ static inline bool is_same_type_mov(struct ir3_instruction *instr)
if (dst->num == regid(REG_A0, 0))
return false;
+ if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
+ return false;
+
if ((instr->category == 1) &&
(instr->cat1.src_type == instr->cat1.dst_type))
return true;
@@ -622,8 +651,10 @@ static inline bool writes_pred(struct ir3_instruction *instr)
/* TODO better name */
static inline struct ir3_instruction *ssa(struct ir3_register *reg)
{
- if (reg->flags & IR3_REG_SSA)
+ if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
+ debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED)));
return reg->instr;
+ }
return NULL;
}
@@ -812,8 +843,6 @@ static inline unsigned ir3_cat3_absneg(opc_t opc)
static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
{
- if (instr->fanin)
- return instr->regs_count + 2;
if (instr->address)
return instr->regs_count + 1;
return instr->regs_count;
@@ -821,8 +850,6 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
{
- if (n == (instr->regs_count + 1))
- return instr->fanin;
if (n == (instr->regs_count + 0))
return instr->address;
return ssa(instr->regs[n]);
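
With the fanin slot gone, the helpers above expose exactly one extra virtual source: the address register. Note the iterator change below now starts at index 0, since an IR3_REG_ARRAY destination in regs[0] can itself carry a dependency. A hedged sketch of the resulting iteration; visit() is a hypothetical callback:

/* Indices 0..regs_count-1 cover the instruction's own registers
 * (ssa() returns NULL for plain non-SSA regs, which get skipped),
 * and index regs_count maps to the address register, if any.
 */
struct ir3_instruction *src;
unsigned cnt = __ssa_src_cnt(instr);   /* regs_count, +1 if address */
for (unsigned n = 0; n < cnt; n++)
	if ((src = __ssa_src_n(instr, n)))
		visit(src);                    /* hypothetical callback */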
@@ -833,8 +860,8 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
/* iterator for an instruction's SSA sources (instr), also returns src #: */
#define foreach_ssa_src_n(__srcinst, __n, __instr) \
if ((__instr)->regs_count) \
- for (unsigned __cnt = __ssa_src_cnt(__instr) - 1, __n = 0; __n < __cnt; __n++) \
- if ((__srcinst = __ssa_src_n(__instr, __n + 1)))
+ for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
+ if ((__srcinst = __ssa_src_n(__instr, __n)))
/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr) \
@@ -877,7 +904,15 @@ ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
struct ir3_instruction *instr =
ir3_instr_create(block, 1, 0);
ir3_reg_create(instr, 0, 0); /* dst */
- ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ if (src->regs[0]->flags & IR3_REG_ARRAY) {
+ struct ir3_register *src_reg =
+ ir3_reg_create(instr, 0, IR3_REG_ARRAY);
+ src_reg->array = src->regs[0]->array;
+ src_reg->instr = src;
+ } else {
+ ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+ }
+ debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
instr->cat1.src_type = type;
instr->cat1.dst_type = type;
return instr;
@@ -893,6 +928,7 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
instr->cat1.src_type = src_type;
instr->cat1.dst_type = dst_type;
+ debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
return instr;
}
@@ -1082,7 +1118,7 @@ typedef uint8_t regmask_t[2 * MAX_REG / 8];
static inline unsigned regmask_idx(struct ir3_register *reg)
{
- unsigned num = reg->num;
+ unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
debug_assert(num < MAX_REG);
if (reg->flags & IR3_REG_HALF)
num += MAX_REG;
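
A worked example of the indexing (values assumed): full and half registers land in disjoint halves of the 2*MAX_REG-bit set, and relative registers are now keyed by their array offset rather than a meaningless reg->num:

/* Assumed: regid(2, 2) == 10, i.e. scalar register r2.z. */
struct ir3_register full = { .num = regid(2, 2) };            /* r2.z  */
struct ir3_register half = { .num = regid(2, 2),
                             .flags = IR3_REG_HALF };         /* hr2.z */
assert(regmask_idx(&full) == 10);
assert(regmask_idx(&half) == MAX_REG + 10);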
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index ede29f445..481859efb 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -40,6 +40,7 @@
#include "freedreno_util.h"
#include "ir3_compiler.h"
+#include "ir3_nir.h"
#include "instr-a3xx.h"
#include "ir3.h"
@@ -94,6 +95,8 @@ static void print_usage(void)
printf(" --saturate-t MASK - bitmask of samplers to saturate T coord\n");
printf(" --saturate-r MASK - bitmask of samplers to saturate R coord\n");
printf(" --stream-out - enable stream-out (aka transform feedback)\n");
+ printf(" --ucp MASK - bitmask of enabled user-clip-planes\n");
+ printf(" --gpu GPU_ID - specify gpu-id (default 320)\n");
printf(" --help - show this message\n");
}
@@ -103,16 +106,15 @@ int main(int argc, char **argv)
const char *filename;
struct tgsi_token toks[65536];
struct tgsi_parse_context parse;
- struct ir3_compiler *compiler;
struct ir3_shader_variant v;
struct ir3_shader s;
struct ir3_shader_key key = {};
+ /* TODO cmdline option to target different gpus: */
+ unsigned gpu_id = 320;
const char *info;
void *ptr;
size_t size;
- fd_mesa_debug |= FD_DBG_DISASM;
-
memset(&s, 0, sizeof(s));
memset(&v, 0, sizeof(v));
@@ -125,7 +127,7 @@ int main(int argc, char **argv)
while (n < argc) {
if (!strcmp(argv[n], "--verbose")) {
- fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS;
+ fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS | FD_DBG_DISASM;
n++;
continue;
}
@@ -190,6 +192,20 @@ int main(int argc, char **argv)
continue;
}
+ if (!strcmp(argv[n], "--ucp")) {
+ debug_printf(" %s %s", argv[n], argv[n+1]);
+ key.ucp_enables = strtol(argv[n+1], NULL, 0);
+ n += 2;
+ continue;
+ }
+
+ if (!strcmp(argv[n], "--gpu")) {
+ debug_printf(" %s %s", argv[n], argv[n+1]);
+ gpu_id = strtol(argv[n+1], NULL, 0);
+ n += 2;
+ continue;
+ }
+
if (!strcmp(argv[n], "--help")) {
print_usage();
return 0;
@@ -213,7 +229,12 @@ int main(int argc, char **argv)
if (!tgsi_text_translate(ptr, toks, Elements(toks)))
errx(1, "could not parse `%s'", filename);
- s.tokens = toks;
+ if (fd_mesa_debug & FD_DBG_OPTMSGS)
+ tgsi_dump(toks, 0);
+
+ nir_shader *nir = ir3_tgsi_to_nir(toks);
+ s.compiler = ir3_compiler_create(gpu_id);
+ s.nir = ir3_optimize_nir(&s, nir, NULL);
v.key = key;
v.shader = &s;
@@ -231,11 +252,8 @@ int main(int argc, char **argv)
break;
}
- /* TODO cmdline option to target different gpus: */
- compiler = ir3_compiler_create(320);
-
info = "NIR compiler";
- ret = ir3_compile_shader_nir(compiler, &v);
+ ret = ir3_compile_shader_nir(s.compiler, &v);
if (ret) {
fprintf(stderr, "compiler failed!\n");
return ret;
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 13c395f3c..7a1812f25 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -32,11 +32,6 @@
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_strings.h"
-
-#include "nir/tgsi_to_nir.h"
-#include "glsl/shader_enums.h"
#include "freedreno_util.h"
@@ -51,7 +46,6 @@
struct ir3_compile {
struct ir3_compiler *compiler;
- const struct tgsi_token *tokens;
struct nir_shader *s;
struct ir3 *ir;
@@ -80,8 +74,6 @@ struct ir3_compile {
/* mapping from nir_register to defining instruction: */
struct hash_table *def_ht;
- /* mapping from nir_variable to ir3_array: */
- struct hash_table *var_ht;
unsigned num_arrays;
/* a common pattern for indirect addressing is to request the
@@ -97,9 +89,6 @@ struct ir3_compile {
*/
struct hash_table *block_ht;
- /* for calculating input/output positions/linkages: */
- unsigned next_inloc;
-
/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
* so we need to use ldlv.u32 to load the varying directly:
*/
@@ -127,101 +116,12 @@ struct ir3_compile {
static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
-static struct nir_shader *to_nir(const struct tgsi_token *tokens)
-{
- struct nir_shader_compiler_options options = {
- .lower_fpow = true,
- .lower_fsat = true,
- .lower_scmp = true,
- .lower_flrp = true,
- .native_integers = true,
- };
- bool progress;
-
- struct nir_shader *s = tgsi_to_nir(tokens, &options);
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- debug_printf("----------------------\n");
- nir_print_shader(s, stdout);
- debug_printf("----------------------\n");
- }
-
- nir_opt_global_to_local(s);
- nir_convert_to_ssa(s);
- nir_lower_idiv(s);
- nir_lower_load_const_to_scalar(s);
-
- do {
- progress = false;
-
- nir_lower_vars_to_ssa(s);
- nir_lower_alu_to_scalar(s);
- nir_lower_phis_to_scalar(s);
-
- progress |= nir_copy_prop(s);
- progress |= nir_opt_dce(s);
- progress |= nir_opt_cse(s);
- progress |= ir3_nir_lower_if_else(s);
- progress |= nir_opt_algebraic(s);
- progress |= nir_opt_constant_folding(s);
-
- } while (progress);
-
- nir_remove_dead_variables(s);
- nir_validate_shader(s);
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- debug_printf("----------------------\n");
- nir_print_shader(s, stdout);
- debug_printf("----------------------\n");
- }
-
- return s;
-}
-
-/* TODO nir doesn't lower everything for us yet, but ideally it would: */
-static const struct tgsi_token *
-lower_tgsi(struct ir3_compile *ctx, const struct tgsi_token *tokens,
- struct ir3_shader_variant *so)
-{
- struct tgsi_shader_info info;
- struct tgsi_lowering_config lconfig = {
- .color_two_side = so->key.color_two_side,
- .lower_FRC = true,
- };
-
- switch (so->type) {
- case SHADER_FRAGMENT:
- case SHADER_COMPUTE:
- lconfig.saturate_s = so->key.fsaturate_s;
- lconfig.saturate_t = so->key.fsaturate_t;
- lconfig.saturate_r = so->key.fsaturate_r;
- break;
- case SHADER_VERTEX:
- lconfig.saturate_s = so->key.vsaturate_s;
- lconfig.saturate_t = so->key.vsaturate_t;
- lconfig.saturate_r = so->key.vsaturate_r;
- break;
- }
-
- if (ctx->compiler->gpu_id >= 400) {
- /* a4xx seems to have *no* sam.p */
- lconfig.lower_TXP = ~0; /* lower all txp */
- } else {
- /* a3xx just needs to avoid sam.p for 3d tex */
- lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
- }
-
- return tgsi_transform_lowering(&lconfig, tokens, &info);
-}
static struct ir3_compile *
compile_init(struct ir3_compiler *compiler,
- struct ir3_shader_variant *so,
- const struct tgsi_token *tokens)
+ struct ir3_shader_variant *so)
{
struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
- const struct tgsi_token *lowered_tokens;
if (compiler->gpu_id >= 400) {
/* need special handling for "flat" */
@@ -238,23 +138,33 @@ compile_init(struct ir3_compiler *compiler,
ctx->compiler = compiler;
ctx->ir = so->ir;
ctx->so = so;
- ctx->next_inloc = 8;
ctx->def_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
- ctx->var_ht = _mesa_hash_table_create(ctx,
- _mesa_hash_pointer, _mesa_key_pointer_equal);
- ctx->addr_ht = _mesa_hash_table_create(ctx,
- _mesa_hash_pointer, _mesa_key_pointer_equal);
ctx->block_ht = _mesa_hash_table_create(ctx,
_mesa_hash_pointer, _mesa_key_pointer_equal);
- lowered_tokens = lower_tgsi(ctx, tokens, so);
- if (!lowered_tokens)
- lowered_tokens = tokens;
- ctx->s = to_nir(lowered_tokens);
+ /* TODO: maybe generate some sort of bitmask of what key
+ * lowers vs what shader has (ie. no need to lower
+ * texture clamp lowering if no texture sample instrs)..
+ * although should be done further up the stack to avoid
+ * creating duplicate variants..
+ */
+
+ if (ir3_key_lowers_nir(&so->key)) {
+ nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
+ ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
+ } else {
+ /* fast-path for shader key that lowers nothing in NIR: */
+ ctx->s = so->shader->nir;
+ }
- if (lowered_tokens != tokens)
- free((void *)lowered_tokens);
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
+ so->shader->id, so->id, so->type,
+ so->key.binning_pass, so->key.color_two_side,
+ so->key.half_precision);
+ nir_print_shader(ctx->s, stdout);
+ }
so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
@@ -263,7 +173,7 @@ compile_init(struct ir3_compiler *compiler,
* num_uniform * vec4 - user consts
* 4 * vec4 - UBO addresses
* if (vertex shader) {
- * 1 * vec4 - driver params (IR3_DP_*)
+ * N * vec4 - driver params (IR3_DP_*)
* 1 * vec4 - stream-out addresses
* }
*
@@ -275,8 +185,8 @@ compile_init(struct ir3_compiler *compiler,
so->first_immediate += 4;
if (so->type == SHADER_VERTEX) {
- /* one (vec4) slot for driver params (see ir3_driver_param): */
- so->first_immediate++;
+ /* driver params (see ir3_driver_param): */
+ so->first_immediate += IR3_DP_COUNT/4; /* convert to vec4 */
/* one (vec4) slot for stream-output base addresses: */
so->first_immediate++;
}
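
A worked instance of the layout described above (all numbers hypothetical, in vec4 units):

/* Hypothetical vertex shader with num_uniforms == 8:
 *
 *   c0  .. c7                      user consts
 *   c8  .. c11                     UBO addresses (4 vec4)
 *   c12 .. c12+IR3_DP_COUNT/4-1    driver params
 *   next vec4                      stream-out base addresses
 *   then                           immediates
 *
 * first_driver_param lands at 8 and first_immediate is bumped past
 * each block, matching the increments in compile_init().
 */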
@@ -306,206 +216,26 @@ compile_free(struct ir3_compile *ctx)
ralloc_free(ctx);
}
-/* global per-array information: */
-struct ir3_array {
- unsigned length, aid;
-};
-
-/* per-block array state: */
-struct ir3_array_value {
- /* TODO drop length/aid, and just have ptr back to ir3_array */
- unsigned length, aid;
- /* initial array element values are phi's, other than for the
- * entry block. The phi src's get added later in a resolve step
- * after we have visited all the blocks, to account for back
- * edges in the cfg.
- */
- struct ir3_instruction **phis;
- /* current array element values (as block is processed). When
- * the array phi's are resolved, it will contain the array state
- * at exit of block, so successor blocks can use it to add their
- * phi srcs.
- */
- struct ir3_instruction *arr[];
-};
-
-/* track array assignments per basic block. When an array is read
- * outside of the same basic block, we can use NIR's dominance-frontier
- * information to figure out where phi nodes are needed.
- */
-struct ir3_nir_block_data {
- unsigned foo;
- /* indexed by array-id (aid): */
- struct ir3_array_value *arrs[];
-};
-
-static struct ir3_nir_block_data *
-get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
-{
- if (!block->bd) {
- struct ir3_nir_block_data *bd = ralloc_size(ctx, sizeof(*bd) +
- ((ctx->num_arrays + 1) * sizeof(bd->arrs[0])));
- block->bd = bd;
- }
- return block->bd;
-}
-
static void
declare_var(struct ir3_compile *ctx, nir_variable *var)
{
unsigned length = glsl_get_length(var->type) * 4; /* always vec4, at least with ttn */
struct ir3_array *arr = ralloc(ctx, struct ir3_array);
+ arr->id = ++ctx->num_arrays;
arr->length = length;
- arr->aid = ++ctx->num_arrays;
- _mesa_hash_table_insert(ctx->var_ht, var, arr);
-}
-
-static nir_block *
-nir_block_pred(nir_block *block)
-{
- assert(block->predecessors->entries < 2);
- if (block->predecessors->entries == 0)
- return NULL;
- return (nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+ arr->var = var;
+ list_addtail(&arr->node, &ctx->ir->array_list);
}
-static struct ir3_array_value *
+static struct ir3_array *
get_var(struct ir3_compile *ctx, nir_variable *var)
{
- struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
- struct ir3_block *block = ctx->block;
- struct ir3_nir_block_data *bd = get_block_data(ctx, block);
- struct ir3_array *arr = entry->data;
-
- if (!bd->arrs[arr->aid]) {
- struct ir3_array_value *av = ralloc_size(bd, sizeof(*av) +
- (arr->length * sizeof(av->arr[0])));
- struct ir3_array_value *defn = NULL;
- nir_block *pred_block;
-
- av->length = arr->length;
- av->aid = arr->aid;
-
- /* For loops, we have to consider that we have not visited some
- * of the blocks who should feed into the phi (ie. back-edges in
- * the cfg).. for example:
- *
- * loop {
- * block { load_var; ... }
- * if then block {} else block {}
- * block { store_var; ... }
- * if then block {} else block {}
- * block {...}
- * }
- *
- * We can skip the phi if we can chase the block predecessors
- * until finding the block previously defining the array without
- * crossing a block that has more than one predecessor.
- *
- * Otherwise create phi's and resolve them as a post-pass after
- * all the blocks have been visited (to handle back-edges).
- */
-
- for (pred_block = block->nblock;
- pred_block && (pred_block->predecessors->entries < 2) && !defn;
- pred_block = nir_block_pred(pred_block)) {
- struct ir3_block *pblock = get_block(ctx, pred_block);
- struct ir3_nir_block_data *pbd = pblock->bd;
- if (!pbd)
- continue;
- defn = pbd->arrs[arr->aid];
- }
-
- if (defn) {
- /* only one possible definer: */
- for (unsigned i = 0; i < arr->length; i++)
- av->arr[i] = defn->arr[i];
- } else if (pred_block) {
- /* not the first block, and multiple potential definers: */
- av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
-
- for (unsigned i = 0; i < arr->length; i++) {
- struct ir3_instruction *phi;
-
- phi = ir3_instr_create2(block, -1, OPC_META_PHI,
- 1 + ctx->impl->num_blocks);
- ir3_reg_create(phi, 0, 0); /* dst */
-
- /* phi's should go at head of block: */
- list_delinit(&phi->node);
- list_add(&phi->node, &block->instr_list);
-
- av->phis[i] = av->arr[i] = phi;
- }
- } else {
- /* Some shaders end up reading array elements without
- * first writing.. so initialize things to prevent null
- * instr ptrs later:
- */
- for (unsigned i = 0; i < arr->length; i++)
- av->arr[i] = create_immed(block, 0);
- }
-
- bd->arrs[arr->aid] = av;
- }
-
- return bd->arrs[arr->aid];
-}
-
-static void
-add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
- struct ir3_array_value *av, BITSET_WORD *visited)
-{
- struct ir3_block *block;
- struct ir3_nir_block_data *bd;
-
- if (BITSET_TEST(visited, nblock->index))
- return;
-
- BITSET_SET(visited, nblock->index);
-
- block = get_block(ctx, nblock);
- bd = block->bd;
-
- if (bd && bd->arrs[av->aid]) {
- struct ir3_array_value *dav = bd->arrs[av->aid];
- for (unsigned i = 0; i < av->length; i++) {
- ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
- dav->arr[i];
- }
- } else {
- /* didn't find defn, recurse predecessors: */
- struct set_entry *entry;
- set_foreach(nblock->predecessors, entry) {
- add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
- }
- }
-}
-
-static void
-resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
-{
- struct ir3_nir_block_data *bd = block->bd;
- unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
-
- if (!bd)
- return;
-
- /* TODO use nir dom_frontier to help us with this? */
-
- for (unsigned i = 1; i <= ctx->num_arrays; i++) {
- struct ir3_array_value *av = bd->arrs[i];
- BITSET_WORD visited[bitset_words];
- struct set_entry *entry;
-
- if (!(av && av->phis))
- continue;
-
- memset(visited, 0, sizeof(visited));
- set_foreach(block->nblock->predecessors, entry) {
- add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
- }
+ list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+ if (arr->var == var)
+ return arr;
}
+ compile_error(ctx, "bogus var: %s\n", var->name);
+ return NULL;
}
/* allocate a n element value array (to be populated by caller) and
@@ -523,6 +253,7 @@ __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
static struct ir3_instruction **
get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
{
+ compile_assert(ctx, dst->is_ssa);
if (dst->is_ssa) {
return __get_dst(ctx, &dst->ssa, n);
} else {
@@ -540,6 +271,7 @@ static struct ir3_instruction **
get_src(struct ir3_compile *ctx, nir_src *src)
{
struct hash_entry *entry;
+ compile_assert(ctx, src->is_ssa);
if (src->is_ssa) {
entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
} else {
@@ -596,12 +328,17 @@ static struct ir3_instruction *
get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
{
struct ir3_instruction *addr;
- struct hash_entry *entry;
- entry = _mesa_hash_table_search(ctx->addr_ht, src);
- if (entry)
- return entry->data;
- /* TODO do we need to cache per block? */
+ if (!ctx->addr_ht) {
+ ctx->addr_ht = _mesa_hash_table_create(ctx,
+ _mesa_hash_pointer, _mesa_key_pointer_equal);
+ } else {
+ struct hash_entry *entry;
+ entry = _mesa_hash_table_search(ctx->addr_ht, src);
+ if (entry)
+ return entry->data;
+ }
+
addr = create_addr(ctx->block, src);
_mesa_hash_table_insert(ctx->addr_ht, src, addr);
@@ -640,7 +377,7 @@ create_uniform(struct ir3_compile *ctx, unsigned n)
}
static struct ir3_instruction *
-create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
+create_uniform_indirect(struct ir3_compile *ctx, int n,
struct ir3_instruction *address)
{
struct ir3_instruction *mov;
@@ -649,7 +386,7 @@ create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
ir3_reg_create(mov, 0, 0);
- ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
+ ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
ir3_instr_set_address(mov, address);
@@ -674,7 +411,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
}
static struct ir3_instruction *
-create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
+create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
struct ir3_instruction *address, struct ir3_instruction *collect)
{
struct ir3_block *block = ctx->block;
@@ -688,17 +425,45 @@ create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
src->instr = collect;
src->size = arrsz;
- src->offset = n;
+ src->array.offset = n;
ir3_instr_set_address(mov, address);
return mov;
}
+/* relative (indirect) if address!=NULL */
static struct ir3_instruction *
-create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
- struct ir3_instruction *src, struct ir3_instruction *address,
- struct ir3_instruction *collect)
+create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+ struct ir3_instruction *address)
+{
+ struct ir3_block *block = ctx->block;
+ struct ir3_instruction *mov;
+ struct ir3_register *src;
+
+ mov = ir3_instr_create(block, 1, 0);
+ mov->cat1.src_type = TYPE_U32;
+ mov->cat1.dst_type = TYPE_U32;
+ ir3_reg_create(mov, 0, 0);
+ src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+ COND(address, IR3_REG_RELATIV));
+ src->instr = arr->last_write;
+ src->size = arr->length;
+ src->array.id = arr->id;
+ src->array.offset = n;
+
+ if (address)
+ ir3_instr_set_address(mov, address);
+
+ arr->last_access = mov;
+
+ return mov;
+}
+
+/* relative (indirect) if address!=NULL */
+static struct ir3_instruction *
+create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
+ struct ir3_instruction *src, struct ir3_instruction *address)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *mov;
@@ -707,14 +472,18 @@ create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
mov = ir3_instr_create(block, 1, 0);
mov->cat1.src_type = TYPE_U32;
mov->cat1.dst_type = TYPE_U32;
- dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
- dst->size = arrsz;
- dst->offset = n;
+ dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
+ COND(address, IR3_REG_RELATIV));
+ dst->instr = arr->last_access;
+ dst->size = arr->length;
+ dst->array.id = arr->id;
+ dst->array.offset = n;
ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
- mov->fanin = collect;
ir3_instr_set_address(mov, address);
+ arr->last_write = arr->last_access = mov;
+
return mov;
}
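
A hedged walk-through of the dependency chain these two helpers build for a store followed by a load of the same array, with indirect element indices:

/* For "a[i] = x; y = a[j];" with indirect i and j:
 *
 *   st = create_var_store(...):
 *       dst->instr = arr->last_access;   orders st after prior reads
 *       arr->last_write = arr->last_access = st;
 *   ld = create_var_load(...):
 *       src->instr = arr->last_write;    == st, read-after-write
 *       arr->last_access = ld;
 *
 * A later store would point its dst at ld, preserving write-after-read
 * ordering even though the touched element is unknown at compile time.
 */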
@@ -731,11 +500,12 @@ create_input(struct ir3_block *block, unsigned n)
}
static struct ir3_instruction *
-create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
+create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
{
struct ir3_block *block = ctx->block;
struct ir3_instruction *instr;
- struct ir3_instruction *inloc = create_immed(block, n);
+ /* actual inloc is assigned and fixed up later: */
+ struct ir3_instruction *inloc = create_immed(block, 0);
if (use_ldlv) {
instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
@@ -786,6 +556,10 @@ create_frag_coord(struct ir3_compile *ctx, unsigned comp)
}
}
+/* NOTE: this creates the "TGSI" style fragface (ie. input slot
+ * VARYING_SLOT_FACE). For NIR style nir_intrinsic_load_front_face
+ * we can just use the value from hw directly (since it is boolean)
+ */
static struct ir3_instruction *
create_frag_face(struct ir3_compile *ctx, unsigned comp)
{
@@ -828,7 +602,9 @@ static struct ir3_instruction *
create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
{
/* first four vec4 sysval's reserved for UBOs: */
- unsigned r = regid(ctx->so->first_driver_param + 4, dp);
+ /* NOTE: dp is in scalar, but there can be >4 dp components: */
+ unsigned n = ctx->so->first_driver_param + IR3_DRIVER_PARAM_OFF;
+ unsigned r = regid(n + dp / 4, dp % 4);
return create_uniform(ctx, r);
}
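
A worked example of the scalar-to-vec4 math above; first_driver_param and the offset constant are assumed values:

/* Assuming first_driver_param == 8 and IR3_DRIVER_PARAM_OFF == 4
 * (the four UBO-address vec4s), a scalar dp of 6 yields
 *
 *   n = 8 + 4 = 12
 *   r = regid(12 + 6/4, 6%4) = regid(13, 2)   i.e. c13.z
 *
 * so driver params past the first vec4 (e.g. user clip planes)
 * spill naturally into the following const registers.
 */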
@@ -1184,6 +960,33 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
break;
+ case nir_op_bit_count:
+ dst[0] = ir3_CBITS_B(b, src[0], 0);
+ break;
+ case nir_op_ifind_msb: {
+ struct ir3_instruction *cmp;
+ dst[0] = ir3_CLZ_S(b, src[0], 0);
+ cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
+ cmp->cat2.condition = IR3_COND_GE;
+ dst[0] = ir3_SEL_B32(b,
+ ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+ cmp, 0, dst[0], 0);
+ break;
+ }
+ case nir_op_ufind_msb:
+ dst[0] = ir3_CLZ_B(b, src[0], 0);
+ dst[0] = ir3_SEL_B32(b,
+ ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
+ src[0], 0, dst[0], 0);
+ break;
+ case nir_op_find_lsb:
+ dst[0] = ir3_BFREV_B(b, src[0], 0);
+ dst[0] = ir3_CLZ_B(b, dst[0], 0);
+ break;
+ case nir_op_bitfield_reverse:
+ dst[0] = ir3_BFREV_B(b, src[0], 0);
+ break;
+
default:
compile_error(ctx, "Unhandled ALU op: %s\n",
nir_op_infos[alu->op].name);
@@ -1198,9 +1001,10 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
{
struct ir3_block *b = ctx->block;
struct ir3_instruction *addr, *src0, *src1;
+ nir_const_value *const_offset;
/* UBO addresses are the first driver params: */
- unsigned ubo = regid(ctx->so->first_driver_param, 0);
- unsigned off = intr->const_index[0];
+ unsigned ubo = regid(ctx->so->first_driver_param + IR3_UBOS_OFF, 0);
+ int off = 0;
/* First src is ubo index, which could either be an immed or not: */
src0 = get_src(ctx, &intr->src[0])[0];
@@ -1211,7 +1015,10 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
}
- if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) {
+ const_offset = nir_src_as_const_value(intr->src[1]);
+ if (const_offset) {
+ off += const_offset->u[0];
+ } else {
/* For load_ubo_indirect, second src is indirect offset: */
src1 = get_src(ctx, &intr->src[1])[0];
@@ -1240,12 +1047,12 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
/* handles array reads: */
static void
-emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
struct ir3_instruction **dst)
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array_value *arr = get_var(ctx, dvar->var);
+ struct ir3_array *arr = get_var(ctx, dvar->var);
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1256,19 +1063,17 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
for (int i = 0; i < intr->num_components; i++) {
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
- dst[i] = arr->arr[n];
+ dst[i] = create_var_load(ctx, arr, n, NULL);
}
break;
case nir_deref_array_type_indirect: {
/* for indirect, we need to collect all the array elements: */
- struct ir3_instruction *collect =
- create_collect(ctx->block, arr->arr, arr->length);
struct ir3_instruction *addr =
get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = darr->base_offset * 4 + i;
compile_assert(ctx, n < arr->length);
- dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
+ dst[i] = create_var_load(ctx, arr, n, addr);
}
break;
}
@@ -1281,12 +1086,13 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
/* handles array writes: */
static void
-emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
{
nir_deref_var *dvar = intr->variables[0];
nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
- struct ir3_array_value *arr = get_var(ctx, dvar->var);
- struct ir3_instruction **src;
+ struct ir3_array *arr = get_var(ctx, dvar->var);
+ struct ir3_instruction *addr, **src;
+ unsigned wrmask = nir_intrinsic_write_mask(intr);
compile_assert(ctx, dvar->deref.child &&
(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1295,71 +1101,38 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
switch (darr->deref_array_type) {
case nir_deref_array_type_direct:
- /* direct access does not require anything special: */
- for (int i = 0; i < intr->num_components; i++) {
- unsigned n = darr->base_offset * 4 + i;
- compile_assert(ctx, n < arr->length);
- arr->arr[n] = src[i];
- }
+ addr = NULL;
break;
- case nir_deref_array_type_indirect: {
- /* for indirect, create indirect-store and fan that out: */
- struct ir3_instruction *collect =
- create_collect(ctx->block, arr->arr, arr->length);
- struct ir3_instruction *addr =
- get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
- for (int i = 0; i < intr->num_components; i++) {
- struct ir3_instruction *store;
- unsigned n = darr->base_offset * 4 + i;
- compile_assert(ctx, n < arr->length);
-
- store = create_indirect_store(ctx, arr->length,
- n, src[i], addr, collect);
-
- store->fanin->fi.aid = arr->aid;
-
- /* TODO: probably split this out to be used for
- * store_output_indirect? or move this into
- * create_indirect_store()?
- */
- for (int j = i; j < arr->length; j += intr->num_components) {
- struct ir3_instruction *split;
-
- split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
- split->fo.off = j;
- ir3_reg_create(split, 0, 0);
- ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
-
- arr->arr[j] = split;
- }
- }
- /* fixup fanout/split neighbors: */
- for (int i = 0; i < arr->length; i++) {
- arr->arr[i]->cp.right = (i < (arr->length - 1)) ?
- arr->arr[i+1] : NULL;
- arr->arr[i]->cp.left = (i > 0) ?
- arr->arr[i-1] : NULL;
- }
+ case nir_deref_array_type_indirect:
+ addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
break;
- }
default:
compile_error(ctx, "Unhandled store deref type: %u\n",
darr->deref_array_type);
break;
}
+
+ for (int i = 0; i < intr->num_components; i++) {
+ if (!(wrmask & (1 << i)))
+ continue;
+ unsigned n = darr->base_offset * 4 + i;
+ compile_assert(ctx, n < arr->length);
+ create_var_store(ctx, arr, n, src[i], addr);
+ }
}
-static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
+static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
struct ir3_instruction *instr)
{
struct ir3_shader_variant *so = ctx->so;
unsigned r = regid(so->inputs_count, 0);
unsigned n = so->inputs_count++;
- so->inputs[n].semantic = ir3_semantic_name(name, 0);
+ so->inputs[n].sysval = true;
+ so->inputs[n].slot = slot;
so->inputs[n].compmask = 1;
so->inputs[n].regid = r;
- so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
+ so->inputs[n].interpolate = INTERP_QUALIFIER_FLAT;
so->total_in++;
ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
@@ -1367,12 +1140,13 @@ static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
}
static void
-emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
{
const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
struct ir3_instruction **dst, **src;
struct ir3_block *b = ctx->block;
- unsigned idx = intr->const_index[0];
+ nir_const_value *const_offset;
+ int idx;
if (info->has_dest) {
dst = get_dst(ctx, &intr->dest, intr->num_components);
@@ -1382,52 +1156,65 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
switch (intr->intrinsic) {
case nir_intrinsic_load_uniform:
- for (int i = 0; i < intr->num_components; i++) {
- unsigned n = idx * 4 + i;
- dst[i] = create_uniform(ctx, n);
+ idx = nir_intrinsic_base(intr);
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ if (const_offset) {
+ idx += const_offset->u[0];
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i;
+ dst[i] = create_uniform(ctx, n);
+ }
+ } else {
+ src = get_src(ctx, &intr->src[0]);
+ for (int i = 0; i < intr->num_components; i++) {
+ int n = idx * 4 + i;
+ dst[i] = create_uniform_indirect(ctx, n,
+ get_addr(ctx, src[0]));
+ }
+ /* NOTE: if relative addressing is used, we set
+ * constlen in the compiler (to worst-case value)
+ * since we don't know in the assembler what the max
+ * addr reg value can be:
+ */
+ ctx->so->constlen = ctx->s->num_uniforms;
}
break;
- case nir_intrinsic_load_uniform_indirect:
- src = get_src(ctx, &intr->src[0]);
- for (int i = 0; i < intr->num_components; i++) {
- unsigned n = idx * 4 + i;
- dst[i] = create_uniform_indirect(ctx, n,
- get_addr(ctx, src[0]));
- }
- /* NOTE: if relative addressing is used, we set constlen in
- * the compiler (to worst-case value) since we don't know in
- * the assembler what the max addr reg value can be:
- */
- ctx->so->constlen = ctx->s->num_uniforms;
- break;
case nir_intrinsic_load_ubo:
- case nir_intrinsic_load_ubo_indirect:
emit_intrinsic_load_ubo(ctx, intr, dst);
break;
case nir_intrinsic_load_input:
- for (int i = 0; i < intr->num_components; i++) {
- unsigned n = idx * 4 + i;
- dst[i] = ctx->ir->inputs[n];
- }
- break;
- case nir_intrinsic_load_input_indirect:
- src = get_src(ctx, &intr->src[0]);
- struct ir3_instruction *collect =
- create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
- struct ir3_instruction *addr = get_addr(ctx, src[0]);
- for (int i = 0; i < intr->num_components; i++) {
- unsigned n = idx * 4 + i;
- dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
- n, addr, collect);
+ idx = nir_intrinsic_base(intr);
+ const_offset = nir_src_as_const_value(intr->src[0]);
+ if (const_offset) {
+ idx += const_offset->u[0];
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i;
+ dst[i] = ctx->ir->inputs[n];
+ }
+ } else {
+ src = get_src(ctx, &intr->src[0]);
+ struct ir3_instruction *collect =
+ create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
+ struct ir3_instruction *addr = get_addr(ctx, src[0]);
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i;
+ dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+ n, addr, collect);
+ }
}
break;
case nir_intrinsic_load_var:
- emit_intrinisic_load_var(ctx, intr, dst);
+ emit_intrinsic_load_var(ctx, intr, dst);
break;
case nir_intrinsic_store_var:
- emit_intrinisic_store_var(ctx, intr);
+ emit_intrinsic_store_var(ctx, intr);
break;
case nir_intrinsic_store_output:
+ idx = nir_intrinsic_base(intr);
+ const_offset = nir_src_as_const_value(intr->src[1]);
+ compile_assert(ctx, const_offset != NULL);
+ idx += const_offset->u[0];
+
src = get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
@@ -1437,27 +1224,42 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
case nir_intrinsic_load_base_vertex:
if (!ctx->basevertex) {
ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
- add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
+ add_sysval_input(ctx, SYSTEM_VALUE_BASE_VERTEX,
ctx->basevertex);
}
dst[0] = ctx->basevertex;
break;
case nir_intrinsic_load_vertex_id_zero_base:
if (!ctx->vertex_id) {
- ctx->vertex_id = create_input(ctx->block, 0);
- add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
+ ctx->vertex_id = create_input(b, 0);
+ add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
ctx->vertex_id);
}
dst[0] = ctx->vertex_id;
break;
case nir_intrinsic_load_instance_id:
if (!ctx->instance_id) {
- ctx->instance_id = create_input(ctx->block, 0);
- add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
+ ctx->instance_id = create_input(b, 0);
+ add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
ctx->instance_id);
}
dst[0] = ctx->instance_id;
break;
+ case nir_intrinsic_load_user_clip_plane:
+ idx = nir_intrinsic_ucp_id(intr);
+ for (int i = 0; i < intr->num_components; i++) {
+ unsigned n = idx * 4 + i;
+ dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
+ }
+ break;
+ case nir_intrinsic_load_front_face:
+ if (!ctx->frag_face) {
+ ctx->so->frag_face = true;
+ ctx->frag_face = create_input(b, 0);
+ ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
+ }
+ dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+ break;
case nir_intrinsic_discard_if:
case nir_intrinsic_discard: {
struct ir3_instruction *cond, *kill;
@@ -1547,10 +1349,10 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
unreachable("bad sampler_dim");
}
- if (tex->is_shadow)
+ if (tex->is_shadow && tex->op != nir_texop_lod)
flags |= IR3_INSTR_S;
- if (tex->is_array)
+ if (tex->is_array && tex->op != nir_texop_lod)
flags |= IR3_INSTR_A;
*flagsp = flags;
@@ -1606,7 +1408,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
ddy = get_src(ctx, &tex->src[i].src);
break;
default:
- compile_error(ctx, "Unhandled NIR tex serc type: %d\n",
+ compile_error(ctx, "Unhandled NIR tex src type: %d\n",
tex->src[i].src_type);
return;
}
@@ -1618,11 +1420,13 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
case nir_texop_txl: opc = OPC_SAML; break;
case nir_texop_txd: opc = OPC_SAMGQ; break;
case nir_texop_txf: opc = OPC_ISAML; break;
+ case nir_texop_lod: opc = OPC_GETLOD; break;
case nir_texop_txf_ms:
case nir_texop_txs:
- case nir_texop_lod:
case nir_texop_tg4:
case nir_texop_query_levels:
+ case nir_texop_texture_samples:
+ case nir_texop_samples_identical:
compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
return;
}
@@ -1664,10 +1468,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
src0[nsrc0++] = create_immed(b, fui(0.5));
}
- if (tex->is_shadow)
+ if (tex->is_shadow && tex->op != nir_texop_lod)
src0[nsrc0++] = compare;
- if (tex->is_array)
+ if (tex->is_array && tex->op != nir_texop_lod)
src0[nsrc0++] = coord[coords];
if (has_proj) {
@@ -1716,7 +1520,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
case nir_type_int:
type = TYPE_S32;
break;
- case nir_type_unsigned:
+ case nir_type_uint:
case nir_type_bool:
type = TYPE_U32;
break;
@@ -1724,12 +1528,26 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
unreachable("bad dest_type");
}
+ if (opc == OPC_GETLOD)
+ type = TYPE_U32;
+
sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
- flags, tex->sampler_index, tex->sampler_index,
+ flags, tex->texture_index, tex->texture_index,
create_collect(b, src0, nsrc0),
create_collect(b, src1, nsrc1));
split_dest(b, dst, sam, 4);
+
+ /* GETLOD returns results in 4.8 fixed point */
+ if (opc == OPC_GETLOD) {
+ struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
+
+ compile_assert(ctx, tex->dest_type == nir_type_float);
+ for (i = 0; i < 2; i++) {
+ dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
+ factor, 0);
+ }
+ }
}
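
A quick arithmetic check of the 4.8 fixed-point conversion above:

/* A raw GETLOD result of 0x180 (384) encodes 384/256 = 1.5; converting
 * u32 -> f32 and multiplying by 1.0/256 reproduces exactly that, since
 * the low 8 bits hold the fraction and the high 4 the integer part.
 */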
static void
@@ -1741,7 +1559,7 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
dst = get_dst(ctx, &tex->dest, 1);
sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
- tex->sampler_index, tex->sampler_index, NULL, NULL);
+ tex->texture_index, tex->texture_index, NULL, NULL);
/* even though there is only one component, since it ends
* up in .z rather than .x, we need a split_dest()
@@ -1778,7 +1596,7 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
lod = get_src(ctx, &tex->src[0].src)[0];
sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
- tex->sampler_index, tex->sampler_index, lod, NULL);
+ tex->texture_index, tex->texture_index, lod, NULL);
split_dest(b, dst, sam, 4);
@@ -1840,8 +1658,6 @@ resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
}
}
-
- resolve_array_phis(ctx, block);
}
static void
@@ -1869,7 +1685,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
emit_alu(ctx, nir_instr_as_alu(instr));
break;
case nir_instr_type_intrinsic:
- emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
+ emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
break;
case nir_instr_type_load_const:
emit_load_const(ctx, nir_instr_as_load_const(instr));
@@ -1938,6 +1754,10 @@ emit_block(struct ir3_compile *ctx, nir_block *nblock)
ctx->block = block;
list_addtail(&block->node, &ctx->ir->block_list);
+ /* re-emit addr register in each block if needed: */
+ _mesa_hash_table_destroy(ctx->addr_ht, NULL);
+ ctx->addr_ht = NULL;
+
nir_foreach_instr(nblock, instr) {
emit_instr(ctx, instr);
if (ctx->error)
@@ -2020,7 +1840,7 @@ emit_stream_out(struct ir3_compile *ctx)
* of the shader:
*/
vtxcnt = create_input(ctx->in_block, 0);
- add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);
+ add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
@@ -2064,7 +1884,7 @@ emit_stream_out(struct ir3_compile *ctx)
unsigned stride = strmout->stride[i];
struct ir3_instruction *base, *off;
- base = create_uniform(ctx, regid(v->first_driver_param + 5, i));
+ base = create_uniform(ctx, regid(v->first_driver_param + IR3_TFBOS_OFF, i));
/* 24-bit should be enough: */
off = ir3_MUL_U(ctx->block, vtxcnt, 0,
@@ -2098,6 +1918,8 @@ emit_stream_out(struct ir3_compile *ctx)
static void
emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
{
+ nir_metadata_require(impl, nir_metadata_block_index);
+
emit_cf_list(ctx, &impl->body);
emit_block(ctx, impl->end_block);
@@ -2132,90 +1954,73 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
struct ir3_shader_variant *so = ctx->so;
unsigned array_len = MAX2(glsl_get_length(in->type), 1);
unsigned ncomp = glsl_get_components(in->type);
- /* XXX: map loc slots to semantics */
- unsigned semantic_name = in->data.location;
- unsigned semantic_index = in->data.index;
unsigned n = in->data.driver_location;
+ unsigned slot = in->data.location;
- DBG("; in: %u:%u, len=%ux%u, loc=%u",
- semantic_name, semantic_index, array_len,
- ncomp, n);
+ DBG("; in: slot=%u, len=%ux%u, drvloc=%u",
+ slot, array_len, ncomp, n);
- so->inputs[n].semantic =
- ir3_semantic_name(semantic_name, semantic_index);
+ so->inputs[n].slot = slot;
so->inputs[n].compmask = (1 << ncomp) - 1;
- so->inputs[n].inloc = ctx->next_inloc;
- so->inputs[n].interpolate = 0;
so->inputs_count = MAX2(so->inputs_count, n + 1);
+ so->inputs[n].interpolate = in->data.interpolation;
- /* the fdN_program_emit() code expects tgsi consts here, so map
- * things back to tgsi for now:
- */
- switch (in->data.interpolation) {
- case INTERP_QUALIFIER_FLAT:
- so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
- break;
- case INTERP_QUALIFIER_NOPERSPECTIVE:
- so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR;
- break;
- case INTERP_QUALIFIER_SMOOTH:
- so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
- break;
- }
-
- for (int i = 0; i < ncomp; i++) {
- struct ir3_instruction *instr = NULL;
- unsigned idx = (n * 4) + i;
+ if (ctx->so->type == SHADER_FRAGMENT) {
+ for (int i = 0; i < ncomp; i++) {
+ struct ir3_instruction *instr = NULL;
+ unsigned idx = (n * 4) + i;
- if (ctx->so->type == SHADER_FRAGMENT) {
- if (semantic_name == TGSI_SEMANTIC_POSITION) {
+ if (slot == VARYING_SLOT_POS) {
so->inputs[n].bary = false;
so->frag_coord = true;
instr = create_frag_coord(ctx, i);
- } else if (semantic_name == TGSI_SEMANTIC_FACE) {
+ } else if (slot == VARYING_SLOT_FACE) {
so->inputs[n].bary = false;
so->frag_face = true;
instr = create_frag_face(ctx, i);
} else {
bool use_ldlv = false;
- /* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
- * from the semantic name:
+ /* detect the special case for front/back colors where
+ * we need to do flat vs smooth shading depending on
+ * rast state:
*/
- if ((in->data.interpolation == INTERP_QUALIFIER_NONE) &&
- ((semantic_name == TGSI_SEMANTIC_COLOR) ||
- (semantic_name == TGSI_SEMANTIC_BCOLOR)))
- so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR;
+ if (in->data.interpolation == INTERP_QUALIFIER_NONE) {
+ switch (slot) {
+ case VARYING_SLOT_COL0:
+ case VARYING_SLOT_COL1:
+ case VARYING_SLOT_BFC0:
+ case VARYING_SLOT_BFC1:
+ so->inputs[n].rasterflat = true;
+ break;
+ default:
+ break;
+ }
+ }
if (ctx->flat_bypass) {
- /* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
- * from the semantic name:
- */
- switch (so->inputs[n].interpolate) {
- case TGSI_INTERPOLATE_COLOR:
- if (!ctx->so->key.rasterflat)
- break;
- /* fallthrough */
- case TGSI_INTERPOLATE_CONSTANT:
+ if ((so->inputs[n].interpolate == INTERP_QUALIFIER_FLAT) ||
+ (so->inputs[n].rasterflat && ctx->so->key.rasterflat))
use_ldlv = true;
- break;
- }
}
so->inputs[n].bary = true;
- instr = create_frag_input(ctx,
- so->inputs[n].inloc + i - 8, use_ldlv);
+ instr = create_frag_input(ctx, use_ldlv);
}
- } else {
- instr = create_input(ctx->block, idx);
- }
- ctx->ir->inputs[idx] = instr;
+ ctx->ir->inputs[idx] = instr;
+ }
+ } else if (ctx->so->type == SHADER_VERTEX) {
+ for (int i = 0; i < ncomp; i++) {
+ unsigned idx = (n * 4) + i;
+ ctx->ir->inputs[idx] = create_input(ctx->block, idx);
+ }
+ } else {
+ compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
}
if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
- ctx->next_inloc += ncomp;
so->total_in += ncomp;
}
}
@@ -2226,56 +2031,62 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
struct ir3_shader_variant *so = ctx->so;
unsigned array_len = MAX2(glsl_get_length(out->type), 1);
unsigned ncomp = glsl_get_components(out->type);
- /* XXX: map loc slots to semantics */
- unsigned semantic_name = out->data.location;
- unsigned semantic_index = out->data.index;
unsigned n = out->data.driver_location;
+ unsigned slot = out->data.location;
unsigned comp = 0;
- DBG("; out: %u:%u, len=%ux%u, loc=%u",
- semantic_name, semantic_index, array_len,
- ncomp, n);
+ DBG("; out: slot=%u, len=%ux%u, drvloc=%u",
+ slot, array_len, ncomp, n);
- if (ctx->so->type == SHADER_VERTEX) {
- switch (semantic_name) {
- case TGSI_SEMANTIC_POSITION:
+ if (ctx->so->type == SHADER_FRAGMENT) {
+ switch (slot) {
+ case FRAG_RESULT_DEPTH:
+ comp = 2; /* tgsi will write to .z component */
so->writes_pos = true;
break;
- case TGSI_SEMANTIC_PSIZE:
- so->writes_psize = true;
- break;
- case TGSI_SEMANTIC_COLOR:
- case TGSI_SEMANTIC_BCOLOR:
- case TGSI_SEMANTIC_GENERIC:
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_TEXCOORD:
+ case FRAG_RESULT_COLOR:
+ so->color0_mrt = 1;
break;
default:
- compile_error(ctx, "unknown VS semantic name: %s\n",
- tgsi_semantic_names[semantic_name]);
+ if (slot >= FRAG_RESULT_DATA0)
+ break;
+ compile_error(ctx, "unknown FS output name: %s\n",
+ gl_frag_result_name(slot));
}
- } else {
- switch (semantic_name) {
- case TGSI_SEMANTIC_POSITION:
- comp = 2; /* tgsi will write to .z component */
+ } else if (ctx->so->type == SHADER_VERTEX) {
+ switch (slot) {
+ case VARYING_SLOT_POS:
so->writes_pos = true;
break;
- case TGSI_SEMANTIC_COLOR:
- if (semantic_index == -1) {
- semantic_index = 0;
- so->color0_mrt = 1;
- }
+ case VARYING_SLOT_PSIZ:
+ so->writes_psize = true;
break;
+ case VARYING_SLOT_COL0:
+ case VARYING_SLOT_COL1:
+ case VARYING_SLOT_BFC0:
+ case VARYING_SLOT_BFC1:
+ case VARYING_SLOT_FOGC:
+ case VARYING_SLOT_CLIP_DIST0:
+ case VARYING_SLOT_CLIP_DIST1:
+ break;
+ case VARYING_SLOT_CLIP_VERTEX:
+ /* handled entirely in nir_lower_clip: */
+ return;
default:
- compile_error(ctx, "unknown FS semantic name: %s\n",
- tgsi_semantic_names[semantic_name]);
+ if (slot >= VARYING_SLOT_VAR0)
+ break;
+ if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
+ break;
+ compile_error(ctx, "unknown VS output name: %s\n",
+ gl_varying_slot_name(slot));
}
+ } else {
+ compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
}
compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
- so->outputs[n].semantic =
- ir3_semantic_name(semantic_name, semantic_index);
+ so->outputs[n].slot = slot;
so->outputs[n].regid = regid(n, comp);
so->outputs_count = MAX2(so->outputs_count, n + 1);
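
[editor's note] The regid() packing relied on here is assumed to follow the usual ir3 convention (see the ir3 headers), which is why comp = 2 above and the later regid += 2 fix-up both name the .z channel of a vec4:

	/* scalar register id = vec4 number * 4 + component (masking omitted): */
	#define regid(num, comp)  (((num) << 2) | (comp))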
@@ -2293,10 +2104,10 @@ emit_instructions(struct ir3_compile *ctx)
nir_function_impl *fxn = NULL;
/* Find the main function: */
- nir_foreach_overload(ctx->s, overload) {
- compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
- compile_assert(ctx, overload->impl);
- fxn = overload->impl;
+ nir_foreach_function(ctx->s, function) {
+ compile_assert(ctx, strcmp(function->name, "main") == 0);
+ compile_assert(ctx, function->impl);
+ fxn = function->impl;
break;
}
@@ -2312,7 +2123,7 @@ emit_instructions(struct ir3_compile *ctx)
ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
/* Create inputs in first block: */
- ctx->block = get_block(ctx, fxn->start_block);
+ ctx->block = get_block(ctx, nir_start_block(fxn));
ctx->in_block = ctx->block;
list_addtail(&ctx->block->node, &ctx->ir->block_list);
@@ -2334,17 +2145,23 @@ emit_instructions(struct ir3_compile *ctx)
}
/* Setup inputs: */
- foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
+ nir_foreach_variable(var, &ctx->s->inputs) {
setup_input(ctx, var);
}
/* Setup outputs: */
- foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
+ nir_foreach_variable(var, &ctx->s->outputs) {
setup_output(ctx, var);
}
- /* Setup variables (which should only be arrays): */
- foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
+ /* Setup global variables (which should only be arrays): */
+ nir_foreach_variable(var, &ctx->s->globals) {
+ declare_var(ctx, var);
+ }
+
+ /* Setup local variables (which should only be arrays): */
+ /* NOTE: need to do something more clever when we support >1 fxn */
+ nir_foreach_variable(var, &fxn->locals) {
declare_var(ctx, var);
}
@@ -2436,12 +2253,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
struct ir3_compile *ctx;
struct ir3 *ir;
struct ir3_instruction **inputs;
- unsigned i, j, actual_in;
+ unsigned i, j, actual_in, inloc;
int ret = 0, max_bary;
assert(!so->ir);
- ctx = compile_init(compiler, so, so->shader->tokens);
+ ctx = compile_init(compiler, so);
if (!ctx) {
DBG("INIT failed!");
ret = -1;
@@ -2468,12 +2285,10 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
/* at this point, for binning pass, throw away unneeded outputs: */
if (so->key.binning_pass) {
for (i = 0, j = 0; i < so->outputs_count; i++) {
- unsigned name = sem2name(so->outputs[i].semantic);
- unsigned idx = sem2idx(so->outputs[i].semantic);
+ unsigned slot = so->outputs[i].slot;
/* throw away everything but first position/psize */
- if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
- (name == TGSI_SEMANTIC_PSIZE))) {
+ if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
if (i != j) {
so->outputs[j] = so->outputs[i];
ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
@@ -2558,13 +2373,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
ir3_print(ir);
}
- ir3_legalize(ir, &so->has_samp, &max_bary);
-
- if (fd_mesa_debug & FD_DBG_OPTMSGS) {
- printf("AFTER LEGALIZE:\n");
- ir3_print(ir);
- }
-
/* fixup input/outputs: */
for (i = 0; i < so->outputs_count; i++) {
so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
@@ -2572,38 +2380,52 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
* but what we give the hw is the scalar register:
*/
if ((so->type == SHADER_FRAGMENT) &&
- (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
+ (so->outputs[i].slot == FRAG_RESULT_DEPTH))
so->outputs[i].regid += 2;
}
/* Note that some or all channels of an input may be unused: */
actual_in = 0;
+ inloc = 0;
for (i = 0; i < so->inputs_count; i++) {
unsigned j, regid = ~0, compmask = 0;
so->inputs[i].ncomp = 0;
+ so->inputs[i].inloc = inloc + 8;
for (j = 0; j < 4; j++) {
struct ir3_instruction *in = inputs[(i*4) + j];
- if (in) {
+ if (in && !(in->flags & IR3_INSTR_UNUSED)) {
compmask |= (1 << j);
regid = in->regs[0]->num - j;
actual_in++;
so->inputs[i].ncomp++;
+ if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) {
+ /* assign inloc: */
+ assert(in->regs[1]->flags & IR3_REG_IMMED);
+ in->regs[1]->iim_val = inloc++;
+ }
}
}
+ if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary)
+ so->varying_in++;
so->inputs[i].regid = regid;
so->inputs[i].compmask = compmask;
}
- /* fragment shader always gets full vec4's even if it doesn't
- * fetch all components, but vertex shader we need to update
- * with the actual number of components fetch, otherwise thing
- * will hang due to mismaptch between VFD_DECODE's and
- * TOTALATTRTOVS
+	/* We need to run legalize after (for frag shaders) the "bary.f"
+ * offsets (inloc) have been assigned.
*/
+ ir3_legalize(ir, &so->has_samp, &max_bary);
+
+ if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+ printf("AFTER LEGALIZE:\n");
+ ir3_print(ir);
+ }
+
+ /* Note that actual_in counts inputs that are not bary.f'd for FS: */
if (so->type == SHADER_VERTEX)
so->total_in = actual_in;
else
- so->total_in = align(max_bary + 1, 4);
+ so->total_in = max_bary + 1;
out:
if (ret) {
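
[editor's note] A condensed model of the input fix-up loop above; component_used(), assigned_scalar_reg(), is_bary() and bary_instr() are hypothetical stand-ins for checks done inline in the real loop:

	unsigned inloc = 0;
	for (unsigned i = 0; i < inputs_count; i++) {
		unsigned compmask = 0, regid = ~0;
		for (unsigned j = 0; j < 4; j++) {
			if (!component_used(i, j))          /* hypothetical */
				continue;
			compmask |= (1 << j);
			regid = assigned_scalar_reg(i, j) - j;  /* vec4 base */
			if (is_bary(i))    /* FS varying fetched via bary.f */
				bary_instr(i, j)->regs[1]->iim_val = inloc++;
		}
		/* inputs[i].regid = regid; inputs[i].compmask = compmask; */
	}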
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index be4e4e811..1cc211a76 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -41,16 +41,22 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
struct ir3_register *dst = instr->regs[0];
struct ir3_register *src = instr->regs[1];
struct ir3_instruction *src_instr = ssa(src);
+
+ /* only if mov src is SSA (not const/immed): */
+ if (!src_instr)
+ return false;
+
+ /* no indirect: */
if (dst->flags & IR3_REG_RELATIV)
return false;
if (src->flags & IR3_REG_RELATIV)
return false;
+
if (!allow_flags)
if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
return false;
- if (!src_instr)
- return false;
+
/* TODO: remove this hack: */
if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
return false;
@@ -82,10 +88,17 @@ static bool valid_flags(struct ir3_instruction *instr, unsigned n,
unsigned valid_flags;
flags = cp_flags(flags);
+ /* If destination is indirect, then source cannot be.. at least
+ * I don't think so..
+ */
+ if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
+ (flags & IR3_REG_RELATIV))
+ return false;
+
/* clear flags that are 'ok' */
switch (instr->category) {
case 1:
- valid_flags = IR3_REG_IMMED | IR3_REG_RELATIV;
+ valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
if (flags & ~valid_flags)
return false;
break;
@@ -183,9 +196,14 @@ static void combine_flags(unsigned *dstflags, unsigned srcflags)
*dstflags ^= IR3_REG_SNEG;
if (srcflags & IR3_REG_BNOT)
*dstflags ^= IR3_REG_BNOT;
-}
-static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, unsigned *flags);
+ *dstflags &= ~IR3_REG_SSA;
+ *dstflags |= srcflags & IR3_REG_SSA;
+ *dstflags |= srcflags & IR3_REG_CONST;
+ *dstflags |= srcflags & IR3_REG_IMMED;
+ *dstflags |= srcflags & IR3_REG_RELATIV;
+ *dstflags |= srcflags & IR3_REG_ARRAY;
+}
/* the "plain" MAD's (ie. the ones that don't shift first src prior to
* multiply) can swap their first two srcs if src[0] is !CONST and
@@ -206,52 +224,35 @@ static bool is_valid_mad(struct ir3_instruction *instr)
static void
reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
{
- unsigned src_flags = 0, new_flags;
- struct ir3_instruction *src_instr;
+ struct ir3_instruction *src = ssa(reg);
- if (is_meta(instr)) {
- /* meta instructions cannot fold up register
- * flags.. they are usually src for texture
- * fetch, etc, where we cannot specify abs/neg
- */
- reg->instr = instr_cp(reg->instr, NULL);
- return;
- }
-
- src_instr = instr_cp(reg->instr, &src_flags);
+ if (is_eligible_mov(src, true)) {
+ /* simple case, no immed/const/relativ, only mov's w/ ssa src: */
+ struct ir3_register *src_reg = src->regs[1];
+ unsigned new_flags = reg->flags;
- new_flags = reg->flags;
- combine_flags(&new_flags, src_flags);
+ combine_flags(&new_flags, src_reg->flags);
- reg->flags = new_flags;
- reg->instr = src_instr;
-
- if (!valid_flags(instr, n, reg->flags)) {
- /* insert an absneg.f */
- if (reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) {
- debug_assert(!(reg->flags & (IR3_REG_FNEG | IR3_REG_FABS)));
- reg->instr = ir3_ABSNEG_S(instr->block,
- reg->instr, cp_flags(src_flags));
- } else {
- debug_assert(!(reg->flags & (IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)));
- reg->instr = ir3_ABSNEG_F(instr->block,
- reg->instr, cp_flags(src_flags));
+ if (valid_flags(instr, n, new_flags)) {
+ if (new_flags & IR3_REG_ARRAY) {
+ debug_assert(!(reg->flags & IR3_REG_ARRAY));
+ reg->array = src_reg->array;
+ }
+ reg->flags = new_flags;
+ reg->instr = ssa(src_reg);
}
- reg->flags &= ~cp_flags(src_flags);
- debug_assert(valid_flags(instr, n, reg->flags));
- /* send it through instr_cp() again since
- * the absneg src might be a mov from const
- * that could be cleaned up:
- */
- reg->instr = instr_cp(reg->instr, NULL);
- return;
- }
- if (is_same_type_mov(reg->instr)) {
- struct ir3_register *src_reg = reg->instr->regs[1];
- unsigned new_flags = src_reg->flags;
+ src = ssa(reg); /* could be null for IR3_REG_ARRAY case */
+ if (!src)
+ return;
+ } else if (is_same_type_mov(src) &&
+ /* cannot collapse const/immed/etc into meta instrs: */
+ !is_meta(instr)) {
+ /* immed/const/etc cases, which require some special handling: */
+ struct ir3_register *src_reg = src->regs[1];
+ unsigned new_flags = reg->flags;
- combine_flags(&new_flags, reg->flags);
+ combine_flags(&new_flags, src_reg->flags);
if (!valid_flags(instr, n, new_flags)) {
/* special case for "normal" mad instructions, we can
@@ -287,6 +288,16 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
conflicts(instr->address, reg->instr->address))
return;
+ /* This seems to be a hw bug, or something where the timings
+ * just somehow don't work out. This restriction may only
+ * apply if the first src is also CONST.
+ */
+ if ((instr->category == 3) && (n == 2) &&
+ (src_reg->flags & IR3_REG_RELATIV) &&
+ (src_reg->array.offset == 0))
+ return;
+
+ src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
instr->regs[n+1] = src_reg;
@@ -298,6 +309,7 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
if ((src_reg->flags & IR3_REG_RELATIV) &&
!conflicts(instr->address, reg->instr->address)) {
+ src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
instr->regs[n+1] = src_reg;
ir3_instr_set_address(instr, reg->instr->address);
@@ -330,8 +342,10 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
if (new_flags & IR3_REG_BNOT)
iim_val = ~iim_val;
- if (!(iim_val & ~0x3ff)) {
+ /* other than category 1 (mov) we can only encode up to 10 bits: */
+ if ((instr->category == 1) || !(iim_val & ~0x3ff)) {
new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
+ src_reg = ir3_reg_clone(instr->block->shader, src_reg);
src_reg->flags = new_flags;
src_reg->iim_val = iim_val;
instr->regs[n+1] = src_reg;
@@ -342,56 +356,68 @@ reg_cp(struct ir3_instruction *instr, struct ir3_register *reg, unsigned n)
}
}
-/**
- * Given an SSA src (instruction), return the one with extraneous
- * mov's removed, ie, for (to copy NIR syntax):
- *
- * vec1 ssa1 = fadd <something>, <somethingelse>
- * vec1 ssa2 = fabs ssa1
- * vec1 ssa3 = fneg ssa1
- *
- * then calling instr_cp(ssa3, &flags) would return ssa1 with
- * (IR3_REG_ABS | IR3_REG_NEGATE) in flags. If flags is NULL,
- * then disallow eliminating copies which would require flag
- * propagation (for example, we cannot propagate abs/neg into
- * an output).
+/* Handle special case of eliminating output mov, and similar cases where
+ * there isn't a normal "consuming" instruction. In this case we cannot
+ * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
+ * be eliminated)
*/
static struct ir3_instruction *
-instr_cp(struct ir3_instruction *instr, unsigned *flags)
+eliminate_output_mov(struct ir3_instruction *instr)
{
- struct ir3_register *reg;
-
- if (is_eligible_mov(instr, !!flags)) {
+ if (is_eligible_mov(instr, false)) {
struct ir3_register *reg = instr->regs[1];
- struct ir3_instruction *src_instr = ssa(reg);
- if (flags)
- combine_flags(flags, reg->flags);
- return instr_cp(src_instr, flags);
+ if (!(reg->flags & IR3_REG_ARRAY)) {
+ struct ir3_instruction *src_instr = ssa(reg);
+ debug_assert(src_instr);
+ return src_instr;
+ }
}
+ return instr;
+}
+
+/**
+ * Find instruction src's which are mov's that can be collapsed, replacing
+ * the mov dst with the mov src
+ */
+static void
+instr_cp(struct ir3_instruction *instr)
+{
+ struct ir3_register *reg;
+
+ if (instr->regs_count == 0)
+ return;
- /* Check termination condition before walking children (rather
- * than before checking eligible-mov). A mov instruction may
- * appear as ssa-src for multiple other instructions, and we
- * want to consider it for removal for each, rather than just
- * the first one. (But regardless of how many places it shows
- * up as a src, we only need to recursively walk the children
- * once.)
- */
if (ir3_instr_check_mark(instr))
- return instr;
+ return;
/* walk down the graph from each src: */
foreach_src_n(reg, n, instr) {
- if (!(reg->flags & IR3_REG_SSA))
+ struct ir3_instruction *src = ssa(reg);
+
+ if (!src)
+ continue;
+
+ instr_cp(src);
+
+		/* TODO: for non-indirect access we could figure out which
+		 * register we actually want and allow cp..
+ */
+ if (reg->flags & IR3_REG_ARRAY)
continue;
reg_cp(instr, reg, n);
}
- if (instr->address)
- ir3_instr_set_address(instr, instr_cp(instr->address, NULL));
+ if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+ struct ir3_instruction *src = ssa(instr->regs[0]);
+ if (src)
+ instr_cp(src);
+ }
- return instr;
+ if (instr->address) {
+ instr_cp(instr->address);
+ ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
+ }
}
void
@@ -401,19 +427,20 @@ ir3_cp(struct ir3 *ir)
for (unsigned i = 0; i < ir->noutputs; i++) {
if (ir->outputs[i]) {
- struct ir3_instruction *out =
- instr_cp(ir->outputs[i], NULL);
-
- ir->outputs[i] = out;
+ instr_cp(ir->outputs[i]);
+ ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
}
}
for (unsigned i = 0; i < ir->keeps_count; i++) {
- ir->keeps[i] = instr_cp(ir->keeps[i], NULL);
+ instr_cp(ir->keeps[i]);
+ ir->keeps[i] = eliminate_output_mov(ir->keeps[i]);
}
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
- if (block->condition)
- block->condition = instr_cp(block->condition, NULL);
+ if (block->condition) {
+ instr_cp(block->condition);
+ block->condition = eliminate_output_mov(block->condition);
+ }
}
}
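
[editor's note] A small worked example of the flag folding above, relying only on the XOR behavior shown in combine_flags(): two stacked integer negates cancel, so the collapsed source carries no flag at all (sketch, reusing the file's names):

	unsigned flags = IR3_REG_SNEG;        /* consumer's own negate */
	combine_flags(&flags, IR3_REG_SNEG);  /* fold a sneg mov: XOR */
	debug_assert(!(flags & IR3_REG_SNEG)); /* net result: plain read */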
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index 97df0c2ac..6d294f1a4 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -76,7 +76,7 @@ int ir3_delayslots(struct ir3_instruction *assigner,
return 6;
} else if ((consumer->category == 3) &&
(is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
- (n == 2)) {
+ (n == 3)) {
/* special case, 3rd src to cat3 not required on first cycle */
return 1;
} else {
@@ -118,6 +118,10 @@ ir3_instr_depth(struct ir3_instruction *instr)
/* visit child to compute it's depth: */
ir3_instr_depth(src);
+ /* for array writes, no need to delay on previous write: */
+ if (i == 0)
+ continue;
+
sd = ir3_delayslots(src, instr, i) + src->depth;
instr->depth = MAX2(instr->depth, sd);
@@ -139,7 +143,7 @@ remove_unused_by_block(struct ir3_block *block)
/* mark it, in case it is input, so we can
* remove unused inputs:
*/
- instr->depth = DEPTH_UNUSED;
+ instr->flags |= IR3_INSTR_UNUSED;
/* and remove from instruction list: */
list_delinit(&instr->node);
}
@@ -175,14 +179,14 @@ ir3_depth(struct ir3 *ir)
*/
for (i = 0; i < ir->indirects_count; i++) {
struct ir3_instruction *instr = ir->indirects[i];
- if (instr->depth == DEPTH_UNUSED)
+ if (instr->flags & IR3_INSTR_UNUSED)
ir->indirects[i] = NULL;
}
/* cleanup unused inputs: */
for (i = 0; i < ir->ninputs; i++) {
struct ir3_instruction *in = ir->inputs[i];
- if (in && (in->depth == DEPTH_UNUSED))
+ if (in && (in->flags & IR3_INSTR_UNUSED))
ir->inputs[i] = NULL;
}
}
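
[editor's note] Worked example for the corrected (n == 3) check above: an assigner feeding the 3rd source of a mad contributes only src->depth + 1 to the consumer's depth, rather than src->depth plus the full delay (6 slots in the case shown), because that operand is not needed on the mad's first cycle. E.g. with src->depth = 4, the mad's depth becomes at least 5 instead of 10.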
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c
new file mode 100644
index 000000000..565b9c32c
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -0,0 +1,153 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Rob Clark <robclark@freedesktop.org>
+ */
+
+
+#include "freedreno_util.h"
+
+#include "ir3_nir.h"
+#include "ir3_compiler.h"
+#include "ir3_shader.h"
+
+#include "nir/tgsi_to_nir.h"
+
+struct nir_shader *
+ir3_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+ static const nir_shader_compiler_options options = {
+ .lower_fpow = true,
+ .lower_fsat = true,
+ .lower_scmp = true,
+ .lower_flrp = true,
+ .lower_ffract = true,
+ .native_integers = true,
+ };
+ return tgsi_to_nir(tokens, &options);
+}
+
+/* for given shader key, are any steps handled in nir? */
+bool
+ir3_key_lowers_nir(const struct ir3_shader_key *key)
+{
+ return key->fsaturate_s | key->fsaturate_t | key->fsaturate_r |
+ key->vsaturate_s | key->vsaturate_t | key->vsaturate_r |
+ key->ucp_enables | key->color_two_side;
+}
+
+#define OPT(nir, pass, ...) ({ \
+ bool this_progress = false; \
+ NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
+ this_progress; \
+})
+
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+struct nir_shader *
+ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+ const struct ir3_shader_key *key)
+{
+ struct nir_lower_tex_options tex_options = {
+ .lower_rect = 0,
+ };
+ bool progress;
+
+ if (key) {
+ switch (shader->type) {
+ case SHADER_FRAGMENT:
+ case SHADER_COMPUTE:
+ tex_options.saturate_s = key->fsaturate_s;
+ tex_options.saturate_t = key->fsaturate_t;
+ tex_options.saturate_r = key->fsaturate_r;
+ break;
+ case SHADER_VERTEX:
+ tex_options.saturate_s = key->vsaturate_s;
+ tex_options.saturate_t = key->vsaturate_t;
+ tex_options.saturate_r = key->vsaturate_r;
+ break;
+ }
+ }
+
+ if (shader->compiler->gpu_id >= 400) {
+ /* a4xx seems to have *no* sam.p */
+ tex_options.lower_txp = ~0; /* lower all txp */
+ } else {
+ /* a3xx just needs to avoid sam.p for 3d tex */
+ tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D);
+ }
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ debug_printf("----------------------\n");
+ nir_print_shader(s, stdout);
+ debug_printf("----------------------\n");
+ }
+
+ OPT_V(s, nir_opt_global_to_local);
+ OPT_V(s, nir_convert_to_ssa);
+
+ if (key) {
+ if (s->stage == MESA_SHADER_VERTEX) {
+ OPT_V(s, nir_lower_clip_vs, key->ucp_enables);
+ } else if (s->stage == MESA_SHADER_FRAGMENT) {
+ OPT_V(s, nir_lower_clip_fs, key->ucp_enables);
+ }
+ if (key->color_two_side) {
+ OPT_V(s, nir_lower_two_sided_color);
+ }
+ }
+
+ OPT_V(s, nir_lower_tex, &tex_options);
+ OPT_V(s, nir_lower_idiv);
+ OPT_V(s, nir_lower_load_const_to_scalar);
+
+ do {
+ progress = false;
+
+ OPT_V(s, nir_lower_vars_to_ssa);
+ OPT_V(s, nir_lower_alu_to_scalar);
+ OPT_V(s, nir_lower_phis_to_scalar);
+
+ progress |= OPT(s, nir_copy_prop);
+ progress |= OPT(s, nir_opt_dce);
+ progress |= OPT(s, nir_opt_cse);
+ progress |= OPT(s, ir3_nir_lower_if_else);
+ progress |= OPT(s, nir_opt_algebraic);
+ progress |= OPT(s, nir_opt_constant_folding);
+
+ } while (progress);
+
+ OPT_V(s, nir_remove_dead_variables);
+
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ debug_printf("----------------------\n");
+ nir_print_shader(s, stdout);
+ debug_printf("----------------------\n");
+ }
+
+ nir_sweep(s);
+
+ return s;
+}
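
[editor's note] Usage sketch for the OPT()/OPT_V() macros above: OPT() reports per-pass progress to drive a fixed-point loop, while OPT_V() runs a pass unconditionally for its side effects:

	bool progress;
	do {
		progress = false;
		OPT_V(s, nir_lower_vars_to_ssa);    /* always run */
		progress |= OPT(s, nir_copy_prop);  /* loop while changing */
		progress |= OPT(s, nir_opt_dce);
	} while (progress);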
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h
index f3d3075e6..e2d885960 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir.h
@@ -29,8 +29,16 @@
#ifndef IR3_NIR_H_
#define IR3_NIR_H_
-#include "glsl/nir/nir.h"
+#include "compiler/nir/nir.h"
+#include "compiler/shader_enums.h"
+
+#include "ir3_shader.h"
bool ir3_nir_lower_if_else(nir_shader *shader);
+struct nir_shader * ir3_tgsi_to_nir(const struct tgsi_token *tokens);
+bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
+struct nir_shader * ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
+ const struct ir3_shader_key *key);
+
#endif /* IR3_NIR_H_ */
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index dc9e4626f..8815ac981 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -28,7 +28,8 @@
*/
#include "ir3_nir.h"
-#include "glsl/nir/nir_builder.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_control_flow.h"
/* Based on nir_opt_peephole_select, and hacked up to more aggressively
* flatten anything that can be flattened
@@ -171,7 +172,7 @@ flatten_block(nir_builder *bld, nir_block *if_block, nir_block *prev_block,
(intr->intrinsic == nir_intrinsic_discard_if)) {
nir_ssa_def *discard_cond;
- nir_builder_insert_after_instr(bld,
+ bld->cursor = nir_after_instr(
nir_block_last_instr(prev_block));
if (invert) {
@@ -293,8 +294,7 @@ lower_if_else_block(nir_block *block, void *void_state)
sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
nir_ssa_def_rewrite_uses(&phi->dest.ssa,
- nir_src_for_ssa(&sel->dest.dest.ssa),
- state->mem_ctx);
+ nir_src_for_ssa(&sel->dest.dest.ssa));
nir_instr_insert_before(&phi->instr, &sel->instr);
nir_instr_remove(&phi->instr);
@@ -328,9 +328,9 @@ ir3_nir_lower_if_else(nir_shader *shader)
{
bool progress = false;
- nir_foreach_overload(shader, overload) {
- if (overload->impl)
- progress |= lower_if_else_impl(overload->impl);
+ nir_foreach_function(shader, function) {
+ if (function->impl)
+ progress |= lower_if_else_impl(function->impl);
}
return progress;
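
[editor's note] The transform, in the NIR syntax used elsewhere in this code (illustrative): both sides of a cheap if/else are speculated and the phi becomes a select:

	vec1 ssa_3 = fadd ssa_a, ssa_b      /* then-block */
	vec1 ssa_4 = fmul ssa_a, ssa_b      /* else-block */
	vec1 ssa_5 = phi ssa_3, ssa_4
		==>
	vec1 ssa_3 = fadd ssa_a, ssa_b      /* both executed unconditionally */
	vec1 ssa_4 = fmul ssa_a, ssa_b
	vec1 ssa_5 = bcsel ssa_cond, ssa_3, ssa_4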
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c
index a84e7989c..ba0c4a57a 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -94,7 +94,7 @@ static void print_instr_name(struct ir3_instruction *instr)
}
}
-static void print_reg_name(struct ir3_register *reg, bool followssa)
+static void print_reg_name(struct ir3_register *reg)
{
if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
@@ -106,20 +106,29 @@ static void print_reg_name(struct ir3_register *reg, bool followssa)
if (reg->flags & IR3_REG_IMMED) {
printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
- } else if (reg->flags & IR3_REG_SSA) {
- printf("_");
- if (followssa) {
- printf("[");
+ } else if (reg->flags & IR3_REG_ARRAY) {
+ printf("arr[id=%u, offset=%d, size=%u", reg->array.id,
+ reg->array.offset, reg->size);
+	/* for ARRAY we could have a null src, for example the first
+	 * write instruction..
+ */
+ if (reg->instr) {
+ printf(", _[");
print_instr_name(reg->instr);
printf("]");
}
+ printf("]");
+ } else if (reg->flags & IR3_REG_SSA) {
+ printf("_[");
+ print_instr_name(reg->instr);
+ printf("]");
} else if (reg->flags & IR3_REG_RELATIV) {
if (reg->flags & IR3_REG_HALF)
printf("h");
if (reg->flags & IR3_REG_CONST)
- printf("c<a0.x + %u>", reg->num);
+ printf("c<a0.x + %d>", reg->array.offset);
else
- printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
+ printf("\x1b[0;31mr<a0.x + %d>\x1b[0m (%u)", reg->array.offset, reg->size);
} else {
if (reg->flags & IR3_REG_HALF)
printf("h");
@@ -158,7 +167,7 @@ print_instr(struct ir3_instruction *instr, int lvl)
for (i = 0; i < instr->regs_count; i++) {
struct ir3_register *reg = instr->regs[i];
printf(i ? ", " : " ");
- print_reg_name(reg, !!i);
+ print_reg_name(reg);
}
if (instr->address) {
@@ -168,13 +177,6 @@ print_instr(struct ir3_instruction *instr, int lvl)
printf("]");
}
- if (instr->fanin) {
- printf(", fanin=_");
- printf("[");
- print_instr_name(instr->fanin);
- printf("]");
- }
-
if (instr->cp.left) {
printf(", left=_");
printf("[");
@@ -192,8 +194,6 @@ print_instr(struct ir3_instruction *instr, int lvl)
if (is_meta(instr)) {
if (instr->opc == OPC_META_FO) {
printf(", off=%d", instr->fo.off);
- } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
- printf(", aid=%d", instr->fi.aid);
}
}
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index 88018398e..bcad96e8a 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -68,25 +68,24 @@
* LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
* register assignment. But for us that is horrible from a scheduling
* standpoint. Instead what we do is use idea of 'definer' instruction.
- * Ie. the first instruction (lowest ip) to write to the array is the
+ * Ie. the first instruction (lowest ip) to write to the variable is the
* one we consider from use/def perspective when building interference
- * graph. (Other instructions which write other array elements just
- * define the variable some more.)
+ * graph. (Other instructions which write other variable components
+ * just define the variable some more.)
+ *
+ * Arrays of arbitrary size are handled via pre-coloring a consecutive
+ * sequence of registers. Additional scalar (single component) reg
+ * names are allocated starting at ctx->class_base[total_class_count]
+ * (see arr->base), which are pre-colored. In the use/def graph direct
+ * access is treated as a single element use/def, and indirect access
+ * is treated as use or def of all array elements. (Only the first
+ * def is tracked, in case of multiple indirect writes, etc.)
*/
static const unsigned class_sizes[] = {
1, 2, 3, 4,
4 + 4, /* txd + 1d/2d */
4 + 6, /* txd + 3d */
- /* temporary: until we can assign arrays, create classes so we
- * can round up array to fit. NOTE with tgsi arrays should
- * really all be multiples of four:
- */
- 4 * 4,
- 4 * 8,
- 4 * 16,
- 4 * 32,
-
};
#define class_count ARRAY_SIZE(class_sizes)
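
[editor's note] The resulting RA name space, combining the fixed size classes with the per-array scalar names described above (sketch):

	name(ssa value)    = class_base[cls] + instr->name
	name(array elem i) = arr->base + i

	[ cls 0 .. | cls 1 .. | ... | array elements ........ ]
	^ class_base[0]             ^ class_base[total_class_count]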
@@ -265,13 +264,21 @@ struct ir3_ra_ctx {
struct ir3_ra_reg_set *set;
struct ra_graph *g;
unsigned alloc_count;
- unsigned class_alloc_count[total_class_count];
- unsigned class_base[total_class_count];
+ /* one per class, plus one slot for arrays: */
+ unsigned class_alloc_count[total_class_count + 1];
+ unsigned class_base[total_class_count + 1];
unsigned instr_cnt;
unsigned *def, *use; /* def/use table */
struct ir3_ra_instr_data *instrd;
};
+/* does it conflict? */
+static inline bool
+intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
+{
+ return !((a_start >= b_end) || (b_start >= a_end));
+}
+
static bool
is_half(struct ir3_instruction *instr)
{
@@ -314,6 +321,14 @@ writes_gpr(struct ir3_instruction *instr)
return is_temp(instr->regs[0]);
}
+static bool
+instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
+{
+ if (a->flags & IR3_INSTR_UNUSED)
+ return false;
+ return (a->ip < b->ip);
+}
+
static struct ir3_instruction *
get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
int *sz, int *off)
@@ -321,9 +336,6 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
struct ir3_instruction *d = NULL;
- if (instr->fanin)
- return get_definer(ctx, instr->fanin, sz, off);
-
if (id->defn) {
*sz = id->sz;
*off = id->off;
@@ -348,7 +360,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
dd = get_definer(ctx, src->instr, &dsz, &doff);
- if ((!d) || (dd->ip < d->ip)) {
+ if ((!d) || instr_before(dd, d)) {
d = dd;
*sz = dsz;
*off = doff - n;
@@ -369,9 +381,14 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
*/
int cnt = 0;
- d = f;
+ /* need to skip over unused in the group: */
+ while (f && (f->flags & IR3_INSTR_UNUSED)) {
+ f = f->cp.right;
+ cnt++;
+ }
+
while (f) {
- if (f->ip < d->ip)
+ if ((!d) || instr_before(f, d))
d = f;
if (f == instr)
*off = cnt;
@@ -414,7 +431,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
*sz = MAX2(*sz, dsz);
*off = doff;
- if (dd->ip < d->ip) {
+ if (instr_before(dd, d)) {
d = dd;
}
}
@@ -432,7 +449,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
foreach_src(src, d) {
if (!src->instr)
continue;
- if (src->instr->ip < dd->ip)
+ if (instr_before(src->instr, dd))
dd = src->instr;
}
@@ -446,7 +463,7 @@ get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
/* by definition, should come before: */
- debug_assert(dd->ip < d->ip);
+ debug_assert(instr_before(dd, d));
*sz = MAX2(*sz, dsz);
@@ -472,10 +489,13 @@ ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
/* couple special cases: */
if (writes_addr(instr) || writes_pred(instr)) {
id->cls = -1;
- continue;
+ } else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
+ id->cls = total_class_count;
+ id->defn = instr;
+ } else {
+ id->defn = get_definer(ctx, instr, &id->sz, &id->off);
+ id->cls = size_to_class(id->sz, is_half(id->defn));
}
- id->defn = get_definer(ctx, instr, &id->sz, &id->off);
- id->cls = size_to_class(id->sz, is_half(id->defn));
}
}
@@ -505,8 +525,6 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
/* arrays which don't fit in one of the pre-defined class
* sizes are pre-colored:
- *
- * TODO but we still need to allocate names for them, don't we??
*/
if (id->cls >= 0) {
instr->name = ctx->class_alloc_count[id->cls]++;
@@ -518,7 +536,7 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
static void
ra_init(struct ir3_ra_ctx *ctx)
{
- unsigned n;
+ unsigned n, base;
ir3_clear_mark(ctx->ir);
n = ir3_count_instructions(ctx->ir);
@@ -537,11 +555,20 @@ ra_init(struct ir3_ra_ctx *ctx)
* actual ra name is class_base[cls] + instr->name;
*/
ctx->class_base[0] = 0;
- for (unsigned i = 1; i < total_class_count; i++) {
+ for (unsigned i = 1; i <= total_class_count; i++) {
ctx->class_base[i] = ctx->class_base[i-1] +
ctx->class_alloc_count[i-1];
}
+ /* and vreg names for array elements: */
+ base = ctx->class_base[total_class_count];
+ list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+ arr->base = base;
+ ctx->class_alloc_count[total_class_count] += arr->length;
+ base += arr->length;
+ }
+ ctx->alloc_count += ctx->class_alloc_count[total_class_count];
+
ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
ralloc_steal(ctx->g, ctx->instrd);
ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
@@ -549,15 +576,23 @@ ra_init(struct ir3_ra_ctx *ctx)
}
static unsigned
-ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
{
unsigned name;
debug_assert(cls >= 0);
+ debug_assert(cls < total_class_count); /* we shouldn't get arrays here.. */
name = ctx->class_base[cls] + defn->name;
debug_assert(name < ctx->alloc_count);
return name;
}
+static int
+ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
+{
+ /* TODO handle name mapping for arrays */
+ return __ra_name(ctx, id->cls, id->defn);
+}
+
static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
@@ -570,6 +605,22 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
struct ir3_ra_block_data *bd;
unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+#define def(name, instr) \
+ do { \
+ /* defined on first write: */ \
+ if (!ctx->def[name]) \
+ ctx->def[name] = instr->ip; \
+ ctx->use[name] = instr->ip; \
+ BITSET_SET(bd->def, name); \
+ } while(0);
+
+#define use(name, instr) \
+ do { \
+ ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
+ if (!BITSET_TEST(bd->def, name)) \
+ BITSET_SET(bd->use, name); \
+ } while(0);
+
bd = rzalloc(ctx->g, struct ir3_ra_block_data);
bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
@@ -577,10 +628,11 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
- block->bd = bd;
+ block->data = bd;
list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
struct ir3_instruction *src;
+ struct ir3_register *reg;
if (instr->regs_count == 0)
continue;
@@ -612,61 +664,101 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
if (writes_gpr(instr)) {
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+ struct ir3_register *dst = instr->regs[0];
- if (id->defn == instr) {
- /* arrays which don't fit in one of the pre-defined class
- * sizes are pre-colored:
- */
- if (id->cls >= 0) {
- unsigned name = ra_name(ctx, id->cls, id->defn);
+ if (dst->flags & IR3_REG_ARRAY) {
+ struct ir3_array *arr =
+ ir3_lookup_array(ctx->ir, dst->array.id);
+ unsigned i;
- ctx->def[name] = id->defn->ip;
- ctx->use[name] = id->defn->ip;
+ debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
- /* since we are in SSA at this point: */
- debug_assert(!BITSET_TEST(bd->use, name));
+ arr->start_ip = MIN2(arr->start_ip, instr->ip);
+ arr->end_ip = MAX2(arr->end_ip, instr->ip);
- BITSET_SET(bd->def, name);
+ /* set the node class now.. in case we don't encounter
+ * this array dst again. From register_alloc algo's
+ * perspective, these are all single/scalar regs:
+ */
+ for (i = 0; i < arr->length; i++) {
+ unsigned name = arr->base + i;
+ ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
+ }
- if (is_half(id->defn)) {
- ra_set_node_class(ctx->g, name,
- ctx->set->half_classes[id->cls - class_count]);
- } else {
- ra_set_node_class(ctx->g, name,
- ctx->set->classes[id->cls]);
+ /* indirect write is treated like a write to all array
+ * elements, since we don't know which one is actually
+ * written:
+ */
+ if (dst->flags & IR3_REG_RELATIV) {
+ for (i = 0; i < arr->length; i++) {
+ unsigned name = arr->base + i;
+ def(name, instr);
}
+ } else {
+ unsigned name = arr->base + dst->array.offset;
+ def(name, instr);
+ }
+
+ } else if (id->defn == instr) {
+ unsigned name = ra_name(ctx, id);
+
+ /* since we are in SSA at this point: */
+ debug_assert(!BITSET_TEST(bd->use, name));
+
+ def(name, id->defn);
+
+ if (is_half(id->defn)) {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->half_classes[id->cls - class_count]);
+ } else {
+ ra_set_node_class(ctx->g, name,
+ ctx->set->classes[id->cls]);
+ }
- /* extend the live range for phi srcs, which may come
- * from the bottom of the loop
- */
- if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
- struct ir3_instruction *phi = id->defn->regs[0]->instr;
- foreach_ssa_src(src, phi) {
- /* if src is after phi, then we need to extend
- * the liverange to the end of src's block:
- */
- if (src->ip > phi->ip) {
- struct ir3_instruction *last =
+ /* extend the live range for phi srcs, which may come
+ * from the bottom of the loop
+ */
+ if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+ struct ir3_instruction *phi = id->defn->regs[0]->instr;
+ foreach_ssa_src(src, phi) {
+ /* if src is after phi, then we need to extend
+ * the liverange to the end of src's block:
+ */
+ if (src->ip > phi->ip) {
+ struct ir3_instruction *last =
list_last_entry(&src->block->instr_list,
- struct ir3_instruction, node);
- ctx->use[name] = MAX2(ctx->use[name], last->ip);
- }
+ struct ir3_instruction, node);
+ ctx->use[name] = MAX2(ctx->use[name], last->ip);
}
}
}
}
}
- foreach_ssa_src(src, instr) {
- if (writes_gpr(src)) {
- struct ir3_ra_instr_data *id = &ctx->instrd[src->ip];
-
- if (id->cls >= 0) {
- unsigned name = ra_name(ctx, id->cls, id->defn);
- ctx->use[name] = MAX2(ctx->use[name], instr->ip);
- if (!BITSET_TEST(bd->def, name))
- BITSET_SET(bd->use, name);
+ foreach_src(reg, instr) {
+ if (reg->flags & IR3_REG_ARRAY) {
+ struct ir3_array *arr =
+ ir3_lookup_array(ctx->ir, reg->array.id);
+ arr->start_ip = MIN2(arr->start_ip, instr->ip);
+ arr->end_ip = MAX2(arr->end_ip, instr->ip);
+			/* indirect read is treated like a read from all array
+ * elements, since we don't know which one is actually
+ * read:
+ */
+ if (reg->flags & IR3_REG_RELATIV) {
+ unsigned i;
+ for (i = 0; i < arr->length; i++) {
+ unsigned name = arr->base + i;
+ use(name, instr);
+ }
+ } else {
+ unsigned name = arr->base + reg->array.offset;
+ use(name, instr);
+ debug_assert(reg->array.offset < arr->length);
}
+ } else if ((src = ssa(reg)) && writes_gpr(src)) {
+ unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
+ use(name, instr);
}
}
}
@@ -679,7 +771,7 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
bool progress = false;
list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
- struct ir3_ra_block_data *bd = block->bd;
+ struct ir3_ra_block_data *bd = block->data;
/* update livein: */
for (unsigned i = 0; i < bitset_words; i++) {
@@ -700,7 +792,7 @@ ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
if (!succ)
continue;
- succ_bd = succ->bd;
+ succ_bd = succ->data;
for (unsigned i = 0; i < bitset_words; i++) {
BITSET_WORD new_liveout =
@@ -722,6 +814,12 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
{
struct ir3 *ir = ctx->ir;
+ /* initialize array live ranges: */
+ list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
+ arr->start_ip = ~0;
+ arr->end_ip = 0;
+ }
+
/* compute live ranges (use/def) on a block level, also updating
* block's def/use bitmasks (used below to calculate per-block
* livein/liveout):
@@ -736,7 +834,7 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
/* extend start/end ranges based on livein/liveout info from cfg: */
unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
- struct ir3_ra_block_data *bd = block->bd;
+ struct ir3_ra_block_data *bd = block->data;
for (unsigned i = 0; i < bitset_words; i++) {
if (BITSET_TEST(bd->livein, i)) {
@@ -754,18 +852,14 @@ ra_add_interference(struct ir3_ra_ctx *ctx)
/* need to fix things up to keep outputs live: */
for (unsigned i = 0; i < ir->noutputs; i++) {
struct ir3_instruction *instr = ir->outputs[i];
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
-
- if (id->cls >= 0) {
- unsigned name = ra_name(ctx, id->cls, id->defn);
- ctx->use[name] = ctx->instr_cnt;
- }
+ unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
+ ctx->use[name] = ctx->instr_cnt;
}
for (unsigned i = 0; i < ctx->alloc_count; i++) {
for (unsigned j = 0; j < ctx->alloc_count; j++) {
- if (!((ctx->def[i] >= ctx->use[j]) ||
- (ctx->def[j] >= ctx->use[i]))) {
+ if (intersects(ctx->def[i], ctx->use[i],
+ ctx->def[j], ctx->use[j])) {
ra_add_node_interference(ctx->g, i, j);
}
}
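
[editor's note] A self-contained check of the interference test above; live ranges behave as half-open intervals, so a def starting exactly at another name's last use does not interfere (stand-alone harness, not driver code):

	#include <assert.h>
	#include <stdbool.h>

	/* same definition as intersects() above */
	static bool intersects(unsigned a_start, unsigned a_end,
			unsigned b_start, unsigned b_end)
	{
		return !((a_start >= b_end) || (b_start >= a_end));
	}

	int main(void)
	{
		assert(!intersects(4, 10, 10, 12));  /* back-to-back: ok */
		assert( intersects(4, 10,  7,  9));  /* overlap: conflict */
		return 0;
	}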
@@ -823,19 +917,36 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
}
}
+/* NOTE: instr could be NULL in the IR3_REG_ARRAY case, for the first
+ * array access(es), which do not have any previous access to depend
+ * on from a scheduling point of view
+ */
static void
reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
struct ir3_instruction *instr)
{
- struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
+ struct ir3_ra_instr_data *id;
+
+ if (reg->flags & IR3_REG_ARRAY) {
+ struct ir3_array *arr =
+ ir3_lookup_array(ctx->ir, reg->array.id);
+ unsigned name = arr->base + reg->array.offset;
+ unsigned r = ra_get_node_reg(ctx->g, name);
+ unsigned num = ctx->set->ra_reg_to_gpr[r];
+
+ if (reg->flags & IR3_REG_RELATIV) {
+ reg->array.offset = num;
+ } else {
+ reg->num = num;
+ }
- if (id->cls >= 0) {
- unsigned name = ra_name(ctx, id->cls, id->defn);
+ reg->flags &= ~IR3_REG_ARRAY;
+ } else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
+ unsigned name = ra_name(ctx, id);
unsigned r = ra_get_node_reg(ctx->g, name);
unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
- if (reg->flags & IR3_REG_RELATIV)
- num += reg->offset;
+ debug_assert(!(reg->flags & IR3_REG_RELATIV));
reg->num = num;
reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
@@ -862,9 +973,9 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
foreach_src_n(reg, n, instr) {
struct ir3_instruction *src = reg->instr;
- if (!src)
+ /* Note: reg->instr could be null for IR3_REG_ARRAY */
+ if (!(src || (reg->flags & IR3_REG_ARRAY)))
continue;
-
reg_assign(ctx, instr->regs[n+1], src);
if (instr->regs[n+1]->flags & IR3_REG_HALF)
fixup_half_instr_src(instr);
@@ -875,6 +986,8 @@ ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
static int
ra_alloc(struct ir3_ra_ctx *ctx)
{
+ unsigned n = 0;
+
/* frag shader inputs get pre-assigned, since we have some
* constraints/unknowns about setup for some of these regs:
*/
@@ -884,7 +997,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
struct ir3_instruction *instr = ir->inputs[i];
int cls = size_to_class(1, true);
- unsigned name = ra_name(ctx, cls, instr);
+ unsigned name = __ra_name(ctx, cls, instr);
unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
/* if we have frag_face, it gets hr0.x */
@@ -892,7 +1005,8 @@ ra_alloc(struct ir3_ra_ctx *ctx)
i += 4;
}
- for (j = 0; i < ir->ninputs; i++) {
+ j = 0;
+ for (; i < ir->ninputs; i++) {
struct ir3_instruction *instr = ir->inputs[i];
if (instr) {
struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
@@ -900,7 +1014,7 @@ ra_alloc(struct ir3_ra_ctx *ctx)
if (id->defn == instr) {
unsigned name, reg;
- name = ra_name(ctx, id->cls, id->defn);
+ name = ra_name(ctx, id);
reg = ctx->set->gpr_to_ra_reg[id->cls][j];
ra_set_node_reg(ctx->g, name, reg);
@@ -908,6 +1022,46 @@ ra_alloc(struct ir3_ra_ctx *ctx)
}
}
}
+ n = j;
+ }
+
+ /* pre-assign array elements:
+ */
+ list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
+ unsigned base = n;
+
+ if (arr->end_ip == 0)
+ continue;
+
+ /* figure out what else we conflict with which has already
+ * been assigned:
+ */
+retry:
+ list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
+ if (arr2 == arr)
+ break;
+ if (arr2->end_ip == 0)
+ continue;
+ /* if it intersects with liverange AND register range.. */
+ if (intersects(arr->start_ip, arr->end_ip,
+ arr2->start_ip, arr2->end_ip) &&
+ intersects(base, base + arr->length,
+ arr2->reg, arr2->reg + arr2->length)) {
+ base = MAX2(base, arr2->reg + arr2->length);
+ goto retry;
+ }
+ }
+
+ arr->reg = base;
+
+ for (unsigned i = 0; i < arr->length; i++) {
+ unsigned name, reg;
+
+ name = arr->base + i;
+ reg = ctx->set->gpr_to_ra_reg[0][base++];
+
+ ra_set_node_reg(ctx->g, name, reg);
+ }
}
if (!ra_allocate(ctx->g))
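
[editor's note] The retry loop above is a first-fit placement: each array gets the lowest base register that avoids any previously placed array it conflicts with in both live range and register range; on a conflict the base is bumped past the offender and the scan restarts. Two arrays whose live ranges never overlap may therefore share the same registers.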
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index 2ee325518..8f640febc 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -34,11 +34,12 @@
/*
* Instruction Scheduling:
*
- * A priority-queue based scheduling algo. Add eligible instructions,
- * ie. ones with all their dependencies scheduled, to the priority
- * (depth) sorted queue (list). Pop highest priority instruction off
- * the queue and schedule it, add newly eligible instructions to the
- * priority queue, rinse, repeat.
+ * A recursive depth-based scheduling algo. Recursively find an eligible
+ * instruction to schedule from the deepest instruction (recursing through
+ * its unscheduled src instructions). Normally this would result in a
+ * lot of re-traversal of the same instructions, so we cache results in
+ * instr->data (and clear cached results that would be no longer valid
+ * after scheduling an instruction).
*
* There are a few special cases that need to be handled, since sched
* is currently independent of register allocation. Usages of address
@@ -52,6 +53,7 @@
struct ir3_sched_ctx {
struct ir3_block *block; /* the current block */
+ struct list_head depth_list; /* depth sorted unscheduled instrs */
struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
struct ir3_instruction *addr; /* current a0.x user, if any */
struct ir3_instruction *pred; /* current p0.x user, if any */
@@ -63,6 +65,17 @@ static bool is_sfu_or_mem(struct ir3_instruction *instr)
return is_sfu(instr) || is_mem(instr);
}
+#define NULL_INSTR ((void *)~0)
+
+static void
+clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
+{
+ list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
+ if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
+ instr2->data = NULL;
+ }
+}
+
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
@@ -93,6 +106,34 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
list_addtail(&instr->node, &instr->block->instr_list);
ctx->scheduled = instr;
+
+ if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
+ clear_cache(ctx, NULL);
+ } else {
+ /* invalidate only the necessary entries.. */
+ clear_cache(ctx, instr);
+ }
+}
+
+static struct ir3_instruction *
+deepest(struct ir3_instruction **srcs, unsigned nsrcs)
+{
+ struct ir3_instruction *d = NULL;
+ unsigned i = 0, id = 0;
+
+ while ((i < nsrcs) && !(d = srcs[id = i]))
+ i++;
+
+ if (!d)
+ return NULL;
+
+ for (; i < nsrcs; i++)
+ if (srcs[i] && (srcs[i]->depth > d->depth))
+ d = srcs[id = i];
+
+ srcs[id] = NULL;
+
+ return d;
}
static unsigned
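
[editor's note] A minimal stand-alone harness demonstrating the deepest() helper above (stub type and names are placeholders): each call returns the deepest remaining source and NULLs its slot, so repeated calls walk the sources in decreasing depth order:

	#include <assert.h>
	#include <stddef.h>

	struct instr { unsigned depth; };

	static struct instr *pop_deepest(struct instr **srcs, unsigned nsrcs)
	{
		struct instr *d = NULL;
		unsigned i = 0, id = 0;

		while ((i < nsrcs) && !(d = srcs[id = i]))
			i++;
		if (!d)
			return NULL;
		for (; i < nsrcs; i++)
			if (srcs[i] && (srcs[i]->depth > d->depth))
				d = srcs[id = i];
		srcs[id] = NULL;
		return d;
	}

	int main(void)
	{
		struct instr a = { 3 }, b = { 7 }, c = { 5 };
		struct instr *srcs[] = { &a, &b, &c };
		assert(pop_deepest(srcs, 3) == &b);   /* depth 7 */
		assert(pop_deepest(srcs, 3) == &c);   /* depth 5 */
		assert(pop_deepest(srcs, 3) == &a);   /* depth 3 */
		assert(pop_deepest(srcs, 3) == NULL); /* exhausted */
		return 0;
	}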
@@ -146,6 +187,9 @@ delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
foreach_ssa_src_n(src, i, instr) {
unsigned d;
+ /* for array writes, no need to delay on previous write: */
+ if (i == 0)
+ continue;
if (src->block != instr->block)
continue;
d = delay_calc_srcn(ctx, src, instr, i);
@@ -171,10 +215,51 @@ static bool is_scheduled(struct ir3_instruction *instr)
return !!(instr->flags & IR3_INSTR_MARK);
}
+/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
-check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+{
+ struct ir3_instruction *other_src;
+ foreach_ssa_src(other_src, instr) {
+ /* if dependency not scheduled, we aren't ready yet: */
+ if ((src != other_src) && !is_scheduled(other_src)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Check if instruction is ok to schedule. Make sure it is not blocked
+ * by use of addr/predicate register, etc.
+ */
+static bool
+check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
struct ir3_instruction *instr)
{
+ /* For instructions that write address register we need to
+ * make sure there is at least one instruction that uses the
+ * addr value which is otherwise ready.
+ *
+ * TODO if any instructions use pred register and have other
+ * src args, we would need to do the same for writes_pred()..
+ */
+ if (writes_addr(instr)) {
+ struct ir3 *ir = instr->block->shader;
+ bool ready = false;
+ for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
+ struct ir3_instruction *indirect = ir->indirects[i];
+ if (!indirect)
+ continue;
+ if (indirect->address != instr)
+ continue;
+ ready = could_sched(indirect, instr);
+ }
+
+ /* nothing could be scheduled, so keep looking: */
+ if (!ready)
+ return false;
+ }
+
/* if this is a write to address/predicate register, and that
* register is currently in use, we need to defer until it is
* free:
@@ -182,52 +267,15 @@ check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
if (writes_addr(instr) && ctx->addr) {
debug_assert(ctx->addr != instr);
notes->addr_conflict = true;
- return true;
+ return false;
}
if (writes_pred(instr) && ctx->pred) {
debug_assert(ctx->pred != instr);
notes->pred_conflict = true;
- return true;
+ return false;
}
- return false;
-}
-
-/* is this instruction ready to be scheduled? Return negative for not
- * ready (updating notes if needed), or >= 0 to indicate number of
- * delay slots needed.
- */
-static int
-instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
- struct ir3_instruction *instr)
-{
- struct ir3_instruction *src;
- unsigned delay = 0;
-
- /* Phi instructions can have a dependency on something not
- * scheduled yet (for ex, loops). But OTOH we don't really
- * care. By definition phi's should appear at the top of
- * the block, and it's sources should be values from the
- * previously executing block, so they are always ready to
- * be scheduled:
- */
- if (is_meta(instr) && (instr->opc == OPC_META_PHI))
- return 0;
-
- foreach_ssa_src(src, instr) {
- /* if dependency not scheduled, we aren't ready yet: */
- if (!is_scheduled(src))
- return -1;
- }
-
- /* all our dependents are scheduled, figure out if
- * we have enough delay slots to schedule ourself:
- */
- delay = delay_calc(ctx, instr);
- if (delay)
- return delay;
-
/* if the instruction is a kill, we need to ensure *every*
* bary.f is scheduled. The hw seems unhappy if the thread
* gets killed before the end-input (ei) flag is hit.
@@ -246,80 +294,109 @@ instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
for (unsigned i = 0; i < ir->baryfs_count; i++) {
struct ir3_instruction *baryf = ir->baryfs[i];
- if (baryf->depth == DEPTH_UNUSED)
+ if (baryf->flags & IR3_INSTR_UNUSED)
continue;
if (!is_scheduled(baryf)) {
notes->blocked_kill = true;
- return -1;
+ return false;
}
}
}
- if (check_conflict(ctx, notes, instr))
- return -1;
-
- return 0;
+ return true;
}
-/* could an instruction be scheduled if specified ssa src was scheduled? */
-static bool
-could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
+/* Find the best instruction to schedule from specified instruction or
+ * recursively its ssa sources.
+ */
+static struct ir3_instruction *
+find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+ struct ir3_instruction *instr)
{
- struct ir3_instruction *other_src;
- foreach_ssa_src(other_src, instr) {
- /* if dependency not scheduled, we aren't ready yet: */
- if ((src != other_src) && !is_scheduled(other_src)) {
- return false;
+ struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
+ struct ir3_instruction *src;
+ unsigned nsrcs = 0;
+
+ if (is_scheduled(instr))
+ return NULL;
+
+ /* use instr->data to cache the results of recursing up the
+ * instr src's. Otherwise the recursive algo can scale quite
+ * badly w/ shader size. But this takes some care to clear
+ * the cache appropriately when instructions are scheduled.
+ */
+ if (instr->data) {
+ if (instr->data == NULL_INSTR)
+ return NULL;
+ return instr->data;
+ }
+
+ /* find unscheduled srcs: */
+ foreach_ssa_src(src, instr) {
+ if (!is_scheduled(src)) {
+ debug_assert(nsrcs < ARRAY_SIZE(srcs));
+ srcs[nsrcs++] = src;
}
}
- return true;
+
+ /* if all our src's are already scheduled: */
+ if (nsrcs == 0) {
+ if (check_instr(ctx, notes, instr)) {
+ instr->data = instr;
+ return instr;
+ }
+ return NULL;
+ }
+
+ while ((src = deepest(srcs, nsrcs))) {
+ struct ir3_instruction *candidate;
+
+ candidate = find_instr_recursive(ctx, notes, src);
+ if (!candidate)
+ continue;
+
+ if (check_instr(ctx, notes, candidate)) {
+ instr->data = candidate;
+ return candidate;
+ }
+ }
+
+ instr->data = NULL_INSTR;
+ return NULL;
}
-/* move eligible instructions to the priority list: */
-static unsigned
-add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
- struct list_head *prio_queue, struct list_head *unscheduled_list)
+/* find instruction to schedule: */
+static struct ir3_instruction *
+find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
+ struct ir3_instruction *best_instr = NULL;
unsigned min_delay = ~0;
- list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
- int e = instr_eligibility(ctx, notes, instr);
- if (e < 0)
- continue;
+ /* TODO we'd really rather use the list/array of block outputs. But we
+ * don't have such a thing. Recursing *every* instruction in the list
+ * will result in a lot of repeated traversal, since instructions will
+ * get traversed both when they appear as ssa src to a later instruction
+ * as well as where they appear in the depth_list.
+ */
+ list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
+ struct ir3_instruction *candidate;
+ unsigned delay;
- /* For instructions that write address register we need to
- * make sure there is at least one instruction that uses the
- * addr value which is otherwise ready.
- *
- * TODO if any instructions use pred register and have other
- * src args, we would need to do the same for writes_pred()..
- */
- if (unlikely(writes_addr(instr))) {
- struct ir3 *ir = instr->block->shader;
- bool ready = false;
- for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
- struct ir3_instruction *indirect = ir->indirects[i];
- if (!indirect)
- continue;
- if (indirect->address != instr)
- continue;
- ready = could_sched(indirect, instr);
- }
+ candidate = find_instr_recursive(ctx, notes, instr);
+ if (!candidate)
+ continue;
- /* nothing could be scheduled, so keep looking: */
- if (!ready)
- continue;
+ delay = delay_calc(ctx, candidate);
+ if (delay < min_delay) {
+ best_instr = candidate;
+ min_delay = delay;
}
- min_delay = MIN2(min_delay, e);
- if (e == 0) {
- /* remove from unscheduled list and into priority queue: */
- list_delinit(&instr->node);
- ir3_insert_by_depth(instr, prio_queue);
- }
+ if (min_delay == 0)
+ break;
}
- return min_delay;
+ return best_instr;
}
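The `while ((src = deepest(srcs, nsrcs)))` loop in find_instr_recursive() terminates because each call is expected to hand back the unscheduled src with the greatest depth and strike it from the array. The helper itself is not in this hunk; a plausible sketch of that contract (assuming ->depth holds the depth-pass result):

/* Sketch of a deepest() helper matching how find_instr_recursive()
 * uses it: return the remaining entry with the largest ->depth and
 * NULL it out of the array so the caller's loop makes progress.
 */
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned id = 0;

	for (unsigned i = 0; i < nsrcs; i++) {
		if (srcs[i] && (!d || (srcs[i]->depth > d->depth))) {
			d = srcs[i];
			id = i;
		}
	}

	if (d)
		srcs[id] = NULL;   /* consume it, so the loop terminates */

	return d;
}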
/* "spill" the address register by remapping any unscheduled
@@ -413,50 +490,55 @@ split_pred(struct ir3_sched_ctx *ctx)
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
- struct list_head unscheduled_list, prio_queue;
+ struct list_head unscheduled_list;
ctx->block = block;
+ /* addr/pred writes are per-block: */
+ ctx->addr = NULL;
+ ctx->pred = NULL;
+
/* move all instructions to the unscheduled list, and
* empty the block's instruction list (to which we will
- * be inserting.
+ * be inserting).
*/
list_replace(&block->instr_list, &unscheduled_list);
list_inithead(&block->instr_list);
- list_inithead(&prio_queue);
+ list_inithead(&ctx->depth_list);
/* first a pre-pass to schedule all meta:input/phi instructions
* (which need to appear first so that RA knows the register is
- * occupied:
+	 * occupied), and move the remaining to the depth-sorted list:
*/
list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
- (instr->opc == OPC_META_PHI)))
+ (instr->opc == OPC_META_PHI))) {
schedule(ctx, instr);
+ } else {
+ ir3_insert_by_depth(instr, &ctx->depth_list);
+ }
}
- while (!(list_empty(&unscheduled_list) &&
- list_empty(&prio_queue))) {
+ while (!list_empty(&ctx->depth_list)) {
struct ir3_sched_notes notes = {0};
- unsigned delay;
+ struct ir3_instruction *instr;
+
+ instr = find_eligible_instr(ctx, &notes);
- delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
+ if (instr) {
+ unsigned delay = delay_calc(ctx, instr);
- if (!list_empty(&prio_queue)) {
- struct ir3_instruction *instr = list_last_entry(&prio_queue,
- struct ir3_instruction, node);
- /* ugg, this is a bit ugly, but between the time when
- * the instruction became eligible and now, a new
- * conflict may have arose..
+ /* and if we run out of instructions that can be scheduled,
+			 * then it is time for nops:
*/
- if (check_conflict(ctx, &notes, instr)) {
- list_del(&instr->node);
- list_addtail(&instr->node, &unscheduled_list);
- continue;
+ debug_assert(delay <= 6);
+ while (delay > 0) {
+ ir3_NOP(block);
+ delay--;
}
schedule(ctx, instr);
- } else if (delay == ~0) {
+ } else {
struct ir3_instruction *new_instr = NULL;
/* nothing available to schedule.. if we are blocked on
@@ -475,23 +557,17 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
}
if (new_instr) {
- list_del(&new_instr->node);
- list_addtail(&new_instr->node, &unscheduled_list);
+ /* clearing current addr/pred can change what is
+ * available to schedule, so clear cache..
+ */
+ clear_cache(ctx, NULL);
+
+ ir3_insert_by_depth(new_instr, &ctx->depth_list);
/* the original instr that wrote addr/pred may have
* originated from a different block:
*/
new_instr->block = block;
}
-
- } else {
- /* and if we run out of instructions that can be scheduled,
- * then it is time for nop's:
- */
- debug_assert(delay <= 6);
- while (delay > 0) {
- ir3_NOP(block);
- delay--;
- }
}
}
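clear_cache() itself is outside this hunk; given the NULL_INSTR memoization above, a plausible shape is a walk over the depth list that wipes the scratch pointers: all of them when called with NULL, otherwise only entries whose cached answer could be invalidated by instr becoming schedulable. A hedged sketch:

/* Sketch: invalidate the instr->data memo.  With a NULL argument,
 * wipe everything; otherwise only entries that cached 'instr' as
 * their candidate, or that cached a negative result which the
 * newly schedulable instr might now satisfy.
 */
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2,
			&ctx->depth_list, node) {
		if (!instr || (instr2->data == instr) ||
				(instr2->data == NULL_INSTR))
			instr2->data = NULL;
	}
}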
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 312174c0c..7d17f426a 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -39,7 +39,7 @@
#include "ir3_shader.h"
#include "ir3_compiler.h"
-
+#include "ir3_nir.h"
static void
delete_variant(struct ir3_shader_variant *v)
@@ -187,12 +187,6 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
v->key = key;
v->type = shader->type;
- if (fd_mesa_debug & FD_DBG_DISASM) {
- DBG("dump tgsi: type=%d, k={bp=%u,cts=%u,hp=%u}", shader->type,
- key.binning_pass, key.color_two_side, key.half_precision);
- tgsi_dump(shader->tokens, 0);
- }
-
ret = ir3_compile_shader_nir(shader->compiler, v);
if (ret) {
debug_error("compile failed!");
@@ -267,7 +261,7 @@ ir3_shader_destroy(struct ir3_shader *shader)
v = v->next;
delete_variant(t);
}
- free((void *)shader->tokens);
+ ralloc_free(shader->nir);
free(shader);
}
@@ -281,14 +275,24 @@ ir3_shader_create(struct pipe_context *pctx,
shader->id = ++shader->compiler->shader_count;
shader->pctx = pctx;
shader->type = type;
- shader->tokens = tgsi_dup_tokens(cso->tokens);
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ DBG("dump tgsi: type=%d", shader->type);
+ tgsi_dump(cso->tokens, 0);
+ }
+ nir_shader *nir = ir3_tgsi_to_nir(cso->tokens);
+ /* do first pass optimization, ignoring the key: */
+ shader->nir = ir3_optimize_nir(shader, nir, NULL);
+ if (fd_mesa_debug & FD_DBG_DISASM) {
+ DBG("dump nir%d: type=%d", shader->id, shader->type);
+ nir_print_shader(shader->nir, stdout);
+ }
shader->stream_output = cso->stream_output;
if (fd_mesa_debug & FD_DBG_SHADERDB) {
/* if shader-db run, create a standard variant immediately
* (as otherwise nothing will trigger the shader to be
* actually compiled)
*/
- static struct ir3_shader_key key = {};
+ static struct ir3_shader_key key = {0};
ir3_shader_variant(shader, key);
}
return shader;
@@ -300,11 +304,11 @@ static void dump_reg(const char *name, uint32_t r)
debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
}
-static void dump_semantic(struct ir3_shader_variant *so,
- unsigned sem, const char *name)
+static void dump_output(struct ir3_shader_variant *so,
+ unsigned slot, const char *name)
{
uint32_t regid;
- regid = ir3_find_output_regid(so, ir3_semantic_name(sem, 0));
+ regid = ir3_find_output_regid(so, slot);
dump_reg(name, regid);
}
@@ -355,27 +359,51 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
disasm_a3xx(bin, so->info.sizedwords, 0, so->type);
- debug_printf("; %s: outputs:", type);
- for (i = 0; i < so->outputs_count; i++) {
- uint8_t regid = so->outputs[i].regid;
- ir3_semantic sem = so->outputs[i].semantic;
- debug_printf(" r%d.%c (%u:%u)",
- (regid >> 2), "xyzw"[regid & 0x3],
- sem2name(sem), sem2idx(sem));
- }
- debug_printf("\n");
- debug_printf("; %s: inputs:", type);
- for (i = 0; i < so->inputs_count; i++) {
- uint8_t regid = so->inputs[i].regid;
- ir3_semantic sem = so->inputs[i].semantic;
- debug_printf(" r%d.%c (%u:%u,cm=%x,il=%u,b=%u)",
- (regid >> 2), "xyzw"[regid & 0x3],
- sem2name(sem), sem2idx(sem),
- so->inputs[i].compmask,
- so->inputs[i].inloc,
- so->inputs[i].bary);
+ switch (so->type) {
+ case SHADER_VERTEX:
+ debug_printf("; %s: outputs:", type);
+ for (i = 0; i < so->outputs_count; i++) {
+ uint8_t regid = so->outputs[i].regid;
+ debug_printf(" r%d.%c (%s)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ gl_varying_slot_name(so->outputs[i].slot));
+ }
+ debug_printf("\n");
+ debug_printf("; %s: inputs:", type);
+ for (i = 0; i < so->inputs_count; i++) {
+ uint8_t regid = so->inputs[i].regid;
+ debug_printf(" r%d.%c (cm=%x,il=%u,b=%u)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ so->inputs[i].compmask,
+ so->inputs[i].inloc,
+ so->inputs[i].bary);
+ }
+ debug_printf("\n");
+ break;
+ case SHADER_FRAGMENT:
+ debug_printf("; %s: outputs:", type);
+ for (i = 0; i < so->outputs_count; i++) {
+ uint8_t regid = so->outputs[i].regid;
+ debug_printf(" r%d.%c (%s)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ gl_frag_result_name(so->outputs[i].slot));
+ }
+ debug_printf("\n");
+ debug_printf("; %s: inputs:", type);
+ for (i = 0; i < so->inputs_count; i++) {
+ uint8_t regid = so->inputs[i].regid;
+ debug_printf(" r%d.%c (%s,cm=%x,il=%u,b=%u)",
+ (regid >> 2), "xyzw"[regid & 0x3],
+ gl_varying_slot_name(so->inputs[i].slot),
+ so->inputs[i].compmask,
+ so->inputs[i].inloc,
+ so->inputs[i].bary);
+ }
+ debug_printf("\n");
+ break;
+ case SHADER_COMPUTE:
+ break;
}
- debug_printf("\n");
/* print generic shader info: */
debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n",
@@ -391,13 +419,24 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
/* print shader type specific info: */
switch (so->type) {
case SHADER_VERTEX:
- dump_semantic(so, TGSI_SEMANTIC_POSITION, "pos");
- dump_semantic(so, TGSI_SEMANTIC_PSIZE, "psize");
+ dump_output(so, VARYING_SLOT_POS, "pos");
+ dump_output(so, VARYING_SLOT_PSIZ, "psize");
break;
case SHADER_FRAGMENT:
dump_reg("pos (bary)", so->pos_regid);
- dump_semantic(so, TGSI_SEMANTIC_POSITION, "posz");
- dump_semantic(so, TGSI_SEMANTIC_COLOR, "color");
+ dump_output(so, FRAG_RESULT_DEPTH, "posz");
+ if (so->color0_mrt) {
+ dump_output(so, FRAG_RESULT_COLOR, "color");
+ } else {
+ dump_output(so, FRAG_RESULT_DATA0, "data0");
+ dump_output(so, FRAG_RESULT_DATA1, "data1");
+ dump_output(so, FRAG_RESULT_DATA2, "data2");
+ dump_output(so, FRAG_RESULT_DATA3, "data3");
+ dump_output(so, FRAG_RESULT_DATA4, "data4");
+ dump_output(so, FRAG_RESULT_DATA5, "data5");
+ dump_output(so, FRAG_RESULT_DATA6, "data6");
+ dump_output(so, FRAG_RESULT_DATA7, "data7");
+ }
/* these two are hard-coded since we don't know how to
* program them to anything but all 0's...
*/
@@ -466,7 +505,7 @@ static void
emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
struct fd_constbuf_stateobj *constbuf)
{
- uint32_t offset = v->first_driver_param; /* UBOs after user consts */
+ uint32_t offset = v->first_driver_param + IR3_UBOS_OFF;
if (v->constlen > offset) {
struct fd_context *ctx = fd_context(v->shader->pctx);
uint32_t params = MIN2(4, v->constlen - offset) * 4;
@@ -519,7 +558,8 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
static void
emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
{
- uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/
+ /* streamout addresses after driver-params: */
+ uint32_t offset = v->first_driver_param + IR3_TFBOS_OFF;
if (v->constlen > offset) {
struct fd_context *ctx = fd_context(v->shader->pctx);
struct fd_streamout_stateobj *so = &ctx->streamout;
@@ -622,17 +662,33 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
/* emit driver params every time: */
/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
if (info && (v->type == SHADER_VERTEX)) {
- uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */
+ uint32_t offset = v->first_driver_param + IR3_DRIVER_PARAM_OFF;
if (v->constlen >= offset) {
- uint32_t vertex_params[4] = {
+ uint32_t vertex_params[IR3_DP_COUNT] = {
[IR3_DP_VTXID_BASE] = info->indexed ?
info->index_bias : info->start,
[IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
};
+			/* if no user clip planes are enabled, we don't
+			 * need to emit the entire array:
+			 */
+ uint32_t vertex_params_size = 4;
+
+ if (v->key.ucp_enables) {
+ struct pipe_clip_state *ucp = &ctx->ucp;
+ unsigned pos = IR3_DP_UCP0_X;
+ for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) {
+ for (unsigned j = 0; j < 4; j++) {
+ vertex_params[pos] = fui(ucp->ucp[i][j]);
+ pos++;
+ }
+ }
+ vertex_params_size = ARRAY_SIZE(vertex_params);
+ }
fd_wfi(ctx, ring);
ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
- ARRAY_SIZE(vertex_params), vertex_params, NULL);
+ vertex_params_size, vertex_params, NULL);
/* if needed, emit stream-out buffer addresses: */
if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
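In the loop above, plane i's component j lands at IR3_DP_UCP0_X + 4*i + j, so plane 7's .w component is 4 + 28 + 3 = 35, exactly IR3_DP_UCP7_W; when any clip plane is enabled the full 36-entry array is emitted. A standalone sketch of that packing (fui() rewritten here as the usual float-to-uint bit cast; the plane values would come from the pipe_clip_state):

#include <stdint.h>
#include <string.h>

/* float -> uint32_t bit cast, as gallium's fui() does: */
static uint32_t fui(float f)
{
	uint32_t u;
	memcpy(&u, &f, sizeof(u));
	return u;
}

enum { DP_UCP0_X = 4, DP_UCP7_W = 35, DP_COUNT = 36 };

/* pack 8 vec4 clip planes into the driver-param image: */
static void pack_ucps(uint32_t params[DP_COUNT], const float ucp[8][4])
{
	unsigned pos = DP_UCP0_X;
	for (unsigned i = 0; pos <= DP_UCP7_W; i++)
		for (unsigned j = 0; j < 4; j++)
			params[pos++] = fui(ucp[i][j]);
}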
diff --git a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 1bbbdbd22..03d4fa2e9 100644
--- a/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/lib/mesa/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -30,6 +30,7 @@
#define IR3_SHADER_H_
#include "pipe/p_state.h"
+#include "compiler/shader_enums.h"
#include "ir3.h"
#include "disasm.h"
@@ -38,29 +39,28 @@
enum ir3_driver_param {
IR3_DP_VTXID_BASE = 0,
IR3_DP_VTXCNT_MAX = 1,
+	/* user-clip-plane components, up to 8x vec4s: */
+ IR3_DP_UCP0_X = 4,
+ /* .... */
+ IR3_DP_UCP7_W = 35,
+ IR3_DP_COUNT = 36 /* must be aligned to vec4 */
};
-/* internal semantic used for passing vtxcnt to vertex shader to
- * implement transform feedback:
+/* Layout of constant registers:
+ *
+ * num_uniform * vec4 - user consts
+ * 4 * vec4 - UBO addresses
+ * if (vertex shader) {
+ * N * vec4 - driver params (IR3_DP_*)
+ * 1 * vec4 - stream-out addresses
+ * }
+ *
+ * TODO this could be made more dynamic, to at least skip sections
+ * that we don't need..
*/
-#define IR3_SEMANTIC_VTXCNT (TGSI_SEMANTIC_COUNT + 0)
-
-typedef uint16_t ir3_semantic; /* semantic name + index */
-static inline ir3_semantic
-ir3_semantic_name(uint8_t name, uint16_t index)
-{
- return (name << 8) | (index & 0xff);
-}
-
-static inline uint8_t sem2name(ir3_semantic sem)
-{
- return sem >> 8;
-}
-
-static inline uint16_t sem2idx(ir3_semantic sem)
-{
- return sem & 0xff;
-}
+#define IR3_UBOS_OFF 0 /* UBOs after user consts */
+#define IR3_DRIVER_PARAM_OFF 4 /* driver params after UBOs */
+#define IR3_TFBOS_OFF (IR3_DRIVER_PARAM_OFF + IR3_DP_COUNT/4)
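The three offsets compose with first_driver_param per the layout comment above. For a hypothetical vertex shader with 10 vec4s of user consts (so first_driver_param = 10): UBO addresses occupy vec4s 10..13, driver params 14..22 (IR3_DP_COUNT/4 = 9 vec4s), and stream-out addresses start at 23. In code:

/* worked example: made-up vertex shader with 10 vec4s of user consts */
static void layout_example(void)
{
	unsigned first_driver_param = 10;
	unsigned ubo_base  = first_driver_param + IR3_UBOS_OFF;         /* 10 */
	unsigned dp_base   = first_driver_param + IR3_DRIVER_PARAM_OFF; /* 14 */
	unsigned tfbo_base = first_driver_param + IR3_TFBOS_OFF;        /* 23 */
	/* IR3_TFBOS_OFF == 4 + 36/4 == 13: stream-out addresses follow
	 * the nine vec4s of driver params immediately.
	 */
	(void)ubo_base; (void)dp_base; (void)tfbo_base;
}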
/* Configuration key used to identify a shader variant.. different
* shader variants can be used to implement features not supported
@@ -69,6 +69,11 @@ static inline uint16_t sem2idx(ir3_semantic sem)
struct ir3_shader_key {
union {
struct {
+ /*
+ * Combined Vertex/Fragment shader parameters:
+ */
+ unsigned ucp_enables : 8;
+
/* do we need to check {v,f}saturate_{s,t,r}? */
unsigned has_per_samp : 1;
@@ -82,8 +87,8 @@ struct ir3_shader_key {
*/
unsigned color_two_side : 1;
unsigned half_precision : 1;
- /* used when shader needs to handle flat varyings (a4xx),
- * for TGSI_INTERPOLATE_COLOR:
+ /* used when shader needs to handle flat varyings (a4xx)
+ * for front/back color inputs to frag shader:
*/
unsigned rasterflat : 1;
};
@@ -147,18 +152,26 @@ struct ir3_shader_variant {
uint8_t pos_regid;
bool frag_coord, frag_face, color0_mrt;
+ /* NOTE: for input/outputs, slot is:
+ * gl_vert_attrib - for VS inputs
+ * gl_varying_slot - for VS output / FS input
+ * gl_frag_result - for FS output
+ */
+
/* varyings/outputs: */
unsigned outputs_count;
struct {
- ir3_semantic semantic;
+ uint8_t slot;
uint8_t regid;
} outputs[16 + 2]; /* +POSITION +PSIZE */
bool writes_pos, writes_psize;
- /* vertices/inputs: */
+ /* attributes (VS) / varyings (FS):
+	 * Note that sysvals should come *after* normal inputs.
+ */
unsigned inputs_count;
struct {
- ir3_semantic semantic;
+ uint8_t slot;
uint8_t regid;
uint8_t compmask;
uint8_t ncomp;
@@ -174,11 +187,23 @@ struct ir3_shader_variant {
* spots where inloc is used.
*/
uint8_t inloc;
- uint8_t bary;
- uint8_t interpolate;
+ /* vertex shader specific: */
+ bool sysval : 1; /* slot is a gl_system_value */
+ /* fragment shader specific: */
+ bool bary : 1; /* fetched varying (vs one loaded into reg) */
+ bool rasterflat : 1; /* special handling for emit->rasterflat */
+ enum glsl_interp_qualifier interpolate;
} inputs[16 + 2]; /* +POSITION +FACE */
- unsigned total_in; /* sum of inputs (scalar) */
+ /* sum of input components (scalar). For frag shaders, it only counts
+ * the varying inputs:
+ */
+ unsigned total_in;
+
+ /* For frag shaders, the total number of inputs (not scalar,
+	 * i.e. SP_VS_PARAM_REG.TOTALVSOUTVAR)
+ */
+ unsigned varying_in;
/* do we have one or more texture sample instructions: */
bool has_samp;
@@ -205,6 +230,8 @@ struct ir3_shader_variant {
struct ir3_shader *shader;
};
+typedef struct nir_shader nir_shader;
+
struct ir3_shader {
enum shader_t type;
@@ -214,8 +241,8 @@ struct ir3_shader {
struct ir3_compiler *compiler;
- struct pipe_context *pctx;
- const struct tgsi_token *tokens;
+ struct pipe_context *pctx; /* TODO replace w/ pipe_screen */
+ nir_shader *nir;
struct pipe_stream_output_info stream_output;
struct ir3_shader_variant *variants;
@@ -254,12 +281,12 @@ ir3_shader_stage(struct ir3_shader *shader)
#include "pipe/p_shader_tokens.h"
static inline int
-ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
+ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
{
int j;
for (j = 0; j < so->outputs_count; j++)
- if (so->outputs[j].semantic == semantic)
+ if (so->outputs[j].slot == slot)
return j;
/* it seems optional to have an OUT.BCOLOR[n] for each OUT.COLOR[n]
@@ -269,18 +296,20 @@ ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
* OUT.COLOR[n] to IN.BCOLOR[n]. And vice versa if there is only
* an OUT.BCOLOR[n] but no matching OUT.COLOR[n]
*/
- if (sem2name(semantic) == TGSI_SEMANTIC_BCOLOR) {
- unsigned idx = sem2idx(semantic);
- semantic = ir3_semantic_name(TGSI_SEMANTIC_COLOR, idx);
- } else if (sem2name(semantic) == TGSI_SEMANTIC_COLOR) {
- unsigned idx = sem2idx(semantic);
- semantic = ir3_semantic_name(TGSI_SEMANTIC_BCOLOR, idx);
+ if (slot == VARYING_SLOT_BFC0) {
+ slot = VARYING_SLOT_COL0;
+ } else if (slot == VARYING_SLOT_BFC1) {
+ slot = VARYING_SLOT_COL1;
+ } else if (slot == VARYING_SLOT_COL0) {
+ slot = VARYING_SLOT_BFC0;
+ } else if (slot == VARYING_SLOT_COL1) {
+ slot = VARYING_SLOT_BFC1;
} else {
return 0;
}
for (j = 0; j < so->outputs_count; j++)
- if (so->outputs[j].semantic == semantic)
+ if (so->outputs[j].slot == slot)
return j;
debug_assert(0);
@@ -298,11 +327,11 @@ ir3_next_varying(const struct ir3_shader_variant *so, int i)
}
static inline uint32_t
-ir3_find_output_regid(const struct ir3_shader_variant *so, ir3_semantic semantic)
+ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
{
int j;
for (j = 0; j < so->outputs_count; j++)
- if (so->outputs[j].semantic == semantic)
+ if (so->outputs[j].slot == slot)
return so->outputs[j].regid;
return regid(63, 0);
}
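A caller's-eye view of the helpers above: regid(63, 0) is the "not written" encoding that ir3_find_output_regid() returns when the slot is absent, and ir3_find_output() transparently retries COL0/COL1 as BFC0/BFC1 (and vice versa) so two-sided color lookups succeed from either direction. A small hedged usage sketch:

/* Sketch: ask where the fragment shader wrote depth; treat
 * regid(63, 0) as "output not present":
 */
static void emit_depth_output(const struct ir3_shader_variant *so)
{
	uint32_t depth = ir3_find_output_regid(so, FRAG_RESULT_DEPTH);
	if (depth != regid(63, 0)) {
		/* variant writes gl_FragDepth: point the hardware's
		 * depth-output register field (generation-specific
		 * name) at 'depth' here
		 */
	}
}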