77 files changed, 3381 insertions, 760 deletions
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/lib/mesa/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
index b7c77bfd8..dceae1fd0 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 #include "util/u_bitcast.h"
+#include <math.h>
 
 static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
                                  struct draw_vertex_info *info,
@@ -182,11 +183,10 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
        * to NaN to help catch potential errors later.
        */
       else {
-         float zero = 0.0f;
          position[0] =
          position[1] =
          position[2] =
-         position[3] = zero / zero; /* MSVC doesn't accept 0.0 / 0.0 */
+         position[3] = NAN;
       }
 #endif
 
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/lib/mesa/src/gallium/auxiliary/draw/draw_decompose_tmp.h
index e9f3b1d88..7686afebe 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_decompose_tmp.h
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_decompose_tmp.h
@@ -170,7 +170,10 @@ FUNC(FUNC_VARS)
             idx[1] = GET_ELT(i + 1);
             idx[2] = GET_ELT(i + 2);
             idx[3] = GET_ELT(i + 3);
-
+#ifdef PASS_QUADS
+            QUAD(0, idx[0], idx[1],
+                  idx[2], idx[3]);
+#else
             flags = DRAW_PIPE_RESET_STIPPLE |
                     DRAW_PIPE_EDGE_FLAG_0 |
                     DRAW_PIPE_EDGE_FLAG_2;
@@ -180,6 +183,7 @@ FUNC(FUNC_VARS)
             flags = DRAW_PIPE_EDGE_FLAG_0 |
                     DRAW_PIPE_EDGE_FLAG_1;
             TRIANGLE(flags, idx[1], idx[2], idx[3]);
+#endif
          }
       }
       else {
@@ -188,7 +192,10 @@ FUNC(FUNC_VARS)
             idx[1] = GET_ELT(i + 1);
             idx[2] = GET_ELT(i + 2);
             idx[3] = GET_ELT(i + 3);
-
+#ifdef PASS_QUADS
+            QUAD(0, idx[0], idx[1],
+                  idx[2], idx[3]);
+#else
             flags = DRAW_PIPE_RESET_STIPPLE |
                     DRAW_PIPE_EDGE_FLAG_0 |
                     DRAW_PIPE_EDGE_FLAG_1;
@@ -204,6 +211,7 @@ FUNC(FUNC_VARS)
                TRIANGLE(flags, idx[3], idx[1], idx[2]);
             else
                TRIANGLE(flags, idx[0], idx[2], idx[3]);
+#endif
          }
       }
       break;
@@ -220,6 +228,10 @@ FUNC(FUNC_VARS)
                idx[2] = GET_ELT(i + 2);
                idx[3] = GET_ELT(i + 3);
 
+#ifdef PASS_QUADS
+               QUAD(0, idx[2], idx[0],
+                    idx[1], idx[3]);
+#else
                /* always emit idx[3] last */
                flags = DRAW_PIPE_RESET_STIPPLE |
                        DRAW_PIPE_EDGE_FLAG_0 |
@@ -229,6 +241,7 @@ FUNC(FUNC_VARS)
                flags = DRAW_PIPE_EDGE_FLAG_0 |
                        DRAW_PIPE_EDGE_FLAG_1;
                TRIANGLE(flags, idx[0], idx[1], idx[3]);
+#endif
             }
          }
          else {
@@ -238,6 +251,10 @@ FUNC(FUNC_VARS)
                idx[2] = GET_ELT(i + 2);
                idx[3] = GET_ELT(i + 3);
 
+#ifdef PASS_QUADS
+               QUAD(0, idx[3], idx[2],
+                    idx[0], idx[1]);
+#else
                flags = DRAW_PIPE_RESET_STIPPLE |
                        DRAW_PIPE_EDGE_FLAG_0 |
                        DRAW_PIPE_EDGE_FLAG_1;
@@ -253,6 +270,7 @@ FUNC(FUNC_VARS)
                   TRIANGLE(flags, idx[3], idx[0], idx[1]);
                else
                   TRIANGLE(flags, idx[0], idx[1], idx[3]);
+#endif
             }
          }
       }
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_gs.c b/lib/mesa/src/gallium/auxiliary/draw/draw_gs.c
index ed698e920..90e66b643 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_gs.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_gs.c
@@ -834,12 +834,18 @@ draw_create_geometry_shader(struct draw_context *draw,
    gs->primitive_boundary = gs->max_output_vertices + 1;
 
    gs->position_output = -1;
+   bool found_clipvertex = false;
    for (i = 0; i < gs->info.num_outputs; i++) {
       if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
           gs->info.output_semantic_index[i] == 0)
          gs->position_output = i;
       if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_VIEWPORT_INDEX)
          gs->viewport_index_output = i;
+      if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_CLIPVERTEX &&
+          gs->info.output_semantic_index[i] == 0) {
+         found_clipvertex = true;
+         gs->clipvertex_output = i;
+      }
       if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_CLIPDIST) {
          debug_assert(gs->info.output_semantic_index[i] <
                       PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT);
@@ -847,6 +853,9 @@ draw_create_geometry_shader(struct draw_context *draw,
       }
    }
 
+   if (!found_clipvertex)
+      gs->clipvertex_output = gs->position_output;
+
    gs->machine = draw->gs.tgsi.machine;
 
    gs->num_vertex_streams = 1;
@@ -900,6 +909,7 @@ void draw_bind_geometry_shader(struct draw_context *draw,
       draw->gs.geometry_shader = dgs;
       draw->gs.num_gs_outputs = dgs->info.num_outputs;
       draw->gs.position_output = dgs->position_output;
+      draw->gs.clipvertex_output = dgs->clipvertex_output;
       draw_geometry_shader_prepare(dgs, draw);
    }
    else {
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_gs.h b/lib/mesa/src/gallium/auxiliary/draw/draw_gs.h
index 9449ec509..10969426f 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_gs.h
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_gs.h
@@ -75,6 +75,7 @@ struct draw_geometry_shader {
    struct tgsi_shader_info info;
    unsigned position_output;
    unsigned viewport_index_output;
+   unsigned clipvertex_output;
    unsigned ccdistance_output[PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT];
 
    unsigned max_output_vertices;
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_llvm_sample.c b/lib/mesa/src/gallium/auxiliary/draw/draw_llvm_sample.c
index a3895c798..b3e98be55 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -289,6 +289,7 @@ DRAW_LLVM_SAMPLER_MEMBER(min_lod,    DRAW_JIT_SAMPLER_MIN_LOD, TRUE)
 DRAW_LLVM_SAMPLER_MEMBER(max_lod,    DRAW_JIT_SAMPLER_MAX_LOD, TRUE)
 DRAW_LLVM_SAMPLER_MEMBER(lod_bias,   DRAW_JIT_SAMPLER_LOD_BIAS, TRUE)
 DRAW_LLVM_SAMPLER_MEMBER(border_color, DRAW_JIT_SAMPLER_BORDER_COLOR, FALSE)
+DRAW_LLVM_SAMPLER_MEMBER(max_aniso,  DRAW_JIT_SAMPLER_MAX_ANISO, TRUE)
 
 #define DRAW_LLVM_IMAGE_MEMBER(_name, _index, _emit_load)  \
    static LLVMValueRef \
@@ -405,6 +406,7 @@ draw_llvm_sampler_soa_create(const struct draw_sampler_static_state *static_stat
    sampler->dynamic_state.base.max_lod = draw_llvm_sampler_max_lod;
    sampler->dynamic_state.base.lod_bias = draw_llvm_sampler_lod_bias;
    sampler->dynamic_state.base.border_color = draw_llvm_sampler_border_color;
+   sampler->dynamic_state.base.max_aniso = draw_llvm_sampler_max_aniso;
    sampler->dynamic_state.static_state = static_state;
 
    sampler->nr_samplers = nr_samplers;
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_offset.c b/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_offset.c
index 08d47f005..87db9cdda 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -97,7 +97,7 @@ static void do_offset_tri( struct draw_stage *stage,
    if (stage->draw->floating_point_depth) {
       float bias;
       union fi maxz;
-      maxz.f = MAX3(v0[2], v1[2], v2[2]);
+      maxz.f = MAX3(fabs(v0[2]), fabs(v1[2]), fabs(v2[2]));
       /* just do the math directly on shifted number */
       maxz.ui &= 0xff << 23;
       maxz.i -= 23 << 23;
@@ -187,7 +187,7 @@ static void offset_first_tri( struct draw_stage *stage,
       if (stage->draw->floating_point_depth) {
          offset->units = (float) rast->offset_units;
       } else {
-         offset->units = (float) (rast->offset_units * stage->draw->mrd);
+         offset->units = (float) (rast->offset_units * stage->draw->mrd * 2);
       }
    }
    else {
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index eb7ad8bf2..d5f757eb1 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -109,6 +109,7 @@ struct pstip_stage
                                     enum pipe_shader_type shader,
                                     unsigned start, unsigned count,
                                     unsigned unbind_num_trailing_slots,
+                                    bool take_ownership,
                                     struct pipe_sampler_view **);
 
    void (*driver_set_polygon_stipple)(struct pipe_context *,
@@ -224,7 +225,8 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
                                      num_samplers, pstip->state.samplers);
 
    pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                   num_sampler_views, 0, pstip->state.sampler_views);
+                                   num_sampler_views, 0, false,
+                                   pstip->state.sampler_views);
 
    draw->suspend_flushing = FALSE;
 
@@ -253,7 +255,7 @@ pstip_flush(struct draw_stage *stage, unsigned flags)
                                      pstip->state.samplers);
 
    pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                   pstip->num_sampler_views, 0,
+                                   pstip->num_sampler_views, 0, false,
                                    pstip->state.sampler_views);
 
    draw->suspend_flushing = FALSE;
@@ -418,6 +420,7 @@ pstip_set_sampler_views(struct pipe_context *pipe,
                         enum pipe_shader_type shader,
                         unsigned start, unsigned num,
                         unsigned unbind_num_trailing_slots,
+                        bool take_ownership,
                         struct pipe_sampler_view **views)
 {
    struct pstip_stage *pstip = pstip_stage_from_pipe(pipe);
@@ -438,7 +441,7 @@ pstip_set_sampler_views(struct pipe_context *pipe,
 
    /* pass-through */
    pstip->driver_set_sampler_views(pstip->pipe, shader, start, num,
-                                   unbind_num_trailing_slots, views);
+                                   unbind_num_trailing_slots, take_ownership, views);
 }
 
 
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_prim_assembler.c b/lib/mesa/src/gallium/auxiliary/draw/draw_prim_assembler.c
index e628a143d..9a957f33f 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_prim_assembler.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_prim_assembler.c
@@ -201,6 +201,28 @@ prim_tri(struct draw_assembler *asmblr,
    copy_verts(asmblr, indices, 3);
 }
 
+/* Assemble one quad from four vertex indices: optionally stamp the current
+ * primitive id on every corner, then add the primitive and copy its verts. */
+static void
+prim_quad(struct draw_assembler *asmblr,
+          unsigned i0, unsigned i1,
+          unsigned i2, unsigned i3)
+{
+   unsigned corners[4] = { i0, i1, i2, i3 };
+   unsigned c;
+
+   if (asmblr->needs_primid) {
+      /* the id counter advances once per quad, with the last corner */
+      for (c = 0; c < 3; c++)
+         inject_primid(asmblr, corners[c], asmblr->primid);
+      inject_primid(asmblr, corners[3], asmblr->primid++);
+   }
+
+   /* a single primitive made of four vertices */
+   add_prim(asmblr, 4);
+   copy_verts(asmblr, corners, 4);
+}
+
 void
 draw_prim_assembler_prepare_outputs(struct draw_assembler *ia)
 {
@@ -244,7 +266,9 @@ draw_prim_assembler_run(struct draw_context *draw,
 {
    struct draw_assembler *asmblr = draw->ia;
    unsigned start, i;
-   unsigned assembled_prim = u_reduced_prim(input_prims->prim);
+   unsigned assembled_prim = (input_prims->prim == PIPE_PRIM_QUADS ||
+                              input_prims->prim == PIPE_PRIM_QUAD_STRIP) ?
+      PIPE_PRIM_QUADS : u_reduced_prim(input_prims->prim);
    unsigned max_primitives = u_decomposed_prims_for_vertices(
       input_prims->prim, input_prims->count);
    unsigned max_verts = u_vertices_per_prim(assembled_prim) * max_primitives;
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_pt.h b/lib/mesa/src/gallium/auxiliary/draw/draw_pt.h
index 00527527f..5201676e1 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_pt.h
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_pt.h
@@ -147,12 +147,7 @@ struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw);
  * Currently one general-purpose case which can do all possibilities,
  * at the slight expense of creating a vertex_header in some cases
  * unecessarily.
- *
- * The special case fetch_emit code avoids pipeline vertices
- * altogether and builds hardware vertices directly from API
- * vertex_elements.
  */
-struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw);
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw);
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_pt_fetch.c b/lib/mesa/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 17fcfa067..b826b3381 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -50,11 +50,6 @@ struct pt_fetch {
  * Perform the fetch from API vertex elements & vertex buffers, to a
  * contiguous set of float[4] attributes as required for the
  * vertex_shader->run_linear() method.
- *
- * This is used in all cases except pure passthrough
- * (draw_pt_fetch_emit.c) which has its own version to translate
- * directly to hw vertices.
- *
  */
 void
 draw_pt_fetch_prepare(struct pt_fetch *fetch,
diff --git a/lib/mesa/src/gallium/auxiliary/draw/draw_vs.c b/lib/mesa/src/gallium/auxiliary/draw/draw_vs.c
index e8dbc11bc..41e6c7a04 100644
--- a/lib/mesa/src/gallium/auxiliary/draw/draw_vs.c
+++ b/lib/mesa/src/gallium/auxiliary/draw/draw_vs.c
@@ -35,6 +35,8 @@
 #include "util/u_memory.h"
 
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
 
 #include "draw_private.h"
 #include "draw_context.h"
@@ -46,6 +48,8 @@
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 
+#include "nir/nir_to_tgsi.h"
+
 DEBUG_GET_ONCE_BOOL_OPTION(gallium_dump_vs, "GALLIUM_DUMP_VS", FALSE)
 
 
@@ -54,6 +58,7 @@ draw_create_vertex_shader(struct draw_context *draw,
                           const struct pipe_shader_state *shader)
 {
    struct draw_vertex_shader *vs = NULL;
+   struct pipe_shader_state state = *shader;
 
    if (draw->dump_vs) {
       tgsi_dump(shader->tokens, 0);
@@ -61,12 +66,22 @@ draw_create_vertex_shader(struct draw_context *draw,
 
 #ifdef DRAW_LLVM_AVAILABLE
    if (draw->pt.middle.llvm) {
-      vs = draw_create_vs_llvm(draw, shader);
+      struct pipe_screen *screen = draw->pipe->screen;
+      if (shader->type == PIPE_SHADER_IR_NIR &&
+          ((!screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
+                                     PIPE_SHADER_CAP_INTEGERS)) ||
+           (screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
+                                     PIPE_SHADER_CAP_PREFERRED_IR) ==
+            PIPE_SHADER_IR_TGSI))) {
+        state.type = PIPE_SHADER_IR_TGSI;
+        state.tokens = nir_to_tgsi(shader->ir.nir, screen);
+      }
+      vs = draw_create_vs_llvm(draw, &state);
    }
 #endif
 
    if (!vs) {
-      vs = draw_create_vs_exec( draw, shader );
+      vs = draw_create_vs_exec( draw, &state );
    }
 
    if (vs)
diff --git a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_context.c b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_context.c
index d24b2c55c..53b68a352 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_context.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_context.c
@@ -411,6 +411,15 @@ static void dd_context_set_tess_state(struct pipe_context *_pipe,
    pipe->set_tess_state(pipe, default_outer_level, default_inner_level);
 }
 
+static void dd_context_set_patch_vertices(struct pipe_context *_pipe,
+                                          uint8_t patch_vertices)
+{
+   /* Unwrap the ddebug context and forward to the context it wraps. */
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->set_patch_vertices(pipe, patch_vertices);
+}
+
 static void dd_context_set_window_rectangles(struct pipe_context *_pipe,
                                              bool include,
                                              unsigned num_rectangles,
@@ -511,6 +520,7 @@ dd_context_set_sampler_views(struct pipe_context *_pipe,
                              enum pipe_shader_type shader,
                              unsigned start, unsigned num,
                              unsigned unbind_num_trailing_slots,
+                             bool take_ownership,
                              struct pipe_sampler_view **views)
 {
    struct dd_context *dctx = dd_context(_pipe);
@@ -520,7 +530,7 @@ dd_context_set_sampler_views(struct pipe_context *_pipe,
                sizeof(views[0]) * num);
    safe_memcpy(&dctx->draw_state.sampler_views[shader][start + num], views,
                sizeof(views[0]) * unbind_num_trailing_slots);
-   pipe->set_sampler_views(pipe, shader, start, num,
-                           unbind_num_trailing_slots, views);
+   pipe->set_sampler_views(pipe, shader, start, num,
+                           unbind_num_trailing_slots, take_ownership, views);
 }
 
@@ -903,6 +913,7 @@ dd_context_create(struct dd_screen *dscreen, struct pipe_context *pipe)
    CTX_INIT(set_viewport_states);
    CTX_INIT(set_sampler_views);
    CTX_INIT(set_tess_state);
+   CTX_INIT(set_patch_vertices);
    CTX_INIT(set_shader_buffers);
    CTX_INIT(set_shader_images);
    CTX_INIT(set_vertex_buffers);
diff --git a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_draw.c b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_draw.c
index 9fc776d75..5e70765f5 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_draw.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_draw.c
@@ -352,13 +352,15 @@ dd_dump_flush(struct dd_draw_state *dstate, struct call_flush *info, FILE *f)
 
 static void
 dd_dump_draw_vbo(struct dd_draw_state *dstate, struct pipe_draw_info *info,
+                 unsigned drawid_offset,
                  const struct pipe_draw_indirect_info *indirect,
-                 const struct pipe_draw_start_count *draw, FILE *f)
+                 const struct pipe_draw_start_count_bias *draw, FILE *f)
 {
    int sh, i;
 
    DUMP(draw_info, info);
-   DUMP(draw_start_count, draw);
+   PRINT_NAMED(int, "drawid offset", drawid_offset);
+   DUMP(draw_start_count_bias, draw);
    if (indirect) {
       if (indirect->buffer)
          DUMP_M(resource, indirect, buffer);
@@ -636,6 +638,7 @@ dd_dump_call(FILE *f, struct dd_draw_state *state, struct dd_call *call)
       break;
    case CALL_DRAW_VBO:
       dd_dump_draw_vbo(state, &call->info.draw_vbo.info,
+                       call->info.draw_vbo.drawid_offset,
                        &call->info.draw_vbo.indirect,
                        &call->info.draw_vbo.draw, f);
       break;
@@ -1303,8 +1306,9 @@ dd_context_flush(struct pipe_context *_pipe,
 static void
 dd_context_draw_vbo(struct pipe_context *_pipe,
                     const struct pipe_draw_info *info,
+                    unsigned drawid_offset,
                     const struct pipe_draw_indirect_info *indirect,
-                    const struct pipe_draw_start_count *draws,
+                    const struct pipe_draw_start_count_bias *draws,
                     unsigned num_draws)
 {
    struct dd_context *dctx = dd_context(_pipe);
@@ -1313,6 +1317,7 @@ dd_context_draw_vbo(struct pipe_context *_pipe,
 
    record->call.type = CALL_DRAW_VBO;
    record->call.info.draw_vbo.info = *info;
+   record->call.info.draw_vbo.drawid_offset = drawid_offset;
    record->call.info.draw_vbo.draw = draws[0];
    if (info->index_size && !info->has_user_indices) {
       record->call.info.draw_vbo.info.index.resource = NULL;
@@ -1336,7 +1341,7 @@ dd_context_draw_vbo(struct pipe_context *_pipe,
    }
 
    dd_before_draw(dctx, record);
-   pipe->draw_vbo(pipe, info, indirect, draws, num_draws);
+   pipe->draw_vbo(pipe, info, drawid_offset, indirect, draws, num_draws);
    dd_after_draw(dctx, record);
 }
 
@@ -1594,10 +1599,10 @@ dd_context_clear_texture(struct pipe_context *_pipe,
  */
 
 static void *
-dd_context_transfer_map(struct pipe_context *_pipe,
-                        struct pipe_resource *resource, unsigned level,
-                        unsigned usage, const struct pipe_box *box,
-                        struct pipe_transfer **transfer)
+dd_context_buffer_map(struct pipe_context *_pipe,
+                      struct pipe_resource *resource, unsigned level,
+                      unsigned usage, const struct pipe_box *box,
+                      struct pipe_transfer **transfer)
 {
    struct dd_context *dctx = dd_context(_pipe);
    struct pipe_context *pipe = dctx->pipe;
@@ -1609,7 +1614,41 @@ dd_context_transfer_map(struct pipe_context *_pipe,
 
       dd_before_draw(dctx, record);
    }
-   void *ptr = pipe->transfer_map(pipe, resource, level, usage, box, transfer);
+   void *ptr = pipe->buffer_map(pipe, resource, level, usage, box, transfer);
+   if (record) {
+      record->call.info.transfer_map.transfer_ptr = *transfer;
+      record->call.info.transfer_map.ptr = ptr;
+      if (*transfer) {
+         record->call.info.transfer_map.transfer = **transfer;
+         record->call.info.transfer_map.transfer.resource = NULL;
+         pipe_resource_reference(&record->call.info.transfer_map.transfer.resource,
+                                 (*transfer)->resource);
+      } else {
+         memset(&record->call.info.transfer_map.transfer, 0, sizeof(struct pipe_transfer));
+      }
+
+      dd_after_draw(dctx, record);
+   }
+   return ptr;
+}
+
+static void *
+dd_context_texture_map(struct pipe_context *_pipe,
+                       struct pipe_resource *resource, unsigned level,
+                       unsigned usage, const struct pipe_box *box,
+                       struct pipe_transfer **transfer)
+{
+   struct dd_context *dctx = dd_context(_pipe);
+   struct pipe_context *pipe = dctx->pipe;
+   struct dd_draw_record *record =
+      dd_screen(dctx->base.screen)->transfers ? dd_create_record(dctx) : NULL;
+
+   if (record) {
+      record->call.type = CALL_TRANSFER_MAP;
+
+      dd_before_draw(dctx, record);
+   }
+   void *ptr = pipe->texture_map(pipe, resource, level, usage, box, transfer);
    if (record) {
       record->call.info.transfer_map.transfer_ptr = *transfer;
       record->call.info.transfer_map.ptr = ptr;
@@ -1655,7 +1694,32 @@ dd_context_transfer_flush_region(struct pipe_context *_pipe,
 }
 
 static void
-dd_context_transfer_unmap(struct pipe_context *_pipe,
+dd_context_buffer_unmap(struct pipe_context *_pipe,
+                          struct pipe_transfer *transfer)
+{
+   struct dd_context *dctx = dd_context(_pipe);
+   struct pipe_context *pipe = dctx->pipe;
+   struct dd_draw_record *record =
+      dd_screen(dctx->base.screen)->transfers ? dd_create_record(dctx) : NULL;
+
+   if (record) {
+      record->call.type = CALL_TRANSFER_UNMAP;
+      record->call.info.transfer_unmap.transfer_ptr = transfer;
+      record->call.info.transfer_unmap.transfer = *transfer;
+      record->call.info.transfer_unmap.transfer.resource = NULL;
+      pipe_resource_reference(
+            &record->call.info.transfer_unmap.transfer.resource,
+            transfer->resource);
+
+      dd_before_draw(dctx, record);
+   }
+   pipe->buffer_unmap(pipe, transfer);
+   if (record)
+      dd_after_draw(dctx, record);
+}
+
+static void
+dd_context_texture_unmap(struct pipe_context *_pipe,
                           struct pipe_transfer *transfer)
 {
    struct dd_context *dctx = dd_context(_pipe);
@@ -1674,7 +1738,7 @@ dd_context_transfer_unmap(struct pipe_context *_pipe,
 
       dd_before_draw(dctx, record);
    }
-   pipe->transfer_unmap(pipe, transfer);
+   pipe->texture_unmap(pipe, transfer);
    if (record)
       dd_after_draw(dctx, record);
 }
@@ -1754,9 +1818,11 @@ dd_init_draw_functions(struct dd_context *dctx)
    CTX_INIT(flush_resource);
    CTX_INIT(generate_mipmap);
    CTX_INIT(get_query_result_resource);
-   CTX_INIT(transfer_map);
+   CTX_INIT(buffer_map);
+   CTX_INIT(texture_map);
    CTX_INIT(transfer_flush_region);
-   CTX_INIT(transfer_unmap);
+   CTX_INIT(buffer_unmap);
+   CTX_INIT(texture_unmap);
    CTX_INIT(buffer_subdata);
    CTX_INIT(texture_subdata);
 }
diff --git a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_pipe.h b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_pipe.h
index 25bfc74fb..e7e23fe1c 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_pipe.h
+++ b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_pipe.h
@@ -123,8 +123,9 @@ struct call_flush {
 
 struct call_draw_info {
    struct pipe_draw_info info;
+   unsigned drawid_offset;
    struct pipe_draw_indirect_info indirect;
-   struct pipe_draw_start_count draw;
+   struct pipe_draw_start_count_bias draw;
 };
 
 struct call_get_query_result_resource {
diff --git a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_screen.c b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_screen.c
index dadcde63a..b9a60b1a3 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_screen.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_ddebug/dd_screen.c
@@ -414,12 +414,12 @@ dd_screen_memobj_destroy(struct pipe_screen *_screen,
  * screen
  */
 
-static void
-dd_screen_finalize_nir(struct pipe_screen *_screen, void *nir, bool optimize)
+static char *
+dd_screen_finalize_nir(struct pipe_screen *_screen, void *nir)
 {
    struct pipe_screen *screen = dd_screen(_screen)->screen;
 
-   screen->finalize_nir(screen, nir, optimize);
+   return screen->finalize_nir(screen, nir);
 }
 
 static void
diff --git a/lib/mesa/src/gallium/auxiliary/driver_noop/noop_pipe.c b/lib/mesa/src/gallium/auxiliary/driver_noop/noop_pipe.c
index f107c71ec..73d35d003 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_noop/noop_pipe.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_noop/noop_pipe.c
@@ -29,7 +29,9 @@
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 #include "util/format/u_format.h"
+#include "util/u_helpers.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_threaded_context.h"
 #include "noop_public.h"
 
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "GALLIUM_NOOP", false)
@@ -39,12 +41,14 @@ void noop_init_state_functions(struct pipe_context *ctx);
 struct noop_pipe_screen {
    struct pipe_screen	pscreen;
    struct pipe_screen	*oscreen;
+   struct slab_parent_pool pool_transfers;
 };
 
 /*
  * query
  */
 struct noop_query {
+   struct threaded_query b;
    unsigned	query;
 };
 static struct pipe_query *noop_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
@@ -90,7 +94,7 @@ noop_set_active_query_state(struct pipe_context *pipe, bool enable)
  * resource
  */
 struct noop_resource {
-   struct pipe_resource	base;
+   struct threaded_resource b;
    unsigned		size;
    char			*data;
    struct sw_displaytarget	*dt;
@@ -107,16 +111,34 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen,
       return NULL;
 
    stride = util_format_get_stride(templ->format, templ->width0);
-   nresource->base = *templ;
-   nresource->base.screen = screen;
+   nresource->b.b = *templ;
+   nresource->b.b.screen = screen;
    nresource->size = stride * templ->height0 * templ->depth0;
    nresource->data = MALLOC(nresource->size);
-   pipe_reference_init(&nresource->base.reference, 1);
+   pipe_reference_init(&nresource->b.b.reference, 1);
    if (nresource->data == NULL) {
       FREE(nresource);
       return NULL;
    }
-   return &nresource->base;
+   threaded_resource_init(&nresource->b.b);
+   return &nresource->b.b;
+}
+
+static struct pipe_resource *
+noop_resource_create_with_modifiers(struct pipe_screen *screen,
+                                    const struct pipe_resource *templ,
+                                    const uint64_t *modifiers, int count)
+{
+   struct noop_pipe_screen *noop_screen = (struct noop_pipe_screen*)screen;
+   struct pipe_screen *oscreen = noop_screen->oscreen;
+   struct pipe_resource *noop_resource = NULL;
+   struct pipe_resource *result =
+      oscreen->resource_create_with_modifiers(oscreen, templ, modifiers, count);
+   if (result) {   /* NULL if the wrapped screen failed to create it */
+      noop_resource = noop_resource_create(screen, result);
+      pipe_resource_reference(&result, NULL);
+   }
+   return noop_resource;
 }
 
 static struct pipe_resource *noop_resource_from_handle(struct pipe_screen *screen,
@@ -187,6 +209,7 @@ static void noop_resource_destroy(struct pipe_screen *screen,
 {
    struct noop_resource *nresource = (struct noop_resource *)resource;
 
+   threaded_resource_deinit(resource);
    FREE(nresource->data);
    FREE(resource);
 }
@@ -205,7 +228,7 @@ static void *noop_transfer_map(struct pipe_context *pipe,
    struct pipe_transfer *transfer;
    struct noop_resource *nresource = (struct noop_resource *)resource;
 
-   transfer = CALLOC_STRUCT(pipe_transfer);
+   transfer = (struct pipe_transfer*)CALLOC_STRUCT(threaded_transfer);
    if (!transfer)
       return NULL;
    pipe_resource_reference(&transfer->resource, resource);
@@ -310,8 +333,13 @@ static void noop_flush(struct pipe_context *ctx,
                        struct pipe_fence_handle **fence,
                        unsigned flags)
 {
-   if (fence)
-      *fence = NULL;
+   if (fence) {
+      struct pipe_reference *f = MALLOC_STRUCT(pipe_reference);
+      if (f)   /* allocation may fail; then *fence simply ends up NULL */
+         f->count = 1;
+      ctx->screen->fence_reference(ctx->screen, fence, NULL);
+      *fence = (struct pipe_fence_handle*)f;
+   }
 }
 
 static void noop_destroy_context(struct pipe_context *ctx)
@@ -319,6 +347,7 @@ static void noop_destroy_context(struct pipe_context *ctx)
    if (ctx->stream_uploader)
       u_upload_destroy(ctx->stream_uploader);
 
+   p_atomic_dec(&ctx->screen->num_contexts);
    FREE(ctx);
 }
 
@@ -348,6 +377,32 @@ static void noop_set_frontend_noop(struct pipe_context *ctx, bool enable)
 {
 }
 
+static void noop_replace_buffer_storage(struct pipe_context *ctx,
+                                        struct pipe_resource *dst,
+                                        struct pipe_resource *src,
+                                        unsigned num_rebinds,
+                                        uint32_t rebind_mask,
+                                        uint32_t delete_buffer_id)
+{
+}
+
+static struct pipe_fence_handle *
+noop_create_fence(struct pipe_context *ctx,
+                  struct tc_unflushed_batch_token *tc_token)
+{
+   struct pipe_reference *f = MALLOC_STRUCT(pipe_reference);
+   if (f)   /* on allocation failure return NULL instead of writing through it */
+      f->count = 1;
+   return (struct pipe_fence_handle*)f;
+}
+
+static bool noop_is_resource_busy(struct pipe_screen *screen,
+                                  struct pipe_resource *resource,
+                                  unsigned usage)
+{
+   return false; /* unconditional: no resource is ever reported as busy */
+}
+
 static struct pipe_context *noop_create_context(struct pipe_screen *screen,
                                                 void *priv, unsigned flags)
 {
@@ -381,9 +436,11 @@ static struct pipe_context *noop_create_context(struct pipe_screen *screen,
    ctx->end_query = noop_end_query;
    ctx->get_query_result = noop_get_query_result;
    ctx->set_active_query_state = noop_set_active_query_state;
-   ctx->transfer_map = noop_transfer_map;
+   ctx->buffer_map = noop_transfer_map;
+   ctx->texture_map = noop_transfer_map;
    ctx->transfer_flush_region = noop_transfer_flush_region;
-   ctx->transfer_unmap = noop_transfer_unmap;
+   ctx->buffer_unmap = noop_transfer_unmap;
+   ctx->texture_unmap = noop_transfer_unmap;
    ctx->buffer_subdata = noop_buffer_subdata;
    ctx->texture_subdata = noop_texture_subdata;
    ctx->invalidate_resource = noop_invalidate_resource;
@@ -391,7 +448,25 @@ static struct pipe_context *noop_create_context(struct pipe_screen *screen,
    ctx->set_frontend_noop = noop_set_frontend_noop;
    noop_init_state_functions(ctx);
 
-   return ctx;
+   p_atomic_inc(&screen->num_contexts);
+
+   if (!(flags & PIPE_CONTEXT_PREFER_THREADED))
+      return ctx;
+
+   struct pipe_context *tc =
+      threaded_context_create(ctx,
+                              &((struct noop_pipe_screen*)screen)->pool_transfers,
+                              noop_replace_buffer_storage,
+                              &(struct threaded_context_options) {
+                                 .create_fence = noop_create_fence,
+                                 .is_resource_busy = noop_is_resource_busy,
+                              },
+                              NULL);
+
+   if (tc && tc != ctx)
+      threaded_context_init_bytes_mapped_limit((struct threaded_context *)tc, 4);
+
+   return tc;
 }
 
 
@@ -479,6 +554,7 @@ static void noop_destroy_screen(struct pipe_screen *screen)
    struct pipe_screen *oscreen = noop_screen->oscreen;
 
    oscreen->destroy(oscreen);
+   slab_destroy_parent(&noop_screen->pool_transfers);
    FREE(screen);
 }
 
@@ -486,6 +562,11 @@ static void noop_fence_reference(struct pipe_screen *screen,
                           struct pipe_fence_handle **ptr,
                           struct pipe_fence_handle *fence)
 {
+   if (pipe_reference((struct pipe_reference*)*ptr,
+                      (struct pipe_reference*)fence))
+      FREE(*ptr);
+
+   *ptr = fence;
 }
 
 static bool noop_fence_finish(struct pipe_screen *screen,
@@ -521,11 +602,104 @@ static const void *noop_get_compiler_options(struct pipe_screen *pscreen,
    return screen->get_compiler_options(screen, ir, shader);
 }
 
-static void noop_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize)
+static char *noop_finalize_nir(struct pipe_screen *pscreen, void *nir)
 {
    struct pipe_screen *screen = ((struct noop_pipe_screen*)pscreen)->oscreen;
 
-   screen->finalize_nir(screen, nir, optimize);
+   return screen->finalize_nir(screen, nir); /* forward to the wrapped screen and return its result as-is */
+}
+
+static bool noop_check_resource_capability(struct pipe_screen *screen,
+                                           struct pipe_resource *resource,
+                                           unsigned bind)
+{
+   return true; /* unconditionally true, whatever resource/bind is passed */
+}
+
+static void noop_set_max_shader_compiler_threads(struct pipe_screen *screen,
+                                                 unsigned max_threads)
+{ /* Intentionally empty: max_threads is ignored. */
+}
+
+static bool noop_is_parallel_shader_compilation_finished(struct pipe_screen *screen,
+                                                         void *shader,
+                                                         unsigned shader_type)
+{
+   return true; /* unconditionally reports the compilation as finished */
+}
+
+static bool noop_is_dmabuf_modifier_supported(struct pipe_screen *screen,
+                                              uint64_t modifier, enum pipe_format format,
+                                              bool *external_only)
+{
+   /* Pure pass-through; assumes the wrapped screen provides this hook (TODO confirm). */
+   struct pipe_screen *real = ((struct noop_pipe_screen*)screen)->oscreen;
+
+   return real->is_dmabuf_modifier_supported(real, modifier, format, external_only);
+}
+
+static unsigned int noop_get_dmabuf_modifier_planes(struct pipe_screen *screen,
+                                                    uint64_t modifier,
+                                                    enum pipe_format format)
+{
+   /* Ask the wrapped screen and return its answer unchanged. */
+   struct pipe_screen *real = ((struct noop_pipe_screen*)screen)->oscreen;
+
+   return real->get_dmabuf_modifier_planes(real, modifier, format);
+}
+
+static void noop_get_driver_uuid(struct pipe_screen *screen, char *uuid)
+{
+   /* Delegates to the wrapped screen's get_driver_uuid hook. */
+   struct pipe_screen *real = ((struct noop_pipe_screen*)screen)->oscreen;
+
+   real->get_driver_uuid(real, uuid);
+}
+
+static void noop_get_device_uuid(struct pipe_screen *screen, char *uuid)
+{
+   /* Delegates to the wrapped screen's get_device_uuid hook. */
+   struct pipe_screen *real = ((struct noop_pipe_screen*)screen)->oscreen;
+
+   real->get_device_uuid(real, uuid);
+}
+
+static void noop_query_dmabuf_modifiers(struct pipe_screen *screen,
+                                        enum pipe_format format, int max,
+                                        uint64_t *modifiers,
+                                        unsigned int *external_only, int *count)
+{
+   /* Delegates to the wrapped screen; every argument is passed through as given. */
+   struct pipe_screen *real = ((struct noop_pipe_screen*)screen)->oscreen;
+
+   real->query_dmabuf_modifiers(real, format, max, modifiers,
+                                external_only, count);
+}
+
+static struct pipe_vertex_state *
+noop_create_vertex_state(struct pipe_screen *screen,
+                         struct pipe_vertex_buffer *buffer,
+                         const struct pipe_vertex_element *elements,
+                         unsigned num_elements,
+                         struct pipe_resource *indexbuf,
+                         uint32_t full_velem_mask)
+{
+   /* Allocation can fail; in that case NULL is handed back to the caller. */
+   struct pipe_vertex_state *vstate = CALLOC_STRUCT(pipe_vertex_state);
+
+   if (vstate) {
+      util_init_pipe_vertex_state(screen, buffer, elements, num_elements,
+                                  indexbuf, full_velem_mask, vstate);
+   }
+   return vstate;
+}
+
+static void noop_vertex_state_destroy(struct pipe_screen *screen,
+                                      struct pipe_vertex_state *state)
+{
+   pipe_vertex_buffer_unreference(&state->input.vbuffer); /* let go of the vertex buffer held in state->input */
+   pipe_resource_reference(&state->input.indexbuf, NULL); /* reset the index buffer reference to NULL */
+   FREE(state); /* finally free the container itself */
 }
 
 struct pipe_screen *noop_screen_create(struct pipe_screen *oscreen)
@@ -568,6 +742,20 @@ struct pipe_screen *noop_screen_create(struct pipe_screen *oscreen)
    screen->get_disk_shader_cache = noop_get_disk_shader_cache;
    screen->get_compiler_options = noop_get_compiler_options;
    screen->finalize_nir = noop_finalize_nir;
+   screen->check_resource_capability = noop_check_resource_capability;
+   screen->set_max_shader_compiler_threads = noop_set_max_shader_compiler_threads;
+   screen->is_parallel_shader_compilation_finished = noop_is_parallel_shader_compilation_finished;
+   screen->is_dmabuf_modifier_supported = noop_is_dmabuf_modifier_supported;
+   screen->get_dmabuf_modifier_planes = noop_get_dmabuf_modifier_planes;
+   screen->get_driver_uuid = noop_get_driver_uuid;
+   screen->get_device_uuid = noop_get_device_uuid;
+   screen->query_dmabuf_modifiers = noop_query_dmabuf_modifiers;
+   screen->resource_create_with_modifiers = noop_resource_create_with_modifiers;
+   screen->create_vertex_state = noop_create_vertex_state;
+   screen->vertex_state_destroy = noop_vertex_state_destroy;
+
+   slab_create_parent(&noop_screen->pool_transfers,
+                      sizeof(struct pipe_transfer), 64);
 
    return screen;
 }
diff --git a/lib/mesa/src/gallium/auxiliary/driver_noop/noop_state.c b/lib/mesa/src/gallium/auxiliary/driver_noop/noop_state.c
index fb90a1ec4..56036e22e 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_noop/noop_state.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_noop/noop_state.c
@@ -31,12 +31,22 @@
 #include "util/u_transfer.h"
 
 static void noop_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info,
+                          unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
-                          const struct pipe_draw_start_count *draws,
+                          const struct pipe_draw_start_count_bias *draws,
                           unsigned num_draws)
 {
 }
 
+static void noop_draw_vertex_state(struct pipe_context *ctx,
+                                   struct pipe_vertex_state *state,
+                                   uint32_t partial_velem_mask,
+                                   struct pipe_draw_vertex_state_info info,
+                                   const struct pipe_draw_start_count_bias *draws,
+                                   unsigned num_draws)
+{ /* Intentionally empty: the draw request is discarded. */
+}
+
 static void noop_launch_grid(struct pipe_context *ctx,
                              const struct pipe_grid_info *info)
 {
@@ -115,6 +125,7 @@ static void noop_set_sampler_views(struct pipe_context *ctx,
                                    enum pipe_shader_type shader,
                                    unsigned start, unsigned count,
                                    unsigned unbind_num_trailing_slots,
+                                   bool take_ownership,
                                    struct pipe_sampler_view **views)
 {
 }
@@ -267,6 +278,139 @@ static void noop_set_window_rectangles(struct pipe_context *ctx,
 {
 }
 
+static void noop_set_shader_buffers(struct pipe_context *ctx,
+                                    enum pipe_shader_type shader,
+                                    unsigned start_slot, unsigned count,
+                                    const struct pipe_shader_buffer *buffers,
+                                    unsigned writable_bitmask)
+{ /* Intentionally empty: the buffer bindings are ignored. */
+}
+
+static void noop_set_shader_images(struct pipe_context *ctx,
+                                   enum pipe_shader_type shader,
+                                   unsigned start_slot, unsigned count,
+                                   unsigned unbind_num_trailing_slots,
+                                   const struct pipe_image_view *images)
+{ /* Intentionally empty: the image bindings are ignored. */
+}
+
+static void noop_render_condition( struct pipe_context *pipe,
+                                   struct pipe_query *query,
+                                   bool condition,
+                                   enum pipe_render_cond_flag mode )
+{ /* Intentionally empty: query, condition and mode are ignored. */
+}
+
+static void noop_get_query_result_resource(struct pipe_context *pipe,
+                                           struct pipe_query *q,
+                                           bool wait,
+                                           enum pipe_query_value_type result_type,
+                                           int index,
+                                           struct pipe_resource *resource,
+                                           unsigned offset)
+{ /* Intentionally empty: nothing is written to resource. */
+}
+
+static void noop_set_min_samples( struct pipe_context *ctx,
+                                  unsigned min_samples )
+{ /* Intentionally empty: min_samples is ignored. */
+}
+
+static void noop_set_sample_locations( struct pipe_context *ctx,
+                                       size_t size, const uint8_t *locations )
+{ /* Intentionally empty: the locations array is not read. */
+}
+
+static void noop_set_tess_state(struct pipe_context *ctx,
+                                const float default_outer_level[4],
+                                const float default_inner_level[2])
+{ /* Intentionally empty: both level arrays are ignored. */
+}
+
+static void noop_clear_texture(struct pipe_context *pipe,
+                               struct pipe_resource *res,
+                               unsigned level,
+                               const struct pipe_box *box,
+                               const void *data)
+{ /* Intentionally empty: res is left untouched. */
+}
+
+static void noop_clear_buffer(struct pipe_context *pipe,
+                              struct pipe_resource *res,
+                              unsigned offset,
+                              unsigned size,
+                              const void *clear_value,
+                              int clear_value_size)
+{ /* Intentionally empty: res is left untouched and clear_value is not read. */
+}
+
+static void noop_fence_server_sync(struct pipe_context *pipe,
+                                   struct pipe_fence_handle *fence)
+{ /* Intentionally empty: the fence is not inspected. */
+}
+
+static void noop_texture_barrier(struct pipe_context *ctx, unsigned flags)
+{ /* Intentionally empty: flags are ignored. */
+}
+
+static void noop_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{ /* Intentionally empty: flags are ignored. */
+}
+
+static bool noop_resource_commit(struct pipe_context *ctx, struct pipe_resource *res,
+                                 unsigned level, struct pipe_box *box, bool commit)
+{
+   return true; /* unconditionally returns true; res is not touched */
+}
+
+static void noop_get_sample_position(struct pipe_context *context,
+                                     unsigned sample_count,
+                                     unsigned sample_index,
+                                     float *out_value)
+{ /* NOTE(review): empty body, so out_value is never written — confirm callers cope. */
+}
+
+static enum pipe_reset_status noop_get_device_reset_status(struct pipe_context *ctx)
+{
+   return PIPE_NO_RESET; /* the only status this function ever returns */
+}
+
+static uint64_t noop_create_texture_handle(struct pipe_context *ctx,
+                                           struct pipe_sampler_view *view,
+                                           const struct pipe_sampler_state *state)
+{
+   return 1; /* same constant for every call; view and state are not inspected */
+}
+
+static void noop_delete_texture_handle(struct pipe_context *ctx, uint64_t handle)
+{ /* Intentionally empty: handle is ignored. */
+}
+
+static void noop_make_texture_handle_resident(struct pipe_context *ctx,
+                                              uint64_t handle, bool resident)
+{ /* Intentionally empty: handle and resident are ignored. */
+}
+
+static uint64_t noop_create_image_handle(struct pipe_context *ctx,
+                                         const struct pipe_image_view *image)
+{
+   return 2; /* same constant for every call; image is not inspected */
+}
+
+static void noop_delete_image_handle(struct pipe_context *ctx, uint64_t handle)
+{ /* Intentionally empty: handle is ignored. */
+}
+
+static void noop_make_image_handle_resident(struct pipe_context *ctx, uint64_t handle,
+                                            unsigned access, bool resident)
+{ /* Intentionally empty: handle, access and resident are ignored. */
+}
+
+static void noop_set_patch_vertices(struct pipe_context *ctx,
+                                    uint8_t patch_vertices)
+{ /* Intentionally empty: patch_vertices is ignored. */
+}
+
 void noop_init_state_functions(struct pipe_context *ctx);
 
 void noop_init_state_functions(struct pipe_context *ctx)
@@ -311,6 +455,8 @@ void noop_init_state_functions(struct pipe_context *ctx)
    ctx->set_constant_buffer = noop_set_constant_buffer;
    ctx->set_inlinable_constants = noop_set_inlinable_constants;
    ctx->set_sampler_views = noop_set_sampler_views;
+   ctx->set_shader_buffers = noop_set_shader_buffers;
+   ctx->set_shader_images = noop_set_shader_images;
    ctx->set_framebuffer_state = noop_set_framebuffer_state;
    ctx->set_polygon_stipple = noop_set_polygon_stipple;
    ctx->set_sample_mask = noop_set_sample_mask;
@@ -322,8 +468,29 @@ void noop_init_state_functions(struct pipe_context *ctx)
    ctx->sampler_view_destroy = noop_sampler_view_destroy;
    ctx->surface_destroy = noop_surface_destroy;
    ctx->draw_vbo = noop_draw_vbo;
+   ctx->draw_vertex_state = noop_draw_vertex_state;
    ctx->launch_grid = noop_launch_grid;
    ctx->create_stream_output_target = noop_create_stream_output_target;
    ctx->stream_output_target_destroy = noop_stream_output_target_destroy;
    ctx->set_stream_output_targets = noop_set_stream_output_targets;
+   ctx->render_condition = noop_render_condition;
+   ctx->get_query_result_resource = noop_get_query_result_resource;
+   ctx->set_min_samples = noop_set_min_samples;
+   ctx->set_sample_locations = noop_set_sample_locations;
+   ctx->set_tess_state = noop_set_tess_state;
+   ctx->clear_texture = noop_clear_texture;
+   ctx->clear_buffer = noop_clear_buffer;
+   ctx->fence_server_sync = noop_fence_server_sync;
+   ctx->texture_barrier = noop_texture_barrier;
+   ctx->memory_barrier = noop_memory_barrier;
+   ctx->resource_commit = noop_resource_commit;
+   ctx->get_sample_position = noop_get_sample_position;
+   ctx->get_device_reset_status = noop_get_device_reset_status;
+   ctx->create_texture_handle = noop_create_texture_handle;
+   ctx->delete_texture_handle = noop_delete_texture_handle;
+   ctx->make_texture_handle_resident = noop_make_texture_handle_resident;
+   ctx->create_image_handle = noop_create_image_handle;
+   ctx->delete_image_handle = noop_delete_image_handle;
+   ctx->make_image_handle_resident = noop_make_image_handle_resident;
+   ctx->set_patch_vertices = noop_set_patch_vertices;
 }
diff --git a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_context.c b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_context.c
index 4f72eb891..b0c283d9b 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_context.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_context.c
@@ -115,8 +115,9 @@ rbug_draw_block_locked(struct rbug_context *rb_pipe, int flag)
 
 static void
 rbug_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *_info,
+              unsigned _drawid_offset,
               const struct pipe_draw_indirect_info *_indirect,
-              const struct pipe_draw_start_count *draws,
+              const struct pipe_draw_start_count_bias *draws,
               unsigned num_draws)
 {
    struct rbug_context *rb_pipe = rbug_context(_pipe);
@@ -135,7 +136,7 @@ rbug_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *_info,
    if (!(rb_pipe->curr.shader[PIPE_SHADER_FRAGMENT] && rb_pipe->curr.shader[PIPE_SHADER_FRAGMENT]->disabled) &&
        !(rb_pipe->curr.shader[PIPE_SHADER_GEOMETRY] && rb_pipe->curr.shader[PIPE_SHADER_GEOMETRY]->disabled) &&
        !(rb_pipe->curr.shader[PIPE_SHADER_VERTEX] && rb_pipe->curr.shader[PIPE_SHADER_VERTEX]->disabled))
-      pipe->draw_vbo(pipe, &info, _indirect, draws, num_draws);
+      pipe->draw_vbo(pipe, &info, _drawid_offset, _indirect, draws, num_draws);
    mtx_unlock(&rb_pipe->call_mutex);
 
    rbug_draw_block_locked(rb_pipe, RBUG_BLOCK_AFTER);
@@ -739,6 +740,7 @@ rbug_set_sampler_views(struct pipe_context *_pipe,
                        unsigned start,
                        unsigned num,
                        unsigned unbind_num_trailing_slots,
+                       bool take_ownership,
                        struct pipe_sampler_view **_views)
 {
    struct rbug_context *rb_pipe = rbug_context(_pipe);
@@ -768,7 +770,7 @@ rbug_set_sampler_views(struct pipe_context *_pipe,
    }
 
    pipe->set_sampler_views(pipe, shader, start, num,
-                           unbind_num_trailing_slots, views);
+                           unbind_num_trailing_slots, take_ownership, views);
 
    mtx_unlock(&rb_pipe->call_mutex);
 }
@@ -1106,7 +1108,7 @@ rbug_context_surface_destroy(struct pipe_context *_pipe,
 
 
 static void *
-rbug_context_transfer_map(struct pipe_context *_context,
+rbug_context_buffer_map(struct pipe_context *_context,
                           struct pipe_resource *_resource,
                           unsigned level,
                           unsigned usage,
@@ -1121,7 +1123,34 @@ rbug_context_transfer_map(struct pipe_context *_context,
    void *map;
 
    mtx_lock(&rb_pipe->call_mutex);
-   map = context->transfer_map(context,
+   map = context->buffer_map(context,
+                               resource,
+                               level,
+                               usage,
+                               box, &result);
+   mtx_unlock(&rb_pipe->call_mutex);
+
+   *transfer = rbug_transfer_create(rb_pipe, rb_resource, result);
+   return *transfer ? map : NULL;
+}
+
+static void *
+rbug_context_texture_map(struct pipe_context *_context,
+                          struct pipe_resource *_resource,
+                          unsigned level,
+                          unsigned usage,
+                          const struct pipe_box *box,
+                          struct pipe_transfer **transfer)
+{
+   struct rbug_context *rb_pipe = rbug_context(_context);
+   struct rbug_resource *rb_resource = rbug_resource(_resource);
+   struct pipe_context *context = rb_pipe->pipe;
+   struct pipe_resource *resource = rb_resource->resource;
+   struct pipe_transfer *result;
+   void *map;
+
+   mtx_lock(&rb_pipe->call_mutex);
+   map = context->texture_map(context,
                                resource,
                                level,
                                usage,
@@ -1151,7 +1180,24 @@ rbug_context_transfer_flush_region(struct pipe_context *_context,
 
 
 static void
-rbug_context_transfer_unmap(struct pipe_context *_context,
+rbug_context_buffer_unmap(struct pipe_context *_context,
+                            struct pipe_transfer *_transfer)
+{
+   struct rbug_context *rb_pipe = rbug_context(_context);
+   struct rbug_transfer *rb_transfer = rbug_transfer(_transfer);
+   struct pipe_context *context = rb_pipe->pipe; /* the wrapped context */
+   struct pipe_transfer *transfer = rb_transfer->transfer; /* the wrapped transfer */
+
+   mtx_lock(&rb_pipe->call_mutex); /* the unmap and the wrapper teardown both run under call_mutex */
+   context->buffer_unmap(context,
+                           transfer);
+   rbug_transfer_destroy(rb_pipe,
+                         rb_transfer);
+   mtx_unlock(&rb_pipe->call_mutex);
+}
+
+static void
+rbug_context_texture_unmap(struct pipe_context *_context,
                             struct pipe_transfer *_transfer)
 {
    struct rbug_context *rb_pipe = rbug_context(_context);
@@ -1160,7 +1206,7 @@ rbug_context_transfer_unmap(struct pipe_context *_context,
    struct pipe_transfer *transfer = rb_transfer->transfer;
 
    mtx_lock(&rb_pipe->call_mutex);
-   context->transfer_unmap(context,
+   context->texture_unmap(context,
                            transfer);
    rbug_transfer_destroy(rb_pipe,
                          rb_transfer);
@@ -1307,8 +1353,10 @@ rbug_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
    rb_pipe->base.sampler_view_destroy = rbug_context_sampler_view_destroy;
    rb_pipe->base.create_surface = rbug_context_create_surface;
    rb_pipe->base.surface_destroy = rbug_context_surface_destroy;
-   rb_pipe->base.transfer_map = rbug_context_transfer_map;
-   rb_pipe->base.transfer_unmap = rbug_context_transfer_unmap;
+   rb_pipe->base.buffer_map = rbug_context_buffer_map;
+   rb_pipe->base.buffer_unmap = rbug_context_buffer_unmap;
+   rb_pipe->base.texture_map = rbug_context_texture_map;
+   rb_pipe->base.texture_unmap = rbug_context_texture_unmap;
    rb_pipe->base.transfer_flush_region = rbug_context_transfer_flush_region;
    rb_pipe->base.buffer_subdata = rbug_context_buffer_subdata;
    rb_pipe->base.texture_subdata = rbug_context_texture_subdata;
diff --git a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_core.c b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_core.c
index 6d6ca7ec0..aad4487f4 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_core.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_core.c
@@ -267,7 +267,7 @@ rbug_texture_read(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_
    }
 
    tex = tr_tex->resource;
-   map = pipe_transfer_map(context, tex,
+   map = pipe_texture_map(context, tex,
                            gptr->level, gptr->face + gptr->zslice,
                            PIPE_MAP_READ,
                            gptr->x, gptr->y, gptr->w, gptr->h, &t);
@@ -283,7 +283,7 @@ rbug_texture_read(struct rbug_rbug *tr_rbug, struct rbug_header *header, uint32_
                                 t->stride,
                                 NULL);
 
-   context->transfer_unmap(context, t);
+   context->texture_unmap(context, t);
 
    mtx_unlock(&rb_screen->list_mutex);
 
diff --git a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_objects.c b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_objects.c
index bf2790cf2..09455cf6f 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_objects.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_objects.c
@@ -186,7 +186,10 @@ rbug_transfer_create(struct rbug_context *rb_context,
    return &rb_transfer->base;
 
 error:
-   rb_context->pipe->transfer_unmap(rb_context->pipe, transfer);
+   if (rb_resource->base.target == PIPE_BUFFER)
+      rb_context->pipe->buffer_unmap(rb_context->pipe, transfer);
+   else
+      rb_context->pipe->texture_unmap(rb_context->pipe, transfer);
    return NULL;
 }
 
diff --git a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_screen.c b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_screen.c
index 25d0fcf0e..9eb9ba379 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_screen.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_rbug/rbug_screen.c
@@ -410,12 +410,12 @@ rbug_screen_fence_get_fd(struct pipe_screen *_screen,
    return screen->fence_get_fd(screen, fence);
 }
 
-static void
-rbug_screen_finalize_nir(struct pipe_screen *_screen, void *nir, bool optimize)
+static char *
+rbug_screen_finalize_nir(struct pipe_screen *_screen, void *nir)
 {
    struct pipe_screen *screen = rbug_screen(_screen)->screen;
 
-   screen->finalize_nir(screen, nir, optimize);
+   return screen->finalize_nir(screen, nir);
 }
 
 bool
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.c b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.c
index 1252d367b..7e28a4028 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.c
@@ -45,6 +45,7 @@
 
 struct trace_query
 {
+   struct threaded_query base;
    unsigned type;
 
    struct pipe_query *query;
@@ -110,8 +111,9 @@ dump_fb_state(struct trace_context *tr_ctx,
 static void
 trace_context_draw_vbo(struct pipe_context *_pipe,
                        const struct pipe_draw_info *info,
+                       unsigned drawid_offset,
                        const struct pipe_draw_indirect_info *indirect,
-                       const struct pipe_draw_start_count *draws,
+                       const struct pipe_draw_start_count_bias *draws,
                        unsigned num_draws)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
@@ -124,6 +126,7 @@ trace_context_draw_vbo(struct pipe_context *_pipe,
 
    trace_dump_arg(ptr,  pipe);
    trace_dump_arg(draw_info, info);
+   trace_dump_arg(int, drawid_offset);
    trace_dump_arg(draw_indirect_info, indirect);
    trace_dump_arg_begin("draws");
    trace_dump_struct_array(draw_start_count, draws, num_draws);
@@ -132,12 +135,45 @@ trace_context_draw_vbo(struct pipe_context *_pipe,
 
    trace_dump_trace_flush();
 
-   pipe->draw_vbo(pipe, info, indirect, draws, num_draws);
+   pipe->draw_vbo(pipe, info, drawid_offset, indirect, draws, num_draws);
 
    trace_dump_call_end();
 }
 
 
+static void
+trace_context_draw_vertex_state(struct pipe_context *_pipe,
+                                struct pipe_vertex_state *state,
+                                uint32_t partial_velem_mask,
+                                struct pipe_draw_vertex_state_info info,
+                                const struct pipe_draw_start_count_bias *draws,
+                                unsigned num_draws)
+{ /* Dumps a draw_vertex_state call with its arguments, then forwards it to the wrapped context. */
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe; /* the wrapped context that receives the draw */
+
+   if (!tr_ctx->seen_fb_state && trace_dump_is_triggered()) /* framebuffer state is dumped first when not yet seen */
+      dump_fb_state(tr_ctx, "current_framebuffer_state", true);
+
+   trace_dump_call_begin("pipe_context", "draw_vertex_state");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, state);
+   trace_dump_arg(uint, partial_velem_mask);
+   trace_dump_arg(draw_vertex_state_info, info);
+   trace_dump_arg_begin("draws"); /* the whole draws array is dumped as one argument named "draws" */
+   trace_dump_struct_array(draw_start_count, draws, num_draws);
+   trace_dump_arg_end();
+   trace_dump_arg(uint, num_draws);
+
+   trace_dump_trace_flush();
+
+   pipe->draw_vertex_state(pipe, state, partial_velem_mask, info, draws,
+                           num_draws); /* arguments are forwarded unchanged */
+   trace_dump_call_end();
+}
+
+
 static struct pipe_query *
 trace_context_create_query(struct pipe_context *_pipe,
                            unsigned query_type,
@@ -222,19 +258,21 @@ trace_context_begin_query(struct pipe_context *_pipe,
 
 static bool
 trace_context_end_query(struct pipe_context *_pipe,
-                        struct pipe_query *query)
+                        struct pipe_query *_query)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
    struct pipe_context *pipe = tr_ctx->pipe;
    bool ret;
 
-   query = trace_query_unwrap(query);
+   struct pipe_query *query = trace_query_unwrap(_query);
 
    trace_dump_call_begin("pipe_context", "end_query");
 
    trace_dump_arg(ptr, pipe);
    trace_dump_arg(ptr, query);
 
+   if (tr_ctx->threaded)
+      threaded_query(query)->flushed = trace_query(_query)->base.flushed;
    ret = pipe->end_query(pipe, query);
 
    trace_dump_call_end();
@@ -258,6 +296,10 @@ trace_context_get_query_result(struct pipe_context *_pipe,
 
    trace_dump_arg(ptr, pipe);
    trace_dump_arg(ptr, query);
+   trace_dump_arg(bool, wait);
+
+   if (tr_ctx->threaded)
+      threaded_query(query)->flushed = trace_query(_query)->base.flushed;
 
    ret = pipe->get_query_result(pipe, query, wait, result);
 
@@ -1013,6 +1055,8 @@ trace_context_create_sampler_view(struct pipe_context *_pipe,
    pipe_resource_reference(&tr_view->base.texture, resource);
    tr_view->base.context = _pipe;
    tr_view->sampler_view = result;
+   result->reference.count += 100000000;
+   tr_view->refcount = 100000000;
    result = &tr_view->base;
 
    return result;
@@ -1028,13 +1072,12 @@ trace_context_sampler_view_destroy(struct pipe_context *_pipe,
    struct pipe_context *pipe = tr_ctx->pipe;
    struct pipe_sampler_view *view = tr_view->sampler_view;
 
-   assert(_view->context == _pipe);
-
    trace_dump_call_begin("pipe_context", "sampler_view_destroy");
 
    trace_dump_arg(ptr, pipe);
    trace_dump_arg(ptr, view);
 
+   p_atomic_add(&tr_view->sampler_view->reference.count, -tr_view->refcount);
    pipe_sampler_view_reference(&tr_view->sampler_view, NULL);
 
    trace_dump_call_end();
@@ -1105,6 +1148,7 @@ trace_context_set_sampler_views(struct pipe_context *_pipe,
                                 unsigned start,
                                 unsigned num,
                                 unsigned unbind_num_trailing_slots,
+                                bool take_ownership,
                                 struct pipe_sampler_view **views)
 {
    struct trace_context *tr_ctx = trace_context(_pipe);
@@ -1118,6 +1162,13 @@ trace_context_set_sampler_views(struct pipe_context *_pipe,
 
    for (i = 0; i < num; ++i) {
       tr_view = trace_sampler_view(views[i]);
+      if (tr_view) {
+         tr_view->refcount--;
+         if (!tr_view->refcount) {
+            tr_view->refcount = 100000000;
+            p_atomic_add(&tr_view->sampler_view->reference.count, tr_view->refcount);
+         }
+      }
       unwrapped_views[i] = tr_view ? tr_view->sampler_view : NULL;
    }
    views = unwrapped_views;
@@ -1129,10 +1180,11 @@ trace_context_set_sampler_views(struct pipe_context *_pipe,
    trace_dump_arg(uint, start);
    trace_dump_arg(uint, num);
    trace_dump_arg(uint, unbind_num_trailing_slots);
+   trace_dump_arg(bool, take_ownership);
    trace_dump_arg_array(ptr, views, num);
 
    pipe->set_sampler_views(pipe, shader, start, num,
-                           unbind_num_trailing_slots, views);
+                           unbind_num_trailing_slots, take_ownership, views);
 
    trace_dump_call_end();
 }
@@ -1405,6 +1457,32 @@ trace_context_clear_depth_stencil(struct pipe_context *_pipe,
 }
 
 static inline void
+trace_context_clear_buffer(struct pipe_context *_pipe,
+                           struct pipe_resource *res,
+                           unsigned offset,
+                           unsigned size,
+                           const void *clear_value,
+                           int clear_value_size)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   /* Dump the call and its arguments, then forward it to the wrapped context. */
+   trace_dump_call_begin("pipe_context", "clear_buffer");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, res);
+   trace_dump_arg(uint, offset);
+   trace_dump_arg(uint, size);
+   trace_dump_arg(ptr, clear_value); /* dumped as a pointer, not as the bytes it points to */
+   trace_dump_arg(int, clear_value_size);
+
+   pipe->clear_buffer(pipe, res, offset, size, clear_value, clear_value_size);
+
+   trace_dump_call_end();
+}
+
+static inline void
 trace_context_clear_texture(struct pipe_context *_pipe,
                             struct pipe_resource *res,
                             unsigned level,
@@ -1563,20 +1641,29 @@ trace_context_transfer_map(struct pipe_context *_context,
                            struct pipe_transfer **transfer)
 {
    struct trace_context *tr_context = trace_context(_context);
-   struct pipe_context *context = tr_context->pipe;
-   struct pipe_transfer *result = NULL;
+   struct pipe_context *pipe = tr_context->pipe;
+   struct pipe_transfer *xfer = NULL;
    void *map;
 
-   /*
-    * Map and transfers can't be serialized so we convert all write transfers
-    * to texture/buffer_subdata and ignore read transfers.
-    */
-
-   map = context->transfer_map(context, resource, level, usage, box, &result);
+   if (resource->target == PIPE_BUFFER)
+      map = pipe->buffer_map(pipe, resource, level, usage, box, &xfer);
+   else
+      map = pipe->texture_map(pipe, resource, level, usage, box, &xfer);
    if (!map)
       return NULL;
+   *transfer = trace_transfer_create(tr_context, resource, xfer);
+   trace_dump_call_begin("pipe_context", resource->target == PIPE_BUFFER ? "buffer_map" : "texture_map");
 
-   *transfer = trace_transfer_create(tr_context, resource, result);
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, resource);
+   trace_dump_arg(uint, level);
+   trace_dump_arg(uint, usage);
+   trace_dump_arg(box, box);
+
+   trace_dump_arg(ptr, xfer);
+   trace_dump_ret(ptr, map);
+
+   trace_dump_call_end();
 
    if (map) {
       if (usage & PIPE_MAP_WRITE) {
@@ -1594,10 +1681,18 @@ trace_context_transfer_flush_region( struct pipe_context *_context,
 {
    struct trace_context *tr_context = trace_context(_context);
    struct trace_transfer *tr_transfer = trace_transfer(_transfer);
-   struct pipe_context *context = tr_context->pipe;
+   struct pipe_context *pipe = tr_context->pipe;
    struct pipe_transfer *transfer = tr_transfer->transfer;
 
-   context->transfer_flush_region(context, transfer, box);
+   trace_dump_call_begin("pipe_context", "transfer_flush_region");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, transfer);
+   trace_dump_arg(box, box);
+
+   trace_dump_call_end();
+
+   pipe->transfer_flush_region(pipe, transfer, box);
 }
 
 static void
@@ -1609,7 +1704,15 @@ trace_context_transfer_unmap(struct pipe_context *_context,
    struct pipe_context *context = tr_ctx->pipe;
    struct pipe_transfer *transfer = tr_trans->transfer;
 
-   if (tr_trans->map) {
+
+   trace_dump_call_begin("pipe_context", "transfer_unmap");
+
+   trace_dump_arg(ptr, context);
+   trace_dump_arg(ptr, transfer);
+
+   trace_dump_call_end();
+
+   if (tr_trans->map && !tr_ctx->threaded) {
       /*
        * Fake a texture/buffer_subdata
        */
@@ -1672,7 +1775,10 @@ trace_context_transfer_unmap(struct pipe_context *_context,
       tr_trans->map = NULL;
    }
 
-   context->transfer_unmap(context, transfer);
+   if (transfer->resource->target == PIPE_BUFFER)
+      context->buffer_unmap(context, transfer);
+   else
+      context->texture_unmap(context, transfer);
    trace_transfer_destroy(tr_ctx, tr_trans);
 }
 
@@ -1781,6 +1887,21 @@ trace_context_set_context_param(struct pipe_context *_context,
 }
 
 static void
+trace_context_set_debug_callback(struct pipe_context *_context, const struct pipe_debug_callback *cb)
+{
+   struct trace_context *tr_context = trace_context(_context);
+   struct pipe_context *context = tr_context->pipe;
+   /* Trace wrapper for set_debug_callback on the wrapped context. */
+   trace_dump_call_begin("pipe_context", "set_debug_callback");
+
+   trace_dump_arg(ptr, context);
+   /* Only the context pointer is recorded; cb itself is not dumped. */
+   trace_dump_call_end();
+   /* The trace record is closed before the call is forwarded. */
+   context->set_debug_callback(context, cb);
+}
+
+static void
 trace_context_render_condition(struct pipe_context *_context,
                                struct pipe_query *query,
                                bool condition,
@@ -1873,6 +1994,20 @@ trace_context_set_tess_state(struct pipe_context *_context,
    context->set_tess_state(context, default_outer_level, default_inner_level);
 }
 
+static void
+trace_context_set_patch_vertices(struct pipe_context *_context,
+                                 uint8_t patch_vertices)
+{
+   struct trace_context *tr_context = trace_context(_context);
+   struct pipe_context *context = tr_context->pipe;
+   /* Trace wrapper for set_patch_vertices: record the call, then forward it. */
+   trace_dump_call_begin("pipe_context", "set_patch_vertices");
+   trace_dump_arg(ptr, context);
+   trace_dump_arg(uint, patch_vertices);
+   trace_dump_call_end();
+   /* The trace record is closed before the wrapped context is invoked. */
+   context->set_patch_vertices(context, patch_vertices);
+}
 
 static void trace_context_set_shader_buffers(struct pipe_context *_context,
                                              enum pipe_shader_type shader,
@@ -2075,6 +2210,7 @@ trace_context_create(struct trace_screen *tr_scr,
    tr_ctx->base . _member = pipe -> _member ? trace_context_ ## _member : NULL
 
    TR_CTX_INIT(draw_vbo);
+   TR_CTX_INIT(draw_vertex_state);
    TR_CTX_INIT(render_condition);
    TR_CTX_INIT(create_query);
    TR_CTX_INIT(destroy_query);
@@ -2135,6 +2271,8 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(create_stream_output_target);
    TR_CTX_INIT(stream_output_target_destroy);
    TR_CTX_INIT(set_stream_output_targets);
+   /* this is lavapipe-only and can't be traced */
+   tr_ctx->base.stream_output_target_offset = pipe->stream_output_target_offset;
    TR_CTX_INIT(resource_copy_region);
    TR_CTX_INIT(blit);
    TR_CTX_INIT(flush_resource);
@@ -2142,6 +2280,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(clear_render_target);
    TR_CTX_INIT(clear_depth_stencil);
    TR_CTX_INIT(clear_texture);
+   TR_CTX_INIT(clear_buffer);
    TR_CTX_INIT(flush);
    TR_CTX_INIT(create_fence_fd);
    TR_CTX_INIT(fence_server_sync);
@@ -2150,6 +2289,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(memory_barrier);
    TR_CTX_INIT(resource_commit);
    TR_CTX_INIT(set_tess_state);
+   TR_CTX_INIT(set_patch_vertices);
    TR_CTX_INIT(set_shader_buffers);
    TR_CTX_INIT(launch_grid);
    TR_CTX_INIT(set_shader_images);
@@ -2160,13 +2300,14 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(delete_image_handle);
    TR_CTX_INIT(make_image_handle_resident);
 
-   TR_CTX_INIT(transfer_map);
-   TR_CTX_INIT(transfer_unmap);
+   tr_ctx->base.buffer_map = tr_ctx->base.texture_map = trace_context_transfer_map;
+   tr_ctx->base.buffer_unmap = tr_ctx->base.texture_unmap = trace_context_transfer_unmap;
    TR_CTX_INIT(transfer_flush_region);
    TR_CTX_INIT(buffer_subdata);
    TR_CTX_INIT(texture_subdata);
    TR_CTX_INIT(invalidate_resource);
    TR_CTX_INIT(set_context_param);
+   TR_CTX_INIT(set_debug_callback);
 
 #undef TR_CTX_INIT
 
@@ -2189,3 +2330,12 @@ trace_context_check(const struct pipe_context *pipe)
    ASSERTED struct trace_context *tr_ctx = (struct trace_context *) pipe;
    assert(tr_ctx->base.destroy == trace_context_destroy);
 }
+
+/** Return the wrapped context if pipe is a trace_context, otherwise pipe itself.
+ * Threaded context is not wrapped, and so it may call fence functions directly
+ * (a trace_context is recognized by destroy == trace_context_destroy). */
+struct pipe_context *
+trace_get_possibly_threaded_context(struct pipe_context *pipe)
+{
+   return pipe->destroy == trace_context_destroy ? ((struct trace_context*)pipe)->pipe : pipe;
+}
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.h b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.h
index 95469e875..f687fa293 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.h
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_context.h
@@ -33,6 +33,7 @@
 #include "util/u_debug.h"
 #include "util/hash_table.h"
 #include "pipe/p_context.h"
+#include "util/u_threaded_context.h"
 
 #include "tr_screen.h"
 
@@ -52,15 +53,20 @@ struct trace_context
    struct hash_table depth_stencil_alpha_states;
 
    struct pipe_context *pipe;
+   tc_replace_buffer_storage_func replace_buffer_storage;
+   tc_create_fence_func create_fence;
 
    struct pipe_framebuffer_state unwrapped_state;
    bool seen_fb_state;
+
+   bool threaded;
 };
 
 
 void
 trace_context_check(const struct pipe_context *pipe);
-
+struct pipe_context *
+trace_get_possibly_threaded_context(struct pipe_context *pipe);
 
 static inline struct trace_context *
 trace_context(struct pipe_context *pipe)
@@ -77,7 +83,10 @@ struct pipe_context *
 trace_context_create(struct trace_screen *tr_scr,
                      struct pipe_context *pipe);
 
-
+struct pipe_context *
+trace_context_create_threaded(struct pipe_screen *screen, struct pipe_context *pipe,
+                              tc_replace_buffer_storage_func *replace_buffer,
+                              struct threaded_context_options *options);
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.c b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.c
index ec479b596..1f5da8019 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.c
@@ -552,6 +552,7 @@ void trace_dump_sampler_view_template(const struct pipe_sampler_view *state,
    trace_dump_struct_begin("pipe_sampler_view");
 
    trace_dump_member(format, state, format);
+   trace_dump_member(ptr, state, texture);
 
    trace_dump_member_begin("u");
    trace_dump_struct_begin(""); /* anonymous */
@@ -604,6 +605,7 @@ void trace_dump_surface_template(const struct pipe_surface *state,
    trace_dump_struct_begin("pipe_surface");
 
    trace_dump_member(format, state, format);
+   trace_dump_member(ptr, state, texture);
    trace_dump_member(uint, state, width);
    trace_dump_member(uint, state, height);
 
@@ -698,6 +700,10 @@ void trace_dump_vertex_element(const struct pipe_vertex_element *state)
 
    trace_dump_member(uint, state, vertex_buffer_index);
 
+   trace_dump_member(uint, state, instance_divisor);
+
+   trace_dump_member(bool, state, dual_slot);
+
    trace_dump_member(format, state, src_format);
 
    trace_dump_struct_end();
@@ -780,6 +786,26 @@ void trace_dump_image_view(const struct pipe_image_view *state)
 }
 
 
+void trace_dump_memory_info(const struct pipe_memory_info *state)
+{
+   if (!trace_dumping_enabled_locked())
+      return;
+   /* Dumps a pipe_memory_info struct to the trace; a NULL pointer is dumped as null. */
+   if (!state) {
+      trace_dump_null();
+      return;
+   }
+   /* Every member below is emitted with the uint dumper. */
+   trace_dump_struct_begin("pipe_memory_info");
+   trace_dump_member(uint, state, total_device_memory);
+   trace_dump_member(uint, state, avail_device_memory);
+   trace_dump_member(uint, state, total_staging_memory);
+   trace_dump_member(uint, state, avail_staging_memory);
+   trace_dump_member(uint, state, device_memory_evicted);
+   trace_dump_member(uint, state, nr_device_memory_evictions);
+   trace_dump_struct_end();
+}
+
 void trace_dump_draw_info(const struct pipe_draw_info *state)
 {
    if (!trace_dumping_enabled_locked())
@@ -798,9 +824,6 @@ void trace_dump_draw_info(const struct pipe_draw_info *state)
    trace_dump_member(uint, state, start_instance);
    trace_dump_member(uint, state, instance_count);
 
-   trace_dump_member(uint, state, vertices_per_patch);
-
-   trace_dump_member(int,  state, index_bias);
    trace_dump_member(uint, state, min_index);
    trace_dump_member(uint, state, max_index);
 
@@ -811,14 +834,26 @@ void trace_dump_draw_info(const struct pipe_draw_info *state)
    trace_dump_struct_end();
 }
 
-void trace_dump_draw_start_count(const struct pipe_draw_start_count *state)
+void trace_dump_draw_vertex_state_info(struct pipe_draw_vertex_state_info state)
+{
+   if (!trace_dumping_enabled_locked())
+      return;
+   /* state is passed by value, so members are read through &state and no NULL check is needed. */
+   trace_dump_struct_begin("pipe_draw_vertex_state_info");
+   trace_dump_member(uint, &state, mode);
+   trace_dump_member(uint, &state, take_vertex_state_ownership);
+   trace_dump_struct_end();
+}
+
+void trace_dump_draw_start_count(const struct pipe_draw_start_count_bias *state)
 {
    if (!trace_dumping_enabled_locked())
       return;
 
-   trace_dump_struct_begin("pipe_draw_start_count");
+   trace_dump_struct_begin("pipe_draw_start_count_bias"); /* dumped struct name matches the parameter's type */
    trace_dump_member(uint, state, start);
    trace_dump_member(uint, state, count);
+   trace_dump_member(int,  state, index_bias); /* dumped as int, unlike start/count (uint) */
    trace_dump_struct_end();
 }
 
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.h b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.h
index 1a969c750..f5633b3be 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.h
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_dump_state.h
@@ -86,7 +86,9 @@ void trace_dump_shader_buffer(const struct pipe_shader_buffer *buffer);
 
 void trace_dump_draw_info(const struct pipe_draw_info *state);
 
-void trace_dump_draw_start_count(const struct pipe_draw_start_count *state);
+void trace_dump_draw_vertex_state_info(struct pipe_draw_vertex_state_info state);
+
+void trace_dump_draw_start_count(const struct pipe_draw_start_count_bias *state);
 
 void trace_dump_draw_indirect_info(const struct pipe_draw_indirect_info *state);
 
@@ -99,4 +101,5 @@ void trace_dump_grid_info(const struct pipe_grid_info *state);
 
 void trace_dump_image_view(const struct pipe_image_view *view);
 
+void trace_dump_memory_info(const struct pipe_memory_info *state);
 #endif /* TR_STATE_H */
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.c b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.c
index 95ce875bc..32f0bba7c 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.c
@@ -27,6 +27,7 @@
 
 #include "util/format/u_format.h"
 #include "util/u_memory.h"
+#include "util/hash_table.h"
 #include "util/simple_list.h"
 
 #include "tr_dump.h"
@@ -39,6 +40,7 @@
 
 
 static bool trace = false;
+static struct hash_table *trace_screens;
 
 static const char *
 trace_screen_get_name(struct pipe_screen *_screen)
@@ -262,6 +264,104 @@ trace_screen_is_format_supported(struct pipe_screen *_screen,
    return result;
 }
 
+static void
+trace_context_replace_buffer_storage(struct pipe_context *_pipe,
+                                     struct pipe_resource *dst,
+                                     struct pipe_resource *src,
+                                     unsigned num_rebinds,
+                                     uint32_t rebind_mask,
+                                     unsigned delete_buffer_id)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   /* Tracing stand-in for the replace_buffer_storage callback (installed by trace_context_create_threaded). */
+   trace_dump_call_begin("pipe_context", "replace_buffer_storage");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, dst);
+   trace_dump_arg(ptr, src);
+   trace_dump_arg(uint, num_rebinds);
+   trace_dump_arg(uint, rebind_mask);
+   trace_dump_arg(uint, delete_buffer_id);
+   trace_dump_call_end();
+   /* Forward to the original callback saved in the trace context, using the wrapped context. */
+   tr_ctx->replace_buffer_storage(pipe, dst, src, num_rebinds, rebind_mask, delete_buffer_id);
+}
+
+static struct pipe_fence_handle *
+trace_context_create_fence(struct pipe_context *_pipe, struct tc_unflushed_batch_token *token)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+   /* Tracing stand-in for the create_fence callback (installed by trace_context_create_threaded). */
+   trace_dump_call_begin("pipe_context", "create_fence");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(ptr, token);
+   /* Call the original callback saved in the trace context and record its result. */
+   struct pipe_fence_handle *ret = tr_ctx->create_fence(pipe, token);
+   trace_dump_ret(ptr, ret);
+   trace_dump_call_end();
+
+   return ret;
+}
+
+static bool
+trace_context_is_resource_busy(struct pipe_screen *_screen,
+                               struct pipe_resource *resource,
+                               unsigned usage)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   bool result;
+   /* Tracing stand-in for the is_resource_busy callback; note it takes a screen, not a context. */
+   trace_dump_call_begin("pipe_screen", "is_resource_busy");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, resource);
+   trace_dump_arg(uint, usage);
+   /* The original callback is stored on the trace screen by trace_context_create_threaded. */
+   result = tr_scr->is_resource_busy(screen, resource, usage);
+
+   trace_dump_ret(bool, result);
+
+   trace_dump_call_end();
+
+   return result;
+}
+
+struct pipe_context *
+trace_context_create_threaded(struct pipe_screen *screen, struct pipe_context *pipe,
+                              tc_replace_buffer_storage_func *replace_buffer,
+                              struct threaded_context_options *options)
+{
+   if (!trace_screens) /* no trace screen has been registered: nothing to wrap */
+      return pipe;
+   /* trace_screens is keyed by the wrapped (driver) screen; see the insert in trace_screen_create. */
+   struct hash_entry *he = _mesa_hash_table_search(trace_screens, screen);
+   if (!he)
+      return pipe;
+   struct trace_screen *tr_scr = trace_screen(he->data);
+   /* With trace_tc set (GALLIUM_TRACE_TC) the wrapping is done in trace_screen_context_create instead. */
+   if (tr_scr->trace_tc)
+      return pipe;
+
+   struct pipe_context *ctx = trace_context_create(tr_scr, pipe);
+   if (!ctx) /* wrapping failed: fall back to the untraced context */
+      return pipe;
+   /* Save the driver's callbacks, then substitute the tracing versions that call back into them. */
+   struct trace_context *tr_ctx = trace_context(ctx);
+   tr_ctx->replace_buffer_storage = *replace_buffer;
+   tr_ctx->create_fence = options->create_fence; /* NOTE(review): options and replace_buffer are dereferenced unconditionally */
+   tr_scr->is_resource_busy = options->is_resource_busy; /* stored on the screen, so overwritten by each call */
+   tr_ctx->threaded = true;
+   *replace_buffer = trace_context_replace_buffer_storage;
+   if (options->create_fence) /* optional callbacks are only replaced when the driver supplied one */
+      options->create_fence = trace_context_create_fence;
+   if (options->is_resource_busy)
+      options->is_resource_busy = trace_context_is_resource_busy;
+   return ctx;
+}
 
 static struct pipe_context *
 trace_screen_context_create(struct pipe_screen *_screen, void *priv,
@@ -271,19 +371,20 @@ trace_screen_context_create(struct pipe_screen *_screen, void *priv,
    struct pipe_screen *screen = tr_scr->screen;
    struct pipe_context *result;
 
+   result = screen->context_create(screen, priv, flags);
+
    trace_dump_call_begin("pipe_screen", "context_create");
 
    trace_dump_arg(ptr, screen);
    trace_dump_arg(ptr, priv);
    trace_dump_arg(uint, flags);
 
-   result = screen->context_create(screen, priv, flags);
-
    trace_dump_ret(ptr, result);
 
    trace_dump_call_end();
 
-   result = trace_context_create(tr_scr, result);
+   if (result && (tr_scr->trace_tc || result->draw_vbo != tc_draw_vbo))
+      result = trace_context_create(tr_scr, result);
 
    return result;
 }
@@ -299,7 +400,7 @@ trace_screen_flush_frontbuffer(struct pipe_screen *_screen,
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
    struct pipe_screen *screen = tr_scr->screen;
-   struct pipe_context *pipe = _pipe ? trace_context(_pipe)->pipe : NULL;
+   struct pipe_context *pipe = _pipe ? trace_get_possibly_threaded_context(_pipe) : NULL;
 
    trace_dump_call_begin("pipe_screen", "flush_frontbuffer");
 
@@ -311,9 +412,9 @@ trace_screen_flush_frontbuffer(struct pipe_screen *_screen,
    trace_dump_arg(ptr, context_private);
    */
 
-   screen->flush_frontbuffer(screen, pipe, resource, level, layer, context_private, sub_box);
-
    trace_dump_call_end();
+
+   screen->flush_frontbuffer(screen, pipe, resource, level, layer, context_private, sub_box);
 }
 
 
@@ -430,7 +531,7 @@ trace_screen_free_memory(struct pipe_screen *_screen,
    trace_dump_call_end();
 }
 
-static void
+static bool
 trace_screen_resource_bind_backing(struct pipe_screen *_screen,
                                    struct pipe_resource *resource,
                                    struct pipe_memory_allocation *pmem,
@@ -438,6 +539,7 @@ trace_screen_resource_bind_backing(struct pipe_screen *_screen,
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
    struct pipe_screen *screen = tr_scr->screen;
+   bool result;
 
    trace_dump_call_begin("pipe_screen", "resource_bind_backing");
 
@@ -446,9 +548,13 @@ trace_screen_resource_bind_backing(struct pipe_screen *_screen,
    trace_dump_arg(ptr, pmem);
    trace_dump_arg(uint, offset);
 
-   screen->resource_bind_backing(screen, resource, pmem, offset);
+   result = screen->resource_bind_backing(screen, resource, pmem, offset);
+
+   trace_dump_ret(bool, result);
 
    trace_dump_call_end();
+
+   return result;
 }
 
 static struct pipe_resource *
@@ -504,6 +610,31 @@ trace_screen_resource_create(struct pipe_screen *_screen,
 }
 
 static struct pipe_resource *
+trace_screen_resource_create_with_modifiers(struct pipe_screen *_screen, const struct pipe_resource *templat,
+                                            const uint64_t *modifiers, int modifiers_count)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   struct pipe_resource *result;
+   /* Trace wrapper for resource_create_with_modifiers on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "resource_create_with_modifiers");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(resource_template, templat);
+   trace_dump_arg_array(uint, modifiers, modifiers_count);
+
+   result = screen->resource_create_with_modifiers(screen, templat, modifiers, modifiers_count);
+
+   trace_dump_ret(ptr, result);
+
+   trace_dump_call_end();
+   /* On success, make the resource refer to the screen the caller passed in, not the wrapped one. */
+   if (result)
+      result->screen = _screen;
+   return result;
+}
+
+static struct pipe_resource *
 trace_screen_resource_from_handle(struct pipe_screen *_screen,
                                  const struct pipe_resource *templ,
                                  struct winsys_handle *handle,
@@ -540,12 +671,12 @@ trace_screen_resource_get_handle(struct pipe_screen *_screen,
                                  unsigned usage)
 {
    struct trace_screen *tr_screen = trace_screen(_screen);
-   struct trace_context *tr_pipe = _pipe ? trace_context(_pipe) : NULL;
+   struct pipe_context *pipe = _pipe ? trace_get_possibly_threaded_context(_pipe) : NULL;
    struct pipe_screen *screen = tr_screen->screen;
 
    /* TODO trace call */
 
-   return screen->resource_get_handle(screen, tr_pipe ? tr_pipe->pipe : NULL,
+   return screen->resource_get_handle(screen, pipe,
                                       resource, handle, usage);
 }
 
@@ -561,12 +692,12 @@ trace_screen_resource_get_param(struct pipe_screen *_screen,
                                 uint64_t *value)
 {
    struct trace_screen *tr_screen = trace_screen(_screen);
-   struct trace_context *tr_pipe = _pipe ? trace_context(_pipe) : NULL;
+   struct pipe_context *pipe = _pipe ? trace_get_possibly_threaded_context(_pipe) : NULL;
    struct pipe_screen *screen = tr_screen->screen;
 
    /* TODO trace call */
 
-   return screen->resource_get_param(screen, tr_pipe ? tr_pipe->pipe : NULL,
+   return screen->resource_get_param(screen, pipe,
                                      resource, plane, layer, level, param,
                                      handle_usage, value);
 }
@@ -704,9 +835,12 @@ trace_screen_fence_finish(struct pipe_screen *_screen,
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
    struct pipe_screen *screen = tr_scr->screen;
-   struct pipe_context *ctx = _ctx ? trace_context(_ctx)->pipe : NULL;
+   struct pipe_context *ctx = _ctx ? trace_get_possibly_threaded_context(_ctx) : NULL;
    int result;
 
+   result = screen->fence_finish(screen, ctx, fence, timeout);
+
+
    trace_dump_call_begin("pipe_screen", "fence_finish");
 
    trace_dump_arg(ptr, screen);
@@ -714,8 +848,6 @@ trace_screen_fence_finish(struct pipe_screen *_screen,
    trace_dump_arg(ptr, fence);
    trace_dump_arg(uint, timeout);
 
-   result = screen->fence_finish(screen, ctx, fence, timeout);
-
    trace_dump_ret(bool, result);
 
    trace_dump_call_end();
@@ -786,12 +918,12 @@ trace_screen_get_timestamp(struct pipe_screen *_screen)
    return result;
 }
 
-static void
-trace_screen_finalize_nir(struct pipe_screen *_screen, void *nir, bool optimize)
+static char *
+trace_screen_finalize_nir(struct pipe_screen *_screen, void *nir)
 {
    struct pipe_screen *screen = trace_screen(_screen)->screen;
 
-   screen->finalize_nir(screen, nir, optimize);
+   return screen->finalize_nir(screen, nir); /* forwarded to the wrapped screen; nothing is written to the trace */
 }
 
 static void
@@ -804,11 +936,154 @@ trace_screen_destroy(struct pipe_screen *_screen)
    trace_dump_arg(ptr, screen);
    trace_dump_call_end();
 
+   if (trace_screens) {
+      struct hash_entry *he = _mesa_hash_table_search(trace_screens, screen);
+      if (he) {
+         _mesa_hash_table_remove(trace_screens, he);
+         if (!_mesa_hash_table_num_entries(trace_screens)) {
+            _mesa_hash_table_destroy(trace_screens, NULL);
+            trace_screens = NULL;
+         }
+      }
+   }
+
    screen->destroy(screen);
 
    FREE(tr_scr);
 }
 
+static void
+trace_screen_query_memory_info(struct pipe_screen *_screen, struct pipe_memory_info *info)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   /* Trace wrapper for query_memory_info on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "query_memory_info");
+
+   trace_dump_arg(ptr, screen);
+
+   screen->query_memory_info(screen, info);
+   /* The function returns void; the filled-in info struct is recorded as the call's return value. */
+   trace_dump_ret(memory_info, info);
+
+   trace_dump_call_end();
+}
+
+static void
+trace_screen_query_dmabuf_modifiers(struct pipe_screen *_screen, enum pipe_format format, int max, uint64_t *modifiers, unsigned int *external_only, int *count)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   /* Trace wrapper for query_dmabuf_modifiers on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "query_dmabuf_modifiers");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(format, format);
+   trace_dump_arg(int, max);
+   /* Inputs are dumped above; the output arrays are dumped after the wrapped call below. */
+   screen->query_dmabuf_modifiers(screen, format, max, modifiers, external_only, count);
+   /* NOTE(review): *count is read unconditionally below, so count must be non-NULL. */
+   if (max)
+      trace_dump_arg_array(uint, modifiers, *count);
+   else
+      trace_dump_arg_array(uint, modifiers, max); /* max is 0 here, so the dumped length is 0 */
+   trace_dump_arg_array(uint, external_only, max); /* NOTE(review): length is max, not *count -- confirm intended */
+   trace_dump_ret_begin();
+   trace_dump_uint(*count);
+   trace_dump_ret_end();
+
+   trace_dump_call_end();
+}
+
+static bool
+trace_screen_is_dmabuf_modifier_supported(struct pipe_screen *_screen, uint64_t modifier, enum pipe_format format, bool *external_only)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   /* Trace wrapper for is_dmabuf_modifier_supported on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "is_dmabuf_modifier_supported");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(uint, modifier);
+   trace_dump_arg(format, format);
+
+   bool ret = screen->is_dmabuf_modifier_supported(screen, modifier, format, external_only);
+   /* external_only is dumped after the call; a NULL pointer is recorded as false. */
+   trace_dump_arg_begin("external_only");
+   trace_dump_bool(external_only ? *external_only : false);
+   trace_dump_arg_end();
+
+   trace_dump_ret(bool, ret);
+
+   trace_dump_call_end();
+   return ret;
+}
+
+static unsigned int
+trace_screen_get_dmabuf_modifier_planes(struct pipe_screen *_screen, uint64_t modifier, enum pipe_format format)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   /* Trace wrapper for get_dmabuf_modifier_planes on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "get_dmabuf_modifier_planes");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(uint, modifier);
+   trace_dump_arg(format, format);
+
+   unsigned ret = screen->get_dmabuf_modifier_planes(screen, modifier, format);
+   /* The wrapped screen's result is recorded and returned unchanged. */
+   trace_dump_ret(uint, ret);
+
+   trace_dump_call_end();
+   return ret;
+}
+
+static struct pipe_vertex_state *
+trace_screen_create_vertex_state(struct pipe_screen *_screen,
+                                 struct pipe_vertex_buffer *buffer,
+                                 const struct pipe_vertex_element *elements,
+                                 unsigned num_elements,
+                                 struct pipe_resource *indexbuf,
+                                 uint32_t full_velem_mask)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   /* Trace wrapper for create_vertex_state on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "create_vertex_state");
+
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, buffer->buffer.resource); /* NOTE(review): buffer is dereferenced unconditionally, so it must be non-NULL */
+   trace_dump_arg(vertex_buffer, buffer);
+   trace_dump_arg_begin("elements");
+   trace_dump_struct_array(vertex_element, elements, num_elements);
+   trace_dump_arg_end();
+   trace_dump_arg(uint, num_elements);
+   trace_dump_arg(ptr, indexbuf);
+   trace_dump_arg(uint, full_velem_mask);
+   /* The wrapped screen's result is recorded as a pointer and returned unchanged. */
+   struct pipe_vertex_state *vstate =
+      screen->create_vertex_state(screen, buffer, elements, num_elements,
+                                  indexbuf, full_velem_mask);
+   trace_dump_ret(ptr, vstate);
+   trace_dump_call_end();
+   return vstate;
+}
+
+static void trace_screen_vertex_state_destroy(struct pipe_screen *_screen,
+                                              struct pipe_vertex_state *state)
+{
+   struct trace_screen *tr_scr = trace_screen(_screen);
+   struct pipe_screen *screen = tr_scr->screen;
+   /* Trace wrapper for vertex_state_destroy on the wrapped screen. */
+   trace_dump_call_begin("pipe_screen", "vertex_state_destroy");
+   trace_dump_arg(ptr, screen);
+   trace_dump_arg(ptr, state);
+   trace_dump_call_end();
+   /* The trace record is closed before the wrapped screen destroys the state. */
+   screen->vertex_state_destroy(screen, state);
+}
+
 bool
 trace_enabled(void)
 {
@@ -874,6 +1149,7 @@ trace_screen_create(struct pipe_screen *screen)
    assert(screen->context_create);
    tr_scr->base.context_create = trace_screen_context_create;
    tr_scr->base.resource_create = trace_screen_resource_create;
+   SCR_INIT(resource_create_with_modifiers);
    tr_scr->base.resource_create_unbacked = trace_screen_resource_create_unbacked;
    tr_scr->base.resource_bind_backing = trace_screen_resource_bind_backing;
    tr_scr->base.resource_from_handle = trace_screen_resource_from_handle;
@@ -881,6 +1157,10 @@ trace_screen_create(struct pipe_screen *screen)
    tr_scr->base.free_memory = trace_screen_free_memory;
    tr_scr->base.map_memory = trace_screen_map_memory;
    tr_scr->base.unmap_memory = trace_screen_unmap_memory;
+   SCR_INIT(query_memory_info);
+   SCR_INIT(query_dmabuf_modifiers);
+   SCR_INIT(is_dmabuf_modifier_supported);
+   SCR_INIT(get_dmabuf_modifier_planes);
    SCR_INIT(check_resource_capability);
    tr_scr->base.resource_get_handle = trace_screen_resource_get_handle;
    SCR_INIT(resource_get_param);
@@ -898,12 +1178,21 @@ trace_screen_create(struct pipe_screen *screen)
    SCR_INIT(get_driver_uuid);
    SCR_INIT(get_device_uuid);
    SCR_INIT(finalize_nir);
+   SCR_INIT(create_vertex_state);
+   SCR_INIT(vertex_state_destroy);
+   tr_scr->base.transfer_helper = screen->transfer_helper;
 
    tr_scr->screen = screen;
 
    trace_dump_ret(ptr, screen);
    trace_dump_call_end();
 
+   if (!trace_screens)
+      trace_screens = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   _mesa_hash_table_insert(trace_screens, screen, tr_scr);
+
+   tr_scr->trace_tc = debug_get_bool_option("GALLIUM_TRACE_TC", false);
+
    return &tr_scr->base;
 
 error2:
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.h b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.h
index 65ea4fb6a..cce41d3fc 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.h
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_screen.h
@@ -31,7 +31,7 @@
 
 #include "pipe/p_screen.h"
 #include "os/os_thread.h"
-
+#include "util/u_threaded_context.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -51,6 +51,8 @@ struct trace_screen
    struct pipe_screen base;
 
    struct pipe_screen *screen;
+   tc_is_resource_busy is_resource_busy;
+   bool trace_tc;
 };
 
 
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.c b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.c
index d644e1b08..d9fba6d46 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.c
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.c
@@ -70,7 +70,6 @@ error:
 void
 trace_surf_destroy(struct trace_surface *tr_surf)
 {
-   trace_context_check(tr_surf->base.context);
    pipe_resource_reference(&tr_surf->base.texture, NULL);
    pipe_surface_reference(&tr_surf->surface, NULL);
    FREE(tr_surf);
@@ -91,18 +90,21 @@ trace_transfer_create(struct trace_context *tr_ctx,
    if (!tr_trans)
       goto error;
 
-   memcpy(&tr_trans->base, transfer, sizeof(struct pipe_transfer));
+   memcpy(&tr_trans->base, transfer, tr_ctx->threaded ? sizeof(struct threaded_transfer) : sizeof(struct pipe_transfer));
 
-   tr_trans->base.resource = NULL;
+   tr_trans->base.b.resource = NULL;
    tr_trans->transfer = transfer;
 
-   pipe_resource_reference(&tr_trans->base.resource, res);
-   assert(tr_trans->base.resource == res);
+   pipe_resource_reference(&tr_trans->base.b.resource, res);
+   assert(tr_trans->base.b.resource == res);
 
-   return &tr_trans->base;
+   return &tr_trans->base.b;
 
 error:
-   tr_ctx->pipe->transfer_unmap(tr_ctx->pipe, transfer);
+   if (res->target == PIPE_BUFFER)
+      tr_ctx->pipe->buffer_unmap(tr_ctx->pipe, transfer);
+   else
+      tr_ctx->pipe->texture_unmap(tr_ctx->pipe, transfer);
    return NULL;
 }
 
@@ -111,7 +113,7 @@ void
 trace_transfer_destroy(struct trace_context *tr_context,
                        struct trace_transfer *tr_trans)
 {
-   pipe_resource_reference(&tr_trans->base.resource, NULL);
+   pipe_resource_reference(&tr_trans->base.b.resource, NULL);
    FREE(tr_trans);
 }
 
diff --git a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.h b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.h
index e5dfc53fd..b9caf968d 100644
--- a/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.h
+++ b/lib/mesa/src/gallium/auxiliary/driver_trace/tr_texture.h
@@ -33,6 +33,7 @@
 #include "pipe/p_state.h"
 
 #include "tr_screen.h"
+#include "util/u_threaded_context.h"
 
 struct trace_context;
 
@@ -56,6 +57,7 @@ struct trace_surface
 struct trace_sampler_view
 {
    struct pipe_sampler_view base;
+   unsigned refcount;
 
    struct pipe_sampler_view *sampler_view;
 };
@@ -63,7 +65,7 @@ struct trace_sampler_view
 
 struct trace_transfer
 {
-   struct pipe_transfer base;
+   struct threaded_transfer base;
 
    struct pipe_transfer *transfer;
    struct pipe_context *pipe;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_const.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 18ece7324..4f4bddf44 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -42,7 +42,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_init.h"
-
+#include "lp_bld_limits.h"
 
 unsigned
 lp_mantissa(struct lp_type type)
@@ -256,7 +256,7 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type)
 
    elem_type = lp_build_elem_type(gallivm, type);
 
-   if(type.floating && type.width == 16)
+   if(!lp_has_fp16() && type.floating && type.width == 16)
       elems[0] = LLVMConstInt(elem_type, _mesa_float_to_half(1.0f), 0);
    else if(type.floating)
       elems[0] = LLVMConstReal(elem_type, 1.0);
@@ -303,7 +303,7 @@ lp_build_const_elem(struct gallivm_state *gallivm,
    LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type);
    LLVMValueRef elem;
 
-   if(type.floating && type.width == 16) {
+   if (!lp_has_fp16() && type.floating && type.width == 16) {
       elem = LLVMConstInt(elem_type, _mesa_float_to_half((float)val), 0);
    } else if(type.floating) {
       elem = LLVMConstReal(elem_type, val);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.c
index 28f722e93..d3d5e6dc9 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.c
@@ -176,9 +176,8 @@ void lp_build_coro_declare_malloc_hooks(struct gallivm_state *gallivm)
 
 LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id)
 {
-   LLVMValueRef do_alloc = lp_build_coro_alloc(gallivm, coro_id);
    LLVMTypeRef mem_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
-   LLVMValueRef alloc_mem_store = lp_build_alloca(gallivm, mem_ptr_type, "coro mem");
+   LLVMValueRef do_alloc = lp_build_coro_alloc(gallivm, coro_id);
    struct lp_build_if_state if_state_coro;
    lp_build_if(&if_state_coro, gallivm, do_alloc);
    LLVMValueRef coro_size = lp_build_coro_size(gallivm);
@@ -186,14 +185,40 @@ LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMVa
 
    assert(gallivm->coro_malloc_hook);
    alloc_mem = LLVMBuildCall(gallivm->builder, gallivm->coro_malloc_hook, &coro_size, 1, "");
-
-   LLVMBuildStore(gallivm->builder, alloc_mem, alloc_mem_store);
    lp_build_endif(&if_state_coro);
-   alloc_mem = LLVMBuildLoad(gallivm->builder, alloc_mem_store, "");
-   LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloc_mem);
+
+   LLVMValueRef phi = LLVMBuildPhi(gallivm->builder, mem_ptr_type, "");
+   LLVMValueRef zero_bool = LLVMConstNull(mem_ptr_type);
+   LLVMAddIncoming(phi, &alloc_mem, &if_state_coro.true_block, 1);
+   LLVMAddIncoming(phi, &zero_bool, &if_state_coro.entry_block, 1);
+
+   LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, phi);
    return coro_hdl;
 }
 
+LLVMValueRef lp_build_coro_alloc_mem_array(struct gallivm_state *gallivm,
+					   LLVMValueRef coro_hdl_ptr, LLVMValueRef coro_idx,
+					   LLVMValueRef coro_num_hdls)
+{
+   LLVMTypeRef mem_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
+   LLVMValueRef alloced_ptr = LLVMBuildLoad(gallivm->builder, coro_hdl_ptr, "");
+
+   LLVMValueRef not_alloced = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, alloced_ptr, LLVMConstNull(mem_ptr_type), "");
+   LLVMValueRef coro_size = lp_build_coro_size(gallivm);
+
+   struct lp_build_if_state if_state_coro;
+   lp_build_if(&if_state_coro, gallivm, not_alloced);
+
+   LLVMValueRef alloc_mem;
+   LLVMValueRef alloc_size = LLVMBuildMul(gallivm->builder, coro_num_hdls, coro_size, "");
+   assert(gallivm->coro_malloc_hook);
+   alloc_mem = LLVMBuildCall(gallivm->builder, gallivm->coro_malloc_hook, &alloc_size, 1, "");
+   LLVMBuildStore(gallivm->builder, alloc_mem, coro_hdl_ptr);
+   lp_build_endif(&if_state_coro);
+
+   return LLVMBuildMul(gallivm->builder, coro_size, coro_idx, "");
+}
+
 void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl)
 {
    LLVMValueRef alloc_mem = lp_build_coro_free(gallivm, coro_id, coro_hdl);
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.h
index 2ffc130c9..1853217ed 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_coro.h
@@ -55,6 +55,10 @@ LLVMValueRef lp_build_coro_suspend(struct gallivm_state *gallivm, bool last);
 LLVMValueRef lp_build_coro_alloc(struct gallivm_state *gallivm, LLVMValueRef id);
 
 LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id);
+
+LLVMValueRef lp_build_coro_alloc_mem_array(struct gallivm_state *gallivm,
+					   LLVMValueRef coro_hdl_ptr, LLVMValueRef coro_idx,
+					   LLVMValueRef coro_num_hdls);
 void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl);
 
 struct lp_build_coro_suspend_info {
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 1ea133264..a5dd7b80d 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -43,8 +43,8 @@
 #define GALLIVM_DEBUG_GC            (1 << 4)
 #define GALLIVM_DEBUG_DUMP_BC       (1 << 5)
 
-#define GALLIVM_PERF_NO_BRILINEAR    (1 << 0)
-#define GALLIVM_PERF_NO_RHO_APPROX   (1 << 1)
+#define GALLIVM_PERF_BRILINEAR       (1 << 0)
+#define GALLIVM_PERF_RHO_APPROX      (1 << 1)
 #define GALLIVM_PERF_NO_QUAD_LOD     (1 << 2)
 #define GALLIVM_PERF_NO_OPT          (1 << 3)
 #define GALLIVM_PERF_NO_AOS_SAMPLING (1 << 4)
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
index e17c7881e..497d403fa 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
@@ -2365,6 +2365,9 @@ lp_build_gather_rgtc(struct gallivm_state *gallivm,
                                              lp_build_const_int32(gallivm, 2), "");
          *green_hi = LLVMBuildExtractElement(builder, elem,
                                              lp_build_const_int32(gallivm, 3), "");
+      } else {
+         *green_lo = NULL;
+         *green_hi = NULL;
       }
    } else {
       LLVMValueRef tmp[4];
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.c
index 38afac47d..1ce4be0ec 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.c
@@ -27,14 +27,17 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_bitarit.h"
 #include "lp_bld_const.h"
+#include "lp_bld_conv.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_quad.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_intr.h"
 #include "lp_bld_struct.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_printf.h"
 #include "nir_deref.h"
+#include "nir_search_helpers.h"
 
 static void visit_cf_list(struct lp_build_nir_context *bld_base,
                           struct exec_list *list);
@@ -47,7 +50,7 @@ static LLVMValueRef cast_type(struct lp_build_nir_context *bld_base, LLVMValueRe
    case nir_type_float:
       switch (bit_size) {
       case 16:
-         return LLVMBuildBitCast(builder, val, LLVMVectorType(LLVMHalfTypeInContext(bld_base->base.gallivm->context), bld_base->base.type.length), "");
+         return LLVMBuildBitCast(builder, val, bld_base->half_bld.vec_type, "");
       case 32:
          return LLVMBuildBitCast(builder, val, bld_base->base.vec_type, "");
       case 64:
@@ -222,6 +225,8 @@ static LLVMValueRef flt_to_bool32(struct lp_build_nir_context *bld_base,
    LLVMValueRef result = lp_build_cmp(flt_bld, PIPE_FUNC_NOTEQUAL, val, flt_bld->zero);
    if (src_bit_size == 64)
       result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, "");
+   if (src_bit_size == 16)
+      result = LLVMBuildSExt(builder, result, bld_base->int_bld.vec_type, "");
    return result;
 }
 
@@ -240,6 +245,8 @@ static LLVMValueRef fcmp32(struct lp_build_nir_context *bld_base,
       result = lp_build_cmp(flt_bld, compare, src[0], src[1]);
    if (src_bit_size == 64)
       result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, "");
+   else if (src_bit_size == 16)
+      result = LLVMBuildSExt(builder, result, bld_base->int_bld.vec_type, "");
    return result;
 }
 
@@ -306,6 +313,9 @@ static LLVMValueRef emit_b2f(struct lp_build_nir_context *bld_base,
                                       "");
    result = LLVMBuildBitCast(builder, result, bld_base->base.vec_type, "");
    switch (bitsize) {
+   case 16:
+      result = LLVMBuildFPTrunc(builder, result, bld_base->half_bld.vec_type, "");
+      break;
    case 32:
       break;
    case 64:
@@ -447,6 +457,43 @@ merge_16bit(struct lp_build_nir_context *bld_base,
    return LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), "");
 }
 
+static LLVMValueRef get_signed_divisor(struct gallivm_state *gallivm,
+                                       struct lp_build_context *int_bld,
+                                       struct lp_build_context *mask_bld,
+                                       int src_bit_size,
+                                       LLVMValueRef src, LLVMValueRef divisor)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   /* Signed divides can raise SIGFPE when the numerator is the minimum value
+      and the divisor is -1; return divisor with int_bld->one selected there. */
+   /* pick the minimum signed value for src_bit_size, then mask src == min_val */
+   long long min_val;
+   switch (src_bit_size) {
+   case 8:
+      min_val = INT8_MIN;
+      break;
+   case 16:
+      min_val = INT16_MIN;
+      break;
+   default: /* any other bit size is handled like the 32-bit case */
+   case 32:
+      min_val = INT_MIN;
+      break;
+   case 64:
+      min_val = INT64_MIN;
+      break;
+   }
+   LLVMValueRef div_mask2 = lp_build_cmp(mask_bld, PIPE_FUNC_EQUAL, src,
+                                         lp_build_const_int_vec(gallivm, int_bld->type, min_val));
+   /* second mask: divisor == -1; both masks are ANDed before the select */
+   LLVMValueRef div_mask3 = lp_build_cmp(mask_bld, PIPE_FUNC_EQUAL, divisor,
+                                         lp_build_const_int_vec(gallivm, int_bld->type, -1));
+   div_mask2 = LLVMBuildAnd(builder, div_mask2, div_mask3, "");
+
+   divisor = lp_build_select(mask_bld, div_mask2, int_bld->one, divisor);
+   return divisor;
+}
+
 static LLVMValueRef
 do_int_divide(struct lp_build_nir_context *bld_base,
               bool is_unsigned, unsigned src_bit_size,
@@ -456,16 +503,16 @@ do_int_divide(struct lp_build_nir_context *bld_base,
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *int_bld = get_int_bld(bld_base, is_unsigned, src_bit_size);
    struct lp_build_context *mask_bld = get_int_bld(bld_base, true, src_bit_size);
+
+   /* avoid divide by 0. Converted divisor from 0 to -1 */
    LLVMValueRef div_mask = lp_build_cmp(mask_bld, PIPE_FUNC_EQUAL, src2,
                                         mask_bld->zero);
 
+   LLVMValueRef divisor = LLVMBuildOr(builder, div_mask, src2, "");
    if (!is_unsigned) {
-      /* INT_MIN (0x80000000) / -1 (0xffffffff) causes sigfpe, seen with blender. */
-      div_mask = LLVMBuildAnd(builder, div_mask, lp_build_const_int_vec(gallivm, int_bld->type, 0x7fffffff), "");
+      divisor = get_signed_divisor(gallivm, int_bld, mask_bld,
+                                   src_bit_size, src, divisor);
    }
-   LLVMValueRef divisor = LLVMBuildOr(builder,
-                                      div_mask,
-                                      src2, "");
    LLVMValueRef result = lp_build_div(int_bld, src, divisor);
 
    if (!is_unsigned) {
@@ -485,11 +532,16 @@ do_int_mod(struct lp_build_nir_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *int_bld = get_int_bld(bld_base, is_unsigned, src_bit_size);
-   LLVMValueRef div_mask = lp_build_cmp(int_bld, PIPE_FUNC_EQUAL, src2,
-                                        int_bld->zero);
+   struct lp_build_context *mask_bld = get_int_bld(bld_base, true, src_bit_size);
+   LLVMValueRef div_mask = lp_build_cmp(mask_bld, PIPE_FUNC_EQUAL, src2,
+                                        mask_bld->zero);
    LLVMValueRef divisor = LLVMBuildOr(builder,
                                       div_mask,
                                       src2, "");
+   if (!is_unsigned) {
+      divisor = get_signed_divisor(gallivm, int_bld, mask_bld,
+                                   src_bit_size, src, divisor);
+   }
    LLVMValueRef result = lp_build_mod(int_bld, src, divisor);
    return LLVMBuildOr(builder, div_mask, result, "");
 }
@@ -502,7 +554,7 @@ do_quantize_to_f16(struct lp_build_nir_context *bld_base,
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef result, cond, cond2, temp;
 
-   result = LLVMBuildFPTrunc(builder, src, LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), bld_base->base.type.length), "");
+   result = LLVMBuildFPTrunc(builder, src, bld_base->half_bld.vec_type, "");
    result = LLVMBuildFPExt(builder, result, bld_base->base.vec_type, "");
 
    temp = lp_build_abs(get_flt_bld(bld_base, 32), result);
@@ -516,13 +568,18 @@ do_quantize_to_f16(struct lp_build_nir_context *bld_base,
 }
 
 static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
-                                  nir_op op, unsigned src_bit_size[NIR_MAX_VEC_COMPONENTS], LLVMValueRef src[NIR_MAX_VEC_COMPONENTS])
+                                  const nir_alu_instr *instr,
+                                  unsigned src_bit_size[NIR_MAX_VEC_COMPONENTS],
+                                  LLVMValueRef src[NIR_MAX_VEC_COMPONENTS])
 {
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef result;
-   enum gallivm_nan_behavior minmax_nan = bld_base->shader->info.stage == MESA_SHADER_KERNEL ? GALLIVM_NAN_RETURN_OTHER : GALLIVM_NAN_BEHAVIOR_UNDEFINED;
-   switch (op) {
+
+   switch (instr->op) {
+   case nir_op_b2f16:
+      result = emit_b2f(bld_base, src[0], 16);
+      break;
    case nir_op_b2f32:
       result = emit_b2f(bld_base, src[0], 32);
       break;
@@ -546,6 +603,10 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       break;
    case nir_op_bit_count:
       result = lp_build_popcount(get_int_bld(bld_base, false, src_bit_size[0]), src[0]);
+      if (src_bit_size[0] < 32)
+         result = LLVMBuildZExt(builder, result, bld_base->int_bld.vec_type, "");
+      else if (src_bit_size[0] > 32)
+         result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, "");
       break;
    case nir_op_bitfield_select:
       result = lp_build_xor(&bld_base->uint_bld, src[2], lp_build_and(&bld_base->uint_bld, src[0], lp_build_xor(&bld_base->uint_bld, src[1], src[2])));
@@ -561,7 +622,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
          src[0] = LLVMBuildFPTrunc(builder, src[0],
                                    bld_base->base.vec_type, "");
       result = LLVMBuildFPTrunc(builder, src[0],
-                                LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), bld_base->base.type.length), "");
+                                bld_base->half_bld.vec_type, "");
       break;
    case nir_op_f2f32:
       if (src_bit_size[0] < 32)
@@ -624,17 +685,17 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = lp_build_ceil(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fcos:
-      result = lp_build_cos(&bld_base->base, src[0]);
+      result = lp_build_cos(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fddx:
    case nir_op_fddx_coarse:
    case nir_op_fddx_fine:
-      result = lp_build_ddx(&bld_base->base, src[0]);
+      result = lp_build_ddx(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fddy:
    case nir_op_fddy_coarse:
    case nir_op_fddy_fine:
-      result = lp_build_ddy(&bld_base->base, src[0]);
+      result = lp_build_ddy(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fdiv:
       result = lp_build_div(get_flt_bld(bld_base, src_bit_size[0]),
@@ -644,7 +705,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = fcmp32(bld_base, PIPE_FUNC_EQUAL, src_bit_size[0], src);
       break;
    case nir_op_fexp2:
-      result = lp_build_exp2(&bld_base->base, src[0]);
+      result = lp_build_exp2(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_ffloor:
       result = lp_build_floor(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
@@ -670,16 +731,45 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
          result = LLVMBuildTrunc(builder, result, bld_base->uint_bld.vec_type, "");
       break;
    }
+   case nir_op_fisfinite32:
+      unreachable("Should have been lowered in nir_opt_algebraic_late.");
    case nir_op_flog2:
-      result = lp_build_log2_safe(&bld_base->base, src[0]);
+      result = lp_build_log2_safe(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_flt:
    case nir_op_flt32:
       result = fcmp32(bld_base, PIPE_FUNC_LESS, src_bit_size[0], src);
       break;
-   case nir_op_fmin:
-      result = lp_build_min_ext(get_flt_bld(bld_base, src_bit_size[0]), src[0], src[1], minmax_nan);
+   case nir_op_fmax:
+   case nir_op_fmin: {
+      enum gallivm_nan_behavior minmax_nan;
+      int first = 0;
+
+      /* If one of the sources is known to be a number (i.e., not NaN), then
+       * better code can be generated by passing that information along.
+       */
+      if (is_a_number(bld_base->range_ht, instr, 1,
+                      0 /* unused num_components */,
+                      NULL /* unused swizzle */)) {
+         minmax_nan = GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN;
+      } else if (is_a_number(bld_base->range_ht, instr, 0,
+                             0 /* unused num_components */,
+                             NULL /* unused swizzle */)) {
+         first = 1;
+         minmax_nan = GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN;
+      } else {
+         minmax_nan = GALLIVM_NAN_RETURN_OTHER;
+      }
+
+      if (instr->op == nir_op_fmin) {
+         result = lp_build_min_ext(get_flt_bld(bld_base, src_bit_size[0]),
+                                   src[first], src[1 - first], minmax_nan);
+      } else {
+         result = lp_build_max_ext(get_flt_bld(bld_base, src_bit_size[0]),
+                                   src[first], src[1 - first], minmax_nan);
+      }
       break;
+   }
    case nir_op_fmod: {
       struct lp_build_context *flt_bld = get_flt_bld(bld_base, src_bit_size[0]);
       result = lp_build_div(flt_bld, src[0], src[1]);
@@ -692,9 +782,6 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = lp_build_mul(get_flt_bld(bld_base, src_bit_size[0]),
                             src[0], src[1]);
       break;
-   case nir_op_fmax:
-      result = lp_build_max_ext(get_flt_bld(bld_base, src_bit_size[0]), src[0], src[1], minmax_nan);
-      break;
    case nir_op_fneu32:
       result = fcmp32(bld_base, PIPE_FUNC_NOTEQUAL, src_bit_size[0], src);
       break;
@@ -702,7 +789,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = lp_build_negate(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fpow:
-      result = lp_build_pow(&bld_base->base, src[0], src[1]);
+      result = lp_build_pow(get_flt_bld(bld_base, src_bit_size[0]), src[0], src[1]);
       break;
    case nir_op_fquantize2f16:
       result = do_quantize_to_f16(bld_base, src[0]);
@@ -711,7 +798,13 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = lp_build_rcp(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fround_even:
-      result = lp_build_round(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
+      if (src_bit_size[0] == 16) {
+	 struct lp_build_context *bld = get_flt_bld(bld_base, 16);
+	 char intrinsic[64];
+	 lp_format_intrinsic(intrinsic, 64, "llvm.roundeven", bld->vec_type);
+	 result = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, src[0]);
+      } else
+	 result = lp_build_round(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_frsq:
       result = lp_build_rsqrt(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
@@ -723,7 +816,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = lp_build_sgn(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fsin:
-      result = lp_build_sin(&bld_base->base, src[0]);
+      result = lp_build_sin(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
       break;
    case nir_op_fsqrt:
       result = lp_build_sqrt(get_flt_bld(bld_base, src_bit_size[0]), src[0]);
@@ -734,6 +827,10 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
    case nir_op_i2b32:
       result = int_to_bool32(bld_base, src_bit_size[0], false, src[0]);
       break;
+   case nir_op_i2f16:
+      result = LLVMBuildSIToFP(builder, src[0],
+                               bld_base->half_bld.vec_type, "");
+      break;
    case nir_op_i2f32:
       result = lp_build_int_to_float(&bld_base->base, src[0]);
       break;
@@ -874,6 +971,10 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base,
       result = LLVMBuildBitCast(builder, tmp, bld_base->uint64_bld.vec_type, "");
       break;
    }
+   case nir_op_u2f16:
+      result = LLVMBuildUIToFP(builder, src[0],
+                               bld_base->half_bld.vec_type, "");
+      break;
    case nir_op_u2f32:
       result = LLVMBuildUIToFP(builder, src[0], bld_base->base.vec_type, "");
       break;
@@ -970,14 +1071,14 @@ static void visit_alu(struct lp_build_nir_context *bld_base, const nir_alu_instr
    case nir_op_unpack_half_2x16:
       src_components = 1;
       break;
-   case nir_op_cube_face_coord:
-   case nir_op_cube_face_index:
+   case nir_op_cube_face_coord_amd:
+   case nir_op_cube_face_index_amd:
       src_components = 3;
       break;
    case nir_op_fsum2:
    case nir_op_fsum3:
    case nir_op_fsum4:
-      src_components = nir_src_num_components(instr->src[0].src);
+      src_components = nir_op_infos[instr->op].input_sizes[0];
       break;
    default:
       src_components = num_components;
@@ -994,7 +1095,7 @@ static void visit_alu(struct lp_build_nir_context *bld_base, const nir_alu_instr
          result[i] = cast_type(bld_base, src[i], nir_op_infos[instr->op].input_types[i], src_bit_size[i]);
       }
    } else if (instr->op == nir_op_fsum4 || instr->op == nir_op_fsum3 || instr->op == nir_op_fsum2) {
-      for (unsigned c = 0; c < nir_src_num_components(instr->src[0].src); c++) {
+      for (unsigned c = 0; c < nir_op_infos[instr->op].input_sizes[0]; c++) {
          LLVMValueRef temp_chan = LLVMBuildExtractValue(gallivm->builder,
                                                           src[0], c, "");
          temp_chan = cast_type(bld_base, temp_chan, nir_op_infos[instr->op].input_types[0], src_bit_size[0]);
@@ -1012,7 +1113,7 @@ static void visit_alu(struct lp_build_nir_context *bld_base, const nir_alu_instr
                src_chan[i] = src[i];
             src_chan[i] = cast_type(bld_base, src_chan[i], nir_op_infos[instr->op].input_types[i], src_bit_size[i]);
          }
-         result[c] = do_alu_action(bld_base, instr->op, src_bit_size, src_chan);
+         result[c] = do_alu_action(bld_base, instr, src_bit_size, src_chan);
          result[c] = cast_type(bld_base, result[c], nir_op_infos[instr->op].output_type, nir_dest_bit_size(instr->dest.dest));
       }
    }
@@ -1026,6 +1127,7 @@ static void visit_load_const(struct lp_build_nir_context *bld_base,
    struct lp_build_context *int_bld = get_int_bld(bld_base, true, instr->def.bit_size);
    for (unsigned i = 0; i < instr->def.num_components; i++)
       result[i] = lp_build_const_int_vec(bld_base->base.gallivm, int_bld->type, instr->def.bit_size == 32 ? instr->value[i].u32 : instr->value[i].u64);
+   memset(&result[instr->def.num_components], 0, (NIR_MAX_VEC_COMPONENTS - instr->def.num_components) * sizeof(result[0])); /* memset takes a byte count, not an element count */
    assign_ssa_dest(bld_base, &instr->def, result);
 }
 
@@ -1240,7 +1342,7 @@ static void visit_load_ssbo(struct lp_build_nir_context *bld_base,
                            nir_intrinsic_instr *instr,
                            LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
 {
-   LLVMValueRef idx = get_src(bld_base, instr->src[0]);
+   LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32);
    LLVMValueRef offset = get_src(bld_base, instr->src[1]);
    bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest),
                        idx, offset, result);
@@ -1250,7 +1352,7 @@ static void visit_store_ssbo(struct lp_build_nir_context *bld_base,
                              nir_intrinsic_instr *instr)
 {
    LLVMValueRef val = get_src(bld_base, instr->src[0]);
-   LLVMValueRef idx = get_src(bld_base, instr->src[1]);
+   LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[1]), nir_type_uint, 32);
    LLVMValueRef offset = get_src(bld_base, instr->src[2]);
    int writemask = instr->const_index[0];
    int nc = nir_src_num_components(instr->src[0]);
@@ -1262,7 +1364,7 @@ static void visit_get_ssbo_size(struct lp_build_nir_context *bld_base,
                                 nir_intrinsic_instr *instr,
                                 LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
 {
-   LLVMValueRef idx = get_src(bld_base, instr->src[0]);
+   LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32);
    result[0] = bld_base->get_ssbo_size(bld_base, idx);
 }
 
@@ -1270,7 +1372,7 @@ static void visit_ssbo_atomic(struct lp_build_nir_context *bld_base,
                               nir_intrinsic_instr *instr,
                               LLVMValueRef result[NIR_MAX_VEC_COMPONENTS])
 {
-   LLVMValueRef idx = get_src(bld_base, instr->src[0]);
+   LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32);
    LLVMValueRef offset = get_src(bld_base, instr->src[1]);
    LLVMValueRef val = get_src(bld_base, instr->src[2]);
    LLVMValueRef val2 = NULL;
@@ -1662,13 +1764,14 @@ static void visit_intrinsic(struct lp_build_nir_context *bld_base,
    case nir_intrinsic_load_base_instance:
    case nir_intrinsic_load_base_vertex:
    case nir_intrinsic_load_first_vertex:
-   case nir_intrinsic_load_work_group_id:
+   case nir_intrinsic_load_workgroup_id:
    case nir_intrinsic_load_local_invocation_id:
-   case nir_intrinsic_load_num_work_groups:
+   case nir_intrinsic_load_local_invocation_index:
+   case nir_intrinsic_load_num_workgroups:
    case nir_intrinsic_load_invocation_id:
    case nir_intrinsic_load_front_face:
    case nir_intrinsic_load_draw_id:
-   case nir_intrinsic_load_local_group_size:
+   case nir_intrinsic_load_workgroup_size:
    case nir_intrinsic_load_work_dim:
    case nir_intrinsic_load_tess_coord:
    case nir_intrinsic_load_tess_level_outer:
@@ -1888,7 +1991,7 @@ static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *inst
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef coords[5];
    LLVMValueRef offsets[3] = { NULL };
-   LLVMValueRef explicit_lod = NULL, projector = NULL, ms_index = NULL;
+   LLVMValueRef explicit_lod = NULL, ms_index = NULL;
    struct lp_sampler_params params;
    struct lp_derivatives derivs;
    unsigned sample_key = 0;
@@ -1935,9 +2038,6 @@ static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *inst
       case nir_tex_src_sampler_deref:
          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
          break;
-      case nir_tex_src_projector:
-         projector = lp_build_rcp(&bld_base->base, cast_type(bld_base, get_src(bld_base, instr->src[i].src), nir_type_float, 32));
-         break;
       case nir_tex_src_comparator:
          sample_key |= LP_SAMPLER_SHADOW;
          coords[4] = get_src(bld_base, instr->src[i].src);
@@ -2038,13 +2138,6 @@ static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *inst
       coords[1] = coord_undef;
    }
 
-   if (projector) {
-      for (unsigned chan = 0; chan < instr->coord_components; ++chan)
-         coords[chan] = lp_build_mul(&bld_base->base, coords[chan], projector);
-      if (sample_key & LP_SAMPLER_SHADOW)
-         coords[4] = lp_build_mul(&bld_base->base, coords[4], projector);
-   }
-
    uint32_t samp_base_index = 0, tex_base_index = 0;
    if (!sampler_deref_instr) {
       int samp_src_index = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle);
@@ -2081,8 +2174,38 @@ static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *inst
    params.texel = texel;
    params.lod = explicit_lod;
    params.ms_index = ms_index;
+   params.aniso_filter_table = bld_base->aniso_filter_table;
    bld_base->tex(bld_base, &params);
+
+   if (nir_dest_bit_size(instr->dest) != 32) {
+      assert(nir_dest_bit_size(instr->dest) == 16);
+      LLVMTypeRef vec_type = NULL;
+      bool is_float = false;
+      switch (nir_alu_type_get_base_type(instr->dest_type)) {
+      case nir_type_float:
+         is_float = true;
+	 break;
+      case nir_type_int:
+         vec_type = bld_base->int16_bld.vec_type;
+         break;
+      case nir_type_uint:
+         vec_type = bld_base->uint16_bld.vec_type;
+         break;
+      default:
+         unreachable("unexpected alu type");
+      }
+      for (int i = 0; i < nir_dest_num_components(instr->dest); ++i) {
+         if (is_float) {
+            texel[i] = lp_build_float_to_half(gallivm, texel[i]);
+         } else {
+            texel[i] = LLVMBuildBitCast(builder, texel[i], bld_base->int_bld.vec_type, "");
+            texel[i] = LLVMBuildTrunc(builder, texel[i], vec_type, "");
+         }
+      }
+   }
+
    assign_dest(bld_base, &instr->dest, texel);
+
 }
 
 static void visit_ssa_undef(struct lp_build_nir_context *bld_base,
@@ -2093,6 +2216,7 @@ static void visit_ssa_undef(struct lp_build_nir_context *bld_base,
    struct lp_build_context *undef_bld = get_int_bld(bld_base, true, instr->def.bit_size);
    for (unsigned i = 0; i < num_components; i++)
       undef[i] = LLVMGetUndef(undef_bld->vec_type);
+   memset(&undef[num_components], 0, (NIR_MAX_VEC_COMPONENTS - num_components) * sizeof(undef[0])); /* memset takes a byte count: clear every unused slot */
    assign_ssa_dest(bld_base, &instr->def, undef);
 }
 
@@ -2279,6 +2403,7 @@ bool lp_build_nir_llvm(
                                             _mesa_key_pointer_equal);
    bld_base->vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                             _mesa_key_pointer_equal);
+   bld_base->range_ht = _mesa_pointer_hash_table_create(NULL);
 
    func = (struct nir_function *)exec_list_get_head(&nir->functions);
 
@@ -2295,6 +2420,7 @@ bool lp_build_nir_llvm(
    free(bld_base->ssa_defs);
    ralloc_free(bld_base->vars);
    ralloc_free(bld_base->regs);
+   ralloc_free(bld_base->range_ht);
    return true;
 }
 
@@ -2305,6 +2431,7 @@ void lp_build_opt_nir(struct nir_shader *nir)
 
    static const struct nir_lower_tex_options lower_tex_options = {
       .lower_tg4_offsets = true,
+      .lower_txp = ~0u,
    };
    NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
    NIR_PASS_V(nir, nir_lower_frexp);
@@ -2313,21 +2440,36 @@ void lp_build_opt_nir(struct nir_shader *nir)
    NIR_PASS_V(nir, nir_lower_fp16_casts);
    do {
       progress = false;
-      NIR_PASS_V(nir, nir_opt_constant_folding);
-      NIR_PASS_V(nir, nir_opt_algebraic);
-      NIR_PASS_V(nir, nir_lower_pack);
+      NIR_PASS(progress, nir, nir_opt_constant_folding);
+      NIR_PASS(progress, nir, nir_opt_algebraic);
+      NIR_PASS(progress, nir, nir_lower_pack);
 
-      nir_lower_tex_options options = { .lower_tex_without_implicit_lod = true };
+      nir_lower_tex_options options = { 0, };
       NIR_PASS_V(nir, nir_lower_tex, &options);
 
       const nir_lower_subgroups_options subgroups_options = {
 	.subgroup_size = lp_native_vector_width / 32,
 	.ballot_bit_size = 32,
+        .ballot_components = 1,
 	.lower_to_scalar = true,
 	.lower_subgroup_masks = true,
       };
       NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);
 
    } while (progress);
-   nir_lower_bool_to_int32(nir);
+
+   do {
+      progress = false;
+      NIR_PASS(progress, nir, nir_opt_algebraic_late);
+      if (progress) {
+         NIR_PASS_V(nir, nir_copy_prop);
+         NIR_PASS_V(nir, nir_opt_dce);
+         NIR_PASS_V(nir, nir_opt_cse);
+      }
+   } while (progress);
+
+   if (nir_lower_bool_to_int32(nir)) {
+      NIR_PASS_V(nir, nir_copy_prop);
+      NIR_PASS_V(nir, nir_opt_dce);
+   }
 }
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.h
index 1a92bbc03..874a5d55e 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir.h
@@ -49,6 +49,7 @@ struct lp_build_nir_context
    struct lp_build_context int8_bld;
    struct lp_build_context uint16_bld;
    struct lp_build_context int16_bld;
+   struct lp_build_context half_bld;
    struct lp_build_context dbl_bld;
    struct lp_build_context uint64_bld;
    struct lp_build_context int64_bld;
@@ -57,6 +58,11 @@ struct lp_build_nir_context
    struct hash_table *regs;
    struct hash_table *vars;
 
+   /** Value range analysis hash table used in code generation. */
+   struct hash_table *range_ht;
+
+   LLVMValueRef aniso_filter_table;
+
    nir_shader *shader;
 
    void (*load_ubo)(struct lp_build_nir_context *bld_base,
@@ -284,6 +290,8 @@ static inline struct lp_build_context *get_flt_bld(struct lp_build_nir_context *
    switch (op_bit_size) {
    case 64:
       return &bld_base->dbl_bld;
+   case 16:
+      return &bld_base->half_bld;
    default:
    case 32:
       return &bld_base->base;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
index 05e52083b..b771b7cc7 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@@ -300,7 +300,8 @@ emit_mask_scatter(struct lp_build_nir_soa_context *bld,
       if (scalar_pred) {
          LLVMValueRef real_val, dst_val;
          dst_val = LLVMBuildLoad(builder, scalar_ptr, "");
-         real_val = lp_build_select(&bld->uint_elem_bld, scalar_pred, val, dst_val);
+         scalar_pred = LLVMBuildTrunc(builder, scalar_pred, LLVMInt1TypeInContext(gallivm->context), "");
+         real_val = LLVMBuildSelect(builder, scalar_pred, val, dst_val, "");
          LLVMBuildStore(builder, real_val, scalar_ptr);
       }
       else {
@@ -472,7 +473,7 @@ static void emit_load_var(struct lp_build_nir_context *bld_base,
       break;
    case nir_var_shader_out:
       if (bld->fs_iface && bld->fs_iface->fb_fetch) {
-         bld->fs_iface->fb_fetch(bld->fs_iface, &bld_base->base, var->data.driver_location, result);
+         bld->fs_iface->fb_fetch(bld->fs_iface, &bld_base->base, var->data.location, result);
          return;
       }
       for (unsigned i = 0; i < num_components; i++) {
@@ -1038,7 +1039,6 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   LLVMValueRef ssbo_ptr = NULL;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    LLVMValueRef ssbo_limit = NULL;
    struct lp_build_context *load_bld;
@@ -1046,51 +1046,61 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base,
 
    load_bld = get_int_bld(bld_base, true, bit_size);
 
+   offset = LLVMBuildAShr(gallivm->builder, offset, lp_build_const_int_vec(gallivm, uint_bld->type, shift_val), "");
+
+   /* Although the index is dynamically uniform, that doesn't count if the exec mask isn't set, so read them one-by-one. */
+
+   LLVMValueRef result[NIR_MAX_VEC_COMPONENTS];
+   for (unsigned c = 0; c < nc; c++)
+      result[c] = lp_build_alloca(gallivm, load_bld->vec_type, "");
+
+   LLVMValueRef exec_mask = mask_vec(bld_base);
+   LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+   struct lp_build_loop_state loop_state;
+   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+   LLVMValueRef loop_cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+   LLVMValueRef loop_offset = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
+
+   struct lp_build_if_state exec_ifthen;
+   lp_build_if(&exec_ifthen, gallivm, loop_cond);
+
+   LLVMValueRef mem_ptr;
+
    if (index) {
-      LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), ""));
+      LLVMValueRef ssbo_idx = LLVMBuildExtractElement(gallivm->builder, index, loop_state.counter, "");
+      LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, ssbo_idx);
+      LLVMValueRef ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, ssbo_idx);
       ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), "");
-      ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
-
-      ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), ""));
+      mem_ptr = ssbo_ptr;
    } else
-      ssbo_ptr = bld->shared_ptr;
+      mem_ptr = bld->shared_ptr;
 
-   offset = LLVMBuildAShr(gallivm->builder, offset, lp_build_const_int_vec(gallivm, uint_bld->type, shift_val), "");
    for (unsigned c = 0; c < nc; c++) {
-      LLVMValueRef loop_index = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c));
-      LLVMValueRef exec_mask = mask_vec(bld_base);
-
+      LLVMValueRef loop_index = LLVMBuildAdd(builder, loop_offset, lp_build_const_int32(gallivm, c), "");
+      LLVMValueRef do_fetch = lp_build_const_int32(gallivm, -1);
       if (ssbo_limit) {
-         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
-         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         LLVMValueRef ssbo_oob_cmp = lp_build_compare(gallivm, lp_elem_type(uint_bld->type), PIPE_FUNC_LESS, loop_index, ssbo_limit);
+         do_fetch = LLVMBuildAnd(builder, do_fetch, ssbo_oob_cmp, "");
       }
 
-      LLVMValueRef result = lp_build_alloca(gallivm, load_bld->vec_type, "");
-      struct lp_build_loop_state loop_state;
-      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
-
       struct lp_build_if_state ifthen;
-      LLVMValueRef cond, temp_res;
+      LLVMValueRef fetch_cond, temp_res;
 
-      loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index,
-                                           loop_state.counter, "");
-
-      cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
-      cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+      fetch_cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, do_fetch, lp_build_const_int32(gallivm, 0), "");
 
-      lp_build_if(&ifthen, gallivm, cond);
+      lp_build_if(&ifthen, gallivm, fetch_cond);
       LLVMValueRef scalar;
       if (bit_size != 32) {
-         LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(load_bld->elem_type, 0), "");
-         scalar = lp_build_pointer_get(builder, ssbo_ptr2, loop_index);
+         LLVMValueRef mem_ptr2 = LLVMBuildBitCast(builder, mem_ptr, LLVMPointerType(load_bld->elem_type, 0), "");
+         scalar = lp_build_pointer_get(builder, mem_ptr2, loop_index);
       } else
-         scalar = lp_build_pointer_get(builder, ssbo_ptr, loop_index);
+         scalar = lp_build_pointer_get(builder, mem_ptr, loop_index);
 
-      temp_res = LLVMBuildLoad(builder, result, "");
+      temp_res = LLVMBuildLoad(builder, result[c], "");
       temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, "");
-      LLVMBuildStore(builder, temp_res, result);
+      LLVMBuildStore(builder, temp_res, result[c]);
       lp_build_else(&ifthen);
-      temp_res = LLVMBuildLoad(builder, result, "");
+      temp_res = LLVMBuildLoad(builder, result[c], "");
       LLVMValueRef zero;
       if (bit_size == 64)
          zero = LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 0, 0);
@@ -1101,12 +1111,16 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base,
       else
          zero = lp_build_const_int32(gallivm, 0);
       temp_res = LLVMBuildInsertElement(builder, temp_res, zero, loop_state.counter, "");
-      LLVMBuildStore(builder, temp_res, result);
+      LLVMBuildStore(builder, temp_res, result[c]);
       lp_build_endif(&ifthen);
-      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
-                                NULL, LLVMIntUGE);
-      outval[c] = LLVMBuildLoad(gallivm->builder, result, "");
    }
+
+   lp_build_endif(&exec_ifthen);
+   lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                          NULL, LLVMIntUGE);
+   for (unsigned c = 0; c < nc; c++)
+      outval[c] = LLVMBuildLoad(gallivm->builder, result[c], "");
+
 }
 
 static void emit_store_mem(struct lp_build_nir_context *bld_base,
@@ -1120,56 +1134,66 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   LLVMValueRef ssbo_ptr;
+   LLVMValueRef mem_ptr;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    LLVMValueRef ssbo_limit = NULL;
    struct lp_build_context *store_bld;
    uint32_t shift_val = bit_size_to_shift_size(bit_size);
    store_bld = get_int_bld(bld_base, true, bit_size);
 
+   offset = lp_build_shr_imm(uint_bld, offset, shift_val);
+
+   LLVMValueRef exec_mask = mask_vec(bld_base);
+   LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
+   struct lp_build_loop_state loop_state;
+   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+   LLVMValueRef loop_cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+   LLVMValueRef loop_offset = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
+
+   struct lp_build_if_state exec_ifthen;
+   lp_build_if(&exec_ifthen, gallivm, loop_cond);
+
    if (index) {
-      LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), ""));
+      LLVMValueRef ssbo_idx = LLVMBuildExtractElement(gallivm->builder, index, loop_state.counter, "");
+      LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, ssbo_idx);
+      LLVMValueRef ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, ssbo_idx);
       ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), "");
-      ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
-      ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), ""));
+      mem_ptr = ssbo_ptr;
    } else
-      ssbo_ptr = bld->shared_ptr;
+      mem_ptr = bld->shared_ptr;
 
-   offset = lp_build_shr_imm(uint_bld, offset, shift_val);
    for (unsigned c = 0; c < nc; c++) {
       if (!(writemask & (1u << c)))
          continue;
-      LLVMValueRef loop_index = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c));
+      LLVMValueRef loop_index = LLVMBuildAdd(builder, loop_offset, lp_build_const_int32(gallivm, c), "");
       LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, "");
+      LLVMValueRef do_store = lp_build_const_int32(gallivm, -1);
 
-      LLVMValueRef exec_mask = mask_vec(bld_base);
       if (ssbo_limit) {
-         LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit);
-         exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
+         LLVMValueRef ssbo_oob_cmp = lp_build_compare(gallivm, lp_elem_type(uint_bld->type), PIPE_FUNC_LESS, loop_index, ssbo_limit);
+         do_store = LLVMBuildAnd(builder, do_store, ssbo_oob_cmp, "");
       }
 
-      struct lp_build_loop_state loop_state;
-      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
       LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val,
                                                        loop_state.counter, "");
       value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, store_bld->elem_type, "");
       struct lp_build_if_state ifthen;
-      LLVMValueRef cond;
+      LLVMValueRef store_cond;
 
-      loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index,
-                                           loop_state.counter, "");
-      cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
-      cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
-      lp_build_if(&ifthen, gallivm, cond);
+      store_cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, do_store, lp_build_const_int32(gallivm, 0), "");
+      lp_build_if(&ifthen, gallivm, store_cond);
       if (bit_size != 32) {
-         LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(store_bld->elem_type, 0), "");
-         lp_build_pointer_set(builder, ssbo_ptr2, loop_index, value_ptr);
+         LLVMValueRef mem_ptr2 = LLVMBuildBitCast(builder, mem_ptr, LLVMPointerType(store_bld->elem_type, 0), "");
+         lp_build_pointer_set(builder, mem_ptr2, loop_index, value_ptr);
       } else
-         lp_build_pointer_set(builder, ssbo_ptr, loop_index, value_ptr);
+         lp_build_pointer_set(builder, mem_ptr, loop_index, value_ptr);
       lp_build_endif(&ifthen);
-      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
-                             NULL, LLVMIntUGE);
    }
+
+   lp_build_endif(&exec_ifthen);
+   lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
+                             NULL, LLVMIntUGE);
+
 }
 
 static void emit_atomic_mem(struct lp_build_nir_context *bld_base,
@@ -1182,52 +1206,58 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base,
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
    LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
-   LLVMValueRef ssbo_ptr;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
    LLVMValueRef ssbo_limit = NULL;
    uint32_t shift_val = bit_size_to_shift_size(bit_size);
    struct lp_build_context *atomic_bld = get_int_bld(bld_base, true, bit_size);
-   if (index) {
-      LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), ""));
-      ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, 2), "");
-      ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit);
-      ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), ""));
-   } else
-      ssbo_ptr = bld->shared_ptr;
 
    offset = lp_build_shr_imm(uint_bld, offset, shift_val);
    LLVMValueRef atom_res = lp_build_alloca(gallivm,
                                            atomic_bld->vec_type, "");
 
    LLVMValueRef exec_mask = mask_vec(bld_base);
-   if (ssbo_limit) {
-      LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, offset, ssbo_limit);
-      exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, "");
-   }
-
+   LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
    struct lp_build_loop_state loop_state;
    lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+   LLVMValueRef loop_cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
+   LLVMValueRef loop_offset = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
+
+   struct lp_build_if_state exec_ifthen;
+   lp_build_if(&exec_ifthen, gallivm, loop_cond);
+
+   LLVMValueRef mem_ptr;
+   if (index) {
+      LLVMValueRef ssbo_idx = LLVMBuildExtractElement(gallivm->builder, index, loop_state.counter, "");
+      LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, ssbo_idx);
+      LLVMValueRef ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, ssbo_idx);
+      ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), "");
+      mem_ptr = ssbo_ptr;
+   } else
+      mem_ptr = bld->shared_ptr;
+
+   LLVMValueRef do_fetch = lp_build_const_int32(gallivm, -1);
+   if (ssbo_limit) {
+      LLVMValueRef ssbo_oob_cmp = lp_build_compare(gallivm, lp_elem_type(uint_bld->type), PIPE_FUNC_LESS, loop_offset, ssbo_limit);
+      do_fetch = LLVMBuildAnd(builder, do_fetch, ssbo_oob_cmp, "");
+   }
 
    LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val,
                                                     loop_state.counter, "");
    value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, atomic_bld->elem_type, "");
 
-   offset = LLVMBuildExtractElement(gallivm->builder, offset,
-                                   loop_state.counter, "");
-
    LLVMValueRef scalar_ptr;
    if (bit_size != 32) {
-      LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(atomic_bld->elem_type, 0), "");
-      scalar_ptr = LLVMBuildGEP(builder, ssbo_ptr2, &offset, 1, "");
+      LLVMValueRef mem_ptr2 = LLVMBuildBitCast(builder, mem_ptr, LLVMPointerType(atomic_bld->elem_type, 0), "");
+      scalar_ptr = LLVMBuildGEP(builder, mem_ptr2, &loop_offset, 1, "");
    } else
-      scalar_ptr = LLVMBuildGEP(builder, ssbo_ptr, &offset, 1, "");
+      scalar_ptr = LLVMBuildGEP(builder, mem_ptr, &loop_offset, 1, "");
 
    struct lp_build_if_state ifthen;
-   LLVMValueRef cond, temp_res;
+   LLVMValueRef inner_cond, temp_res;
    LLVMValueRef scalar;
-   cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
-   cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
-   lp_build_if(&ifthen, gallivm, cond);
+
+   inner_cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, do_fetch, lp_build_const_int32(gallivm, 0), "");
+   lp_build_if(&ifthen, gallivm, inner_cond);
 
    if (nir_op == nir_intrinsic_ssbo_atomic_comp_swap || nir_op == nir_intrinsic_shared_atomic_comp_swap) {
       LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, val2,
@@ -1297,6 +1327,7 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base,
    LLVMBuildStore(builder, temp_res, atom_res);
    lp_build_endif(&ifthen);
 
+   lp_build_endif(&exec_ifthen);
    lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length),
                           NULL, LLVMIntUGE);
    *result = LLVMBuildLoad(builder, atom_res, "");
@@ -1514,7 +1545,7 @@ static void emit_sysval_intrin(struct lp_build_nir_context *bld_base,
    case nir_intrinsic_load_primitive_id:
       result[0] = bld->system_values.prim_id;
       break;
-   case nir_intrinsic_load_work_group_id: {
+   case nir_intrinsic_load_workgroup_id: {
       LLVMValueRef tmp[3];
       for (unsigned i = 0; i < 3; i++) {
          tmp[i] = LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_id, lp_build_const_int32(gallivm, i), "");
@@ -1528,7 +1559,21 @@ static void emit_sysval_intrin(struct lp_build_nir_context *bld_base,
       for (unsigned i = 0; i < 3; i++)
          result[i] = LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, i, "");
       break;
-   case nir_intrinsic_load_num_work_groups: {
+   case nir_intrinsic_load_local_invocation_index: {
+      LLVMValueRef tmp, tmp2;
+      tmp = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_size, lp_build_const_int32(gallivm, 1), ""));
+      tmp2 = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_size, lp_build_const_int32(gallivm, 0), ""));
+      tmp = lp_build_mul(&bld_base->uint_bld, tmp, tmp2);
+      tmp = lp_build_mul(&bld_base->uint_bld, tmp, LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, 2, ""));
+
+      tmp2 = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_size, lp_build_const_int32(gallivm, 0), ""));
+      tmp2 = lp_build_mul(&bld_base->uint_bld, tmp2, LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, 1, ""));
+      tmp = lp_build_add(&bld_base->uint_bld, tmp, tmp2);
+      tmp = lp_build_add(&bld_base->uint_bld, tmp, LLVMBuildExtractValue(gallivm->builder, bld->system_values.thread_id, 0, ""));
+      result[0] = tmp;
+      break;
+   }
+   case nir_intrinsic_load_num_workgroups: {
       LLVMValueRef tmp[3];
       for (unsigned i = 0; i < 3; i++) {
          tmp[i] = LLVMBuildExtractElement(gallivm->builder, bld->system_values.grid_size, lp_build_const_int32(gallivm, i), "");
@@ -1552,7 +1597,7 @@ static void emit_sysval_intrin(struct lp_build_nir_context *bld_base,
       break;
    default:
       break;
-   case nir_intrinsic_load_local_group_size:
+   case nir_intrinsic_load_workgroup_size:
      for (unsigned i = 0; i < 3; i++)
        result[i] = lp_build_broadcast_scalar(&bld_base->uint_bld, LLVMBuildExtractElement(gallivm->builder, bld->system_values.block_size, lp_build_const_int32(gallivm, i), ""));
      break;
@@ -1985,36 +2030,106 @@ static void emit_reduce(struct lp_build_nir_context *bld_base, LLVMValueRef src,
    switch (reduction_op) {
    case nir_op_fmin: {
       LLVMValueRef flt_max = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), INFINITY) :
-         lp_build_const_float(gallivm, INFINITY);
+         (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), INFINITY) : lp_build_const_float(gallivm, INFINITY));
       store_val = LLVMBuildBitCast(builder, flt_max, int_bld->elem_type, "");
       break;
    }
    case nir_op_fmax: {
       LLVMValueRef flt_min = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), -INFINITY) :
-         lp_build_const_float(gallivm, -INFINITY);
+         (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), -INFINITY) : lp_build_const_float(gallivm, -INFINITY));
       store_val = LLVMBuildBitCast(builder, flt_min, int_bld->elem_type, "");
       break;
    }
    case nir_op_fmul: {
       LLVMValueRef flt_one = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), 1.0) :
-         lp_build_const_float(gallivm, 1.0);
+         (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), 1.0) : lp_build_const_float(gallivm, 1.0));
       store_val = LLVMBuildBitCast(builder, flt_one, int_bld->elem_type, "");
       break;
    }
    case nir_op_umin:
-      store_val = lp_build_const_int32(gallivm, UINT_MAX);
+      switch (bit_size) {
+      case 8:
+         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), UINT8_MAX, 0);
+         break;
+      case 16:
+         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), UINT16_MAX, 0);
+         break;
+      case 32:
+      default:
+         store_val  = lp_build_const_int32(gallivm, UINT_MAX);
+         break;
+      case 64:
+         store_val  = lp_build_const_int64(gallivm, UINT64_MAX);
+         break;
+      }
       break;
    case nir_op_imin:
-      store_val = lp_build_const_int32(gallivm, INT_MAX);
+      switch (bit_size) {
+      case 8:
+         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), INT8_MAX, 0);
+         break;
+      case 16:
+         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), INT16_MAX, 0);
+         break;
+      case 32:
+      default:
+         store_val  = lp_build_const_int32(gallivm, INT_MAX);
+         break;
+      case 64:
+         store_val  = lp_build_const_int64(gallivm, INT64_MAX);
+         break;
+      }
       break;
    case nir_op_imax:
-      store_val = lp_build_const_int32(gallivm, INT_MIN);
+      switch (bit_size) {
+      case 8:
+         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), INT8_MIN, 0);
+         break;
+      case 16:
+         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), INT16_MIN, 0);
+         break;
+      case 32:
+      default:
+         store_val  = lp_build_const_int32(gallivm, INT_MIN);
+         break;
+      case 64:
+         store_val  = lp_build_const_int64(gallivm, INT64_MIN);
+         break;
+      }
       break;
    case nir_op_imul:
-      store_val = lp_build_const_int32(gallivm, 1);
+      switch (bit_size) {
+      case 8:
+         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 1, 0);
+         break;
+      case 16:
+         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 1, 0);
+         break;
+      case 32:
+      default:
+         store_val  = lp_build_const_int32(gallivm, 1);
+         break;
+      case 64:
+         store_val  = lp_build_const_int64(gallivm, 1);
+         break;
+      }
       break;
    case nir_op_iand:
-      store_val = lp_build_const_int32(gallivm, 0xffffffff);
+      switch (bit_size) {
+      case 8:
+         store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 0xff, 0);
+         break;
+      case 16:
+         store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 0xffff, 0);
+         break;
+      case 32:
+      default:
+         store_val  = lp_build_const_int32(gallivm, 0xffffffff);
+         break;
+      case 64:
+         store_val  = lp_build_const_int64(gallivm, 0xffffffffffffffffLL);
+         break;
+      }
       break;
    default:
       break;
@@ -2105,28 +2220,27 @@ static void emit_read_invocation(struct lp_build_nir_context *bld_base,
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef idx;
    struct lp_build_context *uint_bld = get_int_bld(bld_base, true, bit_size);
-   if (invoc) {
-      idx = invoc;
-      idx = LLVMBuildExtractElement(gallivm->builder, idx, lp_build_const_int32(gallivm, 0), "");
-   } else {
-      /* have to find the first active invocation */
-      LLVMValueRef exec_mask = mask_vec(bld_base);
-      struct lp_build_loop_state loop_state;
-      LLVMValueRef res_store = lp_build_alloca(gallivm, bld_base->int_bld.elem_type, "");
-      LLVMValueRef outer_cond = LLVMBuildICmp(builder, LLVMIntNE, exec_mask, bld_base->uint_bld.zero, "");
-      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, bld_base->uint_bld.type.length));
 
-      LLVMValueRef if_cond = LLVMBuildExtractElement(gallivm->builder, outer_cond, loop_state.counter, "");
-      struct lp_build_if_state ifthen;
+   /* have to find the first active invocation */
+   LLVMValueRef exec_mask = mask_vec(bld_base);
+   struct lp_build_loop_state loop_state;
+   LLVMValueRef res_store = lp_build_alloca(gallivm, bld_base->int_bld.elem_type, "");
+   LLVMValueRef outer_cond = LLVMBuildICmp(builder, LLVMIntNE, exec_mask, bld_base->uint_bld.zero, "");
+   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, bld_base->uint_bld.type.length));
 
-      lp_build_if(&ifthen, gallivm, if_cond);
-      LLVMBuildStore(builder, loop_state.counter, res_store);
-      lp_build_endif(&ifthen);
+   LLVMValueRef if_cond = LLVMBuildExtractElement(gallivm->builder, outer_cond, loop_state.counter, "");
+   struct lp_build_if_state ifthen;
 
-      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, -1),
-                             lp_build_const_int32(gallivm, -1), LLVMIntEQ);
-      idx = LLVMBuildLoad(builder, res_store, "");
-   }
+   lp_build_if(&ifthen, gallivm, if_cond);
+   LLVMValueRef store_val = loop_state.counter;
+   if (invoc)
+      store_val = LLVMBuildExtractElement(gallivm->builder, invoc, loop_state.counter, "");
+   LLVMBuildStore(builder, store_val, res_store);
+   lp_build_endif(&ifthen);
+
+   lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, -1),
+                          lp_build_const_int32(gallivm, -1), LLVMIntEQ);
+   idx = LLVMBuildLoad(builder, res_store, "");
 
    LLVMValueRef value = LLVMBuildExtractElement(gallivm->builder,
                                                 src, idx, "");
@@ -2312,6 +2426,12 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
       lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type);
    }
    {
+      struct lp_type half_type;
+      half_type = type;
+      half_type.width /= 2;
+      lp_build_context_init(&bld.bld_base.half_bld, gallivm, half_type);
+   }
+   {
       struct lp_type uint64_type;
       uint64_type = lp_uint_type(type);
       uint64_type.width *= 2;
@@ -2399,6 +2519,7 @@ void lp_build_nir_soa(struct gallivm_state *gallivm,
 
    bld.context_ptr = params->context_ptr;
    bld.thread_data_ptr = params->thread_data_ptr;
+   bld.bld_base.aniso_filter_table = params->aniso_filter_table;
    bld.image = params->image;
    bld.shared_ptr = params->shared_ptr;
    bld.coro = params->coro;
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index c608e42c1..604b3b04a 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -113,6 +113,7 @@ struct lp_sampler_params
    const LLVMValueRef *offsets;
    LLVMValueRef ms_index;
    LLVMValueRef lod;
+   LLVMValueRef aniso_filter_table;
    const struct lp_derivatives *derivs;
    LLVMValueRef *texel;
 };
@@ -201,10 +202,7 @@ struct lp_static_sampler_state
    unsigned apply_min_lod:1;  /**< min_lod > 0 ? */
    unsigned apply_max_lod:1;  /**< max_lod < last_level ? */
    unsigned seamless_cube_map:1;
-
-   /* Hacks */
-   unsigned force_nearest_s:1;
-   unsigned force_nearest_t:1;
+   unsigned aniso:1;
    unsigned reduction_mode:2;
 };
 
@@ -330,6 +328,13 @@ struct lp_sampler_dynamic_state
                    LLVMValueRef context_ptr,
                    unsigned sampler_unit);
 
+   /** Obtain maximum anisotropy */
+   LLVMValueRef
+   (*max_aniso)(const struct lp_sampler_dynamic_state *state,
+                struct gallivm_state *gallivm,
+                LLVMValueRef context_ptr,
+                unsigned sampler_unit);
+
    /** 
     * Obtain texture cache (returns ptr to lp_build_format_cache).
     *
@@ -444,6 +449,8 @@ struct lp_build_sample_context
    LLVMValueRef border_color_clamped;
 
    LLVMValueRef context_ptr;
+
+   LLVMValueRef aniso_filter_table;
 };
 
 /*
@@ -577,6 +584,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
                       unsigned mip_filter,
+                      LLVMValueRef max_aniso,
                       LLVMValueRef *out_lod,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart,
@@ -790,6 +798,8 @@ lp_build_reduce_filter_3d(struct lp_build_context *bld,
                           LLVMValueRef *v110,
                           LLVMValueRef *v111,
                           LLVMValueRef *out);
+
+const float *lp_build_sample_aniso_filter_table(void);
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 875271883..ea2ec780f 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -408,7 +408,9 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
             switch (swizzles[i]) {
             default:
                assert(0);
+#if defined(NDEBUG) || defined(DEBUG)
                FALLTHROUGH;
+#endif
             case PIPE_SWIZZLE_X:
             case PIPE_SWIZZLE_Y:
             case PIPE_SWIZZLE_Z:
diff --git a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.c b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.c
index da139a838..a261ae981 100644
--- a/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/lib/mesa/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -31,7 +31,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_init.h"
-
+#include "lp_bld_limits.h"
 
 LLVMTypeRef
 lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
@@ -39,7 +39,7 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
    if (type.floating) {
       switch(type.width) {
       case 16:
-         return LLVMIntTypeInContext(gallivm->context, 16);
+         return lp_has_fp16() ? LLVMHalfTypeInContext(gallivm->context) : LLVMInt16TypeInContext(gallivm->context);
          break;
       case 32:
          return LLVMFloatTypeInContext(gallivm->context);
@@ -89,7 +89,7 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)
    if (type.floating) {
       switch(type.width) {
       case 16:
-         if(elem_kind != LLVMIntegerTypeKind)
+         if(elem_kind != (lp_has_fp16() ? LLVMHalfTypeKind : LLVMIntegerTypeKind))
             return FALSE;
          break;
       case 32:
@@ -259,6 +259,8 @@ lp_sizeof_llvm_type(LLVMTypeRef t)
       return 8 * sizeof(float);
    case LLVMDoubleTypeKind:
       return 8 * sizeof(double);
+   case LLVMHalfTypeKind:
+      return 8 * sizeof(uint16_t);
    case LLVMVectorTypeKind:
       {
          LLVMTypeRef elem = LLVMGetElementType(t);
@@ -291,6 +293,8 @@ lp_typekind_name(LLVMTypeKind t)
       return "LLVMVoidTypeKind";
    case LLVMFloatTypeKind:
       return "LLVMFloatTypeKind";
+   case LLVMHalfTypeKind:
+      return "LLVMHalfTypeKind";
    case LLVMDoubleTypeKind:
       return "LLVMDoubleTypeKind";
    case LLVMX86_FP80TypeKind:
diff --git a/lib/mesa/src/gallium/auxiliary/hud/font.c b/lib/mesa/src/gallium/auxiliary/hud/font.c
index c7f8aef0d..a372410b1 100644
--- a/lib/mesa/src/gallium/auxiliary/hud/font.c
+++ b/lib/mesa/src/gallium/auxiliary/hud/font.c
@@ -417,8 +417,8 @@ util_font_create_fixed_8x13(struct pipe_context *pipe,
       return FALSE;
    }
 
-   map = pipe_transfer_map(pipe, tex, 0, 0, PIPE_MAP_WRITE, 0, 0,
-                           tex->width0, tex->height0, &transfer);
+   map = pipe_texture_map(pipe, tex, 0, 0, PIPE_MAP_WRITE, 0, 0,
+                          tex->width0, tex->height0, &transfer);
    if (!map) {
       pipe_resource_reference(&tex, NULL);
       return FALSE;
@@ -432,7 +432,7 @@ util_font_create_fixed_8x13(struct pipe_context *pipe,
                                transfer->stride, i);
    }
 
-   pipe_transfer_unmap(pipe, transfer);
+   pipe_texture_unmap(pipe, transfer);
 
    pipe_resource_reference(&out_font->texture, NULL);
    out_font->texture = tex;
diff --git a/lib/mesa/src/gallium/auxiliary/indices/u_indices.h b/lib/mesa/src/gallium/auxiliary/indices/u_indices.h
index f160fcbc6..3c57f9c5d 100644
--- a/lib/mesa/src/gallium/auxiliary/indices/u_indices.h
+++ b/lib/mesa/src/gallium/auxiliary/indices/u_indices.h
@@ -82,6 +82,18 @@ enum indices_mode {
 
 void u_index_init( void );
 
+/* returns the primitive type resulting from index translation */
+enum pipe_prim_type
+u_index_prim_type_convert(unsigned hw_mask, enum pipe_prim_type prim, bool pv_matches);
+
+static inline unsigned
+u_index_size_convert(unsigned index_size)
+{
+   return (index_size == 4) ? 4 : 2; /* 4 is kept; any other size yields 2 (presumably the size after index translation -- confirm) */
+}
+
+unsigned
+u_index_count_converted_indices(unsigned hw_mask, bool pv_matches, enum pipe_prim_type prim, unsigned nr);
 
 /**
  * For indexed drawing, this function determines what kind of primitive
diff --git a/lib/mesa/src/gallium/auxiliary/meson.build b/lib/mesa/src/gallium/auxiliary/meson.build
index a30e44186..7682357ed 100644
--- a/lib/mesa/src/gallium/auxiliary/meson.build
+++ b/lib/mesa/src/gallium/auxiliary/meson.build
@@ -60,7 +60,6 @@ files_libgallium = files(
   'draw/draw_pt_decompose.h',
   'draw/draw_pt_emit.c',
   'draw/draw_pt_fetch.c',
-  'draw/draw_pt_fetch_emit.c',
   'draw/draw_pt_fetch_shade_emit.c',
   'draw/draw_pt_fetch_shade_pipeline.c',
   'draw/draw_pt.h',
@@ -239,14 +238,10 @@ files_libgallium = files(
   'util/u_cache.h',
   'util/u_compute.c',
   'util/u_compute.h',
-  'util/u_debug_describe.c',
-  'util/u_debug_describe.h',
   'util/u_debug_flush.c',
   'util/u_debug_flush.h',
   'util/u_debug_image.c',
   'util/u_debug_image.h',
-  'util/u_debug_refcnt.c',
-  'util/u_debug_refcnt.h',
   'util/u_dirty_flags.h',
   'util/u_dirty_surfaces.h',
   'util/u_dl.c',
@@ -255,19 +250,18 @@ files_libgallium = files(
   'util/u_draw.h',
   'util/u_draw_quad.c',
   'util/u_draw_quad.h',
+  'util/u_driconf.c',
+  'util/u_driconf.h',
   'util/u_dual_blend.h',
   'util/u_dump_defines.c',
   'util/u_dump.h',
   'util/u_dump_state.c',
-  'util/u_fifo.h',
   'util/u_framebuffer.c',
   'util/u_framebuffer.h',
   'util/u_gen_mipmap.c',
   'util/u_gen_mipmap.h',
   'util/u_handle_table.c',
   'util/u_handle_table.h',
-  'util/u_hash_table.c',
-  'util/u_hash_table.h',
   'util/u_helpers.c',
   'util/u_helpers.h',
   'util/u_index_modify.c',
@@ -314,9 +308,6 @@ files_libgallium = files(
   'util/u_texture.h',
   'util/u_tile.c',
   'util/u_tile.h',
-  'util/u_trace.c',
-  'util/u_trace.h',
-  'util/u_trace_priv.h',
   'util/u_transfer.c',
   'util/u_transfer.h',
   'util/u_transfer_helper.c',
@@ -324,10 +315,14 @@ files_libgallium = files(
   'util/u_threaded_context.c',
   'util/u_threaded_context.h',
   'util/u_threaded_context_calls.h',
+  'util/u_trace_gallium.c',
+  'util/u_trace_gallium.h',
   'util/u_upload_mgr.c',
   'util/u_upload_mgr.h',
   'util/u_vbuf.c',
   'util/u_vbuf.h',
+  'util/u_vertex_state_cache.c',
+  'util/u_vertex_state_cache.h',
   'util/u_video.h',
   'util/u_viewport.h',
   'nir/tgsi_to_nir.c',
@@ -483,15 +478,13 @@ if with_dri2 and with_platform_x11
   endif
 endif
 
-u_trace_py = files('util/u_trace.py')
-
 files_libgallium += custom_target(
   'u_tracepoints.c',
   input: 'util/u_tracepoints.py',
   output: 'u_tracepoints.c',
   command: [
     prog_python, '@INPUT@',
-    '-p', join_paths(meson.source_root(), 'src/gallium/auxiliary/util/'),
+    '-p', join_paths(meson.source_root(), 'src/util/perf/'),
     '-C', '@OUTPUT@',
   ],
   depend_files: u_trace_py,
@@ -503,7 +496,7 @@ files_u_tracepoints = custom_target(
   output: 'u_tracepoints.h',
   command: [
     prog_python, '@INPUT@',
-    '-p', join_paths(meson.source_root(), 'src/gallium/auxiliary/util/'),
+    '-p', join_paths(meson.source_root(), 'src/util/perf/'),
     '-H', '@OUTPUT@',
   ],
   depend_files: u_trace_py,
diff --git a/lib/mesa/src/gallium/auxiliary/nir/nir_draw_helpers.c b/lib/mesa/src/gallium/auxiliary/nir/nir_draw_helpers.c
index b5706631f..5122ce401 100644
--- a/lib/mesa/src/gallium/auxiliary/nir/nir_draw_helpers.c
+++ b/lib/mesa/src/gallium/auxiliary/nir/nir_draw_helpers.c
@@ -167,7 +167,7 @@ nir_lower_aaline_block(nir_block *block,
       nir_variable *var = nir_intrinsic_get_var(intrin, 0);
       if (var->data.mode != nir_var_shader_out)
          continue;
-      if (var->data.location != FRAG_RESULT_COLOR)
+      if (var->data.location < FRAG_RESULT_DATA0 && var->data.location != FRAG_RESULT_COLOR)
          continue;
 
       nir_ssa_def *out_input = intrin->src[1].ssa;
@@ -262,7 +262,7 @@ nir_lower_aapoint_block(nir_block *block,
       nir_variable *var = nir_intrinsic_get_var(intrin, 0);
       if (var->data.mode != nir_var_shader_out)
          continue;
-      if (var->data.location != FRAG_RESULT_COLOR)
+      if (var->data.location < FRAG_RESULT_DATA0 && var->data.location != FRAG_RESULT_COLOR)
          continue;
 
       nir_ssa_def *out_input = intrin->src[1].ssa;
diff --git a/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi.c b/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi.c
index 3c73d342c..b016b07bf 100644
--- a/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -31,6 +31,7 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_ureg.h"
 #include "util/debug.h"
+#include "util/u_math.h"
 #include "util/u_memory.h"
 
 struct ntt_compile {
@@ -53,7 +54,7 @@ struct ntt_compile {
 
    /* TGSI temps for our NIR SSA and register values. */
    struct ureg_dst *reg_temp;
-   struct ureg_dst *ssa_temp;
+   struct ureg_src *ssa_temp;
 
    nir_instr_liveness *liveness;
 
@@ -65,11 +66,34 @@ struct ntt_compile {
    struct ureg_src *input_index_map;
    uint64_t centroid_inputs;
 
+   uint32_t first_ubo;
+
    struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
 };
 
 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
 
+/**
+ * Interprets a nir_load_const used as a NIR src as a uint.
+ *
+ * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
+ * instruction (or in a phi-web used by an integer ALU instruction) were
+ * converted to floats and the ALU instruction swapped to the float equivalent.
+ * However, this means that integer load_consts used by intrinsics (which don't
+ * normally get that conversion) may have been reformatted to be floats.  Given
+ * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
+ * we can just look and see if they look like floats and convert them back to
+ * ints.
+ */
+static uint32_t
+ntt_src_as_uint(struct ntt_compile *c, nir_src src)
+{
+   uint32_t val = nir_src_as_uint(src);
+   if (!c->native_integers && val >= fui(1.0)) /* too large for the small ints expected here: looks like a float */
+      val = (uint32_t)uif(val); /* convert the float-formatted constant back to an integer */
+   return val;
+}
+
 static unsigned
 ntt_64bit_write_mask(unsigned write_mask)
 {
@@ -163,7 +187,7 @@ ntt_tgsi_var_usage_mask(const struct nir_variable *var)
 }
 
 static struct ureg_dst
-ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
+ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
 {
    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
    int base = nir_intrinsic_base(instr);
@@ -172,9 +196,6 @@ ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_
 
    struct ureg_dst out;
    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
-      if (semantics.location == FRAG_RESULT_COLOR)
-         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
-
       unsigned semantic_name, semantic_index;
       tgsi_get_gl_frag_result_semantic(semantics.location,
                                        &semantic_name, &semantic_index);
@@ -225,7 +246,11 @@ ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_
                                     invariant);
    }
 
-   unsigned write_mask = nir_intrinsic_write_mask(instr);
+   unsigned write_mask;
+   if (nir_intrinsic_has_write_mask(instr))
+      write_mask = nir_intrinsic_write_mask(instr);
+   else
+      write_mask = ((1 << instr->num_components) - 1) << *frac;
 
    if (is_64) {
       write_mask = ntt_64bit_write_mask(write_mask);
@@ -274,8 +299,8 @@ ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
    }
 
    uint32_t frac;
-   *dst = ntt_store_output_decl(c, intr, &frac);
-   dst->Index += nir_src_as_uint(intr->src[1]);
+   *dst = ntt_output_decl(c, intr, &frac);
+   dst->Index += ntt_src_as_uint(c, intr->src[1]);
 
    return frac == 0;
 }
@@ -338,15 +363,14 @@ ntt_setup_inputs(struct ntt_compile *c)
 
       uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
 
-      decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg,
-                                                    semantic_name,
-                                                    semantic_index,
-                                                    interpolation,
-                                                    0,
-                                                    sample_loc,
-                                                    var->data.driver_location,
-                                                    usage_mask,
-                                                    array_id, array_len);
+      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
+                                                semantic_name,
+                                                semantic_index,
+                                                interpolation,
+                                                sample_loc,
+                                                var->data.driver_location,
+                                                usage_mask,
+                                                array_id, array_len);
 
       if (semantic_name == TGSI_SEMANTIC_FACE) {
          struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
@@ -362,49 +386,173 @@ ntt_setup_inputs(struct ntt_compile *c)
    }
 }
 
+static int
+ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
+{
+   return a->data.location - b->data.location; /* <0, 0 or >0: comparator for ascending data.location */
+}
+
+/**
+ * Workaround for virglrenderer requiring that TGSI FS output color variables
+ * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
+ */
 static void
-ntt_setup_uniforms(struct ntt_compile *c)
+ntt_setup_outputs(struct ntt_compile *c)
 {
-   struct pipe_screen *screen = c->screen;
-   bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);
+   if (c->s->info.stage != MESA_SHADER_FRAGMENT) /* nothing is pre-declared for other stages */
+      return;
 
+   nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out); /* ascending location */
+
+   nir_foreach_shader_out_variable(var, c->s) {
+      if (var->data.location == FRAG_RESULT_COLOR)
+         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
+
+      unsigned semantic_name, semantic_index;
+      tgsi_get_gl_frag_result_semantic(var->data.location,
+                                       &semantic_name, &semantic_index);
+
+      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index); /* result unused: called only to declare, in sorted order */
+   }
+}
+
+static enum tgsi_texture_type
+tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
+{
+   switch (dim) { /* pick the TGSI target for dim; the flags select array/shadow variants */
+   case GLSL_SAMPLER_DIM_1D:
+      if (is_shadow)
+         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
+      else
+         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_EXTERNAL: /* external samplers get the same targets as 2D */
+      if (is_shadow)
+         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
+      else
+         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
+   case GLSL_SAMPLER_DIM_3D:
+      return TGSI_TEXTURE_3D; /* is_array and is_shadow are not consulted */
+   case GLSL_SAMPLER_DIM_CUBE:
+      if (is_shadow)
+         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
+      else
+         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
+   case GLSL_SAMPLER_DIM_RECT: /* is_array is not consulted for RECT */
+      if (is_shadow)
+         return TGSI_TEXTURE_SHADOWRECT;
+      else
+         return TGSI_TEXTURE_RECT;
+   case GLSL_SAMPLER_DIM_MS:
+      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA; /* is_shadow is not consulted */
+   case GLSL_SAMPLER_DIM_BUF:
+      return TGSI_TEXTURE_BUFFER; /* neither flag is consulted */
+   default:
+      unreachable("unknown sampler dim");
+   }
+}
+
+static enum tgsi_return_type
+tgsi_return_type_from_base_type(enum glsl_base_type type)
+{
+   switch (type) { /* maps a GLSL base type to the matching TGSI return type */
+   case GLSL_TYPE_INT:
+      return TGSI_RETURN_TYPE_SINT;
+   case GLSL_TYPE_UINT:
+      return TGSI_RETURN_TYPE_UINT;
+   case GLSL_TYPE_FLOAT:
+     return TGSI_RETURN_TYPE_FLOAT;
+   default: /* only INT, UINT and FLOAT are handled */
+      unreachable("unexpected texture type");
+   }
+}
+
+static void
+ntt_setup_uniforms(struct ntt_compile *c)
+{
    nir_foreach_uniform_variable(var, c->s) {
-      if (glsl_type_is_image(var->type)) {
-         c->images[var->data.binding] = ureg_DECL_image(c->ureg,
-                                                        var->data.binding,
-                                                        TGSI_TEXTURE_2D,
-                                                        var->data.image.format,
-                                                        !var->data.read_only,
-                                                        false);
-      } else {
-         unsigned size;
-         if (packed) {
-            size = DIV_ROUND_UP(glsl_count_dword_slots(var->type,
-                                                       var->data.bindless), 4);
-         } else {
-            size = glsl_count_vec4_slots(var->type, false, var->data.bindless);
-         }
+      int image_count = glsl_type_get_image_count(var->type);
 
-         for (unsigned i = 0; i < size; i++)
-            ureg_DECL_constant(c->ureg, var->data.driver_location + i);
+      if (glsl_type_is_sampler(glsl_without_array(var->type))) {
+         /* Don't use this size for the check for samplers -- arrays of structs
+          * containing samplers should be ignored, and just the separate lowered
+          * sampler uniform decl used.
+          */
+         int size = glsl_type_get_sampler_count(var->type);
+
+         const struct glsl_type *stype = glsl_without_array(var->type);
+         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
+                                                                            glsl_sampler_type_is_array(stype),
+                                                                            glsl_sampler_type_is_shadow(stype));
+         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
+         for (int i = 0; i < size; i++) {
+            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
+               target, ret_type, ret_type, ret_type, ret_type);
+            ureg_DECL_sampler(c->ureg, var->data.binding + i);
+         }
+      } else if (image_count) {
+         const struct glsl_type *itype = glsl_without_array(var->type);
+         enum tgsi_texture_type tex_type =
+             tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
+                                                glsl_sampler_type_is_array(itype), false);
+
+         for (int i = 0; i < image_count; i++) { /* one decl and one c->images slot per array element */
+            c->images[var->data.binding + i] = ureg_DECL_image(c->ureg,
+                                                               var->data.binding + i,
+                                                               tex_type,
+                                                               var->data.image.format,
+                                                               !(var->data.access & ACCESS_NON_WRITEABLE),
+                                                               false);
+         }
+      } else if (glsl_contains_atomic(var->type)) {
+         uint32_t offset = var->data.offset / 4;
+         uint32_t size = glsl_atomic_size(var->type) / 4;
+         ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
       }
+
+      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
+       * size declaration happens with other UBOs below.
+       */
    }
 
+   c->first_ubo = ~0;
+
+   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
    nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
-      ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location);
+      int ubo = var->data.driver_location;
+      if (ubo == -1)
+         continue;
+
+      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
+         c->first_ubo = MIN2(c->first_ubo, ubo);
+
+      unsigned size = glsl_get_explicit_size(var->interface_type, false);
+
+      int array_size = 1;
+      if (glsl_type_is_interface(glsl_without_array(var->type)))
+         array_size = MAX2(1, glsl_array_size(var->type));
+      for (int i = 0; i < array_size; i++) {
+         /* Even if multiple NIR variables are in the same uniform block, their
+          * explicit size is the size of the block.
+          */
+         if (ubo_sizes[ubo + i])
+            assert(ubo_sizes[ubo + i] == size);
+
+         ubo_sizes[ubo + i] = size;
+      }
+   }
+
+   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
+      if (ubo_sizes[i])
+         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
    }
 
-   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ssbo) {
+   for (int i = 0; i < c->s->info.num_ssbos; i++) {
       /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
        * counters
        */
       bool atomic = false;
-      ureg_DECL_buffer(c->ureg, var->data.binding, atomic);
-   }
-
-   for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      if (BITSET_TEST(c->s->info.textures_used, i))
-         ureg_DECL_sampler(c->ureg, i);
+      ureg_DECL_buffer(c->ureg, i, atomic);
    }
 }
 
@@ -438,22 +586,32 @@ ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
 static struct ureg_src
 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
 {
-   uint32_t values[4];
    int num_components = instr->def.num_components;
 
-   if (instr->def.bit_size == 32) {
+   if (!c->native_integers) {
+      float values[4];
+      assert(instr->def.bit_size == 32);
       for (int i = 0; i < num_components; i++)
-         values[i] = instr->value[i].u32;
+         values[i] = uif(instr->value[i].u32);
+
+      return ureg_DECL_immediate(c->ureg, values, num_components);
    } else {
-      assert(num_components <= 2);
-      for (int i = 0; i < num_components; i++) {
-         values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
-         values[i * 2 + 1] = instr->value[i].u64 >> 32;
+      uint32_t values[4];
+
+      if (instr->def.bit_size == 32) {
+         for (int i = 0; i < num_components; i++)
+            values[i] = instr->value[i].u32;
+      } else {
+         assert(num_components <= 2);
+         for (int i = 0; i < num_components; i++) {
+            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
+            values[i * 2 + 1] = instr->value[i].u64 >> 32;
+         }
+         num_components *= 2;
       }
-      num_components *= 2;
-   }
 
-   return ureg_DECL_immediate_uint(c->ureg, values, num_components);
+      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
+   }
 }
 
 static struct ureg_src
@@ -509,7 +667,7 @@ ntt_get_src(struct ntt_compile *c, nir_src src)
       if (src.ssa->parent_instr->type == nir_instr_type_load_const)
          return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
 
-      return ureg_src(c->ssa_temp[src.ssa->index]);
+      return c->ssa_temp[src.ssa->index];
    } else {
       nir_register *reg = src.reg.reg;
       struct ureg_dst reg_temp = c->reg_temp[reg->index];
@@ -575,7 +733,7 @@ ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
                        (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
 }
 
-static struct ureg_dst *
+static struct ureg_dst
 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
 {
    uint32_t writemask = BITSET_MASK(ssa->num_components);
@@ -586,24 +744,24 @@ ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
    if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
       dst = ureg_DECL_temporary(c->ureg);
 
-   c->ssa_temp[ssa->index] = ureg_writemask(dst, writemask);
+   c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
 
-   return &c->ssa_temp[ssa->index];
+   return ureg_writemask(dst, writemask);
 }
 
-static struct ureg_dst *
+static struct ureg_dst
 ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
 {
    if (dest->is_ssa)
       return ntt_get_ssa_def_decl(c, &dest->ssa);
    else
-      return &c->reg_temp[dest->reg.reg->index];
+      return c->reg_temp[dest->reg.reg->index];
 }
 
 static struct ureg_dst
 ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
 {
-   struct ureg_dst dst = *ntt_get_dest_decl(c, dest);
+   struct ureg_dst dst = ntt_get_dest_decl(c, dest);
 
    if (!dest->is_ssa) {
       dst.Index += dest->reg.base_offset;
@@ -623,22 +781,18 @@ ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
 static void
 ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
 {
-   if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect &&
-       src.SwizzleX == TGSI_SWIZZLE_X &&
-       (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) &&
-       (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) &&
-       (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) {
+   if (!src.Indirect && !src.DimIndirect) {
       switch (src.File) {
       case TGSI_FILE_IMMEDIATE:
       case TGSI_FILE_INPUT:
       case TGSI_FILE_CONSTANT:
       case TGSI_FILE_SYSTEM_VALUE:
-         c->ssa_temp[def->index] = ureg_dst(src);
+         c->ssa_temp[def->index] = src;
          return;
       }
    }
 
-   ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src);
+   ureg_MOV(c->ureg, ntt_get_ssa_def_decl(c, def), src);
 }
 
 static void
@@ -1012,10 +1166,12 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
          /* NIR is src0 != 0 ? src1 : src2.
           * TGSI is src0 < 0 ? src1 : src2.
           *
-          * However, fcsel so far as I can find only appears on
-          * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op.
+          * However, fcsel so far as I can find only appears on bools-as-floats
+          * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
+          * important to not have an abs here, as i915g has to make extra
+          * instructions to do the abs.
           */
-         ureg_CMP(c->ureg, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]);
+         ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[1], src[2]);
          break;
 
          /* It would be nice if we could get this left as scalar in NIR, since
@@ -1097,7 +1253,7 @@ ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                       nir_src src)
 {
    if (nir_src_is_const(src)) {
-      usrc.Index += nir_src_as_uint(src);
+      usrc.Index += ntt_src_as_uint(c, src);
       return usrc;
    } else {
       return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
@@ -1109,7 +1265,7 @@ ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                       nir_src src)
 {
    if (nir_src_is_const(src)) {
-      dst.Index += nir_src_as_uint(src);
+      dst.Index += ntt_src_as_uint(c, src);
       return dst;
    } else {
       return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
@@ -1121,7 +1277,7 @@ ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                          nir_src src)
 {
    if (nir_src_is_const(src)) {
-      return ureg_src_dimension(usrc, nir_src_as_uint(src));
+      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
    }
    else
    {
@@ -1136,7 +1292,7 @@ ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                 nir_src src)
 {
    if (nir_src_is_const(src)) {
-      return ureg_dst_dimension(udst, nir_src_as_uint(src));
+      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
    } else {
       return ureg_dst_dimension_indirect(udst,
                                          ntt_reladdr(c, ntt_get_src(c, src)),
@@ -1165,7 +1321,25 @@ ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
 
    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
 
-   src = ntt_ureg_src_dimension_indirect(c, src, instr->src[0]);
+   struct ureg_dst addr_temp = ureg_dst_undef();
+
+   if (nir_src_is_const(instr->src[0])) {
+      src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
+   } else {
+      /* virglrenderer requires that indirect UBO references have the UBO
+       * array's base index in the Index field, not added to the indirect
+       * address.
+       *
+       * Many nir intrinsics have a base address const value for the start of
+       * their array indirection, but load_ubo doesn't.  We fake it by
+       * subtracting it off here.
+       */
+      addr_temp = ureg_DECL_temporary(c->ureg);
+      ureg_UADD(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
+      src = ureg_src_dimension_indirect(src,
+                                         ntt_reladdr(c, ureg_src(addr_temp)),
+                                         c->first_ubo);
+   }
 
    if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
       /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
@@ -1173,7 +1347,7 @@ ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
        */
 
       if (nir_src_is_const(instr->src[1])) {
-         src.Index += nir_src_as_uint(instr->src[1]);
+         src.Index += ntt_src_as_uint(c, instr->src[1]);
       } else {
          src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
       }
@@ -1203,6 +1377,8 @@ ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
                        0 /* format: unused */
       );
    }
+
+   ureg_release_temporary(c->ureg, addr_temp);
 }
 
 static unsigned
@@ -1227,12 +1403,14 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
 {
    bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                     instr->intrinsic == nir_intrinsic_store_shared);
-   bool is_load = (instr->intrinsic == nir_intrinsic_load_ssbo ||
+   bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
+                    instr->intrinsic == nir_intrinsic_load_ssbo ||
                     instr->intrinsic == nir_intrinsic_load_shared);
    unsigned opcode;
    struct ureg_src src[4];
    int num_src = 0;
    int nir_src;
+   struct ureg_dst addr_temp = ureg_dst_undef();
 
    struct ureg_src memory;
    switch (mode) {
@@ -1245,6 +1423,21 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
       memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
       nir_src = 0;
       break;
+   case nir_var_uniform: { /* HW atomic buffers */
+      memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, 0);
+      /* ntt_ureg_src_indirect, except dividing by 4 */
+      if (nir_src_is_const(instr->src[0])) {
+         memory.Index += nir_src_as_uint(instr->src[0]) / 4;
+      } else {
+         addr_temp = ureg_DECL_temporary(c->ureg);
+         ureg_USHR(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, 2));
+         memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp)));
+      }
+      memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
+      nir_src = 0;
+      break;
+   }
+
    default:
       unreachable("unknown memory type");
    }
@@ -1256,13 +1449,26 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
       src[num_src++] = memory;
       if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
          src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
-         if (!is_load)
-            src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
+         switch (instr->intrinsic) {
+         case nir_intrinsic_atomic_counter_inc:
+            src[num_src++] = ureg_imm1i(c->ureg, 1);
+            break;
+         case nir_intrinsic_atomic_counter_post_dec:
+            src[num_src++] = ureg_imm1i(c->ureg, -1);
+            break;
+         default:
+            if (!is_load)
+               src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
+            break;
+         }
       }
    }
 
 
    switch (instr->intrinsic) {
+   case nir_intrinsic_atomic_counter_add:
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_post_dec:
    case nir_intrinsic_ssbo_atomic_add:
    case nir_intrinsic_shared_atomic_add:
       opcode = TGSI_OPCODE_ATOMUADD;
@@ -1271,10 +1477,12 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
    case nir_intrinsic_shared_atomic_fadd:
       opcode = TGSI_OPCODE_ATOMFADD;
       break;
+   case nir_intrinsic_atomic_counter_min:
    case nir_intrinsic_ssbo_atomic_imin:
    case nir_intrinsic_shared_atomic_imin:
       opcode = TGSI_OPCODE_ATOMIMIN;
       break;
+   case nir_intrinsic_atomic_counter_max:
    case nir_intrinsic_ssbo_atomic_imax:
    case nir_intrinsic_shared_atomic_imax:
       opcode = TGSI_OPCODE_ATOMIMAX;
@@ -1287,27 +1495,33 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
    case nir_intrinsic_shared_atomic_umax:
       opcode = TGSI_OPCODE_ATOMUMAX;
       break;
+   case nir_intrinsic_atomic_counter_and:
    case nir_intrinsic_ssbo_atomic_and:
    case nir_intrinsic_shared_atomic_and:
       opcode = TGSI_OPCODE_ATOMAND;
       break;
+   case nir_intrinsic_atomic_counter_or:
    case nir_intrinsic_ssbo_atomic_or:
    case nir_intrinsic_shared_atomic_or:
       opcode = TGSI_OPCODE_ATOMOR;
       break;
+   case nir_intrinsic_atomic_counter_xor:
    case nir_intrinsic_ssbo_atomic_xor:
    case nir_intrinsic_shared_atomic_xor:
       opcode = TGSI_OPCODE_ATOMXOR;
       break;
+   case nir_intrinsic_atomic_counter_exchange:
    case nir_intrinsic_ssbo_atomic_exchange:
    case nir_intrinsic_shared_atomic_exchange:
       opcode = TGSI_OPCODE_ATOMXCHG;
       break;
+   case nir_intrinsic_atomic_counter_comp_swap:
    case nir_intrinsic_ssbo_atomic_comp_swap:
    case nir_intrinsic_shared_atomic_comp_swap:
       opcode = TGSI_OPCODE_ATOMCAS;
       src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
       break;
+   case nir_intrinsic_atomic_counter_read:
    case nir_intrinsic_load_ssbo:
    case nir_intrinsic_load_shared:
       opcode = TGSI_OPCODE_LOAD;
@@ -1347,27 +1561,8 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
                     qualifier,
                     TGSI_TEXTURE_BUFFER,
                     0 /* format: unused */);
-}
 
-static enum tgsi_texture_type
-tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
-{
-   switch (dim) {
-   case GLSL_SAMPLER_DIM_1D:
-      return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
-   case GLSL_SAMPLER_DIM_2D:
-      return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
-   case GLSL_SAMPLER_DIM_3D:
-      return TGSI_TEXTURE_3D;
-   case GLSL_SAMPLER_DIM_CUBE:
-      return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
-   case GLSL_SAMPLER_DIM_RECT:
-      return TGSI_TEXTURE_RECT;
-   case GLSL_SAMPLER_DIM_BUF:
-      return TGSI_TEXTURE_BUFFER;
-   default:
-      unreachable("unknown sampler dim");
-   }
+   ureg_release_temporary(c->ureg, addr_temp);
 }
 
 static void
@@ -1381,7 +1576,7 @@ ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
 
    struct ureg_dst temp = ureg_dst_undef();
 
-   enum tgsi_texture_type target = tgsi_target_from_sampler_dim(dim, is_array);
+   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
 
    struct ureg_src resource =
       ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
@@ -1530,6 +1725,10 @@ ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
 
       switch (bary_instr->intrinsic) {
       case nir_intrinsic_load_barycentric_pixel:
+      case nir_intrinsic_load_barycentric_sample:
+         /* For these, we know that the barycentric load matches the
+          * interpolation on the input declaration, so we can use it directly.
+          */
          ntt_store(c, &instr->dest, input);
          break;
 
@@ -1547,9 +1746,9 @@ ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
          break;
 
       case nir_intrinsic_load_barycentric_at_sample:
+         /* We stored the sample in the fake "bary" dest. */
          ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
-                            ureg_imm1u(c->ureg,
-                                       nir_src_as_uint(bary_instr->src[0])));
+                            ntt_get_src(c, instr->src[0]));
          break;
 
       case nir_intrinsic_load_barycentric_at_offset:
@@ -1583,7 +1782,7 @@ ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
    }
 
    uint32_t frac;
-   struct ureg_dst out = ntt_store_output_decl(c, instr, &frac);
+   struct ureg_dst out = ntt_output_decl(c, instr, &frac);
 
    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
       out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
@@ -1605,6 +1804,29 @@ ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
 }
 
 static void
+ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
+{
+   /* Reads an output back into instr->dest.  ntt_try_store_in_tgsi_output()
+    * optimization is not valid if load_output is present.
+    */
+   assert(c->s->info.stage != MESA_SHADER_VERTEX &&
+          c->s->info.stage != MESA_SHADER_FRAGMENT);
+
+   uint32_t frac; /* filled in by ntt_output_decl(); not read again in this function */
+   struct ureg_dst out = ntt_output_decl(c, instr, &frac);
+
+   if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
+      out = ntt_ureg_dst_indirect(c, out, instr->src[1]); /* src[1] addresses the register */
+      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]); /* src[0] selects the dimension */
+   } else {
+      out = ntt_ureg_dst_indirect(c, out, instr->src[0]); /* only one address source here */
+   }
+
+   ureg_MOV(c->ureg, ntt_get_dest(c, &instr->dest), ureg_src(out)); /* the output register is read as a source */
+   ntt_reladdr_dst_put(c, out); /* presumably hands back address regs taken for 'out' -- confirm */
+}
+
+static void
 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
 {
    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
@@ -1618,6 +1840,23 @@ ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
    uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
    sv = ntt_swizzle_for_write_mask(sv, write_mask);
 
+   /* TGSI and NIR define these intrinsics as always loading ints, but they can
+    * still appear on hardware with non-native-integers fragment shaders using
+    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
+    * means that we actually want floats instead.
+    */
+   if (!c->native_integers) {
+      switch (instr->intrinsic) {
+      case nir_intrinsic_load_vertex_id:
+      case nir_intrinsic_load_instance_id:
+         ureg_U2F(c->ureg, ntt_get_dest(c, &instr->dest), sv);
+         return;
+
+      default:
+         break;
+      }
+   }
+
    ntt_store(c, &instr->dest, sv);
 }
 
@@ -1642,6 +1881,7 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_point_coord:
    case nir_intrinsic_load_front_face:
    case nir_intrinsic_load_sample_id:
+   case nir_intrinsic_load_sample_pos:
    case nir_intrinsic_load_sample_mask_in:
    case nir_intrinsic_load_helper_invocation:
    case nir_intrinsic_load_tess_coord:
@@ -1650,9 +1890,9 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_tess_level_outer:
    case nir_intrinsic_load_tess_level_inner:
    case nir_intrinsic_load_local_invocation_id:
-   case nir_intrinsic_load_work_group_id:
-   case nir_intrinsic_load_num_work_groups:
-   case nir_intrinsic_load_local_group_size:
+   case nir_intrinsic_load_workgroup_id:
+   case nir_intrinsic_load_num_workgroups:
+   case nir_intrinsic_load_workgroup_size:
    case nir_intrinsic_load_subgroup_size:
    case nir_intrinsic_load_subgroup_invocation:
    case nir_intrinsic_load_subgroup_eq_mask:
@@ -1673,6 +1913,11 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
       ntt_emit_store_output(c, instr);
       break;
 
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output:
+      ntt_emit_load_output(c, instr);
+      break;
+
    case nir_intrinsic_discard:
       ureg_KILL(c->ureg);
       break;
@@ -1725,6 +1970,23 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
       ntt_emit_mem(c, instr, nir_var_mem_shared);
       break;
 
+   case nir_intrinsic_atomic_counter_read:
+   case nir_intrinsic_atomic_counter_add:
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_post_dec:
+   case nir_intrinsic_atomic_counter_min:
+   case nir_intrinsic_atomic_counter_max:
+   case nir_intrinsic_atomic_counter_and:
+   case nir_intrinsic_atomic_counter_or:
+   case nir_intrinsic_atomic_counter_xor:
+   case nir_intrinsic_atomic_counter_exchange:
+   case nir_intrinsic_atomic_counter_comp_swap:
+      ntt_emit_mem(c, instr, nir_var_uniform);
+      break;
+   case nir_intrinsic_atomic_counter_pre_dec:
+      unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
+      break;
+
    case nir_intrinsic_image_load:
    case nir_intrinsic_image_store:
    case nir_intrinsic_image_size:
@@ -1743,6 +2005,7 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
       break;
 
    case nir_intrinsic_control_barrier:
+   case nir_intrinsic_memory_barrier_tcs_patch:
       ureg_BARRIER(c->ureg);
       break;
 
@@ -1788,14 +2051,14 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
       break;
 
       /* In TGSI we don't actually generate the barycentric coords, and emit
-       * interp intrinsics later.  However, we do need to store the _at_offset
-       * argument so that we can use it at that point.
+       * interp intrinsics later.  However, we do need to store the
+       * load_barycentric_at_* argument so that we can use it at that point.
        */
    case nir_intrinsic_load_barycentric_pixel:
    case nir_intrinsic_load_barycentric_centroid:
-   case nir_intrinsic_load_barycentric_at_sample:
+   case nir_intrinsic_load_barycentric_sample:
       break;
-
+   case nir_intrinsic_load_barycentric_at_sample:
    case nir_intrinsic_load_barycentric_at_offset:
       ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
       break;
@@ -1811,8 +2074,6 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
 struct ntt_tex_operand_state {
    struct ureg_src srcs[4];
    unsigned i;
-   unsigned chan;
-   bool is_temp[4];
 };
 
 static void
@@ -1825,51 +2086,14 @@ ntt_push_tex_arg(struct ntt_compile *c,
    if (tex_src < 0)
       return;
 
-   struct ureg_src src = ntt_get_src(c, instr->src[tex_src].src);
-   int num_components = nir_tex_instr_src_size(instr, tex_src);
-
-   /* Find which src in the tex args we'll fit in. */
-   if (s->chan + num_components > 4) {
-      s->chan = 0;
-      s->i++;
-   }
-
-   /* Would need to fix up swizzling up to the writemask channel here. */
-   assert(num_components == 1 || s->chan == 0);
-   if (num_components == 1)
-      src = ureg_scalar(src, 0);
-
-   if (ureg_src_is_undef(s->srcs[s->i])) {
-      /* First emit of a tex operand's components, no need for a mov. */
-      s->srcs[s->i] = src;
-   } else {
-      /* Otherwise, we need to have a temporary for all the components that go
-       * in this operand.
-       */
-      if (!s->is_temp[s->i]) {
-         struct ureg_src prev_src = s->srcs[s->i];
-         s->srcs[s->i] = ureg_src(ureg_DECL_temporary(c->ureg));
-         s->is_temp[s->i] = true;
-
-         ureg_MOV(c->ureg,
-                  ureg_writemask(ureg_dst(s->srcs[s->i]),
-                                 BITFIELD_MASK(s->chan)), prev_src);
-      }
-
-      ureg_MOV(c->ureg,
-               ureg_writemask(ureg_dst(s->srcs[s->i]),
-                              BITFIELD_RANGE(s->chan, num_components)),
-               src);
-   }
-
-   s->chan += num_components;
+   s->srcs[s->i++] = ntt_get_src(c, instr->src[tex_src].src);
 }
 
 static void
 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
 {
    struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
-   unsigned target;
+   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
    unsigned tex_opcode;
 
    struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
@@ -1881,7 +2105,11 @@ ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
 
    switch (instr->op) {
    case nir_texop_tex:
-      tex_opcode = TGSI_OPCODE_TEX;
+      if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
+         MAX2(instr->coord_components, 2) + instr->is_shadow)
+         tex_opcode = TGSI_OPCODE_TXP;
+      else
+         tex_opcode = TGSI_OPCODE_TEX;
       break;
    case nir_texop_txf:
    case nir_texop_txf_ms:
@@ -1891,7 +2119,7 @@ ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
          int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
          if (lod_src >= 0 &&
              nir_src_is_const(instr->src[lod_src].src) &&
-             nir_src_as_uint(instr->src[lod_src].src) == 0) {
+             ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
             tex_opcode = TGSI_OPCODE_TXF_LZ;
          }
       }
@@ -1925,92 +2153,16 @@ ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
    }
 
    struct ntt_tex_operand_state s = { .i = 0 };
-   ntt_push_tex_arg(c, instr, nir_tex_src_coord, &s);
-   /* We always have at least two slots for the coordinate, even on 1D. */
-   s.chan = MAX2(s.chan, 2);
-
-   ntt_push_tex_arg(c, instr, nir_tex_src_comparator, &s);
-   s.chan = MAX2(s.chan, 3);
+   ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
+   ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
 
-   ntt_push_tex_arg(c, instr, nir_tex_src_bias, &s);
-   if (tex_opcode != TGSI_OPCODE_TXF_LZ)
+   /* non-coord arg for TXQ */
+   if (tex_opcode == TGSI_OPCODE_TXQ) {
       ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
-
-   /* End of packed src setup, everything that follows gets its own operand. */
-   if (s.chan)
-      s.i++;
-
-   switch (instr->sampler_dim) {
-   case GLSL_SAMPLER_DIM_1D:
-      if (instr->is_array) {
-         if (instr->is_shadow) {
-            target = TGSI_TEXTURE_SHADOW1D_ARRAY;
-         } else {
-            target = TGSI_TEXTURE_1D_ARRAY;
-         }
-      } else {
-         if (instr->is_shadow) {
-            target = TGSI_TEXTURE_SHADOW1D;
-         } else {
-            target = TGSI_TEXTURE_1D;
-         }
-      }
-      break;
-   case GLSL_SAMPLER_DIM_2D:
-   case GLSL_SAMPLER_DIM_EXTERNAL:
-      if (instr->is_array) {
-         if (instr->is_shadow) {
-            target = TGSI_TEXTURE_SHADOW2D_ARRAY;
-         } else {
-            target = TGSI_TEXTURE_2D_ARRAY;
-         }
-      } else {
-         if (instr->is_shadow) {
-            target = TGSI_TEXTURE_SHADOW2D;
-         } else {
-            target = TGSI_TEXTURE_2D;
-         }
-      }
-      break;
-   case GLSL_SAMPLER_DIM_MS:
-      if (instr->is_array) {
-         target = TGSI_TEXTURE_2D_ARRAY_MSAA;
-      } else {
-         target = TGSI_TEXTURE_2D_ARRAY;
-      }
-      break;
-   case GLSL_SAMPLER_DIM_3D:
-      assert(!instr->is_shadow);
-      target = TGSI_TEXTURE_3D;
-      break;
-   case GLSL_SAMPLER_DIM_RECT:
-      if (instr->is_shadow) {
-         target = TGSI_TEXTURE_SHADOWRECT;
-      } else {
-         target = TGSI_TEXTURE_RECT;
-      }
-      break;
-   case GLSL_SAMPLER_DIM_CUBE:
-      if (instr->is_array) {
-         if (instr->is_shadow) {
-            target = TGSI_TEXTURE_SHADOWCUBE_ARRAY;
-         } else {
-            target = TGSI_TEXTURE_CUBE_ARRAY;
-         }
-      } else {
-         if (instr->is_shadow) {
-            target = TGSI_TEXTURE_SHADOWCUBE;
-         } else {
-            target = TGSI_TEXTURE_CUBE;
-         }
-      }
-      break;
-   case GLSL_SAMPLER_DIM_BUF:
-      target = TGSI_TEXTURE_BUFFER;
-      break;
-   default:
-      fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim);
-      abort();
+      /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
+       * scalar
+       */
+      s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
    }
 
    if (s.i > 1) {
@@ -2090,11 +2242,6 @@ ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
       ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
       ureg_release_temporary(c->ureg, tex_dst);
    }
-
-   for (int i = 0; i < s.i; i++) {
-      if (s.is_temp[i])
-         ureg_release_temporary(c->ureg, ureg_dst(s.srcs[i]));
-   }
 }
 
 static void
@@ -2209,7 +2356,7 @@ ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
    if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
       return;
 
-   ureg_release_temporary(c->ureg, c->ssa_temp[index]);
+   ureg_release_temporary(c->ureg, ureg_dst(c->ssa_temp[index]));
    memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
 }
 
@@ -2294,7 +2441,7 @@ ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
    c->impl = impl;
    c->liveness = nir_live_ssa_defs_per_instr(impl);
 
-   c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
+   c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
 
    ntt_setup_registers(c, &impl->registers);
@@ -2402,7 +2549,6 @@ static void
 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
 {
    bool progress;
-   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
    unsigned control_flow_depth =
       screen->get_shader_param(screen, pipe_stage,
@@ -2436,7 +2582,7 @@ ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
       NIR_PASS(progress, s, nir_opt_trivial_continues);
       NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
       NIR_PASS(progress, s, nir_opt_undef);
-      NIR_PASS(progress, s, nir_opt_loop_unroll, no_indirects_mask);
+      NIR_PASS(progress, s, nir_opt_loop_unroll);
 
    } while (progress);
 }
@@ -2649,6 +2795,110 @@ nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
                                        NULL);
 }
 
+struct ntt_lower_tex_state {
+   nir_ssa_def *channels[8]; /* scalar channels gathered so far; NULL marks a slot left unfilled */
+   unsigned i; /* index of the next slot of channels[] to fill */
+};
+
+static void
+nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
+                                nir_tex_instr *instr,
+                                nir_tex_src_type tex_src_type,
+                                struct ntt_lower_tex_state *s)
+{ /* Moves instr's tex_src_type source, channel by channel, into s->channels[]. */
+   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
+   if (tex_src < 0) /* negative index: nothing of this type to collect */
+      return;
+
+   assert(instr->src[tex_src].src.is_ssa);
+
+   nir_ssa_def *def = instr->src[tex_src].src.ssa;
+   for (int i = 0; i < def->num_components; i++) {
+      s->channels[s->i++] = nir_channel(b, def, i); /* NOTE(review): s->i is not bounds-checked against channels[] here */
+   }
+
+   nir_tex_instr_remove_src(instr, tex_src); /* the source is dropped from instr once its channels are recorded in s */
+}
+
+/**
+ * Merges the tex coordinate/compare/bias/lod/projector/ms_index channels into
+ * backend tex srcs (backend1 takes the first 4, backend2 the rest).  This lets
+ * NIR handle the coalescing rather than us, and may lead to more vectorization.
+ */
+static bool
+nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
+{
+   if (instr->type != nir_instr_type_tex)
+      return false;
+
+   nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+   if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
+      return false; /* no coordinate source: the instruction is left untouched */
+
+   /* NIR after lower_tex will have LOD set to 0 for tex ops that wanted
+    * implicit lod in shader stages that don't have quad-based derivatives.
+    * TGSI doesn't want that, it requires that the backend do implicit LOD 0 for
+    * those stages.
+    */
+   if (!nir_shader_supports_implicit_lod(b->shader) && tex->op == nir_texop_txl) {
+      int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod); /* NOTE(review): used unchecked; presumably txl always has an lod src */
+      nir_src *lod_src = &tex->src[lod_index].src;
+      if (nir_src_is_const(*lod_src) && nir_src_as_uint(*lod_src) == 0) {
+         nir_tex_instr_remove_src(tex, lod_index);
+         tex->op = nir_texop_tex; /* constant-zero LOD txl becomes a plain tex */
+      }
+   }
+
+   b->cursor = nir_before_instr(instr); /* the channel extracts and vecs are built ahead of the tex */
+
+   struct ntt_lower_tex_state s = {0}; /* all channel slots start out NULL */
+
+   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
+   /* We always have at least two slots for the coordinate, even on 1D. */
+   s.i = MAX2(s.i, 2);
+
+   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
+   s.i = MAX2(s.i, 3); /* slots 0..2 stay reserved whether or not a comparator was present */
+
+   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
+
+   /* XXX: LZ */
+   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
+   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
+   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
+
+   /* No need to pack undefs in unused channels of the tex instr */
+   while (!s.channels[s.i - 1]) /* NOTE(review): s.i is unsigned; a NULL channels[0] would wrap s.i - 1 before the assert below */
+      s.i--;
+
+   /* Instead of putting undefs in the unused slots of the vecs, just put in
+    * another used channel.  Otherwise, we'll get unnecessary moves into
+    * registers.
+    */
+   assert(s.channels[0] != NULL);
+   for (int i = 1; i < s.i; i++) {
+      if (!s.channels[i])
+         s.channels[i] = s.channels[0]; /* channel 0 is the filler for every interior gap */
+   }
+
+   nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec(b, s.channels, MIN2(s.i, 4))));
+   if (s.i > 4) /* anything past the first four channels goes into a second source */
+      nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec(b, &s.channels[4], s.i - 4)));
+
+   return true; /* the instruction's sources were rewritten */
+}
+
+static bool
+nir_to_tgsi_lower_tex(nir_shader *s)
+{ /* Runs nir_to_tgsi_lower_tex_instr over s and returns what the pass helper reports. */
+   return nir_shader_instructions_pass(s,
+                                       nir_to_tgsi_lower_tex_instr,
+                                       nir_metadata_block_index |
+                                       nir_metadata_dominance,
+                                       NULL); /* the callback takes no private data */
+}
+
 static void
 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
 {
@@ -2657,20 +2907,27 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
       !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
                                 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
 
+   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
+
    if (!options->lower_extract_byte ||
        !options->lower_extract_word ||
+       !options->lower_insert_byte ||
+       !options->lower_insert_word ||
        !options->lower_fdph ||
        !options->lower_flrp64 ||
        !options->lower_fmod ||
        !options->lower_rotate ||
        !options->lower_uniforms_to_ubo ||
        !options->lower_vector_cmp ||
-       options->lower_fsqrt != lower_fsqrt) {
+       options->lower_fsqrt != lower_fsqrt ||
+       options->force_indirect_unrolling != no_indirects_mask) {
       nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
       *new_options = *s->options;
 
       new_options->lower_extract_byte = true;
       new_options->lower_extract_word = true;
+      new_options->lower_insert_byte = true;
+      new_options->lower_insert_word = true;
       new_options->lower_fdph = true;
       new_options->lower_flrp64 = true;
       new_options->lower_fmod = true;
@@ -2678,11 +2935,116 @@ ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
       new_options->lower_uniforms_to_ubo = true,
       new_options->lower_vector_cmp = true;
       new_options->lower_fsqrt = lower_fsqrt;
+      new_options->force_indirect_unrolling = no_indirects_mask;
 
       s->options = new_options;
    }
 }
 
+static bool
+ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
+{ /* Matches only atomic_counter_pre_dec intrinsics; _data is not used. */
+   return (instr->type == nir_instr_type_intrinsic &&
+           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
+}
+
+static nir_ssa_def *
+ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
+{ /* Retags a pre_dec as post_dec in place and returns its result minus one; _data is not used. */
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   nir_ssa_def *old_result = &intr->dest.ssa;
+   intr->intrinsic = nir_intrinsic_atomic_counter_post_dec; /* the instruction is kept, only its opcode changes */
+
+   return nir_iadd_imm(b, old_result, -1); /* post_dec result - 1 stands in for the pre_dec result */
+}
+
+static bool
+ntt_lower_atomic_pre_dec(nir_shader *s)
+{ /* Runs nir_shader_lower_instructions on s with the pre_dec filter and lower callbacks. */
+   return nir_shader_lower_instructions(s,
+                                        ntt_lower_atomic_pre_dec_filter,
+                                        ntt_lower_atomic_pre_dec_lower, NULL); /* no callback data */
+}
+
+/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
+static void
+nir_to_tgsi_lower_txp(nir_shader *s)
+{
+   nir_lower_tex_options lower_tex_options = {
+       .lower_txp = 0, /* bitmask indexed by sampler dim, filled in by the scan below */
+   };
+
+   nir_foreach_block(block, nir_shader_get_entrypoint(s)) { /* only the entrypoint is scanned */
+      nir_foreach_instr(instr, block) {
+         if (instr->type != nir_instr_type_tex)
+            continue;
+         nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
+            continue; /* no projector on this tex: nothing to decide */
+
+         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
+         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT; /* non-fragment stages are treated as having an LOD */
+         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
+
+         /* We can do TXP for any tex (not txg) where we can fit all the
+          * coordinates and comparator and projector in one vec4 without any
+          * other modifiers to add on.
+          *
+          * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
+          * if we get any funny projectors then we just blow them all away.
+          */
+         if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
+            lower_tex_options.lower_txp |= 1 << tex->sampler_dim; /* one bit per sampler dim to lower */
+      }
+   }
+
+   /* nir_lower_tex must be run even if no options are set, because we need the
+    * LOD to be set for query_levels and for non-fragment shaders.
+    */
+   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
+}
+
+static bool
+nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
+{ /* Matches only load_primitive_id intrinsics; _data is not used. */
+   return (instr->type == nir_instr_type_intrinsic &&
+           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
+}
+
+static nir_ssa_def *
+nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
+{ /* Replaces a load_primitive_id with a load_input of a gl_PrimitiveID input; *data caches the variable. */
+   nir_variable *var = *(nir_variable **)data;
+   if (!var) { /* first match in this shader: create the input variable once */
+      var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
+      var->data.location = VARYING_SLOT_PRIMITIVE_ID;
+      b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
+      var->data.driver_location = b->shader->num_inputs++; /* a new input takes the next input slot, not an output slot */
+
+      *(nir_variable **)data = var;
+   }
+
+   nir_io_semantics semantics = {
+      .location = var->data.location,
+       .num_slots = 1
+   };
+   return nir_load_input(b, 1, 32, nir_imm_int(b, 0), /* one 32-bit component at constant offset 0 */
+                         .base = var->data.driver_location,
+                         .io_semantics = semantics);
+}
+
+static bool
+nir_lower_primid_sysval_to_input(nir_shader *s)
+{ /* Rewrites load_primitive_id intrinsics in s into loads of a gl_PrimitiveID input. */
+   nir_variable *input = NULL; /* created by the lower callback on its first match, then reused */
+
+   return nir_shader_lower_instructions(s,
+                                        nir_lower_primid_sysval_to_input_filter,
+                                        nir_lower_primid_sysval_to_input_lower, &input);
+}
+
 /**
  * Translates the NIR shader to TGSI.
  *
@@ -2709,12 +3071,18 @@ nir_to_tgsi(struct nir_shader *s,
               type_size, (nir_lower_io_options)0);
    NIR_PASS_V(s, nir_lower_regs_to_ssa);
 
-   const nir_lower_tex_options lower_tex_options = {
-      /* XXX: We could skip lowering of TXP for TEX with <=3 coord_compoennts.
-       */
-      .lower_txp = ~0,
-   };
-   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
+   nir_to_tgsi_lower_txp(s);
+   NIR_PASS_V(s, nir_to_tgsi_lower_tex);
+
+   /* While TGSI can represent PRIMID as either an input or a system value,
+    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
+    * depend on that.
+    */
+   if (s->info.stage == MESA_SHADER_GEOMETRY)
+      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
+
+   if (s->info.num_abos)
+      NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
 
    if (!original_options->lower_uniforms_to_ubo) {
       NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
@@ -2754,6 +3122,9 @@ nir_to_tgsi(struct nir_shader *s,
    } else {
       NIR_PASS_V(s, nir_lower_int_to_float);
       NIR_PASS_V(s, nir_lower_bool_to_float);
+      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
+      NIR_PASS_V(s, nir_copy_prop);
+      NIR_PASS_V(s, nir_opt_dce);
    }
 
    /* Only lower 32-bit floats.  The only other modifier type officially
@@ -2789,6 +3160,7 @@ nir_to_tgsi(struct nir_shader *s,
    ureg_setup_shader_info(c->ureg, &s->info);
 
    ntt_setup_inputs(c);
+   ntt_setup_outputs(c);
    ntt_setup_uniforms(c);
 
    if (s->info.stage == MESA_SHADER_FRAGMENT) {
@@ -2835,6 +3207,8 @@ static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
    .fuse_ffma64 = true,
    .lower_extract_byte = true,
    .lower_extract_word = true,
+   .lower_insert_byte = true,
+   .lower_insert_word = true,
    .lower_fdph = true,
    .lower_flrp64 = true,
    .lower_fmod = true,
diff --git a/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi_info.c b/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi_info.c
index 65fc8d2d6..3bb5f1f8b 100644
--- a/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi_info.c
+++ b/lib/mesa/src/gallium/auxiliary/nir/nir_to_tgsi_info.c
@@ -220,21 +220,21 @@ static void scan_instruction(const struct nir_shader *nir,
       case nir_intrinsic_load_invocation_id:
          info->uses_invocationid = true;
          break;
-      case nir_intrinsic_load_num_work_groups:
+      case nir_intrinsic_load_num_workgroups:
          info->uses_grid_size = true;
          break;
-      case nir_intrinsic_load_local_group_size:
+      case nir_intrinsic_load_workgroup_size:
          /* The block size is translated to IMM with a fixed block size. */
          if (info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0)
             info->uses_block_size = true;
          break;
       case nir_intrinsic_load_local_invocation_id:
-      case nir_intrinsic_load_work_group_id: {
+      case nir_intrinsic_load_workgroup_id: {
          unsigned mask = nir_ssa_def_components_read(&intr->dest.ssa);
          while (mask) {
             unsigned i = u_bit_scan(&mask);
 
-            if (intr->intrinsic == nir_intrinsic_load_work_group_id)
+            if (intr->intrinsic == nir_intrinsic_load_workgroup_id)
                info->uses_block_id[i] = true;
             else
                info->uses_thread_id[i] = true;
@@ -487,9 +487,9 @@ void nir_tgsi_scan_shader(const struct nir_shader *nir,
    }
 
    if (gl_shader_stage_is_compute(nir->info.stage)) {
-      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.cs.local_size[0];
-      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.cs.local_size[1];
-      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.cs.local_size[2];
+      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] = nir->info.workgroup_size[0];
+      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] = nir->info.workgroup_size[1];
+      info->properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] = nir->info.workgroup_size[2];
    }
 
    i = 0;
@@ -498,7 +498,7 @@ void nir_tgsi_scan_shader(const struct nir_shader *nir,
       unsigned semantic_name, semantic_index;
 
       const struct glsl_type *type = variable->type;
-      if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+      if (nir_is_arrayed_io(variable, nir->info.stage)) {
          assert(glsl_type_is_array(type));
          type = glsl_get_array_element(type);
       }
@@ -598,7 +598,7 @@ void nir_tgsi_scan_shader(const struct nir_shader *nir,
       i = variable->data.driver_location;
 
       const struct glsl_type *type = variable->type;
-      if (nir_is_per_vertex_io(variable, nir->info.stage)) {
+      if (nir_is_arrayed_io(variable, nir->info.stage)) {
          assert(glsl_type_is_array(type));
          type = glsl_get_array_element(type);
       }
diff --git a/lib/mesa/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h b/lib/mesa/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
index 8e7294629..3b630f776 100644
--- a/lib/mesa/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
+++ b/lib/mesa/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
@@ -31,11 +31,12 @@ DRI_CONF_SECTION_DEBUG
    DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD(false)
    DRI_CONF_GLSL_IGNORE_WRITE_TO_READONLY_VAR(false)
    DRI_CONF_ALLOW_DRAW_OUT_OF_ORDER(false)
-   DRI_CONF_ALLOW_INCORRECT_PRIMITIVE_ID(false)
    DRI_CONF_FORCE_COMPAT_PROFILE(false)
    DRI_CONF_FORCE_GL_NAMES_REUSE(false)
    DRI_CONF_TRANSCODE_ETC(false)
+   DRI_CONF_TRANSCODE_ASTC(false)
    DRI_CONF_FORCE_GL_VENDOR()
+   DRI_CONF_FORCE_GL_RENDERER()
    DRI_CONF_OVERRIDE_VRAM_SIZE()
    DRI_CONF_GLX_EXTENSION_OVERRIDE()
    DRI_CONF_INDIRECT_GL_EXTENSION_OVERRIDE()
@@ -47,7 +48,7 @@ DRI_CONF_SECTION_MISCELLANEOUS
    DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER(false)
    DRI_CONF_GLSL_ZERO_INIT(false)
    DRI_CONF_VS_POSITION_ALWAYS_INVARIANT(false)
+   DRI_CONF_VS_POSITION_ALWAYS_PRECISE(false)
    DRI_CONF_ALLOW_RGB10_CONFIGS(true)
-   DRI_CONF_ALLOW_FP16_CONFIGS(false)
    DRI_CONF_FORCE_INTEGER_TEX_NEAREST(false)
 DRI_CONF_SECTION_END
diff --git a/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 8c0545505..7e3e8a4b1 100644
--- a/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -64,13 +64,13 @@ enum pb_usage_flags {
    PB_USAGE_CPU_WRITE = (1 << 1),
    PB_USAGE_GPU_READ = (1 << 2),
    PB_USAGE_GPU_WRITE = (1 << 3),
-   PB_USAGE_DONTBLOCK = (1 << 9),
-   PB_USAGE_UNSYNCHRONIZED = (1 << 10),
+   PB_USAGE_DONTBLOCK = (1 << 4),
+   PB_USAGE_UNSYNCHRONIZED = (1 << 5),
    /* Persistent mappings may remain across a flush. Note that contrary
     * to OpenGL persistent maps, there is no requirement at the pipebuffer
     * api level to explicitly enforce coherency by barriers or range flushes.
     */
-   PB_USAGE_PERSISTENT = (1 << 13)
+   PB_USAGE_PERSISTENT = (1 << 8)
 };
 
 /* For error checking elsewhere */
@@ -288,7 +288,7 @@ pb_reference_with_winsys(void *winsys,
  * the requested or not.
  */
 static inline boolean
-pb_check_alignment(pb_size requested, pb_size provided)
+pb_check_alignment(uint32_t requested, uint32_t provided)
 {
    if (!requested)
       return TRUE;
@@ -310,16 +310,6 @@ pb_check_usage(unsigned requested, unsigned provided)
    return (requested & provided) == requested ? TRUE : FALSE;
 }
 
-
-/**
- * Malloc-based buffer to store data that can't be used by the graphics
- * hardware.
- */
-struct pb_buffer *
-pb_malloc_buffer_create(pb_size size,
-                        const struct pb_desc *desc);
-
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 4595e1a40..d48f79c23 100644
--- a/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/lib/mesa/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -232,7 +232,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    }
    
    assert(pipe_is_referenced(&buf->buffer->reference));
-   assert(pb_check_alignment(desc->alignment, 1ull << buf->buffer->alignment_log2));
+   assert(pb_check_alignment(desc->alignment, 1u << buf->buffer->alignment_log2));
    assert(buf->buffer->size >= size);
    
    pipe_reference_init(&buf->base.reference, 1);
diff --git a/lib/mesa/src/gallium/auxiliary/postprocess/pp_colors.c b/lib/mesa/src/gallium/auxiliary/postprocess/pp_colors.c
index f319ebb22..e7ce77758 100644
--- a/lib/mesa/src/gallium/auxiliary/postprocess/pp_colors.c
+++ b/lib/mesa/src/gallium/auxiliary/postprocess/pp_colors.c
@@ -47,7 +47,7 @@ pp_nocolor(struct pp_queue_t *ppq, struct pipe_resource *in,
    pp_filter_misc_state(p);
 
    cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
-   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &p->view);
+   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &p->view);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][0]);
    cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][1]);
diff --git a/lib/mesa/src/gallium/auxiliary/postprocess/pp_mlaa.c b/lib/mesa/src/gallium/auxiliary/postprocess/pp_mlaa.c
index 2bc2ac873..102e71f48 100644
--- a/lib/mesa/src/gallium/auxiliary/postprocess/pp_mlaa.c
+++ b/lib/mesa/src/gallium/auxiliary/postprocess/pp_mlaa.c
@@ -134,7 +134,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
       const struct pipe_sampler_state *samplers[] = {&p->sampler_point};
       cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers);
    }
-   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &p->view);
+   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &p->view);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][1]);    /* offsetvs */
    cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][2]);
@@ -166,7 +166,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    }
 
    arr[0] = p->view;
-   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 3, 0, arr);
+   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 3, 0, false, arr);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][0]);    /* passvs */
    cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][3]);
@@ -198,7 +198,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    }
 
    arr[1] = p->view;
-   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, arr);
+   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, false, arr);
 
    cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][1]);    /* offsetvs */
    cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][4]);
diff --git a/lib/mesa/src/gallium/auxiliary/postprocess/pp_run.c b/lib/mesa/src/gallium/auxiliary/postprocess/pp_run.c
index 3615f348c..93e0fa7b7 100644
--- a/lib/mesa/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/lib/mesa/src/gallium/auxiliary/postprocess/pp_run.c
@@ -184,14 +184,11 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    }
 
    /* restore state we changed */
-   cso_restore_state(cso);
-
-   /* Unbind resources that we have bound. */
-   struct pipe_context *pipe = ppq->p->pipe;
-   pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, false, NULL);
-   pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, false, NULL);
-   pipe->set_vertex_buffers(pipe, 0, 0, 1, false, NULL);
-   pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 0, 3, NULL);
+   cso_restore_state(cso, CSO_UNBIND_FS_SAMPLERVIEWS |
+                          CSO_UNBIND_FS_IMAGE0 |
+                          CSO_UNBIND_VS_CONSTANTS |
+                          CSO_UNBIND_FS_CONSTANTS |
+                          CSO_UNBIND_VERTEX_BUFFER0);
 
    /* restore states not restored by cso */
    if (ppq->p->st) {
diff --git a/lib/mesa/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/lib/mesa/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index 76eda8467..4fb74993d 100644
--- a/lib/mesa/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/lib/mesa/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -33,6 +33,10 @@
 #include "d3d12/d3d12_public.h"
 #endif
 
+#ifdef GALLIUM_ASAHI
+#include "asahi/agx_public.h"
+#endif
+
 static inline struct pipe_screen *
 sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 {
@@ -71,30 +75,38 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
       screen = d3d12_create_dxcore_screen(winsys, NULL);
 #endif
 
+#if defined(GALLIUM_ASAHI)
+   if (screen == NULL && strcmp(driver, "asahi") == 0)
+      screen = agx_screen_create(winsys);
+#endif
+
    return screen ? debug_screen_wrap(screen) : NULL;
 }
 
 
 static inline struct pipe_screen *
-sw_screen_create(struct sw_winsys *winsys)
+sw_screen_create_vk(struct sw_winsys *winsys, bool sw_vk)
 {
    UNUSED bool only_sw = env_var_as_boolean("LIBGL_ALWAYS_SOFTWARE", false);
    const char *drivers[] = {
-      debug_get_option("GALLIUM_DRIVER", ""),
+      (sw_vk ? "" : debug_get_option("GALLIUM_DRIVER", "")),
 #if defined(GALLIUM_D3D12)
-      only_sw ? "" : "d3d12",
+      (sw_vk || only_sw) ? "" : "d3d12",
+#endif
+#if defined(GALLIUM_ASAHI)
+      (sw_vk || only_sw) ? "" : "asahi",
 #endif
 #if defined(GALLIUM_LLVMPIPE)
       "llvmpipe",
 #endif
 #if defined(GALLIUM_SOFTPIPE)
-      "softpipe",
+      (sw_vk ? "" : "softpipe"),
 #endif
 #if defined(GALLIUM_SWR)
-      "swr",
+      (sw_vk ? "" : "swr"),
 #endif
 #if defined(GALLIUM_ZINK)
-      only_sw ? "" : "zink",
+      (sw_vk || only_sw) ? "" : "zink",
 #endif
    };
 
@@ -109,4 +121,9 @@ sw_screen_create(struct sw_winsys *winsys)
    return NULL;
 }
 
+static inline struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   return sw_screen_create_vk(winsys, false /* sw_vk */);
+}
 #endif
diff --git a/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper.h b/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper.h
index 88a5086d2..059ae2d44 100644
--- a/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper.h
+++ b/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper.h
@@ -21,6 +21,10 @@
 #include "d3d12/d3d12_public.h"
 #endif
 
+#ifdef GALLIUM_ASAHI
+#include "asahi/agx_public.h"
+#endif
+
 #ifdef GALLIUM_SOFTPIPE
 #include "softpipe/sp_public.h"
 #endif
@@ -76,30 +80,37 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
       screen = d3d12_create_dxcore_screen(winsys, NULL);
 #endif
 
+#if defined(GALLIUM_ASAHI)
+   if (screen == NULL && strcmp(driver, "asahi") == 0)
+      screen = agx_screen_create(winsys);
+#endif
+
    return screen;
 }
 
-
 struct pipe_screen *
-sw_screen_create(struct sw_winsys *winsys)
+sw_screen_create_vk(struct sw_winsys *winsys, bool sw_vk)
 {
    UNUSED bool only_sw = env_var_as_boolean("LIBGL_ALWAYS_SOFTWARE", false);
    const char *drivers[] = {
-      debug_get_option("GALLIUM_DRIVER", ""),
+      (sw_vk ? "" : debug_get_option("GALLIUM_DRIVER", "")),
 #if defined(GALLIUM_D3D12)
-      only_sw ? "" : "d3d12",
+      (sw_vk || only_sw) ? "" : "d3d12",
+#endif
+#if defined(GALLIUM_ASAHI)
+      (sw_vk || only_sw) ? "" : "asahi",
 #endif
 #if defined(GALLIUM_LLVMPIPE)
       "llvmpipe",
 #endif
 #if defined(GALLIUM_SOFTPIPE)
-      "softpipe",
+      sw_vk ? "" : "softpipe",
 #endif
 #if defined(GALLIUM_SWR)
-      "swr",
+      sw_vk ? "" : "swr",
 #endif
 #if defined(GALLIUM_ZINK)
-      only_sw ? "" : "zink",
+      (sw_vk || only_sw) ? "" : "zink",
 #endif
    };
 
@@ -114,4 +125,9 @@ sw_screen_create(struct sw_winsys *winsys)
    return NULL;
 }
 
+struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   return sw_screen_create_vk(winsys, false /* sw_vk */);
+}
 #endif
diff --git a/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper_public.h b/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper_public.h
index 12b301b6a..499813cca 100644
--- a/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper_public.h
+++ b/lib/mesa/src/gallium/auxiliary/target-helpers/sw_helper_public.h
@@ -5,6 +5,9 @@ struct pipe_screen;
 struct sw_winsys;
 
 struct pipe_screen *
+sw_screen_create_vk(struct sw_winsys *winsys, bool sw_vk);
+
+struct pipe_screen *
 sw_screen_create(struct sw_winsys *winsys);
 
 #endif /* _SW_HELPER_PUBLIC_H */
diff --git a/lib/mesa/src/gallium/auxiliary/translate/translate.h b/lib/mesa/src/gallium/auxiliary/translate/translate.h
index d77561aa7..b70d90b09 100644
--- a/lib/mesa/src/gallium/auxiliary/translate/translate.h
+++ b/lib/mesa/src/gallium/auxiliary/translate/translate.h
@@ -45,12 +45,19 @@
 #include "pipe/p_state.h"
 
 /**
- * Translate has to work on one more attribute because
- * the draw module has to be able to pass the vertex
- * position even if the fragment shader already consumes
- * PIPE_MAX_ATTRIBS inputs.
+ * Translate has to work on six more attributes because
+ * the draw module has to be able to pass a few fixed
+ * function vertex shader outputs even if the fragment
+ * shader already consumes PIPE_MAX_ATTRIBS inputs.
+ *
+ * These vertex shader outputs include:
+ * - position
+ * - bcolor (up to two)
+ * - point-size
+ * - viewport index
+ * - layer
  */
-#define TRANSLATE_MAX_ATTRIBS (PIPE_MAX_ATTRIBS + 1)
+#define TRANSLATE_MAX_ATTRIBS (PIPE_MAX_ATTRIBS + 6)
 
 enum translate_element_type {
    TRANSLATE_ELEMENT_NORMAL,
@@ -132,6 +139,7 @@ boolean translate_is_output_format_supported(enum pipe_format format);
 
 static inline int translate_keysize( const struct translate_key *key )
 {
+   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element);
 }
 
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_box.h b/lib/mesa/src/gallium/auxiliary/util/u_box.h
index 764bf5037..c39e13964 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_box.h
+++ b/lib/mesa/src/gallium/auxiliary/util/u_box.h
@@ -3,6 +3,7 @@
 
 #include "pipe/p_state.h"
 #include "util/u_math.h"
+#include "util/format/u_format.h"
 
 static inline void
 u_box_1d(unsigned x, unsigned w, struct pipe_box *box)
@@ -239,4 +240,22 @@ u_box_minify_3d(struct pipe_box *dst,
    dst->depth = MAX2(src->depth >> l, 1);
 }
 
+/* Translate a region-of-interest given in pixels into the equivalent region
+ * in blocks for an image of the given format: the origin is divided down, the
+ * extent is rounded up.  This is trivial (a copy) for uncompressed formats.
+ */
+static inline void
+u_box_pixels_to_blocks(struct pipe_box *blocks,
+                       const struct pipe_box *pixels, enum pipe_format format)
+{
+   const unsigned block_w = util_format_get_blockwidth(format);
+   const unsigned block_h = util_format_get_blockheight(format);
+
+   u_box_3d(pixels->x / block_w, pixels->y / block_h, pixels->z,
+            DIV_ROUND_UP(pixels->width, block_w),
+            DIV_ROUND_UP(pixels->height, block_h),
+            pixels->depth,
+            blocks);
+}
+
 #endif
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_compute.c b/lib/mesa/src/gallium/auxiliary/util/u_compute.c
index 79755abaf..8d4d871b2 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_compute.c
+++ b/lib/mesa/src/gallium/auxiliary/util/u_compute.c
@@ -76,7 +76,7 @@ static void *blit_compute_shader(struct pipe_context *ctx)
 }
 
 void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_info,
-                       void **compute_state)
+                       void **compute_state, bool half_texel_offset)
 {
    if (blit_info->src.box.width == 0 || blit_info->src.box.height == 0 ||
        blit_info->dst.box.width == 0 || blit_info->dst.box.height == 0)
@@ -91,9 +91,10 @@ void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_inf
    float x_scale = blit_info->src.box.width / (float)blit_info->dst.box.width;
    float y_scale = blit_info->src.box.height / (float)blit_info->dst.box.height;
    float z_scale = blit_info->src.box.depth / (float)blit_info->dst.box.depth;
+   float offset = half_texel_offset ? 0.5 : 0.0;
 
-   unsigned data[] = {u_bitcast_f2u(blit_info->src.box.x / (float)src->width0),
-                      u_bitcast_f2u(blit_info->src.box.y / (float)src->height0),
+   unsigned data[] = {u_bitcast_f2u((blit_info->src.box.x + offset) / (float)src->width0),
+                      u_bitcast_f2u((blit_info->src.box.y + offset) / (float)src->height0),
                       u_bitcast_f2u(blit_info->src.box.z),
                       u_bitcast_f2u(0),
                       u_bitcast_f2u(x_scale / src->width0),
@@ -138,7 +139,7 @@ void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_inf
    u_sampler_view_default_template(&src_templ, src, src->format);
    src_templ.format = util_format_linear(blit_info->src.format);
    src_view = ctx->create_sampler_view(ctx, src, &src_templ);
-   ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &src_view);
+   ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, false, &src_view);
 
    if (!*compute_state)
      *compute_state = blit_compute_shader(ctx);
@@ -159,7 +160,7 @@ void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_inf
 
    ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 0, 1, NULL);
    ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, false, NULL);
-   ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 0, 1, NULL);
+   ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 0, 1, false, NULL);
    pipe_sampler_view_reference(&src_view, NULL);
    ctx->delete_sampler_state(ctx, sampler_state_p);
    ctx->bind_compute_state(ctx, NULL);
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_compute.h b/lib/mesa/src/gallium/auxiliary/util/u_compute.h
index 8c2866af8..4a6c66e0e 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_compute.h
+++ b/lib/mesa/src/gallium/auxiliary/util/u_compute.h
@@ -36,7 +36,7 @@ extern "C" {
 #endif
 
 void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_info,
-                       void **compute_state);
+                       void **compute_state, bool half_texel_offset);
 
 #ifdef __cplusplus
 }
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_debug_image.c b/lib/mesa/src/gallium/auxiliary/util/u_debug_image.c
index 91bfa10af..fd0513f65 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_debug_image.c
+++ b/lib/mesa/src/gallium/auxiliary/util/u_debug_image.c
@@ -113,10 +113,10 @@ debug_dump_surface(struct pipe_context *pipe,
     */
    texture = surface->texture;
 
-   data = pipe_transfer_map(pipe, texture, surface->u.tex.level,
-                            surface->u.tex.first_layer,
-                            PIPE_MAP_READ,
-                            0, 0, surface->width, surface->height, &transfer);
+   data = pipe_texture_map(pipe, texture, surface->u.tex.level,
+                           surface->u.tex.first_layer,
+                           PIPE_MAP_READ,
+                           0, 0, surface->width, surface->height, &transfer);
    if (!data)
       return;
 
@@ -128,7 +128,7 @@ debug_dump_surface(struct pipe_context *pipe,
                     transfer->stride,
                     data);
 
-   pipe->transfer_unmap(pipe, transfer);
+   pipe->texture_unmap(pipe, transfer);
 }
 
 
@@ -192,13 +192,13 @@ debug_dump_surface_bmp(struct pipe_context *pipe,
    struct pipe_resource *texture = surface->texture;
    void *ptr;
 
-   ptr = pipe_transfer_map(pipe, texture, surface->u.tex.level,
-                           surface->u.tex.first_layer, PIPE_MAP_READ,
-                           0, 0, surface->width, surface->height, &transfer);
+   ptr = pipe_texture_map(pipe, texture, surface->u.tex.level,
+                          surface->u.tex.first_layer, PIPE_MAP_READ,
+                          0, 0, surface->width, surface->height, &transfer);
 
    debug_dump_transfer_bmp(pipe, filename, transfer, ptr);
 
-   pipe->transfer_unmap(pipe, transfer);
+   pipe->texture_unmap(pipe, transfer);
 }
 
 void
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_driconf.c b/lib/mesa/src/gallium/auxiliary/util/u_driconf.c
new file mode 100644
index 000000000..8ace84747
--- /dev/null
+++ b/lib/mesa/src/gallium/auxiliary/util/u_driconf.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "u_driconf.h"
+
+void
+u_driconf_fill_st_options(struct st_config_options *options,
+                          const struct driOptionCache *optionCache)
+{
+#define query_option_impl(option, type) \
+   options->option = driQueryOption##type(optionCache, #option)
+#define query_bool_option(option) query_option_impl(option, b)
+#define query_int_option(option) query_option_impl(option, i)
+#define query_string_option(option) \
+   do { \
+      char *option = driQueryOptionstr(optionCache, #option); \
+      if (*option) \
+         options->option = strdup(option); \
+   } while (0)
+   /* Copy each driconf value into the identically named options field. */
+   query_bool_option(disable_blend_func_extended);
+   query_bool_option(disable_arb_gpu_shader5);
+   query_bool_option(disable_glsl_line_continuations);
+   query_bool_option(force_glsl_extensions_warn);
+   query_int_option(force_glsl_version);
+   query_bool_option(allow_extra_pp_tokens);
+   query_bool_option(allow_glsl_extension_directive_midshader);
+   query_bool_option(allow_glsl_120_subset_in_110);
+   query_bool_option(allow_glsl_builtin_const_expression);
+   query_bool_option(allow_glsl_relaxed_es);
+   query_bool_option(allow_glsl_builtin_variable_redeclaration);
+   query_bool_option(allow_higher_compat_version);
+   query_bool_option(glsl_ignore_write_to_readonly_var);
+   query_bool_option(glsl_zero_init);
+   query_bool_option(force_integer_tex_nearest);
+   query_bool_option(vs_position_always_invariant);
+   query_bool_option(vs_position_always_precise);
+   query_bool_option(force_glsl_abs_sqrt);
+   query_bool_option(allow_glsl_cross_stage_interpolation_mismatch);
+   query_bool_option(allow_draw_out_of_order);
+   query_bool_option(ignore_map_unsynchronized);
+   query_bool_option(force_gl_names_reuse);
+   query_bool_option(transcode_etc);
+   query_bool_option(transcode_astc);
+   query_string_option(force_gl_vendor);
+   query_string_option(force_gl_renderer);
+   /* NOTE(review): presumably hashes the whole option cache -- confirm. */
+   driComputeOptionsSha1(optionCache, options->config_options_sha1);
+}
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_driconf.h b/lib/mesa/src/gallium/auxiliary/util/u_driconf.h
new file mode 100644
index 000000000..00eead301
--- /dev/null
+++ b/lib/mesa/src/gallium/auxiliary/util/u_driconf.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef U_DRICONF_H_
+#define U_DRICONF_H_
+
+#include "util/xmlconfig.h"
+#include "frontend/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+u_driconf_fill_st_options(struct st_config_options *options,
+                          const struct driOptionCache *optionCache);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_DRICONF_H_ */
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_prim.c b/lib/mesa/src/gallium/auxiliary/util/u_prim.c
index cbd48e26a..a84d0e71e 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_prim.c
+++ b/lib/mesa/src/gallium/auxiliary/util/u_prim.c
@@ -21,12 +21,25 @@
  */
 
 #include "u_prim.h"
+#include "pipe/p_state.h"
 
 
 /** Return string name of given primitive type */
 const char *
 u_prim_name(enum pipe_prim_type prim)
 {
+#if defined(__GNUC__)
+   /* Check that the enum is packed: */
+   STATIC_ASSERT(sizeof(enum pipe_prim_type) == 1);
+#endif
+
+   /* Draw merging in u_threaded_context requires that sizeof(mode) == 1. */
+   struct pipe_draw_info info;
+   STATIC_ASSERT(sizeof(info.mode) == 1);
+
+   struct pipe_draw_vertex_state_info dvs_info;
+   STATIC_ASSERT(sizeof(dvs_info.mode) == 1);
+
    static const struct debug_named_value names[] = {
       DEBUG_NAMED_VALUE(PIPE_PRIM_POINTS),
       DEBUG_NAMED_VALUE(PIPE_PRIM_LINES),
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_prim.h b/lib/mesa/src/gallium/auxiliary/util/u_prim.h
index b9d4a9e80..1fbb2f5b5 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_prim.h
+++ b/lib/mesa/src/gallium/auxiliary/util/u_prim.h
@@ -201,12 +201,16 @@ u_vertices_per_prim(enum pipe_prim_type primitive)
    case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
       return 6;
 
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+      /* Geometry shaders never see these primitives, but prim
+         assembly might encounter them (for prim id). */
+      return 4;
+
    /* following primitives should never be used
     * with geometry shaders abd their size is
     * undefined */
    case PIPE_PRIM_POLYGON:
-   case PIPE_PRIM_QUADS:
-   case PIPE_PRIM_QUAD_STRIP:
    default:
       debug_printf("Unrecognized geometry shader primitive");
       return 3;
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_screen.c b/lib/mesa/src/gallium/auxiliary/util/u_screen.c
index 6e6aadbfd..eba554600 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_screen.c
+++ b/lib/mesa/src/gallium/auxiliary/util/u_screen.c
@@ -77,6 +77,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
    case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
    case PIPE_CAP_DEPTH_CLIP_DISABLE:
    case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
+   case PIPE_CAP_DEPTH_CLAMP_ENABLE:
    case PIPE_CAP_SHADER_STENCIL_EXPORT:
    case PIPE_CAP_TGSI_INSTANCEID:
    case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
@@ -87,6 +88,10 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
    case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
       return 0;
 
+   case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART:
+   case PIPE_CAP_SUPPORTED_PRIM_MODES:
+      return BITFIELD_MASK(PIPE_PRIM_MAX);
+
    case PIPE_CAP_MIN_TEXEL_OFFSET:
       /* GL 3.x minimum value. */
       return -8;
@@ -269,7 +274,6 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
    case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
    case PIPE_CAP_CULL_DISTANCE:
-   case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES:
    case PIPE_CAP_TGSI_VOTE:
    case PIPE_CAP_MAX_WINDOW_RECTANGLES: /* Enables EXT_window_rectangles */
    case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED:
@@ -287,6 +291,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
       return 4; /* GLES 2.0 minimum value */
 
    case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
+   case PIPE_CAP_PREFER_BACK_BUFFER_REUSE:
       return 1;
 
    case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
@@ -460,11 +465,16 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen,
       return 0;
 
    case PIPE_CAP_SAMPLER_REDUCTION_MINMAX:
+   case PIPE_CAP_SAMPLER_REDUCTION_MINMAX_ARB:
       return 0;
 
    case PIPE_CAP_ALLOW_DYNAMIC_VAO_FASTPATH:
       return 1;
 
+   case PIPE_CAP_EMULATE_NONFIXED_PRIMITIVE_RESTART:
+   case PIPE_CAP_DRAW_VERTEX_STATE:
+      return 0;
+
    default:
       unreachable("bad PIPE_CAP_*");
    }
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_sse.h b/lib/mesa/src/gallium/auxiliary/util/u_sse.h
index cae4138ba..e372d3b6b 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_sse.h
+++ b/lib/mesa/src/gallium/auxiliary/util/u_sse.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2008 VMware, Inc.
+ * Copyright 2008-2021 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -38,6 +38,8 @@
 #define U_SSE_H_
 
 #include "pipe/p_config.h"
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
 
 #if defined(PIPE_ARCH_SSE)
 
@@ -296,6 +298,408 @@ transpose2_64_2_32(const __m128i * restrict a01,
 #define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
 
 
+/*
+ * Per-lane lerp: (1-w)*a + w*b = a - wa + wb = w(b-a) + a, evaluated as
+ * ((b-a)*w >> 8) + a, i.e. w carries 8 fractional bits.
+ * The math behind negative sub results (logic shift/mask) is tricky.
+ *
+ * w -- weight values
+ * a -- src0 values (the result when w is 0)
+ * b -- src1 values
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_lerp_epi16(__m128i w, __m128i a, __m128i b)
+{
+   __m128i res;
+
+   res = _mm_sub_epi16(b, a);      /* b - a, wraps mod 2^16 when b < a */
+   res = _mm_mullo_epi16(res, w);  /* keep the low 16 bits of (b - a) * w */
+   res = _mm_srli_epi16(res, 8);   /* logical shift: drop the fraction bits */
+   /* use add_epi8 instead of add_epi16 so no need to mask off upper bits */
+   res = _mm_add_epi8(res, a);
+
+   return res;
+}
+
+
+/* Apply premultiplied-alpha blending on two pixels simultaneously; each
+ * 16-bit lane yields s + (d - ((d * a) >> 8)).  All parameters are packed
+ * as 8.8 fixed point values in __m128i registers, upper 8 bits all zero.
+ *
+ * a -- src alpha values
+ * d -- dst color values
+ * s -- src color values
+ */
+static inline __m128i
+util_sse2_premul_blend_epi16( __m128i a, __m128i d, __m128i s)
+{
+   __m128i da, d_sub_da, tmp;
+   tmp      = _mm_mullo_epi16(d, a);
+   da       = _mm_srli_epi16(tmp, 8);     /* d * a scaled back down by 256 */
+   d_sub_da = _mm_sub_epi16(d, da);       /* d - d*a */
+
+   return  _mm_add_epi16(s, d_sub_da);
+}
+
+
+/* Apply premultiplied-alpha blending on four pixels in packed BGRA
+ * format (one/inv_src_alpha blend mode), i.e. src + dst * (1 - src alpha).
+ *
+ * src    -- four pixels (bgra8 format)
+ * dst    -- four destination pixels (bgra8)
+ * return -- blended pixels (bgra8)
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_blend_premul_4(const __m128i src,
+                         const __m128i dst)
+{
+
+   __m128i al, ah, dl, dh, sl, sh, rl, rh;
+   __m128i zero = _mm_setzero_si128();
+
+   /* Blend first two pixels: widen their bytes to 16-bit lanes, then
+    * broadcast word 3 (the alpha channel) of each pixel over that pixel. */
+   sl = _mm_unpacklo_epi8(src, zero);
+   dl = _mm_unpacklo_epi8(dst, zero);
+
+   al = _mm_shufflehi_epi16(sl, 0xff);
+   al = _mm_shufflelo_epi16(al, 0xff);
+
+   rl = util_sse2_premul_blend_epi16(al, dl, sl);
+
+   /* Blend second two pixels (same steps on the upper eight bytes):
+    */
+   sh = _mm_unpackhi_epi8(src, zero);
+   dh = _mm_unpackhi_epi8(dst, zero);
+
+   ah = _mm_shufflehi_epi16(sh, 0xff);
+   ah = _mm_shufflelo_epi16(ah, 0xff);
+
+   rh = util_sse2_premul_blend_epi16(ah, dh, sh);
+
+   /* Pack the results down to four bgra8 pixels (unsigned saturation):
+    */
+   return _mm_packus_epi16(rl, rh);
+}
+
+
+/* Apply src-alpha blending on four pixels in packed BGRA format
+ * (srcalpha/inv_src_alpha blend mode): lerp from dst to src by src alpha.
+ *
+ * src    -- four pixels (bgra8 format)
+ * dst    -- four destination pixels (bgra8)
+ * return -- blended pixels (bgra8)
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_blend_srcalpha_4(const __m128i src,
+                           const __m128i dst)
+{
+
+   __m128i al, ah, dl, dh, sl, sh, rl, rh;
+   __m128i zero = _mm_setzero_si128();
+
+   /* Blend first two pixels: widen their bytes to 16-bit lanes, then
+    * broadcast word 3 (the alpha channel) of each pixel over that pixel. */
+   sl = _mm_unpacklo_epi8(src, zero);
+   dl = _mm_unpacklo_epi8(dst, zero);
+
+   al = _mm_shufflehi_epi16(sl, 0xff);
+   al = _mm_shufflelo_epi16(al, 0xff);
+
+   rl = util_sse2_lerp_epi16(al, dl, sl);   /* weight = alpha, dst -> src */
+
+   /* Blend second two pixels (same steps on the upper eight bytes):
+    */
+   sh = _mm_unpackhi_epi8(src, zero);
+   dh = _mm_unpackhi_epi8(dst, zero);
+
+   ah = _mm_shufflehi_epi16(sh, 0xff);
+   ah = _mm_shufflelo_epi16(ah, 0xff);
+
+   rh = util_sse2_lerp_epi16(ah, dh, sh);   /* weight = alpha, dst -> src */
+
+   /* Pack the results down to four bgra8 pixels (unsigned saturation):
+    */
+   return _mm_packus_epi16(rl, rh);
+}
+
+
+/**
+ * Premultiplies every src channel (alpha included) by a constant alpha,
+ * then does a one/inv_src_alpha blend using the scaled src alpha.
+ *
+ * src 16xi8 (normalized)
+ * dst 16xi8 (normalized)
+ * cst_alpha (constant alpha (u8 value))
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_blend_premul_src_4(const __m128i src,
+                             const __m128i dst,
+                             const unsigned cst_alpha)
+{
+
+   __m128i srca, d, s, rl, rh;
+   __m128i zero = _mm_setzero_si128();
+   __m128i cst_alpha_vec = _mm_set1_epi16(cst_alpha);
+
+   /* Blend first two pixels (the lower eight bytes of src and dst):
+    */
+   s = _mm_unpacklo_epi8(src, zero);
+   s = _mm_mullo_epi16(s, cst_alpha_vec);
+   /* the shift will cause some precision loss */
+   s = _mm_srli_epi16(s, 8);
+
+   srca = _mm_shufflehi_epi16(s, 0xff);      /* broadcast word 3 of each  */
+   srca = _mm_shufflelo_epi16(srca, 0xff);   /* pixel: the scaled alpha   */
+
+   d = _mm_unpacklo_epi8(dst, zero);
+   rl = util_sse2_premul_blend_epi16(srca, d, s);
+
+   /* Blend second two pixels (the upper eight bytes of src and dst):
+    */
+   s = _mm_unpackhi_epi8(src, zero);
+   s = _mm_mullo_epi16(s, cst_alpha_vec);
+   /* the shift will cause some precision loss */
+   s = _mm_srli_epi16(s, 8);
+
+   srca = _mm_shufflehi_epi16(s, 0xff);
+   srca = _mm_shufflelo_epi16(srca, 0xff);
+
+   d = _mm_unpackhi_epi8(dst, zero);
+   rh = util_sse2_premul_blend_epi16(srca, d, s);
+
+   /* Pack the results down to four bgra8 pixels (unsigned saturation):
+    */
+   return _mm_packus_epi16(rl, rh);
+}
+
+
+/**
+ * Linear interpolation with SSE2.
+ *
+ * dst, src0, src1 are 16 x i8 vectors, with [0..255] normalized values.
+ *
+ * weight_lo and weight_hi should be 8 x i16 vectors, in 8.8 fixed point
+ * format, for the low (bytes 0..7) and high (bytes 8..15) components.
+ * We'd want to pass these as values but MSVC limitation forces us to pass these
+ * as pointers since it will complain if more than 3 __m128 are passed by value.
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_lerp_epi8_fixed88(__m128i src0, __m128i src1,
+                            const __m128i * restrict weight_lo,
+                            const __m128i * restrict weight_hi)
+{
+   const __m128i zero = _mm_setzero_si128();
+
+   __m128i src0_lo = _mm_unpacklo_epi8(src0, zero);   /* zero-extend bytes */
+   __m128i src0_hi = _mm_unpackhi_epi8(src0, zero);   /* to 16-bit lanes   */
+
+   __m128i src1_lo = _mm_unpacklo_epi8(src1, zero);
+   __m128i src1_hi = _mm_unpackhi_epi8(src1, zero);
+
+   __m128i dst_lo;
+   __m128i dst_hi;
+
+   dst_lo = util_sse2_lerp_epi16(*weight_lo, src0_lo, src1_lo);
+   dst_hi = util_sse2_lerp_epi16(*weight_hi, src0_hi, src1_hi);
+
+   return _mm_packus_epi16(dst_lo, dst_hi);   /* back to 16 x i8 */
+}
+
+
+/**
+ * Linear interpolation with SSE2.
+ *
+ * dst, src0, src1 are 16 x i8 vectors, with [0..255] normalized values.
+ *
+ * weight is a 16 x i8 vector of 0.8 fixed point values (zero-extended here).
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_lerp_epi8_fixed08(__m128i src0, __m128i src1,
+                            __m128i weight)
+{
+   const __m128i zero = _mm_setzero_si128();
+   __m128i weight_lo = _mm_unpacklo_epi8(weight, zero);
+   __m128i weight_hi = _mm_unpackhi_epi8(weight, zero);
+
+   return util_sse2_lerp_epi8_fixed88(src0, src1,
+                                      &weight_lo, &weight_hi);
+}
+
+
+/**
+ * Linear interpolation with SSE2.
+ *
+ * dst, src0, src1, and weight are 16 x i8 vectors, with [0..255] normalized
+ * values.  The weight bytes are zero-extended and used as-is (see #if 0).
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_lerp_unorm8(__m128i src0, __m128i src1,
+                      __m128i weight)
+{
+   const __m128i zero = _mm_setzero_si128();
+   __m128i weight_lo = _mm_unpacklo_epi8(weight, zero);
+   __m128i weight_hi = _mm_unpackhi_epi8(weight, zero);
+
+#if 0
+   /*
+    * Rescale from [0..255] to [0..256] (currently compiled out).
+    */
+   weight_lo = _mm_add_epi16(weight_lo, _mm_srli_epi16(weight_lo, 7));
+   weight_hi = _mm_add_epi16(weight_hi, _mm_srli_epi16(weight_hi, 7));
+#endif
+
+   return util_sse2_lerp_epi8_fixed88(src0, src1,
+                                      &weight_lo, &weight_hi);
+}
+
+
+/**
+ * Two-dimensional (bilinear) linear interpolation with SSE2.
+ *
+ * dst, src0, src1, src2, src3 are 16 x i8 vectors, with [0..255] normalized
+ * values.
+ *
+ * ws_lo, ws_hi, wt_lo, wt_hi should be 8 x i16 vectors, in 8.8 fixed point
+ * format, for the low and high components.
+ * We'd want to pass these as values but MSVC limitation forces us to pass these
+ * as pointers since it will complain if more than 3 __m128 are passed by value.
+ *
+ * This uses ws_lo, ws_hi to interpolate between src0 and src1, as well as to
+ * interpolate between src2 and src3, then uses wt_lo and wt_hi to interpolate
+ * between the resulting vectors.
+ */
+static ALWAYS_INLINE __m128i
+util_sse2_lerp_2d_epi8_fixed88(__m128i src0, __m128i src1,
+                               const __m128i * restrict src2,
+                               const __m128i * restrict src3,
+                               const __m128i * restrict ws_lo,
+                               const __m128i * restrict ws_hi,
+                               const __m128i * restrict wt_lo,
+                               const __m128i * restrict wt_hi)
+{
+   const __m128i zero = _mm_setzero_si128();
+
+   __m128i src0_lo = _mm_unpacklo_epi8(src0, zero);
+   __m128i src0_hi = _mm_unpackhi_epi8(src0, zero);
+
+   __m128i src1_lo = _mm_unpacklo_epi8(src1, zero);
+   __m128i src1_hi = _mm_unpackhi_epi8(src1, zero);
+
+   __m128i src2_lo = _mm_unpacklo_epi8(*src2, zero);
+   __m128i src2_hi = _mm_unpackhi_epi8(*src2, zero);
+
+   __m128i src3_lo = _mm_unpacklo_epi8(*src3, zero);
+   __m128i src3_hi = _mm_unpackhi_epi8(*src3, zero);
+
+   __m128i dst_lo, dst01_lo, dst23_lo;
+   __m128i dst_hi, dst01_hi, dst23_hi;
+
+   dst01_lo = util_sse2_lerp_epi16(*ws_lo, src0_lo, src1_lo);   /* first axis */
+   dst01_hi = util_sse2_lerp_epi16(*ws_hi, src0_hi, src1_hi);
+   dst23_lo = util_sse2_lerp_epi16(*ws_lo, src2_lo, src3_lo);
+   dst23_hi = util_sse2_lerp_epi16(*ws_hi, src2_hi, src3_hi);
+
+   dst_lo = util_sse2_lerp_epi16(*wt_lo, dst01_lo, dst23_lo);   /* second axis */
+   dst_hi = util_sse2_lerp_epi16(*wt_hi, dst01_hi, dst23_hi);
+
+   return _mm_packus_epi16(dst_lo, dst_hi);
+}
+
+/**
+ * Stretch a row of pixels using linear filter.
+ *
+ * Uses Bresenham's line algorithm using 16.16 fixed point representation for
+ * the error term.  Each texel fetch reads src[x] and src[x + 1] together.
+ *
+ * @param dst_width destination width in pixels (asserted multiple of 4)
+ * @param src_x    start x0 in 16.16 fixed point format
+ * @param src_xstep step in 16.16 fixed point format
+ *
+ * @return final src_x value (i.e., src_x + dst_width*src_xstep)
+ */
+static ALWAYS_INLINE int32_t
+util_sse2_stretch_row_8unorm(__m128i * restrict dst,
+                             int32_t dst_width,
+                             const uint32_t * restrict src,
+                             int32_t src_x,
+                             int32_t src_xstep)
+{
+   int16_t error0, error1, error2, error3;
+   __m128i error_lo, error_hi, error_step;
+
+   assert(dst_width >= 0);
+   assert(dst_width % 4 == 0);
+
+   error0 = src_x;               /* narrowed to int16_t: the fraction bits */
+   error1 = error0 + src_xstep;
+   error2 = error1 + src_xstep;
+   error3 = error2 + src_xstep;
+
+   error_lo   = _mm_setr_epi16(error0, error0, error0, error0,
+                               error1, error1, error1, error1);
+   error_hi   = _mm_setr_epi16(error2, error2, error2, error2,
+                               error3, error3, error3, error3);
+   error_step = _mm_set1_epi16(src_xstep << 2);   /* four pixels per pass */
+
+   dst_width >>= 2;              /* now counts groups of four pixels */
+   while (dst_width) {
+      uint16_t src_x0;
+      uint16_t src_x1;
+      uint16_t src_x2;
+      uint16_t src_x3;
+      __m128i src0, src1;
+      __m128i weight_lo, weight_hi;
+
+      /*
+       * It is faster to re-compute the coordinates in the scalar integer unit here,
+       * than to fetch the values from the SIMD integer unit.
+       */
+
+      src_x0 = src_x >> 16;
+      src_x += src_xstep;
+      src_x1 = src_x >> 16;
+      src_x += src_xstep;
+      src_x2 = src_x >> 16;
+      src_x += src_xstep;
+      src_x3 = src_x >> 16;
+      src_x += src_xstep;
+
+      /*
+       * Fetch pairs of pixels 64bit at a time, and then swizzle them inplace.
+       */
+
+      {
+         __m128i src_00_10 = _mm_loadl_epi64((const __m128i *)&src[src_x0]);
+         __m128i src_01_11 = _mm_loadl_epi64((const __m128i *)&src[src_x1]);
+         __m128i src_02_12 = _mm_loadl_epi64((const __m128i *)&src[src_x2]);
+         __m128i src_03_13 = _mm_loadl_epi64((const __m128i *)&src[src_x3]);
+
+         __m128i src_00_01_10_11 = _mm_unpacklo_epi32(src_00_10, src_01_11);
+         __m128i src_02_03_12_13 = _mm_unpacklo_epi32(src_02_12, src_03_13);
+
+         src0 = _mm_unpacklo_epi64(src_00_01_10_11, src_02_03_12_13);   /* src[x]     */
+         src1 = _mm_unpackhi_epi64(src_00_01_10_11, src_02_03_12_13);   /* src[x + 1] */
+      }
+
+      weight_lo = _mm_srli_epi16(error_lo, 8);   /* top 8 bits of each 16-bit */
+      weight_hi = _mm_srli_epi16(error_hi, 8);   /* error term are the weight */
+
+      *dst = util_sse2_lerp_epi8_fixed88(src0, src1,
+                                         &weight_lo, &weight_hi);
+
+      error_lo = _mm_add_epi16(error_lo, error_step);
+      error_hi = _mm_add_epi16(error_hi, error_step);
+
+      ++dst;
+      --dst_width;
+   }
+
+   return src_x;
+}
+
+
+
 #endif /* PIPE_ARCH_SSE */
 
 #endif /* U_SSE_H_ */
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_trace_gallium.c b/lib/mesa/src/gallium/auxiliary/util/u_trace_gallium.c
new file mode 100644
index 000000000..3e9a254a4
--- /dev/null
+++ b/lib/mesa/src/gallium/auxiliary/util/u_trace_gallium.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2020 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "u_trace_gallium.h"
+#include "u_inlines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+
+#include "u_tracepoints.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+static void *
+u_trace_pipe_create_ts_buffer(struct u_trace_context *utctx, uint32_t size)
+{
+   struct pipe_context *ctx = utctx->pctx;
+   /* Template: a PIPE_BUFFER resource whose width0 is the requested size. */
+   struct pipe_resource tmpl = {
+      .target     = PIPE_BUFFER,
+      .format     = PIPE_FORMAT_R8_UNORM,
+      .bind       = PIPE_BIND_QUERY_BUFFER | PIPE_BIND_LINEAR,
+      .width0     = size,
+      .height0    = 1,
+      .depth0     = 1,
+      .array_size = 1,
+   };
+   /* The resource_create() result is handed back unchecked as a void pointer. */
+   return ctx->screen->resource_create(ctx->screen, &tmpl);
+}
+
+static void
+u_trace_pipe_delete_ts_buffer(struct u_trace_context *utctx, void *timestamps)
+{
+   struct pipe_resource *buffer = timestamps;   /* utctx itself is not used */
+   pipe_resource_reference(&buffer, NULL);      /* presumably drops one reference -- confirm in u_inlines.h */
+}
+
+void
+u_trace_pipe_context_init(struct u_trace_context *utctx,
+                          struct pipe_context *pctx,
+                          u_trace_record_ts record_timestamp,
+                          u_trace_read_ts read_timestamp,
+                          u_trace_delete_flush_data delete_flush_data)
+{
+   u_trace_context_init(utctx, pctx,
+                        u_trace_pipe_create_ts_buffer,   /* pipe_resource-backed buffers */
+                        u_trace_pipe_delete_ts_buffer,
+                        record_timestamp,                /* caller callbacks, forwarded as-is */
+                        read_timestamp,
+                        delete_flush_data);
+}
+
+inline void
+trace_framebuffer_state(struct u_trace *ut, void *cs, const struct pipe_framebuffer_state *pfb)
+{
+   if (likely(!ut->enabled))   /* tracing disabled: emit nothing */
+      return;
+   /* One framebuffer tracepoint, then one surface tracepoint per bound surface. */
+   trace_framebuffer(ut, cs, pfb);
+
+   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
+      if (pfb->cbufs[i]) {     /* NULL color buffer slots are skipped */
+         trace_surface(ut, cs, pfb->cbufs[i]);
+      }
+   }
+   if (pfb->zsbuf) {
+      trace_surface(ut, cs, pfb->zsbuf);
+   }
+}
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_trace_gallium.h b/lib/mesa/src/gallium/auxiliary/util/u_trace_gallium.h
new file mode 100644
index 000000000..e37e3e663
--- /dev/null
+++ b/lib/mesa/src/gallium/auxiliary/util/u_trace_gallium.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2020 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _U_TRACE_GALLIUM_H
+#define _U_TRACE_GALLIUM_H
+
+#include "util/perf/u_trace.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/* Gallium specific u_trace helpers */
+
+struct pipe_context;
+struct pipe_framebuffer_state;
+
+void
+u_trace_pipe_context_init(struct u_trace_context *utctx,      /* context to initialize */
+                          struct pipe_context *pctx,
+                          u_trace_record_ts record_timestamp, /* forwarded to u_trace_context_init() */
+                          u_trace_read_ts read_timestamp,
+                          u_trace_delete_flush_data delete_flush_data);
+
+/*
+ * Composite tracepoint for a more complex data structure: logs the
+ * framebuffer itself, then each non-NULL color buffer and the zsbuf surface.
+ */
+
+void
+trace_framebuffer_state(struct u_trace *ut, void *cs, const struct pipe_framebuffer_state *pfb);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif  /* _U_TRACE_GALLIUM_H */
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_tracepoints.py b/lib/mesa/src/gallium/auxiliary/util/u_tracepoints.py
index f8a70d05c..30aaab9df 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_tracepoints.py
+++ b/lib/mesa/src/gallium/auxiliary/util/u_tracepoints.py
@@ -37,6 +37,8 @@ sys.path.insert(0, args.import_path)
 
 from u_trace import Header
 from u_trace import Tracepoint
+from u_trace import TracepointArg as Arg
+from u_trace import TracepointArgStruct as ArgStruct
 from u_trace import utrace_generate
 
 #
@@ -47,11 +49,11 @@ Header('pipe/p_state.h')
 Header('util/format/u_format.h')
 
 Tracepoint('surface',
-    args=[['const struct pipe_surface *', 'psurf']],
-    tp_struct=[['uint16_t',     'width',      'psurf->width'],
-               ['uint16_t',     'height',     'psurf->height'],
-               ['uint8_t',      'nr_samples', 'psurf->nr_samples'],
-               ['const char *', 'format',     'util_format_short_name(psurf->format)']],
+    args=[ArgStruct(type='const struct pipe_surface *', var='psurf')],
+    tp_struct=[Arg(type='uint16_t',     name='width',      var='psurf->width',                          c_format='%u'),
+               Arg(type='uint16_t',     name='height',     var='psurf->height',                         c_format='%u'),
+               Arg(type='uint8_t',      name='nr_samples', var='psurf->nr_samples',                     c_format='%u'),
+               Arg(type='const char *', name='format',     var='util_format_short_name(psurf->format)', c_format='%s')],
     tp_print=['%ux%u@%u, fmt=%s',
         '__entry->width',
         '__entry->height',
@@ -61,12 +63,12 @@ Tracepoint('surface',
 
 # Note: called internally from trace_framebuffer_state()
 Tracepoint('framebuffer',
-    args=[['const struct pipe_framebuffer_state *', 'pfb']],
-    tp_struct=[['uint16_t',     'width',      'pfb->width'],
-               ['uint16_t',     'height',     'pfb->height'],
-               ['uint8_t',      'layers',     'pfb->layers'],
-               ['uint8_t',      'samples',    'pfb->samples'],
-               ['uint8_t',      'nr_cbufs',   'pfb->nr_cbufs']],
+    args=[ArgStruct(type='const struct pipe_framebuffer_state *', var='pfb')],
+    tp_struct=[Arg(type='uint16_t', name='width',    var='pfb->width',    c_format='%u'),
+               Arg(type='uint16_t', name='height',   var='pfb->height',   c_format='%u'),
+               Arg(type='uint8_t',  name='layers',   var='pfb->layers',   c_format='%u'),
+               Arg(type='uint8_t',  name='samples',  var='pfb->samples',  c_format='%u'),
+               Arg(type='uint8_t',  name='nr_cbufs', var='pfb->nr_cbufs', c_format='%u')],
     tp_print=['%ux%ux%u@%u, nr_cbufs: %u',
         '__entry->width',
         '__entry->height',
@@ -76,17 +78,17 @@ Tracepoint('framebuffer',
 )
 
 Tracepoint('grid_info',
-    args=[['const struct pipe_grid_info *', 'pgrid']],
-    tp_struct=[['uint8_t',  'work_dim',  'pgrid->work_dim'],
-               ['uint16_t', 'block_x',   'pgrid->block[0]'],
-               ['uint16_t', 'block_y',   'pgrid->block[1]'],
-               ['uint16_t', 'block_z',   'pgrid->block[2]'],
-               ['uint16_t', 'grid_x',    'pgrid->grid[0]'],
-               ['uint16_t', 'grid_y',    'pgrid->grid[1]'],
-               ['uint16_t', 'grid_z',    'pgrid->grid[2]']],
+    args=[ArgStruct(type='const struct pipe_grid_info *', var='pgrid')],
+    tp_struct=[Arg(type='uint8_t',  name='work_dim', var='pgrid->work_dim', c_format='%u'),
+               Arg(type='uint16_t', name='block_x',  var='pgrid->block[0]', c_format='%u'),
+               Arg(type='uint16_t', name='block_y',  var='pgrid->block[1]', c_format='%u'),
+               Arg(type='uint16_t', name='block_z',  var='pgrid->block[2]', c_format='%u'),
+               Arg(type='uint16_t', name='grid_x',   var='pgrid->grid[0]',  c_format='%u'),
+               Arg(type='uint16_t', name='grid_y',   var='pgrid->grid[1]',  c_format='%u'),
+               Arg(type='uint16_t', name='grid_z',   var='pgrid->grid[2]',  c_format='%u')],
     tp_print=['work_dim=%u, block=%ux%ux%u, grid=%ux%ux%u', '__entry->work_dim',
         '__entry->block_x', '__entry->block_y', '__entry->block_z',
         '__entry->grid_x', '__entry->grid_y', '__entry->grid_z'],
 )
 
-utrace_generate(cpath=args.src, hpath=args.hdr)
+utrace_generate(cpath=args.src, hpath=args.hdr, ctx_param='struct pipe_context *pctx')
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_transfer.c b/lib/mesa/src/gallium/auxiliary/util/u_transfer.c
index 84b80d400..80576ddf1 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_transfer.c
+++ b/lib/mesa/src/gallium/auxiliary/util/u_transfer.c
@@ -31,12 +31,12 @@ void u_default_buffer_subdata(struct pipe_context *pipe,
 
    u_box_1d(offset, size, &box);
 
-   map = pipe->transfer_map(pipe, resource, 0, usage, &box, &transfer);
+   map = pipe->buffer_map(pipe, resource, 0, usage, &box, &transfer);
    if (!map)
       return;
 
    memcpy(map, data, size);
-   pipe_transfer_unmap(pipe, transfer);
+   pipe_buffer_unmap(pipe, transfer);
 }
 
 void u_default_texture_subdata(struct pipe_context *pipe,
@@ -60,7 +60,7 @@ void u_default_texture_subdata(struct pipe_context *pipe,
    /* texture_subdata implicitly discards the rewritten buffer range */
    usage |= PIPE_MAP_DISCARD_RANGE;
 
-   map = pipe->transfer_map(pipe,
+   map = pipe->texture_map(pipe,
                             resource,
                             level,
                             usage,
@@ -81,19 +81,9 @@ void u_default_texture_subdata(struct pipe_context *pipe,
                  layer_stride, /* bytes */
                  0, 0, 0);
 
-   pipe_transfer_unmap(pipe, transfer);
+   pipe_texture_unmap(pipe, transfer);
 }
 
-
-bool u_default_resource_get_handle(UNUSED struct pipe_screen *screen,
-                                   UNUSED struct pipe_resource *resource,
-                                   UNUSED struct winsys_handle *handle)
-{
-   return FALSE;
-}
-
-
-
 void u_default_transfer_flush_region(UNUSED struct pipe_context *pipe,
                                      UNUSED struct pipe_transfer *transfer,
                                      UNUSED const struct pipe_box *box)
@@ -101,59 +91,3 @@ void u_default_transfer_flush_region(UNUSED struct pipe_context *pipe,
    /* This is a no-op implementation, nothing to do.
     */
 }
-
-void u_default_transfer_unmap(UNUSED struct pipe_context *pipe,
-                              UNUSED struct pipe_transfer *transfer)
-{
-}
-
-
-static inline struct u_resource *
-u_resource( struct pipe_resource *res )
-{
-   return (struct u_resource *)res;
-}
-
-bool u_resource_get_handle_vtbl(struct pipe_screen *screen,
-                                UNUSED struct pipe_context *ctx,
-                                struct pipe_resource *resource,
-                                struct winsys_handle *handle,
-                                UNUSED unsigned usage)
-{
-   struct u_resource *ur = u_resource(resource);
-   return ur->vtbl->resource_get_handle(screen, resource, handle);
-}
-
-void u_resource_destroy_vtbl(struct pipe_screen *screen,
-                             struct pipe_resource *resource)
-{
-   struct u_resource *ur = u_resource(resource);
-   ur->vtbl->resource_destroy(screen, resource);
-}
-
-void *u_transfer_map_vtbl(struct pipe_context *context,
-                          struct pipe_resource *resource,
-                          unsigned level,
-                          unsigned usage,
-                          const struct pipe_box *box,
-                          struct pipe_transfer **transfer)
-{
-   struct u_resource *ur = u_resource(resource);
-   return ur->vtbl->transfer_map(context, resource, level, usage, box,
-                                 transfer);
-}
-
-void u_transfer_flush_region_vtbl( struct pipe_context *pipe,
-                                   struct pipe_transfer *transfer,
-                                   const struct pipe_box *box)
-{
-   struct u_resource *ur = u_resource(transfer->resource);
-   ur->vtbl->transfer_flush_region(pipe, transfer, box);
-}
-
-void u_transfer_unmap_vtbl( struct pipe_context *pipe,
-                            struct pipe_transfer *transfer )
-{
-   struct u_resource *ur = u_resource(transfer->resource);
-   ur->vtbl->transfer_unmap(pipe, transfer);
-}
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_transfer_helper.c b/lib/mesa/src/gallium/auxiliary/util/u_transfer_helper.c
index 47898e0bd..d1e8d123a 100644
--- a/lib/mesa/src/gallium/auxiliary/util/u_transfer_helper.c
+++ b/lib/mesa/src/gallium/auxiliary/util/u_transfer_helper.c
@@ -213,7 +213,7 @@ transfer_map_msaa(struct pipe_context *pctx,
    map_box.x = 0;
    map_box.y = 0;
 
-   void *ss_map = pctx->transfer_map(pctx, trans->ss, 0, usage, &map_box,
+   void *ss_map = pctx->texture_map(pctx, trans->ss, 0, usage, &map_box,
          &trans->trans);
    if (!ss_map) {
       free(trans);
@@ -505,7 +505,7 @@ u_transfer_helper_transfer_unmap(struct pipe_context *pctx,
        * so don't call helper->vtbl->transfer_unmap() directly
        */
       if (trans->ss) {
-         pctx->transfer_unmap(pctx, trans->trans);
+         pctx->texture_unmap(pctx, trans->trans);
          pipe_resource_reference(&trans->ss, NULL);
       } else {
          helper->vtbl->transfer_unmap(pctx, trans->trans);
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_vertex_state_cache.c b/lib/mesa/src/gallium/auxiliary/util/u_vertex_state_cache.c
new file mode 100644
index 000000000..f98a1071a
--- /dev/null
+++ b/lib/mesa/src/gallium/auxiliary/util/u_vertex_state_cache.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_vertex_state_cache.h"
+#include "util/u_inlines.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+
+static uint32_t key_hash(const void *key)
+{
+   const struct pipe_vertex_state *state = key;
+   /* Only the bytes of the 'input' member take part in the hash. */
+   return _mesa_hash_data(&state->input, sizeof(state->input));
+}
+
+static bool key_equals(const void *a, const void *b)
+{
+   const struct pipe_vertex_state *sa = a;
+   const struct pipe_vertex_state *sb = b;
+   /* Bytewise comparison of 'input' only, matching what key_hash() hashes. */
+   return !memcmp(&sa->input, &sb->input, sizeof(sa->input));
+}
+
+void
+util_vertex_state_cache_init(struct util_vertex_state_cache *cache,
+                             pipe_create_vertex_state_func create,
+                             pipe_vertex_state_destroy_func destroy)
+{
+   simple_mtx_init(&cache->lock, mtx_plain);
+   cache->set = _mesa_set_create(NULL, key_hash, key_equals);   /* result is not checked here */
+   cache->create = create;     /* called by util_vertex_state_cache_get() on a miss */
+   cache->destroy = destroy;   /* called by util_vertex_state_destroy() */
+}
+
+void
+util_vertex_state_cache_deinit(struct util_vertex_state_cache *cache)
+{
+   if (cache->set) {   /* nothing is torn down when there is no set */
+      set_foreach(cache->set, entry) {
+         fprintf(stderr, "mesa: vertex state cache should be empty\n");
+         assert(!"vertex state cache should be empty");
+      }
+      /* Leftover entries are only reported above; no delete callback is passed. */
+      _mesa_set_destroy(cache->set, NULL);
+      simple_mtx_destroy(&cache->lock);
+   }
+}
+
+struct pipe_vertex_state *
+util_vertex_state_cache_get(struct pipe_screen *screen,
+                            struct pipe_vertex_buffer *buffer,
+                            const struct pipe_vertex_element *elements,
+                            unsigned num_elements,
+                            struct pipe_resource *indexbuf,
+                            uint32_t full_velem_mask,
+                            struct util_vertex_state_cache *cache)
+{
+   struct pipe_vertex_state key;
+   /* Zero the whole key first: key.input is hashed and compared bytewise. */
+   memset(&key, 0, sizeof(key));
+   key.input.indexbuf = indexbuf;
+   key.input.vbuffer.stride = buffer->stride;
+   assert(!buffer->is_user_buffer);
+   key.input.vbuffer.buffer_offset = buffer->buffer_offset;
+   key.input.vbuffer.buffer = buffer->buffer;
+   key.input.num_elements = num_elements;
+   for (unsigned i = 0; i < num_elements; i++)
+      key.input.elements[i] = elements[i];
+   key.input.full_velem_mask = full_velem_mask;
+   /* NOTE(review): num_elements is not bounded against elements[] above. */
+   uint32_t hash = key_hash(&key);
+
+   /* Find the state in the live cache. */
+   simple_mtx_lock(&cache->lock);
+   struct set_entry *entry = _mesa_set_search_pre_hashed(cache->set, hash, &key);
+   struct pipe_vertex_state *state = entry ? (void*)entry->key : NULL;
+
+   /* Return if the state already exists. */
+   if (state) {
+      /* Increase the refcount. */
+      p_atomic_inc(&state->reference.count);
+      assert(state->reference.count >= 1);
+      simple_mtx_unlock(&cache->lock);
+      return state;
+   }
+   /* Miss: create the state with the lock still held, then add it to the set. */
+   state = cache->create(screen, buffer, elements, num_elements, indexbuf,
+                         full_velem_mask);
+   if (state) {
+      assert(key_hash(state) == hash);   /* create() must fill 'input' like the key */
+      _mesa_set_add_pre_hashed(cache->set, hash, state);
+   }
+
+   simple_mtx_unlock(&cache->lock);
+   return state;   /* NULL when create() failed */
+}
+
+void
+util_vertex_state_destroy(struct pipe_screen *screen,
+                          struct util_vertex_state_cache *cache,
+                          struct pipe_vertex_state *state)
+{
+   simple_mtx_lock(&cache->lock);
+   /* There could have been a thread race and the cache might have returned
+    * the vertex state being destroyed (util_vertex_state_cache_get bumps the
+    * refcount under this lock). Do nothing if the count is positive.
+    */
+   if (p_atomic_read(&state->reference.count) <= 0) {
+      _mesa_set_remove_key(cache->set, state);   /* drop it from the set first */
+      cache->destroy(screen, state);
+   }
+   simple_mtx_unlock(&cache->lock);
+}
diff --git a/lib/mesa/src/gallium/auxiliary/util/u_vertex_state_cache.h b/lib/mesa/src/gallium/auxiliary/util/u_vertex_state_cache.h
new file mode 100644
index 000000000..902e91e43
--- /dev/null
+++ b/lib/mesa/src/gallium/auxiliary/util/u_vertex_state_cache.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* This deduplicates pipe_vertex_state CSOs to enable draw merging in
+ * u_threaded_context because the draw merging is possible only if different
+ * display lists use the same pipe_vertex_state CSO.
+ */
+
+#ifndef U_VERTEX_STATE_CACHE_H
+#define U_VERTEX_STATE_CACHE_H
+
+#include "util/simple_mtx.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+
+struct util_vertex_state_cache {
+   simple_mtx_t lock; /* held by cache_get and destroy while touching "set" */
+   struct set *set;   /* live states; entry keys are pipe_vertex_state ptrs */
+
+   pipe_create_vertex_state_func create;   /* called by cache_get on a miss */
+   pipe_vertex_state_destroy_func destroy; /* called after removal from set */
+};
+
+void
+util_vertex_state_cache_init(struct util_vertex_state_cache *cache,
+                             pipe_create_vertex_state_func create,
+                             pipe_vertex_state_destroy_func destroy);
+
+void
+util_vertex_state_cache_deinit(struct util_vertex_state_cache *cache);
+
+struct pipe_vertex_state *
+util_vertex_state_cache_get(struct pipe_screen *screen,
+                            struct pipe_vertex_buffer *buffer,
+                            const struct pipe_vertex_element *elements,
+                            unsigned num_elements,
+                            struct pipe_resource *indexbuf,
+                            uint32_t full_velem_mask,
+                            struct util_vertex_state_cache *cache);
+
+void
+util_vertex_state_destroy(struct pipe_screen *screen,
+                          struct util_vertex_state_cache *cache,
+                          struct pipe_vertex_state *state);
+
+#endif
diff --git a/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_cs.c b/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_cs.c
index 813aa1489..ad2175213 100644
--- a/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_cs.c
+++ b/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_cs.c
@@ -727,7 +727,7 @@ draw_layers(struct vl_compositor       *c,
          c->pipe->bind_sampler_states(c->pipe, PIPE_SHADER_COMPUTE, 0,
                         num_sampler_views, layer->samplers);
          c->pipe->set_sampler_views(c->pipe, PIPE_SHADER_COMPUTE, 0,
-                        num_sampler_views, 0, samplers);
+                        num_sampler_views, 0, false, samplers);
 
          cs_launch(c, layer->cs, &(drawn.area));
 
@@ -735,7 +735,7 @@ draw_layers(struct vl_compositor       *c,
          c->pipe->set_shader_images(c->pipe, PIPE_SHADER_COMPUTE, 0, 0, 1, NULL);
          c->pipe->set_constant_buffer(c->pipe, PIPE_SHADER_COMPUTE, 0, false, NULL);
          c->pipe->set_sampler_views(c->pipe, PIPE_SHADER_FRAGMENT, 0, 0,
-                        num_sampler_views, NULL);
+                        num_sampler_views, false, NULL);
          c->pipe->bind_compute_state(c->pipe, NULL);
          c->pipe->bind_sampler_states(c->pipe, PIPE_SHADER_COMPUTE, 0,
                         num_sampler_views, NULL);
diff --git a/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_gfx.c b/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_gfx.c
index c4eba2293..24f5625b4 100644
--- a/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_gfx.c
+++ b/lib/mesa/src/gallium/auxiliary/vl/vl_compositor_gfx.c
@@ -665,7 +665,7 @@ draw_layers(struct vl_compositor *c, struct vl_compositor_state *s, struct u_rec
          c->pipe->bind_sampler_states(c->pipe, PIPE_SHADER_FRAGMENT, 0,
                                       num_sampler_views, layer->samplers);
          c->pipe->set_sampler_views(c->pipe, PIPE_SHADER_FRAGMENT, 0,
-                                    num_sampler_views, 0, samplers);
+                                    num_sampler_views, 0, false, samplers);
 
          util_draw_arrays(c->pipe, PIPE_PRIM_QUADS, vb_index * 4, 4);
          vb_index++;
diff --git a/lib/mesa/src/gallium/auxiliary/vl/vl_idct.c b/lib/mesa/src/gallium/auxiliary/vl/vl_idct.c
index ccee0d488..58fd5329d 100644
--- a/lib/mesa/src/gallium/auxiliary/vl/vl_idct.c
+++ b/lib/mesa/src/gallium/auxiliary/vl/vl_idct.c
@@ -718,7 +718,7 @@ vl_idct_upload_matrix(struct pipe_context *pipe, float scale)
    if (!matrix)
       goto error_matrix;
 
-   f = pipe->transfer_map(pipe, matrix, 0,
+   f = pipe->texture_map(pipe, matrix, 0,
                                      PIPE_MAP_WRITE |
                                      PIPE_MAP_DISCARD_RANGE,
                                      &rect, &buf_transfer);
@@ -732,7 +732,7 @@ vl_idct_upload_matrix(struct pipe_context *pipe, float scale)
          // transpose and scale
          f[i * pitch + j] = ((const float (*)[8])const_matrix)[j][i] * scale;
 
-   pipe->transfer_unmap(pipe, buf_transfer);
+   pipe->texture_unmap(pipe, buf_transfer);
 
    memset(&sv_templ, 0, sizeof(sv_templ));
    u_sampler_view_default_template(&sv_templ, matrix, matrix->format);
@@ -836,7 +836,7 @@ vl_idct_flush(struct vl_idct *idct, struct vl_idct_buffer *buffer, unsigned num_
                                    0, 2, idct->samplers);
 
    idct->pipe->set_sampler_views(idct->pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0,
-                                 buffer->sampler_views.stage[0]);
+                                 false, buffer->sampler_views.stage[0]);
 
    /* mismatch control */
    idct->pipe->set_framebuffer_state(idct->pipe, &buffer->fb_state_mismatch);
@@ -863,6 +863,6 @@ vl_idct_prepare_stage2(struct vl_idct *idct, struct vl_idct_buffer *buffer)
    idct->pipe->bind_sampler_states(idct->pipe, PIPE_SHADER_FRAGMENT,
                                    0, 2, idct->samplers);
    idct->pipe->set_sampler_views(idct->pipe, PIPE_SHADER_FRAGMENT,
-                                 0, 2, 0, buffer->sampler_views.stage[1]);
+                                 0, 2, 0, false, buffer->sampler_views.stage[1]);
 }
 
diff --git a/lib/mesa/src/gallium/auxiliary/vl/vl_mc.c b/lib/mesa/src/gallium/auxiliary/vl/vl_mc.c
index 0b2a210cb..d331da1d5 100644
--- a/lib/mesa/src/gallium/auxiliary/vl/vl_mc.c
+++ b/lib/mesa/src/gallium/auxiliary/vl/vl_mc.c
@@ -622,7 +622,7 @@ vl_mc_render_ref(struct vl_mc *renderer, struct vl_mc_buffer *buffer, struct pip
    renderer->pipe->bind_fs_state(renderer->pipe, renderer->fs_ref);
 
    renderer->pipe->set_sampler_views(renderer->pipe, PIPE_SHADER_FRAGMENT,
-                                     0, 1, 0, &ref);
+                                     0, 1, 0, false, &ref);
    renderer->pipe->bind_sampler_states(renderer->pipe, PIPE_SHADER_FRAGMENT,
                                        0, 1, &renderer->sampler_ref);