summaryrefslogtreecommitdiff
path: root/lib/mesa/src/broadcom
diff options
context:
space:
mode:
authorJonathan Gray <jsg@cvs.openbsd.org>2020-01-22 02:13:18 +0000
committerJonathan Gray <jsg@cvs.openbsd.org>2020-01-22 02:13:18 +0000
commitfdcc03929065b5bf5dd93553db219ea3e05c8c34 (patch)
treeca90dc8d9e89febdcd4160956c1b8ec098a4efc9 /lib/mesa/src/broadcom
parent3c9de4a7e13712b5696750bbd59a18c848742022 (diff)
Import Mesa 19.2.8
Diffstat (limited to 'lib/mesa/src/broadcom')
-rw-r--r--lib/mesa/src/broadcom/.editorconfig3
-rw-r--r--lib/mesa/src/broadcom/Android.cle.mk39
-rw-r--r--lib/mesa/src/broadcom/Android.genxml.mk83
-rw-r--r--lib/mesa/src/broadcom/Android.mk29
-rw-r--r--lib/mesa/src/broadcom/cle/meson.build2
-rw-r--r--lib/mesa/src/broadcom/cle/v3d_decoder.c3
-rw-r--r--lib/mesa/src/broadcom/cle/v3d_packet_v33.xml141
-rw-r--r--lib/mesa/src/broadcom/common/v3d_device_info.c79
-rw-r--r--lib/mesa/src/broadcom/common/v3d_limits.h5
-rw-r--r--lib/mesa/src/broadcom/compiler/meson.build4
-rw-r--r--lib/mesa/src/broadcom/compiler/nir_to_vir.c1471
-rw-r--r--lib/mesa/src/broadcom/compiler/qpu_schedule.c312
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d33_tex.c22
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d40_tex.c104
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_compiler.h169
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c287
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c411
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c153
-rw-r--r--lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c42
-rw-r--r--lib/mesa/src/broadcom/compiler/vir.c302
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_dump.c87
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_live_variables.c60
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c33
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c36
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c143
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c20
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_register_allocate.c228
-rw-r--r--lib/mesa/src/broadcom/compiler/vir_to_qpu.c86
-rw-r--r--lib/mesa/src/broadcom/drm-shim/README.md17
-rw-r--r--lib/mesa/src/broadcom/drm-shim/meson.build62
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3d.c98
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3d.h70
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3d_noop.c158
-rw-r--r--lib/mesa/src/broadcom/drm-shim/v3dx.c370
-rw-r--r--lib/mesa/src/broadcom/meson.build6
-rw-r--r--lib/mesa/src/broadcom/qpu/meson.build3
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_disasm.c4
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_instr.c120
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_instr.h4
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_pack.c34
-rw-r--r--lib/mesa/src/broadcom/qpu/qpu_validate.c0
-rw-r--r--lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c3
42 files changed, 3915 insertions, 1388 deletions
diff --git a/lib/mesa/src/broadcom/.editorconfig b/lib/mesa/src/broadcom/.editorconfig
new file mode 100644
index 000000000..f3d8c4791
--- /dev/null
+++ b/lib/mesa/src/broadcom/.editorconfig
@@ -0,0 +1,3 @@
+[*.{c,h}]
+indent_style = space
+indent_size = 8
diff --git a/lib/mesa/src/broadcom/Android.cle.mk b/lib/mesa/src/broadcom/Android.cle.mk
new file mode 100644
index 000000000..5634a8d4a
--- /dev/null
+++ b/lib/mesa/src/broadcom/Android.cle.mk
@@ -0,0 +1,39 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_broadcom_cle
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+LOCAL_SRC_FILES := $(BROADCOM_DECODER_FILES)
+
+LOCAL_STATIC_LIBRARIES := libmesa_broadcom_genxml
+
+LOCAL_C_INCLUDES += $(MESA_TOP)/src/gallium/include
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)
+
+LOCAL_SHARED_LIBRARIES := libexpat libz
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/broadcom/Android.genxml.mk b/lib/mesa/src/broadcom/Android.genxml.mk
new file mode 100644
index 000000000..91e0de05d
--- /dev/null
+++ b/lib/mesa/src/broadcom/Android.genxml.mk
@@ -0,0 +1,83 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_broadcom_genxml
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+intermediates := $(call local-generated-sources-dir)
+
+# dummy.c source file is generated to meet the build system's rules.
+LOCAL_GENERATED_SOURCES += $(intermediates)/dummy.c
+
+$(intermediates)/dummy.c:
+ @mkdir -p $(dir $@)
+ @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) touch $@
+
+# This is the list of auto-generated files headers
+LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/broadcom/, $(BROADCOM_GENXML_GENERATED_FILES))
+
+define pack-header-gen
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(PRIVATE_SCRIPT) $(PRIVATE_SCRIPT_FLAGS) $(PRIVATE_XML) $(PRIVATE_VER) > $@
+endef
+
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v21.xml
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_VER := 21
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v21.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_VER := 33
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_VER := 41
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_VER := 42
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+ $(call pack-header-gen)
+
+$(intermediates)/broadcom/cle/v3d_xml.h: $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py
+ @mkdir -p $(dir $@)
+ @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
+ $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) > $@ || (rm -f $@; false)
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+ $(MESA_TOP)/src/broadcom/cle \
+ $(intermediates)/broadcom/cle \
+ $(intermediates)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/lib/mesa/src/broadcom/Android.mk b/lib/mesa/src/broadcom/Android.mk
new file mode 100644
index 000000000..b3bf40510
--- /dev/null
+++ b/lib/mesa/src/broadcom/Android.mk
@@ -0,0 +1,29 @@
+# Copyright © 2016 Intel Corporation
+# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+
+LOCAL_PATH := $(call my-dir)
+
+# Import variables
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(LOCAL_PATH)/Android.genxml.mk
+include $(LOCAL_PATH)/Android.cle.mk
diff --git a/lib/mesa/src/broadcom/cle/meson.build b/lib/mesa/src/broadcom/cle/meson.build
index afaf5a1b4..a2f47625a 100644
--- a/lib/mesa/src/broadcom/cle/meson.build
+++ b/lib/mesa/src/broadcom/cle/meson.build
@@ -58,6 +58,6 @@ libbroadcom_cle = static_library(
'v3d_decoder.c',
include_directories : [inc_common, inc_broadcom],
c_args : [c_vis_args, no_override_init_args],
- dependencies : [dep_libdrm, dep_valgrind],
+ dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib],
build_by_default : false,
)
diff --git a/lib/mesa/src/broadcom/cle/v3d_decoder.c b/lib/mesa/src/broadcom/cle/v3d_decoder.c
index 373a1d996..23ee59fd0 100644
--- a/lib/mesa/src/broadcom/cle/v3d_decoder.c
+++ b/lib/mesa/src/broadcom/cle/v3d_decoder.c
@@ -651,7 +651,8 @@ v3d_spec_load(const struct v3d_device_info *devinfo)
struct parser_context ctx;
void *buf;
uint8_t *text_data = NULL;
- uint32_t text_offset = 0, text_length = 0, total_length;
+ uint32_t text_offset = 0, text_length = 0;
+ ASSERTED uint32_t total_length;
for (int i = 0; i < ARRAY_SIZE(genxml_files_table); i++) {
if (i != 0) {
diff --git a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml
index 06e8ddad7..f40796612 100644
--- a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml
+++ b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml
@@ -250,6 +250,28 @@
<value name="RGBA" value="3"/>
</enum>
+ <enum name="Pack Mode" prefix="V3D_PACK_MODE">
+ <value name="16-way" value="0"/>
+ <value name="8-way" value="1"/>
+ <value name="4-way" value="2"/>
+ </enum>
+
+ <enum name="TCS flush mode" prefix="V3D_TCS_FLUSH_MODE">
+ <value name="fully packed" value="0"/>
+ <value name="single patch" value="1"/>
+ <value name="packed complete patches" value="2"/>
+ </enum>
+
+ <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS">
+ <value name="tf_words_buffer0" value="0"/>
+ <value name="tf_words_buffer1" value="1"/>
+ <value name="tf_words_buffer2" value="2"/>
+ <value name="tf_words_buffer3" value="3"/>
+ <value name="written" value="4"/>
+ <value name="tf_written" value="5"/>
+ <value name="tf_overflow" value="6"/>
+ </enum>
+
<packet code="0" name="Halt"/>
<packet code="1" name="NOP"/>
<packet code="4" name="Flush"/>
@@ -552,6 +574,14 @@
<field name="mode" size="8" start="0" type="Primitive"/>
</packet>
+ <packet code="39" name="Vertex Array Single Instance Prims" cl="B">
+ <field name="Index of First Vertex" size="32" start="72" type="uint"/>
+ <field name="Instance ID" size="32" start="40" type="uint"/>
+ <field name="Instance Length" size="32" start="8" type="uint"/>
+
+ <field name="mode" size="8" start="0" type="Primitive"/>
+ </packet>
+
<packet code="43" name="Base Vertex Base Instance" cl="B">
<field name="Base Instance" size="32" start="32" type="uint"/>
@@ -563,6 +593,14 @@
<field name="Size" size="32" start="32" type="uint"/>
</packet>
+ <packet code="54" name="Set InstanceID" cl="B" min_ver="41">
+ <field name="Instance ID" size="32" start="32" type="uint"/>
+ </packet>
+
+ <packet code="55" name="Set PrimitiveID" cl="B" min_ver="41">
+ <field name="Primitive ID" size="32" start="32" type="uint"/>
+ </packet>
+
<packet code="56" name="Prim List Format">
<field name="tri strip or fan" size="1" start="7" type="bool"/>
<field name="primitive type" size="6" start="0" type="uint">
@@ -572,16 +610,64 @@
</field>
</packet>
+ <packet code="57" name="Serial Number List Start">
+ <field name="address" size="26" start="6" type="address"/>
+ <field name="block size" size="2" start="0" type="uint">
+ <value name="block size 64b" value="0"/>
+ <value name="block size 128b" value="1"/>
+ <value name="block size 256b" value="2"/>
+ </field>
+ </packet>
+
<packet code="64" shortname="gl_shader" name="GL Shader State">
<field name="address" size="27" start="5" type="address"/>
<field name="number of attribute arrays" size="5" start="0" type="uint"/>
</packet>
+ <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS" min_ver="41">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="number of attribute arrays" size="5" start="0" type="uint"/>
+ </packet>
+
+ <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS" min_ver="41">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="number of attribute arrays" size="5" start="0" type="uint"/>
+ </packet>
+
+ <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS" min_ver="41">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="number of attribute arrays" size="5" start="0" type="uint"/>
+ </packet>
+
<packet code="71" name="VCM Cache Size" min_ver="41">
<field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
<field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
</packet>
+ <packet code="72" shortname="prim_counts_feedback" name="Primitive Counts Feedback">
+ <field name="address" size="27" start="5" type="address"/>
+ <field name="read/write 64byte" size="1" start="4" type="bool"/>
+ <field name="op" size="4" start="0" type="uint">
+ <!--
+ dword 0-3 are words written to TFB 0-3. 4 is prims generated, 5 is prims written, 6 is
+ prims overflowed
+ -->
+ <value name="store primitive counts" value="0"/>
+ <value name="store primitive counts and zero" value="1"/>
+ <!--
+ write 4 pairs of TFB state: remaining TFB space in buffer n, current address in buffer n
+ -->
+ <value name="store buffer state" value="2"/>
+ <value name="store buffer state CL" value="3"/>
+ <!--
+ Waits for buffer state stores to complete, then loads from
+ the given buffer state. This op can be offset by n to skip
+ waiting for the last n.
+ -->
+ <value name="load buffer state" value="8"/>
+ </field>
+ </packet>
+
<packet code="73" name="VCM Cache Size" max_ver="33">
<field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
<field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
@@ -1200,6 +1286,61 @@
<field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/>
</struct>
+ <struct name="Tessellation/Geometry Common Params" min_ver="41">
+ <field name="Tessellation Type" size="2" start="1" type="uint">
+ <value name="Tessellation Type Triangle" value="0"/>
+ <value name="Tessellation Type Quads" value="1"/>
+ <value name="Tessellation Type Isolines" value="2"/>
+ </field>
+
+ <field name="Tessellation point mode" size="1" start="3" type="bool"/>
+
+ <field name="Tessellation Edge Spacing" size="2" start="4" type="uint">
+ <value name="Tessellation Edge Spacing Even" value="0"/>
+ <value name="Tessellation Edge Spacing Fractional Even" value="1"/>
+ <value name="Tessellation Edge Spacing Fractional Odd" value="2"/>
+ </field>
+
+ <field name="Tessellation clockwise" size="1" start="6" type="bool"/>
+
+ <field name="Tessellation Invocations" size="5" start="12" type="uint"/> <!-- 0 == 32 -->
+
+ <field name="Geometry Shader output format" size="2" start="17" type="uint">
+ <value name="Geometry Shader Points" value="0"/>
+ <value name="Geometry Shader Line Strip" value="1"/>
+ <value name="Geometry Shader Tri Strip" value="2"/>
+ </field>
+
+ <field name="Geometry Shader Instances" size="5" start="19" type="uint"/> <!-- 0 == 32 -->
+
+ <!-- followed by "Tessellation/Geometry Shader Params" for bin, then render -->
+ </struct>
+
+ <struct name="Tessellation/Geometry Shader Params">
+ <field name="TCS Batch Flush Mode" size="2" start="0" type="TCS flush mode"/>
+ <field name="Per-patch data column depth" size="4" start="2" type="uint"/> <!-- 8-dword units, 0==16 -->
+
+ <field name="TCS output segment size in sectors" size="6" start="8" type="uint"/>
+ <field name="TCS output segment pack mode" size="2" start="14" type="Pack Mode"/>
+
+ <field name="TES output segment size in sectors" size="6" start="16" type="uint"/>
+ <field name="TES output segment pack mode" size="2" start="22" type="Pack Mode"/>
+
+ <field name="GS output segment size in sectors" size="6" start="24" type="uint"/>
+ <field name="GS output segment pack mode" size="2" start="30" type="Pack Mode"/>
+
+ <field name="TBG max patches per TCS batch" size="4" start="32" type="uint" minus_one="true"/>
+ <field name="TBG max extra vertex segs for patches after first" size="2" start="36" type="uint"/>
+ <field name="TBG min TCS output segments required in play" size="2" start="38" type="uint" minus_one="true"/>
+ <field name="TBG min per-patch data segments required in play" size="3" start="40" type="uint" minus_one="true"/>
+ <field name="TPG max patches per TES batch" size="4" start="45" type="uint" minus_one="true"/>
+ <field name="TPG max vertex segments per TES batch" size="2" start="49" type="uint"/>
+ <field name="TPG max TCS output segments per TES batch" size="3" start="51" type="uint" minus_one="true"/>
+ <field name="TPG min TES output segments required in play" size="3" start="54" type="uint" minus_one="true"/>
+ <field name="GBG max TES output/vertex segments per GS batch" size="2" start="57" type="uint"/>
+ <field name="GBG max TES output/vertex segments required in play" size="3" start="59" type="uint" minus_one="true"/>
+ </struct>
+
<struct name="GL Shader State Attribute Record" max_ver="33">
<field name="Address" size="32" start="0" type="address"/>
diff --git a/lib/mesa/src/broadcom/common/v3d_device_info.c b/lib/mesa/src/broadcom/common/v3d_device_info.c
new file mode 100644
index 000000000..272190eb2
--- /dev/null
+++ b/lib/mesa/src/broadcom/common/v3d_device_info.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "common/v3d_device_info.h"
+#include "drm-uapi/v3d_drm.h"
+
+bool
+v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_ioctl) {
+ struct drm_v3d_get_param ident0 = {
+ .param = DRM_V3D_PARAM_V3D_CORE0_IDENT0,
+ };
+ struct drm_v3d_get_param ident1 = {
+ .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+ };
+ int ret;
+
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core IDENT0: %s\n",
+ strerror(errno));
+ return false;
+ }
+ ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident1);
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't get V3D core IDENT1: %s\n",
+ strerror(errno));
+ return false;
+ }
+
+ uint32_t major = (ident0.value >> 24) & 0xff;
+ uint32_t minor = (ident1.value >> 0) & 0xf;
+
+ devinfo->ver = major * 10 + minor;
+
+ devinfo->vpm_size = (ident1.value >> 28 & 0xf) * 8192;
+
+ int nslc = (ident1.value >> 4) & 0xf;
+ int qups = (ident1.value >> 8) & 0xf;
+ devinfo->qpu_count = nslc * qups;
+
+ switch (devinfo->ver) {
+ case 33:
+ case 41:
+ case 42:
+ break;
+ default:
+ fprintf(stderr,
+ "V3D %d.%d not supported by this version of Mesa.\n",
+ devinfo->ver / 10,
+ devinfo->ver % 10);
+ return false;
+ }
+
+ return true;
+}
diff --git a/lib/mesa/src/broadcom/common/v3d_limits.h b/lib/mesa/src/broadcom/common/v3d_limits.h
index e21ee246e..776847622 100644
--- a/lib/mesa/src/broadcom/common/v3d_limits.h
+++ b/lib/mesa/src/broadcom/common/v3d_limits.h
@@ -24,6 +24,11 @@
#ifndef V3D_LIMITS_H
#define V3D_LIMITS_H
+/* Number of channels a QPU thread executes in parallel. Also known as
+ * gl_SubGroupSizeARB.
+ */
+#define V3D_CHANNELS 16
+
#define V3D_MAX_FS_INPUTS 64
#define V3D_MAX_VS_INPUTS 64
diff --git a/lib/mesa/src/broadcom/compiler/meson.build b/lib/mesa/src/broadcom/compiler/meson.build
index c80918db3..d7af999c3 100644
--- a/lib/mesa/src/broadcom/compiler/meson.build
+++ b/lib/mesa/src/broadcom/compiler/meson.build
@@ -23,9 +23,9 @@ libbroadcom_compiler_files = files(
'vir.c',
'vir_dump.c',
'vir_live_variables.c',
- 'vir_lower_uniforms.c',
'vir_opt_copy_propagate.c',
'vir_opt_dead_code.c',
+ 'vir_opt_redundant_flags.c',
'vir_opt_small_immediates.c',
'vir_register_allocate.c',
'vir_to_qpu.c',
@@ -37,6 +37,8 @@ libbroadcom_compiler_files = files(
'v3d_compiler.h',
'v3d_nir_lower_io.c',
'v3d_nir_lower_image_load_store.c',
+ 'v3d_nir_lower_logic_ops.c',
+ 'v3d_nir_lower_scratch.c',
'v3d_nir_lower_txf_ms.c',
)
diff --git a/lib/mesa/src/broadcom/compiler/nir_to_vir.c b/lib/mesa/src/broadcom/compiler/nir_to_vir.c
index bd19bb9b0..01468fa87 100644
--- a/lib/mesa/src/broadcom/compiler/nir_to_vir.c
+++ b/lib/mesa/src/broadcom/compiler/nir_to_vir.c
@@ -32,18 +32,15 @@
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"
+/* We don't do any address packing. */
+#define __gen_user_data void
+#define __gen_address_type uint32_t
+#define __gen_address_offset(reloc) (*reloc)
+#define __gen_emit_reloc(cl, reloc)
+#include "cle/v3d_packet_v41_pack.h"
+
#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
-#define GENERAL_TMU_READ_OP_PREFETCH (0 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_CLEAR (1 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_FLUSH (3 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_CLEAN (3 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR (4 << 3)
-#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_INC (8 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_DEC (9 << 3)
-#define GENERAL_TMU_READ_OP_ATOMIC_NOT (10 << 3)
-#define GENERAL_TMU_READ_OP_READ (15 << 3)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
@@ -53,19 +50,6 @@
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP (0 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP (1 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG (2 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG (3 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN (4 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX (5 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN (6 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX (7 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_AND (8 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_OR (9 << 3)
-#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR (10 << 3)
-#define GENERAL_TMU_WRITE_OP_WRITE (15 << 3)
-
#define V3D_TSY_SET_QUORUM 0
#define V3D_TSY_INC_WAITERS 1
#define V3D_TSY_DEC_WAITERS 2
@@ -122,6 +106,27 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw = vir_NOP(c);
c->last_thrsw->qpu.sig.thrsw = true;
c->last_thrsw_at_top_level = !c->in_control_flow;
+
+        /* We need to lock the scoreboard before any TLB access happens. If this
+ * thread switch comes after we have emitted a tlb load, then it means
+ * that we can't lock on the last thread switch any more.
+ */
+ if (c->emitted_tlb_load)
+ c->lock_scoreboard_on_first_thrsw = true;
+}
+
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
+{
+ if (nir_src_is_const(instr->src[src])) {
+ int64_t add_val = nir_src_as_int(instr->src[src]);
+ if (add_val == 1)
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
+ else if (add_val == -1)
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ }
+
+ return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
}
static uint32_t
@@ -132,40 +137,42 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_uniform:
case nir_intrinsic_load_shared:
- return GENERAL_TMU_READ_OP_READ;
+ case nir_intrinsic_load_scratch:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_shared:
- return GENERAL_TMU_WRITE_OP_WRITE;
+ case nir_intrinsic_store_scratch:
+ return V3D_TMU_OP_REGULAR;
case nir_intrinsic_ssbo_atomic_add:
+ return v3d_get_op_for_atomic_add(instr, 2);
case nir_intrinsic_shared_atomic_add:
- return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
+ return v3d_get_op_for_atomic_add(instr, 1);
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_shared_atomic_imin:
- return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
+ return V3D_TMU_OP_WRITE_SMIN;
case nir_intrinsic_ssbo_atomic_umin:
case nir_intrinsic_shared_atomic_umin:
- return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
+ return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_shared_atomic_imax:
- return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
+ return V3D_TMU_OP_WRITE_SMAX;
case nir_intrinsic_ssbo_atomic_umax:
case nir_intrinsic_shared_atomic_umax:
- return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
+ return V3D_TMU_OP_WRITE_UMAX;
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_shared_atomic_and:
- return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
case nir_intrinsic_ssbo_atomic_or:
case nir_intrinsic_shared_atomic_or:
- return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_shared_atomic_xor:
- return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
+ return V3D_TMU_OP_WRITE_XOR_READ_NOT;
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_shared_atomic_exchange:
- return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
+ return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_shared_atomic_comp_swap:
- return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
+ return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
default:
unreachable("unknown intrinsic op");
}
@@ -177,147 +184,217 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
- bool is_shared)
+ bool is_shared_or_scratch)
{
- /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
- * wants to have support for inc/dec?
+ uint32_t tmu_op = v3d_general_tmu_op(instr);
+
+ /* If we were able to replace atomic_add for an inc/dec, then we
+         * need/can do things slightly differently, like not loading the
+ * amount to add/sub, as that is implicit.
*/
+ bool atomic_add_replaced =
+ ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
+ instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+ (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
- uint32_t tmu_op = v3d_general_tmu_op(instr);
bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
+ instr->intrinsic == nir_intrinsic_store_scratch ||
instr->intrinsic == nir_intrinsic_store_shared);
- bool has_index = !is_shared;
+
+ bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
+ instr->intrinsic == nir_intrinsic_load_ubo ||
+ instr->intrinsic == nir_intrinsic_load_ssbo ||
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared);
+
+ bool has_index = !is_shared_or_scratch;
int offset_src;
- int tmu_writes = 1; /* address */
if (instr->intrinsic == nir_intrinsic_load_uniform) {
offset_src = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_ubo ||
- instr->intrinsic == nir_intrinsic_load_shared) {
+ instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_load_shared ||
+ atomic_add_replaced) {
offset_src = 0 + has_index;
} else if (is_store) {
offset_src = 1 + has_index;
- for (int i = 0; i < instr->num_components; i++) {
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[0], i));
- tmu_writes++;
- }
} else {
offset_src = 0 + has_index;
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[1 + has_index], 0));
- tmu_writes++;
- if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
- vir_MOV_dest(c,
- vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- ntq_get_src(c, instr->src[2 + has_index],
- 0));
- tmu_writes++;
- }
}
- /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
- * storing at the same time.
- */
- while (tmu_writes > 16 / c->threads)
- c->threads /= 2;
+ bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
+ uint32_t const_offset = 0;
+ if (!dynamic_src)
+ const_offset = nir_src_as_uint(instr->src[offset_src]);
- struct qreg offset;
+ struct qreg base_offset;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0);
-
- /* Find what variable in the default uniform block this
- * uniform load is coming from.
- */
- uint32_t base = nir_intrinsic_base(instr);
- int i;
- struct v3d_ubo_range *range = NULL;
- for (i = 0; i < c->num_ubo_ranges; i++) {
- range = &c->ubo_ranges[i];
- if (base >= range->src_offset &&
- base < range->src_offset + range->size) {
- break;
- }
- }
- /* The driver-location-based offset always has to be within a
- * declared uniform range.
- */
- assert(i != c->num_ubo_ranges);
- if (!c->ubo_range_used[i]) {
- c->ubo_range_used[i] = true;
- range->dst_offset = c->next_ubo_dst_offset;
- c->next_ubo_dst_offset += range->size;
- }
-
- base = base - range->src_offset + range->dst_offset;
-
- if (base != 0)
- offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
+ const_offset += nir_intrinsic_base(instr);
+ base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(0, const_offset));
+ const_offset = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
+ uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
/* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
* 1 (0 is gallium's constant buffer 0).
*/
- offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
- nir_src_as_uint(instr->src[0]) + 1);
- } else if (is_shared) {
- /* Shared variables have no buffer index, and all start from a
- * common base that we set up at the start of dispatch
+ base_offset =
+ vir_uniform(c, QUNIFORM_UBO_ADDR,
+ v3d_unit_data_create(index, const_offset));
+ const_offset = 0;
+ } else if (is_shared_or_scratch) {
+ /* Shared and scratch variables have no buffer index, and all
+ * start from a common base that we set up at the start of
+ * dispatch.
*/
- offset = c->cs_shared_offset;
+ if (instr->intrinsic == nir_intrinsic_load_scratch ||
+ instr->intrinsic == nir_intrinsic_store_scratch) {
+ base_offset = c->spill_base;
+ } else {
+ base_offset = c->cs_shared_offset;
+ const_offset += nir_intrinsic_base(instr);
+ }
} else {
- offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
- nir_src_as_uint(instr->src[is_store ?
- 1 : 0]));
+ base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
+ nir_src_as_uint(instr->src[is_store ?
+ 1 : 0]));
}
- uint32_t config = (0xffffff00 |
- tmu_op |
- GENERAL_TMU_LOOKUP_PER_PIXEL);
- if (instr->num_components == 1) {
- config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
- } else {
- config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
- instr->num_components - 2);
- }
+ struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
+ unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
+ uint32_t base_const_offset = const_offset;
+ int first_component = -1;
+ int last_component = -1;
+ do {
+ int tmu_writes = 1; /* address */
- if (c->execute.file != QFILE_NULL)
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ if (is_store) {
+ /* Find the first set of consecutive components that
+ * are enabled in the writemask and emit the TMUD
+ * instructions for them.
+ */
+ first_component = ffs(writemask) - 1;
+ last_component = first_component;
+ while (writemask & BITFIELD_BIT(last_component + 1))
+ last_component++;
+
+ assert(first_component >= 0 &&
+ first_component <= last_component &&
+ last_component < instr->num_components);
+
+ struct qreg tmud = vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_TMUD);
+ for (int i = first_component; i <= last_component; i++) {
+ struct qreg data =
+ ntq_get_src(c, instr->src[0], i);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ }
- struct qreg dest;
- if (config == ~0)
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
- else
- dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+ /* Update the offset for the TMU write based on the
+ * the first component we are writing.
+ */
+ const_offset = base_const_offset + first_component * 4;
+
+ /* Clear these components from the writemask */
+ uint32_t written_mask =
+ BITFIELD_RANGE(first_component, tmu_writes - 1);
+ writemask &= ~written_mask;
+ } else if (!is_load && !atomic_add_replaced) {
+ struct qreg data =
+ ntq_get_src(c, instr->src[1 + has_index], 0);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
+ data = ntq_get_src(c, instr->src[2 + has_index],
+ 0);
+ vir_MOV_dest(c, tmud, data);
+ tmu_writes++;
+ }
+ }
- struct qinst *tmu;
- if (nir_src_is_const(instr->src[offset_src]) &&
- nir_src_as_uint(instr->src[offset_src]) == 0) {
- tmu = vir_MOV_dest(c, dest, offset);
- } else {
- tmu = vir_ADD_dest(c, dest,
- offset,
- ntq_get_src(c, instr->src[offset_src], 0));
- }
+ /* Make sure we won't exceed the 16-entry TMU fifo if each
+ * thread is storing at the same time.
+ */
+ while (tmu_writes > 16 / c->threads)
+ c->threads /= 2;
- if (config != ~0) {
- tmu->src[vir_get_implicit_uniform_src(tmu)] =
- vir_uniform_ui(c, config);
- }
+ /* The spec says that for atomics, the TYPE field is ignored,
+ * but that doesn't seem to be the case for CMPXCHG. Just use
+ * the number of tmud writes we did to decide the type (or
+ * choose "32bit" for atomic reads, which has been fine).
+ */
+ uint32_t num_components;
+ if (is_load || atomic_add_replaced) {
+ num_components = instr->num_components;
+ } else {
+ assert(tmu_writes > 1);
+ num_components = tmu_writes - 1;
+ }
+
+ uint32_t config = (0xffffff00 |
+ tmu_op << 3|
+ GENERAL_TMU_LOOKUP_PER_PIXEL);
+ if (num_components == 1) {
+ config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+ } else {
+ config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+ num_components - 2;
+ }
+
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
- if (c->execute.file != QFILE_NULL)
- vir_set_cond(tmu, V3D_QPU_COND_IFA);
+ struct qreg tmua;
+ if (config == ~0)
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+ else
+ tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+ struct qinst *tmu;
+ if (dynamic_src) {
+ struct qreg offset = base_offset;
+ if (const_offset != 0) {
+ offset = vir_ADD(c, offset,
+ vir_uniform_ui(c, const_offset));
+ }
+ struct qreg data =
+ ntq_get_src(c, instr->src[offset_src], 0);
+ tmu = vir_ADD_dest(c, tmua, offset, data);
+ } else {
+ if (const_offset != 0) {
+ tmu = vir_ADD_dest(c, tmua, base_offset,
+ vir_uniform_ui(c, const_offset));
+ } else {
+ tmu = vir_MOV_dest(c, tmua, base_offset);
+ }
+ }
- vir_emit_thrsw(c);
+ if (config != ~0) {
+ tmu->uniform =
+ vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ config);
+ }
+
+ if (vir_in_nonuniform_control_flow(c))
+ vir_set_cond(tmu, V3D_QPU_COND_IFA);
+
+ vir_emit_thrsw(c);
- /* Read the result, or wait for the TMU op to complete. */
- for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
- ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+ /* Read the result, or wait for the TMU op to complete. */
+ for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, vir_LDTMU(c)));
+ }
- if (nir_intrinsic_dest_components(instr) == 0)
- vir_TMUWT(c);
+ if (nir_intrinsic_dest_components(instr) == 0)
+ vir_TMUWT(c);
+ } while (is_store && writemask != 0);
}
static struct qreg *
@@ -329,6 +406,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
return qregs;
}
+static bool
+is_ld_signal(const struct v3d_qpu_sig *sig)
+{
+ return (sig->ldunif ||
+ sig->ldunifa ||
+ sig->ldunifrf ||
+ sig->ldunifarf ||
+ sig->ldtmu ||
+ sig->ldvary ||
+ sig->ldvpm ||
+ sig->ldtlb ||
+ sig->ldtlbu);
+}
+
/**
* This function is responsible for getting VIR results into the associated
* storage for a NIR instruction.
@@ -352,8 +443,7 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
if (!list_empty(&c->cur_block->instructions))
last_inst = (struct qinst *)c->cur_block->instructions.prev;
- assert(result.file == QFILE_UNIF ||
- (result.file == QFILE_TEMP &&
+ assert((result.file == QFILE_TEMP &&
last_inst && last_inst == c->defs[result.index]));
if (dest->is_ssa) {
@@ -377,10 +467,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
_mesa_hash_table_search(c->def_ht, reg);
struct qreg *qregs = entry->data;
- /* Insert a MOV if the source wasn't an SSA def in the
- * previous instruction.
+ /* If the previous instruction can't be predicated for
+ * the store into the nir_register, then emit a MOV
+ * that can be.
*/
- if (result.file == QFILE_UNIF) {
+ if (vir_in_nonuniform_control_flow(c) &&
+ is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) {
result = vir_MOV(c, result);
last_inst = c->defs[result.index];
}
@@ -392,17 +484,17 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
/* If we're in control flow, then make this update of the reg
* conditional on the execution mask.
*/
- if (c->execute.file != QFILE_NULL) {
+ if (vir_in_nonuniform_control_flow(c)) {
last_inst->dst.index = qregs[chan].index;
/* Set the flags to the current exec mask.
*/
c->cursor = vir_before_inst(last_inst);
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
c->cursor = vir_after_inst(last_inst);
vir_set_cond(last_inst, V3D_QPU_COND_IFA);
- last_inst->cond_is_exec_mask = true;
}
}
}
@@ -540,26 +632,13 @@ ntq_fsign(struct v3d_compile *c, struct qreg src)
struct qreg t = vir_get_temp(c);
vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
- vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
- vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
+ vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
return vir_MOV(c, t);
}
-static struct qreg
-ntq_isign(struct v3d_compile *c, struct qreg src)
-{
- struct qreg t = vir_get_temp(c);
-
- vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
- vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
- vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
- vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
- vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
- return vir_MOV(c, t);
-}
-
static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
@@ -675,27 +754,6 @@ add_output(struct v3d_compile *c,
v3d_slot_from_slot_and_component(slot, swizzle);
}
-static void
-declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
-{
- unsigned array_id = c->num_ubo_ranges++;
- if (array_id >= c->ubo_ranges_array_size) {
- c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
- array_id + 1);
- c->ubo_ranges = reralloc(c, c->ubo_ranges,
- struct v3d_ubo_range,
- c->ubo_ranges_array_size);
- c->ubo_range_used = reralloc(c, c->ubo_range_used,
- bool,
- c->ubo_ranges_array_size);
- }
-
- c->ubo_ranges[array_id].dst_offset = 0;
- c->ubo_ranges[array_id].src_offset = start;
- c->ubo_ranges[array_id].size = size;
- c->ubo_range_used[array_id] = false;
-}
-
/**
* If compare_instr is a valid comparison instruction, emits the
* compare_instr's comparison and returns the sel_instr's return value based
@@ -711,7 +769,7 @@ ntq_emit_comparison(struct v3d_compile *c,
if (nir_op_infos[compare_instr->op].num_inputs > 1)
src1 = ntq_get_alu_src(c, compare_instr, 1);
bool cond_invert = false;
- struct qreg nop = vir_reg(QFILE_NULL, 0);
+ struct qreg nop = vir_nop_reg();
switch (compare_instr->op) {
case nir_op_feq32:
@@ -756,6 +814,16 @@ ntq_emit_comparison(struct v3d_compile *c,
vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
break;
+ case nir_op_i2b32:
+ vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
+ case nir_op_f2b32:
+ vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
+ cond_invert = true;
+ break;
+
default:
return false;
}
@@ -789,28 +857,24 @@ ntq_get_alu_parent(nir_src src)
return instr;
}
-/**
- * Attempts to fold a comparison generating a boolean result into the
- * condition code for selecting between two values, instead of comparing the
- * boolean result against 0 to generate the condition code.
- */
-static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
- struct qreg *src)
+/* Turns a NIR bool into a condition code to predicate on. */
+static enum v3d_qpu_cond
+ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
- nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src);
+ nir_alu_instr *compare = ntq_get_alu_parent(src);
if (!compare)
goto out;
enum v3d_qpu_cond cond;
if (ntq_emit_comparison(c, compare, &cond))
- return vir_MOV(c, vir_SEL(c, cond, src[1], src[2]));
+ return cond;
out:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
- return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
+ V3D_QPU_PF_PUSHZ);
+ return V3D_QPU_COND_IFNA;
}
-
static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
@@ -843,8 +907,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
struct qreg result;
switch (instr->op) {
- case nir_op_fmov:
- case nir_op_imov:
+ case nir_op_mov:
result = vir_MOV(c, src[0]);
break;
@@ -871,9 +934,16 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
result = vir_FMAX(c, src[0], src[1]);
break;
- case nir_op_f2i32:
- result = vir_FTOIZ(c, src[0]);
+ case nir_op_f2i32: {
+ nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
+ if (src0_alu && src0_alu->op == nir_op_fround_even) {
+ result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
+ } else {
+ result = vir_FTOIZ(c, src[0]);
+ }
break;
+ }
+
case nir_op_f2u32:
result = vir_FTOUZ(c, src[0]);
break;
@@ -889,13 +959,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_b2i32:
result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
break;
- case nir_op_i2b32:
- case nir_op_f2b32:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
- result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
- vir_uniform_ui(c, ~0),
- vir_uniform_ui(c, 0)));
- break;
case nir_op_iadd:
result = vir_ADD(c, src[0], src[1]);
@@ -950,7 +1013,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_sge:
case nir_op_slt: {
enum v3d_qpu_cond cond;
- MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+ ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
assert(ok);
result = vir_MOV(c, vir_SEL(c, cond,
vir_uniform_f(c, 1.0),
@@ -958,6 +1021,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
}
+ case nir_op_i2b32:
+ case nir_op_f2b32:
case nir_op_feq32:
case nir_op_fne32:
case nir_op_fge32:
@@ -969,7 +1034,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_ilt32:
case nir_op_ult32: {
enum v3d_qpu_cond cond;
- MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
+ ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
assert(ok);
result = vir_MOV(c, vir_SEL(c, cond,
vir_uniform_ui(c, ~0),
@@ -978,10 +1043,15 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
}
case nir_op_b32csel:
- result = ntq_emit_bcsel(c, instr, src);
+ result = vir_MOV(c,
+ vir_SEL(c,
+ ntq_emit_bool_to_cond(c, instr->src[0].src),
+ src[1], src[2]));
break;
+
case nir_op_fcsel:
- vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]),
+ V3D_QPU_PF_PUSHZ);
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
src[1], src[2]));
break;
@@ -1011,9 +1081,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_ftrunc:
result = vir_FTRUNC(c, src[0]);
break;
- case nir_op_ffract:
- result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
- break;
case nir_op_fsin:
result = ntq_fsincos(c, src[0], false);
@@ -1025,9 +1092,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
case nir_op_fsign:
result = ntq_fsign(c, src[0]);
break;
- case nir_op_isign:
- result = ntq_isign(c, src[0]);
- break;
case nir_op_fabs: {
result = vir_FMOV(c, src[0]);
@@ -1036,8 +1100,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
}
case nir_op_iabs:
- result = vir_MAX(c, src[0],
- vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
+ result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
break;
case nir_op_fddx:
@@ -1053,7 +1116,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
case nir_op_uadd_carry:
- vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC);
+ vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
+ V3D_QPU_PF_PUSHC);
result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
vir_uniform_ui(c, ~0),
vir_uniform_ui(c, 0)));
@@ -1064,9 +1128,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
break;
case nir_op_unpack_half_2x16_split_x:
- /* XXX perf: It would be good to be able to merge this unpack
- * with whatever uses our result.
- */
result = vir_FMOV(c, src[0]);
vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
break;
@@ -1120,6 +1181,107 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
static void
+vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
+{
+ if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
+ return;
+
+ struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
+ struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
+
+ nir_variable *var = c->output_color_var[rt];
+ int num_components = glsl_get_vector_elements(var->type);
+ uint32_t conf = 0xffffff00;
+ struct qinst *inst;
+
+ conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
+ TLB_SAMPLE_MODE_PER_PIXEL;
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (c->fs_key->swap_color_rb & (1 << rt))
+ num_components = MAX2(num_components, 3);
+ assert(num_components != 0);
+
+ enum glsl_base_type type = glsl_get_base_type(var->type);
+ bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ if (is_int_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ if (c->devinfo->ver < 42)
+ conf |= TLB_TYPE_I32_COLOR;
+ else
+ conf |= TLB_TYPE_F32_COLOR;
+ conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ if (c->fs_key->f32_color_rb & (1 << rt)) {
+ conf |= TLB_TYPE_F32_COLOR;
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
+ } else {
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
+ }
+ }
+
+ int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
+ for (int i = 0; i < num_samples; i++) {
+ struct qreg *color = c->msaa_per_sample_output ?
+ &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
+ &c->outputs[var->data.driver_location * 4];
+
+ struct qreg r = color[0];
+ struct qreg g = color[1];
+ struct qreg b = color[2];
+ struct qreg a = color[3];
+
+ if (c->fs_key->swap_color_rb & (1 << rt)) {
+ r = color[2];
+ b = color[0];
+ }
+
+ if (c->fs_key->sample_alpha_to_one)
+ a = vir_uniform_f(c, 1.0);
+
+ if (is_32b_tlb_format) {
+ if (i == 0) {
+ inst = vir_MOV_dest(c, tlbu_reg, r);
+ inst->uniform =
+ vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ conf);
+ } else {
+ inst = vir_MOV_dest(c, tlb_reg, r);
+ }
+
+ if (num_components >= 2)
+ vir_MOV_dest(c, tlb_reg, g);
+ if (num_components >= 3)
+ vir_MOV_dest(c, tlb_reg, b);
+ if (num_components >= 4)
+ vir_MOV_dest(c, tlb_reg, a);
+ } else {
+ inst = vir_VFPACK_dest(c, tlb_reg, r, g);
+ if (conf != ~0 && i == 0) {
+ inst->dst = tlbu_reg;
+ inst->uniform =
+ vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ conf);
+ }
+
+ if (num_components >= 3)
+ inst = vir_VFPACK_dest(c, tlb_reg, b, a);
+ }
+ }
+}
+
+static void
emit_frag_end(struct v3d_compile *c)
{
/* XXX
@@ -1129,8 +1291,8 @@ emit_frag_end(struct v3d_compile *c)
*/
bool has_any_tlb_color_write = false;
- for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
- if (c->output_color_var[rt])
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
+ if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
has_any_tlb_color_write = true;
}
@@ -1138,15 +1300,15 @@ emit_frag_end(struct v3d_compile *c)
struct nir_variable *var = c->output_color_var[0];
struct qreg *color = &c->outputs[var->data.driver_location * 4];
- vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_SETMSF_dest(c, vir_nop_reg(),
vir_AND(c,
vir_MSF(c),
vir_FTOC(c, color[3])));
}
+ struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
if (c->output_position_index != -1) {
- struct qinst *inst = vir_MOV_dest(c,
- vir_reg(QFILE_TLBU, 0),
+ struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
c->outputs[c->output_position_index]);
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
@@ -1156,8 +1318,9 @@ emit_frag_end(struct v3d_compile *c)
} else
tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ tlb_specifier |
+ 0xffffff00);
c->writes_z = true;
} else if (c->s->info.fs.uses_discard ||
!c->s->info.fs.early_fragment_tests ||
@@ -1173,9 +1336,8 @@ emit_frag_end(struct v3d_compile *c)
*/
c->s->info.fs.uses_discard = true;
- struct qinst *inst = vir_MOV_dest(c,
- vir_reg(QFILE_TLBU, 0),
- vir_reg(QFILE_NULL, 0));
+ struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
+ vir_nop_reg());
uint8_t tlb_specifier = TLB_TYPE_DEPTH;
if (c->devinfo->ver >= 42) {
@@ -1188,254 +1350,34 @@ emit_frag_end(struct v3d_compile *c)
tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
}
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, tlb_specifier | 0xffffff00);
+ inst->uniform = vir_get_uniform_index(c,
+ QUNIFORM_CONSTANT,
+ tlb_specifier |
+ 0xffffff00);
c->writes_z = true;
}
/* XXX: Performance improvement: Merge Z write and color writes TLB
* uniform setup
*/
-
- for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
- if (!c->output_color_var[rt])
- continue;
-
- nir_variable *var = c->output_color_var[rt];
- struct qreg *color = &c->outputs[var->data.driver_location * 4];
- int num_components = glsl_get_vector_elements(var->type);
- uint32_t conf = 0xffffff00;
- struct qinst *inst;
-
- conf |= TLB_SAMPLE_MODE_PER_PIXEL;
- conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
-
- if (c->fs_key->swap_color_rb & (1 << rt))
- num_components = MAX2(num_components, 3);
-
- assert(num_components != 0);
- switch (glsl_get_base_type(var->type)) {
- case GLSL_TYPE_UINT:
- case GLSL_TYPE_INT:
- /* The F32 vs I32 distinction was dropped in 4.2. */
- if (c->devinfo->ver < 42)
- conf |= TLB_TYPE_I32_COLOR;
- else
- conf |= TLB_TYPE_F32_COLOR;
- conf |= ((num_components - 1) <<
- TLB_VEC_SIZE_MINUS_1_SHIFT);
-
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, conf);
-
- for (int i = 1; i < num_components; i++) {
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
- color[i]);
- }
- break;
-
- default: {
- struct qreg r = color[0];
- struct qreg g = color[1];
- struct qreg b = color[2];
- struct qreg a = color[3];
-
- if (c->fs_key->f32_color_rb & (1 << rt)) {
- conf |= TLB_TYPE_F32_COLOR;
- conf |= ((num_components - 1) <<
- TLB_VEC_SIZE_MINUS_1_SHIFT);
- } else {
- conf |= TLB_TYPE_F16_COLOR;
- conf |= TLB_F16_SWAP_HI_LO;
- if (num_components >= 3)
- conf |= TLB_VEC_SIZE_4_F16;
- else
- conf |= TLB_VEC_SIZE_2_F16;
- }
-
- if (c->fs_key->swap_color_rb & (1 << rt)) {
- r = color[2];
- b = color[0];
- }
-
- if (c->fs_key->sample_alpha_to_one)
- a = vir_uniform_f(c, 1.0);
-
- if (c->fs_key->f32_color_rb & (1 << rt)) {
- inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r);
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, conf);
-
- if (num_components >= 2)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g);
- if (num_components >= 3)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b);
- if (num_components >= 4)
- vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a);
- } else {
- inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
- if (conf != ~0) {
- inst->dst.file = QFILE_TLBU;
- inst->src[vir_get_implicit_uniform_src(inst)] =
- vir_uniform_ui(c, conf);
- }
-
- if (num_components >= 3)
- inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
- }
- break;
- }
- }
- }
+ for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
+ vir_emit_tlb_color_write(c, rt);
}
static void
-vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
+vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
if (c->devinfo->ver >= 40) {
- vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
- *vpm_index = *vpm_index + 1;
+ vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
} else {
+ /* XXX: v3d33_vir_vpm_write_setup(c); */
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
}
-
- c->num_vpm_writes++;
-}
-
-static void
-emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
- uint32_t *vpm_index)
-{
- for (int i = 0; i < 2; i++) {
- struct qreg coord = c->outputs[c->output_position_index + i];
- coord = vir_FMUL(c, coord,
- vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
- 0));
- coord = vir_FMUL(c, coord, rcp_w);
- vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
- }
-
-}
-
-static void
-emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
-{
- struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
- struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
-
- struct qreg z = c->outputs[c->output_position_index + 2];
- z = vir_FMUL(c, z, zscale);
- z = vir_FMUL(c, z, rcp_w);
- z = vir_FADD(c, z, zoffset);
- vir_VPM_WRITE(c, z, vpm_index);
-}
-
-static void
-emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
-{
- vir_VPM_WRITE(c, rcp_w, vpm_index);
-}
-
-static void
-emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
-{
- struct qreg point_size;
-
- if (c->output_point_size_index != -1)
- point_size = c->outputs[c->output_point_size_index];
- else
- point_size = vir_uniform_f(c, 1.0);
-
- /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
- * BCM21553).
- */
- point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
-
- vir_VPM_WRITE(c, point_size, vpm_index);
-}
-
-static void
-emit_vpm_write_setup(struct v3d_compile *c)
-{
- if (c->devinfo->ver >= 40)
- return;
-
- v3d33_vir_vpm_write_setup(c);
-}
-
-/**
- * Sets up c->outputs[c->output_position_index] for the vertex shader
- * epilogue, if an output vertex position wasn't specified in the user's
- * shader. This may be the case for transform feedback with rasterizer
- * discard enabled.
- */
-static void
-setup_default_position(struct v3d_compile *c)
-{
- if (c->output_position_index != -1)
- return;
-
- c->output_position_index = c->outputs_array_size;
- for (int i = 0; i < 4; i++) {
- add_output(c,
- c->output_position_index + i,
- VARYING_SLOT_POS, i);
- }
}
static void
emit_vert_end(struct v3d_compile *c)
{
- setup_default_position(c);
-
- uint32_t vpm_index = 0;
- struct qreg rcp_w = vir_RECIP(c,
- c->outputs[c->output_position_index + 3]);
-
- emit_vpm_write_setup(c);
-
- if (c->vs_key->is_coord) {
- for (int i = 0; i < 4; i++)
- vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
- &vpm_index);
- emit_scaled_viewport_write(c, rcp_w, &vpm_index);
- if (c->vs_key->per_vertex_point_size) {
- emit_point_size_write(c, &vpm_index);
- /* emit_rcp_wc_write(c, rcp_w); */
- }
- /* XXX: Z-only rendering */
- if (0)
- emit_zs_write(c, rcp_w, &vpm_index);
- } else {
- emit_scaled_viewport_write(c, rcp_w, &vpm_index);
- emit_zs_write(c, rcp_w, &vpm_index);
- emit_rcp_wc_write(c, rcp_w, &vpm_index);
- if (c->vs_key->per_vertex_point_size)
- emit_point_size_write(c, &vpm_index);
- }
-
- for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
- struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
- int j;
-
- for (j = 0; j < c->num_outputs; j++) {
- struct v3d_varying_slot output = c->output_slots[j];
-
- if (!memcmp(&input, &output, sizeof(input))) {
- vir_VPM_WRITE(c, c->outputs[j],
- &vpm_index);
- break;
- }
- }
- /* Emit padding if we didn't find a declared VS output for
- * this FS input.
- */
- if (j == c->num_outputs)
- vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
- &vpm_index);
- }
-
/* GFXH-1684: VPM writes need to be complete by the end of the shader.
*/
if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
@@ -1446,25 +1388,48 @@ void
v3d_optimize_nir(struct nir_shader *s)
{
bool progress;
+ unsigned lower_flrp =
+ (s->options->lower_flrp16 ? 16 : 0) |
+ (s->options->lower_flrp32 ? 32 : 0) |
+ (s->options->lower_flrp64 ? 64 : 0);
do {
progress = false;
NIR_PASS_V(s, nir_lower_vars_to_ssa);
- NIR_PASS(progress, s, nir_lower_alu_to_scalar);
+ NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL);
NIR_PASS(progress, s, nir_lower_phis_to_scalar);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, nir_opt_remove_phis);
NIR_PASS(progress, s, nir_opt_dce);
NIR_PASS(progress, s, nir_opt_dead_cf);
NIR_PASS(progress, s, nir_opt_cse);
- NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
+ NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
NIR_PASS(progress, s, nir_opt_algebraic);
NIR_PASS(progress, s, nir_opt_constant_folding);
+
+ if (lower_flrp != 0) {
+ bool lower_flrp_progress = false;
+
+ NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
+ lower_flrp,
+ false /* always_precise */,
+ s->options->lower_ffma);
+ if (lower_flrp_progress) {
+ NIR_PASS(progress, s, nir_opt_constant_folding);
+ progress = true;
+ }
+
+ /* Nothing should rematerialize any flrps, so we only
+ * need to do this lowering once.
+ */
+ lower_flrp = 0;
+ }
+
NIR_PASS(progress, s, nir_opt_undef);
} while (progress);
- NIR_PASS(progress, s, nir_opt_move_load_ubo);
+ NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
}
static int
@@ -1492,7 +1457,6 @@ ntq_emit_vpm_read(struct v3d_compile *c,
if (*num_components_queued != 0) {
(*num_components_queued)--;
- c->num_inputs++;
return vir_MOV(c, vpm);
}
@@ -1502,7 +1466,6 @@ ntq_emit_vpm_read(struct v3d_compile *c,
*num_components_queued = num_components - 1;
*remaining -= num_components;
- c->num_inputs++;
return vir_MOV(c, vpm);
}
@@ -1550,6 +1513,12 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
&num_components, ~0);
}
+ /* The actual loads will happen directly in nir_intrinsic_load_input
+ * on newer versions.
+ */
+ if (c->devinfo->ver >= 40)
+ return;
+
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
(loc + 1) * 4);
@@ -1572,6 +1541,26 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
}
}
+static bool
+var_needs_point_coord(struct v3d_compile *c, nir_variable *var)
+{
+ return (var->data.location == VARYING_SLOT_PNTC ||
+ (var->data.location >= VARYING_SLOT_VAR0 &&
+ (c->fs_key->point_sprite_mask &
+ (1 << (var->data.location - VARYING_SLOT_VAR0)))));
+}
+
+static bool
+program_reads_point_coord(struct v3d_compile *c)
+{
+ nir_foreach_variable(var, &c->s->inputs) {
+ if (var_needs_point_coord(c, var))
+ return true;
+ }
+
+ return false;
+}
+
static void
ntq_setup_fs_inputs(struct v3d_compile *c)
{
@@ -1605,11 +1594,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
if (var->data.location == VARYING_SLOT_POS) {
emit_fragcoord_input(c, loc);
- } else if (var->data.location == VARYING_SLOT_PNTC ||
- (var->data.location >= VARYING_SLOT_VAR0 &&
- (c->fs_key->point_sprite_mask &
- (1 << (var->data.location -
- VARYING_SLOT_VAR0))))) {
+ } else if (var_needs_point_coord(c, var)) {
c->inputs[loc * 4 + 0] = c->point_x;
c->inputs[loc * 4 + 1] = c->point_y;
} else {
@@ -1622,6 +1607,9 @@ ntq_setup_fs_inputs(struct v3d_compile *c)
static void
ntq_setup_outputs(struct v3d_compile *c)
{
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+ return;
+
nir_foreach_variable(var, &c->s->outputs) {
unsigned array_len = MAX2(glsl_get_length(var->type), 1);
unsigned loc = var->data.driver_location * 4;
@@ -1635,58 +1623,30 @@ ntq_setup_outputs(struct v3d_compile *c)
var->data.location_frac + i);
}
- if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
- switch (var->data.location) {
- case FRAG_RESULT_COLOR:
- c->output_color_var[0] = var;
- c->output_color_var[1] = var;
- c->output_color_var[2] = var;
- c->output_color_var[3] = var;
- break;
- case FRAG_RESULT_DATA0:
- case FRAG_RESULT_DATA1:
- case FRAG_RESULT_DATA2:
- case FRAG_RESULT_DATA3:
- c->output_color_var[var->data.location -
- FRAG_RESULT_DATA0] = var;
- break;
- case FRAG_RESULT_DEPTH:
- c->output_position_index = loc;
- break;
- case FRAG_RESULT_SAMPLE_MASK:
- c->output_sample_mask_index = loc;
- break;
- }
- } else {
- switch (var->data.location) {
- case VARYING_SLOT_POS:
- c->output_position_index = loc;
- break;
- case VARYING_SLOT_PSIZ:
- c->output_point_size_index = loc;
- break;
- }
+ switch (var->data.location) {
+ case FRAG_RESULT_COLOR:
+ c->output_color_var[0] = var;
+ c->output_color_var[1] = var;
+ c->output_color_var[2] = var;
+ c->output_color_var[3] = var;
+ break;
+ case FRAG_RESULT_DATA0:
+ case FRAG_RESULT_DATA1:
+ case FRAG_RESULT_DATA2:
+ case FRAG_RESULT_DATA3:
+ c->output_color_var[var->data.location -
+ FRAG_RESULT_DATA0] = var;
+ break;
+ case FRAG_RESULT_DEPTH:
+ c->output_position_index = loc;
+ break;
+ case FRAG_RESULT_SAMPLE_MASK:
+ c->output_sample_mask_index = loc;
+ break;
}
}
}
-static void
-ntq_setup_uniforms(struct v3d_compile *c)
-{
- nir_foreach_variable(var, &c->s->uniforms) {
- uint32_t vec4_count = glsl_count_attribute_slots(var->type,
- false);
- unsigned vec4_size = 4 * sizeof(float);
-
- if (var->data.mode != nir_var_uniform)
- continue;
-
- declare_uniform_range(c, var->data.driver_location * vec4_size,
- vec4_count * vec4_size);
-
- }
-}
-
/**
* Sets up the mapping from nir_register to struct qreg *.
*
@@ -1717,7 +1677,7 @@ ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
*/
struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
for (int i = 0; i < instr->def.num_components; i++)
- qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
+ qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
_mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}
@@ -1761,26 +1721,239 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
static void
-ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
- unsigned offset;
+ assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
- switch (instr->intrinsic) {
- case nir_intrinsic_load_uniform:
- if (nir_src_is_const(instr->src[0])) {
- int offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- assert(offset % 4 == 0);
- /* We need dwords */
- offset = offset / 4;
- for (int i = 0; i < instr->num_components; i++) {
- ntq_store_dest(c, &instr->dest, i,
- vir_uniform(c, QUNIFORM_UNIFORM,
- offset + i));
- }
+ int rt = nir_src_as_uint(instr->src[0]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ int sample_index = nir_intrinsic_base(instr) ;
+ assert(sample_index < V3D_MAX_SAMPLES);
+
+ int component = nir_intrinsic_component(instr);
+ assert(component < 4);
+
+ /* We need to emit our TLB reads after we have acquired the scoreboard
+ * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+ * the last thread switch to improve parallelism, however, that is only
+ * guaranteed to happen before the tlb color writes.
+ *
+ * To fix that, we make sure we always emit a thread switch before the
+ * first tlb color read. If that happens to be the last thread switch
+ * we emit, then everything is fine, but otherwsie, if any code after
+ * this point needs to emit additional thread switches, then we will
+ * switch the strategy to locking the scoreboard on the first thread
+ * switch instead -- see vir_emit_thrsw().
+ */
+ if (!c->emitted_tlb_load) {
+ if (!c->last_thrsw_at_top_level) {
+ assert(c->devinfo->ver >= 41);
+ vir_emit_thrsw(c);
+ }
+
+ c->emitted_tlb_load = true;
+ }
+
+ struct qreg *color_reads_for_sample =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
+
+ if (color_reads_for_sample[component].file == QFILE_NULL) {
+ enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
+ int num_components =
+ util_format_get_nr_components(rt_format);
+
+ const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
+ if (swap_rb)
+ num_components = MAX2(num_components, 3);
+
+ nir_variable *var = c->output_color_var[rt];
+ enum glsl_base_type type = glsl_get_base_type(var->type);
+
+ bool is_int_format = type == GLSL_TYPE_INT ||
+ type == GLSL_TYPE_UINT;
+
+ bool is_32b_tlb_format = is_int_format ||
+ (c->fs_key->f32_color_rb & (1 << rt));
+
+ int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
+
+ uint32_t conf = 0xffffff00;
+ conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
+ TLB_SAMPLE_MODE_PER_PIXEL;
+ conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
+
+ if (is_32b_tlb_format) {
+ /* The F32 vs I32 distinction was dropped in 4.2. */
+ conf |= (c->devinfo->ver < 42 && is_int_format) ?
+ TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
+
+ conf |= ((num_components - 1) <<
+ TLB_VEC_SIZE_MINUS_1_SHIFT);
} else {
- ntq_emit_tmu_general(c, instr, false);
+ conf |= TLB_TYPE_F16_COLOR;
+ conf |= TLB_F16_SWAP_HI_LO;
+
+ if (num_components >= 3)
+ conf |= TLB_VEC_SIZE_4_F16;
+ else
+ conf |= TLB_VEC_SIZE_2_F16;
}
+
+
+ for (int i = 0; i < num_samples; i++) {
+ struct qreg r, g, b, a;
+ if (is_32b_tlb_format) {
+ r = conf != 0xffffffff && i == 0?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ if (num_components >= 2)
+ g = vir_TLB_COLOR_READ(c);
+ if (num_components >= 3)
+ b = vir_TLB_COLOR_READ(c);
+ if (num_components >= 4)
+ a = vir_TLB_COLOR_READ(c);
+ } else {
+ struct qreg rg = conf != 0xffffffff && i == 0 ?
+ vir_TLBU_COLOR_READ(c, conf) :
+ vir_TLB_COLOR_READ(c);
+ r = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[r.index], 0,
+ V3D_QPU_UNPACK_L);
+ g = vir_FMOV(c, rg);
+ vir_set_unpack(c->defs[g.index], 0,
+ V3D_QPU_UNPACK_H);
+
+ if (num_components > 2) {
+ struct qreg ba = vir_TLB_COLOR_READ(c);
+ b = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[b.index], 0,
+ V3D_QPU_UNPACK_L);
+ a = vir_FMOV(c, ba);
+ vir_set_unpack(c->defs[a.index], 0,
+ V3D_QPU_UNPACK_H);
+ }
+ }
+
+ struct qreg *color_reads =
+ &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
+
+ color_reads[0] = swap_rb ? b : r;
+ if (num_components >= 2)
+ color_reads[1] = g;
+ if (num_components >= 3)
+ color_reads[2] = swap_rb ? r : b;
+ if (num_components >= 4)
+ color_reads[3] = a;
+ }
+ }
+
+ assert(color_reads_for_sample[component].file != QFILE_NULL);
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_MOV(c, color_reads_for_sample[component]));
+}
+
+static void
+ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ if (nir_src_is_const(instr->src[0])) {
+ int offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[0]));
+ assert(offset % 4 == 0);
+ /* We need dwords */
+ offset = offset / 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ ntq_store_dest(c, &instr->dest, i,
+ vir_uniform(c, QUNIFORM_UNIFORM,
+ offset + i));
+ }
+ } else {
+ ntq_emit_tmu_general(c, instr, false);
+ }
+}
+
+static void
+ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
+ * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
+ */
+ unsigned offset =
+ nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
+
+ if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
+ /* Emit the LDVPM directly now, rather than at the top
+ * of the shader like we did for V3D 3.x (which needs
+ * vpmsetup when not just taking the next offset).
+ *
+ * Note that delaying like this may introduce stalls,
+ * as LDVPMV takes a minimum of 1 instruction but may
+ * be slower if the VPM unit is busy with another QPU.
+ */
+ int index = 0;
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
+ index++;
+ }
+ if (c->s->info.system_values_read &
+ (1ull << SYSTEM_VALUE_VERTEX_ID)) {
+ index++;
+ }
+ for (int i = 0; i < offset; i++)
+ index += c->vattr_sizes[i];
+ index += nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ struct qreg vpm_offset = vir_uniform_ui(c, index++);
+ ntq_store_dest(c, &instr->dest, i,
+ vir_LDVPMV_IN(c, vpm_offset));
+ }
+ } else {
+ for (int i = 0; i < instr->num_components; i++) {
+ int comp = nir_intrinsic_component(instr) + i;
+ ntq_store_dest(c, &instr->dest, i,
+ vir_MOV(c, c->inputs[offset * 4 + comp]));
+ }
+ }
+}
+
+static void
+ntq_emit_per_sample_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
+
+ unsigned rt = nir_src_as_uint(instr->src[1]);
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ unsigned sample_idx = nir_intrinsic_base(instr);
+ assert(sample_idx < V3D_MAX_SAMPLES);
+
+ unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
+ for (int i = 0; i < instr->num_components; i++) {
+ c->sample_colors[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+ntq_emit_color_write(struct v3d_compile *c,
+ nir_intrinsic_instr *instr)
+{
+ unsigned offset = (nir_intrinsic_base(instr) +
+ nir_src_as_uint(instr->src[1])) * 4 +
+ nir_intrinsic_component(instr);
+ for (int i = 0; i < instr->num_components; i++) {
+ c->outputs[offset + i] =
+ vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ }
+}
+
+static void
+ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_load_uniform:
+ ntq_emit_load_uniform(c, instr);
break;
case nir_intrinsic_load_ubo:
@@ -1814,6 +1987,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_shared_atomic_comp_swap:
case nir_intrinsic_load_shared:
case nir_intrinsic_store_shared:
+ case nir_intrinsic_load_scratch:
+ case nir_intrinsic_store_scratch:
ntq_emit_tmu_general(c, instr, true);
break;
@@ -1845,6 +2020,26 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
break;
+ case nir_intrinsic_load_viewport_x_scale:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
+ break;
+
+ case nir_intrinsic_load_viewport_y_scale:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
+ break;
+
+ case nir_intrinsic_load_viewport_z_scale:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
+ break;
+
+ case nir_intrinsic_load_viewport_z_offset:
+ ntq_store_dest(c, &instr->dest, 0,
+ vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
+ break;
+
case nir_intrinsic_load_alpha_ref_float:
ntq_store_dest(c, &instr->dest, 0,
vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
@@ -1855,7 +2050,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_helper_invocation:
- vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
ntq_store_dest(c, &instr->dest, 0,
vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
vir_uniform_ui(c, ~0),
@@ -1880,27 +2075,32 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
break;
+ case nir_intrinsic_load_tlb_color_v3d:
+ vir_emit_tlb_color_read(c, instr);
+ break;
+
case nir_intrinsic_load_input:
- for (int i = 0; i < instr->num_components; i++) {
- offset = (nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[0]));
- int comp = nir_intrinsic_component(instr) + i;
- ntq_store_dest(c, &instr->dest, i,
- vir_MOV(c, c->inputs[offset * 4 + comp]));
- }
+ ntq_emit_load_input(c, instr);
break;
- case nir_intrinsic_store_output:
- offset = ((nir_intrinsic_base(instr) +
- nir_src_as_uint(instr->src[1])) * 4 +
- nir_intrinsic_component(instr));
+ case nir_intrinsic_store_tlb_sample_color_v3d:
+ ntq_emit_per_sample_color_write(c, instr);
+ break;
- for (int i = 0; i < instr->num_components; i++) {
- c->outputs[offset + i] =
- vir_MOV(c, ntq_get_src(c, instr->src[0], i));
+ case nir_intrinsic_store_output:
+ /* XXX perf: Use stvpmv with uniform non-constant offsets and
+ * stvpmd with non-uniform offsets and enable
+ * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
+ */
+ if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
+ ntq_emit_color_write(c, instr);
+ } else {
+ assert(instr->num_components == 1);
+
+ vir_VPM_WRITE(c,
+ ntq_get_src(c, instr->src[0], 0),
+ nir_intrinsic_base(instr));
}
- c->num_outputs = MAX2(c->num_outputs,
- offset + instr->num_components);
break;
case nir_intrinsic_image_deref_size:
@@ -1908,38 +2108,35 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_discard:
- if (c->execute.file != QFILE_NULL) {
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ if (vir_in_nonuniform_control_flow(c)) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0)),
V3D_QPU_COND_IFA);
} else {
- vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_SETMSF_dest(c, vir_nop_reg(),
vir_uniform_ui(c, 0));
}
break;
case nir_intrinsic_discard_if: {
- /* true (~0) if we're discarding */
- struct qreg cond = ntq_get_src(c, instr->src[0], 0);
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
- if (c->execute.file != QFILE_NULL) {
- /* execute == 0 means the channel is active. Invert
- * the condition so that we can use zero as "executing
- * and discarding."
- */
- vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
- V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
- vir_uniform_ui(c, 0)),
- V3D_QPU_COND_IFA);
- } else {
- vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
- vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
- vir_uniform_ui(c, 0)),
- V3D_QPU_COND_IFNA);
+ if (vir_in_nonuniform_control_flow(c)) {
+ struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
+ c->execute);
+ if (cond == V3D_QPU_COND_IFA) {
+ vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ);
+ } else {
+ vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ);
+ cond = V3D_QPU_COND_IFA;
+ }
}
+ vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
+ vir_uniform_ui(c, 0)), cond);
+
break;
}
@@ -1948,6 +2145,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
case nir_intrinsic_memory_barrier_buffer:
case nir_intrinsic_memory_barrier_image:
case nir_intrinsic_memory_barrier_shared:
+ case nir_intrinsic_group_memory_barrier:
/* We don't do any instruction scheduling of these NIR
* instructions between each other, so we just need to make
* sure that the TMU operations before the barrier are flushed
@@ -1970,10 +2168,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_BARRIERID_dest(c,
vir_reg(QFILE_MAGIC,
V3D_QPU_WADDR_SYNCU));
- sync->src[vir_get_implicit_uniform_src(sync)] =
- vir_uniform_ui(c,
- 0xffffff00 |
- V3D_TSY_WAIT_INC_CHECK);
+ sync->uniform =
+ vir_get_uniform_index(c, QUNIFORM_CONSTANT,
+ 0xffffff00 |
+ V3D_TSY_WAIT_INC_CHECK);
}
@@ -2010,6 +2208,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_uniform_ui(c, 0xffff)));
break;
+ case nir_intrinsic_load_subgroup_id:
+ ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+ break;
+
default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);
@@ -2030,7 +2232,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
{
- vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0),
+ vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
c->execute, vir_uniform_ui(c, c->cur_block->index)),
V3D_QPU_PF_PUSHZ);
@@ -2054,14 +2256,7 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
else_block = vir_new_block(c);
/* Set up the flags for the IF condition (taking the THEN branch). */
- nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
- enum v3d_qpu_cond cond;
- if (!if_condition_alu ||
- !ntq_emit_comparison(c, if_condition_alu, &cond)) {
- vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
- V3D_QPU_PF_PUSHZ);
- cond = V3D_QPU_COND_IFNA;
- }
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
/* Jump to ELSE. */
vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
@@ -2081,7 +2276,6 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
/* Emit the else block. */
vir_set_emit_block(c, else_block);
- ntq_activate_execute_for_block(c);
ntq_emit_cf_list(c, &if_stmt->else_list);
}
@@ -2107,20 +2301,13 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
else_block = vir_new_block(c);
bool was_uniform_control_flow = false;
- if (c->execute.file == QFILE_NULL) {
+ if (!vir_in_nonuniform_control_flow(c)) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
was_uniform_control_flow = true;
}
/* Set up the flags for the IF condition (taking the THEN branch). */
- nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition);
- enum v3d_qpu_cond cond;
- if (!if_condition_alu ||
- !ntq_emit_comparison(c, if_condition_alu, &cond)) {
- vir_PF(c, ntq_get_src(c, if_stmt->condition, 0),
- V3D_QPU_PF_PUSHZ);
- cond = V3D_QPU_COND_IFNA;
- }
+ enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
/* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
* was previously active (execute Z) for updating the exec flags.
@@ -2128,8 +2315,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
if (was_uniform_control_flow) {
cond = v3d_qpu_cond_invert(cond);
} else {
- struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0),
- c->execute);
+ struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
if (cond == V3D_QPU_COND_IFA) {
vir_set_uf(inst, V3D_QPU_UF_NORNZ);
} else {
@@ -2145,7 +2331,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
/* Jump to ELSE if nothing is active for THEN, otherwise fall
* through.
*/
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
vir_link_blocks(c->cur_block, else_block);
vir_link_blocks(c->cur_block, then_block);
@@ -2159,14 +2345,16 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
* active channels update their execute flags to point to
* ENDIF
*/
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, after_block->index));
/* If everything points at ENDIF, then jump there immediately. */
- vir_PF(c, vir_XOR(c, c->execute,
- vir_uniform_ui(c, after_block->index)),
- V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, after_block->index)),
+ V3D_QPU_PF_PUSHZ);
vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
vir_link_blocks(c->cur_block, after_block);
vir_link_blocks(c->cur_block, else_block);
@@ -2190,7 +2378,7 @@ ntq_emit_if(struct v3d_compile *c, nir_if *nif)
{
bool was_in_control_flow = c->in_control_flow;
c->in_control_flow = true;
- if (c->execute.file == QFILE_NULL &&
+ if (!vir_in_nonuniform_control_flow(c) &&
nir_src_is_dynamically_uniform(nif->condition)) {
ntq_emit_uniform_if(c, nif);
} else {
@@ -2204,13 +2392,15 @@ ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
{
switch (jump->type) {
case nir_jump_break:
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, c->loop_break_block->index));
break;
case nir_jump_continue:
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
vir_uniform_ui(c, c->loop_cont_block->index));
break;
@@ -2277,7 +2467,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
c->in_control_flow = true;
bool was_uniform_control_flow = false;
- if (c->execute.file == QFILE_NULL) {
+ if (!vir_in_nonuniform_control_flow(c)) {
c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
was_uniform_control_flow = true;
}
@@ -2299,13 +2489,14 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
*
* XXX: Use the .ORZ flags update, instead.
*/
- vir_PF(c, vir_XOR(c,
- c->execute,
- vir_uniform_ui(c, c->loop_cont_block->index)),
- V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_XOR_dest(c,
+ vir_nop_reg(),
+ c->execute,
+ vir_uniform_ui(c, c->loop_cont_block->index)),
+ V3D_QPU_PF_PUSHZ);
vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
- vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
/* Pixels that were not dispatched or have been discarded should not
@@ -2380,15 +2571,17 @@ nir_to_vir(struct v3d_compile *c)
c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
- /* XXX perf: We could set the "disable implicit point/line
- * varyings" field in the shader record and not emit these, if
- * they're not going to be used.
+ /* V3D 4.x can disable implicit point coordinate varyings if
+ * they are not used.
*/
- if (c->fs_key->is_points) {
+ if (c->fs_key->is_points &&
+ (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
c->point_x = emit_fragment_varying(c, NULL, 0, 0);
c->point_y = emit_fragment_varying(c, NULL, 0, 0);
- } else if (c->fs_key->is_lines) {
+ c->uses_implicit_point_line_varyings = true;
+ } else if (c->fs_key->is_lines && c->devinfo->ver < 40) {
c->line_x = emit_fragment_varying(c, NULL, 0, 0);
+ c->uses_implicit_point_line_varyings = true;
}
break;
case MESA_SHADER_COMPUTE:
@@ -2398,16 +2591,8 @@ nir_to_vir(struct v3d_compile *c)
V3D_QPU_WADDR_SYNC));
}
- if (c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) |
- (1ull << SYSTEM_VALUE_WORK_GROUP_ID))) {
- c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
- }
- if ((c->s->info.system_values_read &
- ((1ull << SYSTEM_VALUE_WORK_GROUP_ID))) ||
- c->s->info.cs.shared_size) {
- c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
- }
+ c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+ c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
/* Set up the division between gl_LocalInvocationIndex and
* wg_in_mem in the payload reg.
@@ -2444,14 +2629,17 @@ nir_to_vir(struct v3d_compile *c)
break;
}
+ if (c->s->scratch_size) {
+ v3d_setup_spill_base(c);
+ c->spill_size += V3D_CHANNELS * c->s->scratch_size;
+ }
+
if (c->s->info.stage == MESA_SHADER_FRAGMENT)
ntq_setup_fs_inputs(c);
else
ntq_setup_vpm_inputs(c);
ntq_setup_outputs(c);
- ntq_setup_uniforms(c);
- ntq_setup_registers(c, &c->s->registers);
/* Find the main function and emit the body. */
nir_foreach_function(function, c->s) {
@@ -2465,12 +2653,13 @@ const nir_shader_compiler_options v3d_nir_options = {
.lower_all_io_to_temps = true,
.lower_extract_byte = true,
.lower_extract_word = true,
- .lower_bfm = true,
.lower_bitfield_insert_to_shifts = true,
.lower_bitfield_extract_to_shifts = true,
.lower_bitfield_reverse = true,
.lower_bit_count = true,
.lower_cs_local_id_from_index = true,
+ .lower_ffract = true,
+ .lower_fmod = true,
.lower_pack_unorm_2x16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_unorm_4x8 = true,
@@ -2487,10 +2676,11 @@ const nir_shader_compiler_options v3d_nir_options = {
.lower_fsat = true,
.lower_fsqrt = true,
.lower_ifind_msb = true,
+ .lower_isign = true,
.lower_ldexp = true,
.lower_mul_high = true,
.lower_wpos_pntc = true,
- .native_integers = true,
+ .lower_rotate = true,
};
/**
@@ -2595,6 +2785,8 @@ v3d_nir_to_vir(struct v3d_compile *c)
case MESA_SHADER_VERTEX:
emit_vert_end(c);
break;
+ case MESA_SHADER_COMPUTE:
+ break;
default:
unreachable("bad stage");
}
@@ -2609,7 +2801,6 @@ v3d_nir_to_vir(struct v3d_compile *c)
}
vir_optimize(c);
- vir_lower_uniforms(c);
vir_check_payload_w(c);
@@ -2659,5 +2850,15 @@ v3d_nir_to_vir(struct v3d_compile *c)
vir_remove_thrsw(c);
}
+ if (c->spills &&
+ (V3D_DEBUG & (V3D_DEBUG_VIR |
+ v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
+ fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id);
+ vir_dump(c);
+ fprintf(stderr, "\n");
+ }
+
v3d_vir_to_qpu(c, temp_registers);
}
diff --git a/lib/mesa/src/broadcom/compiler/qpu_schedule.c b/lib/mesa/src/broadcom/compiler/qpu_schedule.c
index 0f8001ff5..c15218e26 100644
--- a/lib/mesa/src/broadcom/compiler/qpu_schedule.c
+++ b/lib/mesa/src/broadcom/compiler/qpu_schedule.c
@@ -37,18 +37,16 @@
#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
+#include "util/dag.h"
static bool debug;
struct schedule_node_child;
struct schedule_node {
+ struct dag_node dag;
struct list_head link;
struct qinst *inst;
- struct schedule_node_child *children;
- uint32_t child_count;
- uint32_t child_array_size;
- uint32_t parent_count;
/* Longest cycles + instruction_latency() of any parent of this node. */
uint32_t unblocked_time;
@@ -67,11 +65,6 @@ struct schedule_node {
uint32_t latency;
};
-struct schedule_node_child {
- struct schedule_node *node;
- bool write_after_read;
-};
-
/* When walking the instructions in reverse, we need to swap before/after in
* add_dep().
*/
@@ -79,6 +72,7 @@ enum direction { F, R };
struct schedule_state {
const struct v3d_device_info *devinfo;
+ struct dag *dag;
struct schedule_node *last_r[6];
struct schedule_node *last_rf[64];
struct schedule_node *last_sf;
@@ -101,37 +95,17 @@ add_dep(struct schedule_state *state,
bool write)
{
bool write_after_read = !write && state->dir == R;
+ void *edge_data = (void *)(uintptr_t)write_after_read;
if (!before || !after)
return;
assert(before != after);
- if (state->dir == R) {
- struct schedule_node *t = before;
- before = after;
- after = t;
- }
-
- for (int i = 0; i < before->child_count; i++) {
- if (before->children[i].node == after &&
- (before->children[i].write_after_read == write_after_read)) {
- return;
- }
- }
-
- if (before->child_array_size <= before->child_count) {
- before->child_array_size = MAX2(before->child_array_size * 2, 16);
- before->children = reralloc(before, before->children,
- struct schedule_node_child,
- before->child_array_size);
- }
-
- before->children[before->child_count].node = after;
- before->children[before->child_count].write_after_read =
- write_after_read;
- before->child_count++;
- after->parent_count++;
+ if (state->dir == F)
+ dag_add_edge(&before->dag, &after->dag, edge_data);
+ else
+ dag_add_edge(&after->dag, &before->dag, edge_data);
}
static void
@@ -154,6 +128,9 @@ add_write_dep(struct schedule_state *state,
static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
+ if (inst->sig.ldtlb || inst->sig.ldtlbu)
+ return true;
+
if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
return false;
@@ -179,7 +156,10 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
break;
case V3D_QPU_MUX_B:
- add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
+ if (!n->inst->qpu.sig.small_imm) {
+ add_read_dep(state,
+ state->last_rf[n->inst->qpu.raddr_b], n);
+ }
break;
default:
add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
@@ -402,7 +382,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
add_write_dep(state, &state->last_tmu_config, n);
if (inst->sig.ldtlb | inst->sig.ldtlbu)
- add_read_dep(state, state->last_tlb, n);
+ add_write_dep(state, &state->last_tlb, n);
if (inst->sig.ldvpm) {
add_write_dep(state, &state->last_vpm_read, n);
@@ -415,7 +395,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
}
/* inst->sig.ldunif or sideband uniform read */
- if (qinst->uniform != ~0)
+ if (vir_has_uniform(qinst))
add_write_dep(state, &state->last_unif, n);
if (v3d_qpu_reads_flags(inst))
@@ -425,11 +405,13 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
}
static void
-calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
+calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
+ struct list_head *schedule_list)
{
struct schedule_state state;
memset(&state, 0, sizeof(state));
+ state.dag = dag;
state.devinfo = c->devinfo;
state.dir = F;
@@ -438,23 +420,28 @@ calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
}
static void
-calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
+calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
+ struct list_head *schedule_list)
{
- struct list_head *node;
struct schedule_state state;
memset(&state, 0, sizeof(state));
+ state.dag = dag;
state.devinfo = c->devinfo;
state.dir = R;
- for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
+ list_for_each_entry_rev(struct schedule_node, node, schedule_list,
+ link) {
calculate_deps(&state, (struct schedule_node *)node);
}
}
struct choose_scoreboard {
+ struct dag *dag;
int tick;
int last_magic_sfu_write_tick;
+ int last_stallable_sfu_reg;
+ int last_stallable_sfu_tick;
int last_ldvary_tick;
int last_uniforms_reset_tick;
int last_thrsw_tick;
@@ -546,6 +533,38 @@ pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
}
+static bool
+qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+ uint32_t waddr) {
+
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+ inst->raddr_a == waddr)
+ return true;
+
+ if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+ !inst->sig.small_imm && (inst->raddr_b == waddr))
+ return true;
+
+ return false;
+}
+
+static bool
+mux_read_stalls(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
+{
+ return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
+ qpu_instruction_uses_rf(inst,
+ scoreboard->last_stallable_sfu_reg);
+}
+
+/* We define a max schedule priority to allow negative priorities as result of
+ * substracting this max when an instruction stalls. So instructions that
+ * stall have lower priority than regular instructions. */
+#define MAX_SCHEDULE_PRIORITY 16
+
static int
get_instruction_priority(const struct v3d_qpu_instr *inst)
{
@@ -564,10 +583,6 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
return next_score;
next_score++;
- /* XXX perf: We should schedule SFU ALU ops so that the reader is 2
- * instructions after the producer if possible, not just 1.
- */
-
/* Default score for things that aren't otherwise special. */
baseline_score = next_score;
next_score++;
@@ -577,6 +592,9 @@ get_instruction_priority(const struct v3d_qpu_instr *inst)
return next_score;
next_score++;
+ /* We should increase the maximum if we assert here */
+ assert(next_score < MAX_SCHEDULE_PRIORITY);
+
return baseline_score;
}
@@ -623,6 +641,37 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
}
static bool
+qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
+ const struct v3d_qpu_instr *a,
+ const struct v3d_qpu_instr *b)
+{
+ const bool a_uses_peripheral = qpu_accesses_peripheral(a);
+ const bool b_uses_peripheral = qpu_accesses_peripheral(b);
+
+ /* We can always do one peripheral access per instruction. */
+ if (!a_uses_peripheral || !b_uses_peripheral)
+ return true;
+
+ if (devinfo->ver < 41)
+ return false;
+
+ /* V3D 4.1 and later allow TMU read along with a VPM read or write, and
+ * WRTMUC with a TMU magic register write (other than tmuc).
+ */
+ if ((a->sig.ldtmu && v3d_qpu_uses_vpm(b)) ||
+ (b->sig.ldtmu && v3d_qpu_uses_vpm(a))) {
+ return true;
+ }
+
+ if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) ||
+ (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) {
+ return true;
+ }
+
+ return false;
+}
+
+static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
struct v3d_qpu_instr *result,
const struct v3d_qpu_instr *a,
@@ -633,12 +682,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
return false;
}
- /* Can't do more than one peripheral access in an instruction.
- *
- * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
- * WRTMUC with a TMU magic register write (other than tmuc).
- */
- if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
+ if (!qpu_compatible_peripheral_access(devinfo, a, b))
return false;
struct v3d_qpu_instr merge = *a;
@@ -714,7 +758,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
struct choose_scoreboard *scoreboard,
- struct list_head *schedule_list,
struct schedule_node *prev_inst)
{
struct schedule_node *chosen = NULL;
@@ -728,7 +771,8 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
return NULL;
}
- list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+ list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
+ dag.link) {
const struct v3d_qpu_instr *inst = &n->inst->qpu;
/* Don't choose the branch instruction until it's the last one
@@ -736,7 +780,7 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
* choose it.
*/
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
- !list_is_singular(schedule_list)) {
+ !list_is_singular(&scoreboard->dag->heads)) {
continue;
}
@@ -805,6 +849,18 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
int prio = get_instruction_priority(inst);
+ if (mux_read_stalls(scoreboard, inst)) {
+ /* Don't merge an instruction that stalls */
+ if (prev_inst)
+ continue;
+ else {
+ /* Any instruction that don't stall will have
+ * higher scheduling priority */
+ prio -= MAX_SCHEDULE_PRIORITY;
+ assert(prio < 0);
+ }
+ }
+
/* Found a valid instruction. If nothing better comes along,
* this one works.
*/
@@ -841,6 +897,16 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
}
static void
+update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
+ const struct v3d_qpu_instr *inst)
+{
+ if (v3d_qpu_instr_is_sfu(inst)) {
+ scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
+ scoreboard->last_stallable_sfu_tick = scoreboard->tick;
+ }
+}
+
+static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
const struct v3d_qpu_instr *inst)
{
@@ -853,6 +919,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
if (inst->alu.add.magic_write) {
update_scoreboard_for_magic_waddr(scoreboard,
inst->alu.add.waddr);
+ } else {
+ update_scoreboard_for_sfu_stall_waddr(scoreboard,
+ inst);
}
}
@@ -871,24 +940,24 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
}
static void
-dump_state(const struct v3d_device_info *devinfo,
- struct list_head *schedule_list)
+dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
{
- list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+ list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
fprintf(stderr, " t=%4d: ", n->unblocked_time);
v3d_qpu_dump(devinfo, &n->inst->qpu);
fprintf(stderr, "\n");
- for (int i = 0; i < n->child_count; i++) {
- struct schedule_node *child = n->children[i].node;
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
if (!child)
continue;
fprintf(stderr, " - ");
v3d_qpu_dump(devinfo, &child->inst->qpu);
fprintf(stderr, " (%d parents, %c)\n",
- child->parent_count,
- n->children[i].write_after_read ? 'w' : 'r');
+ child->dag.parent_count,
+ edge->data ? 'w' : 'r');
}
}
}
@@ -952,64 +1021,64 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after)
after_inst));
}
+ if (v3d_qpu_instr_is_sfu(before_inst))
+ return 2;
+
return latency;
}
/** Recursive computation of the delay member of a node. */
static void
-compute_delay(struct schedule_node *n)
+compute_delay(struct dag_node *node, void *state)
{
- if (!n->child_count) {
- n->delay = 1;
- } else {
- for (int i = 0; i < n->child_count; i++) {
- if (!n->children[i].node->delay)
- compute_delay(n->children[i].node);
- n->delay = MAX2(n->delay,
- n->children[i].node->delay +
- instruction_latency(n, n->children[i].node));
- }
+ struct schedule_node *n = (struct schedule_node *)node;
+
+ n->delay = 1;
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ struct schedule_node *child =
+ (struct schedule_node *)edge->child;
+
+ n->delay = MAX2(n->delay, (child->delay +
+ instruction_latency(n, child)));
}
}
+/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head()
+ * should be called on it later to finish pruning the other edges).
+ */
static void
-mark_instruction_scheduled(struct list_head *schedule_list,
+pre_remove_head(struct dag *dag, struct schedule_node *n)
+{
+ list_delinit(&n->dag.link);
+
+ util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
+ if (edge->data)
+ dag_remove_edge(dag, edge);
+ }
+}
+
+static void
+mark_instruction_scheduled(struct dag *dag,
uint32_t time,
- struct schedule_node *node,
- bool war_only)
+ struct schedule_node *node)
{
if (!node)
return;
- for (int i = node->child_count - 1; i >= 0; i--) {
+ util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
struct schedule_node *child =
- node->children[i].node;
+ (struct schedule_node *)edge->child;
if (!child)
continue;
- if (war_only && !node->children[i].write_after_read)
- continue;
-
- /* If the requirement is only that the node not appear before
- * the last read of its destination, then it can be scheduled
- * immediately after (or paired with!) the thing reading the
- * destination.
- */
- uint32_t latency = 0;
- if (!war_only) {
- latency = instruction_latency(node,
- node->children[i].node);
- }
+ uint32_t latency = instruction_latency(node, child);
child->unblocked_time = MAX2(child->unblocked_time,
time + latency);
- child->parent_count--;
- if (child->parent_count == 0)
- list_add(&child->link, schedule_list);
-
- node->children[i].node = NULL;
}
+ dag_prune_head(dag, &node->dag);
}
static void
@@ -1028,7 +1097,7 @@ insert_scheduled_instruction(struct v3d_compile *c,
static struct qinst *
vir_nop()
{
- struct qreg undef = { QFILE_NULL, 0 };
+ struct qreg undef = vir_nop_reg();
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
return qinst;
@@ -1223,7 +1292,6 @@ static uint32_t
schedule_instructions(struct v3d_compile *c,
struct choose_scoreboard *scoreboard,
struct qblock *block,
- struct list_head *schedule_list,
enum quniform_contents *orig_uniform_contents,
uint32_t *orig_uniform_data,
uint32_t *next_uniform)
@@ -1231,23 +1299,10 @@ schedule_instructions(struct v3d_compile *c,
const struct v3d_device_info *devinfo = c->devinfo;
uint32_t time = 0;
- if (debug) {
- fprintf(stderr, "initial deps:\n");
- dump_state(devinfo, schedule_list);
- fprintf(stderr, "\n");
- }
-
- /* Remove non-DAG heads from the list. */
- list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
- if (n->parent_count != 0)
- list_del(&n->link);
- }
-
- while (!list_empty(schedule_list)) {
+ while (!list_empty(&scoreboard->dag->heads)) {
struct schedule_node *chosen =
choose_instruction_to_schedule(devinfo,
scoreboard,
- schedule_list,
NULL);
struct schedule_node *merge = NULL;
@@ -1260,7 +1315,7 @@ schedule_instructions(struct v3d_compile *c,
if (debug) {
fprintf(stderr, "t=%4d: current list:\n",
time);
- dump_state(devinfo, schedule_list);
+ dump_state(devinfo, scoreboard->dag);
fprintf(stderr, "t=%4d: chose: ", time);
v3d_qpu_dump(devinfo, inst);
fprintf(stderr, "\n");
@@ -1278,17 +1333,14 @@ schedule_instructions(struct v3d_compile *c,
*/
if (chosen) {
time = MAX2(chosen->unblocked_time, time);
- list_del(&chosen->link);
- mark_instruction_scheduled(schedule_list, time,
- chosen, true);
+ pre_remove_head(scoreboard->dag, chosen);
while ((merge =
choose_instruction_to_schedule(devinfo,
scoreboard,
- schedule_list,
chosen))) {
time = MAX2(merge->unblocked_time, time);
- list_del(&merge->link);
+ pre_remove_head(scoreboard->dag, chosen);
list_addtail(&merge->link, &merged_list);
(void)qpu_merge_inst(devinfo, inst,
inst, &merge->inst->qpu);
@@ -1307,6 +1359,8 @@ schedule_instructions(struct v3d_compile *c,
fprintf(stderr, "\n");
}
}
+ if (mux_read_stalls(scoreboard, inst))
+ c->qpu_inst_stalled_count++;
}
/* Update the uniform index for the rewritten location --
@@ -1334,11 +1388,10 @@ schedule_instructions(struct v3d_compile *c,
* be scheduled. Update the children's unblocked time for this
* DAG edge as we do so.
*/
- mark_instruction_scheduled(schedule_list, time, chosen, false);
+ mark_instruction_scheduled(scoreboard->dag, time, chosen);
list_for_each_entry(struct schedule_node, merge, &merged_list,
link) {
- mark_instruction_scheduled(schedule_list, time, merge,
- false);
+ mark_instruction_scheduled(scoreboard->dag, time, merge);
/* The merged VIR instruction doesn't get re-added to the
* block, so free it now.
@@ -1380,9 +1433,10 @@ qpu_schedule_instructions_block(struct v3d_compile *c,
uint32_t *next_uniform)
{
void *mem_ctx = ralloc_context(NULL);
- struct list_head schedule_list;
+ scoreboard->dag = dag_create(mem_ctx);
+ struct list_head setup_list;
- list_inithead(&schedule_list);
+ list_inithead(&setup_list);
/* Wrap each instruction in a scheduler structure. */
while (!list_empty(&block->instructions)) {
@@ -1390,26 +1444,25 @@ qpu_schedule_instructions_block(struct v3d_compile *c,
struct schedule_node *n =
rzalloc(mem_ctx, struct schedule_node);
+ dag_init_node(scoreboard->dag, &n->dag);
n->inst = qinst;
list_del(&qinst->link);
- list_addtail(&n->link, &schedule_list);
+ list_addtail(&n->link, &setup_list);
}
- calculate_forward_deps(c, &schedule_list);
- calculate_reverse_deps(c, &schedule_list);
+ calculate_forward_deps(c, scoreboard->dag, &setup_list);
+ calculate_reverse_deps(c, scoreboard->dag, &setup_list);
- list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
- compute_delay(n);
- }
+ dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);
uint32_t cycles = schedule_instructions(c, scoreboard, block,
- &schedule_list,
orig_uniform_contents,
orig_uniform_data,
next_uniform);
ralloc_free(mem_ctx);
+ scoreboard->dag = NULL;
return cycles;
}
@@ -1491,6 +1544,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_magic_sfu_write_tick = -10;
scoreboard.last_uniforms_reset_tick = -10;
scoreboard.last_thrsw_tick = -10;
+ scoreboard.last_stallable_sfu_tick = -10;
if (debug) {
fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/lib/mesa/src/broadcom/compiler/v3d33_tex.c b/lib/mesa/src/broadcom/compiler/v3d33_tex.c
index 7e9cd27d3..488021bfc 100644
--- a/lib/mesa/src/broadcom/compiler/v3d33_tex.c
+++ b/lib/mesa/src/broadcom/compiler/v3d33_tex.c
@@ -106,18 +106,16 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
break;
case nir_tex_src_offset: {
- nir_const_value *offset =
- nir_src_as_const_value(instr->src[i].src);
p0_unpacked.texel_offset_for_s_coordinate =
- offset->i32[0];
+ nir_src_comp_as_int(instr->src[i].src, 0);
if (instr->coord_components >= 2)
p0_unpacked.texel_offset_for_t_coordinate =
- offset->i32[1];
+ nir_src_comp_as_int(instr->src[i].src, 1);
if (instr->coord_components >= 3)
p0_unpacked.texel_offset_for_r_coordinate =
- offset->i32[2];
+ nir_src_comp_as_int(instr->src[i].src, 2);
break;
}
@@ -161,11 +159,10 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
unit));
}
- struct qreg texture_u[] = {
- vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
- vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
+ int texture_u[] = {
+ vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
+ vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
};
- uint32_t next_texture_u = 0;
for (int i = 0; i < next_coord; i++) {
struct qreg dst;
@@ -177,11 +174,8 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]);
- if (i < 2) {
- tmu->has_implicit_uniform = true;
- tmu->src[vir_get_implicit_uniform_src(tmu)] =
- texture_u[next_texture_u++];
- }
+ if (i < 2)
+ tmu->uniform = texture_u[i];
}
vir_emit_thrsw(c);
diff --git a/lib/mesa/src/broadcom/compiler/v3d40_tex.c b/lib/mesa/src/broadcom/compiler/v3d40_tex.c
index 9f5c56079..1c39289b6 100644
--- a/lib/mesa/src/broadcom/compiler/v3d40_tex.c
+++ b/lib/mesa/src/broadcom/compiler/v3d40_tex.c
@@ -48,8 +48,7 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data
{
struct qinst *inst = vir_NOP(c);
inst->qpu.sig.wrtmuc = true;
- inst->has_implicit_uniform = true;
- inst->src[0] = vir_uniform(c, contents, data);
+ inst->uniform = vir_get_uniform_index(c, contents, data);
}
static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = {
@@ -139,14 +138,13 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
case nir_tex_src_offset: {
if (nir_src_is_const(instr->src[i].src)) {
- nir_const_value *offset =
- nir_src_as_const_value(instr->src[i].src);
-
- p2_unpacked.offset_s = offset->i32[0];
+ p2_unpacked.offset_s = nir_src_comp_as_int(instr->src[i].src, 0);
if (instr->coord_components >= 2)
- p2_unpacked.offset_t = offset->i32[1];
- if (instr->coord_components >= 3)
- p2_unpacked.offset_r = offset->i32[2];
+ p2_unpacked.offset_t =
+ nir_src_comp_as_int(instr->src[i].src, 1);
+ if (non_array_components >= 3)
+ p2_unpacked.offset_r =
+ nir_src_comp_as_int(instr->src[i].src, 2);
} else {
struct qreg mask = vir_uniform_ui(c, 0xf);
struct qreg x, y, offset;
@@ -185,6 +183,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
assert(p1_unpacked.output_type_32_bit ||
p0_unpacked.return_words_of_texture_data < (1 << 2));
+ assert(p0_unpacked.return_words_of_texture_data != 0);
+
uint32_t p0_packed;
V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
@@ -243,6 +243,34 @@ type_size_align_1(const struct glsl_type *type, unsigned *size, unsigned *align)
*align = 1;
}
+static uint32_t
+v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr)
+{
+ switch (instr->intrinsic) {
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_store:
+ return V3D_TMU_OP_REGULAR;
+ case nir_intrinsic_image_deref_atomic_add:
+ return v3d_get_op_for_atomic_add(instr, 3);
+ case nir_intrinsic_image_deref_atomic_min:
+ return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
+ case nir_intrinsic_image_deref_atomic_max:
+ return V3D_TMU_OP_WRITE_UMAX;
+ case nir_intrinsic_image_deref_atomic_and:
+ return V3D_TMU_OP_WRITE_AND_READ_INC;
+ case nir_intrinsic_image_deref_atomic_or:
+ return V3D_TMU_OP_WRITE_OR_READ_DEC;
+ case nir_intrinsic_image_deref_atomic_xor:
+ return V3D_TMU_OP_WRITE_XOR_READ_NOT;
+ case nir_intrinsic_image_deref_atomic_exchange:
+ return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
+ default:
+ unreachable("unknown image intrinsic");
+ };
+}
+
void
v3d40_vir_emit_image_load_store(struct v3d_compile *c,
nir_intrinsic_instr *instr)
@@ -264,42 +292,15 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 };
- /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
- * wants to have support for inc/dec?
- */
- switch (instr->intrinsic) {
- case nir_intrinsic_image_deref_load:
- case nir_intrinsic_image_deref_store:
- p2_unpacked.op = V3D_TMU_OP_REGULAR;
- break;
- case nir_intrinsic_image_deref_atomic_add:
- p2_unpacked.op = V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
- break;
- case nir_intrinsic_image_deref_atomic_min:
- p2_unpacked.op = V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
- break;
+ p2_unpacked.op = v3d40_image_load_store_tmu_op(instr);
- case nir_intrinsic_image_deref_atomic_max:
- p2_unpacked.op = V3D_TMU_OP_WRITE_UMAX;
- break;
- case nir_intrinsic_image_deref_atomic_and:
- p2_unpacked.op = V3D_TMU_OP_WRITE_AND_READ_INC;
- break;
- case nir_intrinsic_image_deref_atomic_or:
- p2_unpacked.op = V3D_TMU_OP_WRITE_OR_READ_DEC;
- break;
- case nir_intrinsic_image_deref_atomic_xor:
- p2_unpacked.op = V3D_TMU_OP_WRITE_XOR_READ_NOT;
- break;
- case nir_intrinsic_image_deref_atomic_exchange:
- p2_unpacked.op = V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
- break;
- case nir_intrinsic_image_deref_atomic_comp_swap:
- p2_unpacked.op = V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
- break;
- default:
- unreachable("unknown image intrinsic");
- };
+ /* If we were able to replace atomic_add for an inc/dec, then we
+ * need/can to do things slightly different, like not loading the
+ * amount to add/sub, as that is implicit.
+ */
+ bool atomic_add_replaced = (instr->intrinsic == nir_intrinsic_image_deref_atomic_add &&
+ (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC ||
+ p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC));
bool is_1d = false;
switch (glsl_get_sampler_dim(sampler_type)) {
@@ -368,7 +369,8 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit the data writes for atomics or image store. */
- if (instr->intrinsic != nir_intrinsic_image_deref_load) {
+ if (instr->intrinsic != nir_intrinsic_image_deref_load &&
+ !atomic_add_replaced) {
/* Vector for stores, or first atomic argument */
struct qreg src[4];
for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) {
@@ -386,9 +388,21 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
}
}
+ if (vir_in_nonuniform_control_flow(c) &&
+ instr->intrinsic != nir_intrinsic_image_deref_load) {
+ vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
+ V3D_QPU_PF_PUSHZ);
+ }
+
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, ntq_get_src(c, instr->src[1], 0),
&tmu_writes);
+ if (vir_in_nonuniform_control_flow(c) &&
+ instr->intrinsic != nir_intrinsic_image_deref_load) {
+ struct qinst *last_inst= (struct qinst *)c->cur_block->instructions.prev;
+ vir_set_cond(last_inst, V3D_QPU_COND_IFA);
+ }
+
vir_emit_thrsw(c);
/* The input FIFO has 16 slots across all threads, so make sure we
diff --git a/lib/mesa/src/broadcom/compiler/v3d_compiler.h b/lib/mesa/src/broadcom/compiler/v3d_compiler.h
index 671aba3c5..b61119f56 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_compiler.h
+++ b/lib/mesa/src/broadcom/compiler/v3d_compiler.h
@@ -69,9 +69,6 @@ enum qfile {
* or physical registers later.
*/
QFILE_TEMP,
- QFILE_UNIF,
- QFILE_TLB,
- QFILE_TLBU,
/**
* VPM reads use this with an index value to say what part of the VPM
@@ -105,6 +102,16 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index)
return (struct qreg){file, index};
}
+static inline struct qreg vir_magic_reg(uint32_t index)
+{
+ return (struct qreg){QFILE_MAGIC, index};
+}
+
+static inline struct qreg vir_nop_reg(void)
+{
+ return (struct qreg){QFILE_NULL, 0};
+}
+
/**
* A reference to an actual register at the QPU level, for register
* allocation.
@@ -129,12 +136,11 @@ struct qinst {
/* Pre-register-allocation references to src/dst registers */
struct qreg dst;
struct qreg src[3];
- bool cond_is_exec_mask;
- bool has_implicit_uniform;
bool is_last_thrsw;
- /* After vir_to_qpu.c: If instr reads a uniform, which uniform from
- * the uncompiled stream it is.
+ /* If the instruction reads a uniform (other than through src[i].file
+ * == QFILE_UNIF), that uniform's index in c->uniform_contents. ~0
+ * otherwise.
*/
int uniform;
};
@@ -275,17 +281,18 @@ enum quniform_contents {
QUNIFORM_SHARED_OFFSET,
};
-static inline uint32_t v3d_tmu_config_data_create(uint32_t unit, uint32_t value)
+static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value)
{
+ assert(value < (1 << 24));
return unit << 24 | value;
}
-static inline uint32_t v3d_tmu_config_data_get_unit(uint32_t data)
+static inline uint32_t v3d_unit_data_get_unit(uint32_t data)
{
return data >> 24;
}
-static inline uint32_t v3d_tmu_config_data_get_value(uint32_t data)
+static inline uint32_t v3d_unit_data_get_offset(uint32_t data)
{
return data & 0xffffff;
}
@@ -311,25 +318,6 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot)
return slot.slot_and_component & 3;
}
-struct v3d_ubo_range {
- /**
- * offset in bytes from the start of the ubo where this range is
- * uploaded.
- *
- * Only set once used is set.
- */
- uint32_t dst_offset;
-
- /**
- * offset in bytes from the start of the gallium uniforms where the
- * data comes from.
- */
- uint32_t src_offset;
-
- /** size in bytes of this ubo range */
- uint32_t size;
-};
-
struct v3d_key {
void *shader_state;
struct {
@@ -357,7 +345,8 @@ struct v3d_fs_key {
bool sample_alpha_to_one;
bool clamp_color;
bool shade_model_flat;
- uint8_t nr_cbufs;
+ /* Mask of which color render targets are present. */
+ uint8_t cbufs;
uint8_t swap_color_rb;
/* Mask of which render targets need to be written as 32-bit floats */
uint8_t f32_color_rb;
@@ -366,6 +355,15 @@ struct v3d_fs_key {
*/
uint8_t int_color_rb;
uint8_t uint_color_rb;
+
+ /* Color format information per render target. Only set when logic
+ * operations are enabled.
+ */
+ struct {
+ enum pipe_format format;
+ const uint8_t *swizzle;
+ } color_fmt[V3D_MAX_DRAW_BUFFERS];
+
uint8_t alpha_test_func;
uint8_t logicop_func;
uint32_t point_sprite_mask;
@@ -413,6 +411,8 @@ struct qblock {
/** @{ used by v3d_vir_live_variables.c */
BITSET_WORD *def;
+ BITSET_WORD *defin;
+ BITSET_WORD *defout;
BITSET_WORD *use;
BITSET_WORD *live_in;
BITSET_WORD *live_out;
@@ -469,6 +469,8 @@ vir_after_block(struct qblock *block)
struct v3d_compiler {
const struct v3d_device_info *devinfo;
struct ra_regs *regs;
+ unsigned int reg_class_any[3];
+ unsigned int reg_class_r5[3];
unsigned int reg_class_phys[3];
unsigned int reg_class_phys_or_acc[3];
};
@@ -502,8 +504,8 @@ struct v3d_compile {
struct qreg *inputs;
struct qreg *outputs;
bool msaa_per_sample_output;
- struct qreg color_reads[V3D_MAX_SAMPLES];
- struct qreg sample_colors[V3D_MAX_SAMPLES];
+ struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
+ struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4];
uint32_t inputs_array_size;
uint32_t outputs_array_size;
uint32_t uniforms_array_size;
@@ -520,13 +522,7 @@ struct v3d_compile {
bool uses_center_w;
bool writes_z;
-
- struct v3d_ubo_range *ubo_ranges;
- bool *ubo_range_used;
- uint32_t ubo_ranges_array_size;
- /** Number of uniform areas tracked in ubo_ranges. */
- uint32_t num_ubo_ranges;
- uint32_t next_ubo_dst_offset;
+ bool uses_implicit_point_line_varyings;
/* State for whether we're executing on each channel currently. 0 if
* yes, otherwise a block number + 1 that the channel jumped to.
@@ -556,7 +552,7 @@ struct v3d_compile {
int local_invocation_index_bits;
uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4];
- uint32_t num_vpm_writes;
+ uint32_t vpm_output_size;
/* Size in bytes of registers that have been spilled. This is how much
* space needs to be available in the spill BO per thread per QPU.
@@ -600,10 +596,8 @@ struct v3d_compile {
enum quniform_contents *uniform_contents;
uint32_t uniform_array_size;
uint32_t num_uniforms;
- uint32_t num_outputs;
uint32_t output_position_index;
nir_variable *output_color_var[4];
- uint32_t output_point_size_index;
uint32_t output_sample_mask_index;
struct qreg undef;
@@ -619,24 +613,13 @@ struct v3d_compile {
uint64_t *qpu_insts;
uint32_t qpu_inst_count;
uint32_t qpu_inst_size;
+ uint32_t qpu_inst_stalled_count;
/* For the FS, the number of varying inputs not counting the
* point/line varyings payload
*/
uint32_t num_inputs;
- /**
- * Number of inputs from num_inputs remaining to be queued to the read
- * FIFO in the VS/CS.
- */
- uint32_t num_inputs_remaining;
-
- /* Number of inputs currently in the read FIFO for the VS/CS */
- uint32_t num_inputs_in_fifo;
-
- /** Next offset in the VPM to read from in the VS/CS */
- uint32_t vpm_read_offset;
-
uint32_t program_id;
uint32_t variant_id;
@@ -652,6 +635,9 @@ struct v3d_compile {
struct qinst *last_thrsw;
bool last_thrsw_at_top_level;
+ bool emitted_tlb_load;
+ bool lock_scoreboard_on_first_thrsw;
+
bool failed;
};
@@ -664,12 +650,8 @@ struct v3d_uniform_list {
struct v3d_prog_data {
struct v3d_uniform_list uniforms;
- struct v3d_ubo_range *ubo_ranges;
- uint32_t num_ubo_ranges;
- uint32_t ubo_size;
uint32_t spill_size;
- uint8_t num_inputs;
uint8_t threads;
/* For threads > 1, whether the program should be dispatched in the
@@ -717,17 +699,25 @@ struct v3d_fs_prog_data {
uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
+ uint8_t num_inputs;
bool writes_z;
bool disable_ez;
bool uses_center_w;
+ bool uses_implicit_point_line_varyings;
+ bool lock_scoreboard_on_first_thrsw;
};
-/* Special nir_load_input intrinsic index for loading the current TLB
- * destination color.
- */
-#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000
+struct v3d_compute_prog_data {
+ struct v3d_prog_data base;
+ /* Size in bytes of the workgroup's shared space. */
+ uint32_t shared_size;
+};
-#define V3D_NIR_MS_MASK_OUTPUT 2000000000
+static inline bool
+vir_has_uniform(struct qinst *inst)
+{
+ return inst->uniform != ~0;
+}
extern const nir_shader_compiler_options v3d_nir_options;
@@ -758,12 +748,17 @@ struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst,
struct qreg src0, struct qreg src1);
struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst,
struct qreg src0, struct qreg src1);
-struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0);
+struct qinst *vir_branch_inst(struct v3d_compile *c,
+ enum v3d_qpu_branch_cond cond);
void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst);
+uint32_t vir_get_uniform_index(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data);
struct qreg vir_uniform(struct v3d_compile *c,
enum quniform_contents contents,
uint32_t data);
void vir_schedule_instructions(struct v3d_compile *c);
+void v3d_setup_spill_base(struct v3d_compile *c);
struct v3d_qpu_instr v3d_qpu_nop(void);
struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst);
@@ -777,9 +772,6 @@ void vir_set_unpack(struct qinst *inst, int src,
struct qreg vir_get_temp(struct v3d_compile *c);
void vir_emit_last_thrsw(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
-bool vir_has_implicit_uniform(struct qinst *inst);
-int vir_get_implicit_uniform_src(struct qinst *inst);
-int vir_get_non_sideband_nsrc(struct qinst *inst);
int vir_get_nsrc(struct qinst *inst);
bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst);
bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op);
@@ -788,7 +780,6 @@ bool vir_is_raw_mov(struct qinst *inst);
bool vir_is_tex(struct qinst *inst);
bool vir_is_add(struct qinst *inst);
bool vir_is_mul(struct qinst *inst);
-bool vir_is_float_input(struct qinst *inst);
bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
@@ -810,10 +801,13 @@ bool vir_opt_constant_folding(struct v3d_compile *c);
bool vir_opt_copy_propagate(struct v3d_compile *c);
bool vir_opt_dead_code(struct v3d_compile *c);
bool vir_opt_peephole_sf(struct v3d_compile *c);
+bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
+void v3d_nir_lower_scratch(nir_shader *s);
void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_image_load_store(nir_shader *s);
void vir_lower_uniforms(struct v3d_compile *c);
@@ -833,7 +827,8 @@ bool vir_init_reg_sets(struct v3d_compiler *compiler);
bool v3d_gl_format_is_return_32(GLenum format);
-void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
+uint32_t
+v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src);
static inline bool
quniform_contents_is_texture_p0(enum quniform_contents contents)
@@ -843,6 +838,12 @@ quniform_contents_is_texture_p0(enum quniform_contents contents)
V3D_MAX_TEXTURE_SAMPLERS));
}
+static inline bool
+vir_in_nonuniform_control_flow(struct v3d_compile *c)
+{
+ return c->execute.file != QFILE_NULL;
+}
+
static inline struct qreg
vir_uniform_ui(struct v3d_compile *c, uint32_t ui)
{
@@ -1086,6 +1087,30 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1)
return vir_UMUL24(c, src0, src1);
}
+static inline struct qreg
+vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config)
+{
+ assert(c->devinfo->ver >= 41); /* XXX */
+ assert((config & 0xffffff00) == 0xffffff00);
+
+ struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldtlb->qpu.sig.ldtlbu = true;
+ ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
+ return vir_emit_def(c, ldtlb);
+}
+
+static inline struct qreg
+vir_TLB_COLOR_READ(struct v3d_compile *c)
+{
+ assert(c->devinfo->ver >= 41); /* XXX */
+
+ struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef,
+ c->undef, c->undef);
+ ldtlb->qpu.sig.ldtlb = true;
+ return vir_emit_def(c, ldtlb);
+}
+
/*
static inline struct qreg
vir_LOAD_IMM(struct v3d_compile *c, uint32_t val)
@@ -1114,7 +1139,7 @@ static inline struct qinst *
vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
/* The actual uniform_data value will be set at scheduling time */
- return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0)));
+ return vir_emit_nondef(c, vir_branch_inst(c, cond));
}
#define vir_for_each_block(block, c) \
@@ -1143,4 +1168,8 @@ vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
vir_for_each_block(_block, c) \
vir_for_each_inst(inst, _block)
+#define vir_for_each_inst_inorder_safe(inst, c) \
+ vir_for_each_block(_block, c) \
+ vir_for_each_inst_safe(inst, _block)
+
#endif /* V3D_COMPILER_H */
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c
index b65a82b7f..2a68efb7b 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c
@@ -28,11 +28,47 @@
* Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
* intrinsics into something amenable to the V3D architecture.
*
- * After moving more and more logic to NIR, all that's left here is fixing up
- * addressing on uniform loads. FS input and VS output scalarization is
- * handled by nir_lower_io_to_scalar().
+ * Most of the work is turning the VS's store_output intrinsics from working
+ * on a base representing the gallium-level vec4 driver_location to an offset
+ * within the VPM, and emitting the header that's read by the fixed function
+ * hardware between the VS and FS.
+ *
+ * We also adjust the offsets on uniform loads to be in bytes, since that's
+ * what we need for indirect addressing with general TMU access.
*/
+struct v3d_nir_lower_io_state {
+ int pos_vpm_offset;
+ int vp_vpm_offset;
+ int zs_vpm_offset;
+ int rcp_wc_vpm_offset;
+ int psiz_vpm_offset;
+ int varyings_vpm_offset;
+
+ BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+ nir_ssa_def *pos[4];
+};
+
+static void
+v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan)
+{
+ nir_intrinsic_instr *intr =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+ nir_ssa_dest_init(&intr->instr, &intr->dest,
+ 1, intr->dest.ssa.bit_size, NULL);
+ intr->num_components = 1;
+
+ intr->src[0] = nir_src_for_ssa(chan);
+ intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+
+ nir_intrinsic_set_base(intr, base);
+ nir_intrinsic_set_write_mask(intr, 0x1);
+ nir_intrinsic_set_component(intr, 0);
+
+ nir_builder_instr_insert(b, &intr->instr);
+}
+
/* Convert the uniform offset to bytes. If it happens to be a constant,
* constant-folding will clean up the shift for us.
*/
@@ -50,9 +86,90 @@ v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
nir_imm_int(b, 4))));
}
+static int
+v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
+{
+ int component = var->data.location_frac + chan;
+
+ for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
+ struct v3d_varying_slot slot = c->vs_key->fs_inputs[i];
+
+ if (v3d_slot_get_slot(slot) == var->data.location &&
+ v3d_slot_get_component(slot) == component) {
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+/* Lowers a store_output(gallium driver location) to a series of store_outputs
+ * with a driver_location equal to the offset in the VPM.
+ */
+static void
+v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
+ nir_intrinsic_instr *intr,
+ struct v3d_nir_lower_io_state *state)
+{
+ b->cursor = nir_before_instr(&intr->instr);
+
+ int start_comp = nir_intrinsic_component(intr);
+ nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
+ intr->num_components);
+
+ nir_variable *var = NULL;
+ nir_foreach_variable(scan_var, &c->s->outputs) {
+ if (scan_var->data.driver_location != nir_intrinsic_base(intr) ||
+ start_comp < scan_var->data.location_frac ||
+ start_comp >= scan_var->data.location_frac +
+ glsl_get_components(scan_var->type)) {
+ continue;
+ }
+ var = scan_var;
+ }
+
+ /* Save off the components of the position for the setup of VPM inputs
+ * read by fixed function HW.
+ */
+ if (var->data.location == VARYING_SLOT_POS) {
+ for (int i = 0; i < intr->num_components; i++) {
+ state->pos[start_comp + i] = nir_channel(b, src, i);
+ }
+ }
+
+ /* Just write psiz to the position in the FF header right now. */
+ if (var->data.location == VARYING_SLOT_PSIZ &&
+ state->psiz_vpm_offset != -1) {
+ v3d_nir_store_output(b, state->psiz_vpm_offset, src);
+ }
+
+ /* Scalarize outputs if it hasn't happened already, since we want to
+ * schedule each VPM write individually. We can skip any output
+ * components not read by the FS.
+ */
+ for (int i = 0; i < intr->num_components; i++) {
+ int vpm_offset =
+ v3d_varying_slot_vpm_offset(c, var,
+ i +
+ start_comp -
+ var->data.location_frac);
+
+ if (vpm_offset == -1)
+ continue;
+
+ BITSET_SET(state->varyings_stored, vpm_offset);
+
+ v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
+ nir_channel(b, src, i));
+ }
+
+ nir_instr_remove(&intr->instr);
+}
+
static void
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
- struct nir_instr *instr)
+ struct nir_instr *instr,
+ struct v3d_nir_lower_io_state *state)
{
if (instr->type != nir_instr_type_intrinsic)
return;
@@ -63,33 +180,171 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
v3d_nir_lower_uniform(c, b, intr);
break;
+ case nir_intrinsic_store_output:
+ if (c->s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_lower_vpm_output(c, b, intr, state);
+ break;
+
default:
break;
}
}
-static bool
-v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl)
+/* Remap the output var's .driver_location. This is purely for
+ * nir_print_shader() so that store_output can map back to a variable name.
+ */
+static void
+v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
+ struct v3d_nir_lower_io_state *state)
+{
+ nir_foreach_variable_safe(var, &c->s->outputs) {
+ if (var->data.location == VARYING_SLOT_POS &&
+ state->pos_vpm_offset != -1) {
+ var->data.driver_location = state->pos_vpm_offset;
+ continue;
+ }
+
+ if (var->data.location == VARYING_SLOT_PSIZ &&
+ state->psiz_vpm_offset != -1) {
+ var->data.driver_location = state->psiz_vpm_offset;
+ continue;
+ }
+
+ int vpm_offset = v3d_varying_slot_vpm_offset(c, var, 0);
+ if (vpm_offset != -1) {
+ var->data.driver_location =
+ state->varyings_vpm_offset + vpm_offset;
+ } else {
+ /* If we couldn't find a mapping for the var, delete
+ * it so that its old .driver_location doesn't confuse
+ * nir_print_shader().
+ */
+ exec_node_remove(&var->node);
+ }
+ }
+}
+
+static void
+v3d_nir_setup_vpm_layout(struct v3d_compile *c,
+ struct v3d_nir_lower_io_state *state)
+{
+ uint32_t vpm_offset = 0;
+
+ if (c->vs_key->is_coord) {
+ state->pos_vpm_offset = vpm_offset;
+ vpm_offset += 4;
+ } else {
+ state->pos_vpm_offset = -1;
+ }
+
+ state->vp_vpm_offset = vpm_offset;
+ vpm_offset += 2;
+
+ if (!c->vs_key->is_coord) {
+ state->zs_vpm_offset = vpm_offset++;
+ state->rcp_wc_vpm_offset = vpm_offset++;
+ } else {
+ state->zs_vpm_offset = -1;
+ state->rcp_wc_vpm_offset = -1;
+ }
+
+ if (c->vs_key->per_vertex_point_size)
+ state->psiz_vpm_offset = vpm_offset++;
+ else
+ state->psiz_vpm_offset = -1;
+
+ state->varyings_vpm_offset = vpm_offset;
+
+ c->vpm_output_size = vpm_offset + c->vs_key->num_fs_inputs;
+}
+
+static void
+v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
+ struct v3d_nir_lower_io_state *state)
{
- nir_builder b;
- nir_builder_init(&b, impl);
+ for (int i = 0; i < 4; i++) {
+ if (!state->pos[i])
+ state->pos[i] = nir_ssa_undef(b, 1, 32);
+ }
+
+ nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);
+
+ if (state->pos_vpm_offset != -1) {
+ for (int i = 0; i < 4; i++) {
+ v3d_nir_store_output(b, state->pos_vpm_offset + i,
+ state->pos[i]);
+ }
+ }
- nir_foreach_block(block, impl) {
- nir_foreach_instr_safe(instr, block)
- v3d_nir_lower_io_instr(c, &b, instr);
+ for (int i = 0; i < 2; i++) {
+ nir_ssa_def *pos;
+ nir_ssa_def *scale;
+ pos = state->pos[i];
+ if (i == 0)
+ scale = nir_load_viewport_x_scale(b);
+ else
+ scale = nir_load_viewport_y_scale(b);
+ pos = nir_fmul(b, pos, scale);
+ pos = nir_fmul(b, pos, rcp_wc);
+ pos = nir_f2i32(b, nir_fround_even(b, pos));
+ v3d_nir_store_output(b, state->vp_vpm_offset + i,
+ pos);
}
- nir_metadata_preserve(impl, nir_metadata_block_index |
- nir_metadata_dominance);
+ if (state->zs_vpm_offset != -1) {
+ nir_ssa_def *z = state->pos[2];
+ z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
+ z = nir_fmul(b, z, rcp_wc);
+ z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
+ v3d_nir_store_output(b, state->zs_vpm_offset, z);
+ }
+
+ if (state->rcp_wc_vpm_offset != -1)
+ v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc);
- return true;
+ /* Store 0 to varyings requested by the FS but not stored in the VS.
+ * This should be undefined behavior, but glsl-routing seems to rely
+ * on it.
+ */
+ for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
+ if (!BITSET_TEST(state->varyings_stored, i)) {
+ v3d_nir_store_output(b, state->varyings_vpm_offset + i,
+ nir_imm_int(b, 0));
+ }
+ }
}
void
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
{
+ struct v3d_nir_lower_io_state state = { 0 };
+
+ /* Set up the layout of the VPM outputs. */
+ if (s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_setup_vpm_layout(c, &state);
+
nir_foreach_function(function, s) {
- if (function->impl)
- v3d_nir_lower_io_impl(c, function->impl);
+ if (function->impl) {
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block)
+ v3d_nir_lower_io_instr(c, &b, instr,
+ &state);
+ }
+
+ nir_block *last = nir_impl_last_block(function->impl);
+ b.cursor = nir_after_block(last);
+ if (s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
+
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
}
+
+ if (s->info.stage == MESA_SHADER_VERTEX)
+ v3d_nir_lower_io_update_output_var_base(c, &state);
}
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
new file mode 100644
index 000000000..5c3a7c58a
--- /dev/null
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright © 2019 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Implements lowering for logical operations.
+ *
+ * V3D doesn't have any hardware support for logic ops. Instead, you read the
+ * current contents of the destination from the tile buffer, then do math using
+ * your output color and that destination value, and update the output color
+ * appropriately.
+ */
+
+#include "util/u_format.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+#include "v3d_compiler.h"
+
+
+typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c);
+typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c);
+
+static bool
+logicop_depends_on_dst_color(int logicop_func)
+{
+ switch (logicop_func) {
+ case PIPE_LOGICOP_SET:
+ case PIPE_LOGICOP_CLEAR:
+ case PIPE_LOGICOP_COPY:
+ case PIPE_LOGICOP_COPY_INVERTED:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static nir_ssa_def *
+v3d_logicop(nir_builder *b, int logicop_func,
+ nir_ssa_def *src, nir_ssa_def *dst)
+{
+ switch (logicop_func) {
+ case PIPE_LOGICOP_CLEAR:
+ return nir_imm_int(b, 0);
+ case PIPE_LOGICOP_NOR:
+ return nir_inot(b, nir_ior(b, src, dst));
+ case PIPE_LOGICOP_AND_INVERTED:
+ return nir_iand(b, nir_inot(b, src), dst);
+ case PIPE_LOGICOP_COPY_INVERTED:
+ return nir_inot(b, src);
+ case PIPE_LOGICOP_AND_REVERSE:
+ return nir_iand(b, src, nir_inot(b, dst));
+ case PIPE_LOGICOP_INVERT:
+ return nir_inot(b, dst);
+ case PIPE_LOGICOP_XOR:
+ return nir_ixor(b, src, dst);
+ case PIPE_LOGICOP_NAND:
+ return nir_inot(b, nir_iand(b, src, dst));
+ case PIPE_LOGICOP_AND:
+ return nir_iand(b, src, dst);
+ case PIPE_LOGICOP_EQUIV:
+ return nir_inot(b, nir_ixor(b, src, dst));
+ case PIPE_LOGICOP_NOOP:
+ return dst;
+ case PIPE_LOGICOP_OR_INVERTED:
+ return nir_ior(b, nir_inot(b, src), dst);
+ case PIPE_LOGICOP_OR_REVERSE:
+ return nir_ior(b, src, nir_inot(b, dst));
+ case PIPE_LOGICOP_OR:
+ return nir_ior(b, src, dst);
+ case PIPE_LOGICOP_SET:
+ return nir_imm_int(b, ~0);
+ default:
+ fprintf(stderr, "Unknown logic op %d\n", logicop_func);
+ /* FALLTHROUGH */
+ case PIPE_LOGICOP_COPY:
+ return src;
+ }
+}
+
+static nir_ssa_def *
+v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz)
+{
+ switch (swiz) {
+ default:
+ case PIPE_SWIZZLE_NONE:
+ fprintf(stderr, "warning: unknown swizzle\n");
+ /* FALLTHROUGH */
+ case PIPE_SWIZZLE_0:
+ return nir_imm_float(b, 0.0);
+ case PIPE_SWIZZLE_1:
+ return nir_imm_float(b, 1.0);
+ case PIPE_SWIZZLE_X:
+ case PIPE_SWIZZLE_Y:
+ case PIPE_SWIZZLE_Z:
+ case PIPE_SWIZZLE_W:
+ return srcs[swiz];
+ }
+}
+
+static nir_ssa_def *
+v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans,
+ const uint8_t *swiz, nir_pack_func pack_func)
+{
+ nir_ssa_def *c[4];
+ for (int i = 0; i < 4; i++)
+ c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]);
+
+ return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3]));
+}
+
+static nir_ssa_def *
+v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed,
+ const uint8_t *swiz, nir_unpack_func unpack_func)
+{
+ nir_ssa_def *unpacked = unpack_func(b, packed);
+
+ nir_ssa_def *unpacked_chans[4];
+ for (int i = 0; i < 4; i++)
+ unpacked_chans[i] = nir_channel(b, unpacked, i);
+
+ nir_ssa_def *c[4];
+ for (int i = 0; i < 4; i++)
+ c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]);
+
+ return nir_vec4(b, c[0], c[1], c[2], c[3]);
+}
+
+static nir_ssa_def *
+pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+{
+ const unsigned bits[4] = { 10, 10, 10, 2 };
+ nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits);
+
+ nir_ssa_def *chans[4];
+ for (int i = 0; i < 4; i++)
+ chans[i] = nir_channel(b, unorm, i);
+
+ nir_ssa_def *result = nir_mov(b, chans[0]);
+ int offset = bits[0];
+ for (int i = 1; i < 4; i++) {
+ nir_ssa_def *shifted_chan =
+ nir_ishl(b, chans[i], nir_imm_int(b, offset));
+ result = nir_ior(b, result, shifted_chan);
+ offset += bits[i];
+ }
+ return result;
+}
+
+static nir_ssa_def *
+unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c)
+{
+ const unsigned bits[4] = { 10, 10, 10, 2 };
+ const unsigned masks[4] = { BITFIELD_MASK(bits[0]),
+ BITFIELD_MASK(bits[1]),
+ BITFIELD_MASK(bits[2]),
+ BITFIELD_MASK(bits[3]) };
+
+ nir_ssa_def *chans[4];
+ for (int i = 0; i < 4; i++) {
+ nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i]));
+ chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]);
+ c = nir_ushr(b, c, nir_imm_int(b, bits[i]));
+ }
+
+ return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);
+}
+
+static const uint8_t *
+v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt)
+{
+ static const uint8_t ident[4] = { 0, 1, 2, 3 };
+
+ /* We will automatically swap R and B channels for BGRA formats
+ * on tile loads and stores (see 'swap_rb' field in v3d_resource) so
+ * we want to treat these surfaces as if they were regular RGBA formats.
+ */
+ if (c->fs_key->color_fmt[rt].swizzle[0] == 2 &&
+ c->fs_key->color_fmt[rt].format != PIPE_FORMAT_B5G6R5_UNORM) {
+ return ident;
+ } else {
+ return c->fs_key->color_fmt[rt].swizzle;
+ }
+}
+
+static nir_ssa_def *
+v3d_nir_get_tlb_color(nir_builder *b, int rt, int sample)
+{
+ nir_ssa_def *color[4];
+ for (int i = 0; i < 4; i++) {
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_load_tlb_color_v3d);
+ load->num_components = 1;
+ nir_intrinsic_set_base(load, sample);
+ nir_intrinsic_set_component(load, i);
+ load->src[0] = nir_src_for_ssa(nir_imm_int(b, rt));
+ nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+ nir_builder_instr_insert(b, &load->instr);
+ color[i] = &load->dest.ssa;
+ }
+
+ return nir_vec4(b, color[0], color[1], color[2], color[3]);
+}
+
+static nir_ssa_def *
+v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ int rt, int sample)
+{
+ const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
+
+ nir_ssa_def *op_res[4];
+ for (int i = 0; i < 4; i++) {
+ nir_ssa_def *src = src_chans[i];
+ nir_ssa_def *dst =
+ v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]);
+ op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst);
+ }
+
+ nir_ssa_def *r[4];
+ for (int i = 0; i < 4; i++)
+ r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]);
+
+ return nir_vec4(b, r[0], r[1], r[2], r[3]);
+}
+
+static nir_ssa_def *
+v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def **src_chans, nir_ssa_def **dst_chans,
+ int rt, int sample,
+ nir_pack_func pack_func, nir_unpack_func unpack_func)
+{
+ const uint8_t src_swz[4] = { 0, 1, 2, 3 };
+ nir_ssa_def *packed_src =
+ v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func);
+
+ const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt);
+ nir_ssa_def *packed_dst =
+ v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func);
+
+ nir_ssa_def *packed_result =
+ v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst);
+
+ return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func);
+}
+
+static nir_ssa_def *
+v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def *src, int rt, int sample)
+{
+ nir_ssa_def *dst = v3d_nir_get_tlb_color(b, rt, sample);
+
+ nir_ssa_def *src_chans[4], *dst_chans[4];
+ for (unsigned i = 0; i < 4; i++) {
+ src_chans[i] = nir_channel(b, src, i);
+ dst_chans[i] = nir_channel(b, dst, i);
+ }
+
+ if (c->fs_key->color_fmt[rt].format == PIPE_FORMAT_R10G10B10A2_UNORM) {
+ return v3d_emit_logic_op_unorm(
+ c, b, src_chans, dst_chans, rt, 0,
+ pack_unorm_rgb10a2, unpack_unorm_rgb10a2);
+ }
+
+ if (util_format_is_unorm(c->fs_key->color_fmt[rt].format)) {
+ return v3d_emit_logic_op_unorm(
+ c, b, src_chans, dst_chans, rt, 0,
+ nir_pack_unorm_4x8, nir_unpack_unorm_4x8);
+ }
+
+ return v3d_emit_logic_op_raw(c, b, src_chans, dst_chans, rt, 0);
+}
+
+static void
+v3d_emit_ms_output(struct v3d_compile *c, nir_builder *b,
+ nir_ssa_def *color, nir_src *offset,
+ nir_alu_type type, int rt, int sample)
+{
+
+ nir_intrinsic_instr *store =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_store_tlb_sample_color_v3d);
+ store->num_components = 4;
+ nir_intrinsic_set_base(store, sample);
+ nir_intrinsic_set_component(store, 0);
+ nir_intrinsic_set_type(store, type);
+ store->src[0] = nir_src_for_ssa(color);
+ store->src[1] = nir_src_for_ssa(nir_imm_int(b, rt));
+ nir_builder_instr_insert(b, &store->instr);
+}
+
+static void
+v3d_nir_lower_logic_op_instr(struct v3d_compile *c,
+ nir_builder *b,
+ nir_intrinsic_instr *intr,
+ int rt)
+{
+ nir_ssa_def *frag_color = intr->src[0].ssa;
+
+
+ const int logic_op = c->fs_key->logicop_func;
+ if (c->fs_key->msaa && logicop_depends_on_dst_color(logic_op)) {
+ c->msaa_per_sample_output = true;
+
+ nir_src *offset = &intr->src[1];
+ nir_alu_type type = nir_intrinsic_type(intr);
+ for (int i = 0; i < V3D_MAX_SAMPLES; i++) {
+ nir_ssa_def *sample =
+ v3d_nir_emit_logic_op(c, b, frag_color, rt, i);
+
+ v3d_emit_ms_output(c, b, sample, offset, type, rt, i);
+ }
+
+ nir_instr_remove(&intr->instr);
+ } else {
+ nir_ssa_def *result =
+ v3d_nir_emit_logic_op(c, b, frag_color, rt, 0);
+
+ nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+ nir_src_for_ssa(result));
+ intr->num_components = result->num_components;
+ }
+}
+
+static bool
+v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c)
+{
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (intr->intrinsic != nir_intrinsic_store_output)
+ continue;
+
+ nir_foreach_variable(var, &c->s->outputs) {
+ const int driver_loc = var->data.driver_location;
+ if (driver_loc != nir_intrinsic_base(intr))
+ continue;
+
+ const int loc = var->data.location;
+ if (loc != FRAG_RESULT_COLOR &&
+ (loc < FRAG_RESULT_DATA0 ||
+ loc >= FRAG_RESULT_DATA0 + V3D_MAX_DRAW_BUFFERS)) {
+ continue;
+ }
+
+ /* Logic operations do not apply on floating point or
+ * sRGB enabled render targets.
+ */
+ const int rt = driver_loc;
+ assert(rt < V3D_MAX_DRAW_BUFFERS);
+
+ const enum pipe_format format =
+ c->fs_key->color_fmt[rt].format;
+ if (util_format_is_float(format) ||
+ util_format_is_srgb(format)) {
+ continue;
+ }
+
+ nir_function_impl *impl =
+ nir_cf_node_get_function(&block->cf_node);
+ nir_builder b;
+ nir_builder_init(&b, impl);
+ b.cursor = nir_before_instr(&intr->instr);
+ v3d_nir_lower_logic_op_instr(c, &b, intr, rt);
+ }
+ }
+
+ return true;
+}
+
+void
+v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c)
+{
+ /* Nothing to do if logic op is 'copy src to dst' or if logic ops are
+ * disabled (we set the logic op to copy in that case).
+ */
+ if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY)
+ return;
+
+ nir_foreach_function(function, s) {
+ if (function->impl) {
+ nir_foreach_block(block, function->impl)
+ v3d_nir_lower_logic_ops_block(block, c);
+
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+ }
+}
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c
new file mode 100644
index 000000000..d23b8be83
--- /dev/null
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "v3d_compiler.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
+
+/** @file v3d_nir_lower_scratch.c
+ *
+ * Swizzles around the addresses of
+ * nir_intrinsic_load_scratch/nir_intrinsic_store_scratch so that a QPU stores
+ * a cacheline at a time per dword of scratch access, scalarizing and removing
+ * writemasks in the process.
+ */
+
+static nir_ssa_def *
+v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ bool is_store = instr->intrinsic == nir_intrinsic_store_scratch;
+ nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1);
+
+ assert(nir_intrinsic_align_mul(instr) >= 4);
+ assert(nir_intrinsic_align_offset(instr) == 0);
+
+ /* The spill_offset register will already have the subgroup ID (EIDX)
+ * shifted and ORed in at bit 2, so all we need to do is to move the
+ * dword index up above V3D_CHANNELS.
+ */
+ return nir_imul_imm(b, offset, V3D_CHANNELS);
+}
+
+static void
+v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr);
+
+ nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS];
+ for (int i = 0; i < instr->num_components; i++) {
+ nir_ssa_def *chan_offset =
+ nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
+
+ nir_intrinsic_instr *chan_instr =
+ nir_intrinsic_instr_create(b->shader, instr->intrinsic);
+ chan_instr->num_components = 1;
+ nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1,
+ instr->dest.ssa.bit_size, NULL);
+
+ chan_instr->src[0] = nir_src_for_ssa(chan_offset);
+
+ nir_intrinsic_set_align(chan_instr, 4, 0);
+
+ nir_builder_instr_insert(b, &chan_instr->instr);
+
+ chans[i] = &chan_instr->dest.ssa;
+ }
+
+ nir_ssa_def *result = nir_vec(b, chans, instr->num_components);
+ nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(result));
+ nir_instr_remove(&instr->instr);
+}
+
+static void
+v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ b->cursor = nir_before_instr(&instr->instr);
+
+ nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr);
+ nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0],
+ instr->num_components);
+
+ for (int i = 0; i < instr->num_components; i++) {
+ if (!(nir_intrinsic_write_mask(instr) & (1 << i)))
+ continue;
+
+ nir_ssa_def *chan_offset =
+ nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4);
+
+ nir_intrinsic_instr *chan_instr =
+ nir_intrinsic_instr_create(b->shader, instr->intrinsic);
+ chan_instr->num_components = 1;
+
+ chan_instr->src[0] = nir_src_for_ssa(nir_channel(b,
+ value,
+ i));
+ chan_instr->src[1] = nir_src_for_ssa(chan_offset);
+ nir_intrinsic_set_write_mask(chan_instr, 0x1);
+ nir_intrinsic_set_align(chan_instr, 4, 0);
+
+ nir_builder_instr_insert(b, &chan_instr->instr);
+ }
+
+ nir_instr_remove(&instr->instr);
+}
+
+void
+v3d_nir_lower_scratch(nir_shader *s)
+{
+ nir_foreach_function(function, s) {
+ if (!function->impl)
+ continue;
+
+ nir_builder b;
+ nir_builder_init(&b, function->impl);
+
+ nir_foreach_block(block, function->impl) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr =
+ nir_instr_as_intrinsic(instr);
+
+ switch (intr->intrinsic) {
+ case nir_intrinsic_load_scratch:
+ v3d_nir_lower_load_scratch(&b, intr);
+ break;
+ case nir_intrinsic_store_scratch:
+ v3d_nir_lower_store_scratch(&b, intr);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ nir_metadata_preserve(function->impl,
+ nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+}
diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
index 68591529d..d79969374 100644
--- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
+++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c
@@ -34,12 +34,10 @@
#define V3D_MAX_SAMPLES 4
-static void
-vc4_nir_lower_txf_ms_instr(struct v3d_compile *c, nir_builder *b,
- nir_tex_instr *instr)
+static nir_ssa_def *
+v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data)
{
- if (instr->op != nir_texop_txf_ms)
- return;
+ nir_tex_instr *instr = nir_instr_as_tex(in_instr);
b->cursor = nir_before_instr(&instr->instr);
@@ -66,30 +64,22 @@ vc4_nir_lower_txf_ms_instr(struct v3d_compile *c, nir_builder *b,
nir_tex_instr_remove_src(instr, sample_index);
instr->op = nir_texop_txf;
instr->sampler_dim = GLSL_SAMPLER_DIM_2D;
+
+ return NIR_LOWER_INSTR_PROGRESS;
+}
+
+static bool
+v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data)
+{
+ return (instr->type == nir_instr_type_tex &&
+ nir_instr_as_tex(instr)->op == nir_texop_txf_ms);
}
void
v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c)
{
- nir_foreach_function(function, s) {
- if (!function->impl)
- continue;
-
- nir_builder b;
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_tex)
- continue;
-
- vc4_nir_lower_txf_ms_instr(c, &b,
- nir_instr_as_tex(instr));
- }
- }
-
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
+ nir_shader_lower_instructions(s,
+ v3d_nir_lower_txf_ms_filter,
+ v3d_nir_lower_txf_ms_instr,
+ NULL);
}
diff --git a/lib/mesa/src/broadcom/compiler/vir.c b/lib/mesa/src/broadcom/compiler/vir.c
index 20f700414..78362a294 100644
--- a/lib/mesa/src/broadcom/compiler/vir.c
+++ b/lib/mesa/src/broadcom/compiler/vir.c
@@ -25,7 +25,7 @@
#include "v3d_compiler.h"
int
-vir_get_non_sideband_nsrc(struct qinst *inst)
+vir_get_nsrc(struct qinst *inst)
{
switch (inst->qpu.type) {
case V3D_QPU_INSTR_TYPE_BRANCH:
@@ -40,55 +40,6 @@ vir_get_non_sideband_nsrc(struct qinst *inst)
return 0;
}
-int
-vir_get_nsrc(struct qinst *inst)
-{
- int nsrc = vir_get_non_sideband_nsrc(inst);
-
- if (vir_has_implicit_uniform(inst))
- nsrc++;
-
- return nsrc;
-}
-
-bool
-vir_has_implicit_uniform(struct qinst *inst)
-{
- switch (inst->qpu.type) {
- case V3D_QPU_INSTR_TYPE_BRANCH:
- return true;
- case V3D_QPU_INSTR_TYPE_ALU:
- switch (inst->dst.file) {
- case QFILE_TLBU:
- return true;
- case QFILE_MAGIC:
- switch (inst->dst.index) {
- case V3D_QPU_WADDR_TLBU:
- case V3D_QPU_WADDR_TMUAU:
- case V3D_QPU_WADDR_SYNCU:
- return true;
- default:
- break;
- }
- break;
- default:
- return inst->has_implicit_uniform;
- }
- }
- return false;
-}
-
-/* The sideband uniform for textures gets stored after the normal ALU
- * arguments.
- */
-int
-vir_get_implicit_uniform_src(struct qinst *inst)
-{
- if (!vir_has_implicit_uniform(inst))
- return -1;
- return vir_get_nsrc(inst) - 1;
-}
-
/**
* Returns whether the instruction has any side effects that must be
* preserved.
@@ -124,6 +75,8 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
if (inst->qpu.sig.ldtmu ||
inst->qpu.sig.ldvary ||
+ inst->qpu.sig.ldtlbu ||
+ inst->qpu.sig.ldtlb ||
inst->qpu.sig.wrtmuc ||
inst->qpu.sig.thrsw) {
return true;
@@ -133,38 +86,6 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
}
bool
-vir_is_float_input(struct qinst *inst)
-{
- /* XXX: More instrs */
- switch (inst->qpu.type) {
- case V3D_QPU_INSTR_TYPE_BRANCH:
- return false;
- case V3D_QPU_INSTR_TYPE_ALU:
- switch (inst->qpu.alu.add.op) {
- case V3D_QPU_A_FADD:
- case V3D_QPU_A_FSUB:
- case V3D_QPU_A_FMIN:
- case V3D_QPU_A_FMAX:
- case V3D_QPU_A_FTOIN:
- return true;
- default:
- break;
- }
-
- switch (inst->qpu.alu.mul.op) {
- case V3D_QPU_M_FMOV:
- case V3D_QPU_M_VFMUL:
- case V3D_QPU_M_FMUL:
- return true;
- default:
- break;
- }
- }
-
- return false;
-}
-
-bool
vir_is_raw_mov(struct qinst *inst)
{
if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
@@ -178,6 +99,13 @@ vir_is_raw_mov(struct qinst *inst)
return false;
}
+ if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+ inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
+ return false;
+ }
+
if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
inst->qpu.flags.mc != V3D_QPU_COND_NONE)
return false;
@@ -421,7 +349,7 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q
}
struct qinst *
-vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
+vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
struct qinst *inst = calloc(1, sizeof(*inst));
@@ -433,9 +361,8 @@ vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
inst->qpu.branch.ub = true;
inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
- inst->dst = vir_reg(QFILE_NULL, 0);
- inst->src[0] = src;
- inst->uniform = ~0;
+ inst->dst = vir_nop_reg();
+ inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
return inst;
}
@@ -591,7 +518,6 @@ vir_compile_init(const struct v3d_compiler *compiler,
vir_set_emit_block(c, vir_new_block(c));
c->output_position_index = -1;
- c->output_point_size_index = -1;
c->output_sample_mask_index = -1;
c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
@@ -601,7 +527,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
}
static int
-type_size_vec4(const struct glsl_type *type)
+type_size_vec4(const struct glsl_type *type, bool bindless)
{
return glsl_count_attribute_slots(type, false);
}
@@ -638,8 +564,29 @@ v3d_lower_nir(struct v3d_compile *c)
}
}
+ /* CS textures may not have return_size reflecting the shadow state. */
+ nir_foreach_variable(var, &c->s->uniforms) {
+ const struct glsl_type *type = glsl_without_array(var->type);
+ unsigned array_len = MAX2(glsl_get_length(var->type), 1);
+
+ if (!glsl_type_is_sampler(type) ||
+ !glsl_sampler_type_is_shadow(type))
+ continue;
+
+ for (int i = 0; i < array_len; i++) {
+ tex_options.lower_tex_packing[var->data.binding + i] =
+ nir_lower_tex_packing_16;
+ }
+ }
+
NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
NIR_PASS_V(c->s, nir_lower_system_values);
+
+ NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
+ nir_var_function_temp,
+ 0,
+ glsl_get_natural_size_align_bytes);
+ NIR_PASS_V(c->s, v3d_nir_lower_scratch);
}
static void
@@ -658,47 +605,10 @@ v3d_set_prog_data_uniforms(struct v3d_compile *c,
count * sizeof(*ulist->contents));
}
-/* Copy the compiler UBO range state to the compiled shader, dropping out
- * arrays that were never referenced by an indirect load.
- *
- * (Note that QIR dead code elimination of an array access still leaves that
- * array alive, though)
- */
-static void
-v3d_set_prog_data_ubo(struct v3d_compile *c,
- struct v3d_prog_data *prog_data)
-{
- if (!c->num_ubo_ranges)
- return;
-
- prog_data->num_ubo_ranges = 0;
- prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
- c->num_ubo_ranges);
- for (int i = 0; i < c->num_ubo_ranges; i++) {
- if (!c->ubo_range_used[i])
- continue;
-
- struct v3d_ubo_range *range = &c->ubo_ranges[i];
- prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
- prog_data->ubo_size += range->size;
- }
-
- if (prog_data->ubo_size) {
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- prog_data->ubo_size / 4);
- }
- }
-}
-
static void
v3d_vs_set_prog_data(struct v3d_compile *c,
struct v3d_vs_prog_data *prog_data)
{
- prog_data->base.num_inputs = c->num_inputs;
-
/* The vertex data gets format converted by the VPM so that
* each attribute channel takes up a VPM column. Precompute
* the sizes for the shader record.
@@ -722,7 +632,7 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
* channel).
*/
prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
- prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;
+ prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
/* Set us up for shared input/output segments. This is apparently
* necessary for our VCM setup to avoid varying corruption.
@@ -741,7 +651,7 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
* batches.
*/
assert(c->devinfo->vpm_size);
- int sector_size = 16 * sizeof(uint32_t) * 8;
+ int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
int half_vpm = vpm_size_in_sectors / 2;
int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
@@ -754,7 +664,7 @@ static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
struct v3d_fs_prog_data *prog_data)
{
- prog_data->base.num_inputs = c->num_inputs;
+ prog_data->num_inputs = c->num_inputs;
memcpy(prog_data->input_slots, c->input_slots,
c->num_inputs * sizeof(*c->input_slots));
@@ -780,6 +690,17 @@ v3d_fs_set_prog_data(struct v3d_compile *c,
prog_data->writes_z = c->writes_z;
prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
prog_data->uses_center_w = c->uses_center_w;
+ prog_data->uses_implicit_point_line_varyings =
+ c->uses_implicit_point_line_varyings;
+ prog_data->lock_scoreboard_on_first_thrsw =
+ c->lock_scoreboard_on_first_thrsw;
+}
+
+static void
+v3d_cs_set_prog_data(struct v3d_compile *c,
+ struct v3d_compute_prog_data *prog_data)
+{
+ prog_data->shared_size = c->s->info.cs.shared_size;
}
static void
@@ -791,9 +712,10 @@ v3d_set_prog_data(struct v3d_compile *c,
prog_data->spill_size = c->spill_size;
v3d_set_prog_data_uniforms(c, prog_data);
- v3d_set_prog_data_ubo(c, prog_data);
- if (c->s->info.stage == MESA_SHADER_VERTEX) {
+ if (c->s->info.stage == MESA_SHADER_COMPUTE) {
+ v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
+ } else if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
} else {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
@@ -836,9 +758,16 @@ v3d_nir_lower_vs_early(struct v3d_compile *c)
NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
v3d_optimize_nir(c->s);
NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
+
+ /* This must go before nir_lower_io */
+ if (c->vs_key->per_vertex_point_size)
+ NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
+
NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
type_size_vec4,
(nir_lower_io_options)0);
+ /* clean up nir_lower_io's deref_var remains */
+ NIR_PASS_V(c->s, nir_opt_dce);
}
static void
@@ -877,6 +806,8 @@ v3d_nir_lower_fs_early(struct v3d_compile *c)
if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
v3d_fixup_fs_output_types(c);
+ NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
+
/* If the shader has no non-TLB side effects, we can promote it to
* enabling early_fragment_tests even if the user didn't.
*/
@@ -928,6 +859,33 @@ v3d_nir_lower_fs_late(struct v3d_compile *c)
NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}
+static uint32_t
+vir_get_max_temps(struct v3d_compile *c)
+{
+ int max_ip = 0;
+ vir_for_each_inst_inorder(inst, c)
+ max_ip++;
+
+ uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
+
+ for (int t = 0; t < c->num_temps; t++) {
+ for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
+ i < max_ip); i++) {
+ if (i > max_ip)
+ break;
+ pressure[i]++;
+ }
+ }
+
+ uint32_t max_temps = 0;
+ for (int i = 0; i < max_ip; i++)
+ max_temps = MAX2(max_temps, pressure[i]);
+
+ ralloc_free(pressure);
+
+ return max_temps;
+}
+
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
@@ -952,13 +910,17 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
c->fs_key = (struct v3d_fs_key *)key;
prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
break;
+ case MESA_SHADER_COMPUTE:
+ prog_data = rzalloc_size(NULL,
+ sizeof(struct v3d_compute_prog_data));
+ break;
default:
unreachable("unsupported shader stage");
}
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_nir_lower_vs_early(c);
- } else {
+ } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_nir_lower_fs_early(c);
}
@@ -967,7 +929,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
if (c->s->info.stage == MESA_SHADER_VERTEX) {
v3d_nir_lower_vs_late(c);
- } else {
+ } else if (c->s->info.stage != MESA_SHADER_COMPUTE) {
assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
v3d_nir_lower_fs_late(c);
}
@@ -990,15 +952,22 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
char *shaderdb;
int ret = asprintf(&shaderdb,
"%s shader: %d inst, %d threads, %d loops, "
- "%d uniforms, %d:%d spills:fills",
+ "%d uniforms, %d max-temps, %d:%d spills:fills, "
+ "%d sfu-stalls, %d inst-and-stalls",
vir_get_stage_name(c),
c->qpu_inst_count,
c->threads,
c->loops,
c->num_uniforms,
+ vir_get_max_temps(c),
c->spills,
- c->fills);
+ c->fills,
+ c->qpu_inst_stalled_count,
+ c->qpu_inst_count + c->qpu_inst_stalled_count);
if (ret >= 0) {
+ if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
+ fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
+
c->debug_output(shaderdb, c->debug_output_data);
free(shaderdb);
}
@@ -1059,15 +1028,15 @@ vir_compile_destroy(struct v3d_compile *c)
ralloc_free(c);
}
-struct qreg
-vir_uniform(struct v3d_compile *c,
- enum quniform_contents contents,
- uint32_t data)
+uint32_t
+vir_get_uniform_index(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data)
{
for (int i = 0; i < c->num_uniforms; i++) {
if (c->uniform_contents[i] == contents &&
c->uniform_data[i] == data) {
- return vir_reg(QFILE_UNIF, i);
+ return i;
}
}
@@ -1088,52 +1057,20 @@ vir_uniform(struct v3d_compile *c,
c->uniform_contents[uniform] = contents;
c->uniform_data[uniform] = data;
- return vir_reg(QFILE_UNIF, uniform);
-}
-
-static bool
-vir_can_set_flags(struct v3d_compile *c, struct qinst *inst)
-{
- if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) ||
- v3d_qpu_uses_sfu(&inst->qpu))) {
- return false;
- }
-
- if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
- (inst->qpu.alu.add.op == V3D_QPU_A_NOP &&
- inst->qpu.alu.mul.op == V3D_QPU_M_NOP)) {
- return false;
- }
-
- return true;
+ return uniform;
}
-void
-vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
+struct qreg
+vir_uniform(struct v3d_compile *c,
+ enum quniform_contents contents,
+ uint32_t data)
{
- struct qinst *last_inst = NULL;
-
- if (!list_empty(&c->cur_block->instructions)) {
- last_inst = (struct qinst *)c->cur_block->instructions.prev;
-
- /* Can't stuff the PF into the last last inst if our cursor
- * isn't pointing after it.
- */
- struct vir_cursor after_inst = vir_after_inst(last_inst);
- if (c->cursor.mode != after_inst.mode ||
- c->cursor.link != after_inst.link)
- last_inst = NULL;
- }
-
- if (src.file != QFILE_TEMP ||
- !c->defs[src.index] ||
- last_inst != c->defs[src.index] ||
- !vir_can_set_flags(c, last_inst)) {
- /* XXX: Make the MOV be the appropriate type */
- last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
- }
-
- vir_set_pf(last_inst, pf);
+ struct qinst *inst = vir_NOP(c);
+ inst->qpu.sig.ldunif = true;
+ inst->uniform = vir_get_uniform_index(c, contents, data);
+ inst->dst = vir_get_temp(c);
+ c->defs[inst->dst.index] = inst;
+ return inst->dst;
}
#define OPTPASS(func) \
@@ -1160,6 +1097,7 @@ vir_optimize(struct v3d_compile *c)
bool progress = false;
OPTPASS(vir_opt_copy_propagate);
+ OPTPASS(vir_opt_redundant_flags);
OPTPASS(vir_opt_dead_code);
OPTPASS(vir_opt_small_immediates);
diff --git a/lib/mesa/src/broadcom/compiler/vir_dump.c b/lib/mesa/src/broadcom/compiler/vir_dump.c
index ecf6f3e1f..9e1ef1e9d 100644
--- a/lib/mesa/src/broadcom/compiler/vir_dump.c
+++ b/lib/mesa/src/broadcom/compiler/vir_dump.c
@@ -30,6 +30,7 @@ vir_dump_uniform(enum quniform_contents contents,
uint32_t data)
{
static const char *quniform_names[] = {
+ [QUNIFORM_ALPHA_REF] = "alpha_ref",
[QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale",
[QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale",
[QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset",
@@ -52,20 +53,20 @@ vir_dump_uniform(enum quniform_contents contents,
case QUNIFORM_TMU_CONFIG_P0:
fprintf(stderr, "tex[%d].p0 | 0x%x",
- v3d_tmu_config_data_get_unit(data),
- v3d_tmu_config_data_get_value(data));
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_TMU_CONFIG_P1:
fprintf(stderr, "tex[%d].p1 | 0x%x",
- v3d_tmu_config_data_get_unit(data),
- v3d_tmu_config_data_get_value(data));
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_IMAGE_TMU_CONFIG_P0:
fprintf(stderr, "img[%d].p0 | 0x%x",
- v3d_tmu_config_data_get_unit(data),
- v3d_tmu_config_data_get_value(data));
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_TEXTURE_WIDTH:
@@ -97,8 +98,18 @@ vir_dump_uniform(enum quniform_contents contents,
fprintf(stderr, "img[%d].array_size", data);
break;
+ case QUNIFORM_SPILL_OFFSET:
+ fprintf(stderr, "spill_offset");
+ break;
+
+ case QUNIFORM_SPILL_SIZE_PER_THREAD:
+ fprintf(stderr, "spill_size_per_thread");
+ break;
+
case QUNIFORM_UBO_ADDR:
- fprintf(stderr, "ubo[%d]", data);
+ fprintf(stderr, "ubo[%d]+0x%x",
+ v3d_unit_data_get_unit(data),
+ v3d_unit_data_get_offset(data));
break;
case QUNIFORM_SSBO_OFFSET:
@@ -118,7 +129,8 @@ vir_dump_uniform(enum quniform_contents contents,
fprintf(stderr, "tex[%d].p0: 0x%08x",
contents - QUNIFORM_TEXTURE_CONFIG_P0_0,
data);
- } else if (contents < ARRAY_SIZE(quniform_names)) {
+ } else if (contents < ARRAY_SIZE(quniform_names) &&
+ quniform_names[contents]) {
fprintf(stderr, "%s",
quniform_names[contents]);
} else {
@@ -131,13 +143,6 @@ static void
vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
struct qreg reg)
{
- static const char *files[] = {
- [QFILE_TEMP] = "t",
- [QFILE_UNIF] = "u",
- [QFILE_TLB] = "tlb",
- [QFILE_TLBU] = "tlbu",
- };
-
switch (reg.file) {
case QFILE_NULL:
@@ -176,21 +181,8 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
reg.index / 4, reg.index % 4);
break;
- case QFILE_TLB:
- case QFILE_TLBU:
- fprintf(stderr, "%s", files[reg.file]);
- break;
-
- case QFILE_UNIF:
- fprintf(stderr, "%s%d", files[reg.file], reg.index);
- fprintf(stderr, " (");
- vir_dump_uniform(c->uniform_contents[reg.index],
- c->uniform_data[reg.index]);
- fprintf(stderr, ")");
- break;
-
- default:
- fprintf(stderr, "%s%d", files[reg.file], reg.index);
+ case QFILE_TEMP:
+ fprintf(stderr, "t%d", reg.index);
break;
}
}
@@ -258,8 +250,7 @@ static void
vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
{
struct v3d_qpu_instr *instr = &inst->qpu;
- int nsrc = vir_get_non_sideband_nsrc(inst);
- int sideband_nsrc = vir_get_nsrc(inst);
+ int nsrc = vir_get_nsrc(inst);
enum v3d_qpu_input_unpack unpack[2];
if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) {
@@ -288,11 +279,10 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
unpack[1] = instr->alu.mul.b_unpack;
}
- for (int i = 0; i < sideband_nsrc; i++) {
+ for (int i = 0; i < nsrc; i++) {
fprintf(stderr, ", ");
vir_print_reg(c, inst, inst->src[i]);
- if (i < nsrc)
- fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
+ fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
}
vir_dump_sig(c, inst);
@@ -353,25 +343,34 @@ vir_dump_inst(struct v3d_compile *c, struct qinst *inst)
break;
}
}
-
- if (vir_has_implicit_uniform(inst)) {
- fprintf(stderr, " ");
- vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]);
- }
-
break;
}
+
+ if (vir_has_uniform(inst)) {
+ fprintf(stderr, " (");
+ vir_dump_uniform(c->uniform_contents[inst->uniform],
+ c->uniform_data[inst->uniform]);
+ fprintf(stderr, ")");
+ }
}
void
vir_dump(struct v3d_compile *c)
{
int ip = 0;
+ int pressure = 0;
vir_for_each_block(block, c) {
fprintf(stderr, "BLOCK %d:\n", block->index);
vir_for_each_inst(inst, block) {
if (c->live_intervals_valid) {
+ for (int i = 0; i < c->num_temps; i++) {
+ if (c->temp_start[i] == ip)
+ pressure++;
+ }
+
+ fprintf(stderr, "P%4d ", pressure);
+
bool first = true;
for (int i = 0; i < c->num_temps; i++) {
@@ -383,7 +382,10 @@ vir_dump(struct v3d_compile *c)
} else {
fprintf(stderr, ", ");
}
- fprintf(stderr, "S%4d", i);
+ if (BITSET_TEST(c->spillable, i))
+ fprintf(stderr, "S%4d", i);
+ else
+ fprintf(stderr, "U%4d", i);
}
if (first)
@@ -405,6 +407,7 @@ vir_dump(struct v3d_compile *c)
fprintf(stderr, ", ");
}
fprintf(stderr, "E%4d", i);
+ pressure--;
}
if (first)
diff --git a/lib/mesa/src/broadcom/compiler/vir_live_variables.c b/lib/mesa/src/broadcom/compiler/vir_live_variables.c
index 2879e23b4..d3ca02f18 100644
--- a/lib/mesa/src/broadcom/compiler/vir_live_variables.c
+++ b/lib/mesa/src/broadcom/compiler/vir_live_variables.c
@@ -109,24 +109,18 @@ vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip,
c->temp_start[var] = MIN2(c->temp_start[var], ip);
c->temp_end[var] = MAX2(c->temp_end[var], ip);
- /* If we've already tracked this as a def, or already used it within
- * the block, there's nothing to do.
+ /* Mark the block as having a (partial) def of the var. */
+ BITSET_SET(block->defout, var);
+
+ /* If we've already tracked this as a def that screens off previous
+ * uses, or already used it within the block, there's nothing to do.
*/
if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var))
return;
- /* Easy, common case: unconditional full register update.
- *
- * We treat conditioning on the exec mask as the same as not being
- * conditional. This makes sure that if the register gets set on
- * either side of an if, it is treated as being screened off before
- * the if. Otherwise, if there was no intervening def, its live
- * interval doesn't extend back to the start of he program, and if too
- * many registers did that we'd fail to register allocate.
- */
- if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
- inst->qpu.flags.mc == V3D_QPU_COND_NONE) ||
- inst->cond_is_exec_mask) &&
+ /* Easy, common case: unconditional full register update.*/
+ if ((inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
+ inst->qpu.flags.mc == V3D_QPU_COND_NONE) &&
inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE &&
inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) {
BITSET_SET(block->def, var);
@@ -278,6 +272,33 @@ vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words)
return cont;
}
+static bool
+vir_live_variables_defin_defout_dataflow(struct v3d_compile *c, int bitset_words)
+{
+ bool cont = false;
+
+ vir_for_each_block_rev(block, c) {
+ /* Propagate defin/defout down the successors to produce the
+ * union of blocks with a reachable (partial) definition of
+ * the var.
+ *
+ * This keeps a conditional first write to a reg from
+ * extending its lifetime back to the start of the program.
+ */
+ vir_for_each_successor(succ, block) {
+ for (int i = 0; i < bitset_words; i++) {
+ BITSET_WORD new_def = (block->defout[i] &
+ ~succ->defin[i]);
+ succ->defin[i] |= new_def;
+ succ->defout[i] |= new_def;
+ cont |= new_def;
+ }
+ }
+ }
+
+ return cont;
+}
+
/**
* Extend the start/end ranges for each variable to account for the
* new information calculated from control flow.
@@ -287,14 +308,16 @@ vir_compute_start_end(struct v3d_compile *c, int num_vars)
{
vir_for_each_block(block, c) {
for (int i = 0; i < num_vars; i++) {
- if (BITSET_TEST(block->live_in, i)) {
+ if (BITSET_TEST(block->live_in, i) &&
+ BITSET_TEST(block->defin, i)) {
c->temp_start[i] = MIN2(c->temp_start[i],
block->start_ip);
c->temp_end[i] = MAX2(c->temp_end[i],
block->start_ip);
}
- if (BITSET_TEST(block->live_out, i)) {
+ if (BITSET_TEST(block->live_out, i) &&
+ BITSET_TEST(block->defout, i)) {
c->temp_start[i] = MIN2(c->temp_start[i],
block->end_ip);
c->temp_end[i] = MAX2(c->temp_end[i],
@@ -334,6 +357,8 @@ vir_calculate_live_intervals(struct v3d_compile *c)
vir_for_each_block(block, c) {
block->def = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->defin = rzalloc_array(c, BITSET_WORD, bitset_words);
+ block->defout = rzalloc_array(c, BITSET_WORD, bitset_words);
block->use = rzalloc_array(c, BITSET_WORD, bitset_words);
block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words);
block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words);
@@ -344,6 +369,9 @@ vir_calculate_live_intervals(struct v3d_compile *c)
while (vir_live_variables_dataflow(c, bitset_words))
;
+ while (vir_live_variables_defin_defout_dataflow(c, bitset_words))
+ ;
+
vir_compute_start_end(c, c->num_temps);
c->live_intervals_valid = true;
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c b/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c
index 2a22a1b55..c5bb61121 100644
--- a/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c
@@ -49,10 +49,8 @@ is_copy_mov(struct qinst *inst)
if (inst->dst.file != QFILE_TEMP)
return false;
- if (inst->src[0].file != QFILE_TEMP &&
- inst->src[0].file != QFILE_UNIF) {
+ if (inst->src[0].file != QFILE_TEMP)
return false;
- }
if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
@@ -151,13 +149,36 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
* would be the same between the two
* instructions.
*/
- if (vir_is_float_input(inst) !=
- vir_is_float_input(mov)) {
+ if (v3d_qpu_unpacks_f32(&inst->qpu) !=
+ v3d_qpu_unpacks_f32(&mov->qpu) ||
+ v3d_qpu_unpacks_f16(&inst->qpu) !=
+ v3d_qpu_unpacks_f16(&mov->qpu)) {
continue;
}
+
/* No composing the unpacks. */
if (vir_has_unpack(inst, i))
- continue;
+ continue;
+
+ /* these ops can't represent abs. */
+ if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
+ switch (inst->qpu.alu.add.op) {
+ case V3D_QPU_A_VFPACK:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ continue;
+ default:
+ break;
+ }
+ }
}
if (debug) {
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c
index a486708bf..6048ccfcc 100644
--- a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c
@@ -55,28 +55,8 @@ static bool
has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst)
{
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file == QFILE_VPM) {
- /* Instance ID, Vertex ID: Should have been removed at
- * the NIR level
- */
- if (inst->src[i].index == ~0)
- return true;
-
- uint32_t attr = inst->src[i].index / 4;
- uint32_t offset = inst->src[i].index % 4;
-
- if (c->vattr_sizes[attr] != offset)
- return true;
-
- /* Can't get rid of the last VPM read, or the
- * simulator (at least) throws an error.
- */
- uint32_t total_size = 0;
- for (uint32_t i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
- total_size += c->vattr_sizes[i];
- if (total_size == 1)
- return true;
- }
+ if (inst->src[i].file == QFILE_VPM)
+ return true;
}
return false;
@@ -187,18 +167,6 @@ vir_opt_dead_code(struct v3d_compile *c)
continue;
}
- for (int i = 0; i < vir_get_nsrc(inst); i++) {
- if (inst->src[i].file != QFILE_VPM)
- continue;
- uint32_t attr = inst->src[i].index / 4;
- uint32_t offset = (inst->src[i].index % 4);
-
- if (c->vattr_sizes[attr] == offset) {
- c->num_inputs--;
- c->vattr_sizes[attr]--;
- }
- }
-
assert(inst != last_flags_write);
dce(c, inst);
progress = true;
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c
new file mode 100644
index 000000000..8749f3cd6
--- /dev/null
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2019 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file v3d_opt_redundant_flags.c
+ *
+ * This eliminates the APF/MPF flags for redundant flags updates. These are
+ * often produced by our channel masking in nonuniform control flow.
+ */
+
+#include "v3d_compiler.h"
+
+static bool debug;
+
+static void
+vir_dce_pf(struct v3d_compile *c, struct qinst *inst)
+{
+ if (debug) {
+ fprintf(stderr,
+ "Removing flags write from: ");
+ vir_dump_inst(c, inst);
+ fprintf(stderr, "\n");
+ }
+
+ assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
+
+ inst->qpu.flags.apf = V3D_QPU_PF_NONE;
+ inst->qpu.flags.mpf = V3D_QPU_PF_NONE;
+}
+
+static bool
+vir_sources_modified(struct qinst *srcs, struct qinst *write)
+{
+ for (int i = 0; i < vir_get_nsrc(srcs); i++) {
+ if (write->dst.file == QFILE_TEMP &&
+ srcs->src[i].file == QFILE_TEMP &&
+ srcs->src[i].index == write->dst.index) {
+ return true;
+ }
+
+ /* assume magic regs may be modified by basically anything. */
+ if (srcs->src[i].file != QFILE_TEMP &&
+ srcs->src[i].file != QFILE_SMALL_IMM)
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
+{
+ for (int i = 0; i < vir_get_nsrc(a); i++) {
+ if (a->src[i].file != b->src[i].file ||
+ a->src[i].index != b->src[i].index) {
+ return false;
+ }
+ }
+
+ if (a->qpu.flags.apf != b->qpu.flags.apf ||
+ a->qpu.flags.mpf != b->qpu.flags.mpf ||
+ a->qpu.alu.add.op != b->qpu.alu.add.op ||
+ a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
+ a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
+ a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
+ a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
+ a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
+ a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
+ a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block)
+{
+ struct qinst *last_flags = NULL;
+ bool progress = false;
+
+ vir_for_each_inst(inst, block) {
+ if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+ inst->qpu.flags.auf != V3D_QPU_UF_NONE ||
+ inst->qpu.flags.muf != V3D_QPU_UF_NONE) {
+ last_flags = NULL;
+ continue;
+ }
+
+ /* Flags aren't preserved across a thrsw. */
+ if (inst->qpu.sig.thrsw)
+ last_flags = NULL;
+
+ if (inst->qpu.flags.apf != V3D_QPU_PF_NONE ||
+ inst->qpu.flags.mpf != V3D_QPU_PF_NONE) {
+ if (last_flags &&
+ vir_instr_flags_op_equal(inst, last_flags)) {
+ vir_dce_pf(c, inst);
+ progress = true;
+ } else {
+ last_flags = inst;
+ }
+ }
+
+ if (last_flags && vir_sources_modified(last_flags, inst)) {
+ last_flags = NULL;
+ }
+ }
+
+ return progress;
+}
+
+bool
+vir_opt_redundant_flags(struct v3d_compile *c)
+{
+ bool progress = false;
+
+ vir_for_each_block(block, c) {
+ progress = vir_opt_redundant_flags_block(c, block) || progress;
+ }
+
+ return progress;
+}
diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c b/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c
index 5491f9c24..47d772296 100644
--- a/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c
+++ b/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -55,26 +55,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
continue;
for (int i = 0; i < vir_get_nsrc(inst); i++) {
- struct qreg src = vir_follow_movs(c, inst->src[i]);
+ if (inst->src[i].file != QFILE_TEMP)
+ continue;
- if (src.file != QFILE_UNIF ||
- c->uniform_contents[src.index] !=
- QUNIFORM_CONSTANT) {
+ /* See if it's a uniform load. */
+ struct qinst *src_def = c->defs[inst->src[i].index];
+ if (!src_def || !src_def->qpu.sig.ldunif)
continue;
- }
+ int uniform = src_def->uniform;
- if (vir_has_implicit_uniform(inst) &&
- i == vir_get_implicit_uniform_src(inst)) {
- /* No turning the implicit uniform read into
- * an immediate.
- */
+ if (c->uniform_contents[uniform] != QUNIFORM_CONSTANT)
continue;
- }
/* Check if the uniform is suitable as a small
* immediate.
*/
- uint32_t imm = c->uniform_data[src.index];
+ uint32_t imm = c->uniform_data[uniform];
uint32_t packed;
if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed))
continue;
diff --git a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c
index 79ab5acd7..7583acf15 100644
--- a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c
+++ b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c
@@ -29,28 +29,44 @@
#define QPU_R(i) { .magic = false, .index = i }
#define ACC_INDEX 0
-#define ACC_COUNT 5
+#define ACC_COUNT 6
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64
+static inline bool
+qinst_writes_tmu(struct qinst *inst)
+{
+ return (inst->dst.file == QFILE_MAGIC &&
+ v3d_qpu_magic_waddr_is_tmu(inst->dst.index));
+}
+
static bool
is_last_ldtmu(struct qinst *inst, struct qblock *block)
{
- list_for_each_entry_from(struct qinst, scan_inst, inst,
+ list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
&block->instructions, link) {
- if (inst->qpu.sig.ldtmu)
+ if (scan_inst->qpu.sig.ldtmu)
return false;
- if (v3d_qpu_writes_tmu(&inst->qpu))
+ if (qinst_writes_tmu(scan_inst))
return true;
}
return true;
}
+static bool
+vir_is_mov_uniform(struct v3d_compile *c, int temp)
+{
+ struct qinst *def = c->defs[temp];
+
+ return def && def->qpu.sig.ldunif;
+}
+
static int
v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
uint32_t *temp_to_node)
{
+ const float tmu_scale = 5;
float block_scale = 1.0;
float spill_costs[c->num_temps];
bool in_tmu_operation = false;
@@ -75,22 +91,28 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
continue;
int temp = inst->src[i].index;
- if (no_spilling) {
- BITSET_CLEAR(c->spillable,
- temp);
- } else {
+ if (vir_is_mov_uniform(c, temp)) {
spill_costs[temp] += block_scale;
+ } else if (!no_spilling) {
+ spill_costs[temp] += (block_scale *
+ tmu_scale);
+ } else {
+ BITSET_CLEAR(c->spillable, temp);
}
}
if (inst->dst.file == QFILE_TEMP) {
int temp = inst->dst.index;
- if (no_spilling) {
- BITSET_CLEAR(c->spillable,
- temp);
+ if (vir_is_mov_uniform(c, temp)) {
+ /* We just rematerialize the unform
+ * later.
+ */
+ } else if (!no_spilling) {
+ spill_costs[temp] += (block_scale *
+ tmu_scale);
} else {
- spill_costs[temp] += block_scale;
+ BITSET_CLEAR(c->spillable, temp);
}
}
@@ -123,7 +145,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
in_tmu_operation = false;
- if (v3d_qpu_writes_tmu(&inst->qpu))
+ if (qinst_writes_tmu(inst))
in_tmu_operation = true;
}
}
@@ -141,7 +163,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
/* The spill offset for this thread takes a bit of setup, so do it once at
* program start.
*/
-static void
+void
v3d_setup_spill_base(struct v3d_compile *c)
{
c->cursor = vir_before_block(vir_entry_block(c));
@@ -170,6 +192,8 @@ v3d_setup_spill_base(struct v3d_compile *c)
/* Make sure that we don't spill the spilling setup instructions. */
for (int i = start_num_temps; i < c->num_temps; i++)
BITSET_CLEAR(c->spillable, i);
+
+ c->cursor = vir_after_block(c->cur_block);
}
static void
@@ -184,18 +208,30 @@ v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
static void
v3d_spill_reg(struct v3d_compile *c, int spill_temp)
{
- uint32_t spill_offset = c->spill_size;
- c->spill_size += 16 * sizeof(uint32_t);
+ bool is_uniform = vir_is_mov_uniform(c, spill_temp);
+
+ uint32_t spill_offset = 0;
- if (spill_offset == 0)
- v3d_setup_spill_base(c);
+ if (!is_uniform) {
+ uint32_t spill_offset = c->spill_size;
+ c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
+
+ if (spill_offset == 0)
+ v3d_setup_spill_base(c);
+ }
struct qinst *last_thrsw = c->last_thrsw;
assert(!last_thrsw || last_thrsw->is_last_thrsw);
int start_num_temps = c->num_temps;
- vir_for_each_inst_inorder(inst, c) {
+ int uniform_index = ~0;
+ if (is_uniform) {
+ struct qinst *orig_unif = c->defs[spill_temp];
+ uniform_index = orig_unif->uniform;
+ }
+
+ vir_for_each_inst_inorder_safe(inst, c) {
for (int i = 0; i < vir_get_nsrc(inst); i++) {
if (inst->src[i].file != QFILE_TEMP ||
inst->src[i].index != spill_temp) {
@@ -204,23 +240,37 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
c->cursor = vir_before_inst(inst);
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- inst->src[i] = vir_LDTMU(c);
- c->fills++;
+ if (is_uniform) {
+ struct qreg unif =
+ vir_uniform(c,
+ c->uniform_contents[uniform_index],
+ c->uniform_data[uniform_index]);
+ inst->src[i] = unif;
+ } else {
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ inst->src[i] = vir_LDTMU(c);
+ c->fills++;
+ }
}
if (inst->dst.file == QFILE_TEMP &&
inst->dst.index == spill_temp) {
- c->cursor = vir_after_inst(inst);
-
- inst->dst.index = c->num_temps++;
- vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
- inst->dst);
- v3d_emit_spill_tmua(c, spill_offset);
- vir_emit_thrsw(c);
- vir_TMUWT(c);
- c->spills++;
+ if (is_uniform) {
+ c->cursor.link = NULL;
+ vir_remove_instruction(c, inst);
+ } else {
+ c->cursor = vir_after_inst(inst);
+
+ inst->dst.index = c->num_temps++;
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC,
+ V3D_QPU_WADDR_TMUD),
+ inst->dst);
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ vir_TMUWT(c);
+ c->spills++;
+ }
}
/* If we didn't have a last-thrsw inserted by nir_to_vir and
@@ -228,7 +278,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
* right before we start the vpm/tlb sequence for the last
* thread segment.
*/
- if (!last_thrsw && c->last_thrsw &&
+ if (!is_uniform && !last_thrsw && c->last_thrsw &&
(v3d_qpu_writes_vpm(&inst->qpu) ||
v3d_qpu_uses_tlb(&inst->qpu))) {
c->cursor = vir_before_inst(inst);
@@ -261,6 +311,14 @@ static unsigned int
v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
struct v3d_ra_select_callback_data *v3d_ra = data;
+ int r5 = ACC_INDEX + 5;
+
+ /* Choose r5 for our ldunifs if possible (nobody else can load to that
+ * reg, and it keeps the QPU cond field free from being occupied by
+ * ldunifrf).
+ */
+ if (BITSET_TEST(regs, r5))
+ return r5;
/* Choose an accumulator if possible (I think it's lower power than
* phys regs), but round-robin through them to give post-RA
@@ -303,6 +361,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
return false;
for (int threads = 0; threads < max_thread_index; threads++) {
+ compiler->reg_class_any[threads] =
+ ra_alloc_reg_class(compiler->regs);
+ compiler->reg_class_r5[threads] =
+ ra_alloc_reg_class(compiler->regs);
compiler->reg_class_phys_or_acc[threads] =
ra_alloc_reg_class(compiler->regs);
compiler->reg_class_phys[threads] =
@@ -314,12 +376,25 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
compiler->reg_class_phys_or_acc[threads], i);
ra_class_add_reg(compiler->regs,
compiler->reg_class_phys[threads], i);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads], i);
}
- for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) {
+ for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
ra_class_add_reg(compiler->regs,
compiler->reg_class_phys_or_acc[threads], i);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads], i);
}
+ /* r5 can only store a single 32-bit value, so not much can
+ * use it.
+ */
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_r5[threads],
+ ACC_INDEX + 5);
+ ra_class_add_reg(compiler->regs,
+ compiler->reg_class_any[threads],
+ ACC_INDEX + 5);
}
ra_set_finalize(compiler->regs, NULL);
@@ -342,9 +417,11 @@ node_to_temp_priority(const void *in_a, const void *in_b)
}
#define CLASS_BIT_PHYS (1 << 0)
-#define CLASS_BIT_R0_R2 (1 << 1)
-#define CLASS_BIT_R3 (1 << 2)
-#define CLASS_BIT_R4 (1 << 3)
+#define CLASS_BIT_ACC (1 << 1)
+#define CLASS_BIT_R5 (1 << 4)
+#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \
+ CLASS_BIT_ACC | \
+ CLASS_BIT_R5)
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
@@ -357,8 +434,6 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
uint8_t class_bits[c->num_temps];
- struct qpu_reg *temp_registers = calloc(c->num_temps,
- sizeof(*temp_registers));
int acc_nodes[ACC_COUNT];
struct v3d_ra_select_callback_data callback_data = {
.next_acc = 0,
@@ -412,9 +487,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* start with any temp being able to be in any file, then instructions
* incrementally remove bits that the temp definitely can't be in.
*/
- memset(class_bits,
- CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4,
- sizeof(class_bits));
+ memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits));
int ip = 0;
vir_for_each_inst_inorder(inst, c) {
@@ -497,6 +570,24 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
}
}
+ if (inst->dst.file == QFILE_TEMP) {
+ /* Only a ldunif gets to write to R5, which only has a
+ * single 32-bit channel of storage.
+ */
+ if (!inst->qpu.sig.ldunif) {
+ class_bits[inst->dst.index] &= ~CLASS_BIT_R5;
+ } else {
+ /* Until V3D 4.x, we could only load a uniform
+ * to r5, so we'll need to spill if uniform
+ * loads interfere with each other.
+ */
+ if (c->devinfo->ver < 40) {
+ class_bits[inst->dst.index] &=
+ CLASS_BIT_R5;
+ }
+ }
+ }
+
if (inst->qpu.sig.thrsw) {
/* All accumulators are invalidated across a thread
* switch.
@@ -514,13 +605,16 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
if (class_bits[i] == CLASS_BIT_PHYS) {
ra_set_node_class(g, temp_to_node[i],
c->compiler->reg_class_phys[thread_index]);
- } else {
- assert(class_bits[i] == (CLASS_BIT_PHYS |
- CLASS_BIT_R0_R2 |
- CLASS_BIT_R3 |
- CLASS_BIT_R4));
+ } else if (class_bits[i] == (CLASS_BIT_R5)) {
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_r5[thread_index]);
+ } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) {
ra_set_node_class(g, temp_to_node[i],
c->compiler->reg_class_phys_or_acc[thread_index]);
+ } else {
+ assert(class_bits[i] == CLASS_BITS_ANY);
+ ra_set_node_class(g, temp_to_node[i],
+ c->compiler->reg_class_any[thread_index]);
}
}
@@ -539,7 +633,8 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* conformance tests to make sure that spilling works.
*/
int force_register_spills = 0;
- if (c->spill_size < 16 * sizeof(uint32_t) * force_register_spills) {
+ if (c->spill_size <
+ V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
int node = v3d_choose_spill_node(c, g, temp_to_node);
if (node != -1) {
v3d_spill_reg(c, map[node].temp);
@@ -551,24 +646,27 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
bool ok = ra_allocate(g);
if (!ok) {
- /* Try to spill, if we can't reduce threading first. */
- if (thread_index == 0) {
- int node = v3d_choose_spill_node(c, g, temp_to_node);
+ int node = v3d_choose_spill_node(c, g, temp_to_node);
- if (node != -1) {
- v3d_spill_reg(c, map[node].temp);
- ralloc_free(g);
+ /* Don't emit spills using the TMU until we've dropped thread
+ * count first.
+ */
+ if (node != -1 &&
+ (vir_is_mov_uniform(c, map[node].temp) ||
+ thread_index == 0)) {
+ v3d_spill_reg(c, map[node].temp);
- /* Ask the outer loop to call back in. */
- *spilled = true;
- return NULL;
- }
+ /* Ask the outer loop to call back in. */
+ *spilled = true;
}
- free(temp_registers);
+ ralloc_free(g);
return NULL;
}
+ struct qpu_reg *temp_registers = calloc(c->num_temps,
+ sizeof(*temp_registers));
+
for (uint32_t i = 0; i < c->num_temps; i++) {
int ra_reg = ra_get_node_reg(g, temp_to_node[i]);
if (ra_reg < PHYS_INDEX) {
@@ -591,17 +689,5 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
ralloc_free(g);
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- c->spills);
-
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- c->fills);
- }
-
return temp_registers;
}
diff --git a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c
index c66bb84b3..e6461ff94 100644
--- a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c
+++ b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c
@@ -76,7 +76,7 @@ v3d_qpu_nop(void)
static struct qinst *
vir_nop(void)
{
- struct qreg undef = { QFILE_NULL, 0 };
+ struct qreg undef = vir_nop_reg();
struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
return qinst;
@@ -92,16 +92,6 @@ new_qpu_nop_before(struct qinst *inst)
return q;
}
-static void
-new_ldunif_instr(struct qinst *inst, int i)
-{
- struct qinst *ldunif = new_qpu_nop_before(inst);
-
- ldunif->qpu.sig.ldunif = true;
- assert(inst->src[i].file == QFILE_UNIF);
- ldunif->uniform = inst->src[i].index;
-}
-
/**
* Allocates the src register (accumulator or register file) into the RADDR
* fields of the instruction.
@@ -214,16 +204,11 @@ v3d_generate_code_block(struct v3d_compile *c,
struct qinst *temp;
- if (vir_has_implicit_uniform(qinst)) {
- int src = vir_get_implicit_uniform_src(qinst);
- assert(qinst->src[src].file == QFILE_UNIF);
- qinst->uniform = qinst->src[src].index;
+ if (vir_has_uniform(qinst))
c->num_uniforms++;
- }
- int nsrc = vir_get_non_sideband_nsrc(qinst);
+ int nsrc = vir_get_nsrc(qinst);
struct qpu_reg src[ARRAY_SIZE(qinst->src)];
- bool emitted_ldunif = false;
for (int i = 0; i < nsrc; i++) {
int index = qinst->src[i].index;
switch (qinst->src[i].file) {
@@ -240,19 +225,6 @@ v3d_generate_code_block(struct v3d_compile *c,
case QFILE_TEMP:
src[i] = temp_registers[index];
break;
- case QFILE_UNIF:
- /* XXX perf: If the last ldunif we emitted was
- * the same uniform value, skip it. Common
- * for multop/umul24 sequences.
- */
- if (!emitted_ldunif) {
- new_ldunif_instr(qinst, i);
- c->num_uniforms++;
- emitted_ldunif = true;
- }
-
- src[i] = qpu_acc(5);
- break;
case QFILE_SMALL_IMM:
src[i].smimm = true;
break;
@@ -268,10 +240,6 @@ v3d_generate_code_block(struct v3d_compile *c,
src[i] = qpu_acc(3);
break;
-
- case QFILE_TLB:
- case QFILE_TLBU:
- unreachable("bad vir src file");
}
}
@@ -297,15 +265,6 @@ v3d_generate_code_block(struct v3d_compile *c,
dst = qpu_magic(V3D_QPU_WADDR_VPM);
break;
- case QFILE_TLB:
- dst = qpu_magic(V3D_QPU_WADDR_TLB);
- break;
-
- case QFILE_TLBU:
- dst = qpu_magic(V3D_QPU_WADDR_TLBU);
- break;
-
- case QFILE_UNIF:
case QFILE_SMALL_IMM:
case QFILE_LOAD_IMM:
assert(!"not reached");
@@ -313,7 +272,20 @@ v3d_generate_code_block(struct v3d_compile *c,
}
if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
- if (v3d_qpu_sig_writes_address(c->devinfo,
+ if (qinst->qpu.sig.ldunif) {
+ assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
+ assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
+ if (!dst.magic ||
+ dst.index != V3D_QPU_WADDR_R5) {
+ assert(c->devinfo->ver >= 40);
+
+ qinst->qpu.sig.ldunif = false;
+ qinst->qpu.sig.ldunifrf = true;
+ qinst->qpu.sig_addr = dst.index;
+ qinst->qpu.sig_magic = dst.magic;
+ }
+ } else if (v3d_qpu_sig_writes_address(c->devinfo,
&qinst->qpu.sig)) {
assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
@@ -361,11 +333,12 @@ static bool
reads_uniform(const struct v3d_device_info *devinfo, uint64_t instruction)
{
struct v3d_qpu_instr qpu;
- MAYBE_UNUSED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu);
+ ASSERTED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu);
assert(ok);
if (qpu.sig.ldunif ||
- qpu.sig.ldunifarf ||
+ qpu.sig.ldunifrf ||
+ qpu.sig.ldtlbu ||
qpu.sig.wrtmuc) {
return true;
}
@@ -433,7 +406,7 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
vir_for_each_block(block, c)
v3d_generate_code_block(c, block, temp_registers);
- uint32_t cycles = v3d_qpu_schedule_instructions(c);
+ v3d_qpu_schedule_instructions(c);
c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count);
int i = 0;
@@ -450,23 +423,6 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
}
assert(i == c->qpu_inst_count);
- if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- c->qpu_inst_count);
- }
-
- /* The QPU cycle estimates are pretty broken (see waddr_latency()), so
- * don't report them for now.
- */
- if (false) {
- fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
- vir_get_stage_name(c),
- c->program_id, c->variant_id,
- cycles);
- }
-
if (V3D_DEBUG & (V3D_DEBUG_QPU |
v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
v3d_dump_qpu(c);
diff --git a/lib/mesa/src/broadcom/drm-shim/README.md b/lib/mesa/src/broadcom/drm-shim/README.md
new file mode 100644
index 000000000..dde21c1b8
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/README.md
@@ -0,0 +1,17 @@
+### v3d backend
+
+This implements some of v3d using the closed source v3dv3 tree's
+C/C++-based simulator. All execution is synchronous.
+
+Export: `MESA_LOADER_DRIVER_OVERRIDE=v3d
+LD_PRELOAD=$prefix/lib/libv3d_drm_shim.so`. The v3dv3 version exposed
+will depend on the v3dv3 build -- 3.3, 4.1, and 4.2 are supported.
+
+### v3d_noop backend
+
+This implements the minimum of v3d in order to make shader-db work.
+The submit ioctl is stubbed out to not execute anything.
+
+Export `MESA_LOADER_DRIVER_OVERRIDE=v3d
+LD_PRELOAD=$prefix/lib/libv3d_noop_drm_shim.so`. This will be a V3D
+4.2 device.
diff --git a/lib/mesa/src/broadcom/drm-shim/meson.build b/lib/mesa/src/broadcom/drm-shim/meson.build
new file mode 100644
index 000000000..4fcc594ad
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/meson.build
@@ -0,0 +1,62 @@
+# Copyright © 2019 Broadcom
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libv3d_noop_drm_shim = shared_library(
+ ['v3d_noop_drm_shim'],
+ 'v3d_noop.c',
+ include_directories: inc_common,
+ dependencies: dep_drm_shim,
+ c_args : c_vis_args,
+ install : true,
+)
+
+dep_v3dv3 = dependency('v3dv3', required: false)
+if dep_v3dv3.found()
+ v3dv3_c_args = '-DUSE_V3D_SIMULATOR'
+
+ inc_gallium_v3d = include_directories('../../gallium/drivers/v3d')
+
+ per_version_libs = []
+ foreach ver : v3d_versions
+ per_version_libs += static_library(
+ 'libv3d_drm_shim-v' + ver,
+ [
+ 'v3dx.c',
+ v3d_xml_pack
+ ],
+ include_directories : [inc_common, inc_broadcom, inc_src, inc_gallium_v3d],
+ c_args : [c_vis_args, no_override_init_args, '-DV3D_VERSION=' + ver, v3dv3_c_args],
+ dependencies: [dep_valgrind, dep_thread, dep_v3dv3],
+ )
+ endforeach
+
+ libv3d_drm_shim = shared_library(
+ ['v3d_drm_shim'],
+ [
+ 'v3d.c',
+ '../../gallium/drivers/v3d/v3d_simulator_wrapper.cpp',
+ ],
+ dependencies: [idep_mesautil, dep_dl, dep_drm_shim, dep_v3dv3],
+ link_with: per_version_libs,
+ include_directories : [inc_common, inc_broadcom, inc_gallium_v3d],
+ c_args : [c_vis_args, no_override_init_args, '-std=gnu99', v3dv3_c_args],
+ cpp_args : [v3dv3_c_args]
+ )
+endif
diff --git a/lib/mesa/src/broadcom/drm-shim/v3d.c b/lib/mesa/src/broadcom/drm-shim/v3d.c
new file mode 100644
index 000000000..e75657f59
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3d.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include "drm-uapi/v3d_drm.h"
+#include "drm-shim/drm_shim.h"
+#include "v3d.h"
+#include "v3d_simulator_wrapper.h"
+
+static struct v3d_device_info devinfo;
+struct v3d_shim_device v3d = {
+ .devinfo = &devinfo
+};
+
+struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle)
+{
+ return v3d_bo(drm_shim_bo_lookup(shim_fd, handle));
+}
+
+int
+v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg)
+{
+ /* No need to wait on anything yet, given that we submit
+ * synchronously.
+ */
+ return 0;
+}
+
+int
+v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_mmap_bo *map = arg;
+ struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle);
+
+ map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo);
+
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+int
+v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_get_bo_offset *get = arg;
+ struct v3d_bo *bo = v3d_bo_lookup(shim_fd, get->handle);
+
+ get->offset = bo->offset;
+
+ drm_shim_bo_put(&bo->base);
+
+ return 0;
+}
+
+void
+drm_shim_driver_init(void)
+{
+ shim_device.driver_name = "v3d";
+
+ drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n"
+ "OF_COMPATIBLE_N=1\n"
+ "OF_COMPATIBLE_0=brcm,7278-v3d\n",
+ "/sys/dev/char/%d:%d/device/uevent",
+ DRM_MAJOR, render_node_minor);
+
+ v3d.hw = v3d_hw_auto_new(NULL);
+ v3d.devinfo->ver = v3d_hw_get_version(v3d.hw);
+
+ if (v3d.devinfo->ver >= 42)
+ v3d42_drm_shim_driver_init();
+ else if (v3d.devinfo->ver >= 41)
+ v3d41_drm_shim_driver_init();
+ else
+ v3d33_drm_shim_driver_init();
+}
diff --git a/lib/mesa/src/broadcom/drm-shim/v3d.h b/lib/mesa/src/broadcom/drm-shim/v3d.h
new file mode 100644
index 000000000..0712b8b3f
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3d.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef DRM_SHIM_V3D_H
+#define DRM_SHIM_V3D_H
+
+#include "broadcom/common/v3d_device_info.h"
+#include "util/vma.h"
+
+struct drm_shim_fd;
+
+struct v3d_shim_device {
+ struct v3d_hw *hw;
+ struct v3d_device_info *devinfo;
+
+ /* Base virtual address of the heap. */
+ void *mem;
+ /* Base hardware address of the heap. */
+ uint32_t mem_base;
+ /* Size of the heap. */
+ size_t mem_size;
+
+ /* Allocator for the GPU virtual addresses. */
+ struct util_vma_heap heap;
+};
+extern struct v3d_shim_device v3d;
+
+struct v3d_bo {
+ struct shim_bo base;
+ uint64_t offset;
+ void *sim_vaddr;
+ void *gem_vaddr;
+};
+
+static inline struct v3d_bo *
+v3d_bo(struct shim_bo *bo)
+{
+ return (struct v3d_bo *)bo;
+}
+
+struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle);
+int v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg);
+int v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg);
+int v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg);
+
+void v3d33_drm_shim_driver_init(void);
+void v3d41_drm_shim_driver_init(void);
+void v3d42_drm_shim_driver_init(void);
+
+#endif /* DRM_SHIM_V3D_H */
diff --git a/lib/mesa/src/broadcom/drm-shim/v3d_noop.c b/lib/mesa/src/broadcom/drm-shim/v3d_noop.c
new file mode 100644
index 000000000..7c7d75128
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3d_noop.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include "drm-uapi/v3d_drm.h"
+#include "drm-shim/drm_shim.h"
+
+struct v3d_bo {
+ struct shim_bo base;
+ uint32_t offset;
+};
+
+static struct v3d_bo *
+v3d_bo(struct shim_bo *bo)
+{
+ return (struct v3d_bo *)bo;
+}
+
+struct v3d_device {
+ uint32_t next_offset;
+};
+
+static struct v3d_device v3d = {
+ .next_offset = 0x1000,
+};
+
+static int
+v3d_ioctl_noop(int fd, unsigned long request, void *arg)
+{
+ return 0;
+}
+
+static int
+v3d_ioctl_create_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_create_bo *create = arg;
+ struct v3d_bo *bo = calloc(1, sizeof(*bo));
+
+ drm_shim_bo_init(&bo->base, create->size);
+
+ assert(UINT_MAX - v3d.next_offset > create->size);
+ bo->offset = v3d.next_offset;
+ v3d.next_offset += create->size;
+
+ create->offset = bo->offset;
+ create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base);
+
+ drm_shim_bo_put(&bo->base);
+
+ return 0;
+}
+
+static int
+v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_get_bo_offset *args = arg;
+ struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, args->handle);
+
+ args->offset = v3d_bo(bo)->offset;
+
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+static int
+v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_mmap_bo *map = arg;
+ struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle);
+
+ map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo);
+
+ drm_shim_bo_put(bo);
+
+ return 0;
+}
+
+static int
+v3d_ioctl_get_param(int fd, unsigned long request, void *arg)
+{
+ struct drm_v3d_get_param *gp = arg;
+ static const uint32_t v3d42_reg_map[] = {
+ [DRM_V3D_PARAM_V3D_UIFCFG] = 0x00000045,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT1] = 0x000e1124,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT2] = 0x00000100,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT3] = 0x00000e00,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = 0x04443356,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = 0x81001422,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = 0x40078121,
+ };
+
+ switch (gp->param) {
+ case DRM_V3D_PARAM_SUPPORTS_TFU:
+ gp->value = 1;
+ return 0;
+ default:
+ break;
+ }
+
+ if (gp->param < ARRAY_SIZE(v3d42_reg_map) && v3d42_reg_map[gp->param]) {
+ gp->value = v3d42_reg_map[gp->param];
+ return 0;
+ }
+
+ fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param);
+ return -1;
+}
+
+static ioctl_fn_t driver_ioctls[] = {
+ [DRM_V3D_SUBMIT_CL] = v3d_ioctl_noop,
+ [DRM_V3D_SUBMIT_TFU] = v3d_ioctl_noop,
+ [DRM_V3D_WAIT_BO] = v3d_ioctl_noop,
+ [DRM_V3D_CREATE_BO] = v3d_ioctl_create_bo,
+ [DRM_V3D_GET_PARAM] = v3d_ioctl_get_param,
+ [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset,
+ [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo,
+};
+
+void
+drm_shim_driver_init(void)
+{
+ shim_device.driver_name = "v3d";
+ shim_device.driver_ioctls = driver_ioctls;
+ shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls);
+
+ drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n"
+ "OF_COMPATIBLE_N=1\n"
+ "OF_COMPATIBLE_0=brcm,7278-v3d\n",
+ "/sys/dev/char/%d:%d/device/uevent",
+ DRM_MAJOR, render_node_minor);
+}
diff --git a/lib/mesa/src/broadcom/drm-shim/v3dx.c b/lib/mesa/src/broadcom/drm-shim/v3dx.c
new file mode 100644
index 000000000..a22550a03
--- /dev/null
+++ b/lib/mesa/src/broadcom/drm-shim/v3dx.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* @file
+ *
+ * v3d driver code interacting with the v3dv3 simulator/FPGA library.
+ *
+ * This is compiled per V3D version we support, since the register definitions
+ * conflict.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "util/macros.h"
+#include "util/u_mm.h"
+#include "broadcom/common/v3d_macros.h"
+#include "v3d_simulator_wrapper.h"
+#include "drm-shim/drm_shim.h"
+#include "drm-uapi/v3d_drm.h"
+#include "v3d.h"
+
+#define HW_REGISTER_RO(x) (x)
+#define HW_REGISTER_RW(x) (x)
+#if V3D_VERSION >= 41
+#include "libs/core/v3d/registers/4.1.34.0/v3d.h"
+#else
+#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
+#endif
+
+#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d.hw, reg, val)
+#define V3D_READ(reg) v3d_hw_read_reg(v3d.hw, reg)
+
+static void
+v3d_flush_l3()
+{
+ if (!v3d_hw_has_gca(v3d.hw))
+ return;
+
+#if V3D_VERSION < 40
+ uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);
+
+ V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
+ V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
+#endif
+}
+
+/* Invalidates the L2 cache. This is a read-only cache. */
+static void
+v3d_flush_l2(void)
+{
+ V3D_WRITE(V3D_CTL_0_L2CACTL,
+ V3D_CTL_0_L2CACTL_L2CCLR_SET |
+ V3D_CTL_0_L2CACTL_L2CENA_SET);
+}
+
+/* Invalidates texture L2 cachelines */
+static void
+v3d_flush_l2t(void)
+{
+ V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
+ V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
+ V3D_WRITE(V3D_CTL_0_L2TCACTL,
+ V3D_CTL_0_L2TCACTL_L2TFLS_SET |
+ (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));
+}
+
+/* Invalidates the slice caches. These are read-only caches. */
+static void
+v3d_flush_slices(void)
+{
+ V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
+}
+
+static void
+v3d_flush_caches(void)
+{
+ v3d_flush_l3();
+ v3d_flush_l2();
+ v3d_flush_l2t();
+ v3d_flush_slices();
+}
+
+static void
+v3d_simulator_copy_in_handle(struct shim_fd *shim_fd, int handle)
+{
+ if (!handle)
+ return;
+
+ struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle);
+
+ memcpy(bo->sim_vaddr, bo->gem_vaddr, bo->base.size);
+}
+
+static void
+v3d_simulator_copy_out_handle(struct shim_fd *shim_fd, int handle)
+{
+ if (!handle)
+ return;
+
+ struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle);
+
+ memcpy(bo->gem_vaddr, bo->sim_vaddr, bo->base.size);
+}
+
+static int
+v3dX(v3d_ioctl_submit_cl)(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_submit_cl *submit = arg;
+ uint32_t *bo_handles = (uint32_t *)(uintptr_t)submit->bo_handles;
+
+ for (int i = 0; i < submit->bo_handle_count; i++)
+ v3d_simulator_copy_in_handle(shim_fd, bo_handles[i]);
+
+ v3d_flush_caches();
+
+ if (submit->qma) {
+ V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
+ V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
+ }
+#if V3D_VERSION >= 41
+ if (submit->qts) {
+ V3D_WRITE(V3D_CLE_0_CT0QTS,
+ V3D_CLE_0_CT0QTS_CTQTSEN_SET |
+ submit->qts);
+ }
+#endif
+
+ fprintf(stderr, "submit %x..%x!\n", submit->bcl_start, submit->bcl_end);
+
+ V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
+ V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);
+
+ /* Wait for bin to complete before firing render, as it seems the
+ * simulator doesn't implement the semaphores.
+ */
+ while (V3D_READ(V3D_CLE_0_CT0CA) !=
+ V3D_READ(V3D_CLE_0_CT0EA)) {
+ v3d_hw_tick(v3d.hw);
+ }
+
+ fprintf(stderr, "submit %x..%x!\n", submit->rcl_start, submit->rcl_end);
+
+ v3d_flush_caches();
+
+ V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start);
+ V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end);
+
+ while (V3D_READ(V3D_CLE_0_CT1CA) !=
+ V3D_READ(V3D_CLE_0_CT1EA)) {
+ v3d_hw_tick(v3d.hw);
+ }
+
+ for (int i = 0; i < submit->bo_handle_count; i++)
+ v3d_simulator_copy_out_handle(shim_fd, bo_handles[i]);
+
+ return 0;
+}
+
+static int
+v3dX(v3d_ioctl_submit_tfu)(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_submit_tfu *submit = arg;
+
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[0]);
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[1]);
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[2]);
+ v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[3]);
+
+ int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
+
+ V3D_WRITE(V3D_TFU_IIA, submit->iia);
+ V3D_WRITE(V3D_TFU_IIS, submit->iis);
+ V3D_WRITE(V3D_TFU_ICA, submit->ica);
+ V3D_WRITE(V3D_TFU_IUA, submit->iua);
+ V3D_WRITE(V3D_TFU_IOA, submit->ioa);
+ V3D_WRITE(V3D_TFU_IOS, submit->ios);
+ V3D_WRITE(V3D_TFU_COEF0, submit->coef[0]);
+ V3D_WRITE(V3D_TFU_COEF1, submit->coef[1]);
+ V3D_WRITE(V3D_TFU_COEF2, submit->coef[2]);
+ V3D_WRITE(V3D_TFU_COEF3, submit->coef[3]);
+
+ V3D_WRITE(V3D_TFU_ICFG, submit->icfg);
+
+ while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+ v3d_hw_tick(v3d.hw);
+ }
+
+ v3d_simulator_copy_out_handle(shim_fd, submit->bo_handles[0]);
+
+ return 0;
+}
+
+static int
+v3dX(v3d_ioctl_create_bo)(int fd, unsigned long request, void *arg)
+{
+ struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
+ struct drm_v3d_create_bo *create = arg;
+ struct v3d_bo *bo = calloc(1, sizeof(*bo));
+
+ drm_shim_bo_init(&bo->base, create->size);
+ bo->offset = util_vma_heap_alloc(&v3d.heap, create->size, 4096);
+ if (bo->offset == 0)
+ return -ENOMEM;
+
+ bo->sim_vaddr = v3d.mem + bo->offset - v3d.mem_base;
+#if 0
+ /* Place a mapping of the BO inside of the simulator's address space
+ * for V3D memory. This lets us avoid copy in/out for simpenrose, but
+ * I'm betting we'll need something else for FPGA.
+ */
+ void *sim_addr = v3d.mem + bo->block->ofs;
+ void *mmap_ret = mmap(sim_addr, create->size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, bo->base.fd, 0);
+ assert(mmap_ret == sim_addr);
+#else
+ /* Make a simulator-private mapping of the shim GEM object. */
+ bo->gem_vaddr = mmap(NULL, bo->base.size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ bo->base.fd, 0);
+ if (bo->gem_vaddr == MAP_FAILED) {
+ fprintf(stderr, "v3d: mmap of shim bo failed\n");
+ abort();
+ }
+#endif
+
+ create->offset = bo->offset;
+ create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base);
+
+ drm_shim_bo_put(&bo->base);
+
+ return 0;
+}
+
+static int
+v3dX(v3d_ioctl_get_param)(int fd, unsigned long request, void *arg)
+{
+ struct drm_v3d_get_param *gp = arg;
+ static const uint32_t reg_map[] = {
+ [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
+ [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
+ [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
+ };
+
+ switch (gp->param) {
+ case DRM_V3D_PARAM_SUPPORTS_TFU:
+ gp->value = 1;
+ return 0;
+ }
+
+ if (gp->param < ARRAY_SIZE(reg_map) && reg_map[gp->param]) {
+ gp->value = V3D_READ(reg_map[gp->param]);
+ return 0;
+ }
+
+ fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param);
+ return -1;
+}
+
+static ioctl_fn_t driver_ioctls[] = {
+ [DRM_V3D_SUBMIT_CL] = v3dX(v3d_ioctl_submit_cl),
+ [DRM_V3D_SUBMIT_TFU] = v3dX(v3d_ioctl_submit_tfu),
+ [DRM_V3D_WAIT_BO] = v3d_ioctl_wait_bo,
+ [DRM_V3D_CREATE_BO] = v3dX(v3d_ioctl_create_bo),
+ [DRM_V3D_GET_PARAM] = v3dX(v3d_ioctl_get_param),
+ [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo,
+ [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset,
+};
+
+static void
+v3d_isr(uint32_t hub_status)
+{
+ /* Check the per-core bits */
+ if (hub_status & (1 << 0)) {
+ uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS);
+
+ if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
+ fprintf(stderr, "GMP violation at 0x%08x\n",
+ V3D_READ(V3D_GMP_0_VIO_ADDR));
+ abort();
+ } else {
+ fprintf(stderr,
+ "Unexpected ISR with core status 0x%08x\n",
+ core_status);
+ }
+ abort();
+ }
+
+ return;
+}
+
+static void
+v3dX(simulator_init_regs)(void)
+{
+#if V3D_VERSION == 33
+ /* Set OVRTMUOUT to match kernel behavior.
+ *
+ * This means that the texture sampler uniform configuration's tmu
+ * output type field is used, instead of using the hardware default
+ * behavior based on the texture type. If you want the default
+ * behavior, you can still put "2" in the indirect texture state's
+ * output_type field.
+ */
+ V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
+#endif
+
+ uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_GMPV_SET;
+ V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
+ V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
+
+ v3d_hw_set_isr(v3d.hw, v3d_isr);
+}
+
+static void
+v3d_bo_free(struct shim_bo *shim_bo)
+{
+ struct v3d_bo *bo = v3d_bo(shim_bo);
+
+ if (bo->gem_vaddr)
+ munmap(bo->gem_vaddr, shim_bo->size);
+
+ util_vma_heap_free(&v3d.heap, bo->offset, bo->base.size);
+}
+
+void
+v3dX(drm_shim_driver_init)(void)
+{
+ shim_device.driver_ioctls = driver_ioctls;
+ shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls);
+
+ shim_device.driver_bo_free = v3d_bo_free;
+
+ /* Allocate a gig of memory to play in. */
+ v3d_hw_alloc_mem(v3d.hw, 1024 * 1024 * 1024);
+ v3d.mem_base =
+ v3d_hw_get_mem(v3d.hw, &v3d.mem_size,
+ &v3d.mem);
+ util_vma_heap_init(&v3d.heap, 4096, v3d.mem_size - 4096);
+
+ v3dX(simulator_init_regs)();
+}
diff --git a/lib/mesa/src/broadcom/meson.build b/lib/mesa/src/broadcom/meson.build
index d3ea362f2..57f0d889b 100644
--- a/lib/mesa/src/broadcom/meson.build
+++ b/lib/mesa/src/broadcom/meson.build
@@ -30,6 +30,10 @@ if with_gallium_v3d
subdir('qpu')
endif
+if with_tools.contains('drm-shim')
+ subdir('drm-shim')
+endif
+
per_version_libs = []
foreach ver : v3d_versions
per_version_libs += static_library(
@@ -47,7 +51,7 @@ endforeach
libbroadcom_v3d = static_library(
'libbroadcom_v3d',
[
- files('common/v3d_debug.c', 'clif/clif_dump.c'),
+ files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c'),
v3d_xml_pack,
],
include_directories : [inc_common, inc_broadcom, inc_src],
diff --git a/lib/mesa/src/broadcom/qpu/meson.build b/lib/mesa/src/broadcom/qpu/meson.build
index 279b09cb9..c9cf7b9e9 100644
--- a/lib/mesa/src/broadcom/qpu/meson.build
+++ b/lib/mesa/src/broadcom/qpu/meson.build
@@ -39,7 +39,8 @@ test(
'qpu_disasm',
executable(
'qpu_disasm', 'tests/qpu_disasm.c',
- link_with: [libbroadcom_qpu, libmesa_util],
+ link_with: libbroadcom_qpu,
+ dependencies : idep_mesautil,
include_directories: inc_common
),
suite : ['broadcom'],
diff --git a/lib/mesa/src/broadcom/qpu/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/qpu_disasm.c
index 32e7ba12a..9f59bcdf7 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_disasm.c
+++ b/lib/mesa/src/broadcom/qpu/qpu_disasm.c
@@ -64,7 +64,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
} else if (mux == V3D_QPU_MUX_B) {
if (instr->sig.small_imm) {
uint32_t val;
- MAYBE_UNUSED bool ok =
+ ASSERTED bool ok =
v3d_qpu_small_imm_unpack(disasm->devinfo,
instr->raddr_b,
&val);
@@ -205,6 +205,8 @@ v3d_qpu_disasm_sig(struct disasm_state *disasm,
!sig->ldvary &&
!sig->ldvpm &&
!sig->ldtmu &&
+ !sig->ldtlb &&
+ !sig->ldtlbu &&
!sig->ldunif &&
!sig->ldunifrf &&
!sig->ldunifa &&
diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.c b/lib/mesa/src/broadcom/qpu/qpu_instr.c
index add2d2a23..09d06b3fa 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_instr.c
+++ b/lib/mesa/src/broadcom/qpu/qpu_instr.c
@@ -645,19 +645,10 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
bool
v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
{
- if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
- switch (inst->alu.add.op) {
- case V3D_QPU_A_RECIP:
- case V3D_QPU_A_RSQRT:
- case V3D_QPU_A_EXP:
- case V3D_QPU_A_LOG:
- case V3D_QPU_A_SIN:
- case V3D_QPU_A_RSQRT2:
- return true;
- default:
- break;
- }
+ if (v3d_qpu_instr_is_sfu(inst))
+ return true;
+ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.magic_write &&
v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) {
return true;
@@ -673,6 +664,25 @@ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
}
bool
+v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_RECIP:
+ case V3D_QPU_A_RSQRT:
+ case V3D_QPU_A_EXP:
+ case V3D_QPU_A_LOG:
+ case V3D_QPU_A_SIN:
+ case V3D_QPU_A_RSQRT2:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+bool
v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
{
return (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
@@ -683,6 +693,16 @@ v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
}
bool
+v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst)
+{
+ return v3d_qpu_writes_tmu(inst) &&
+ (!inst->alu.add.magic_write ||
+ inst->alu.add.waddr != V3D_QPU_WADDR_TMUC) &&
+ (!inst->alu.mul.magic_write ||
+ inst->alu.mul.waddr != V3D_QPU_WADDR_TMUC);
+}
+
+bool
v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst)
{
if (inst->sig.ldvpm)
@@ -751,9 +771,6 @@ bool
v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *inst)
{
- if (inst->sig.ldtmu)
- return true;
-
if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
if (inst->alu.add.magic_write &&
(inst->alu.add.waddr == V3D_QPU_WADDR_R4 ||
@@ -768,8 +785,10 @@ v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
}
}
- if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
- inst->sig_magic && inst->sig_addr == V3D_QPU_WADDR_R4) {
+ if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
+ if (inst->sig_magic && inst->sig_addr == V3D_QPU_WADDR_R4)
+ return true;
+ } else if (inst->sig.ldtmu) {
return true;
}
@@ -867,3 +886,70 @@ v3d_qpu_writes_flags(const struct v3d_qpu_instr *inst)
return false;
}
+
+bool
+v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_FADD:
+ case V3D_QPU_A_FADDNF:
+ case V3D_QPU_A_FSUB:
+ case V3D_QPU_A_FMIN:
+ case V3D_QPU_A_FMAX:
+ case V3D_QPU_A_FCMP:
+ case V3D_QPU_A_FROUND:
+ case V3D_QPU_A_FTRUNC:
+ case V3D_QPU_A_FFLOOR:
+ case V3D_QPU_A_FCEIL:
+ case V3D_QPU_A_FDX:
+ case V3D_QPU_A_FDY:
+ case V3D_QPU_A_FTOIN:
+ case V3D_QPU_A_FTOIZ:
+ case V3D_QPU_A_FTOUZ:
+ case V3D_QPU_A_FTOC:
+ case V3D_QPU_A_VFPACK:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ switch (inst->alu.mul.op) {
+ case V3D_QPU_M_FMOV:
+ case V3D_QPU_M_FMUL:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+bool
+v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst)
+{
+ if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+ return false;
+
+ switch (inst->alu.add.op) {
+ case V3D_QPU_A_VFMIN:
+ case V3D_QPU_A_VFMAX:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ switch (inst->alu.mul.op) {
+ case V3D_QPU_M_VFMUL:
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.h b/lib/mesa/src/broadcom/qpu/qpu_instr.h
index 1e2dcb78a..ad2d37b60 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_instr.h
+++ b/lib/mesa/src/broadcom/qpu/qpu_instr.h
@@ -447,8 +447,10 @@ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
@@ -464,5 +466,7 @@ bool v3d_qpu_reads_flags(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_writes_flags(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
bool v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
const struct v3d_qpu_sig *sig) ATTRIBUTE_CONST;
+bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
#endif
diff --git a/lib/mesa/src/broadcom/qpu/qpu_pack.c b/lib/mesa/src/broadcom/qpu/qpu_pack.c
index 70f31d734..516b0cf53 100644
--- a/lib/mesa/src/broadcom/qpu/qpu_pack.c
+++ b/lib/mesa/src/broadcom/qpu/qpu_pack.c
@@ -776,7 +776,11 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
case V3D_QPU_A_FMIN:
case V3D_QPU_A_FMAX:
case V3D_QPU_A_FCMP:
- instr->alu.add.output_pack = (op >> 4) & 0x3;
+ case V3D_QPU_A_VFPACK:
+ if (instr->alu.add.op != V3D_QPU_A_VFPACK)
+ instr->alu.add.output_pack = (op >> 4) & 0x3;
+ else
+ instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
&instr->alu.add.a_unpack)) {
@@ -1042,6 +1046,32 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
opcode |= a_unpack << 2;
opcode |= b_unpack << 0;
+
+ break;
+ }
+
+ case V3D_QPU_A_VFPACK: {
+ uint32_t a_unpack;
+ uint32_t b_unpack;
+
+ if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
+ instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
+ &a_unpack)) {
+ return false;
+ }
+
+ if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
+ &b_unpack)) {
+ return false;
+ }
+
+ opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
+ opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
+
break;
}
@@ -1065,7 +1095,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
}
if (packed == 0)
return false;
- opcode |= packed << 2;
+ opcode = (opcode & ~(1 << 2)) | packed << 2;
break;
}
diff --git a/lib/mesa/src/broadcom/qpu/qpu_validate.c b/lib/mesa/src/broadcom/qpu/qpu_validate.c
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/lib/mesa/src/broadcom/qpu/qpu_validate.c
diff --git a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c
index 2e8d98058..1bc3c9ec6 100644
--- a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c
@@ -48,6 +48,9 @@ static const struct {
{ 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" },
{ 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" },
+ { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" },
+ { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l; fmul.pushn rf46, r3.l, r2.abs" },
+
/* small immediates */
{ 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" },
{ 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5; smul24.ifnb rf15, r1, r3" },