| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:13:18 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2020-01-22 02:13:18 +0000 |
| commit | fdcc03929065b5bf5dd93553db219ea3e05c8c34 (patch) | |
| tree | ca90dc8d9e89febdcd4160956c1b8ec098a4efc9 | /lib/mesa/src/broadcom |
| parent | 3c9de4a7e13712b5696750bbd59a18c848742022 (diff) | |
Import Mesa 19.2.8
Diffstat (limited to 'lib/mesa/src/broadcom')
42 files changed, 3915 insertions, 1388 deletions
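Among the changes below, the import adds common/v3d_device_info.c, which queries the kernel's DRM_V3D_PARAM_V3D_CORE0_IDENT0/IDENT1 parameters and decodes the hardware revision, VPM size, and QPU count from the returned register values. Here is a minimal standalone sketch of that decoding step, using the bit layout from the diffed file; the sample register values are invented for illustration, not real hardware reads.

```c
#include <stdint.h>
#include <stdio.h>

/* Standalone sketch of the IDENT decoding done by v3d_get_device_info()
 * in the new common/v3d_device_info.c. The register values below are
 * hypothetical sample inputs.
 */
int
main(void)
{
        uint32_t ident0 = 4u << 24;      /* major version in bits 31:24 */
        uint32_t ident1 = (4u << 28) |   /* VPM size, in 8 KiB units */
                          (4u << 8)  |   /* QPUs per slice */
                          (2u << 4)  |   /* slice count */
                          (2u << 0);     /* minor version */

        uint32_t major = (ident0 >> 24) & 0xff;
        uint32_t minor = (ident1 >> 0) & 0xf;
        int ver = major * 10 + minor;    /* 42 == V3D 4.2 */

        int vpm_size = ((ident1 >> 28) & 0xf) * 8192;
        int nslc = (ident1 >> 4) & 0xf;  /* slices */
        int qups = (ident1 >> 8) & 0xf;  /* QPUs per slice */
        int qpu_count = nslc * qups;

        printf("V3D %d.%d: VPM %d bytes, %d QPUs\n",
               ver / 10, ver % 10, vpm_size, qpu_count);
        return 0;
}
```

The real function then rejects any decoded version other than 33, 41, or 42 as unsupported by this Mesa release.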
diff --git a/lib/mesa/src/broadcom/.editorconfig b/lib/mesa/src/broadcom/.editorconfig new file mode 100644 index 000000000..f3d8c4791 --- /dev/null +++ b/lib/mesa/src/broadcom/.editorconfig @@ -0,0 +1,3 @@ +[*.{c,h}] +indent_style = space +indent_size = 8 diff --git a/lib/mesa/src/broadcom/Android.cle.mk b/lib/mesa/src/broadcom/Android.cle.mk new file mode 100644 index 000000000..5634a8d4a --- /dev/null +++ b/lib/mesa/src/broadcom/Android.cle.mk @@ -0,0 +1,39 @@ +# Copyright © 2016 Intel Corporation +# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com> +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_broadcom_cle + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES + +LOCAL_SRC_FILES := $(BROADCOM_DECODER_FILES) + +LOCAL_STATIC_LIBRARIES := libmesa_broadcom_genxml + +LOCAL_C_INCLUDES += $(MESA_TOP)/src/gallium/include + +LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH) + +LOCAL_SHARED_LIBRARIES := libexpat libz + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/broadcom/Android.genxml.mk b/lib/mesa/src/broadcom/Android.genxml.mk new file mode 100644 index 000000000..91e0de05d --- /dev/null +++ b/lib/mesa/src/broadcom/Android.genxml.mk @@ -0,0 +1,83 @@ +# Copyright © 2016 Intel Corporation +# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com> +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_broadcom_genxml + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES + +intermediates := $(call local-generated-sources-dir) + +# dummy.c source file is generated to meet the build system's rules. +LOCAL_GENERATED_SOURCES += $(intermediates)/dummy.c + +$(intermediates)/dummy.c: + @mkdir -p $(dir $@) + @echo "Gen Dummy: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) touch $@ + +# This is the list of auto-generated files headers +LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/broadcom/, $(BROADCOM_GENXML_GENERATED_FILES)) + +define pack-header-gen + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(PRIVATE_SCRIPT) $(PRIVATE_SCRIPT_FLAGS) $(PRIVATE_XML) $(PRIVATE_VER) > $@ +endef + +$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py +$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v21.xml +$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_VER := 21 +$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v21.xml $(LOCAL_PATH)/cle/gen_pack_header.py + $(call pack-header-gen) + +$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py +$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml +$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_VER := 33 +$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py + $(call pack-header-gen) + +$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py +$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml +$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_VER := 41 +$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py + $(call pack-header-gen) + +$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py +$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml +$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_VER := 42 +$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py + $(call pack-header-gen) + +$(intermediates)/broadcom/cle/v3d_xml.h: $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py + @mkdir -p $(dir $@) + @echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))" + $(hide) $(MESA_PYTHON2) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) > $@ || (rm -f $@; false) + +LOCAL_EXPORT_C_INCLUDE_DIRS := \ + $(MESA_TOP)/src/broadcom/cle \ + $(intermediates)/broadcom/cle \ + $(intermediates) + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) diff --git a/lib/mesa/src/broadcom/Android.mk b/lib/mesa/src/broadcom/Android.mk new file mode 100644 index 000000000..b3bf40510 --- /dev/null +++ b/lib/mesa/src/broadcom/Android.mk @@ -0,0 +1,29 @@ +# Copyright © 2016 Intel Corporation +# Copyright © 2016 Mauro Rossi <issor.oruam@gmail.com> +# +# Permission is hereby granted, free of charge, to 
any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +# + +LOCAL_PATH := $(call my-dir) + +# Import variables +include $(LOCAL_PATH)/Makefile.sources + +include $(LOCAL_PATH)/Android.genxml.mk +include $(LOCAL_PATH)/Android.cle.mk diff --git a/lib/mesa/src/broadcom/cle/meson.build b/lib/mesa/src/broadcom/cle/meson.build index afaf5a1b4..a2f47625a 100644 --- a/lib/mesa/src/broadcom/cle/meson.build +++ b/lib/mesa/src/broadcom/cle/meson.build @@ -58,6 +58,6 @@ libbroadcom_cle = static_library( 'v3d_decoder.c', include_directories : [inc_common, inc_broadcom], c_args : [c_vis_args, no_override_init_args], - dependencies : [dep_libdrm, dep_valgrind], + dependencies : [dep_libdrm, dep_valgrind, dep_expat, dep_zlib], build_by_default : false, ) diff --git a/lib/mesa/src/broadcom/cle/v3d_decoder.c b/lib/mesa/src/broadcom/cle/v3d_decoder.c index 373a1d996..23ee59fd0 100644 --- a/lib/mesa/src/broadcom/cle/v3d_decoder.c +++ b/lib/mesa/src/broadcom/cle/v3d_decoder.c @@ -651,7 +651,8 @@ v3d_spec_load(const struct v3d_device_info *devinfo) struct parser_context ctx; void *buf; uint8_t *text_data = NULL; - uint32_t text_offset = 0, text_length = 0, total_length; + uint32_t text_offset = 0, text_length = 0; + ASSERTED uint32_t total_length; for (int i = 0; i < ARRAY_SIZE(genxml_files_table); i++) { if (i != 0) { diff --git a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml index 06e8ddad7..f40796612 100644 --- a/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml +++ b/lib/mesa/src/broadcom/cle/v3d_packet_v33.xml @@ -250,6 +250,28 @@ <value name="RGBA" value="3"/> </enum> + <enum name="Pack Mode" prefix="V3D_PACK_MODE"> + <value name="16-way" value="0"/> + <value name="8-way" value="1"/> + <value name="4-way" value="2"/> + </enum> + + <enum name="TCS flush mode" prefix="V3D_TCS_FLUSH_MODE"> + <value name="fully packed" value="0"/> + <value name="single patch" value="1"/> + <value name="packed complete patches" value="2"/> + </enum> + + <enum name="Primitve counters" prefix="V3D_PRIM_COUNTS"> + <value name="tf_words_buffer0" value="0"/> + <value name="tf_words_buffer1" value="1"/> + <value name="tf_words_buffer2" value="2"/> + <value name="tf_words_buffer3" value="3"/> + <value name="written" value="4"/> + <value name="tf_written" value="5"/> + <value name="tf_overflow" value="6"/> + </enum> + <packet code="0" name="Halt"/> <packet code="1" name="NOP"/> <packet code="4" name="Flush"/> @@ -552,6 +574,14 @@ <field name="mode" size="8" start="0" type="Primitive"/> </packet> + <packet code="39" name="Vertex Array Single 
Instance Prims" cl="B"> + <field name="Index of First Vertex" size="32" start="72" type="uint"/> + <field name="Instance ID" size="32" start="40" type="uint"/> + <field name="Instance Length" size="32" start="8" type="uint"/> + + <field name="mode" size="8" start="0" type="Primitive"/> + </packet> + <packet code="43" name="Base Vertex Base Instance" cl="B"> <field name="Base Instance" size="32" start="32" type="uint"/> @@ -563,6 +593,14 @@ <field name="Size" size="32" start="32" type="uint"/> </packet> + <packet code="54" name="Set InstanceID" cl="B" min_ver="41"> + <field name="Instance ID" size="32" start="32" type="uint"/> + </packet> + + <packet code="55" name="Set PrimitiveID" cl="B" min_ver="41"> + <field name="Primitive ID" size="32" start="32" type="uint"/> + </packet> + <packet code="56" name="Prim List Format"> <field name="tri strip or fan" size="1" start="7" type="bool"/> <field name="primitive type" size="6" start="0" type="uint"> @@ -572,16 +610,64 @@ </field> </packet> + <packet code="57" name="Serial Number List Start"> + <field name="address" size="26" start="6" type="address"/> + <field name="block size" size="2" start="0" type="uint"> + <value name="block size 64b" value="0"/> + <value name="block size 128b" value="1"/> + <value name="block size 256b" value="2"/> + </field> + </packet> + <packet code="64" shortname="gl_shader" name="GL Shader State"> <field name="address" size="27" start="5" type="address"/> <field name="number of attribute arrays" size="5" start="0" type="uint"/> </packet> + <packet code="65" shortname="gl_t_shader" name="GL Shader State including TS" min_ver="41"> + <field name="address" size="27" start="5" type="address"/> + <field name="number of attribute arrays" size="5" start="0" type="uint"/> + </packet> + + <packet code="66" shortname="gl_g_shader" name="GL Shader State including GS" min_ver="41"> + <field name="address" size="27" start="5" type="address"/> + <field name="number of attribute arrays" size="5" start="0" type="uint"/> + </packet> + + <packet code="67" shortname="gl_tg_shader" name="GL Shader State including TS/GS" min_ver="41"> + <field name="address" size="27" start="5" type="address"/> + <field name="number of attribute arrays" size="5" start="0" type="uint"/> + </packet> + <packet code="71" name="VCM Cache Size" min_ver="41"> <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/> <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/> </packet> + <packet code="72" shortname="prim_counts_feedback" name="Primitive Counts Feedback"> + <field name="address" size="27" start="5" type="address"/> + <field name="read/write 64byte" size="1" start="4" type="bool"/> + <field name="op" size="4" start="0" type="uint"> + <!-- + dword 0-3 are words written to TFB 0-3. 4 is prims generated, 5 is prims written, 6 is + prims overflowed + --> + <value name="store primitive counts" value="0"/> + <value name="store primitive counts and zero" value="1"/> + <!-- + write 4 pairs of TFB state: remaining TFB space in buffer n, current address in buffer n + --> + <value name="store buffer state" value="2"/> + <value name="store buffer state CL" value="3"/> + <!-- + Waits for buffer state stores to complete, then loads from + the given buffer state. This op can be offset by n to skip + waiting for the last n. 
+ --> + <value name="load buffer state" value="8"/> + </field> + </packet> + <packet code="73" name="VCM Cache Size" max_ver="33"> <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/> <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/> @@ -1200,6 +1286,61 @@ <field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/> </struct> + <struct name="Tessellation/Geometry Common Params" min_ver="41"> + <field name="Tessellation Type" size="2" start="1" type="uint"> + <value name="Tessellation Type Triangle" value="0"/> + <value name="Tessellation Type Quads" value="1"/> + <value name="Tessellation Type Isolines" value="2"/> + </field> + + <field name="Tessellation point mode" size="1" start="3" type="bool"/> + + <field name="Tessellation Edge Spacing" size="2" start="4" type="uint"> + <value name="Tessellation Edge Spacing Even" value="0"/> + <value name="Tessellation Edge Spacing Fractional Even" value="1"/> + <value name="Tessellation Edge Spacing Fractional Odd" value="2"/> + </field> + + <field name="Tessellation clockwise" size="1" start="6" type="bool"/> + + <field name="Tessellation Invocations" size="5" start="12" type="uint"/> <!-- 0 == 32 --> + + <field name="Geometry Shader output format" size="2" start="17" type="uint"> + <value name="Geometry Shader Points" value="0"/> + <value name="Geometry Shader Line Strip" value="1"/> + <value name="Geometry Shader Tri Strip" value="2"/> + </field> + + <field name="Geometry Shader Instances" size="5" start="19" type="uint"/> <!-- 0 == 32 --> + + <!-- followed by "Tessellation/Geometry Shader Params" for bin, then render --> + </struct> + + <struct name="Tessellation/Geometry Shader Params"> + <field name="TCS Batch Flush Mode" size="2" start="0" type="TCS flush mode"/> + <field name="Per-patch data column depth" size="4" start="2" type="uint"/> <!-- 8-dword units, 0==16 --> + + <field name="TCS output segment size in sectors" size="6" start="8" type="uint"/> + <field name="TCS output segment pack mode" size="2" start="14" type="Pack Mode"/> + + <field name="TES output segment size in sectors" size="6" start="16" type="uint"/> + <field name="TES output segment pack mode" size="2" start="22" type="Pack Mode"/> + + <field name="GS output segment size in sectors" size="6" start="24" type="uint"/> + <field name="GS output segment pack mode" size="2" start="30" type="Pack Mode"/> + + <field name="TBG max patches per TCS batch" size="4" start="32" type="uint" minus_one="true"/> + <field name="TBG max extra vertex segs for patches after first" size="2" start="36" type="uint"/> + <field name="TBG min TCS output segments required in play" size="2" start="38" type="uint" minus_one="true"/> + <field name="TBG min per-patch data segments required in play" size="3" start="40" type="uint" minus_one="true"/> + <field name="TPG max patches per TES batch" size="4" start="45" type="uint" minus_one="true"/> + <field name="TPG max vertex segments per TES batch" size="2" start="49" type="uint"/> + <field name="TPG max TCS output segments per TES batch" size="3" start="51" type="uint" minus_one="true"/> + <field name="TPG min TES output segments required in play" size="3" start="54" type="uint" minus_one="true"/> + <field name="GBG max TES output/vertex segments per GS batch" size="2" start="57" type="uint"/> + <field name="GBG max TES output/vertex segments required in play" size="3" start="59" type="uint" minus_one="true"/> + </struct> + <struct 
name="GL Shader State Attribute Record" max_ver="33"> <field name="Address" size="32" start="0" type="address"/> diff --git a/lib/mesa/src/broadcom/common/v3d_device_info.c b/lib/mesa/src/broadcom/common/v3d_device_info.c new file mode 100644 index 000000000..272190eb2 --- /dev/null +++ b/lib/mesa/src/broadcom/common/v3d_device_info.c @@ -0,0 +1,79 @@ +/* + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <errno.h> +#include <stdio.h> +#include <string.h> + +#include "common/v3d_device_info.h" +#include "drm-uapi/v3d_drm.h" + +bool +v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_ioctl) { + struct drm_v3d_get_param ident0 = { + .param = DRM_V3D_PARAM_V3D_CORE0_IDENT0, + }; + struct drm_v3d_get_param ident1 = { + .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1, + }; + int ret; + + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0); + if (ret != 0) { + fprintf(stderr, "Couldn't get V3D core IDENT0: %s\n", + strerror(errno)); + return false; + } + ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident1); + if (ret != 0) { + fprintf(stderr, "Couldn't get V3D core IDENT1: %s\n", + strerror(errno)); + return false; + } + + uint32_t major = (ident0.value >> 24) & 0xff; + uint32_t minor = (ident1.value >> 0) & 0xf; + + devinfo->ver = major * 10 + minor; + + devinfo->vpm_size = (ident1.value >> 28 & 0xf) * 8192; + + int nslc = (ident1.value >> 4) & 0xf; + int qups = (ident1.value >> 8) & 0xf; + devinfo->qpu_count = nslc * qups; + + switch (devinfo->ver) { + case 33: + case 41: + case 42: + break; + default: + fprintf(stderr, + "V3D %d.%d not supported by this version of Mesa.\n", + devinfo->ver / 10, + devinfo->ver % 10); + return false; + } + + return true; +} diff --git a/lib/mesa/src/broadcom/common/v3d_limits.h b/lib/mesa/src/broadcom/common/v3d_limits.h index e21ee246e..776847622 100644 --- a/lib/mesa/src/broadcom/common/v3d_limits.h +++ b/lib/mesa/src/broadcom/common/v3d_limits.h @@ -24,6 +24,11 @@ #ifndef V3D_LIMITS_H #define V3D_LIMITS_H +/* Number of channels a QPU thread executes in parallel. Also known as + * gl_SubGroupSizeARB. 
+ */ +#define V3D_CHANNELS 16 + #define V3D_MAX_FS_INPUTS 64 #define V3D_MAX_VS_INPUTS 64 diff --git a/lib/mesa/src/broadcom/compiler/meson.build b/lib/mesa/src/broadcom/compiler/meson.build index c80918db3..d7af999c3 100644 --- a/lib/mesa/src/broadcom/compiler/meson.build +++ b/lib/mesa/src/broadcom/compiler/meson.build @@ -23,9 +23,9 @@ libbroadcom_compiler_files = files( 'vir.c', 'vir_dump.c', 'vir_live_variables.c', - 'vir_lower_uniforms.c', 'vir_opt_copy_propagate.c', 'vir_opt_dead_code.c', + 'vir_opt_redundant_flags.c', 'vir_opt_small_immediates.c', 'vir_register_allocate.c', 'vir_to_qpu.c', @@ -37,6 +37,8 @@ libbroadcom_compiler_files = files( 'v3d_compiler.h', 'v3d_nir_lower_io.c', 'v3d_nir_lower_image_load_store.c', + 'v3d_nir_lower_logic_ops.c', + 'v3d_nir_lower_scratch.c', 'v3d_nir_lower_txf_ms.c', ) diff --git a/lib/mesa/src/broadcom/compiler/nir_to_vir.c b/lib/mesa/src/broadcom/compiler/nir_to_vir.c index bd19bb9b0..01468fa87 100644 --- a/lib/mesa/src/broadcom/compiler/nir_to_vir.c +++ b/lib/mesa/src/broadcom/compiler/nir_to_vir.c @@ -32,18 +32,15 @@ #include "common/v3d_device_info.h" #include "v3d_compiler.h" +/* We don't do any address packing. */ +#define __gen_user_data void +#define __gen_address_type uint32_t +#define __gen_address_offset(reloc) (*reloc) +#define __gen_emit_reloc(cl, reloc) +#include "cle/v3d_packet_v41_pack.h" + #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) -#define GENERAL_TMU_READ_OP_PREFETCH (0 << 3) -#define GENERAL_TMU_READ_OP_CACHE_CLEAR (1 << 3) -#define GENERAL_TMU_READ_OP_CACHE_FLUSH (3 << 3) -#define GENERAL_TMU_READ_OP_CACHE_CLEAN (3 << 3) -#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR (4 << 3) -#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3) -#define GENERAL_TMU_READ_OP_ATOMIC_INC (8 << 3) -#define GENERAL_TMU_READ_OP_ATOMIC_DEC (9 << 3) -#define GENERAL_TMU_READ_OP_ATOMIC_NOT (10 << 3) -#define GENERAL_TMU_READ_OP_READ (15 << 3) #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0) #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0) #define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0) @@ -53,19 +50,6 @@ #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0) #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0) -#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP (0 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP (1 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG (2 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG (3 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN (4 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX (5 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN (6 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX (7 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_AND (8 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_OR (9 << 3) -#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR (10 << 3) -#define GENERAL_TMU_WRITE_OP_WRITE (15 << 3) - #define V3D_TSY_SET_QUORUM 0 #define V3D_TSY_INC_WAITERS 1 #define V3D_TSY_DEC_WAITERS 2 @@ -122,6 +106,27 @@ vir_emit_thrsw(struct v3d_compile *c) c->last_thrsw = vir_NOP(c); c->last_thrsw->qpu.sig.thrsw = true; c->last_thrsw_at_top_level = !c->in_control_flow; + + /* We need to lock the scoreboard before any tlb acess happens. If this + * thread switch comes after we have emitted a tlb load, then it means + * that we can't lock on the last thread switch any more. 
+ */ + if (c->emitted_tlb_load) + c->lock_scoreboard_on_first_thrsw = true; +} + +uint32_t +v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) +{ + if (nir_src_is_const(instr->src[src])) { + int64_t add_val = nir_src_as_int(instr->src[src]); + if (add_val == 1) + return V3D_TMU_OP_WRITE_AND_READ_INC; + else if (add_val == -1) + return V3D_TMU_OP_WRITE_OR_READ_DEC; + } + + return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH; } static uint32_t @@ -132,40 +137,42 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) case nir_intrinsic_load_ubo: case nir_intrinsic_load_uniform: case nir_intrinsic_load_shared: - return GENERAL_TMU_READ_OP_READ; + case nir_intrinsic_load_scratch: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: - return GENERAL_TMU_WRITE_OP_WRITE; + case nir_intrinsic_store_scratch: + return V3D_TMU_OP_REGULAR; case nir_intrinsic_ssbo_atomic_add: + return v3d_get_op_for_atomic_add(instr, 2); case nir_intrinsic_shared_atomic_add: - return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP; + return v3d_get_op_for_atomic_add(instr, 1); case nir_intrinsic_ssbo_atomic_imin: case nir_intrinsic_shared_atomic_imin: - return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN; + return V3D_TMU_OP_WRITE_SMIN; case nir_intrinsic_ssbo_atomic_umin: case nir_intrinsic_shared_atomic_umin: - return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN; + return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; case nir_intrinsic_ssbo_atomic_imax: case nir_intrinsic_shared_atomic_imax: - return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX; + return V3D_TMU_OP_WRITE_SMAX; case nir_intrinsic_ssbo_atomic_umax: case nir_intrinsic_shared_atomic_umax: - return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX; + return V3D_TMU_OP_WRITE_UMAX; case nir_intrinsic_ssbo_atomic_and: case nir_intrinsic_shared_atomic_and: - return GENERAL_TMU_WRITE_OP_ATOMIC_AND; + return V3D_TMU_OP_WRITE_AND_READ_INC; case nir_intrinsic_ssbo_atomic_or: case nir_intrinsic_shared_atomic_or: - return GENERAL_TMU_WRITE_OP_ATOMIC_OR; + return V3D_TMU_OP_WRITE_OR_READ_DEC; case nir_intrinsic_ssbo_atomic_xor: case nir_intrinsic_shared_atomic_xor: - return GENERAL_TMU_WRITE_OP_ATOMIC_XOR; + return V3D_TMU_OP_WRITE_XOR_READ_NOT; case nir_intrinsic_ssbo_atomic_exchange: case nir_intrinsic_shared_atomic_exchange: - return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG; + return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; case nir_intrinsic_ssbo_atomic_comp_swap: case nir_intrinsic_shared_atomic_comp_swap: - return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG; + return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; default: unreachable("unknown intrinsic op"); } @@ -177,147 +184,217 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr) */ static void ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, - bool is_shared) + bool is_shared_or_scratch) { - /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR - * wants to have support for inc/dec? + uint32_t tmu_op = v3d_general_tmu_op(instr); + + /* If we were able to replace atomic_add for an inc/dec, then we + * need/can to do things slightly different, like not loading the + * amount to add/sub, as that is implicit. 
*/ + bool atomic_add_replaced = + ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || + instr->intrinsic == nir_intrinsic_shared_atomic_add) && + (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || + tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); - uint32_t tmu_op = v3d_general_tmu_op(instr); bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || + instr->intrinsic == nir_intrinsic_store_scratch || instr->intrinsic == nir_intrinsic_store_shared); - bool has_index = !is_shared; + + bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || + instr->intrinsic == nir_intrinsic_load_ubo || + instr->intrinsic == nir_intrinsic_load_ssbo || + instr->intrinsic == nir_intrinsic_load_scratch || + instr->intrinsic == nir_intrinsic_load_shared); + + bool has_index = !is_shared_or_scratch; int offset_src; - int tmu_writes = 1; /* address */ if (instr->intrinsic == nir_intrinsic_load_uniform) { offset_src = 0; } else if (instr->intrinsic == nir_intrinsic_load_ssbo || instr->intrinsic == nir_intrinsic_load_ubo || - instr->intrinsic == nir_intrinsic_load_shared) { + instr->intrinsic == nir_intrinsic_load_scratch || + instr->intrinsic == nir_intrinsic_load_shared || + atomic_add_replaced) { offset_src = 0 + has_index; } else if (is_store) { offset_src = 1 + has_index; - for (int i = 0; i < instr->num_components; i++) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[0], i)); - tmu_writes++; - } } else { offset_src = 0 + has_index; - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[1 + has_index], 0)); - tmu_writes++; - if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) { - vir_MOV_dest(c, - vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - ntq_get_src(c, instr->src[2 + has_index], - 0)); - tmu_writes++; - } } - /* Make sure we won't exceed the 16-entry TMU fifo if each thread is - * storing at the same time. - */ - while (tmu_writes > 16 / c->threads) - c->threads /= 2; + bool dynamic_src = !nir_src_is_const(instr->src[offset_src]); + uint32_t const_offset = 0; + if (!dynamic_src) + const_offset = nir_src_as_uint(instr->src[offset_src]); - struct qreg offset; + struct qreg base_offset; if (instr->intrinsic == nir_intrinsic_load_uniform) { - offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 0); - - /* Find what variable in the default uniform block this - * uniform load is coming from. - */ - uint32_t base = nir_intrinsic_base(instr); - int i; - struct v3d_ubo_range *range = NULL; - for (i = 0; i < c->num_ubo_ranges; i++) { - range = &c->ubo_ranges[i]; - if (base >= range->src_offset && - base < range->src_offset + range->size) { - break; - } - } - /* The driver-location-based offset always has to be within a - * declared uniform range. - */ - assert(i != c->num_ubo_ranges); - if (!c->ubo_range_used[i]) { - c->ubo_range_used[i] = true; - range->dst_offset = c->next_ubo_dst_offset; - c->next_ubo_dst_offset += range->size; - } - - base = base - range->src_offset + range->dst_offset; - - if (base != 0) - offset = vir_ADD(c, offset, vir_uniform_ui(c, base)); + const_offset += nir_intrinsic_base(instr); + base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(0, const_offset)); + const_offset = 0; } else if (instr->intrinsic == nir_intrinsic_load_ubo) { + uint32_t index = nir_src_as_uint(instr->src[0]) + 1; /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by * 1 (0 is gallium's constant buffer 0). 
*/ - offset = vir_uniform(c, QUNIFORM_UBO_ADDR, - nir_src_as_uint(instr->src[0]) + 1); - } else if (is_shared) { - /* Shared variables have no buffer index, and all start from a - * common base that we set up at the start of dispatch + base_offset = + vir_uniform(c, QUNIFORM_UBO_ADDR, + v3d_unit_data_create(index, const_offset)); + const_offset = 0; + } else if (is_shared_or_scratch) { + /* Shared and scratch variables have no buffer index, and all + * start from a common base that we set up at the start of + * dispatch. */ - offset = c->cs_shared_offset; + if (instr->intrinsic == nir_intrinsic_load_scratch || + instr->intrinsic == nir_intrinsic_store_scratch) { + base_offset = c->spill_base; + } else { + base_offset = c->cs_shared_offset; + const_offset += nir_intrinsic_base(instr); + } } else { - offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, - nir_src_as_uint(instr->src[is_store ? - 1 : 0])); + base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, + nir_src_as_uint(instr->src[is_store ? + 1 : 0])); } - uint32_t config = (0xffffff00 | - tmu_op | - GENERAL_TMU_LOOKUP_PER_PIXEL); - if (instr->num_components == 1) { - config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; - } else { - config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 + - instr->num_components - 2); - } + struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); + unsigned writemask = is_store ? nir_intrinsic_write_mask(instr) : 0; + uint32_t base_const_offset = const_offset; + int first_component = -1; + int last_component = -1; + do { + int tmu_writes = 1; /* address */ - if (c->execute.file != QFILE_NULL) - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + if (is_store) { + /* Find the first set of consecutive components that + * are enabled in the writemask and emit the TMUD + * instructions for them. + */ + first_component = ffs(writemask) - 1; + last_component = first_component; + while (writemask & BITFIELD_BIT(last_component + 1)) + last_component++; + + assert(first_component >= 0 && + first_component <= last_component && + last_component < instr->num_components); + + struct qreg tmud = vir_reg(QFILE_MAGIC, + V3D_QPU_WADDR_TMUD); + for (int i = first_component; i <= last_component; i++) { + struct qreg data = + ntq_get_src(c, instr->src[0], i); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + } - struct qreg dest; - if (config == ~0) - dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); - else - dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + /* Update the offset for the TMU write based on the + * the first component we are writing. + */ + const_offset = base_const_offset + first_component * 4; + + /* Clear these components from the writemask */ + uint32_t written_mask = + BITFIELD_RANGE(first_component, tmu_writes - 1); + writemask &= ~written_mask; + } else if (!is_load && !atomic_add_replaced) { + struct qreg data = + ntq_get_src(c, instr->src[1 + has_index], 0); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { + data = ntq_get_src(c, instr->src[2 + has_index], + 0); + vir_MOV_dest(c, tmud, data); + tmu_writes++; + } + } - struct qinst *tmu; - if (nir_src_is_const(instr->src[offset_src]) && - nir_src_as_uint(instr->src[offset_src]) == 0) { - tmu = vir_MOV_dest(c, dest, offset); - } else { - tmu = vir_ADD_dest(c, dest, - offset, - ntq_get_src(c, instr->src[offset_src], 0)); - } + /* Make sure we won't exceed the 16-entry TMU fifo if each + * thread is storing at the same time. 
+ */ + while (tmu_writes > 16 / c->threads) + c->threads /= 2; - if (config != ~0) { - tmu->src[vir_get_implicit_uniform_src(tmu)] = - vir_uniform_ui(c, config); - } + /* The spec says that for atomics, the TYPE field is ignored, + * but that doesn't seem to be the case for CMPXCHG. Just use + * the number of tmud writes we did to decide the type (or + * choose "32bit" for atomic reads, which has been fine). + */ + uint32_t num_components; + if (is_load || atomic_add_replaced) { + num_components = instr->num_components; + } else { + assert(tmu_writes > 1); + num_components = tmu_writes - 1; + } + + uint32_t config = (0xffffff00 | + tmu_op << 3| + GENERAL_TMU_LOOKUP_PER_PIXEL); + if (num_components == 1) { + config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; + } else { + config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + + num_components - 2; + } + + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } - if (c->execute.file != QFILE_NULL) - vir_set_cond(tmu, V3D_QPU_COND_IFA); + struct qreg tmua; + if (config == ~0) + tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); + else + tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); + + struct qinst *tmu; + if (dynamic_src) { + struct qreg offset = base_offset; + if (const_offset != 0) { + offset = vir_ADD(c, offset, + vir_uniform_ui(c, const_offset)); + } + struct qreg data = + ntq_get_src(c, instr->src[offset_src], 0); + tmu = vir_ADD_dest(c, tmua, offset, data); + } else { + if (const_offset != 0) { + tmu = vir_ADD_dest(c, tmua, base_offset, + vir_uniform_ui(c, const_offset)); + } else { + tmu = vir_MOV_dest(c, tmua, base_offset); + } + } - vir_emit_thrsw(c); + if (config != ~0) { + tmu->uniform = + vir_get_uniform_index(c, QUNIFORM_CONSTANT, + config); + } + + if (vir_in_nonuniform_control_flow(c)) + vir_set_cond(tmu, V3D_QPU_COND_IFA); + + vir_emit_thrsw(c); - /* Read the result, or wait for the TMU op to complete. */ - for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) - ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c))); + /* Read the result, or wait for the TMU op to complete. */ + for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, vir_LDTMU(c))); + } - if (nir_intrinsic_dest_components(instr) == 0) - vir_TMUWT(c); + if (nir_intrinsic_dest_components(instr) == 0) + vir_TMUWT(c); + } while (is_store && writemask != 0); } static struct qreg * @@ -329,6 +406,20 @@ ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) return qregs; } +static bool +is_ld_signal(const struct v3d_qpu_sig *sig) +{ + return (sig->ldunif || + sig->ldunifa || + sig->ldunifrf || + sig->ldunifarf || + sig->ldtmu || + sig->ldvary || + sig->ldvpm || + sig->ldtlb || + sig->ldtlbu); +} + /** * This function is responsible for getting VIR results into the associated * storage for a NIR instruction. @@ -352,8 +443,7 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, if (!list_empty(&c->cur_block->instructions)) last_inst = (struct qinst *)c->cur_block->instructions.prev; - assert(result.file == QFILE_UNIF || - (result.file == QFILE_TEMP && + assert((result.file == QFILE_TEMP && last_inst && last_inst == c->defs[result.index])); if (dest->is_ssa) { @@ -377,10 +467,12 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, _mesa_hash_table_search(c->def_ht, reg); struct qreg *qregs = entry->data; - /* Insert a MOV if the source wasn't an SSA def in the - * previous instruction. 
+ /* If the previous instruction can't be predicated for + * the store into the nir_register, then emit a MOV + * that can be. */ - if (result.file == QFILE_UNIF) { + if (vir_in_nonuniform_control_flow(c) && + is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig)) { result = vir_MOV(c, result); last_inst = c->defs[result.index]; } @@ -392,17 +484,17 @@ ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, /* If we're in control flow, then make this update of the reg * conditional on the execution mask. */ - if (c->execute.file != QFILE_NULL) { + if (vir_in_nonuniform_control_flow(c)) { last_inst->dst.index = qregs[chan].index; /* Set the flags to the current exec mask. */ c->cursor = vir_before_inst(last_inst); - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); c->cursor = vir_after_inst(last_inst); vir_set_cond(last_inst, V3D_QPU_COND_IFA); - last_inst->cond_is_exec_mask = true; } } } @@ -540,26 +632,13 @@ ntq_fsign(struct v3d_compile *c, struct qreg src) struct qreg t = vir_get_temp(c); vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); - vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ); vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); - vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN); + vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN); vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); return vir_MOV(c, t); } -static struct qreg -ntq_isign(struct v3d_compile *c, struct qreg src) -{ - struct qreg t = vir_get_temp(c); - - vir_MOV_dest(c, t, vir_uniform_ui(c, 0)); - vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ); - vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1)); - vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN); - vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1)); - return vir_MOV(c, t); -} - static void emit_fragcoord_input(struct v3d_compile *c, int attr) { @@ -675,27 +754,6 @@ add_output(struct v3d_compile *c, v3d_slot_from_slot_and_component(slot, swizzle); } -static void -declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size) -{ - unsigned array_id = c->num_ubo_ranges++; - if (array_id >= c->ubo_ranges_array_size) { - c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, - array_id + 1); - c->ubo_ranges = reralloc(c, c->ubo_ranges, - struct v3d_ubo_range, - c->ubo_ranges_array_size); - c->ubo_range_used = reralloc(c, c->ubo_range_used, - bool, - c->ubo_ranges_array_size); - } - - c->ubo_ranges[array_id].dst_offset = 0; - c->ubo_ranges[array_id].src_offset = start; - c->ubo_ranges[array_id].size = size; - c->ubo_range_used[array_id] = false; -} - /** * If compare_instr is a valid comparison instruction, emits the * compare_instr's comparison and returns the sel_instr's return value based @@ -711,7 +769,7 @@ ntq_emit_comparison(struct v3d_compile *c, if (nir_op_infos[compare_instr->op].num_inputs > 1) src1 = ntq_get_alu_src(c, compare_instr, 1); bool cond_invert = false; - struct qreg nop = vir_reg(QFILE_NULL, 0); + struct qreg nop = vir_nop_reg(); switch (compare_instr->op) { case nir_op_feq32: @@ -756,6 +814,16 @@ ntq_emit_comparison(struct v3d_compile *c, vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); break; + case nir_op_i2b32: + vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + + case nir_op_f2b32: + vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); + cond_invert = true; + break; + default: 
return false; } @@ -789,28 +857,24 @@ ntq_get_alu_parent(nir_src src) return instr; } -/** - * Attempts to fold a comparison generating a boolean result into the - * condition code for selecting between two values, instead of comparing the - * boolean result against 0 to generate the condition code. - */ -static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr, - struct qreg *src) +/* Turns a NIR bool into a condition code to predicate on. */ +static enum v3d_qpu_cond +ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src) { - nir_alu_instr *compare = ntq_get_alu_parent(instr->src[0].src); + nir_alu_instr *compare = ntq_get_alu_parent(src); if (!compare) goto out; enum v3d_qpu_cond cond; if (ntq_emit_comparison(c, compare, &cond)) - return vir_MOV(c, vir_SEL(c, cond, src[1], src[2])); + return cond; out: - vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); - return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2])); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)), + V3D_QPU_PF_PUSHZ); + return V3D_QPU_COND_IFNA; } - static void ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) { @@ -843,8 +907,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) struct qreg result; switch (instr->op) { - case nir_op_fmov: - case nir_op_imov: + case nir_op_mov: result = vir_MOV(c, src[0]); break; @@ -871,9 +934,16 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) result = vir_FMAX(c, src[0], src[1]); break; - case nir_op_f2i32: - result = vir_FTOIZ(c, src[0]); + case nir_op_f2i32: { + nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src); + if (src0_alu && src0_alu->op == nir_op_fround_even) { + result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0)); + } else { + result = vir_FTOIZ(c, src[0]); + } break; + } + case nir_op_f2u32: result = vir_FTOUZ(c, src[0]); break; @@ -889,13 +959,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_b2i32: result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); break; - case nir_op_i2b32: - case nir_op_f2b32: - vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); - result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, - vir_uniform_ui(c, ~0), - vir_uniform_ui(c, 0))); - break; case nir_op_iadd: result = vir_ADD(c, src[0], src[1]); @@ -950,7 +1013,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_sge: case nir_op_slt: { enum v3d_qpu_cond cond; - MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond); + ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); assert(ok); result = vir_MOV(c, vir_SEL(c, cond, vir_uniform_f(c, 1.0), @@ -958,6 +1021,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; } + case nir_op_i2b32: + case nir_op_f2b32: case nir_op_feq32: case nir_op_fne32: case nir_op_fge32: @@ -969,7 +1034,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_ilt32: case nir_op_ult32: { enum v3d_qpu_cond cond; - MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond); + ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); assert(ok); result = vir_MOV(c, vir_SEL(c, cond, vir_uniform_ui(c, ~0), @@ -978,10 +1043,15 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) } case nir_op_b32csel: - result = ntq_emit_bcsel(c, instr, src); + result = vir_MOV(c, + vir_SEL(c, + ntq_emit_bool_to_cond(c, instr->src[0].src), + src[1], src[2])); break; + case nir_op_fcsel: - vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]), + V3D_QPU_PF_PUSHZ); result = vir_MOV(c, vir_SEL(c, 
V3D_QPU_COND_IFNA, src[1], src[2])); break; @@ -1011,9 +1081,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_ftrunc: result = vir_FTRUNC(c, src[0]); break; - case nir_op_ffract: - result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0])); - break; case nir_op_fsin: result = ntq_fsincos(c, src[0], false); @@ -1025,9 +1092,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) case nir_op_fsign: result = ntq_fsign(c, src[0]); break; - case nir_op_isign: - result = ntq_isign(c, src[0]); - break; case nir_op_fabs: { result = vir_FMOV(c, src[0]); @@ -1036,8 +1100,7 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) } case nir_op_iabs: - result = vir_MAX(c, src[0], - vir_SUB(c, vir_uniform_ui(c, 0), src[0])); + result = vir_MAX(c, src[0], vir_NEG(c, src[0])); break; case nir_op_fddx: @@ -1053,7 +1116,8 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; case nir_op_uadd_carry: - vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC); + vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), + V3D_QPU_PF_PUSHC); result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0))); @@ -1064,9 +1128,6 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) break; case nir_op_unpack_half_2x16_split_x: - /* XXX perf: It would be good to be able to merge this unpack - * with whatever uses our result. - */ result = vir_FMOV(c, src[0]); vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); break; @@ -1120,6 +1181,107 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4)) static void +vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) +{ + if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt]) + return; + + struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB); + struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); + + nir_variable *var = c->output_color_var[rt]; + int num_components = glsl_get_vector_elements(var->type); + uint32_t conf = 0xffffff00; + struct qinst *inst; + + conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE : + TLB_SAMPLE_MODE_PER_PIXEL; + conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; + + if (c->fs_key->swap_color_rb & (1 << rt)) + num_components = MAX2(num_components, 3); + assert(num_components != 0); + + enum glsl_base_type type = glsl_get_base_type(var->type); + bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT; + bool is_32b_tlb_format = is_int_format || + (c->fs_key->f32_color_rb & (1 << rt)); + + if (is_int_format) { + /* The F32 vs I32 distinction was dropped in 4.2. */ + if (c->devinfo->ver < 42) + conf |= TLB_TYPE_I32_COLOR; + else + conf |= TLB_TYPE_F32_COLOR; + conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT); + } else { + if (c->fs_key->f32_color_rb & (1 << rt)) { + conf |= TLB_TYPE_F32_COLOR; + conf |= ((num_components - 1) << + TLB_VEC_SIZE_MINUS_1_SHIFT); + } else { + conf |= TLB_TYPE_F16_COLOR; + conf |= TLB_F16_SWAP_HI_LO; + if (num_components >= 3) + conf |= TLB_VEC_SIZE_4_F16; + else + conf |= TLB_VEC_SIZE_2_F16; + } + } + + int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1; + for (int i = 0; i < num_samples; i++) { + struct qreg *color = c->msaa_per_sample_output ? 
+ &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] : + &c->outputs[var->data.driver_location * 4]; + + struct qreg r = color[0]; + struct qreg g = color[1]; + struct qreg b = color[2]; + struct qreg a = color[3]; + + if (c->fs_key->swap_color_rb & (1 << rt)) { + r = color[2]; + b = color[0]; + } + + if (c->fs_key->sample_alpha_to_one) + a = vir_uniform_f(c, 1.0); + + if (is_32b_tlb_format) { + if (i == 0) { + inst = vir_MOV_dest(c, tlbu_reg, r); + inst->uniform = + vir_get_uniform_index(c, + QUNIFORM_CONSTANT, + conf); + } else { + inst = vir_MOV_dest(c, tlb_reg, r); + } + + if (num_components >= 2) + vir_MOV_dest(c, tlb_reg, g); + if (num_components >= 3) + vir_MOV_dest(c, tlb_reg, b); + if (num_components >= 4) + vir_MOV_dest(c, tlb_reg, a); + } else { + inst = vir_VFPACK_dest(c, tlb_reg, r, g); + if (conf != ~0 && i == 0) { + inst->dst = tlbu_reg; + inst->uniform = + vir_get_uniform_index(c, + QUNIFORM_CONSTANT, + conf); + } + + if (num_components >= 3) + inst = vir_VFPACK_dest(c, tlb_reg, b, a); + } + } +} + +static void emit_frag_end(struct v3d_compile *c) { /* XXX @@ -1129,8 +1291,8 @@ emit_frag_end(struct v3d_compile *c) */ bool has_any_tlb_color_write = false; - for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) { - if (c->output_color_var[rt]) + for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) { + if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt]) has_any_tlb_color_write = true; } @@ -1138,15 +1300,15 @@ emit_frag_end(struct v3d_compile *c) struct nir_variable *var = c->output_color_var[0]; struct qreg *color = &c->outputs[var->data.driver_location * 4]; - vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, vir_MSF(c), vir_FTOC(c, color[3]))); } + struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); if (c->output_position_index != -1) { - struct qinst *inst = vir_MOV_dest(c, - vir_reg(QFILE_TLBU, 0), + struct qinst *inst = vir_MOV_dest(c, tlbu_reg, c->outputs[c->output_position_index]); uint8_t tlb_specifier = TLB_TYPE_DEPTH; @@ -1156,8 +1318,9 @@ emit_frag_end(struct v3d_compile *c) } else tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; - inst->src[vir_get_implicit_uniform_src(inst)] = - vir_uniform_ui(c, tlb_specifier | 0xffffff00); + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, + tlb_specifier | + 0xffffff00); c->writes_z = true; } else if (c->s->info.fs.uses_discard || !c->s->info.fs.early_fragment_tests || @@ -1173,9 +1336,8 @@ emit_frag_end(struct v3d_compile *c) */ c->s->info.fs.uses_discard = true; - struct qinst *inst = vir_MOV_dest(c, - vir_reg(QFILE_TLBU, 0), - vir_reg(QFILE_NULL, 0)); + struct qinst *inst = vir_MOV_dest(c, tlbu_reg, + vir_nop_reg()); uint8_t tlb_specifier = TLB_TYPE_DEPTH; if (c->devinfo->ver >= 42) { @@ -1188,254 +1350,34 @@ emit_frag_end(struct v3d_compile *c) tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; } - inst->src[vir_get_implicit_uniform_src(inst)] = - vir_uniform_ui(c, tlb_specifier | 0xffffff00); + inst->uniform = vir_get_uniform_index(c, + QUNIFORM_CONSTANT, + tlb_specifier | + 0xffffff00); c->writes_z = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB * uniform setup */ - - for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) { - if (!c->output_color_var[rt]) - continue; - - nir_variable *var = c->output_color_var[rt]; - struct qreg *color = &c->outputs[var->data.driver_location * 4]; - int num_components = glsl_get_vector_elements(var->type); - uint32_t conf = 0xffffff00; - struct qinst *inst; - - conf |= TLB_SAMPLE_MODE_PER_PIXEL; - conf |= (7 - rt) << 
TLB_RENDER_TARGET_SHIFT; - - if (c->fs_key->swap_color_rb & (1 << rt)) - num_components = MAX2(num_components, 3); - - assert(num_components != 0); - switch (glsl_get_base_type(var->type)) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - /* The F32 vs I32 distinction was dropped in 4.2. */ - if (c->devinfo->ver < 42) - conf |= TLB_TYPE_I32_COLOR; - else - conf |= TLB_TYPE_F32_COLOR; - conf |= ((num_components - 1) << - TLB_VEC_SIZE_MINUS_1_SHIFT); - - inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]); - inst->src[vir_get_implicit_uniform_src(inst)] = - vir_uniform_ui(c, conf); - - for (int i = 1; i < num_components; i++) { - inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), - color[i]); - } - break; - - default: { - struct qreg r = color[0]; - struct qreg g = color[1]; - struct qreg b = color[2]; - struct qreg a = color[3]; - - if (c->fs_key->f32_color_rb & (1 << rt)) { - conf |= TLB_TYPE_F32_COLOR; - conf |= ((num_components - 1) << - TLB_VEC_SIZE_MINUS_1_SHIFT); - } else { - conf |= TLB_TYPE_F16_COLOR; - conf |= TLB_F16_SWAP_HI_LO; - if (num_components >= 3) - conf |= TLB_VEC_SIZE_4_F16; - else - conf |= TLB_VEC_SIZE_2_F16; - } - - if (c->fs_key->swap_color_rb & (1 << rt)) { - r = color[2]; - b = color[0]; - } - - if (c->fs_key->sample_alpha_to_one) - a = vir_uniform_f(c, 1.0); - - if (c->fs_key->f32_color_rb & (1 << rt)) { - inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r); - inst->src[vir_get_implicit_uniform_src(inst)] = - vir_uniform_ui(c, conf); - - if (num_components >= 2) - vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g); - if (num_components >= 3) - vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b); - if (num_components >= 4) - vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a); - } else { - inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g); - if (conf != ~0) { - inst->dst.file = QFILE_TLBU; - inst->src[vir_get_implicit_uniform_src(inst)] = - vir_uniform_ui(c, conf); - } - - if (num_components >= 3) - inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a); - } - break; - } - } - } + for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) + vir_emit_tlb_color_write(c, rt); } static void -vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index) +vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) { if (c->devinfo->ver >= 40) { - vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val); - *vpm_index = *vpm_index + 1; + vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val); } else { + /* XXX: v3d33_vir_vpm_write_setup(c); */ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); } - - c->num_vpm_writes++; -} - -static void -emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w, - uint32_t *vpm_index) -{ - for (int i = 0; i < 2; i++) { - struct qreg coord = c->outputs[c->output_position_index + i]; - coord = vir_FMUL(c, coord, - vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, - 0)); - coord = vir_FMUL(c, coord, rcp_w); - vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index); - } - -} - -static void -emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index) -{ - struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); - struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); - - struct qreg z = c->outputs[c->output_position_index + 2]; - z = vir_FMUL(c, z, zscale); - z = vir_FMUL(c, z, rcp_w); - z = vir_FADD(c, z, zoffset); - vir_VPM_WRITE(c, z, vpm_index); -} - -static void -emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index) -{ - vir_VPM_WRITE(c, rcp_w, vpm_index); -} - -static void 
-emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index) -{ - struct qreg point_size; - - if (c->output_point_size_index != -1) - point_size = c->outputs[c->output_point_size_index]; - else - point_size = vir_uniform_f(c, 1.0); - - /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, - * BCM21553). - */ - point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125)); - - vir_VPM_WRITE(c, point_size, vpm_index); -} - -static void -emit_vpm_write_setup(struct v3d_compile *c) -{ - if (c->devinfo->ver >= 40) - return; - - v3d33_vir_vpm_write_setup(c); -} - -/** - * Sets up c->outputs[c->output_position_index] for the vertex shader - * epilogue, if an output vertex position wasn't specified in the user's - * shader. This may be the case for transform feedback with rasterizer - * discard enabled. - */ -static void -setup_default_position(struct v3d_compile *c) -{ - if (c->output_position_index != -1) - return; - - c->output_position_index = c->outputs_array_size; - for (int i = 0; i < 4; i++) { - add_output(c, - c->output_position_index + i, - VARYING_SLOT_POS, i); - } } static void emit_vert_end(struct v3d_compile *c) { - setup_default_position(c); - - uint32_t vpm_index = 0; - struct qreg rcp_w = vir_RECIP(c, - c->outputs[c->output_position_index + 3]); - - emit_vpm_write_setup(c); - - if (c->vs_key->is_coord) { - for (int i = 0; i < 4; i++) - vir_VPM_WRITE(c, c->outputs[c->output_position_index + i], - &vpm_index); - emit_scaled_viewport_write(c, rcp_w, &vpm_index); - if (c->vs_key->per_vertex_point_size) { - emit_point_size_write(c, &vpm_index); - /* emit_rcp_wc_write(c, rcp_w); */ - } - /* XXX: Z-only rendering */ - if (0) - emit_zs_write(c, rcp_w, &vpm_index); - } else { - emit_scaled_viewport_write(c, rcp_w, &vpm_index); - emit_zs_write(c, rcp_w, &vpm_index); - emit_rcp_wc_write(c, rcp_w, &vpm_index); - if (c->vs_key->per_vertex_point_size) - emit_point_size_write(c, &vpm_index); - } - - for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { - struct v3d_varying_slot input = c->vs_key->fs_inputs[i]; - int j; - - for (j = 0; j < c->num_outputs; j++) { - struct v3d_varying_slot output = c->output_slots[j]; - - if (!memcmp(&input, &output, sizeof(input))) { - vir_VPM_WRITE(c, c->outputs[j], - &vpm_index); - break; - } - } - /* Emit padding if we didn't find a declared VS output for - * this FS input. - */ - if (j == c->num_outputs) - vir_VPM_WRITE(c, vir_uniform_f(c, 0.0), - &vpm_index); - } - /* GFXH-1684: VPM writes need to be complete by the end of the shader. */ if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) @@ -1446,25 +1388,48 @@ void v3d_optimize_nir(struct nir_shader *s) { bool progress; + unsigned lower_flrp = + (s->options->lower_flrp16 ? 16 : 0) | + (s->options->lower_flrp32 ? 32 : 0) | + (s->options->lower_flrp64 ? 
64 : 0); do { progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_alu_to_scalar); + NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL); NIR_PASS(progress, s, nir_lower_phis_to_scalar); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_remove_phis); NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); + + if (lower_flrp != 0) { + bool lower_flrp_progress = false; + + NIR_PASS(lower_flrp_progress, s, nir_lower_flrp, + lower_flrp, + false /* always_precise */, + s->options->lower_ffma); + if (lower_flrp_progress) { + NIR_PASS(progress, s, nir_opt_constant_folding); + progress = true; + } + + /* Nothing should rematerialize any flrps, so we only + * need to do this lowering once. + */ + lower_flrp = 0; + } + NIR_PASS(progress, s, nir_opt_undef); } while (progress); - NIR_PASS(progress, s, nir_opt_move_load_ubo); + NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); } static int @@ -1492,7 +1457,6 @@ ntq_emit_vpm_read(struct v3d_compile *c, if (*num_components_queued != 0) { (*num_components_queued)--; - c->num_inputs++; return vir_MOV(c, vpm); } @@ -1502,7 +1466,6 @@ ntq_emit_vpm_read(struct v3d_compile *c, *num_components_queued = num_components - 1; *remaining -= num_components; - c->num_inputs++; return vir_MOV(c, vpm); } @@ -1550,6 +1513,12 @@ ntq_setup_vpm_inputs(struct v3d_compile *c) &num_components, ~0); } + /* The actual loads will happen directly in nir_intrinsic_load_input + * on newer versions. + */ + if (c->devinfo->ver >= 40) + return; + for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { resize_qreg_array(c, &c->inputs, &c->inputs_array_size, (loc + 1) * 4); @@ -1572,6 +1541,26 @@ ntq_setup_vpm_inputs(struct v3d_compile *c) } } +static bool +var_needs_point_coord(struct v3d_compile *c, nir_variable *var) +{ + return (var->data.location == VARYING_SLOT_PNTC || + (var->data.location >= VARYING_SLOT_VAR0 && + (c->fs_key->point_sprite_mask & + (1 << (var->data.location - VARYING_SLOT_VAR0))))); +} + +static bool +program_reads_point_coord(struct v3d_compile *c) +{ + nir_foreach_variable(var, &c->s->inputs) { + if (var_needs_point_coord(c, var)) + return true; + } + + return false; +} + static void ntq_setup_fs_inputs(struct v3d_compile *c) { @@ -1605,11 +1594,7 @@ ntq_setup_fs_inputs(struct v3d_compile *c) if (var->data.location == VARYING_SLOT_POS) { emit_fragcoord_input(c, loc); - } else if (var->data.location == VARYING_SLOT_PNTC || - (var->data.location >= VARYING_SLOT_VAR0 && - (c->fs_key->point_sprite_mask & - (1 << (var->data.location - - VARYING_SLOT_VAR0))))) { + } else if (var_needs_point_coord(c, var)) { c->inputs[loc * 4 + 0] = c->point_x; c->inputs[loc * 4 + 1] = c->point_y; } else { @@ -1622,6 +1607,9 @@ ntq_setup_fs_inputs(struct v3d_compile *c) static void ntq_setup_outputs(struct v3d_compile *c) { + if (c->s->info.stage != MESA_SHADER_FRAGMENT) + return; + nir_foreach_variable(var, &c->s->outputs) { unsigned array_len = MAX2(glsl_get_length(var->type), 1); unsigned loc = var->data.driver_location * 4; @@ -1635,58 +1623,30 @@ ntq_setup_outputs(struct v3d_compile *c) var->data.location_frac + i); } - if (c->s->info.stage == MESA_SHADER_FRAGMENT) { - switch (var->data.location) { - case FRAG_RESULT_COLOR: - c->output_color_var[0] 
= var; - c->output_color_var[1] = var; - c->output_color_var[2] = var; - c->output_color_var[3] = var; - break; - case FRAG_RESULT_DATA0: - case FRAG_RESULT_DATA1: - case FRAG_RESULT_DATA2: - case FRAG_RESULT_DATA3: - c->output_color_var[var->data.location - - FRAG_RESULT_DATA0] = var; - break; - case FRAG_RESULT_DEPTH: - c->output_position_index = loc; - break; - case FRAG_RESULT_SAMPLE_MASK: - c->output_sample_mask_index = loc; - break; - } - } else { - switch (var->data.location) { - case VARYING_SLOT_POS: - c->output_position_index = loc; - break; - case VARYING_SLOT_PSIZ: - c->output_point_size_index = loc; - break; - } + switch (var->data.location) { + case FRAG_RESULT_COLOR: + c->output_color_var[0] = var; + c->output_color_var[1] = var; + c->output_color_var[2] = var; + c->output_color_var[3] = var; + break; + case FRAG_RESULT_DATA0: + case FRAG_RESULT_DATA1: + case FRAG_RESULT_DATA2: + case FRAG_RESULT_DATA3: + c->output_color_var[var->data.location - + FRAG_RESULT_DATA0] = var; + break; + case FRAG_RESULT_DEPTH: + c->output_position_index = loc; + break; + case FRAG_RESULT_SAMPLE_MASK: + c->output_sample_mask_index = loc; + break; } } } -static void -ntq_setup_uniforms(struct v3d_compile *c) -{ - nir_foreach_variable(var, &c->s->uniforms) { - uint32_t vec4_count = glsl_count_attribute_slots(var->type, - false); - unsigned vec4_size = 4 * sizeof(float); - - if (var->data.mode != nir_var_uniform) - continue; - - declare_uniform_range(c, var->data.driver_location * vec4_size, - vec4_count * vec4_size); - - } -} - /** * Sets up the mapping from nir_register to struct qreg *. * @@ -1717,7 +1677,7 @@ ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) */ struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); for (int i = 0; i < instr->def.num_components; i++) - qregs[i] = vir_uniform_ui(c, instr->value.u32[i]); + qregs[i] = vir_uniform_ui(c, instr->value[i].u32); _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); } @@ -1761,26 +1721,239 @@ ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) } static void -ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) +vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) { - unsigned offset; + assert(c->s->info.stage == MESA_SHADER_FRAGMENT); - switch (instr->intrinsic) { - case nir_intrinsic_load_uniform: - if (nir_src_is_const(instr->src[0])) { - int offset = (nir_intrinsic_base(instr) + - nir_src_as_uint(instr->src[0])); - assert(offset % 4 == 0); - /* We need dwords */ - offset = offset / 4; - for (int i = 0; i < instr->num_components; i++) { - ntq_store_dest(c, &instr->dest, i, - vir_uniform(c, QUNIFORM_UNIFORM, - offset + i)); - } + int rt = nir_src_as_uint(instr->src[0]); + assert(rt < V3D_MAX_DRAW_BUFFERS); + + int sample_index = nir_intrinsic_base(instr) ; + assert(sample_index < V3D_MAX_SAMPLES); + + int component = nir_intrinsic_component(instr); + assert(component < 4); + + /* We need to emit our TLB reads after we have acquired the scoreboard + * lock, or the GPU will hang. Usually, we do our scoreboard locking on + * the last thread switch to improve parallelism, however, that is only + * guaranteed to happen before the tlb color writes. + * + * To fix that, we make sure we always emit a thread switch before the + * first tlb color read. 
If that happens to be the last thread switch + * we emit, then everything is fine, but otherwise, if any code after + * this point needs to emit additional thread switches, then we will + * switch the strategy to locking the scoreboard on the first thread + * switch instead -- see vir_emit_thrsw(). + */ + if (!c->emitted_tlb_load) { + if (!c->last_thrsw_at_top_level) { + assert(c->devinfo->ver >= 41); + vir_emit_thrsw(c); + } + + c->emitted_tlb_load = true; + } + + struct qreg *color_reads_for_sample = + &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4]; + + if (color_reads_for_sample[component].file == QFILE_NULL) { + enum pipe_format rt_format = c->fs_key->color_fmt[rt].format; + int num_components = + util_format_get_nr_components(rt_format); + + const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt); + if (swap_rb) + num_components = MAX2(num_components, 3); + + nir_variable *var = c->output_color_var[rt]; + enum glsl_base_type type = glsl_get_base_type(var->type); + + bool is_int_format = type == GLSL_TYPE_INT || + type == GLSL_TYPE_UINT; + + bool is_32b_tlb_format = is_int_format || + (c->fs_key->f32_color_rb & (1 << rt)); + + int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1; + + uint32_t conf = 0xffffff00; + conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE : + TLB_SAMPLE_MODE_PER_PIXEL; + conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; + + if (is_32b_tlb_format) { + /* The F32 vs I32 distinction was dropped in 4.2. */ + conf |= (c->devinfo->ver < 42 && is_int_format) ? + TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR; + + conf |= ((num_components - 1) << + TLB_VEC_SIZE_MINUS_1_SHIFT); } else { - ntq_emit_tmu_general(c, instr, false); + conf |= TLB_TYPE_F16_COLOR; + conf |= TLB_F16_SWAP_HI_LO; + + if (num_components >= 3) + conf |= TLB_VEC_SIZE_4_F16; + else + conf |= TLB_VEC_SIZE_2_F16; } + + + for (int i = 0; i < num_samples; i++) { + struct qreg r, g, b, a; + if (is_32b_tlb_format) { + r = conf != 0xffffffff && i == 0 ? + vir_TLBU_COLOR_READ(c, conf) : + vir_TLB_COLOR_READ(c); + if (num_components >= 2) + g = vir_TLB_COLOR_READ(c); + if (num_components >= 3) + b = vir_TLB_COLOR_READ(c); + if (num_components >= 4) + a = vir_TLB_COLOR_READ(c); + } else { + struct qreg rg = conf != 0xffffffff && i == 0 ? + vir_TLBU_COLOR_READ(c, conf) : + vir_TLB_COLOR_READ(c); + r = vir_FMOV(c, rg); + vir_set_unpack(c->defs[r.index], 0, + V3D_QPU_UNPACK_L); + g = vir_FMOV(c, rg); + vir_set_unpack(c->defs[g.index], 0, + V3D_QPU_UNPACK_H); + + if (num_components > 2) { + struct qreg ba = vir_TLB_COLOR_READ(c); + b = vir_FMOV(c, ba); + vir_set_unpack(c->defs[b.index], 0, + V3D_QPU_UNPACK_L); + a = vir_FMOV(c, ba); + vir_set_unpack(c->defs[a.index], 0, + V3D_QPU_UNPACK_H); + } + } + + struct qreg *color_reads = + &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4]; + + color_reads[0] = swap_rb ? b : r; + if (num_components >= 2) + color_reads[1] = g; + if (num_components >= 3) + color_reads[2] = swap_rb ? 
r : b; + if (num_components >= 4) + color_reads[3] = a; + } + } + + assert(color_reads_for_sample[component].file != QFILE_NULL); + ntq_store_dest(c, &instr->dest, 0, + vir_MOV(c, color_reads_for_sample[component])); +} + +static void +ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + if (nir_src_is_const(instr->src[0])) { + int offset = (nir_intrinsic_base(instr) + + nir_src_as_uint(instr->src[0])); + assert(offset % 4 == 0); + /* We need dwords */ + offset = offset / 4; + for (int i = 0; i < instr->num_components; i++) { + ntq_store_dest(c, &instr->dest, i, + vir_uniform(c, QUNIFORM_UNIFORM, + offset + i)); + } + } else { + ntq_emit_tmu_general(c, instr, false); + } +} + +static void +ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset) + * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR. + */ + unsigned offset = + nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); + + if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { + /* Emit the LDVPM directly now, rather than at the top + * of the shader like we did for V3D 3.x (which needs + * vpmsetup when not just taking the next offset). + * + * Note that delaying like this may introduce stalls, + * as LDVPMV takes a minimum of 1 instruction but may + * be slower if the VPM unit is busy with another QPU. + */ + int index = 0; + if (c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_INSTANCE_ID)) { + index++; + } + if (c->s->info.system_values_read & + (1ull << SYSTEM_VALUE_VERTEX_ID)) { + index++; + } + for (int i = 0; i < offset; i++) + index += c->vattr_sizes[i]; + index += nir_intrinsic_component(instr); + for (int i = 0; i < instr->num_components; i++) { + struct qreg vpm_offset = vir_uniform_ui(c, index++); + ntq_store_dest(c, &instr->dest, i, + vir_LDVPMV_IN(c, vpm_offset)); + } + } else { + for (int i = 0; i < instr->num_components; i++) { + int comp = nir_intrinsic_component(instr) + i; + ntq_store_dest(c, &instr->dest, i, + vir_MOV(c, c->inputs[offset * 4 + comp])); + } + } +} + +static void +ntq_emit_per_sample_color_write(struct v3d_compile *c, + nir_intrinsic_instr *instr) +{ + assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d); + + unsigned rt = nir_src_as_uint(instr->src[1]); + assert(rt < V3D_MAX_DRAW_BUFFERS); + + unsigned sample_idx = nir_intrinsic_base(instr); + assert(sample_idx < V3D_MAX_SAMPLES); + + unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4; + for (int i = 0; i < instr->num_components; i++) { + c->sample_colors[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } +} + +static void +ntq_emit_color_write(struct v3d_compile *c, + nir_intrinsic_instr *instr) +{ + unsigned offset = (nir_intrinsic_base(instr) + + nir_src_as_uint(instr->src[1])) * 4 + + nir_intrinsic_component(instr); + for (int i = 0; i < instr->num_components; i++) { + c->outputs[offset + i] = + vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + } +} + +static void +ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_uniform: + ntq_emit_load_uniform(c, instr); break; case nir_intrinsic_load_ubo: @@ -1814,6 +1987,8 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) case nir_intrinsic_shared_atomic_comp_swap: case nir_intrinsic_load_shared: case nir_intrinsic_store_shared: + case nir_intrinsic_load_scratch: + case nir_intrinsic_store_scratch: ntq_emit_tmu_general(c, instr, 
true); break; @@ -1845,6 +2020,26 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) } break; + case nir_intrinsic_load_viewport_x_scale: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); + break; + + case nir_intrinsic_load_viewport_y_scale: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); + break; + + case nir_intrinsic_load_viewport_z_scale: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); + break; + + case nir_intrinsic_load_viewport_z_offset: + ntq_store_dest(c, &instr->dest, 0, + vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); + break; + case nir_intrinsic_load_alpha_ref_float: ntq_store_dest(c, &instr->dest, 0, vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); @@ -1855,7 +2050,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_helper_invocation: - vir_PF(c, vir_MSF(c), V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, vir_uniform_ui(c, ~0), @@ -1880,27 +2075,32 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); break; + case nir_intrinsic_load_tlb_color_v3d: + vir_emit_tlb_color_read(c, instr); + break; + case nir_intrinsic_load_input: - for (int i = 0; i < instr->num_components; i++) { - offset = (nir_intrinsic_base(instr) + - nir_src_as_uint(instr->src[0])); - int comp = nir_intrinsic_component(instr) + i; - ntq_store_dest(c, &instr->dest, i, - vir_MOV(c, c->inputs[offset * 4 + comp])); - } + ntq_emit_load_input(c, instr); break; - case nir_intrinsic_store_output: - offset = ((nir_intrinsic_base(instr) + - nir_src_as_uint(instr->src[1])) * 4 + - nir_intrinsic_component(instr)); + case nir_intrinsic_store_tlb_sample_color_v3d: + ntq_emit_per_sample_color_write(c, instr); + break; - for (int i = 0; i < instr->num_components; i++) { - c->outputs[offset + i] = - vir_MOV(c, ntq_get_src(c, instr->src[0], i)); + case nir_intrinsic_store_output: + /* XXX perf: Use stvpmv with uniform non-constant offsets and + * stvpmd with non-uniform offsets and enable + * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. + */ + if (c->s->info.stage == MESA_SHADER_FRAGMENT) { + ntq_emit_color_write(c, instr); + } else { + assert(instr->num_components == 1); + + vir_VPM_WRITE(c, + ntq_get_src(c, instr->src[0], 0), + nir_intrinsic_base(instr)); } - c->num_outputs = MAX2(c->num_outputs, - offset + instr->num_components); break; case nir_intrinsic_image_deref_size: @@ -1908,38 +2108,35 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_discard: - if (c->execute.file != QFILE_NULL) { - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); - vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + if (vir_in_nonuniform_control_flow(c)) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)), V3D_QPU_COND_IFA); } else { - vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), + vir_SETMSF_dest(c, vir_nop_reg(), vir_uniform_ui(c, 0)); } break; case nir_intrinsic_discard_if: { - /* true (~0) if we're discarding */ - struct qreg cond = ntq_get_src(c, instr->src[0], 0); + enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); - if (c->execute.file != QFILE_NULL) { - /* execute == 0 means the channel is active. 
Invert - * the condition so that we can use zero as "executing - * and discarding." - */ - vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)), - V3D_QPU_PF_PUSHZ); - vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), - vir_uniform_ui(c, 0)), - V3D_QPU_COND_IFA); - } else { - vir_PF(c, cond, V3D_QPU_PF_PUSHZ); - vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), - vir_uniform_ui(c, 0)), - V3D_QPU_COND_IFNA); + if (vir_in_nonuniform_control_flow(c)) { + struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(), + c->execute); + if (cond == V3D_QPU_COND_IFA) { + vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ); + } else { + vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ); + cond = V3D_QPU_COND_IFA; + } } + vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), + vir_uniform_ui(c, 0)), cond); + break; } @@ -1948,6 +2145,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) case nir_intrinsic_memory_barrier_buffer: case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier_shared: + case nir_intrinsic_group_memory_barrier: /* We don't do any instruction scheduling of these NIR * instructions between each other, so we just need to make * sure that the TMU operations before the barrier are flushed @@ -1970,10 +2168,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCU)); - sync->src[vir_get_implicit_uniform_src(sync)] = - vir_uniform_ui(c, - 0xffffff00 | - V3D_TSY_WAIT_INC_CHECK); + sync->uniform = + vir_get_uniform_index(c, QUNIFORM_CONSTANT, + 0xffffff00 | + V3D_TSY_WAIT_INC_CHECK); } @@ -2010,6 +2208,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) vir_uniform_ui(c, 0xffff))); break; + case nir_intrinsic_load_subgroup_id: + ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); + break; + default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -2030,7 +2232,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) static void ntq_activate_execute_for_block(struct v3d_compile *c) { - vir_set_pf(vir_XOR_dest(c, vir_reg(QFILE_NULL, 0), + vir_set_pf(vir_XOR_dest(c, vir_nop_reg(), c->execute, vir_uniform_ui(c, c->cur_block->index)), V3D_QPU_PF_PUSHZ); @@ -2054,14 +2256,7 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) else_block = vir_new_block(c); /* Set up the flags for the IF condition (taking the THEN branch). */ - nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition); - enum v3d_qpu_cond cond; - if (!if_condition_alu || - !ntq_emit_comparison(c, if_condition_alu, &cond)) { - vir_PF(c, ntq_get_src(c, if_stmt->condition, 0), - V3D_QPU_PF_PUSHZ); - cond = V3D_QPU_COND_IFNA; - } + enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); /* Jump to ELSE. */ vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? @@ -2081,7 +2276,6 @@ ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) /* Emit the else block. */ vir_set_emit_block(c, else_block); - ntq_activate_execute_for_block(c); ntq_emit_cf_list(c, &if_stmt->else_list); } @@ -2107,20 +2301,13 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) else_block = vir_new_block(c); bool was_uniform_control_flow = false; - if (c->execute.file == QFILE_NULL) { + if (!vir_in_nonuniform_control_flow(c)) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); was_uniform_control_flow = true; } /* Set up the flags for the IF condition (taking the THEN branch). 
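 *
 * ntq_emit_bool_to_cond() centralizes what this path and the uniform
 * one above used to open-code: reuse the flags of the comparison that
 * produced the condition when there is one, otherwise push Z on the
 * boolean itself. A minimal sketch of that fallback, using helpers
 * from this file:
 *
 *    vir_set_pf(vir_MOV_dest(c, vir_nop_reg(),
 *                            ntq_get_src(c, if_stmt->condition, 0)),
 *               V3D_QPU_PF_PUSHZ);
 *    return V3D_QPU_COND_IFNA;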
*/ - nir_alu_instr *if_condition_alu = ntq_get_alu_parent(if_stmt->condition); - enum v3d_qpu_cond cond; - if (!if_condition_alu || - !ntq_emit_comparison(c, if_condition_alu, &cond)) { - vir_PF(c, ntq_get_src(c, if_stmt->condition, 0), - V3D_QPU_PF_PUSHZ); - cond = V3D_QPU_COND_IFNA; - } + enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and * was previously active (execute Z) for updating the exec flags. @@ -2128,8 +2315,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) if (was_uniform_control_flow) { cond = v3d_qpu_cond_invert(cond); } else { - struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), - c->execute); + struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute); if (cond == V3D_QPU_COND_IFA) { vir_set_uf(inst, V3D_QPU_UF_NORNZ); } else { @@ -2145,7 +2331,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) /* Jump to ELSE if nothing is active for THEN, otherwise fall * through. */ - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); vir_link_blocks(c->cur_block, else_block); vir_link_blocks(c->cur_block, then_block); @@ -2159,14 +2345,16 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) * active channels update their execute flags to point to * ENDIF */ - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, after_block->index)); /* If everything points at ENDIF, then jump there immediately. */ - vir_PF(c, vir_XOR(c, c->execute, - vir_uniform_ui(c, after_block->index)), - V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_XOR_dest(c, vir_nop_reg(), + c->execute, + vir_uniform_ui(c, after_block->index)), + V3D_QPU_PF_PUSHZ); vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); vir_link_blocks(c->cur_block, after_block); vir_link_blocks(c->cur_block, else_block); @@ -2190,7 +2378,7 @@ ntq_emit_if(struct v3d_compile *c, nir_if *nif) { bool was_in_control_flow = c->in_control_flow; c->in_control_flow = true; - if (c->execute.file == QFILE_NULL && + if (!vir_in_nonuniform_control_flow(c) && nir_src_is_dynamically_uniform(nif->condition)) { ntq_emit_uniform_if(c, nif); } else { @@ -2204,13 +2392,15 @@ ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) { switch (jump->type) { case nir_jump_break: - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, c->loop_break_block->index)); break; case nir_jump_continue: - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, c->loop_cont_block->index)); break; @@ -2277,7 +2467,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) c->in_control_flow = true; bool was_uniform_control_flow = false; - if (c->execute.file == QFILE_NULL) { + if (!vir_in_nonuniform_control_flow(c)) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); was_uniform_control_flow = true; } @@ -2299,13 +2489,14 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) * * XXX: Use the .ORZ flags update, instead. 
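 *
 * In scalar terms, the flag updates below implement (sketch):
 *
 *    if (execute == loop_cont_block->index)
 *            execute = 0;    /* 0 == channel is active again */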
*/ - vir_PF(c, vir_XOR(c, - c->execute, - vir_uniform_ui(c, c->loop_cont_block->index)), - V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_XOR_dest(c, + vir_nop_reg(), + c->execute, + vir_uniform_ui(c, c->loop_cont_block->index)), + V3D_QPU_PF_PUSHZ); vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); - vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); /* Pixels that were not dispatched or have been discarded should not @@ -2380,15 +2571,17 @@ nir_to_vir(struct v3d_compile *c) c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); - /* XXX perf: We could set the "disable implicit point/line - * varyings" field in the shader record and not emit these, if - * they're not going to be used. + /* V3D 4.x can disable implicit point coordinate varyings if + * they are not used. */ - if (c->fs_key->is_points) { + if (c->fs_key->is_points && + (c->devinfo->ver < 40 || program_reads_point_coord(c))) { c->point_x = emit_fragment_varying(c, NULL, 0, 0); c->point_y = emit_fragment_varying(c, NULL, 0, 0); - } else if (c->fs_key->is_lines) { + c->uses_implicit_point_line_varyings = true; + } else if (c->fs_key->is_lines && c->devinfo->ver < 40) { c->line_x = emit_fragment_varying(c, NULL, 0, 0); + c->uses_implicit_point_line_varyings = true; } break; case MESA_SHADER_COMPUTE: @@ -2398,16 +2591,8 @@ nir_to_vir(struct v3d_compile *c) V3D_QPU_WADDR_SYNC)); } - if (c->s->info.system_values_read & - ((1ull << SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) | - (1ull << SYSTEM_VALUE_WORK_GROUP_ID))) { - c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); - } - if ((c->s->info.system_values_read & - ((1ull << SYSTEM_VALUE_WORK_GROUP_ID))) || - c->s->info.cs.shared_size) { - c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); - } + c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); + c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); /* Set up the division between gl_LocalInvocationIndex and * wg_in_mem in the payload reg. @@ -2444,14 +2629,17 @@ nir_to_vir(struct v3d_compile *c) break; } + if (c->s->scratch_size) { + v3d_setup_spill_base(c); + c->spill_size += V3D_CHANNELS * c->s->scratch_size; + } + if (c->s->info.stage == MESA_SHADER_FRAGMENT) ntq_setup_fs_inputs(c); else ntq_setup_vpm_inputs(c); ntq_setup_outputs(c); - ntq_setup_uniforms(c); - ntq_setup_registers(c, &c->s->registers); /* Find the main function and emit the body. 
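 *
 * (A rough sketch of the walk that follows; ntq_emit_impl stands in
 * for the per-function emit helper, and that name is an assumption
 * here:
 *
 *    nir_foreach_function(function, c->s) {
 *            if (function->impl)
 *                    ntq_emit_impl(c, function->impl);
 *    }
 * )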
*/ nir_foreach_function(function, c->s) { @@ -2465,12 +2653,13 @@ const nir_shader_compiler_options v3d_nir_options = { .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, - .lower_bfm = true, .lower_bitfield_insert_to_shifts = true, .lower_bitfield_extract_to_shifts = true, .lower_bitfield_reverse = true, .lower_bit_count = true, .lower_cs_local_id_from_index = true, + .lower_ffract = true, + .lower_fmod = true, .lower_pack_unorm_2x16 = true, .lower_pack_snorm_2x16 = true, .lower_pack_unorm_4x8 = true, @@ -2487,10 +2676,11 @@ const nir_shader_compiler_options v3d_nir_options = { .lower_fsat = true, .lower_fsqrt = true, .lower_ifind_msb = true, + .lower_isign = true, .lower_ldexp = true, .lower_mul_high = true, .lower_wpos_pntc = true, - .native_integers = true, + .lower_rotate = true, }; /** @@ -2595,6 +2785,8 @@ v3d_nir_to_vir(struct v3d_compile *c) case MESA_SHADER_VERTEX: emit_vert_end(c); break; + case MESA_SHADER_COMPUTE: + break; default: unreachable("bad stage"); } @@ -2609,7 +2801,6 @@ v3d_nir_to_vir(struct v3d_compile *c) } vir_optimize(c); - vir_lower_uniforms(c); vir_check_payload_w(c); @@ -2659,5 +2850,15 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_remove_thrsw(c); } + if (c->spills && + (V3D_DEBUG & (V3D_DEBUG_VIR | + v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { + fprintf(stderr, "%s prog %d/%d spilled VIR:\n", + vir_get_stage_name(c), + c->program_id, c->variant_id); + vir_dump(c); + fprintf(stderr, "\n"); + } + v3d_vir_to_qpu(c, temp_registers); } diff --git a/lib/mesa/src/broadcom/compiler/qpu_schedule.c b/lib/mesa/src/broadcom/compiler/qpu_schedule.c index 0f8001ff5..c15218e26 100644 --- a/lib/mesa/src/broadcom/compiler/qpu_schedule.c +++ b/lib/mesa/src/broadcom/compiler/qpu_schedule.c @@ -37,18 +37,16 @@ #include "qpu/qpu_disasm.h" #include "v3d_compiler.h" #include "util/ralloc.h" +#include "util/dag.h" static bool debug; struct schedule_node_child; struct schedule_node { + struct dag_node dag; struct list_head link; struct qinst *inst; - struct schedule_node_child *children; - uint32_t child_count; - uint32_t child_array_size; - uint32_t parent_count; /* Longest cycles + instruction_latency() of any parent of this node. */ uint32_t unblocked_time; @@ -67,11 +65,6 @@ struct schedule_node { uint32_t latency; }; -struct schedule_node_child { - struct schedule_node *node; - bool write_after_read; -}; - /* When walking the instructions in reverse, we need to swap before/after in * add_dep(). 
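 *
 * The write-after-read flag travels as the DAG edge data, packed and
 * recovered with pointer casts; e.g. on the consumer side (sketch):
 *
 *    bool war = (bool)(uintptr_t)edge->data;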
*/ @@ -79,6 +72,7 @@ enum direction { F, R }; struct schedule_state { const struct v3d_device_info *devinfo; + struct dag *dag; struct schedule_node *last_r[6]; struct schedule_node *last_rf[64]; struct schedule_node *last_sf; @@ -101,37 +95,17 @@ add_dep(struct schedule_state *state, bool write) { bool write_after_read = !write && state->dir == R; + void *edge_data = (void *)(uintptr_t)write_after_read; if (!before || !after) return; assert(before != after); - if (state->dir == R) { - struct schedule_node *t = before; - before = after; - after = t; - } - - for (int i = 0; i < before->child_count; i++) { - if (before->children[i].node == after && - (before->children[i].write_after_read == write_after_read)) { - return; - } - } - - if (before->child_array_size <= before->child_count) { - before->child_array_size = MAX2(before->child_array_size * 2, 16); - before->children = reralloc(before, before->children, - struct schedule_node_child, - before->child_array_size); - } - - before->children[before->child_count].node = after; - before->children[before->child_count].write_after_read = - write_after_read; - before->child_count++; - after->parent_count++; + if (state->dir == F) + dag_add_edge(&before->dag, &after->dag, edge_data); + else + dag_add_edge(&after->dag, &before->dag, edge_data); } static void @@ -154,6 +128,9 @@ add_write_dep(struct schedule_state *state, static bool qpu_inst_is_tlb(const struct v3d_qpu_instr *inst) { + if (inst->sig.ldtlb || inst->sig.ldtlbu) + return true; + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) return false; @@ -179,7 +156,10 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n, add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n); break; case V3D_QPU_MUX_B: - add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n); + if (!n->inst->qpu.sig.small_imm) { + add_read_dep(state, + state->last_rf[n->inst->qpu.raddr_b], n); + } break; default: add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n); @@ -402,7 +382,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_write_dep(state, &state->last_tmu_config, n); if (inst->sig.ldtlb | inst->sig.ldtlbu) - add_read_dep(state, state->last_tlb, n); + add_write_dep(state, &state->last_tlb, n); if (inst->sig.ldvpm) { add_write_dep(state, &state->last_vpm_read, n); @@ -415,7 +395,7 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) } /* inst->sig.ldunif or sideband uniform read */ - if (qinst->uniform != ~0) + if (vir_has_uniform(qinst)) add_write_dep(state, &state->last_unif, n); if (v3d_qpu_reads_flags(inst)) @@ -425,11 +405,13 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) } static void -calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) +calculate_forward_deps(struct v3d_compile *c, struct dag *dag, + struct list_head *schedule_list) { struct schedule_state state; memset(&state, 0, sizeof(state)); + state.dag = dag; state.devinfo = c->devinfo; state.dir = F; @@ -438,23 +420,28 @@ calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list) } static void -calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list) +calculate_reverse_deps(struct v3d_compile *c, struct dag *dag, + struct list_head *schedule_list) { - struct list_head *node; struct schedule_state state; memset(&state, 0, sizeof(state)); + state.dag = dag; state.devinfo = c->devinfo; state.dir = R; - for (node = schedule_list->prev; schedule_list != node; node = node->prev) { + 
list_for_each_entry_rev(struct schedule_node, node, schedule_list, + link) { calculate_deps(&state, (struct schedule_node *)node); } } struct choose_scoreboard { + struct dag *dag; int tick; int last_magic_sfu_write_tick; + int last_stallable_sfu_reg; + int last_stallable_sfu_tick; int last_ldvary_tick; int last_uniforms_reset_tick; int last_thrsw_tick; @@ -546,6 +533,38 @@ pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst)); } +static bool +qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst, + uint32_t waddr) { + + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) && + inst->raddr_a == waddr) + return true; + + if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) && + !inst->sig.small_imm && (inst->raddr_b == waddr)) + return true; + + return false; +} + +static bool +mux_read_stalls(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 && + qpu_instruction_uses_rf(inst, + scoreboard->last_stallable_sfu_reg); +} + +/* We define a max schedule priority to allow negative priorities as a result of + * subtracting this max when an instruction stalls. So instructions that + * stall have lower priority than regular instructions. */ +#define MAX_SCHEDULE_PRIORITY 16 + static int get_instruction_priority(const struct v3d_qpu_instr *inst) { @@ -564,10 +583,6 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) return next_score; next_score++; - /* XXX perf: We should schedule SFU ALU ops so that the reader is 2 - * instructions after the producer if possible, not just 1. - */ - /* Default score for things that aren't otherwise special. */ baseline_score = next_score; next_score++; @@ -577,6 +592,9 @@ get_instruction_priority(const struct v3d_qpu_instr *inst) return next_score; next_score++; + /* We should increase the maximum if we assert here */ + assert(next_score < MAX_SCHEDULE_PRIORITY); + return baseline_score; } @@ -623,6 +641,37 @@ qpu_accesses_peripheral(const struct v3d_qpu_instr *inst) } static bool +qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo, + const struct v3d_qpu_instr *a, + const struct v3d_qpu_instr *b) +{ + const bool a_uses_peripheral = qpu_accesses_peripheral(a); + const bool b_uses_peripheral = qpu_accesses_peripheral(b); + + /* We can always do one peripheral access per instruction. */ + if (!a_uses_peripheral || !b_uses_peripheral) + return true; + + if (devinfo->ver < 41) + return false; + + /* V3D 4.1 and later allow TMU read along with a VPM read or write, and + * WRTMUC with a TMU magic register write (other than tmuc). + */ + if ((a->sig.ldtmu && v3d_qpu_uses_vpm(b)) || + (b->sig.ldtmu && v3d_qpu_uses_vpm(a))) { + return true; + } + + if ((a->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(b)) || + (b->sig.wrtmuc && v3d_qpu_writes_tmu_not_tmuc(a))) { + return true; + } + + return false; +} + +static bool qpu_merge_inst(const struct v3d_device_info *devinfo, struct v3d_qpu_instr *result, const struct v3d_qpu_instr *a, @@ -633,12 +682,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, return false; } - /* Can't do more than one peripheral access in an instruction. - * - * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and - * WRTMUC with a TMU magic register write (other than tmuc). 
- */ - if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b)) + if (!qpu_compatible_peripheral_access(devinfo, a, b)) return false; struct v3d_qpu_instr merge = *a; @@ -714,7 +758,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo, static struct schedule_node * choose_instruction_to_schedule(const struct v3d_device_info *devinfo, struct choose_scoreboard *scoreboard, - struct list_head *schedule_list, struct schedule_node *prev_inst) { struct schedule_node *chosen = NULL; @@ -728,7 +771,8 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, return NULL; } - list_for_each_entry(struct schedule_node, n, schedule_list, link) { + list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads, + dag.link) { const struct v3d_qpu_instr *inst = &n->inst->qpu; /* Don't choose the branch instruction until it's the last one * choose it. */ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH && - !list_is_singular(schedule_list)) { + !list_is_singular(&scoreboard->dag->heads)) { continue; } @@ -805,6 +849,18 @@ choose_instruction_to_schedule(const struct v3d_device_info *devinfo, int prio = get_instruction_priority(inst); + if (mux_read_stalls(scoreboard, inst)) { + /* Don't merge an instruction that stalls */ + if (prev_inst) + continue; + else { + /* Any instruction that doesn't stall will have + * higher scheduling priority */ + prio -= MAX_SCHEDULE_PRIORITY; + assert(prio < 0); + } + } + /* Found a valid instruction. If nothing better comes along, * this one works. */ @@ -841,6 +897,16 @@ update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard, } static void +update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard, + const struct v3d_qpu_instr *inst) +{ + if (v3d_qpu_instr_is_sfu(inst)) { + scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr; + scoreboard->last_stallable_sfu_tick = scoreboard->tick; + } +} + +static void update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, const struct v3d_qpu_instr *inst) { @@ -853,6 +919,9 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, if (inst->alu.add.magic_write) { update_scoreboard_for_magic_waddr(scoreboard, inst->alu.add.waddr); + } else { + update_scoreboard_for_sfu_stall_waddr(scoreboard, + inst); } } @@ -871,24 +940,24 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, } static void -dump_state(const struct v3d_device_info *devinfo, - struct list_head *schedule_list) +dump_state(const struct v3d_device_info *devinfo, struct dag *dag) { - list_for_each_entry(struct schedule_node, n, schedule_list, link) { + list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) { fprintf(stderr, " t=%4d: ", n->unblocked_time); v3d_qpu_dump(devinfo, &n->inst->qpu); fprintf(stderr, "\n"); - for (int i = 0; i < n->child_count; i++) { - struct schedule_node *child = n->children[i].node; + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct schedule_node *child = + (struct schedule_node *)edge->child; if (!child) continue; fprintf(stderr, " - "); v3d_qpu_dump(devinfo, &child->inst->qpu); fprintf(stderr, " (%d parents, %c)\n", - child->parent_count, - n->children[i].write_after_read ? 'w' : 'r'); + child->dag.parent_count, + edge->data ? 
'w' : 'r'); } } } @@ -952,64 +1021,64 @@ instruction_latency(struct schedule_node *before, struct schedule_node *after) after_inst)); } + if (v3d_qpu_instr_is_sfu(before_inst)) + return 2; + return latency; } /** Recursive computation of the delay member of a node. */ static void -compute_delay(struct schedule_node *n) +compute_delay(struct dag_node *node, void *state) { - if (!n->child_count) { - n->delay = 1; - } else { - for (int i = 0; i < n->child_count; i++) { - if (!n->children[i].node->delay) - compute_delay(n->children[i].node); - n->delay = MAX2(n->delay, - n->children[i].node->delay + - instruction_latency(n, n->children[i].node)); - } + struct schedule_node *n = (struct schedule_node *)node; + + n->delay = 1; + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + struct schedule_node *child = + (struct schedule_node *)edge->child; + + n->delay = MAX2(n->delay, (child->delay + + instruction_latency(n, child))); } } +/* Removes a DAG head, but removing only the WAR edges. (dag_prune_head() + * should be called on it later to finish pruning the other edges). + */ static void -mark_instruction_scheduled(struct list_head *schedule_list, +pre_remove_head(struct dag *dag, struct schedule_node *n) +{ + list_delinit(&n->dag.link); + + util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { + if (edge->data) + dag_remove_edge(dag, edge); + } +} + +static void +mark_instruction_scheduled(struct dag *dag, uint32_t time, - struct schedule_node *node, - bool war_only) + struct schedule_node *node) { if (!node) return; - for (int i = node->child_count - 1; i >= 0; i--) { + util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) { struct schedule_node *child = - node->children[i].node; + (struct schedule_node *)edge->child; if (!child) continue; - if (war_only && !node->children[i].write_after_read) - continue; - - /* If the requirement is only that the node not appear before - * the last read of its destination, then it can be scheduled - * immediately after (or paired with!) the thing reading the - * destination. - */ - uint32_t latency = 0; - if (!war_only) { - latency = instruction_latency(node, - node->children[i].node); - } + uint32_t latency = instruction_latency(node, child); child->unblocked_time = MAX2(child->unblocked_time, time + latency); - child->parent_count--; - if (child->parent_count == 0) - list_add(&child->link, schedule_list); - - node->children[i].node = NULL; } + dag_prune_head(dag, &node->dag); } static void @@ -1028,7 +1097,7 @@ insert_scheduled_instruction(struct v3d_compile *c, static struct qinst * vir_nop() { - struct qreg undef = { QFILE_NULL, 0 }; + struct qreg undef = vir_nop_reg(); struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); return qinst; @@ -1223,7 +1292,6 @@ static uint32_t schedule_instructions(struct v3d_compile *c, struct choose_scoreboard *scoreboard, struct qblock *block, - struct list_head *schedule_list, enum quniform_contents *orig_uniform_contents, uint32_t *orig_uniform_data, uint32_t *next_uniform) @@ -1231,23 +1299,10 @@ schedule_instructions(struct v3d_compile *c, const struct v3d_device_info *devinfo = c->devinfo; uint32_t time = 0; - if (debug) { - fprintf(stderr, "initial deps:\n"); - dump_state(devinfo, schedule_list); - fprintf(stderr, "\n"); - } - - /* Remove non-DAG heads from the list. 
*/ - list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) { - if (n->parent_count != 0) - list_del(&n->link); - } - - while (!list_empty(schedule_list)) { + while (!list_empty(&scoreboard->dag->heads)) { struct schedule_node *chosen = choose_instruction_to_schedule(devinfo, scoreboard, - schedule_list, NULL); struct schedule_node *merge = NULL; @@ -1260,7 +1315,7 @@ schedule_instructions(struct v3d_compile *c, if (debug) { fprintf(stderr, "t=%4d: current list:\n", time); - dump_state(devinfo, schedule_list); + dump_state(devinfo, scoreboard->dag); fprintf(stderr, "t=%4d: chose: ", time); v3d_qpu_dump(devinfo, inst); fprintf(stderr, "\n"); @@ -1278,17 +1333,14 @@ schedule_instructions(struct v3d_compile *c, */ if (chosen) { time = MAX2(chosen->unblocked_time, time); - list_del(&chosen->link); - mark_instruction_scheduled(schedule_list, time, - chosen, true); + pre_remove_head(scoreboard->dag, chosen); while ((merge = choose_instruction_to_schedule(devinfo, scoreboard, - schedule_list, chosen))) { time = MAX2(merge->unblocked_time, time); - list_del(&merge->link); + pre_remove_head(scoreboard->dag, chosen); list_addtail(&merge->link, &merged_list); (void)qpu_merge_inst(devinfo, inst, inst, &merge->inst->qpu); @@ -1307,6 +1359,8 @@ schedule_instructions(struct v3d_compile *c, fprintf(stderr, "\n"); } } + if (mux_read_stalls(scoreboard, inst)) + c->qpu_inst_stalled_count++; } /* Update the uniform index for the rewritten location -- @@ -1334,11 +1388,10 @@ schedule_instructions(struct v3d_compile *c, * be scheduled. Update the children's unblocked time for this * DAG edge as we do so. */ - mark_instruction_scheduled(schedule_list, time, chosen, false); + mark_instruction_scheduled(scoreboard->dag, time, chosen); list_for_each_entry(struct schedule_node, merge, &merged_list, link) { - mark_instruction_scheduled(schedule_list, time, merge, - false); + mark_instruction_scheduled(scoreboard->dag, time, merge); /* The merged VIR instruction doesn't get re-added to the * block, so free it now. @@ -1380,9 +1433,10 @@ qpu_schedule_instructions_block(struct v3d_compile *c, uint32_t *next_uniform) { void *mem_ctx = ralloc_context(NULL); - struct list_head schedule_list; + scoreboard->dag = dag_create(mem_ctx); + struct list_head setup_list; - list_inithead(&schedule_list); + list_inithead(&setup_list); /* Wrap each instruction in a scheduler structure. 
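 *
 * Per block, the flow is: create the DAG, register every qinst as a
 * node, build forward and reverse dependencies, then let the
 * bottom-up traversal compute each node's delay before the scheduling
 * loop drains the DAG heads; in outline:
 *
 *    scoreboard->dag = dag_create(mem_ctx);
 *    ...
 *    calculate_forward_deps(c, scoreboard->dag, &setup_list);
 *    calculate_reverse_deps(c, scoreboard->dag, &setup_list);
 *    dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);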
*/ while (!list_empty(&block->instructions)) { @@ -1390,26 +1444,25 @@ qpu_schedule_instructions_block(struct v3d_compile *c, struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); + dag_init_node(scoreboard->dag, &n->dag); n->inst = qinst; list_del(&qinst->link); - list_addtail(&n->link, &schedule_list); + list_addtail(&n->link, &setup_list); } - calculate_forward_deps(c, &schedule_list); - calculate_reverse_deps(c, &schedule_list); + calculate_forward_deps(c, scoreboard->dag, &setup_list); + calculate_reverse_deps(c, scoreboard->dag, &setup_list); - list_for_each_entry(struct schedule_node, n, &schedule_list, link) { - compute_delay(n); - } + dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL); uint32_t cycles = schedule_instructions(c, scoreboard, block, - &schedule_list, orig_uniform_contents, orig_uniform_data, next_uniform); ralloc_free(mem_ctx); + scoreboard->dag = NULL; return cycles; } @@ -1491,6 +1544,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c) scoreboard.last_magic_sfu_write_tick = -10; scoreboard.last_uniforms_reset_tick = -10; scoreboard.last_thrsw_tick = -10; + scoreboard.last_stallable_sfu_tick = -10; if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); diff --git a/lib/mesa/src/broadcom/compiler/v3d33_tex.c b/lib/mesa/src/broadcom/compiler/v3d33_tex.c index 7e9cd27d3..488021bfc 100644 --- a/lib/mesa/src/broadcom/compiler/v3d33_tex.c +++ b/lib/mesa/src/broadcom/compiler/v3d33_tex.c @@ -106,18 +106,16 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) break; case nir_tex_src_offset: { - nir_const_value *offset = - nir_src_as_const_value(instr->src[i].src); p0_unpacked.texel_offset_for_s_coordinate = - offset->i32[0]; + nir_src_comp_as_int(instr->src[i].src, 0); if (instr->coord_components >= 2) p0_unpacked.texel_offset_for_t_coordinate = - offset->i32[1]; + nir_src_comp_as_int(instr->src[i].src, 1); if (instr->coord_components >= 3) p0_unpacked.texel_offset_for_r_coordinate = - offset->i32[2]; + nir_src_comp_as_int(instr->src[i].src, 2); break; } @@ -161,11 +159,10 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) unit)); } - struct qreg texture_u[] = { - vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), - vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed), + int texture_u[] = { + vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed), + vir_get_uniform_index(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed), }; - uint32_t next_texture_u = 0; for (int i = 0; i < next_coord; i++) { struct qreg dst; @@ -177,11 +174,8 @@ v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]); - if (i < 2) { - tmu->has_implicit_uniform = true; - tmu->src[vir_get_implicit_uniform_src(tmu)] = - texture_u[next_texture_u++]; - } + if (i < 2) + tmu->uniform = texture_u[i]; } vir_emit_thrsw(c); diff --git a/lib/mesa/src/broadcom/compiler/v3d40_tex.c b/lib/mesa/src/broadcom/compiler/v3d40_tex.c index 9f5c56079..1c39289b6 100644 --- a/lib/mesa/src/broadcom/compiler/v3d40_tex.c +++ b/lib/mesa/src/broadcom/compiler/v3d40_tex.c @@ -48,8 +48,7 @@ vir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data { struct qinst *inst = vir_NOP(c); inst->qpu.sig.wrtmuc = true; - inst->has_implicit_uniform = true; - inst->src[0] = vir_uniform(c, contents, data); + inst->uniform = vir_get_uniform_index(c, contents, data); } static const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { @@ -139,14 +138,13 @@ 
v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) case nir_tex_src_offset: { if (nir_src_is_const(instr->src[i].src)) { - nir_const_value *offset = - nir_src_as_const_value(instr->src[i].src); - - p2_unpacked.offset_s = offset->i32[0]; + p2_unpacked.offset_s = nir_src_comp_as_int(instr->src[i].src, 0); if (instr->coord_components >= 2) - p2_unpacked.offset_t = offset->i32[1]; - if (instr->coord_components >= 3) - p2_unpacked.offset_r = offset->i32[2]; + p2_unpacked.offset_t = + nir_src_comp_as_int(instr->src[i].src, 1); + if (non_array_components >= 3) + p2_unpacked.offset_r = + nir_src_comp_as_int(instr->src[i].src, 2); } else { struct qreg mask = vir_uniform_ui(c, 0xf); struct qreg x, y, offset; @@ -185,6 +183,8 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) assert(p1_unpacked.output_type_32_bit || p0_unpacked.return_words_of_texture_data < (1 << 2)); + assert(p0_unpacked.return_words_of_texture_data != 0); + uint32_t p0_packed; V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, (uint8_t *)&p0_packed, @@ -243,6 +243,34 @@ type_size_align_1(const struct glsl_type *type, unsigned *size, unsigned *align) *align = 1; } +static uint32_t +v3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_store: + return V3D_TMU_OP_REGULAR; + case nir_intrinsic_image_deref_atomic_add: + return v3d_get_op_for_atomic_add(instr, 3); + case nir_intrinsic_image_deref_atomic_min: + return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; + case nir_intrinsic_image_deref_atomic_max: + return V3D_TMU_OP_WRITE_UMAX; + case nir_intrinsic_image_deref_atomic_and: + return V3D_TMU_OP_WRITE_AND_READ_INC; + case nir_intrinsic_image_deref_atomic_or: + return V3D_TMU_OP_WRITE_OR_READ_DEC; + case nir_intrinsic_image_deref_atomic_xor: + return V3D_TMU_OP_WRITE_XOR_READ_NOT; + case nir_intrinsic_image_deref_atomic_exchange: + return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; + case nir_intrinsic_image_deref_atomic_comp_swap: + return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; + default: + unreachable("unknown image intrinsic"); + }; +} + void v3d40_vir_emit_image_load_store(struct v3d_compile *c, nir_intrinsic_instr *instr) @@ -264,42 +292,15 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; - /* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR - * wants to have support for inc/dec? 
- */ - switch (instr->intrinsic) { - case nir_intrinsic_image_deref_load: - case nir_intrinsic_image_deref_store: - p2_unpacked.op = V3D_TMU_OP_REGULAR; - break; - case nir_intrinsic_image_deref_atomic_add: - p2_unpacked.op = V3D_TMU_OP_WRITE_ADD_READ_PREFETCH; - break; - case nir_intrinsic_image_deref_atomic_min: - p2_unpacked.op = V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; - break; + p2_unpacked.op = v3d40_image_load_store_tmu_op(instr); - case nir_intrinsic_image_deref_atomic_max: - p2_unpacked.op = V3D_TMU_OP_WRITE_UMAX; - break; - case nir_intrinsic_image_deref_atomic_and: - p2_unpacked.op = V3D_TMU_OP_WRITE_AND_READ_INC; - break; - case nir_intrinsic_image_deref_atomic_or: - p2_unpacked.op = V3D_TMU_OP_WRITE_OR_READ_DEC; - break; - case nir_intrinsic_image_deref_atomic_xor: - p2_unpacked.op = V3D_TMU_OP_WRITE_XOR_READ_NOT; - break; - case nir_intrinsic_image_deref_atomic_exchange: - p2_unpacked.op = V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; - break; - case nir_intrinsic_image_deref_atomic_comp_swap: - p2_unpacked.op = V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; - break; - default: - unreachable("unknown image intrinsic"); - }; + /* If we were able to replace atomic_add with an inc/dec, then we + * need to do things slightly differently, like not loading the + * amount to add/sub, as that is implicit. + */ + bool atomic_add_replaced = (instr->intrinsic == nir_intrinsic_image_deref_atomic_add && + (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || + p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC)); bool is_1d = false; switch (glsl_get_sampler_dim(sampler_type)) { @@ -368,7 +369,8 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); /* Emit the data writes for atomics or image store. */ - if (instr->intrinsic != nir_intrinsic_image_deref_load) { + if (instr->intrinsic != nir_intrinsic_image_deref_load && + !atomic_add_replaced) { /* Vector for stores, or first atomic argument */ struct qreg src[4]; for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) { @@ -386,9 +388,21 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c, } } + if (vir_in_nonuniform_control_flow(c) && + instr->intrinsic != nir_intrinsic_image_deref_load) { + vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), + V3D_QPU_PF_PUSHZ); + } + vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, ntq_get_src(c, instr->src[1], 0), &tmu_writes); + if (vir_in_nonuniform_control_flow(c) && + instr->intrinsic != nir_intrinsic_image_deref_load) { + struct qinst *last_inst = (struct qinst *)c->cur_block->instructions.prev; + vir_set_cond(last_inst, V3D_QPU_COND_IFA); + } + vir_emit_thrsw(c); /* The input FIFO has 16 slots across all threads, so make sure we diff --git a/lib/mesa/src/broadcom/compiler/v3d_compiler.h b/lib/mesa/src/broadcom/compiler/v3d_compiler.h index 671aba3c5..b61119f56 100644 --- a/lib/mesa/src/broadcom/compiler/v3d_compiler.h +++ b/lib/mesa/src/broadcom/compiler/v3d_compiler.h @@ -69,9 +69,6 @@ enum qfile { * or physical registers later. */ QFILE_TEMP, - QFILE_UNIF, - QFILE_TLB, - QFILE_TLBU, /** * VPM reads use this with an index value to say what part of the VPM @@ -105,6 +102,16 @@ static inline struct qreg vir_reg(enum qfile file, uint32_t index) return (struct qreg){file, index}; } +static inline struct qreg vir_magic_reg(uint32_t index) +{ + return (struct qreg){QFILE_MAGIC, index}; +} + +static inline struct qreg vir_nop_reg(void) +{ + return (struct qreg){QFILE_NULL, 0}; +} + /** * A reference to an actual register at the QPU level, for register * allocation. 
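 *
 * Pre-register-allocation code works on struct qreg instead; the
 * vir_magic_reg()/vir_nop_reg() helpers above cover its two special
 * cases, e.g. a flags-only update that discards the ALU result
 * (a sketch of the pattern used throughout nir_to_vir):
 *
 *    vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
 *               V3D_QPU_PF_PUSHZ);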
@@ -129,12 +136,11 @@ struct qinst { /* Pre-register-allocation references to src/dst registers */ struct qreg dst; struct qreg src[3]; - bool cond_is_exec_mask; - bool has_implicit_uniform; bool is_last_thrsw; - /* After vir_to_qpu.c: If instr reads a uniform, which uniform from - * the uncompiled stream it is. + /* If the instruction reads a uniform (other than through src[i].file + * == QFILE_UNIF), that uniform's index in c->uniform_contents. ~0 + * otherwise. */ int uniform; }; @@ -275,17 +281,18 @@ enum quniform_contents { QUNIFORM_SHARED_OFFSET, }; -static inline uint32_t v3d_tmu_config_data_create(uint32_t unit, uint32_t value) +static inline uint32_t v3d_unit_data_create(uint32_t unit, uint32_t value) { + assert(value < (1 << 24)); return unit << 24 | value; } -static inline uint32_t v3d_tmu_config_data_get_unit(uint32_t data) +static inline uint32_t v3d_unit_data_get_unit(uint32_t data) { return data >> 24; } -static inline uint32_t v3d_tmu_config_data_get_value(uint32_t data) +static inline uint32_t v3d_unit_data_get_offset(uint32_t data) { return data & 0xffffff; } @@ -311,25 +318,6 @@ static inline uint8_t v3d_slot_get_component(struct v3d_varying_slot slot) return slot.slot_and_component & 3; } -struct v3d_ubo_range { - /** - * offset in bytes from the start of the ubo where this range is - * uploaded. - * - * Only set once used is set. - */ - uint32_t dst_offset; - - /** - * offset in bytes from the start of the gallium uniforms where the - * data comes from. - */ - uint32_t src_offset; - - /** size in bytes of this ubo range */ - uint32_t size; -}; - struct v3d_key { void *shader_state; struct { @@ -357,7 +345,8 @@ struct v3d_fs_key { bool sample_alpha_to_one; bool clamp_color; bool shade_model_flat; - uint8_t nr_cbufs; + /* Mask of which color render targets are present. */ + uint8_t cbufs; uint8_t swap_color_rb; /* Mask of which render targets need to be written as 32-bit floats */ uint8_t f32_color_rb; @@ -366,6 +355,15 @@ struct v3d_fs_key { */ uint8_t int_color_rb; uint8_t uint_color_rb; + + /* Color format information per render target. Only set when logic + * operations are enabled. + */ + struct { + enum pipe_format format; + const uint8_t *swizzle; + } color_fmt[V3D_MAX_DRAW_BUFFERS]; + uint8_t alpha_test_func; uint8_t logicop_func; uint32_t point_sprite_mask; @@ -413,6 +411,8 @@ struct qblock { /** @{ used by v3d_vir_live_variables.c */ BITSET_WORD *def; + BITSET_WORD *defin; + BITSET_WORD *defout; BITSET_WORD *use; BITSET_WORD *live_in; BITSET_WORD *live_out; @@ -469,6 +469,8 @@ vir_after_block(struct qblock *block) struct v3d_compiler { const struct v3d_device_info *devinfo; struct ra_regs *regs; + unsigned int reg_class_any[3]; + unsigned int reg_class_r5[3]; unsigned int reg_class_phys[3]; unsigned int reg_class_phys_or_acc[3]; }; @@ -502,8 +504,8 @@ struct v3d_compile { struct qreg *inputs; struct qreg *outputs; bool msaa_per_sample_output; - struct qreg color_reads[V3D_MAX_SAMPLES]; - struct qreg sample_colors[V3D_MAX_SAMPLES]; + struct qreg color_reads[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4]; + struct qreg sample_colors[V3D_MAX_DRAW_BUFFERS * V3D_MAX_SAMPLES * 4]; uint32_t inputs_array_size; uint32_t outputs_array_size; uint32_t uniforms_array_size; @@ -520,13 +522,7 @@ struct v3d_compile { bool uses_center_w; bool writes_z; - - struct v3d_ubo_range *ubo_ranges; - bool *ubo_range_used; - uint32_t ubo_ranges_array_size; - /** Number of uniform areas tracked in ubo_ranges. 
*/ - uint32_t num_ubo_ranges; - uint32_t next_ubo_dst_offset; + bool uses_implicit_point_line_varyings; /* State for whether we're executing on each channel currently. 0 if * yes, otherwise a block number + 1 that the channel jumped to. @@ -556,7 +552,7 @@ struct v3d_compile { int local_invocation_index_bits; uint8_t vattr_sizes[V3D_MAX_VS_INPUTS / 4]; - uint32_t num_vpm_writes; + uint32_t vpm_output_size; /* Size in bytes of registers that have been spilled. This is how much * space needs to be available in the spill BO per thread per QPU. @@ -600,10 +596,8 @@ struct v3d_compile { enum quniform_contents *uniform_contents; uint32_t uniform_array_size; uint32_t num_uniforms; - uint32_t num_outputs; uint32_t output_position_index; nir_variable *output_color_var[4]; - uint32_t output_point_size_index; uint32_t output_sample_mask_index; struct qreg undef; @@ -619,24 +613,13 @@ struct v3d_compile { uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; + uint32_t qpu_inst_stalled_count; /* For the FS, the number of varying inputs not counting the * point/line varyings payload */ uint32_t num_inputs; - /** - * Number of inputs from num_inputs remaining to be queued to the read - * FIFO in the VS/CS. - */ - uint32_t num_inputs_remaining; - - /* Number of inputs currently in the read FIFO for the VS/CS */ - uint32_t num_inputs_in_fifo; - - /** Next offset in the VPM to read from in the VS/CS */ - uint32_t vpm_read_offset; - uint32_t program_id; uint32_t variant_id; @@ -652,6 +635,9 @@ struct v3d_compile { struct qinst *last_thrsw; bool last_thrsw_at_top_level; + bool emitted_tlb_load; + bool lock_scoreboard_on_first_thrsw; + bool failed; }; @@ -664,12 +650,8 @@ struct v3d_uniform_list { struct v3d_prog_data { struct v3d_uniform_list uniforms; - struct v3d_ubo_range *ubo_ranges; - uint32_t num_ubo_ranges; - uint32_t ubo_size; uint32_t spill_size; - uint8_t num_inputs; uint8_t threads; /* For threads > 1, whether the program should be dispatched in the @@ -717,17 +699,25 @@ struct v3d_fs_prog_data { uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; + uint8_t num_inputs; bool writes_z; bool disable_ez; bool uses_center_w; + bool uses_implicit_point_line_varyings; + bool lock_scoreboard_on_first_thrsw; }; -/* Special nir_load_input intrinsic index for loading the current TLB - * destination color. - */ -#define V3D_NIR_TLB_COLOR_READ_INPUT 2000000000 +struct v3d_compute_prog_data { + struct v3d_prog_data base; + /* Size in bytes of the workgroup's shared space. 
*/ + uint32_t shared_size; +}; -#define V3D_NIR_MS_MASK_OUTPUT 2000000000 +static inline bool +vir_has_uniform(struct qinst *inst) +{ + return inst->uniform != ~0; +} extern const nir_shader_compiler_options v3d_nir_options; @@ -758,12 +748,17 @@ struct qinst *vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1); struct qinst *vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1); -struct qinst *vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src0); +struct qinst *vir_branch_inst(struct v3d_compile *c, + enum v3d_qpu_branch_cond cond); void vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst); +uint32_t vir_get_uniform_index(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data); struct qreg vir_uniform(struct v3d_compile *c, enum quniform_contents contents, uint32_t data); void vir_schedule_instructions(struct v3d_compile *c); +void v3d_setup_spill_base(struct v3d_compile *c); struct v3d_qpu_instr v3d_qpu_nop(void); struct qreg vir_emit_def(struct v3d_compile *c, struct qinst *inst); @@ -777,9 +772,6 @@ void vir_set_unpack(struct qinst *inst, int src, struct qreg vir_get_temp(struct v3d_compile *c); void vir_emit_last_thrsw(struct v3d_compile *c); void vir_calculate_live_intervals(struct v3d_compile *c); -bool vir_has_implicit_uniform(struct qinst *inst); -int vir_get_implicit_uniform_src(struct qinst *inst); -int vir_get_non_sideband_nsrc(struct qinst *inst); int vir_get_nsrc(struct qinst *inst); bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); bool vir_get_add_op(struct qinst *inst, enum v3d_qpu_add_op *op); @@ -788,7 +780,6 @@ bool vir_is_raw_mov(struct qinst *inst); bool vir_is_tex(struct qinst *inst); bool vir_is_add(struct qinst *inst); bool vir_is_mul(struct qinst *inst); -bool vir_is_float_input(struct qinst *inst); bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst); bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst); struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg); @@ -810,10 +801,13 @@ bool vir_opt_constant_folding(struct v3d_compile *c); bool vir_opt_copy_propagate(struct v3d_compile *c); bool vir_opt_dead_code(struct v3d_compile *c); bool vir_opt_peephole_sf(struct v3d_compile *c); +bool vir_opt_redundant_flags(struct v3d_compile *c); bool vir_opt_small_immediates(struct v3d_compile *c); bool vir_opt_vpm(struct v3d_compile *c); void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c); void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c); +void v3d_nir_lower_scratch(nir_shader *s); void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); void v3d_nir_lower_image_load_store(nir_shader *s); void vir_lower_uniforms(struct v3d_compile *c); @@ -833,7 +827,8 @@ bool vir_init_reg_sets(struct v3d_compiler *compiler); bool v3d_gl_format_is_return_32(GLenum format); -void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf); +uint32_t +v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src); static inline bool quniform_contents_is_texture_p0(enum quniform_contents contents) @@ -843,6 +838,12 @@ quniform_contents_is_texture_p0(enum quniform_contents contents) V3D_MAX_TEXTURE_SAMPLERS)); } +static inline bool +vir_in_nonuniform_control_flow(struct v3d_compile *c) +{ + return c->execute.file != QFILE_NULL; +} + static inline struct qreg vir_uniform_ui(struct 
v3d_compile *c, uint32_t ui) { @@ -1086,6 +1087,30 @@ vir_UMUL(struct v3d_compile *c, struct qreg src0, struct qreg src1) return vir_UMUL24(c, src0, src1); } +static inline struct qreg +vir_TLBU_COLOR_READ(struct v3d_compile *c, uint32_t config) +{ + assert(c->devinfo->ver >= 41); /* XXX */ + assert((config & 0xffffff00) == 0xffffff00); + + struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtlb->qpu.sig.ldtlbu = true; + ldtlb->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, config); + return vir_emit_def(c, ldtlb); +} + +static inline struct qreg +vir_TLB_COLOR_READ(struct v3d_compile *c) +{ + assert(c->devinfo->ver >= 41); /* XXX */ + + struct qinst *ldtlb = vir_add_inst(V3D_QPU_A_NOP, c->undef, + c->undef, c->undef); + ldtlb->qpu.sig.ldtlb = true; + return vir_emit_def(c, ldtlb); +} + /* static inline struct qreg vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) @@ -1114,7 +1139,7 @@ static inline struct qinst * vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { /* The actual uniform_data value will be set at scheduling time */ - return vir_emit_nondef(c, vir_branch_inst(cond, vir_uniform_ui(c, 0))); + return vir_emit_nondef(c, vir_branch_inst(c, cond)); } #define vir_for_each_block(block, c) \ @@ -1143,4 +1168,8 @@ vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) vir_for_each_block(_block, c) \ vir_for_each_inst(inst, _block) +#define vir_for_each_inst_inorder_safe(inst, c) \ + vir_for_each_block(_block, c) \ + vir_for_each_inst_safe(inst, _block) + #endif /* V3D_COMPILER_H */ diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c index b65a82b7f..2a68efb7b 100644 --- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_io.c @@ -28,11 +28,47 @@ * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. * - * After moving more and more logic to NIR, all that's left here is fixing up - * addressing on uniform loads. FS input and VS output scalarization is - * handled by nir_lower_io_to_scalar(). + * Most of the work is turning the VS's store_output intrinsics from working + * on a base representing the gallium-level vec4 driver_location to an offset + * within the VPM, and emitting the header that's read by the fixed function + * hardware between the VS and FS. + * + * We also adjust the offsets on uniform loads to be in bytes, since that's + * what we need for indirect addressing with general TMU access. */ +struct v3d_nir_lower_io_state { + int pos_vpm_offset; + int vp_vpm_offset; + int zs_vpm_offset; + int rcp_wc_vpm_offset; + int psiz_vpm_offset; + int varyings_vpm_offset; + + BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; + + nir_ssa_def *pos[4]; +}; + +static void +v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan) +{ + nir_intrinsic_instr *intr = + nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output); + nir_ssa_dest_init(&intr->instr, &intr->dest, + 1, intr->dest.ssa.bit_size, NULL); + intr->num_components = 1; + + intr->src[0] = nir_src_for_ssa(chan); + intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); + + nir_intrinsic_set_base(intr, base); + nir_intrinsic_set_write_mask(intr, 0x1); + nir_intrinsic_set_component(intr, 0); + + nir_builder_instr_insert(b, &intr->instr); +} + /* Convert the uniform offset to bytes. 
If it happens to be a constant, * constant-folding will clean up the shift for us. */ @@ -50,9 +86,90 @@ v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b, nir_imm_int(b, 4)))); } +static int +v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan) +{ + int component = var->data.location_frac + chan; + + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + struct v3d_varying_slot slot = c->vs_key->fs_inputs[i]; + + if (v3d_slot_get_slot(slot) == var->data.location && + v3d_slot_get_component(slot) == component) { + return i; + } + } + + return -1; +} + +/* Lowers a store_output(gallium driver location) to a series of store_outputs + * with a driver_location equal to the offset in the VPM. + */ +static void +v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr, + struct v3d_nir_lower_io_state *state) +{ + b->cursor = nir_before_instr(&intr->instr); + + int start_comp = nir_intrinsic_component(intr); + nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0], + intr->num_components); + + nir_variable *var = NULL; + nir_foreach_variable(scan_var, &c->s->outputs) { + if (scan_var->data.driver_location != nir_intrinsic_base(intr) || + start_comp < scan_var->data.location_frac || + start_comp >= scan_var->data.location_frac + + glsl_get_components(scan_var->type)) { + continue; + } + var = scan_var; + } + + /* Save off the components of the position for the setup of VPM inputs + * read by fixed function HW. + */ + if (var->data.location == VARYING_SLOT_POS) { + for (int i = 0; i < intr->num_components; i++) { + state->pos[start_comp + i] = nir_channel(b, src, i); + } + } + + /* Just write psiz to the position in the FF header right now. */ + if (var->data.location == VARYING_SLOT_PSIZ && + state->psiz_vpm_offset != -1) { + v3d_nir_store_output(b, state->psiz_vpm_offset, src); + } + + /* Scalarize outputs if it hasn't happened already, since we want to + * schedule each VPM write individually. We can skip any output + * components not read by the FS. + */ + for (int i = 0; i < intr->num_components; i++) { + int vpm_offset = + v3d_varying_slot_vpm_offset(c, var, + i + + start_comp - + var->data.location_frac); + + if (vpm_offset == -1) + continue; + + BITSET_SET(state->varyings_stored, vpm_offset); + + v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset, + nir_channel(b, src, i)); + } + + nir_instr_remove(&intr->instr); +} + static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, - struct nir_instr *instr) + struct nir_instr *instr, + struct v3d_nir_lower_io_state *state) { if (instr->type != nir_instr_type_intrinsic) return; @@ -63,33 +180,171 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, v3d_nir_lower_uniform(c, b, intr); break; + case nir_intrinsic_store_output: + if (c->s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_lower_vpm_output(c, b, intr, state); + break; + default: break; } } -static bool -v3d_nir_lower_io_impl(struct v3d_compile *c, nir_function_impl *impl) +/* Remap the output var's .driver_location. This is purely for + * nir_print_shader() so that store_output can map back to a variable name.
+ */ +static void +v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c, + struct v3d_nir_lower_io_state *state) +{ + nir_foreach_variable_safe(var, &c->s->outputs) { + if (var->data.location == VARYING_SLOT_POS && + state->pos_vpm_offset != -1) { + var->data.driver_location = state->pos_vpm_offset; + continue; + } + + if (var->data.location == VARYING_SLOT_PSIZ && + state->psiz_vpm_offset != -1) { + var->data.driver_location = state->psiz_vpm_offset; + continue; + } + + int vpm_offset = v3d_varying_slot_vpm_offset(c, var, 0); + if (vpm_offset != -1) { + var->data.driver_location = + state->varyings_vpm_offset + vpm_offset; + } else { + /* If we couldn't find a mapping for the var, delete + * it so that its old .driver_location doesn't confuse + * nir_print_shader(). + */ + exec_node_remove(&var->node); + } + } +} + +static void +v3d_nir_setup_vpm_layout(struct v3d_compile *c, + struct v3d_nir_lower_io_state *state) +{ + uint32_t vpm_offset = 0; + + if (c->vs_key->is_coord) { + state->pos_vpm_offset = vpm_offset; + vpm_offset += 4; + } else { + state->pos_vpm_offset = -1; + } + + state->vp_vpm_offset = vpm_offset; + vpm_offset += 2; + + if (!c->vs_key->is_coord) { + state->zs_vpm_offset = vpm_offset++; + state->rcp_wc_vpm_offset = vpm_offset++; + } else { + state->zs_vpm_offset = -1; + state->rcp_wc_vpm_offset = -1; + } + + if (c->vs_key->per_vertex_point_size) + state->psiz_vpm_offset = vpm_offset++; + else + state->psiz_vpm_offset = -1; + + state->varyings_vpm_offset = vpm_offset; + + c->vpm_output_size = vpm_offset + c->vs_key->num_fs_inputs; +} + +static void +v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b, + struct v3d_nir_lower_io_state *state) { - nir_builder b; - nir_builder_init(&b, impl); + for (int i = 0; i < 4; i++) { + if (!state->pos[i]) + state->pos[i] = nir_ssa_undef(b, 1, 32); + } + + nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]); + + if (state->pos_vpm_offset != -1) { + for (int i = 0; i < 4; i++) { + v3d_nir_store_output(b, state->pos_vpm_offset + i, + state->pos[i]); + } + } - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) - v3d_nir_lower_io_instr(c, &b, instr); + for (int i = 0; i < 2; i++) { + nir_ssa_def *pos; + nir_ssa_def *scale; + pos = state->pos[i]; + if (i == 0) + scale = nir_load_viewport_x_scale(b); + else + scale = nir_load_viewport_y_scale(b); + pos = nir_fmul(b, pos, scale); + pos = nir_fmul(b, pos, rcp_wc); + pos = nir_f2i32(b, nir_fround_even(b, pos)); + v3d_nir_store_output(b, state->vp_vpm_offset + i, + pos); } - nir_metadata_preserve(impl, nir_metadata_block_index | - nir_metadata_dominance); + if (state->zs_vpm_offset != -1) { + nir_ssa_def *z = state->pos[2]; + z = nir_fmul(b, z, nir_load_viewport_z_scale(b)); + z = nir_fmul(b, z, rcp_wc); + z = nir_fadd(b, z, nir_load_viewport_z_offset(b)); + v3d_nir_store_output(b, state->zs_vpm_offset, z); + } + + if (state->rcp_wc_vpm_offset != -1) + v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc); - return true; + /* Store 0 to varyings requested by the FS but not stored in the VS. + * This should be undefined behavior, but glsl-routing seems to rely + * on it. + */ + for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { + if (!BITSET_TEST(state->varyings_stored, i)) { + v3d_nir_store_output(b, state->varyings_vpm_offset + i, + nir_imm_int(b, 0)); + } + } } void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c) { + struct v3d_nir_lower_io_state state = { 0 }; + + /* Set up the layout of the VPM outputs. 
*/ + if (s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_setup_vpm_layout(c, &state); + nir_foreach_function(function, s) { - if (function->impl) - v3d_nir_lower_io_impl(c, function->impl); + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) + v3d_nir_lower_io_instr(c, &b, instr, + &state); + } + + nir_block *last = nir_impl_last_block(function->impl); + b.cursor = nir_after_block(last); + if (s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_emit_ff_vpm_outputs(c, &b, &state); + + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } } + + if (s->info.stage == MESA_SHADER_VERTEX) + v3d_nir_lower_io_update_output_var_base(c, &state); } diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c new file mode 100644 index 000000000..5c3a7c58a --- /dev/null +++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_logic_ops.c @@ -0,0 +1,411 @@ +/* + * Copyright © 2019 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * Implements lowering for logical operations. + * + * V3D doesn't have any hardware support for logic ops. Instead, you read the + * current contents of the destination from the tile buffer, then do math using + * your output color and that destination value, and update the output color + * appropriately. 
+ */ + +#include "util/u_format.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_format_convert.h" +#include "v3d_compiler.h" + + +typedef nir_ssa_def *(*nir_pack_func)(nir_builder *b, nir_ssa_def *c); +typedef nir_ssa_def *(*nir_unpack_func)(nir_builder *b, nir_ssa_def *c); + +static bool +logicop_depends_on_dst_color(int logicop_func) +{ + switch (logicop_func) { + case PIPE_LOGICOP_SET: + case PIPE_LOGICOP_CLEAR: + case PIPE_LOGICOP_COPY: + case PIPE_LOGICOP_COPY_INVERTED: + return false; + default: + return true; + } +} + +static nir_ssa_def * +v3d_logicop(nir_builder *b, int logicop_func, + nir_ssa_def *src, nir_ssa_def *dst) +{ + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + return nir_imm_int(b, 0); + case PIPE_LOGICOP_NOR: + return nir_inot(b, nir_ior(b, src, dst)); + case PIPE_LOGICOP_AND_INVERTED: + return nir_iand(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_COPY_INVERTED: + return nir_inot(b, src); + case PIPE_LOGICOP_AND_REVERSE: + return nir_iand(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_INVERT: + return nir_inot(b, dst); + case PIPE_LOGICOP_XOR: + return nir_ixor(b, src, dst); + case PIPE_LOGICOP_NAND: + return nir_inot(b, nir_iand(b, src, dst)); + case PIPE_LOGICOP_AND: + return nir_iand(b, src, dst); + case PIPE_LOGICOP_EQUIV: + return nir_inot(b, nir_ixor(b, src, dst)); + case PIPE_LOGICOP_NOOP: + return dst; + case PIPE_LOGICOP_OR_INVERTED: + return nir_ior(b, nir_inot(b, src), dst); + case PIPE_LOGICOP_OR_REVERSE: + return nir_ior(b, src, nir_inot(b, dst)); + case PIPE_LOGICOP_OR: + return nir_ior(b, src, dst); + case PIPE_LOGICOP_SET: + return nir_imm_int(b, ~0); + default: + fprintf(stderr, "Unknown logic op %d\n", logicop_func); + /* FALLTHROUGH */ + case PIPE_LOGICOP_COPY: + return src; + } +} + +static nir_ssa_def * +v3d_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) +{ + switch (swiz) { + default: + case PIPE_SWIZZLE_NONE: + fprintf(stderr, "warning: unknown swizzle\n"); + /* FALLTHROUGH */ + case PIPE_SWIZZLE_0: + return nir_imm_float(b, 0.0); + case PIPE_SWIZZLE_1: + return nir_imm_float(b, 1.0); + case PIPE_SWIZZLE_X: + case PIPE_SWIZZLE_Y: + case PIPE_SWIZZLE_Z: + case PIPE_SWIZZLE_W: + return srcs[swiz]; + } +} + +static nir_ssa_def * +v3d_nir_swizzle_and_pack(nir_builder *b, nir_ssa_def **chans, + const uint8_t *swiz, nir_pack_func pack_func) +{ + nir_ssa_def *c[4]; + for (int i = 0; i < 4; i++) + c[i] = v3d_nir_get_swizzled_channel(b, chans, swiz[i]); + + return pack_func(b, nir_vec4(b, c[0], c[1], c[2], c[3])); +} + +static nir_ssa_def * +v3d_nir_unpack_and_swizzle(nir_builder *b, nir_ssa_def *packed, + const uint8_t *swiz, nir_unpack_func unpack_func) +{ + nir_ssa_def *unpacked = unpack_func(b, packed); + + nir_ssa_def *unpacked_chans[4]; + for (int i = 0; i < 4; i++) + unpacked_chans[i] = nir_channel(b, unpacked, i); + + nir_ssa_def *c[4]; + for (int i = 0; i < 4; i++) + c[i] = v3d_nir_get_swizzled_channel(b, unpacked_chans, swiz[i]); + + return nir_vec4(b, c[0], c[1], c[2], c[3]); +} + +static nir_ssa_def * +pack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +{ + const unsigned bits[4] = { 10, 10, 10, 2 }; + nir_ssa_def *unorm = nir_format_float_to_unorm(b, c, bits); + + nir_ssa_def *chans[4]; + for (int i = 0; i < 4; i++) + chans[i] = nir_channel(b, unorm, i); + + nir_ssa_def *result = nir_mov(b, chans[0]); + int offset = bits[0]; + for (int i = 1; i < 4; i++) { + nir_ssa_def *shifted_chan = + nir_ishl(b, chans[i], nir_imm_int(b, offset)); + result = nir_ior(b, result, shifted_chan); + offset 
+= bits[i]; + } + return result; +} + +static nir_ssa_def * +unpack_unorm_rgb10a2(nir_builder *b, nir_ssa_def *c) +{ + const unsigned bits[4] = { 10, 10, 10, 2 }; + const unsigned masks[4] = { BITFIELD_MASK(bits[0]), + BITFIELD_MASK(bits[1]), + BITFIELD_MASK(bits[2]), + BITFIELD_MASK(bits[3]) }; + + nir_ssa_def *chans[4]; + for (int i = 0; i < 4; i++) { + nir_ssa_def *unorm = nir_iand(b, c, nir_imm_int(b, masks[i])); + chans[i] = nir_format_unorm_to_float(b, unorm, &bits[i]); + c = nir_ushr(b, c, nir_imm_int(b, bits[i])); + } + + return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]); +} + +static const uint8_t * +v3d_get_format_swizzle_for_rt(struct v3d_compile *c, int rt) +{ + static const uint8_t ident[4] = { 0, 1, 2, 3 }; + + /* We will automatically swap R and B channels for BGRA formats + * on tile loads and stores (see 'swap_rb' field in v3d_resource) so + * we want to treat these surfaces as if they were regular RGBA formats. + */ + if (c->fs_key->color_fmt[rt].swizzle[0] == 2 && + c->fs_key->color_fmt[rt].format != PIPE_FORMAT_B5G6R5_UNORM) { + return ident; + } else { + return c->fs_key->color_fmt[rt].swizzle; + } +} + +static nir_ssa_def * +v3d_nir_get_tlb_color(nir_builder *b, int rt, int sample) +{ + nir_ssa_def *color[4]; + for (int i = 0; i < 4; i++) { + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_load_tlb_color_v3d); + load->num_components = 1; + nir_intrinsic_set_base(load, sample); + nir_intrinsic_set_component(load, i); + load->src[0] = nir_src_for_ssa(nir_imm_int(b, rt)); + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); + nir_builder_instr_insert(b, &load->instr); + color[i] = &load->dest.ssa; + } + + return nir_vec4(b, color[0], color[1], color[2], color[3]); +} + +static nir_ssa_def * +v3d_emit_logic_op_raw(struct v3d_compile *c, nir_builder *b, + nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + int rt, int sample) +{ + const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); + + nir_ssa_def *op_res[4]; + for (int i = 0; i < 4; i++) { + nir_ssa_def *src = src_chans[i]; + nir_ssa_def *dst = + v3d_nir_get_swizzled_channel(b, dst_chans, fmt_swz[i]); + op_res[i] = v3d_logicop(b, c->fs_key->logicop_func, src, dst); + } + + nir_ssa_def *r[4]; + for (int i = 0; i < 4; i++) + r[i] = v3d_nir_get_swizzled_channel(b, op_res, fmt_swz[i]); + + return nir_vec4(b, r[0], r[1], r[2], r[3]); +} + +static nir_ssa_def * +v3d_emit_logic_op_unorm(struct v3d_compile *c, nir_builder *b, + nir_ssa_def **src_chans, nir_ssa_def **dst_chans, + int rt, int sample, + nir_pack_func pack_func, nir_unpack_func unpack_func) +{ + const uint8_t src_swz[4] = { 0, 1, 2, 3 }; + nir_ssa_def *packed_src = + v3d_nir_swizzle_and_pack(b, src_chans, src_swz, pack_func); + + const uint8_t *fmt_swz = v3d_get_format_swizzle_for_rt(c, rt); + nir_ssa_def *packed_dst = + v3d_nir_swizzle_and_pack(b, dst_chans, fmt_swz, pack_func); + + nir_ssa_def *packed_result = + v3d_logicop(b, c->fs_key->logicop_func, packed_src, packed_dst); + + return v3d_nir_unpack_and_swizzle(b, packed_result, fmt_swz, unpack_func); +} + +static nir_ssa_def * +v3d_nir_emit_logic_op(struct v3d_compile *c, nir_builder *b, + nir_ssa_def *src, int rt, int sample) +{ + nir_ssa_def *dst = v3d_nir_get_tlb_color(b, rt, sample); + + nir_ssa_def *src_chans[4], *dst_chans[4]; + for (unsigned i = 0; i < 4; i++) { + src_chans[i] = nir_channel(b, src, i); + dst_chans[i] = nir_channel(b, dst, i); + } + + if (c->fs_key->color_fmt[rt].format == PIPE_FORMAT_R10G10B10A2_UNORM) { + return 
v3d_emit_logic_op_unorm( + c, b, src_chans, dst_chans, rt, 0, + pack_unorm_rgb10a2, unpack_unorm_rgb10a2); + } + + if (util_format_is_unorm(c->fs_key->color_fmt[rt].format)) { + return v3d_emit_logic_op_unorm( + c, b, src_chans, dst_chans, rt, 0, + nir_pack_unorm_4x8, nir_unpack_unorm_4x8); + } + + return v3d_emit_logic_op_raw(c, b, src_chans, dst_chans, rt, 0); +} + +static void +v3d_emit_ms_output(struct v3d_compile *c, nir_builder *b, + nir_ssa_def *color, nir_src *offset, + nir_alu_type type, int rt, int sample) +{ + + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, + nir_intrinsic_store_tlb_sample_color_v3d); + store->num_components = 4; + nir_intrinsic_set_base(store, sample); + nir_intrinsic_set_component(store, 0); + nir_intrinsic_set_type(store, type); + store->src[0] = nir_src_for_ssa(color); + store->src[1] = nir_src_for_ssa(nir_imm_int(b, rt)); + nir_builder_instr_insert(b, &store->instr); +} + +static void +v3d_nir_lower_logic_op_instr(struct v3d_compile *c, + nir_builder *b, + nir_intrinsic_instr *intr, + int rt) +{ + nir_ssa_def *frag_color = intr->src[0].ssa; + + + const int logic_op = c->fs_key->logicop_func; + if (c->fs_key->msaa && logicop_depends_on_dst_color(logic_op)) { + c->msaa_per_sample_output = true; + + nir_src *offset = &intr->src[1]; + nir_alu_type type = nir_intrinsic_type(intr); + for (int i = 0; i < V3D_MAX_SAMPLES; i++) { + nir_ssa_def *sample = + v3d_nir_emit_logic_op(c, b, frag_color, rt, i); + + v3d_emit_ms_output(c, b, sample, offset, type, rt, i); + } + + nir_instr_remove(&intr->instr); + } else { + nir_ssa_def *result = + v3d_nir_emit_logic_op(c, b, frag_color, rt, 0); + + nir_instr_rewrite_src(&intr->instr, &intr->src[0], + nir_src_for_ssa(result)); + intr->num_components = result->num_components; + } +} + +static bool +v3d_nir_lower_logic_ops_block(nir_block *block, struct v3d_compile *c) +{ + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; + + nir_foreach_variable(var, &c->s->outputs) { + const int driver_loc = var->data.driver_location; + if (driver_loc != nir_intrinsic_base(intr)) + continue; + + const int loc = var->data.location; + if (loc != FRAG_RESULT_COLOR && + (loc < FRAG_RESULT_DATA0 || + loc >= FRAG_RESULT_DATA0 + V3D_MAX_DRAW_BUFFERS)) { + continue; + } + + /* Logic operations do not apply on floating point or + * sRGB enabled render targets. + */ + const int rt = driver_loc; + assert(rt < V3D_MAX_DRAW_BUFFERS); + + const enum pipe_format format = + c->fs_key->color_fmt[rt].format; + if (util_format_is_float(format) || + util_format_is_srgb(format)) { + continue; + } + + nir_function_impl *impl = + nir_cf_node_get_function(&block->cf_node); + nir_builder b; + nir_builder_init(&b, impl); + b.cursor = nir_before_instr(&intr->instr); + v3d_nir_lower_logic_op_instr(c, &b, intr, rt); + } + } + + return true; +} + +void +v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c) +{ + /* Nothing to do if logic op is 'copy src to dst' or if logic ops are + * disabled (we set the logic op to copy in that case). 
+ */ + if (c->fs_key->logicop_func == PIPE_LOGICOP_COPY) + return; + + nir_foreach_function(function, s) { + if (function->impl) { + nir_foreach_block(block, function->impl) + v3d_nir_lower_logic_ops_block(block, c); + + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } + } +} diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c new file mode 100644 index 000000000..d23b8be83 --- /dev/null +++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_scratch.c @@ -0,0 +1,153 @@ +/* + * Copyright © 2018 Intel Corporation + * Copyright © 2018 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_compiler.h" +#include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_format_convert.h" + +/** @file v3d_nir_lower_scratch.c + * + * Swizzles around the addresses of + * nir_intrinsic_load_scratch/nir_intrinsic_store_scratch so that a QPU stores + * a cacheline at a time per dword of scratch access, scalarizing and removing + * writemasks in the process. + */ + +static nir_ssa_def * +v3d_nir_scratch_offset(nir_builder *b, nir_intrinsic_instr *instr) +{ + bool is_store = instr->intrinsic == nir_intrinsic_store_scratch; + nir_ssa_def *offset = nir_ssa_for_src(b, instr->src[is_store ? 1 : 0], 1); + + assert(nir_intrinsic_align_mul(instr) >= 4); + assert(nir_intrinsic_align_offset(instr) == 0); + + /* The spill_offset register will already have the subgroup ID (EIDX) + * shifted and ORed in at bit 2, so all we need to do is to move the + * dword index up above V3D_CHANNELS. 
+ */ + return nir_imul_imm(b, offset, V3D_CHANNELS); +} + +static void +v3d_nir_lower_load_scratch(nir_builder *b, nir_intrinsic_instr *instr) +{ + b->cursor = nir_before_instr(&instr->instr); + + nir_ssa_def *offset = v3d_nir_scratch_offset(b,instr); + + nir_ssa_def *chans[NIR_MAX_VEC_COMPONENTS]; + for (int i = 0; i < instr->num_components; i++) { + nir_ssa_def *chan_offset = + nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); + + nir_intrinsic_instr *chan_instr = + nir_intrinsic_instr_create(b->shader, instr->intrinsic); + chan_instr->num_components = 1; + nir_ssa_dest_init(&chan_instr->instr, &chan_instr->dest, 1, + instr->dest.ssa.bit_size, NULL); + + chan_instr->src[0] = nir_src_for_ssa(chan_offset); + + nir_intrinsic_set_align(chan_instr, 4, 0); + + nir_builder_instr_insert(b, &chan_instr->instr); + + chans[i] = &chan_instr->dest.ssa; + } + + nir_ssa_def *result = nir_vec(b, chans, instr->num_components); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(result)); + nir_instr_remove(&instr->instr); +} + +static void +v3d_nir_lower_store_scratch(nir_builder *b, nir_intrinsic_instr *instr) +{ + b->cursor = nir_before_instr(&instr->instr); + + nir_ssa_def *offset = v3d_nir_scratch_offset(b, instr); + nir_ssa_def *value = nir_ssa_for_src(b, instr->src[0], + instr->num_components); + + for (int i = 0; i < instr->num_components; i++) { + if (!(nir_intrinsic_write_mask(instr) & (1 << i))) + continue; + + nir_ssa_def *chan_offset = + nir_iadd_imm(b, offset, V3D_CHANNELS * i * 4); + + nir_intrinsic_instr *chan_instr = + nir_intrinsic_instr_create(b->shader, instr->intrinsic); + chan_instr->num_components = 1; + + chan_instr->src[0] = nir_src_for_ssa(nir_channel(b, + value, + i)); + chan_instr->src[1] = nir_src_for_ssa(chan_offset); + nir_intrinsic_set_write_mask(chan_instr, 0x1); + nir_intrinsic_set_align(chan_instr, 4, 0); + + nir_builder_instr_insert(b, &chan_instr->instr); + } + + nir_instr_remove(&instr->instr); +} + +void +v3d_nir_lower_scratch(nir_shader *s) +{ + nir_foreach_function(function, s) { + if (!function->impl) + continue; + + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block(block, function->impl) { + nir_foreach_instr_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = + nir_instr_as_intrinsic(instr); + + switch (intr->intrinsic) { + case nir_intrinsic_load_scratch: + v3d_nir_lower_load_scratch(&b, intr); + break; + case nir_intrinsic_store_scratch: + v3d_nir_lower_store_scratch(&b, intr); + break; + default: + break; + } + } + } + + nir_metadata_preserve(function->impl, + nir_metadata_block_index | + nir_metadata_dominance); + } +} diff --git a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c index 68591529d..d79969374 100644 --- a/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c +++ b/lib/mesa/src/broadcom/compiler/v3d_nir_lower_txf_ms.c @@ -34,12 +34,10 @@ #define V3D_MAX_SAMPLES 4 -static void -vc4_nir_lower_txf_ms_instr(struct v3d_compile *c, nir_builder *b, - nir_tex_instr *instr) +static nir_ssa_def * +v3d_nir_lower_txf_ms_instr(nir_builder *b, nir_instr *in_instr, void *data) { - if (instr->op != nir_texop_txf_ms) - return; + nir_tex_instr *instr = nir_instr_as_tex(in_instr); b->cursor = nir_before_instr(&instr->instr); @@ -66,30 +64,22 @@ vc4_nir_lower_txf_ms_instr(struct v3d_compile *c, nir_builder *b, nir_tex_instr_remove_src(instr, sample_index); instr->op = nir_texop_txf; instr->sampler_dim = 
GLSL_SAMPLER_DIM_2D; + + return NIR_LOWER_INSTR_PROGRESS; +} + +static bool +v3d_nir_lower_txf_ms_filter(const nir_instr *instr, const void *data) +{ + return (instr->type == nir_instr_type_tex && + nir_instr_as_tex(instr)->op == nir_texop_txf_ms); } void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c) { - nir_foreach_function(function, s) { - if (!function->impl) - continue; - - nir_builder b; - nir_builder_init(&b, function->impl); - - nir_foreach_block(block, function->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_tex) - continue; - - vc4_nir_lower_txf_ms_instr(c, &b, - nir_instr_as_tex(instr)); - } - } - - nir_metadata_preserve(function->impl, - nir_metadata_block_index | - nir_metadata_dominance); - } + nir_shader_lower_instructions(s, + v3d_nir_lower_txf_ms_filter, + v3d_nir_lower_txf_ms_instr, + NULL); } diff --git a/lib/mesa/src/broadcom/compiler/vir.c b/lib/mesa/src/broadcom/compiler/vir.c index 20f700414..78362a294 100644 --- a/lib/mesa/src/broadcom/compiler/vir.c +++ b/lib/mesa/src/broadcom/compiler/vir.c @@ -25,7 +25,7 @@ #include "v3d_compiler.h" int -vir_get_non_sideband_nsrc(struct qinst *inst) +vir_get_nsrc(struct qinst *inst) { switch (inst->qpu.type) { case V3D_QPU_INSTR_TYPE_BRANCH: @@ -40,55 +40,6 @@ vir_get_non_sideband_nsrc(struct qinst *inst) return 0; } -int -vir_get_nsrc(struct qinst *inst) -{ - int nsrc = vir_get_non_sideband_nsrc(inst); - - if (vir_has_implicit_uniform(inst)) - nsrc++; - - return nsrc; -} - -bool -vir_has_implicit_uniform(struct qinst *inst) -{ - switch (inst->qpu.type) { - case V3D_QPU_INSTR_TYPE_BRANCH: - return true; - case V3D_QPU_INSTR_TYPE_ALU: - switch (inst->dst.file) { - case QFILE_TLBU: - return true; - case QFILE_MAGIC: - switch (inst->dst.index) { - case V3D_QPU_WADDR_TLBU: - case V3D_QPU_WADDR_TMUAU: - case V3D_QPU_WADDR_SYNCU: - return true; - default: - break; - } - break; - default: - return inst->has_implicit_uniform; - } - } - return false; -} - -/* The sideband uniform for textures gets stored after the normal ALU - * arguments. - */ -int -vir_get_implicit_uniform_src(struct qinst *inst) -{ - if (!vir_has_implicit_uniform(inst)) - return -1; - return vir_get_nsrc(inst) - 1; -} - /** * Returns whether the instruction has any side effects that must be * preserved. 
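[Editor's note: with QFILE_UNIF and the implicit-uniform helpers removed in the vir.c hunks above, a uniform read is now recorded as an index in inst->uniform, resolved through c->uniform_contents/c->uniform_data. A minimal hedged sketch of how a pass can inspect it, using vir_has_uniform() from v3d_compiler.h; the function name and output format here are illustrative, and <stdio.h> is assumed:]

static void
example_print_inst_uniform(struct v3d_compile *c, struct qinst *inst)
{
        /* vir_has_uniform() checks for the ~0 "no uniform" sentinel. */
        if (!vir_has_uniform(inst))
                return;

        enum quniform_contents contents = c->uniform_contents[inst->uniform];
        uint32_t data = c->uniform_data[inst->uniform];

        fprintf(stderr, "uniform %d: contents %d, data 0x%08x\n",
                inst->uniform, (int)contents, data);
}

[This is the same lookup that vir_dump_inst() performs in the vir_dump.c hunk further down.]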
@@ -124,6 +75,8 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) if (inst->qpu.sig.ldtmu || inst->qpu.sig.ldvary || + inst->qpu.sig.ldtlbu || + inst->qpu.sig.ldtlb || inst->qpu.sig.wrtmuc || inst->qpu.sig.thrsw) { return true; @@ -133,38 +86,6 @@ vir_has_side_effects(struct v3d_compile *c, struct qinst *inst) } bool -vir_is_float_input(struct qinst *inst) -{ - /* XXX: More instrs */ - switch (inst->qpu.type) { - case V3D_QPU_INSTR_TYPE_BRANCH: - return false; - case V3D_QPU_INSTR_TYPE_ALU: - switch (inst->qpu.alu.add.op) { - case V3D_QPU_A_FADD: - case V3D_QPU_A_FSUB: - case V3D_QPU_A_FMIN: - case V3D_QPU_A_FMAX: - case V3D_QPU_A_FTOIN: - return true; - default: - break; - } - - switch (inst->qpu.alu.mul.op) { - case V3D_QPU_M_FMOV: - case V3D_QPU_M_VFMUL: - case V3D_QPU_M_FMUL: - return true; - default: - break; - } - } - - return false; -} - -bool vir_is_raw_mov(struct qinst *inst) { if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || @@ -178,6 +99,13 @@ vir_is_raw_mov(struct qinst *inst) return false; } + if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || + inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { + return false; + } + if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || inst->qpu.flags.mc != V3D_QPU_COND_NONE) return false; @@ -421,7 +349,7 @@ vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct q } struct qinst * -vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) +vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { struct qinst *inst = calloc(1, sizeof(*inst)); @@ -433,9 +361,8 @@ vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src) inst->qpu.branch.ub = true; inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL; - inst->dst = vir_reg(QFILE_NULL, 0); - inst->src[0] = src; - inst->uniform = ~0; + inst->dst = vir_nop_reg(); + inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); return inst; } @@ -591,7 +518,6 @@ vir_compile_init(const struct v3d_compiler *compiler, vir_set_emit_block(c, vir_new_block(c)); c->output_position_index = -1; - c->output_point_size_index = -1; c->output_sample_mask_index = -1; c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, @@ -601,7 +527,7 @@ vir_compile_init(const struct v3d_compiler *compiler, } static int -type_size_vec4(const struct glsl_type *type) +type_size_vec4(const struct glsl_type *type, bool bindless) { return glsl_count_attribute_slots(type, false); } @@ -638,8 +564,29 @@ v3d_lower_nir(struct v3d_compile *c) } } + /* CS textures may not have return_size reflecting the shadow state. 
*/ + nir_foreach_variable(var, &c->s->uniforms) { + const struct glsl_type *type = glsl_without_array(var->type); + unsigned array_len = MAX2(glsl_get_length(var->type), 1); + + if (!glsl_type_is_sampler(type) || + !glsl_sampler_type_is_shadow(type)) + continue; + + for (int i = 0; i < array_len; i++) { + tex_options.lower_tex_packing[var->data.binding + i] = + nir_lower_tex_packing_16; + } + } + NIR_PASS_V(c->s, nir_lower_tex, &tex_options); NIR_PASS_V(c->s, nir_lower_system_values); + + NIR_PASS_V(c->s, nir_lower_vars_to_scratch, + nir_var_function_temp, + 0, + glsl_get_natural_size_align_bytes); + NIR_PASS_V(c->s, v3d_nir_lower_scratch); } static void @@ -658,47 +605,10 @@ v3d_set_prog_data_uniforms(struct v3d_compile *c, count * sizeof(*ulist->contents)); } -/* Copy the compiler UBO range state to the compiled shader, dropping out - * arrays that were never referenced by an indirect load. - * - * (Note that QIR dead code elimination of an array access still leaves that - * array alive, though) - */ -static void -v3d_set_prog_data_ubo(struct v3d_compile *c, - struct v3d_prog_data *prog_data) -{ - if (!c->num_ubo_ranges) - return; - - prog_data->num_ubo_ranges = 0; - prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range, - c->num_ubo_ranges); - for (int i = 0; i < c->num_ubo_ranges; i++) { - if (!c->ubo_range_used[i]) - continue; - - struct v3d_ubo_range *range = &c->ubo_ranges[i]; - prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range; - prog_data->ubo_size += range->size; - } - - if (prog_data->ubo_size) { - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - prog_data->ubo_size / 4); - } - } -} - static void v3d_vs_set_prog_data(struct v3d_compile *c, struct v3d_vs_prog_data *prog_data) { - prog_data->base.num_inputs = c->num_inputs; - /* The vertex data gets format converted by the VPM so that * each attribute channel takes up a VPM column. Precompute * the sizes for the shader record. @@ -722,7 +632,7 @@ v3d_vs_set_prog_data(struct v3d_compile *c, * channel). */ prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; - prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8; + prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; /* Set us up for shared input/output segments. This is apparently * necessary for our VCM setup to avoid varying corruption. @@ -741,7 +651,7 @@ v3d_vs_set_prog_data(struct v3d_compile *c, * batches. 
*/ assert(c->devinfo->vpm_size); - int sector_size = 16 * sizeof(uint32_t) * 8; + int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size; int half_vpm = vpm_size_in_sectors / 2; int vpm_output_sectors = half_vpm - prog_data->vpm_input_size; @@ -754,7 +664,7 @@ static void v3d_set_fs_prog_data_inputs(struct v3d_compile *c, struct v3d_fs_prog_data *prog_data) { - prog_data->base.num_inputs = c->num_inputs; + prog_data->num_inputs = c->num_inputs; memcpy(prog_data->input_slots, c->input_slots, c->num_inputs * sizeof(*c->input_slots)); @@ -780,6 +690,17 @@ v3d_fs_set_prog_data(struct v3d_compile *c, prog_data->writes_z = c->writes_z; prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; + prog_data->uses_implicit_point_line_varyings = + c->uses_implicit_point_line_varyings; + prog_data->lock_scoreboard_on_first_thrsw = + c->lock_scoreboard_on_first_thrsw; +} + +static void +v3d_cs_set_prog_data(struct v3d_compile *c, + struct v3d_compute_prog_data *prog_data) +{ + prog_data->shared_size = c->s->info.cs.shared_size; } static void @@ -791,9 +712,10 @@ v3d_set_prog_data(struct v3d_compile *c, prog_data->spill_size = c->spill_size; v3d_set_prog_data_uniforms(c, prog_data); - v3d_set_prog_data_ubo(c, prog_data); - if (c->s->info.stage == MESA_SHADER_VERTEX) { + if (c->s->info.stage == MESA_SHADER_COMPUTE) { + v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data); + } else if (c->s->info.stage == MESA_SHADER_VERTEX) { v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data); } else { assert(c->s->info.stage == MESA_SHADER_FRAGMENT); @@ -836,9 +758,16 @@ v3d_nir_lower_vs_early(struct v3d_compile *c) NIR_PASS_V(c->s, nir_lower_global_vars_to_local); v3d_optimize_nir(c->s); NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in); + + /* This must go before nir_lower_io */ + if (c->vs_key->per_vertex_point_size) + NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f); + NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, type_size_vec4, (nir_lower_io_options)0); + /* clean up nir_lower_io's deref_var remains */ + NIR_PASS_V(c->s, nir_opt_dce); } static void @@ -877,6 +806,8 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); + NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c); + /* If the shader has no non-TLB side effects, we can promote it to * enabling early_fragment_tests even if the user didn't. 
*/ @@ -928,6 +859,33 @@ v3d_nir_lower_fs_late(struct v3d_compile *c) NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); } +static uint32_t +vir_get_max_temps(struct v3d_compile *c) +{ + int max_ip = 0; + vir_for_each_inst_inorder(inst, c) + max_ip++; + + uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip); + + for (int t = 0; t < c->num_temps; t++) { + for (int i = c->temp_start[t]; (i < c->temp_end[t] && + i < max_ip); i++) { + if (i > max_ip) + break; + pressure[i]++; + } + } + + uint32_t max_temps = 0; + for (int i = 0; i < max_ip; i++) + max_temps = MAX2(max_temps, pressure[i]); + + ralloc_free(pressure); + + return max_temps; +} + uint64_t *v3d_compile(const struct v3d_compiler *compiler, struct v3d_key *key, struct v3d_prog_data **out_prog_data, @@ -952,13 +910,17 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, c->fs_key = (struct v3d_fs_key *)key; prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data)); break; + case MESA_SHADER_COMPUTE: + prog_data = rzalloc_size(NULL, + sizeof(struct v3d_compute_prog_data)); + break; default: unreachable("unsupported shader stage"); } if (c->s->info.stage == MESA_SHADER_VERTEX) { v3d_nir_lower_vs_early(c); - } else { + } else if (c->s->info.stage != MESA_SHADER_COMPUTE) { assert(c->s->info.stage == MESA_SHADER_FRAGMENT); v3d_nir_lower_fs_early(c); } @@ -967,7 +929,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, if (c->s->info.stage == MESA_SHADER_VERTEX) { v3d_nir_lower_vs_late(c); - } else { + } else if (c->s->info.stage != MESA_SHADER_COMPUTE) { assert(c->s->info.stage == MESA_SHADER_FRAGMENT); v3d_nir_lower_fs_late(c); } @@ -990,15 +952,22 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, char *shaderdb; int ret = asprintf(&shaderdb, "%s shader: %d inst, %d threads, %d loops, " - "%d uniforms, %d:%d spills:fills", + "%d uniforms, %d max-temps, %d:%d spills:fills, " + "%d sfu-stalls, %d inst-and-stalls", vir_get_stage_name(c), c->qpu_inst_count, c->threads, c->loops, c->num_uniforms, + vir_get_max_temps(c), c->spills, - c->fills); + c->fills, + c->qpu_inst_stalled_count, + c->qpu_inst_count + c->qpu_inst_stalled_count); if (ret >= 0) { + if (V3D_DEBUG & V3D_DEBUG_SHADERDB) + fprintf(stderr, "SHADER-DB: %s\n", shaderdb); + c->debug_output(shaderdb, c->debug_output_data); free(shaderdb); } @@ -1059,15 +1028,15 @@ vir_compile_destroy(struct v3d_compile *c) ralloc_free(c); } -struct qreg -vir_uniform(struct v3d_compile *c, - enum quniform_contents contents, - uint32_t data) +uint32_t +vir_get_uniform_index(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data) { for (int i = 0; i < c->num_uniforms; i++) { if (c->uniform_contents[i] == contents && c->uniform_data[i] == data) { - return vir_reg(QFILE_UNIF, i); + return i; } } @@ -1088,52 +1057,20 @@ vir_uniform(struct v3d_compile *c, c->uniform_contents[uniform] = contents; c->uniform_data[uniform] = data; - return vir_reg(QFILE_UNIF, uniform); -} - -static bool -vir_can_set_flags(struct v3d_compile *c, struct qinst *inst) -{ - if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) || - v3d_qpu_uses_sfu(&inst->qpu))) { - return false; - } - - if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || - (inst->qpu.alu.add.op == V3D_QPU_A_NOP && - inst->qpu.alu.mul.op == V3D_QPU_M_NOP)) { - return false; - } - - return true; + return uniform; } -void -vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf) +struct qreg +vir_uniform(struct v3d_compile *c, + enum quniform_contents contents, + uint32_t data) { - 
struct qinst *last_inst = NULL; - - if (!list_empty(&c->cur_block->instructions)) { - last_inst = (struct qinst *)c->cur_block->instructions.prev; - - /* Can't stuff the PF into the last last inst if our cursor - * isn't pointing after it. - */ - struct vir_cursor after_inst = vir_after_inst(last_inst); - if (c->cursor.mode != after_inst.mode || - c->cursor.link != after_inst.link) - last_inst = NULL; - } - - if (src.file != QFILE_TEMP || - !c->defs[src.index] || - last_inst != c->defs[src.index] || - !vir_can_set_flags(c, last_inst)) { - /* XXX: Make the MOV be the appropriate type */ - last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src); - } - - vir_set_pf(last_inst, pf); + struct qinst *inst = vir_NOP(c); + inst->qpu.sig.ldunif = true; + inst->uniform = vir_get_uniform_index(c, contents, data); + inst->dst = vir_get_temp(c); + c->defs[inst->dst.index] = inst; + return inst->dst; } #define OPTPASS(func) \ @@ -1160,6 +1097,7 @@ vir_optimize(struct v3d_compile *c) bool progress = false; OPTPASS(vir_opt_copy_propagate); + OPTPASS(vir_opt_redundant_flags); OPTPASS(vir_opt_dead_code); OPTPASS(vir_opt_small_immediates); diff --git a/lib/mesa/src/broadcom/compiler/vir_dump.c b/lib/mesa/src/broadcom/compiler/vir_dump.c index ecf6f3e1f..9e1ef1e9d 100644 --- a/lib/mesa/src/broadcom/compiler/vir_dump.c +++ b/lib/mesa/src/broadcom/compiler/vir_dump.c @@ -30,6 +30,7 @@ vir_dump_uniform(enum quniform_contents contents, uint32_t data) { static const char *quniform_names[] = { + [QUNIFORM_ALPHA_REF] = "alpha_ref", [QUNIFORM_VIEWPORT_X_SCALE] = "vp_x_scale", [QUNIFORM_VIEWPORT_Y_SCALE] = "vp_y_scale", [QUNIFORM_VIEWPORT_Z_OFFSET] = "vp_z_offset", @@ -52,20 +53,20 @@ vir_dump_uniform(enum quniform_contents contents, case QUNIFORM_TMU_CONFIG_P0: fprintf(stderr, "tex[%d].p0 | 0x%x", - v3d_tmu_config_data_get_unit(data), - v3d_tmu_config_data_get_value(data)); + v3d_unit_data_get_unit(data), + v3d_unit_data_get_offset(data)); break; case QUNIFORM_TMU_CONFIG_P1: fprintf(stderr, "tex[%d].p1 | 0x%x", - v3d_tmu_config_data_get_unit(data), - v3d_tmu_config_data_get_value(data)); + v3d_unit_data_get_unit(data), + v3d_unit_data_get_offset(data)); break; case QUNIFORM_IMAGE_TMU_CONFIG_P0: fprintf(stderr, "img[%d].p0 | 0x%x", - v3d_tmu_config_data_get_unit(data), - v3d_tmu_config_data_get_value(data)); + v3d_unit_data_get_unit(data), + v3d_unit_data_get_offset(data)); break; case QUNIFORM_TEXTURE_WIDTH: @@ -97,8 +98,18 @@ vir_dump_uniform(enum quniform_contents contents, fprintf(stderr, "img[%d].array_size", data); break; + case QUNIFORM_SPILL_OFFSET: + fprintf(stderr, "spill_offset"); + break; + + case QUNIFORM_SPILL_SIZE_PER_THREAD: + fprintf(stderr, "spill_size_per_thread"); + break; + case QUNIFORM_UBO_ADDR: - fprintf(stderr, "ubo[%d]", data); + fprintf(stderr, "ubo[%d]+0x%x", + v3d_unit_data_get_unit(data), + v3d_unit_data_get_offset(data)); break; case QUNIFORM_SSBO_OFFSET: @@ -118,7 +129,8 @@ vir_dump_uniform(enum quniform_contents contents, fprintf(stderr, "tex[%d].p0: 0x%08x", contents - QUNIFORM_TEXTURE_CONFIG_P0_0, data); - } else if (contents < ARRAY_SIZE(quniform_names)) { + } else if (contents < ARRAY_SIZE(quniform_names) && + quniform_names[contents]) { fprintf(stderr, "%s", quniform_names[contents]); } else { @@ -131,13 +143,6 @@ static void vir_print_reg(struct v3d_compile *c, const struct qinst *inst, struct qreg reg) { - static const char *files[] = { - [QFILE_TEMP] = "t", - [QFILE_UNIF] = "u", - [QFILE_TLB] = "tlb", - [QFILE_TLBU] = "tlbu", - }; - switch (reg.file) { case QFILE_NULL: @@ 
-176,21 +181,8 @@ vir_print_reg(struct v3d_compile *c, const struct qinst *inst, reg.index / 4, reg.index % 4); break; - case QFILE_TLB: - case QFILE_TLBU: - fprintf(stderr, "%s", files[reg.file]); - break; - - case QFILE_UNIF: - fprintf(stderr, "%s%d", files[reg.file], reg.index); - fprintf(stderr, " ("); - vir_dump_uniform(c->uniform_contents[reg.index], - c->uniform_data[reg.index]); - fprintf(stderr, ")"); - break; - - default: - fprintf(stderr, "%s%d", files[reg.file], reg.index); + case QFILE_TEMP: + fprintf(stderr, "t%d", reg.index); break; } } @@ -258,8 +250,7 @@ static void vir_dump_alu(struct v3d_compile *c, struct qinst *inst) { struct v3d_qpu_instr *instr = &inst->qpu; - int nsrc = vir_get_non_sideband_nsrc(inst); - int sideband_nsrc = vir_get_nsrc(inst); + int nsrc = vir_get_nsrc(inst); enum v3d_qpu_input_unpack unpack[2]; if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) { @@ -288,11 +279,10 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst) unpack[1] = instr->alu.mul.b_unpack; } - for (int i = 0; i < sideband_nsrc; i++) { + for (int i = 0; i < nsrc; i++) { fprintf(stderr, ", "); vir_print_reg(c, inst, inst->src[i]); - if (i < nsrc) - fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i])); + fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i])); } vir_dump_sig(c, inst); @@ -353,25 +343,34 @@ vir_dump_inst(struct v3d_compile *c, struct qinst *inst) break; } } - - if (vir_has_implicit_uniform(inst)) { - fprintf(stderr, " "); - vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]); - } - break; } + + if (vir_has_uniform(inst)) { + fprintf(stderr, " ("); + vir_dump_uniform(c->uniform_contents[inst->uniform], + c->uniform_data[inst->uniform]); + fprintf(stderr, ")"); + } } void vir_dump(struct v3d_compile *c) { int ip = 0; + int pressure = 0; vir_for_each_block(block, c) { fprintf(stderr, "BLOCK %d:\n", block->index); vir_for_each_inst(inst, block) { if (c->live_intervals_valid) { + for (int i = 0; i < c->num_temps; i++) { + if (c->temp_start[i] == ip) + pressure++; + } + + fprintf(stderr, "P%4d ", pressure); + bool first = true; for (int i = 0; i < c->num_temps; i++) { @@ -383,7 +382,10 @@ vir_dump(struct v3d_compile *c) } else { fprintf(stderr, ", "); } - fprintf(stderr, "S%4d", i); + if (BITSET_TEST(c->spillable, i)) + fprintf(stderr, "S%4d", i); + else + fprintf(stderr, "U%4d", i); } if (first) @@ -405,6 +407,7 @@ vir_dump(struct v3d_compile *c) fprintf(stderr, ", "); } fprintf(stderr, "E%4d", i); + pressure--; } if (first) diff --git a/lib/mesa/src/broadcom/compiler/vir_live_variables.c b/lib/mesa/src/broadcom/compiler/vir_live_variables.c index 2879e23b4..d3ca02f18 100644 --- a/lib/mesa/src/broadcom/compiler/vir_live_variables.c +++ b/lib/mesa/src/broadcom/compiler/vir_live_variables.c @@ -109,24 +109,18 @@ vir_setup_def(struct v3d_compile *c, struct qblock *block, int ip, c->temp_start[var] = MIN2(c->temp_start[var], ip); c->temp_end[var] = MAX2(c->temp_end[var], ip); - /* If we've already tracked this as a def, or already used it within - * the block, there's nothing to do. + /* Mark the block as having a (partial) def of the var. */ + BITSET_SET(block->defout, var); + + /* If we've already tracked this as a def that screens off previous + * uses, or already used it within the block, there's nothing to do. */ if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var)) return; - /* Easy, common case: unconditional full register update. - * - * We treat conditioning on the exec mask as the same as not being - * conditional. 
This makes sure that if the register gets set on - * either side of an if, it is treated as being screened off before - * the if. Otherwise, if there was no intervening def, its live - * interval doesn't extend back to the start of he program, and if too - * many registers did that we'd fail to register allocate. - */ - if (((inst->qpu.flags.ac == V3D_QPU_COND_NONE && - inst->qpu.flags.mc == V3D_QPU_COND_NONE) || - inst->cond_is_exec_mask) && + /* Easy, common case: unconditional full register update.*/ + if ((inst->qpu.flags.ac == V3D_QPU_COND_NONE && + inst->qpu.flags.mc == V3D_QPU_COND_NONE) && inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE && inst->qpu.alu.mul.output_pack == V3D_QPU_PACK_NONE) { BITSET_SET(block->def, var); @@ -278,6 +272,33 @@ vir_live_variables_dataflow(struct v3d_compile *c, int bitset_words) return cont; } +static bool +vir_live_variables_defin_defout_dataflow(struct v3d_compile *c, int bitset_words) +{ + bool cont = false; + + vir_for_each_block_rev(block, c) { + /* Propagate defin/defout down the successors to produce the + * union of blocks with a reachable (partial) definition of + * the var. + * + * This keeps a conditional first write to a reg from + * extending its lifetime back to the start of the program. + */ + vir_for_each_successor(succ, block) { + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_def = (block->defout[i] & + ~succ->defin[i]); + succ->defin[i] |= new_def; + succ->defout[i] |= new_def; + cont |= new_def; + } + } + } + + return cont; +} + /** * Extend the start/end ranges for each variable to account for the * new information calculated from control flow. @@ -287,14 +308,16 @@ vir_compute_start_end(struct v3d_compile *c, int num_vars) { vir_for_each_block(block, c) { for (int i = 0; i < num_vars; i++) { - if (BITSET_TEST(block->live_in, i)) { + if (BITSET_TEST(block->live_in, i) && + BITSET_TEST(block->defin, i)) { c->temp_start[i] = MIN2(c->temp_start[i], block->start_ip); c->temp_end[i] = MAX2(c->temp_end[i], block->start_ip); } - if (BITSET_TEST(block->live_out, i)) { + if (BITSET_TEST(block->live_out, i) && + BITSET_TEST(block->defout, i)) { c->temp_start[i] = MIN2(c->temp_start[i], block->end_ip); c->temp_end[i] = MAX2(c->temp_end[i], @@ -334,6 +357,8 @@ vir_calculate_live_intervals(struct v3d_compile *c) vir_for_each_block(block, c) { block->def = rzalloc_array(c, BITSET_WORD, bitset_words); + block->defin = rzalloc_array(c, BITSET_WORD, bitset_words); + block->defout = rzalloc_array(c, BITSET_WORD, bitset_words); block->use = rzalloc_array(c, BITSET_WORD, bitset_words); block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words); block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words); @@ -344,6 +369,9 @@ vir_calculate_live_intervals(struct v3d_compile *c) while (vir_live_variables_dataflow(c, bitset_words)) ; + while (vir_live_variables_defin_defout_dataflow(c, bitset_words)) + ; + vir_compute_start_end(c, c->num_temps); c->live_intervals_valid = true; diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c b/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c index 2a22a1b55..c5bb61121 100644 --- a/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c +++ b/lib/mesa/src/broadcom/compiler/vir_opt_copy_propagate.c @@ -49,10 +49,8 @@ is_copy_mov(struct qinst *inst) if (inst->dst.file != QFILE_TEMP) return false; - if (inst->src[0].file != QFILE_TEMP && - inst->src[0].file != QFILE_UNIF) { + if (inst->src[0].file != QFILE_TEMP) return false; - } if (inst->qpu.alu.add.output_pack != 
V3D_QPU_PACK_NONE || inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { @@ -151,13 +149,36 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs) * would be the same between the two * instructions. */ - if (vir_is_float_input(inst) != - vir_is_float_input(mov)) { + if (v3d_qpu_unpacks_f32(&inst->qpu) != + v3d_qpu_unpacks_f32(&mov->qpu) || + v3d_qpu_unpacks_f16(&inst->qpu) != + v3d_qpu_unpacks_f16(&mov->qpu)) { continue; } + /* No composing the unpacks. */ if (vir_has_unpack(inst, i)) - continue; + continue; + + /* these ops can't represent abs. */ + if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) { + switch (inst->qpu.alu.add.op) { + case V3D_QPU_A_VFPACK: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + continue; + default: + break; + } + } } if (debug) { diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c index a486708bf..6048ccfcc 100644 --- a/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c +++ b/lib/mesa/src/broadcom/compiler/vir_opt_dead_code.c @@ -55,28 +55,8 @@ static bool has_nonremovable_reads(struct v3d_compile *c, struct qinst *inst) { for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file == QFILE_VPM) { - /* Instance ID, Vertex ID: Should have been removed at - * the NIR level - */ - if (inst->src[i].index == ~0) - return true; - - uint32_t attr = inst->src[i].index / 4; - uint32_t offset = inst->src[i].index % 4; - - if (c->vattr_sizes[attr] != offset) - return true; - - /* Can't get rid of the last VPM read, or the - * simulator (at least) throws an error. - */ - uint32_t total_size = 0; - for (uint32_t i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) - total_size += c->vattr_sizes[i]; - if (total_size == 1) - return true; - } + if (inst->src[i].file == QFILE_VPM) + return true; } return false; @@ -187,18 +167,6 @@ vir_opt_dead_code(struct v3d_compile *c) continue; } - for (int i = 0; i < vir_get_nsrc(inst); i++) { - if (inst->src[i].file != QFILE_VPM) - continue; - uint32_t attr = inst->src[i].index / 4; - uint32_t offset = (inst->src[i].index % 4); - - if (c->vattr_sizes[attr] == offset) { - c->num_inputs--; - c->vattr_sizes[attr]--; - } - } - assert(inst != last_flags_write); dce(c, inst); progress = true; diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c new file mode 100644 index 000000000..8749f3cd6 --- /dev/null +++ b/lib/mesa/src/broadcom/compiler/vir_opt_redundant_flags.c @@ -0,0 +1,143 @@ +/* + * Copyright © 2019 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file v3d_opt_redundant_flags.c + * + * This eliminates the APF/MPF flags for redundant flags updates. These are + * often produced by our channel masking in nonuniform control flow. + */ + +#include "v3d_compiler.h" + +static bool debug; + +static void +vir_dce_pf(struct v3d_compile *c, struct qinst *inst) +{ + if (debug) { + fprintf(stderr, + "Removing flags write from: "); + vir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + + assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); + + inst->qpu.flags.apf = V3D_QPU_PF_NONE; + inst->qpu.flags.mpf = V3D_QPU_PF_NONE; +} + +static bool +vir_sources_modified(struct qinst *srcs, struct qinst *write) +{ + for (int i = 0; i < vir_get_nsrc(srcs); i++) { + if (write->dst.file == QFILE_TEMP && + srcs->src[i].file == QFILE_TEMP && + srcs->src[i].index == write->dst.index) { + return true; + } + + /* assume magic regs may be modified by basically anything. */ + if (srcs->src[i].file != QFILE_TEMP && + srcs->src[i].file != QFILE_SMALL_IMM) + return true; + } + + return false; +} + +static bool +vir_instr_flags_op_equal(struct qinst *a, struct qinst *b) +{ + for (int i = 0; i < vir_get_nsrc(a); i++) { + if (a->src[i].file != b->src[i].file || + a->src[i].index != b->src[i].index) { + return false; + } + } + + if (a->qpu.flags.apf != b->qpu.flags.apf || + a->qpu.flags.mpf != b->qpu.flags.mpf || + a->qpu.alu.add.op != b->qpu.alu.add.op || + a->qpu.alu.mul.op != b->qpu.alu.mul.op || + a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack || + a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack || + a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack || + a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack || + a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack || + a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) { + return false; + } + + return true; +} + +static bool +vir_opt_redundant_flags_block(struct v3d_compile *c, struct qblock *block) +{ + struct qinst *last_flags = NULL; + bool progress = false; + + vir_for_each_inst(inst, block) { + if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || + inst->qpu.flags.auf != V3D_QPU_UF_NONE || + inst->qpu.flags.muf != V3D_QPU_UF_NONE) { + last_flags = NULL; + continue; + } + + /* Flags aren't preserved across a thrsw. 
*/ + if (inst->qpu.sig.thrsw) + last_flags = NULL; + + if (inst->qpu.flags.apf != V3D_QPU_PF_NONE || + inst->qpu.flags.mpf != V3D_QPU_PF_NONE) { + if (last_flags && + vir_instr_flags_op_equal(inst, last_flags)) { + vir_dce_pf(c, inst); + progress = true; + } else { + last_flags = inst; + } + } + + if (last_flags && vir_sources_modified(last_flags, inst)) { + last_flags = NULL; + } + } + + return progress; +} + +bool +vir_opt_redundant_flags(struct v3d_compile *c) +{ + bool progress = false; + + vir_for_each_block(block, c) { + progress = vir_opt_redundant_flags_block(c, block) || progress; + } + + return progress; +} diff --git a/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c b/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c index 5491f9c24..47d772296 100644 --- a/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c +++ b/lib/mesa/src/broadcom/compiler/vir_opt_small_immediates.c @@ -55,26 +55,22 @@ vir_opt_small_immediates(struct v3d_compile *c) continue; for (int i = 0; i < vir_get_nsrc(inst); i++) { - struct qreg src = vir_follow_movs(c, inst->src[i]); + if (inst->src[i].file != QFILE_TEMP) + continue; - if (src.file != QFILE_UNIF || - c->uniform_contents[src.index] != - QUNIFORM_CONSTANT) { + /* See if it's a uniform load. */ + struct qinst *src_def = c->defs[inst->src[i].index]; + if (!src_def || !src_def->qpu.sig.ldunif) continue; - } + int uniform = src_def->uniform; - if (vir_has_implicit_uniform(inst) && - i == vir_get_implicit_uniform_src(inst)) { - /* No turning the implicit uniform read into - * an immediate. - */ + if (c->uniform_contents[uniform] != QUNIFORM_CONSTANT) continue; - } /* Check if the uniform is suitable as a small * immediate. */ - uint32_t imm = c->uniform_data[src.index]; + uint32_t imm = c->uniform_data[uniform]; uint32_t packed; if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed)) continue; diff --git a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c index 79ab5acd7..7583acf15 100644 --- a/lib/mesa/src/broadcom/compiler/vir_register_allocate.c +++ b/lib/mesa/src/broadcom/compiler/vir_register_allocate.c @@ -29,28 +29,44 @@ #define QPU_R(i) { .magic = false, .index = i } #define ACC_INDEX 0 -#define ACC_COUNT 5 +#define ACC_COUNT 6 #define PHYS_INDEX (ACC_INDEX + ACC_COUNT) #define PHYS_COUNT 64 +static inline bool +qinst_writes_tmu(struct qinst *inst) +{ + return (inst->dst.file == QFILE_MAGIC && + v3d_qpu_magic_waddr_is_tmu(inst->dst.index)); +} + static bool is_last_ldtmu(struct qinst *inst, struct qblock *block) { - list_for_each_entry_from(struct qinst, scan_inst, inst, + list_for_each_entry_from(struct qinst, scan_inst, inst->link.next, &block->instructions, link) { - if (inst->qpu.sig.ldtmu) + if (scan_inst->qpu.sig.ldtmu) return false; - if (v3d_qpu_writes_tmu(&inst->qpu)) + if (qinst_writes_tmu(scan_inst)) return true; } return true; } +static bool +vir_is_mov_uniform(struct v3d_compile *c, int temp) +{ + struct qinst *def = c->defs[temp]; + + return def && def->qpu.sig.ldunif; +} + static int v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, uint32_t *temp_to_node) { + const float tmu_scale = 5; float block_scale = 1.0; float spill_costs[c->num_temps]; bool in_tmu_operation = false; @@ -75,22 +91,28 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, continue; int temp = inst->src[i].index; - if (no_spilling) { - BITSET_CLEAR(c->spillable, - temp); - } else { + if (vir_is_mov_uniform(c, temp)) { spill_costs[temp] += block_scale; 
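Once the interference graph fails to color, these accumulated costs pick the spill victim: the cheapest still-spillable temp wins, with rematerializable uniform moves costing only block_scale per use and TMU-backed temps weighted by tmu_scale. A toy version of that selection step, with invented temp counts and cost values:

    #include <float.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NUM_TEMPS 4

    /* Cheapest-spillable-temp selection, the way v3d_choose_spill_node()
     * scans spill_costs[] after the per-use accumulation above.
     */
    static int
    choose_spill_candidate(const float *costs, const bool *spillable)
    {
            int best = -1;
            float best_cost = FLT_MAX;

            for (int i = 0; i < NUM_TEMPS; i++) {
                    if (!spillable[i])
                            continue;
                    if (costs[i] < best_cost) {
                            best_cost = costs[i];
                            best = i;
                    }
            }

            return best;
    }

    int
    main(void)
    {
            const float tmu_scale = 5;

            /* Hypothetical shader: temp 2 is a rematerializable uniform
             * move (cost = uses), the others would spill through the TMU
             * (cost = uses * tmu_scale), and temp 3 is unspillable.
             */
            float costs[NUM_TEMPS] = {
                    3 * tmu_scale, 8 * tmu_scale, 3, 2 * tmu_scale
            };
            bool spillable[NUM_TEMPS] = { true, true, true, false };

            printf("spill temp %d\n", choose_spill_candidate(costs, spillable));
            return 0;    /* prints "spill temp 2" */
    }
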
+ } else if (!no_spilling) { + spill_costs[temp] += (block_scale * + tmu_scale); + } else { + BITSET_CLEAR(c->spillable, temp); } } if (inst->dst.file == QFILE_TEMP) { int temp = inst->dst.index; - if (no_spilling) { - BITSET_CLEAR(c->spillable, - temp); + if (vir_is_mov_uniform(c, temp)) { + /* We just rematerialize the uniform + * later. + */ + } else if (!no_spilling) { + spill_costs[temp] += (block_scale * + tmu_scale); } else { - spill_costs[temp] += block_scale; + BITSET_CLEAR(c->spillable, temp); } } @@ -123,7 +145,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) in_tmu_operation = false; - if (v3d_qpu_writes_tmu(&inst->qpu)) + if (qinst_writes_tmu(inst)) in_tmu_operation = true; } } @@ -141,7 +163,7 @@ v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g, /* The spill offset for this thread takes a bit of setup, so do it once at * program start. */ -static void +void v3d_setup_spill_base(struct v3d_compile *c) { c->cursor = vir_before_block(vir_entry_block(c)); @@ -170,6 +192,8 @@ v3d_setup_spill_base(struct v3d_compile *c) /* Make sure that we don't spill the spilling setup instructions. */ for (int i = start_num_temps; i < c->num_temps; i++) BITSET_CLEAR(c->spillable, i); + + c->cursor = vir_after_block(c->cur_block); } static void @@ -184,18 +208,30 @@ v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset) static void v3d_spill_reg(struct v3d_compile *c, int spill_temp) { - uint32_t spill_offset = c->spill_size; - c->spill_size += 16 * sizeof(uint32_t); + bool is_uniform = vir_is_mov_uniform(c, spill_temp); + + uint32_t spill_offset = 0; - if (spill_offset == 0) - v3d_setup_spill_base(c); + if (!is_uniform) { + uint32_t spill_offset = c->spill_size; + c->spill_size += V3D_CHANNELS * sizeof(uint32_t); + + if (spill_offset == 0) + v3d_setup_spill_base(c); + } struct qinst *last_thrsw = c->last_thrsw; assert(!last_thrsw || last_thrsw->is_last_thrsw); int start_num_temps = c->num_temps; - vir_for_each_inst_inorder(inst, c) { + int uniform_index = ~0; + if (is_uniform) { + struct qinst *orig_unif = c->defs[spill_temp]; + uniform_index = orig_unif->uniform; + } + + vir_for_each_inst_inorder_safe(inst, c) { for (int i = 0; i < vir_get_nsrc(inst); i++) { if (inst->src[i].file != QFILE_TEMP || inst->src[i].index != spill_temp) { @@ -204,23 +240,37 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) c->cursor = vir_before_inst(inst); - v3d_emit_spill_tmua(c, spill_offset); - vir_emit_thrsw(c); - inst->src[i] = vir_LDTMU(c); - c->fills++; + if (is_uniform) { + struct qreg unif = + vir_uniform(c, + c->uniform_contents[uniform_index], + c->uniform_data[uniform_index]); + inst->src[i] = unif; + } else { + v3d_emit_spill_tmua(c, spill_offset); + vir_emit_thrsw(c); + inst->src[i] = vir_LDTMU(c); + c->fills++; + } } if (inst->dst.file == QFILE_TEMP && inst->dst.index == spill_temp) { - c->cursor = vir_after_inst(inst); - - inst->dst.index = c->num_temps++; - vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD), - inst->dst); - v3d_emit_spill_tmua(c, spill_offset); - vir_emit_thrsw(c); - vir_TMUWT(c); - c->spills++; + if (is_uniform) { + c->cursor.link = NULL; + vir_remove_instruction(c, inst); + } else { + c->cursor = vir_after_inst(inst); + + inst->dst.index = c->num_temps++; + vir_MOV_dest(c, vir_reg(QFILE_MAGIC, + V3D_QPU_WADDR_TMUD), + inst->dst); + v3d_emit_spill_tmua(c, spill_offset); + vir_emit_thrsw(c); + vir_TMUWT(c); + c->spills++; + } } /* If we didn't have a last-thrsw inserted by nir_to_vir
and @@ -228,7 +278,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) * right before we start the vpm/tlb sequence for the last * thread segment. */ - if (!last_thrsw && c->last_thrsw && + if (!is_uniform && !last_thrsw && c->last_thrsw && (v3d_qpu_writes_vpm(&inst->qpu) || v3d_qpu_uses_tlb(&inst->qpu))) { c->cursor = vir_before_inst(inst); @@ -261,6 +311,14 @@ static unsigned int v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) { struct v3d_ra_select_callback_data *v3d_ra = data; + int r5 = ACC_INDEX + 5; + + /* Choose r5 for our ldunifs if possible (nobody else can load to that + * reg, and it keeps the QPU cond field free from being occupied by + * ldunifrf). + */ + if (BITSET_TEST(regs, r5)) + return r5; /* Choose an accumulator if possible (I think it's lower power than * phys regs), but round-robin through them to give post-RA @@ -303,6 +361,10 @@ vir_init_reg_sets(struct v3d_compiler *compiler) return false; for (int threads = 0; threads < max_thread_index; threads++) { + compiler->reg_class_any[threads] = + ra_alloc_reg_class(compiler->regs); + compiler->reg_class_r5[threads] = + ra_alloc_reg_class(compiler->regs); compiler->reg_class_phys_or_acc[threads] = ra_alloc_reg_class(compiler->regs); compiler->reg_class_phys[threads] = @@ -314,12 +376,25 @@ vir_init_reg_sets(struct v3d_compiler *compiler) compiler->reg_class_phys_or_acc[threads], i); ra_class_add_reg(compiler->regs, compiler->reg_class_phys[threads], i); + ra_class_add_reg(compiler->regs, + compiler->reg_class_any[threads], i); } - for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT; i++) { + for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) { ra_class_add_reg(compiler->regs, compiler->reg_class_phys_or_acc[threads], i); + ra_class_add_reg(compiler->regs, + compiler->reg_class_any[threads], i); } + /* r5 can only store a single 32-bit value, so not much can + * use it. + */ + ra_class_add_reg(compiler->regs, + compiler->reg_class_r5[threads], + ACC_INDEX + 5); + ra_class_add_reg(compiler->regs, + compiler->reg_class_any[threads], + ACC_INDEX + 5); } ra_set_finalize(compiler->regs, NULL); @@ -342,9 +417,11 @@ node_to_temp_priority(const void *in_a, const void *in_b) } #define CLASS_BIT_PHYS (1 << 0) -#define CLASS_BIT_R0_R2 (1 << 1) -#define CLASS_BIT_R3 (1 << 2) -#define CLASS_BIT_R4 (1 << 3) +#define CLASS_BIT_ACC (1 << 1) +#define CLASS_BIT_R5 (1 << 4) +#define CLASS_BITS_ANY (CLASS_BIT_PHYS | \ + CLASS_BIT_ACC | \ + CLASS_BIT_R5) /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. @@ -357,8 +434,6 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) struct node_to_temp_map map[c->num_temps]; uint32_t temp_to_node[c->num_temps]; uint8_t class_bits[c->num_temps]; - struct qpu_reg *temp_registers = calloc(c->num_temps, - sizeof(*temp_registers)); int acc_nodes[ACC_COUNT]; struct v3d_ra_select_callback_data callback_data = { .next_acc = 0, @@ -412,9 +487,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) * start with any temp being able to be in any file, then instructions * incrementally remove bits that the temp definitely can't be in. 
*/ - memset(class_bits, - CLASS_BIT_PHYS | CLASS_BIT_R0_R2 | CLASS_BIT_R3 | CLASS_BIT_R4, - sizeof(class_bits)); + memset(class_bits, CLASS_BITS_ANY, sizeof(class_bits)); int ip = 0; vir_for_each_inst_inorder(inst, c) { @@ -497,6 +570,24 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) } } + if (inst->dst.file == QFILE_TEMP) { + /* Only a ldunif gets to write to R5, which only has a + * single 32-bit channel of storage. + */ + if (!inst->qpu.sig.ldunif) { + class_bits[inst->dst.index] &= ~CLASS_BIT_R5; + } else { + /* Until V3D 4.x, we could only load a uniform + * to r5, so we'll need to spill if uniform + * loads interfere with each other. + */ + if (c->devinfo->ver < 40) { + class_bits[inst->dst.index] &= + CLASS_BIT_R5; + } + } + } + if (inst->qpu.sig.thrsw) { /* All accumulators are invalidated across a thread * switch. @@ -514,13 +605,16 @@ if (class_bits[i] == CLASS_BIT_PHYS) { ra_set_node_class(g, temp_to_node[i], c->compiler->reg_class_phys[thread_index]); - } else { - assert(class_bits[i] == (CLASS_BIT_PHYS | - CLASS_BIT_R0_R2 | - CLASS_BIT_R3 | - CLASS_BIT_R4)); + } else if (class_bits[i] == (CLASS_BIT_R5)) { + ra_set_node_class(g, temp_to_node[i], + c->compiler->reg_class_r5[thread_index]); + } else if (class_bits[i] == (CLASS_BIT_PHYS | CLASS_BIT_ACC)) { ra_set_node_class(g, temp_to_node[i], c->compiler->reg_class_phys_or_acc[thread_index]); + } else { + assert(class_bits[i] == CLASS_BITS_ANY); + ra_set_node_class(g, temp_to_node[i], + c->compiler->reg_class_any[thread_index]); } } @@ -539,7 +633,8 @@ * conformance tests to make sure that spilling works. */ int force_register_spills = 0; - if (c->spill_size < 16 * sizeof(uint32_t) * force_register_spills) { + if (c->spill_size < + V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) { int node = v3d_choose_spill_node(c, g, temp_to_node); if (node != -1) { v3d_spill_reg(c, map[node].temp); @@ -551,24 +646,27 @@ bool ok = ra_allocate(g); if (!ok) { - /* Try to spill, if we can't reduce threading first. */ - if (thread_index == 0) { - int node = v3d_choose_spill_node(c, g, temp_to_node); + int node = v3d_choose_spill_node(c, g, temp_to_node); - if (node != -1) { - v3d_spill_reg(c, map[node].temp); - ralloc_free(g); + /* Don't emit spills using the TMU until we've dropped thread + * count first. + */ + if (node != -1 && + (vir_is_mov_uniform(c, map[node].temp) || + thread_index == 0)) { + v3d_spill_reg(c, map[node].temp); - /* Ask the outer loop to call back in. */ - *spilled = true; - return NULL; - } + /* Ask the outer loop to call back in.
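The retry handshake described in this comment (*spilled = true, then return NULL) implies an outer loop in the caller: rerun register allocation while it reports spill progress, and drop the thread count when it fails without spilling. A hedged sketch of that loop shape, using stand-in types rather than vir's real compile state:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Stand-ins for the real compile state and allocator entry point;
     * this fake allocator succeeds once two spills have happened.
     */
    struct toy_compile {
            int thread_index;
            int num_spills;
    };

    static int *
    toy_register_allocate(struct toy_compile *c, bool *spilled)
    {
            static int regs[1];

            *spilled = false;
            if (c->num_spills < 2) {
                    c->num_spills++;
                    *spilled = true;   /* "ask the outer loop to call back in" */
                    return NULL;
            }
            return regs;
    }

    int
    main(void)
    {
            struct toy_compile c = { .thread_index = 2 };
            int *temp_registers = NULL;

            /* Retry while the allocator makes spill progress; when it
             * fails without spilling, drop the thread count and retry.
             */
            while (!temp_registers) {
                    bool spilled;

                    temp_registers = toy_register_allocate(&c, &spilled);
                    if (temp_registers || spilled)
                            continue;
                    if (c.thread_index == 0) {
                            fprintf(stderr, "register allocation failed\n");
                            return 1;
                    }
                    c.thread_index--;
            }

            printf("allocated after %d spills\n", c.num_spills);
            return 0;
    }
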
*/ + *spilled = true; } - free(temp_registers); + ralloc_free(g); return NULL; } + struct qpu_reg *temp_registers = calloc(c->num_temps, + sizeof(*temp_registers)); + for (uint32_t i = 0; i < c->num_temps; i++) { int ra_reg = ra_get_node_reg(g, temp_to_node[i]); if (ra_reg < PHYS_INDEX) { @@ -591,17 +689,5 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled) ralloc_free(g); - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - c->spills); - - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - c->fills); - } - return temp_registers; } diff --git a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c index c66bb84b3..e6461ff94 100644 --- a/lib/mesa/src/broadcom/compiler/vir_to_qpu.c +++ b/lib/mesa/src/broadcom/compiler/vir_to_qpu.c @@ -76,7 +76,7 @@ v3d_qpu_nop(void) static struct qinst * vir_nop(void) { - struct qreg undef = { QFILE_NULL, 0 }; + struct qreg undef = vir_nop_reg(); struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef); return qinst; @@ -92,16 +92,6 @@ new_qpu_nop_before(struct qinst *inst) return q; } -static void -new_ldunif_instr(struct qinst *inst, int i) -{ - struct qinst *ldunif = new_qpu_nop_before(inst); - - ldunif->qpu.sig.ldunif = true; - assert(inst->src[i].file == QFILE_UNIF); - ldunif->uniform = inst->src[i].index; -} - /** * Allocates the src register (accumulator or register file) into the RADDR * fields of the instruction. @@ -214,16 +204,11 @@ v3d_generate_code_block(struct v3d_compile *c, struct qinst *temp; - if (vir_has_implicit_uniform(qinst)) { - int src = vir_get_implicit_uniform_src(qinst); - assert(qinst->src[src].file == QFILE_UNIF); - qinst->uniform = qinst->src[src].index; + if (vir_has_uniform(qinst)) c->num_uniforms++; - } - int nsrc = vir_get_non_sideband_nsrc(qinst); + int nsrc = vir_get_nsrc(qinst); struct qpu_reg src[ARRAY_SIZE(qinst->src)]; - bool emitted_ldunif = false; for (int i = 0; i < nsrc; i++) { int index = qinst->src[i].index; switch (qinst->src[i].file) { @@ -240,19 +225,6 @@ v3d_generate_code_block(struct v3d_compile *c, case QFILE_TEMP: src[i] = temp_registers[index]; break; - case QFILE_UNIF: - /* XXX perf: If the last ldunif we emitted was - * the same uniform value, skip it. Common - * for multop/umul24 sequences. 
- */ - if (!emitted_ldunif) { - new_ldunif_instr(qinst, i); - c->num_uniforms++; - emitted_ldunif = true; - } - - src[i] = qpu_acc(5); - break; case QFILE_SMALL_IMM: src[i].smimm = true; break; @@ -268,10 +240,6 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_acc(3); break; - - case QFILE_TLB: - case QFILE_TLBU: - unreachable("bad vir src file"); } } @@ -297,15 +265,6 @@ v3d_generate_code_block(struct v3d_compile *c, dst = qpu_magic(V3D_QPU_WADDR_VPM); break; - case QFILE_TLB: - dst = qpu_magic(V3D_QPU_WADDR_TLB); - break; - - case QFILE_TLBU: - dst = qpu_magic(V3D_QPU_WADDR_TLBU); - break; - - case QFILE_UNIF: case QFILE_SMALL_IMM: case QFILE_LOAD_IMM: assert(!"not reached"); @@ -313,7 +272,20 @@ v3d_generate_code_block(struct v3d_compile *c, } if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { - if (v3d_qpu_sig_writes_address(c->devinfo, + if (qinst->qpu.sig.ldunif) { + assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); + assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); + + if (!dst.magic || + dst.index != V3D_QPU_WADDR_R5) { + assert(c->devinfo->ver >= 40); + + qinst->qpu.sig.ldunif = false; + qinst->qpu.sig.ldunifrf = true; + qinst->qpu.sig_addr = dst.index; + qinst->qpu.sig_magic = dst.magic; + } + } else if (v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig)) { assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP); assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP); @@ -361,11 +333,12 @@ static bool reads_uniform(const struct v3d_device_info *devinfo, uint64_t instruction) { struct v3d_qpu_instr qpu; - MAYBE_UNUSED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu); + ASSERTED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu); assert(ok); if (qpu.sig.ldunif || - qpu.sig.ldunifarf || + qpu.sig.ldunifrf || + qpu.sig.ldtlbu || qpu.sig.wrtmuc) { return true; } @@ -433,7 +406,7 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) vir_for_each_block(block, c) v3d_generate_code_block(c, block, temp_registers); - uint32_t cycles = v3d_qpu_schedule_instructions(c); + v3d_qpu_schedule_instructions(c); c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count); int i = 0; @@ -450,23 +423,6 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers) } assert(i == c->qpu_inst_count); - if (V3D_DEBUG & V3D_DEBUG_SHADERDB) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - c->qpu_inst_count); - } - - /* The QPU cycle estimates are pretty broken (see waddr_latency()), so - * don't report them for now. - */ - if (false) { - fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n", - vir_get_stage_name(c), - c->program_id, c->variant_id, - cycles); - } - if (V3D_DEBUG & (V3D_DEBUG_QPU | v3d_debug_flag_for_shader_stage(c->s->info.stage))) { v3d_dump_qpu(c); diff --git a/lib/mesa/src/broadcom/drm-shim/README.md b/lib/mesa/src/broadcom/drm-shim/README.md new file mode 100644 index 000000000..dde21c1b8 --- /dev/null +++ b/lib/mesa/src/broadcom/drm-shim/README.md @@ -0,0 +1,17 @@ +### v3d backend + +This implements some of v3d using the closed source v3dv3 tree's +C/C++-based simulator. All execution is synchronous. + +Export: `MESA_LOADER_DRIVER_OVERRIDE=v3d +LD_PRELOAD=$prefix/lib/libv3d_drm_shim.so`. The v3dv3 version exposed +will depend on the v3dv3 build -- 3.3, 4.1, and 4.2 are supported. + +### v3d_noop backend + +This implements the minimum of v3d in order to make shader-db work. +The submit ioctl is stubbed out to not execute anything. 
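Either shim can be smoke-tested without a full GL stack by opening the render node and issuing DRM_IOCTL_V3D_GET_PARAM directly; the noop backend answers from a canned V3D 4.2 register table. A minimal sketch (the renderD128 path and the in-tree include path are assumptions; adjust for your system):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "drm-uapi/v3d_drm.h"   /* include path assumes a Mesa tree */

    int
    main(void)
    {
            /* Assumes the shim is LD_PRELOADed and that renderD128 is
             * the render node it intercepts.
             */
            int fd = open("/dev/dri/renderD128", O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            struct drm_v3d_get_param gp = {
                    .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
            };

            /* The noop shim answers this from its v3d42_reg_map table. */
            if (ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &gp) == 0)
                    printf("CORE0_IDENT1: 0x%08llx\n",
                           (unsigned long long)gp.value);

            close(fd);
            return 0;
    }
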
+ +Export `MESA_LOADER_DRIVER_OVERRIDE=v3d +LD_PRELOAD=$prefix/lib/libv3d_noop_drm_shim.so`. This will be a V3D +4.2 device. diff --git a/lib/mesa/src/broadcom/drm-shim/meson.build b/lib/mesa/src/broadcom/drm-shim/meson.build new file mode 100644 index 000000000..4fcc594ad --- /dev/null +++ b/lib/mesa/src/broadcom/drm-shim/meson.build @@ -0,0 +1,62 @@ +# Copyright © 2019 Broadcom +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libv3d_noop_drm_shim = shared_library( + ['v3d_noop_drm_shim'], + 'v3d_noop.c', + include_directories: inc_common, + dependencies: dep_drm_shim, + c_args : c_vis_args, + install : true, +) + +dep_v3dv3 = dependency('v3dv3', required: false) +if dep_v3dv3.found() + v3dv3_c_args = '-DUSE_V3D_SIMULATOR' + + inc_gallium_v3d = include_directories('../../gallium/drivers/v3d') + + per_version_libs = [] + foreach ver : v3d_versions + per_version_libs += static_library( + 'libv3d_drm_shim-v' + ver, + [ + 'v3dx.c', + v3d_xml_pack + ], + include_directories : [inc_common, inc_broadcom, inc_src, inc_gallium_v3d], + c_args : [c_vis_args, no_override_init_args, '-DV3D_VERSION=' + ver, v3dv3_c_args], + dependencies: [dep_valgrind, dep_thread, dep_v3dv3], + ) + endforeach + + libv3d_drm_shim = shared_library( + ['v3d_drm_shim'], + [ + 'v3d.c', + '../../gallium/drivers/v3d/v3d_simulator_wrapper.cpp', + ], + dependencies: [idep_mesautil, dep_dl, dep_drm_shim, dep_v3dv3], + link_with: per_version_libs, + include_directories : [inc_common, inc_broadcom, inc_gallium_v3d], + c_args : [c_vis_args, no_override_init_args, '-std=gnu99', v3dv3_c_args], + cpp_args : [v3dv3_c_args] + ) +endif diff --git a/lib/mesa/src/broadcom/drm-shim/v3d.c b/lib/mesa/src/broadcom/drm-shim/v3d.c new file mode 100644 index 000000000..e75657f59 --- /dev/null +++ b/lib/mesa/src/broadcom/drm-shim/v3d.c @@ -0,0 +1,98 @@ +/* + * Copyright © 2018 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <stdio.h> +#include <sys/ioctl.h> +#include "drm-uapi/v3d_drm.h" +#include "drm-shim/drm_shim.h" +#include "v3d.h" +#include "v3d_simulator_wrapper.h" + +static struct v3d_device_info devinfo; +struct v3d_shim_device v3d = { + .devinfo = &devinfo +}; + +struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle) +{ + return v3d_bo(drm_shim_bo_lookup(shim_fd, handle)); +} + +int +v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg) +{ + /* No need to wait on anything yet, given that we submit + * synchronously. + */ + return 0; +} + +int +v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_mmap_bo *map = arg; + struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle); + + map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo); + + drm_shim_bo_put(bo); + + return 0; +} + +int +v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_get_bo_offset *get = arg; + struct v3d_bo *bo = v3d_bo_lookup(shim_fd, get->handle); + + get->offset = bo->offset; + + drm_shim_bo_put(&bo->base); + + return 0; +} + +void +drm_shim_driver_init(void) +{ + shim_device.driver_name = "v3d"; + + drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n" + "OF_COMPATIBLE_N=1\n" + "OF_COMPATIBLE_0=brcm,7278-v3d\n", + "/sys/dev/char/%d:%d/device/uevent", + DRM_MAJOR, render_node_minor); + + v3d.hw = v3d_hw_auto_new(NULL); + v3d.devinfo->ver = v3d_hw_get_version(v3d.hw); + + if (v3d.devinfo->ver >= 42) + v3d42_drm_shim_driver_init(); + else if (v3d.devinfo->ver >= 41) + v3d41_drm_shim_driver_init(); + else + v3d33_drm_shim_driver_init(); +} diff --git a/lib/mesa/src/broadcom/drm-shim/v3d.h b/lib/mesa/src/broadcom/drm-shim/v3d.h new file mode 100644 index 000000000..0712b8b3f --- /dev/null +++ b/lib/mesa/src/broadcom/drm-shim/v3d.h @@ -0,0 +1,70 @@ +/* + * Copyright © 2018 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef DRM_SHIM_V3D_H +#define DRM_SHIM_V3D_H + +#include "broadcom/common/v3d_device_info.h" +#include "util/vma.h" + +struct drm_shim_fd; + +struct v3d_shim_device { + struct v3d_hw *hw; + struct v3d_device_info *devinfo; + + /* Base virtual address of the heap. */ + void *mem; + /* Base hardware address of the heap. */ + uint32_t mem_base; + /* Size of the heap. */ + size_t mem_size; + + /* Allocator for the GPU virtual addresses. */ + struct util_vma_heap heap; +}; +extern struct v3d_shim_device v3d; + +struct v3d_bo { + struct shim_bo base; + uint64_t offset; + void *sim_vaddr; + void *gem_vaddr; +}; + +static inline struct v3d_bo * +v3d_bo(struct shim_bo *bo) +{ + return (struct v3d_bo *)bo; +} + +struct v3d_bo *v3d_bo_lookup(struct shim_fd *shim_fd, int handle); +int v3d_ioctl_wait_bo(int fd, unsigned long request, void *arg); +int v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg); +int v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg); + +void v3d33_drm_shim_driver_init(void); +void v3d41_drm_shim_driver_init(void); +void v3d42_drm_shim_driver_init(void); + +#endif /* DRM_SHIM_V3D_H */ diff --git a/lib/mesa/src/broadcom/drm-shim/v3d_noop.c b/lib/mesa/src/broadcom/drm-shim/v3d_noop.c new file mode 100644 index 000000000..7c7d75128 --- /dev/null +++ b/lib/mesa/src/broadcom/drm-shim/v3d_noop.c @@ -0,0 +1,158 @@ +/* + * Copyright © 2018 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include "drm-uapi/v3d_drm.h" +#include "drm-shim/drm_shim.h" + +struct v3d_bo { + struct shim_bo base; + uint32_t offset; +}; + +static struct v3d_bo * +v3d_bo(struct shim_bo *bo) +{ + return (struct v3d_bo *)bo; +} + +struct v3d_device { + uint32_t next_offset; +}; + +static struct v3d_device v3d = { + .next_offset = 0x1000, +}; + +static int +v3d_ioctl_noop(int fd, unsigned long request, void *arg) +{ + return 0; +} + +static int +v3d_ioctl_create_bo(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_create_bo *create = arg; + struct v3d_bo *bo = calloc(1, sizeof(*bo)); + + drm_shim_bo_init(&bo->base, create->size); + + assert(UINT_MAX - v3d.next_offset > create->size); + bo->offset = v3d.next_offset; + v3d.next_offset += create->size; + + create->offset = bo->offset; + create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base); + + drm_shim_bo_put(&bo->base); + + return 0; +} + +static int +v3d_ioctl_get_bo_offset(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_get_bo_offset *args = arg; + struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, args->handle); + + args->offset = v3d_bo(bo)->offset; + + drm_shim_bo_put(bo); + + return 0; +} + +static int +v3d_ioctl_mmap_bo(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_mmap_bo *map = arg; + struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, map->handle); + + map->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo); + + drm_shim_bo_put(bo); + + return 0; +} + +static int +v3d_ioctl_get_param(int fd, unsigned long request, void *arg) +{ + struct drm_v3d_get_param *gp = arg; + static const uint32_t v3d42_reg_map[] = { + [DRM_V3D_PARAM_V3D_UIFCFG] = 0x00000045, + [DRM_V3D_PARAM_V3D_HUB_IDENT1] = 0x000e1124, + [DRM_V3D_PARAM_V3D_HUB_IDENT2] = 0x00000100, + [DRM_V3D_PARAM_V3D_HUB_IDENT3] = 0x00000e00, + [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = 0x04443356, + [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = 0x81001422, + [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = 0x40078121, + }; + + switch (gp->param) { + case DRM_V3D_PARAM_SUPPORTS_TFU: + gp->value = 1; + return 0; + default: + break; + } + + if (gp->param < ARRAY_SIZE(v3d42_reg_map) && v3d42_reg_map[gp->param]) { + gp->value = v3d42_reg_map[gp->param]; + return 0; + } + + fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param); + return -1; +} + +static ioctl_fn_t driver_ioctls[] = { + [DRM_V3D_SUBMIT_CL] = v3d_ioctl_noop, + [DRM_V3D_SUBMIT_TFU] = v3d_ioctl_noop, + [DRM_V3D_WAIT_BO] = v3d_ioctl_noop, + [DRM_V3D_CREATE_BO] = v3d_ioctl_create_bo, + [DRM_V3D_GET_PARAM] = v3d_ioctl_get_param, + [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset, + [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo, +}; + +void +drm_shim_driver_init(void) +{ + shim_device.driver_name = "v3d"; + shim_device.driver_ioctls = driver_ioctls; + shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls); + + drm_shim_override_file("OF_FULLNAME=/rdb/v3d\n" + "OF_COMPATIBLE_N=1\n" + "OF_COMPATIBLE_0=brcm,7278-v3d\n", + "/sys/dev/char/%d:%d/device/uevent", + DRM_MAJOR, render_node_minor); +} diff --git a/lib/mesa/src/broadcom/drm-shim/v3dx.c b/lib/mesa/src/broadcom/drm-shim/v3dx.c new file mode 100644 index 000000000..a22550a03 --- /dev/null +++ b/lib/mesa/src/broadcom/drm-shim/v3dx.c @@ -0,0 +1,370 @@ +/* + * Copyright © 2014-2017 Broadcom + * + * Permission is hereby granted, 
free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* @file + * + * v3d driver code interacting with the v3dv3 simulator/FPGA library. + * + * This is compiled per V3D version we support, since the register definitions + * conflict. + */ + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include "util/macros.h" +#include "util/u_mm.h" +#include "broadcom/common/v3d_macros.h" +#include "v3d_simulator_wrapper.h" +#include "drm-shim/drm_shim.h" +#include "drm-uapi/v3d_drm.h" +#include "v3d.h" + +#define HW_REGISTER_RO(x) (x) +#define HW_REGISTER_RW(x) (x) +#if V3D_VERSION >= 41 +#include "libs/core/v3d/registers/4.1.34.0/v3d.h" +#else +#include "libs/core/v3d/registers/3.3.0.0/v3d.h" +#endif + +#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d.hw, reg, val) +#define V3D_READ(reg) v3d_hw_read_reg(v3d.hw, reg) + +static void +v3d_flush_l3() +{ + if (!v3d_hw_has_gca(v3d.hw)) + return; + +#if V3D_VERSION < 40 + uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL); + + V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET); + V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET); +#endif +} + +/* Invalidates the L2 cache. This is a read-only cache. */ +static void +v3d_flush_l2(void) +{ + V3D_WRITE(V3D_CTL_0_L2CACTL, + V3D_CTL_0_L2CACTL_L2CCLR_SET | + V3D_CTL_0_L2CACTL_L2CENA_SET); +} + +/* Invalidates texture L2 cachelines */ +static void +v3d_flush_l2t(void) +{ + V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0); + V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0); + V3D_WRITE(V3D_CTL_0_L2TCACTL, + V3D_CTL_0_L2TCACTL_L2TFLS_SET | + (0 << V3D_CTL_0_L2TCACTL_L2TFLM_LSB)); +} + +/* Invalidates the slice caches. These are read-only caches.
*/ +static void +v3d_flush_slices(void) +{ + V3D_WRITE(V3D_CTL_0_SLCACTL, ~0); +} + +static void +v3d_flush_caches(void) +{ + v3d_flush_l3(); + v3d_flush_l2(); + v3d_flush_l2t(); + v3d_flush_slices(); +} + +static void +v3d_simulator_copy_in_handle(struct shim_fd *shim_fd, int handle) +{ + if (!handle) + return; + + struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle); + + memcpy(bo->sim_vaddr, bo->gem_vaddr, bo->base.size); +} + +static void +v3d_simulator_copy_out_handle(struct shim_fd *shim_fd, int handle) +{ + if (!handle) + return; + + struct v3d_bo *bo = v3d_bo_lookup(shim_fd, handle); + + memcpy(bo->gem_vaddr, bo->sim_vaddr, bo->base.size); +} + +static int +v3dX(v3d_ioctl_submit_cl)(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_submit_cl *submit = arg; + uint32_t *bo_handles = (uint32_t *)(uintptr_t)submit->bo_handles; + + for (int i = 0; i < submit->bo_handle_count; i++) + v3d_simulator_copy_in_handle(shim_fd, bo_handles[i]); + + v3d_flush_caches(); + + if (submit->qma) { + V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma); + V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms); + } +#if V3D_VERSION >= 41 + if (submit->qts) { + V3D_WRITE(V3D_CLE_0_CT0QTS, + V3D_CLE_0_CT0QTS_CTQTSEN_SET | + submit->qts); + } +#endif + + fprintf(stderr, "submit %x..%x!\n", submit->bcl_start, submit->bcl_end); + + V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start); + V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end); + + /* Wait for bin to complete before firing render, as it seems the + * simulator doesn't implement the semaphores. + */ + while (V3D_READ(V3D_CLE_0_CT0CA) != + V3D_READ(V3D_CLE_0_CT0EA)) { + v3d_hw_tick(v3d.hw); + } + + fprintf(stderr, "submit %x..%x!\n", submit->rcl_start, submit->rcl_end); + + v3d_flush_caches(); + + V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start); + V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end); + + while (V3D_READ(V3D_CLE_0_CT1CA) != + V3D_READ(V3D_CLE_0_CT1EA)) { + v3d_hw_tick(v3d.hw); + } + + for (int i = 0; i < submit->bo_handle_count; i++) + v3d_simulator_copy_out_handle(shim_fd, bo_handles[i]); + + return 0; +} + +static int +v3dX(v3d_ioctl_submit_tfu)(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_submit_tfu *submit = arg; + + v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[0]); + v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[1]); + v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[2]); + v3d_simulator_copy_in_handle(shim_fd, submit->bo_handles[3]); + + int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET; + + V3D_WRITE(V3D_TFU_IIA, submit->iia); + V3D_WRITE(V3D_TFU_IIS, submit->iis); + V3D_WRITE(V3D_TFU_ICA, submit->ica); + V3D_WRITE(V3D_TFU_IUA, submit->iua); + V3D_WRITE(V3D_TFU_IOA, submit->ioa); + V3D_WRITE(V3D_TFU_IOS, submit->ios); + V3D_WRITE(V3D_TFU_COEF0, submit->coef[0]); + V3D_WRITE(V3D_TFU_COEF1, submit->coef[1]); + V3D_WRITE(V3D_TFU_COEF2, submit->coef[2]); + V3D_WRITE(V3D_TFU_COEF3, submit->coef[3]); + + V3D_WRITE(V3D_TFU_ICFG, submit->icfg); + + while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) { + v3d_hw_tick(v3d.hw); + } + + v3d_simulator_copy_out_handle(shim_fd, submit->bo_handles[0]); + + return 0; +} + +static int +v3dX(v3d_ioctl_create_bo)(int fd, unsigned long request, void *arg) +{ + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct drm_v3d_create_bo *create = arg; + struct v3d_bo *bo = calloc(1, sizeof(*bo)); + + drm_shim_bo_init(&bo->base, create->size); + bo->offset = 
util_vma_heap_alloc(&v3d.heap, create->size, 4096); + if (bo->offset == 0) + return -ENOMEM; + + bo->sim_vaddr = v3d.mem + bo->offset - v3d.mem_base; +#if 0 + /* Place a mapping of the BO inside of the simulator's address space + * for V3D memory. This lets us avoid copy in/out for simpenrose, but + * I'm betting we'll need something else for FPGA. + */ + void *sim_addr = v3d.mem + bo->block->ofs; + void *mmap_ret = mmap(sim_addr, create->size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, bo->base.fd, 0); + assert(mmap_ret == sim_addr); +#else + /* Make a simulator-private mapping of the shim GEM object. */ + bo->gem_vaddr = mmap(NULL, bo->base.size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + bo->base.fd, 0); + if (bo->gem_vaddr == MAP_FAILED) { + fprintf(stderr, "v3d: mmap of shim bo failed\n"); + abort(); + } +#endif + + create->offset = bo->offset; + create->handle = drm_shim_bo_get_handle(shim_fd, &bo->base); + + drm_shim_bo_put(&bo->base); + + return 0; +} + +static int +v3dX(v3d_ioctl_get_param)(int fd, unsigned long request, void *arg) +{ + struct drm_v3d_get_param *gp = arg; + static const uint32_t reg_map[] = { + [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG, + [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1, + [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2, + [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3, + [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0, + [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1, + [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2, + }; + + switch (gp->param) { + case DRM_V3D_PARAM_SUPPORTS_TFU: + gp->value = 1; + return 0; + } + + if (gp->param < ARRAY_SIZE(reg_map) && reg_map[gp->param]) { + gp->value = V3D_READ(reg_map[gp->param]); + return 0; + } + + fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM %d\n", gp->param); + return -1; +} + +static ioctl_fn_t driver_ioctls[] = { + [DRM_V3D_SUBMIT_CL] = v3dX(v3d_ioctl_submit_cl), + [DRM_V3D_SUBMIT_TFU] = v3dX(v3d_ioctl_submit_tfu), + [DRM_V3D_WAIT_BO] = v3d_ioctl_wait_bo, + [DRM_V3D_CREATE_BO] = v3dX(v3d_ioctl_create_bo), + [DRM_V3D_GET_PARAM] = v3dX(v3d_ioctl_get_param), + [DRM_V3D_MMAP_BO] = v3d_ioctl_mmap_bo, + [DRM_V3D_GET_BO_OFFSET] = v3d_ioctl_get_bo_offset, +}; + +static void +v3d_isr(uint32_t hub_status) +{ + /* Check the per-core bits */ + if (hub_status & (1 << 0)) { + uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS); + + if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) { + fprintf(stderr, "GMP violation at 0x%08x\n", + V3D_READ(V3D_GMP_0_VIO_ADDR)); + abort(); + } else { + fprintf(stderr, + "Unexpected ISR with core status 0x%08x\n", + core_status); + } + abort(); + } + + return; +} + +static void +v3dX(simulator_init_regs)(void) +{ +#if V3D_VERSION == 33 + /* Set OVRTMUOUT to match kernel behavior. + * + * This means that the texture sampler uniform configuration's tmu + * output type field is used, instead of using the hardware default + * behavior based on the texture type. If you want the default + * behavior, you can still put "2" in the indirect texture state's + * output_type field. 
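The offset returned by create_bo above comes from a virtual-address heap carved out of the simulator memory, managed through util_vma_heap_init()/_alloc()/_free() from Mesa's util/vma.h. A compact sketch of that allocate/free pairing, assuming it is compiled inside a Mesa tree so the header resolves:

    #include <inttypes.h>
    #include <stdio.h>
    #include "util/vma.h"   /* Mesa utility header, assumed in-tree */

    int
    main(void)
    {
            struct util_vma_heap heap;

            /* Mirror drm_shim_driver_init(): skip page 0 so that an
             * offset of 0 can mean "allocation failed".
             */
            util_vma_heap_init(&heap, 4096, (1024u * 1024 * 1024) - 4096);

            uint64_t bo_offset = util_vma_heap_alloc(&heap, 64 * 1024, 4096);
            if (bo_offset == 0) {
                    fprintf(stderr, "vma heap exhausted\n");
                    return 1;
            }
            printf("bo at 0x%" PRIx64 "\n", bo_offset);

            /* The free must pass the size back, as v3d_bo_free() does. */
            util_vma_heap_free(&heap, bo_offset, 64 * 1024);
            util_vma_heap_finish(&heap);
            return 0;
    }
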
+ */ + V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET); +#endif + + uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_GMPV_SET; + V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts); + V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts); + + v3d_hw_set_isr(v3d.hw, v3d_isr); +} + +static void +v3d_bo_free(struct shim_bo *shim_bo) +{ + struct v3d_bo *bo = v3d_bo(shim_bo); + + if (bo->gem_vaddr) + munmap(bo->gem_vaddr, shim_bo->size); + + util_vma_heap_free(&v3d.heap, bo->offset, bo->base.size); +} + +void +v3dX(drm_shim_driver_init)(void) +{ + shim_device.driver_ioctls = driver_ioctls; + shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls); + + shim_device.driver_bo_free = v3d_bo_free; + + /* Allocate a gig of memory to play in. */ + v3d_hw_alloc_mem(v3d.hw, 1024 * 1024 * 1024); + v3d.mem_base = + v3d_hw_get_mem(v3d.hw, &v3d.mem_size, + &v3d.mem); + util_vma_heap_init(&v3d.heap, 4096, v3d.mem_size - 4096); + + v3dX(simulator_init_regs)(); +} diff --git a/lib/mesa/src/broadcom/meson.build b/lib/mesa/src/broadcom/meson.build index d3ea362f2..57f0d889b 100644 --- a/lib/mesa/src/broadcom/meson.build +++ b/lib/mesa/src/broadcom/meson.build @@ -30,6 +30,10 @@ if with_gallium_v3d subdir('qpu') endif +if with_tools.contains('drm-shim') + subdir('drm-shim') +endif + per_version_libs = [] foreach ver : v3d_versions per_version_libs += static_library( @@ -47,7 +51,7 @@ endforeach libbroadcom_v3d = static_library( 'libbroadcom_v3d', [ - files('common/v3d_debug.c', 'clif/clif_dump.c'), + files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c'), v3d_xml_pack, ], include_directories : [inc_common, inc_broadcom, inc_src], diff --git a/lib/mesa/src/broadcom/qpu/meson.build b/lib/mesa/src/broadcom/qpu/meson.build index 279b09cb9..c9cf7b9e9 100644 --- a/lib/mesa/src/broadcom/qpu/meson.build +++ b/lib/mesa/src/broadcom/qpu/meson.build @@ -39,7 +39,8 @@ test( 'qpu_disasm', executable( 'qpu_disasm', 'tests/qpu_disasm.c', - link_with: [libbroadcom_qpu, libmesa_util], + link_with: libbroadcom_qpu, + dependencies : idep_mesautil, include_directories: inc_common ), suite : ['broadcom'], diff --git a/lib/mesa/src/broadcom/qpu/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/qpu_disasm.c index 32e7ba12a..9f59bcdf7 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_disasm.c +++ b/lib/mesa/src/broadcom/qpu/qpu_disasm.c @@ -64,7 +64,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm, } else if (mux == V3D_QPU_MUX_B) { if (instr->sig.small_imm) { uint32_t val; - MAYBE_UNUSED bool ok = + ASSERTED bool ok = v3d_qpu_small_imm_unpack(disasm->devinfo, instr->raddr_b, &val); @@ -205,6 +205,8 @@ v3d_qpu_disasm_sig(struct disasm_state *disasm, !sig->ldvary && !sig->ldvpm && !sig->ldtmu && + !sig->ldtlb && + !sig->ldtlbu && !sig->ldunif && !sig->ldunifrf && !sig->ldunifa && diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.c b/lib/mesa/src/broadcom/qpu/qpu_instr.c index add2d2a23..09d06b3fa 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_instr.c +++ b/lib/mesa/src/broadcom/qpu/qpu_instr.c @@ -645,19 +645,10 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) { - if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { - switch (inst->alu.add.op) { - case V3D_QPU_A_RECIP: - case V3D_QPU_A_RSQRT: - case V3D_QPU_A_EXP: - case V3D_QPU_A_LOG: - case V3D_QPU_A_SIN: - case V3D_QPU_A_RSQRT2: - return true; - default: - break; - } + if (v3d_qpu_instr_is_sfu(inst)) + return true; + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.magic_write && 
v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) { return true; @@ -673,6 +664,25 @@ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) } bool +v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) +{ + if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { + switch (inst->alu.add.op) { + case V3D_QPU_A_RECIP: + case V3D_QPU_A_RSQRT: + case V3D_QPU_A_EXP: + case V3D_QPU_A_LOG: + case V3D_QPU_A_SIN: + case V3D_QPU_A_RSQRT2: + return true; + default: + return false; + } + } + return false; +} + +bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) { return (inst->type == V3D_QPU_INSTR_TYPE_ALU && @@ -683,6 +693,16 @@ v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) } bool +v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) +{ + return v3d_qpu_writes_tmu(inst) && + (!inst->alu.add.magic_write || + inst->alu.add.waddr != V3D_QPU_WADDR_TMUC) && + (!inst->alu.mul.magic_write || + inst->alu.mul.waddr != V3D_QPU_WADDR_TMUC); +} + +bool v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst) { if (inst->sig.ldvpm) @@ -751,9 +771,6 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *inst) { - if (inst->sig.ldtmu) - return true; - if (inst->type == V3D_QPU_INSTR_TYPE_ALU) { if (inst->alu.add.magic_write && (inst->alu.add.waddr == V3D_QPU_WADDR_R4 || @@ -768,8 +785,10 @@ v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, } } - if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && - inst->sig_magic && inst->sig_addr == V3D_QPU_WADDR_R4) { + if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) { + if (inst->sig_magic && inst->sig_addr == V3D_QPU_WADDR_R4) + return true; + } else if (inst->sig.ldtmu) { return true; } @@ -867,3 +886,70 @@ v3d_qpu_writes_flags(const struct v3d_qpu_instr *inst) return false; } + +bool +v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + switch (inst->alu.add.op) { + case V3D_QPU_A_FADD: + case V3D_QPU_A_FADDNF: + case V3D_QPU_A_FSUB: + case V3D_QPU_A_FMIN: + case V3D_QPU_A_FMAX: + case V3D_QPU_A_FCMP: + case V3D_QPU_A_FROUND: + case V3D_QPU_A_FTRUNC: + case V3D_QPU_A_FFLOOR: + case V3D_QPU_A_FCEIL: + case V3D_QPU_A_FDX: + case V3D_QPU_A_FDY: + case V3D_QPU_A_FTOIN: + case V3D_QPU_A_FTOIZ: + case V3D_QPU_A_FTOUZ: + case V3D_QPU_A_FTOC: + case V3D_QPU_A_VFPACK: + return true; + break; + default: + break; + } + + switch (inst->alu.mul.op) { + case V3D_QPU_M_FMOV: + case V3D_QPU_M_FMUL: + return true; + break; + default: + break; + } + + return false; +} +bool +v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) +{ + if (inst->type != V3D_QPU_INSTR_TYPE_ALU) + return false; + + switch (inst->alu.add.op) { + case V3D_QPU_A_VFMIN: + case V3D_QPU_A_VFMAX: + return true; + break; + default: + break; + } + + switch (inst->alu.mul.op) { + case V3D_QPU_M_VFMUL: + return true; + break; + default: + break; + } + + return false; +} diff --git a/lib/mesa/src/broadcom/qpu/qpu_instr.h b/lib/mesa/src/broadcom/qpu/qpu_instr.h index 1e2dcb78a..ad2d37b60 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_instr.h +++ b/lib/mesa/src/broadcom/qpu/qpu_instr.h @@ -447,8 +447,10 @@ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST; bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_uses_sfu(const 
struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_writes_tmu_not_tmuc(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo, const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST; bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo, @@ -464,5 +466,7 @@ bool v3d_qpu_reads_flags(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_writes_flags(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; bool v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo, const struct v3d_qpu_sig *sig) ATTRIBUTE_CONST; +bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; +bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST; #endif diff --git a/lib/mesa/src/broadcom/qpu/qpu_pack.c b/lib/mesa/src/broadcom/qpu/qpu_pack.c index 70f31d734..516b0cf53 100644 --- a/lib/mesa/src/broadcom/qpu/qpu_pack.c +++ b/lib/mesa/src/broadcom/qpu/qpu_pack.c @@ -776,7 +776,11 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst, case V3D_QPU_A_FMIN: case V3D_QPU_A_FMAX: case V3D_QPU_A_FCMP: - instr->alu.add.output_pack = (op >> 4) & 0x3; + case V3D_QPU_A_VFPACK: + if (instr->alu.add.op != V3D_QPU_A_VFPACK) + instr->alu.add.output_pack = (op >> 4) & 0x3; + else + instr->alu.add.output_pack = V3D_QPU_PACK_NONE; if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3, &instr->alu.add.a_unpack)) { @@ -1042,6 +1046,32 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, opcode |= a_unpack << 2; opcode |= b_unpack << 0; + + break; + } + + case V3D_QPU_A_VFPACK: { + uint32_t a_unpack; + uint32_t b_unpack; + + if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS || + instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack, + &a_unpack)) { + return false; + } + + if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack, + &b_unpack)) { + return false; + } + + opcode = (opcode & ~(1 << 2)) | (a_unpack << 2); + opcode = (opcode & ~(1 << 0)) | (b_unpack << 0); + break; } @@ -1065,7 +1095,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, } if (packed == 0) return false; - opcode |= packed << 2; + opcode = (opcode & ~(1 << 2)) | packed << 2; break; } diff --git a/lib/mesa/src/broadcom/qpu/qpu_validate.c b/lib/mesa/src/broadcom/qpu/qpu_validate.c new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/lib/mesa/src/broadcom/qpu/qpu_validate.c diff --git a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c index 2e8d98058..1bc3c9ec6 100644 --- a/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c +++ b/lib/mesa/src/broadcom/qpu/tests/qpu_disasm.c @@ -48,6 +48,9 @@ static const struct { { 33, 0x1c0a0dfde2294000ull, "fcmp.ifna rf61.h, r4.abs, r2.l; vfmul rf55, r2.hh, r1" }, { 33, 0x2011c89b402cc000ull, "fsub.norz rf27, r4.abs, r1.abs; vfmul.ifa rf34, r3.swp, r1" }, + { 33, 0xe01b42ab3bb063c0ull, "vfpack.andnc rf43, rf15.l, r0.h; fmul.ifna rf10.h, r4.l, r5.abs" }, + { 33, 0x600b8b87fb4d1000ull, "fdx.ifnb rf7.h, r1.l; fmul.pushn rf46, r3.l, r2.abs" }, + /* small immediates */ { 33, 0x5de24398bbdc6218ull, "vflb.andnn rf24 ; fmul rf14, -8, rf8.h" }, { 33, 0x25ef83d8b166f00full, "vfmin.pushn rf24, 15.ff, r5; smul24.ifnb rf15, r1, r3" }, |