/************************************************************************** * * Copyright 2017 Advanced Micro Devices, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * the Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /* This is a wrapper for pipe_context that executes all pipe_context calls * in another thread. * * * Guidelines for adopters and deviations from Gallium * --------------------------------------------------- * * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen * driver functions that take a context (fence_finish, texture_get_handle) * should manually unwrap pipe_context by doing: * pipe = threaded_context_unwrap_sync(pipe); * * pipe_context::priv is used to unwrap the context, so drivers and state * trackers shouldn't use it. * * No other objects are wrapped. 
* * 2) Drivers must subclass and initialize these structures: * - threaded_resource for pipe_resource (use threaded_resource_init/deinit) * - threaded_query for pipe_query (zero memory) * - threaded_transfer for pipe_transfer (zero memory) * * 3) The threaded context must not be enabled for contexts that can use video * codecs. * * 4) Changes in driver behavior: * - begin_query and end_query always return true; return values from * the driver are ignored. * - generate_mipmap uses is_format_supported to determine success; * the return value from the driver is ignored. * - resource_commit always returns true; failures are ignored. * - set_debug_callback is skipped if the callback is synchronous. * * * Thread-safety requirements on context functions * ----------------------------------------------- * * These pipe_context functions are executed directly, so they shouldn't use * pipe_context in an unsafe way. They are de-facto screen functions now: * - create_query * - create_batch_query * - create_*_state (all CSOs and shaders) * - Make sure the shader compiler doesn't use any per-context stuff. * (e.g. LLVM target machine) * - Only pipe_context's debug callback for shader dumps is guaranteed to * be up to date, because set_debug_callback synchronizes execution. * - create_surface * - surface_destroy * - create_sampler_view * - sampler_view_destroy * - stream_output_target_destroy * - transfer_map (only unsynchronized buffer mappings) * - get_query_result (when threaded_query::flushed == true) * * Create calls causing a sync that can't be async due to driver limitations: * - create_stream_output_target * * * Transfer_map rules for buffer mappings * -------------------------------------- * * 1) If transfer_map has PIPE_TRANSFER_UNSYNCHRONIZED, the call is made * in the non-driver thread without flushing the queue. The driver will * receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to * PIPE_TRANSFER_UNSYNCHRONIZED to indicate this. 
* Note that transfer_unmap is always enqueued and called from the driver * thread. * * 2) The driver isn't allowed to infer unsynchronized mappings by tracking * the valid buffer range. The threaded context always sends * TC_TRANSFER_MAP_IGNORE_VALID_RANGE to indicate this. Ignoring the flag * will lead to failures. * The threaded context does its own detection of unsynchronized mappings. * * 3) The driver isn't allowed to do buffer invalidations by itself under any * circumstances. This is necessary for unsynchronized maps to map the latest * version of the buffer (because invalidations can be queued, while * unsynchronized maps are not queued and they should return the latest * storage after invalidation). The threaded context always sends * TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to * indicate this. Ignoring the flag will lead to failures. * The threaded context uses its own buffer invalidation mechanism. * * * Additional requirements * ----------------------- * * get_query_result: * If threaded_query::flushed == true, get_query_result should assume that * it's called from a non-driver thread, in which case the driver shouldn't * use the context in an unsafe way. * * replace_buffer_storage: * The driver has to implement this callback, which will be called when * the threaded context wants to replace a resource's backing storage with * another resource's backing storage. The threaded context uses it to * implement buffer invalidation. This call is always queued. * * * Performance gotchas * ------------------- * * Buffer invalidations are done unconditionally - they don't check whether * the buffer is busy. This can cause drivers to have more live allocations * and CPU mappings than necessary. * * * How it works (queue architecture) * --------------------------------- * * There is a multithreaded queue consisting of batches, each batch consisting * of call slots. 
Each call slot consists of an 8-byte header (call ID + * call size + constant 32-bit marker for integrity checking) and an 8-byte * body for per-call data. That is 16 bytes per call slot. * * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger * calls occupy multiple call slots depending on the size needed by call * parameters. That means that calls can have a variable size in the batch. * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots. * Even though the first call slot can use only 8 bytes for data, additional * call slots used by the same call can use all 16 bytes for data. * For example, a call using 2 call slots has 24 bytes of space for data. * * Once a batch is full and there is no space for the next call, it's flushed, * meaning that it's added to the queue for execution in the other thread. * The batches are ordered in a ring and reused once they are idle again. * The batching is necessary for low queue/mutex overhead. * */ #ifndef U_THREADED_CONTEXT_H #define U_THREADED_CONTEXT_H #include "pipe/p_context.h" #include "pipe/p_state.h" #include "util/u_queue.h" #include "util/u_range.h" #include "util/slab.h" /* These are transfer flags sent to drivers. */ /* Never infer whether it's safe to use unsychronized mappings: */ #define TC_TRANSFER_MAP_IGNORE_VALID_RANGE (1u << 29) /* Don't invalidate buffers: */ #define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30) /* transfer_map is called from a non-driver thread: */ #define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31) /* Size of the queue = number of batch slots in memory. * - 1 batch is always idle and records new commands * - 1 batch is being executed * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches. * * Use a size as small as possible for low CPU L2 cache usage but large enough * so that the queue isn't stalled too often for not having enough idle batch * slots. 
*/ #define TC_MAX_BATCHES 10 /* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer) * can occupy multiple call slots. * * The idea is to have batches as small as possible but large enough so that * the queuing and mutex overhead is negligible. */ #define TC_CALLS_PER_BATCH 192 /* Threshold for when to use the queue or sync. */ #define TC_MAX_STRING_MARKER_BYTES 512 /* Threshold for when to enqueue buffer/texture_subdata as-is. * If the upload size is greater than this, it will do instead: * - for buffers: DISCARD_RANGE is done by the threaded context * - for textures: sync and call the driver directly */ #define TC_MAX_SUBDATA_BYTES 320 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx, struct pipe_resource *dst, struct pipe_resource *src); struct threaded_resource { struct pipe_resource b; const struct u_resource_vtbl *vtbl; /* Since buffer invalidations are queued, we can't use the base resource * for unsychronized mappings. This points to the latest version of * the buffer after the latest invalidation. It's only used for unsychro- * nized mappings in the non-driver thread. Initially it's set to &b. */ struct pipe_resource *latest; /* The buffer range which is initialized (with a write transfer, streamout, * or writable shader resources). The remainder of the buffer is considered * invalid and can be mapped unsynchronized. * * This allows unsychronized mapping of a buffer range which hasn't been * used yet. It's for applications which forget to use the unsynchronized * map flag and expect the driver to figure it out. * * Drivers should set this to the full range for buffers backed by user * memory. */ struct util_range valid_buffer_range; /* If "this" is not the base instance of the buffer, but it's one of its * reallocations (set in "latest" of the base instance), this points to * the valid range of the base instance. 
It's used for transfers after * a buffer invalidation, because such transfers operate on "latest", not * the base instance. Initially it's set to &valid_buffer_range. */ struct util_range *base_valid_buffer_range; /* Drivers are required to update this for shared resources and user * pointers. */ bool is_shared; bool is_user_ptr; }; struct threaded_transfer { struct pipe_transfer b; /* Staging buffer for DISCARD_RANGE transfers. */ struct pipe_resource *staging; /* Offset into the staging buffer, because the backing buffer is * sub-allocated. */ unsigned offset; }; struct threaded_query { /* The query is added to the list in end_query and removed in flush. */ struct list_head head_unflushed; /* Whether pipe->flush has been called after end_query. */ bool flushed; }; /* This is the second half of tc_call containing call data. * Most calls will typecast this to the type they need, typically larger * than 8 bytes. */ union tc_payload { struct pipe_query *query; struct pipe_resource *resource; struct pipe_transfer *transfer; uint64_t handle; }; #ifdef _MSC_VER #define ALIGN16 __declspec(align(16)) #else #define ALIGN16 __attribute__((aligned(16))) #endif /* Each call slot should be aligned to its own size for optimal cache usage. */ struct ALIGN16 tc_call { unsigned sentinel; ushort num_call_slots; ushort call_id; union tc_payload payload; }; struct tc_batch { struct pipe_context *pipe; unsigned sentinel; unsigned num_total_call_slots; struct util_queue_fence fence; struct tc_call call[TC_CALLS_PER_BATCH]; }; struct threaded_context { struct pipe_context base; struct pipe_context *pipe; struct slab_child_pool pool_transfers; tc_replace_buffer_storage_func replace_buffer_storage; unsigned map_buffer_alignment; struct list_head unflushed_queries; /* Counters for the HUD. 
*/ unsigned num_offloaded_slots; unsigned num_direct_slots; unsigned num_syncs; struct util_queue queue; struct util_queue_fence *fence; unsigned last, next; struct tc_batch batch_slots[TC_MAX_BATCHES]; }; void threaded_resource_init(struct pipe_resource *res); void threaded_resource_deinit(struct pipe_resource *res); struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe); struct pipe_context * threaded_context_create(struct pipe_context *pipe, struct slab_parent_pool *parent_transfer_pool, tc_replace_buffer_storage_func replace_buffer, struct threaded_context **out); static inline struct threaded_context * threaded_context(struct pipe_context *pipe) { return (struct threaded_context*)pipe; } static inline struct threaded_resource * threaded_resource(struct pipe_resource *res) { return (struct threaded_resource*)res; } static inline struct threaded_query * threaded_query(struct pipe_query *q) { return (struct threaded_query*)q; } static inline struct threaded_transfer * threaded_transfer(struct pipe_transfer *transfer) { return (struct threaded_transfer*)transfer; } #endif