From 98c05c52b1b11cf59051ccaf42d906e34aeda7a0 Mon Sep 17 00:00:00 2001
From: Jonathan Gray
Date: Wed, 27 Feb 2013 02:18:23 +1100
Subject: sync execbuffer domain tracking and flushing with linux 3.4.33

---
 sys/dev/pci/drm/drmP.h                |   2 -
 sys/dev/pci/drm/i915_drv.h            |   1 -
 sys/dev/pci/drm/i915_gem_execbuffer.c | 283 ++++++++++++++++++++++++++--------
 3 files changed, 218 insertions(+), 68 deletions(-)

diff --git a/sys/dev/pci/drm/drmP.h b/sys/dev/pci/drm/drmP.h
index a2b638059a7..fc32a2440ba 100644
--- a/sys/dev/pci/drm/drmP.h
+++ b/sys/dev/pci/drm/drmP.h
@@ -697,8 +697,6 @@ struct drm_device {
 	atomic_t		gtt_count;
 	atomic_t		gtt_memory;
 	uint32_t		gtt_total;
-	uint32_t		invalidate_domains;
-	uint32_t		flush_domains;
 
 	SPLAY_HEAD(drm_name_tree, drm_obj)	name_tree;
 	struct pool				objpl;
diff --git a/sys/dev/pci/drm/i915_drv.h b/sys/dev/pci/drm/i915_drv.h
index e42caa3d5d9..2c1cc3c0bbb 100644
--- a/sys/dev/pci/drm/i915_drv.h
+++ b/sys/dev/pci/drm/i915_drv.h
@@ -1054,7 +1054,6 @@ int i915_gem_put_relocs_to_user(struct drm_i915_gem_exec_object2 *,
     u_int32_t, struct drm_i915_gem_relocation_entry *);
 void i915_dispatch_gem_execbuffer(struct intel_ring_buffer *,
     struct drm_i915_gem_execbuffer2 *, uint64_t);
-void i915_gem_object_set_to_gpu_domain(struct drm_obj *);
 int i915_gem_object_pin_and_relocate(struct drm_obj *,
     struct drm_file *, struct drm_i915_gem_exec_object2 *,
     struct drm_i915_gem_relocation_entry *);
diff --git a/sys/dev/pci/drm/i915_gem_execbuffer.c b/sys/dev/pci/drm/i915_gem_execbuffer.c
index a72354a049e..86c47295672 100644
--- a/sys/dev/pci/drm/i915_gem_execbuffer.c
+++ b/sys/dev/pci/drm/i915_gem_execbuffer.c
@@ -55,8 +55,25 @@
 #include
 #include
 
+struct change_domains {
+	uint32_t invalidate_domains;
+	uint32_t flush_domains;
+	uint32_t flush_rings;
+	uint32_t flips;
+};
+
 int i915_reset_gen7_sol_offsets(struct drm_device *,
     struct intel_ring_buffer *);
+int i915_gem_execbuffer_flush(struct drm_device *, uint32_t, uint32_t,
+    uint32_t);
+bool intel_enable_semaphores(struct drm_device *);
+int i915_gem_execbuffer_sync_rings(struct drm_i915_gem_object *,
+    struct intel_ring_buffer *);
+int i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *, u32);
+int i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *,
+    struct drm_obj **, int);
+void i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *,
+    struct intel_ring_buffer *, struct change_domains *);
 
 /*
  * Set the next domain for the specified object. This
@@ -128,7 +145,7 @@ int i915_reset_gen7_sol_offsets(struct drm_device *,
  * 4. set_domain (CPU, CPU)
  *	flush_domains gets GPU
  *	invalidate_domains gets CPU
- *	flush_gpu_write (obj) to make sure all drawing is complete.
+ *	wait_rendering (obj) to make sure all drawing is complete.
  *	This will include an MI_FLUSH to get the data from GPU
  *	to memory
  * clflush (obj) to invalidate the CPU cache
@@ -170,25 +187,18 @@ int i915_reset_gen7_sol_offsets(struct drm_device *,
  * drm_agp_chipset_flush
  */
 void
-i915_gem_object_set_to_gpu_domain(struct drm_obj *obj)
+i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
+    struct intel_ring_buffer *ring,
+    struct change_domains *cd)
 {
-	struct drm_device	*dev = obj->dev;
-	struct inteldrm_softc	*dev_priv = dev->dev_private;
-	struct drm_i915_gem_object *obj_priv = to_intel_bo(obj);
-	u_int32_t		 invalidate_domains = 0;
-	u_int32_t		 flush_domains = 0;
-
-	DRM_ASSERT_HELD(obj);
-	KASSERT((obj->pending_read_domains & I915_GEM_DOMAIN_CPU) == 0);
-	KASSERT(obj->pending_write_domain != I915_GEM_DOMAIN_CPU);
+	uint32_t invalidate_domains = 0, flush_domains = 0;
+
 	/*
 	 * If the object isn't moving to a new write domain,
 	 * let the object stay in multiple read domains
 	 */
-	if (obj->pending_write_domain == 0)
-		obj->pending_read_domains |= obj->read_domains;
-	else
-		obj_priv->dirty = 1;
+	if (obj->base.pending_write_domain == 0)
+		obj->base.pending_read_domains |= obj->base.read_domains;
 
 	/*
 	 * Flush the current write domain if
@@ -196,41 +206,40 @@ i915_gem_object_set_to_gpu_domain(struct drm_obj *obj)
 	 * any read domains which differ from the old
 	 * write domain
 	 */
-	if (obj->write_domain &&
-	    obj->write_domain != obj->pending_read_domains) {
-		flush_domains |= obj->write_domain;
-		invalidate_domains |= obj->pending_read_domains &
-		    ~obj->write_domain;
+	if (obj->base.write_domain &&
+	    (((obj->base.write_domain != obj->base.pending_read_domains ||
+	    obj->ring != ring)) ||
+	    (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
+		flush_domains |= obj->base.write_domain;
+		invalidate_domains |=
+		    obj->base.pending_read_domains & ~obj->base.write_domain;
 	}
 	/*
 	 * Invalidate any read caches which may have
 	 * stale data. That is, any new read domains.
 	 */
-	invalidate_domains |= obj->pending_read_domains & ~obj->read_domains;
-	/* clflush the cpu now, gpu caches get queued. */
-	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU) {
-		bus_dmamap_sync(dev_priv->agpdmat, obj_priv->dmamap, 0,
-		    obj->size, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
-	}
-	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_GTT) {
-		inteldrm_wipe_mappings(obj);
-	}
+	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
+	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
+		i915_gem_clflush_object(obj);
+
+	if (obj->base.pending_write_domain)
+		cd->flips |= atomic_read(&obj->pending_flip);
 
 	/* The actual obj->write_domain will be updated with
-	 * pending_write_domain after we emit the accumulated flush for all of
-	 * the domain changes in execuffer (which clears object's write
-	 * domains). So if we have a current write domain that we aren't
-	 * changing, set pending_write_domain to it.
+	 * pending_write_domain after we emit the accumulated flush for all
+	 * of our domain changes in execbuffers (which clears objects'
+	 * write_domains). So if we have a current write domain that we
+	 * aren't changing, set pending_write_domain to that.
 	 */
-	if (flush_domains == 0 && obj->pending_write_domain == 0 &&
-	    (obj->pending_read_domains == obj->write_domain ||
-	    obj->write_domain == 0))
-		obj->pending_write_domain = obj->write_domain;
-	obj->read_domains = obj->pending_read_domains;
-	obj->pending_read_domains = 0;
-
-	dev->invalidate_domains |= invalidate_domains;
-	dev->flush_domains |= flush_domains;
+	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
+		obj->base.pending_write_domain = obj->base.write_domain;
+
+	cd->invalidate_domains |= invalidate_domains;
+	cd->flush_domains |= flush_domains;
+	if (flush_domains & I915_GEM_GPU_DOMAINS)
+		cd->flush_rings |= intel_ring_flag(obj->ring);
+	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
+		cd->flush_rings |= intel_ring_flag(ring);
 }
 
 // struct eb_objects {
@@ -246,11 +255,169 @@ i915_gem_object_set_to_gpu_domain(struct drm_obj *obj)
 // pin_and_fence_object
 // i915_gem_execbuffer_reserve
 // i915_gem_execbuffer_relocate_slow
-// i915_gem_execbuffer_flush
-// intel_enable_semaphores
-// i915_gem_execbuffer_sync_rings
-// i915_gem_execbuffer_wait_for_flips
-// i915_gem_execbuffer_move_to_gpu
+
+int
+i915_gem_execbuffer_flush(struct drm_device *dev,
+    uint32_t invalidate_domains,
+    uint32_t flush_domains,
+    uint32_t flush_rings)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	int i, ret;
+
+	if (flush_domains & I915_GEM_DOMAIN_CPU)
+		inteldrm_chipset_flush(dev_priv);
+
+	if (flush_domains & I915_GEM_DOMAIN_GTT)
+		DRM_WRITEMEMORYBARRIER();
+
+	if ((flush_domains | invalidate_domains) & I915_GEM_GPU_DOMAINS) {
+		for (i = 0; i < I915_NUM_RINGS; i++)
+			if (flush_rings & (1 << i)) {
+				ret = i915_gem_flush_ring(&dev_priv->rings[i],
+				    invalidate_domains,
+				    flush_domains);
+				if (ret)
+					return ret;
+			}
+	}
+
+	return 0;
+}
+
+bool
+intel_enable_semaphores(struct drm_device *dev)
+{
+	return 0;
+#ifdef notyet
+	if (INTEL_INFO(dev)->gen < 6)
+		return 0;
+
+	if (i915_semaphores >= 0)
+		return i915_semaphores;
+
+	/* Disable semaphores on SNB */
+	if (INTEL_INFO(dev)->gen == 6)
+		return 0;
+
+	return 1;
+#endif
+}
+
+int
+i915_gem_execbuffer_sync_rings(struct drm_i915_gem_object *obj,
+    struct intel_ring_buffer *to)
+{
+	struct intel_ring_buffer *from = obj->ring;
+//	u32 seqno;
+//	int ret, idx;
+
+	if (from == NULL || to == from)
+		return 0;
+
+	/* XXX gpu semaphores are implicated in various hard hangs on SNB */
+//	if (!intel_enable_semaphores(obj->base.dev))
+		return i915_gem_object_wait_rendering(obj, false);
+#ifdef notyet
+	idx = intel_ring_sync_index(from, to);
+
+	seqno = obj->last_rendering_seqno;
+	if (seqno <= from->sync_seqno[idx])
+		return 0;
+
+	if (seqno == from->outstanding_lazy_request) {
+		struct drm_i915_gem_request *request;
+
+		request = kzalloc(sizeof(*request), GFP_KERNEL);
+		if (request == NULL)
+			return -ENOMEM;
+
+		ret = i915_add_request(from, NULL, request);
+		if (ret) {
+			kfree(request);
+			return ret;
+		}
+
+		seqno = request->seqno;
+	}
+
+	from->sync_seqno[idx] = seqno;
+
+	return to->sync_to(to, from, seqno - 1);
+#endif
+}
+
+int
+i915_gem_execbuffer_wait_for_flips(struct intel_ring_buffer *ring, u32 flips)
+{
+	u32 plane, flip_mask;
+	int ret;
+
+	/* Check for any pending flips. As we only maintain a flip queue depth
+	 * of 1, we can simply insert a WAIT for the next display flip prior
+	 * to executing the batch and avoid stalling the CPU.
+	 */
+
+	for (plane = 0; flips >> plane; plane++) {
+		if (((flips >> plane) & 1) == 0)
+			continue;
+
+		if (plane)
+			flip_mask = MI_WAIT_FOR_PLANE_B_FLIP;
+		else
+			flip_mask = MI_WAIT_FOR_PLANE_A_FLIP;
+
+		ret = intel_ring_begin(ring, 2);
+		if (ret)
+			return ret;
+
+		intel_ring_emit(ring, MI_WAIT_FOR_EVENT | flip_mask);
+		intel_ring_emit(ring, MI_NOOP);
+		intel_ring_advance(ring);
+	}
+
+	return 0;
+}
+
+int
+i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
+    struct drm_obj **object_list, int buffer_count)
+{
+	struct drm_i915_gem_object *obj;
+	struct change_domains cd;
+	int ret, i;
+
+	memset(&cd, 0, sizeof(cd));
+	for (i = 0; i < buffer_count; i++) {
+		obj = to_intel_bo(object_list[i]);
+		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);
+	}
+
+	if (cd.invalidate_domains | cd.flush_domains) {
+		ret = i915_gem_execbuffer_flush(ring->dev,
+		    cd.invalidate_domains,
+		    cd.flush_domains,
+		    cd.flush_rings);
+		if (ret)
+			return ret;
+	}
+
+	if (cd.flips) {
+		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < buffer_count; i++) {
+		obj = to_intel_bo(object_list[i]);
+		ret = i915_gem_execbuffer_sync_rings(obj, ring);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 // i915_gem_check_execbuffer
 // validate_exec_list
 // i915_gem_execbuffer_move_to_active
@@ -456,24 +623,10 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 	}
 	batch_obj->pending_read_domains |= I915_GEM_DOMAIN_COMMAND;
 
-	inteldrm_verify_inactive(dev_priv, __FILE__, __LINE__);
-
-	/*
-	 * Zero the global flush/invalidate flags. These will be modified as
-	 * new domains are computed for each object
-	 */
-	dev->invalidate_domains = 0;
-	dev->flush_domains = 0;
-
-	/* Compute new gpu domains and update invalidate/flush */
-	for (i = 0; i < args->buffer_count; i++)
-		i915_gem_object_set_to_gpu_domain(object_list[i]);
-
-	inteldrm_verify_inactive(dev_priv, __FILE__, __LINE__);
-
-	/* flush and invalidate any domains that need them. */
-	(void)i915_gem_flush_ring(ring, dev->invalidate_domains,
-	    dev->flush_domains);
+	ret = i915_gem_execbuffer_move_to_gpu(ring, object_list,
+	    args->buffer_count);
+	if (ret)
+		goto err;
 
 	/*
 	 * update the write domains, and fence/gpu write accounting information.
-- 
cgit v1.2.3
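Editor's note: the core idea this patch brings over from Linux is easiest to see in isolation. Instead of tracking invalidate/flush state globally on drm_device, each object contributes the caches it needs flushed or invalidated to a per-execbuffer accumulator (struct change_domains), and only after every object has been examined is one combined flush emitted per affected ring. The sketch below is a minimal, standalone C illustration of that accumulate-then-flush pattern; the toy_* types, domain bits and the scenario in main() are simplified stand-ins invented for this note, not the driver structures in the diff above.

	/*
	 * Standalone illustration of the accumulate-then-flush pattern used by
	 * i915_gem_execbuffer_move_to_gpu() above.  All names here (toy_object,
	 * toy_change_domains, TOY_DOMAIN_*) are invented for this sketch.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define TOY_DOMAIN_CPU		(1u << 0)
	#define TOY_DOMAIN_GTT		(1u << 1)
	#define TOY_DOMAIN_RENDER	(1u << 2)
	#define TOY_GPU_DOMAINS		TOY_DOMAIN_RENDER

	struct toy_change_domains {
		uint32_t invalidate_domains;	/* caches that must be invalidated */
		uint32_t flush_domains;		/* caches that must be flushed */
		uint32_t flush_rings;		/* bitmask of rings needing a flush */
	};

	struct toy_object {
		uint32_t read_domains;		/* current read domains */
		uint32_t write_domain;		/* current write domain (0 or one bit) */
		uint32_t pending_read_domains;	/* domains this execbuffer will read */
		uint32_t pending_write_domain;	/* domain this execbuffer will write */
		unsigned ring_id;		/* ring the object was last used on */
	};

	/* Per-object step: record what this object adds to the combined flush. */
	static void
	toy_set_to_gpu_domain(struct toy_object *obj, unsigned target_ring,
	    struct toy_change_domains *cd)
	{
		uint32_t invalidate = 0, flush = 0;

		/* Not gaining a writer: keep the existing read domains too. */
		if (obj->pending_write_domain == 0)
			obj->pending_read_domains |= obj->read_domains;

		/* Flush the old write domain if the new readers differ from it. */
		if (obj->write_domain &&
		    obj->write_domain != obj->pending_read_domains) {
			flush |= obj->write_domain;
			invalidate |= obj->pending_read_domains & ~obj->write_domain;
		}

		/* Invalidate any read cache that is new for this object. */
		invalidate |= obj->pending_read_domains & ~obj->read_domains;

		/* Accumulate; nothing is flushed yet. */
		cd->invalidate_domains |= invalidate;
		cd->flush_domains |= flush;
		if (flush & TOY_GPU_DOMAINS)
			cd->flush_rings |= 1u << obj->ring_id;
		if (invalidate & TOY_GPU_DOMAINS)
			cd->flush_rings |= 1u << target_ring;
	}

	int
	main(void)
	{
		/* One object previously written by the CPU, now read by the GPU. */
		struct toy_object obj = {
			.read_domains = TOY_DOMAIN_CPU,
			.write_domain = TOY_DOMAIN_CPU,
			.pending_read_domains = TOY_DOMAIN_RENDER,
			.pending_write_domain = 0,
			.ring_id = 0,
		};
		struct toy_change_domains cd = { 0 };

		/* Accumulate per-object requirements, then flush once. */
		toy_set_to_gpu_domain(&obj, 0, &cd);

		printf("invalidate 0x%x flush 0x%x rings 0x%x\n",
		    (unsigned)cd.invalidate_domains,
		    (unsigned)cd.flush_domains,
		    (unsigned)cd.flush_rings);
		return 0;
	}

Compiled on its own, the example reports flush 0x1 (the CPU cache of the former writer), invalidate 0x4 (the render cache about to read) and rings 0x1 (ring 0 needs the flush) -- the same kind of decision i915_gem_execbuffer_move_to_gpu() hands to i915_gem_execbuffer_flush() in a single call, rather than flushing per object as the removed code did.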