| author | Jonathan Gray <jsg@cvs.openbsd.org> | 2024-01-16 23:38:15 +0000 |
|---|---|---|
| committer | Jonathan Gray <jsg@cvs.openbsd.org> | 2024-01-16 23:38:15 +0000 |
| commit | 05d4111894ef4fd94ed8efa99fca0c508e89db16 (patch) | |
| tree | e57cb1ce48f5e92fee1882516089e05f72d15426 /sys/dev/pci/drm/amd/amdkfd/kfd_svm.c | |
| parent | 77a5aef715cfd74dd54c674bc228dfc80962248a (diff) | |
update drm to linux 6.6.12
Thanks to the OpenBSD Foundation for sponsoring this work.
Diffstat (limited to 'sys/dev/pci/drm/amd/amdkfd/kfd_svm.c')
-rw-r--r-- | sys/dev/pci/drm/amd/amdkfd/kfd_svm.c | 532 |
1 file changed, 320 insertions, 212 deletions
diff --git a/sys/dev/pci/drm/amd/amdkfd/kfd_svm.c b/sys/dev/pci/drm/amd/amdkfd/kfd_svm.c index 208812512d8..63ce30ea689 100644 --- a/sys/dev/pci/drm/amd/amdkfd/kfd_svm.c +++ b/sys/dev/pci/drm/amd/amdkfd/kfd_svm.c @@ -23,10 +23,14 @@ #include <linux/types.h> #include <linux/sched/task.h> +#include <linux/dynamic_debug.h> +#include <drm/ttm/ttm_tt.h> +#include <drm/drm_exec.h> + #include "amdgpu_sync.h" #include "amdgpu_object.h" #include "amdgpu_vm.h" -#include "amdgpu_mn.h" +#include "amdgpu_hmm.h" #include "amdgpu.h" #include "amdgpu_xgmi.h" #include "kfd_priv.h" @@ -45,6 +49,13 @@ * page table is updated. */ #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC) +#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG) +#define dynamic_svm_range_dump(svms) \ + _dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms) +#else +#define dynamic_svm_range_dump(svms) \ + do { if (0) svm_range_debug_dump(svms); } while (0) +#endif /* Giant svm range split into smaller ranges based on this, it is decided using * minimum of all dGPU/APU 1/32 VRAM size, between 2MB to 1GB and alignment to @@ -169,12 +180,11 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, page = hmm_pfn_to_page(hmm_pfns[i]); if (is_zone_device_page(page)) { - struct amdgpu_device *bo_adev = - amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); + struct amdgpu_device *bo_adev = prange->svm_bo->node->adev; addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + bo_adev->vm_manager.vram_base_offset - - bo_adev->kfd.dev->pgmap.range.start; + bo_adev->kfd.pgmap.range.start; addr[i] |= SVM_RANGE_VRAM_DOMAIN; pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]); continue; @@ -239,7 +249,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, } } -void svm_range_free_dma_mappings(struct svm_range *prange) +void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma) { struct kfd_process_device *pdd; dma_addr_t *dma_addr; @@ -259,14 +269,15 @@ void svm_range_free_dma_mappings(struct svm_range *prange) pr_debug("failed to find device idx %d\n", gpuidx); continue; } - dev = &pdd->dev->pdev->dev; - svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); + dev = &pdd->dev->adev->pdev->dev; + if (unmap_dma) + svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); kvfree(dma_addr); prange->dma_addr[gpuidx] = NULL; } } -static void svm_range_free(struct svm_range *prange, bool update_mem_usage) +static void svm_range_free(struct svm_range *prange, bool do_unmap) { uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT; struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); @@ -275,12 +286,12 @@ static void svm_range_free(struct svm_range *prange, bool update_mem_usage) prange->start, prange->last); svm_range_vram_node_free(prange); - svm_range_free_dma_mappings(prange); + svm_range_free_dma_mappings(prange, do_unmap); - if (update_mem_usage && !p->xnack_enabled) { + if (do_unmap && !p->xnack_enabled) { pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size); amdgpu_amdkfd_unreserve_mem_limit(NULL, size, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); } mutex_destroy(&prange->lock); mutex_destroy(&prange->migrate_mutex); @@ -313,7 +324,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, p = container_of(svms, struct kfd_process, svms); if (!p->xnack_enabled && update_mem_usage && amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)) { + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) 
{ pr_info("SVM mapping failed, exceeds resident system memory limit\n"); kfree(prange); return NULL; @@ -423,10 +434,8 @@ static void svm_range_bo_unref(struct svm_range_bo *svm_bo) } static bool -svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) +svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange) { - struct amdgpu_device *bo_adev; - mutex_lock(&prange->lock); if (!prange->svm_bo) { mutex_unlock(&prange->lock); @@ -439,12 +448,11 @@ svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange) } if (svm_bo_ref_unless_zero(prange->svm_bo)) { /* - * Migrate from GPU to GPU, remove range from source bo_adev - * svm_bo range list, and return false to allocate svm_bo from - * destination adev. + * Migrate from GPU to GPU, remove range from source svm_bo->node + * range list, and return false to allocate svm_bo from destination + * node. */ - bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); - if (bo_adev != adev) { + if (prange->svm_bo->node != node) { mutex_unlock(&prange->lock); spin_lock(&prange->svm_bo->list_lock); @@ -512,7 +520,7 @@ static struct svm_range_bo *svm_range_bo_new(void) } int -svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, +svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, bool clear) { struct amdgpu_bo_param bp; @@ -527,7 +535,7 @@ svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms, prange->start, prange->last); - if (svm_range_validate_svm_bo(adev, prange)) + if (svm_range_validate_svm_bo(node, prange)) return 0; svm_bo = svm_range_bo_new(); @@ -541,6 +549,7 @@ svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, kfree(svm_bo); return -ESRCH; } + svm_bo->node = node; svm_bo->eviction_fence = amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1), mm, @@ -557,13 +566,20 @@ svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange, bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE; bp.type = ttm_bo_type_device; bp.resv = NULL; + if (node->xcp) + bp.xcp_id_plus1 = node->xcp->id + 1; - r = amdgpu_bo_create_user(adev, &bp, &ubo); + r = amdgpu_bo_create_user(node->adev, &bp, &ubo); if (r) { pr_debug("failed %d to create bo\n", r); goto create_bo_failed; } bo = &ubo->bo; + + pr_debug("alloc bo at offset 0x%lx size 0x%lx on partition %d\n", + bo->tbo.resource->start << PAGE_SHIFT, bp.size, + bp.xcp_id_plus1 - 1); + r = amdgpu_bo_reserve(bo, true); if (r) { pr_debug("failed %d to reserve bo\n", r); @@ -623,45 +639,30 @@ void svm_range_vram_node_free(struct svm_range *prange) mutex_unlock(&prange->lock); } -struct amdgpu_device * -svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id) +struct kfd_node * +svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id) { - struct kfd_process_device *pdd; struct kfd_process *p; - int32_t gpu_idx; + struct kfd_process_device *pdd; p = container_of(prange->svms, struct kfd_process, svms); - - gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id); - if (gpu_idx < 0) { - pr_debug("failed to get device by id 0x%x\n", gpu_id); - return NULL; - } - pdd = kfd_process_device_from_gpuidx(p, gpu_idx); + pdd = kfd_process_device_data_by_id(p, gpu_id); if (!pdd) { - pr_debug("failed to get device by idx 0x%x\n", gpu_idx); + pr_debug("failed to get kfd process device by id 0x%x\n", gpu_id); return NULL; } - return pdd->dev->adev; + return pdd->dev; } struct kfd_process_device 
* -svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev) +svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node) { struct kfd_process *p; - int32_t gpu_idx, gpuid; - int r; p = container_of(prange->svms, struct kfd_process, svms); - r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpu_idx); - if (r) { - pr_debug("failed to get device id by adev %p\n", adev); - return NULL; - } - - return kfd_process_device_from_gpuidx(p, gpu_idx); + return kfd_get_process_device_data(node, p); } static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) @@ -741,7 +742,9 @@ svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, case KFD_IOCTL_SVM_ATTR_ACCESS: case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: case KFD_IOCTL_SVM_ATTR_NO_ACCESS: - *update_mapping = true; + if (!p->xnack_enabled) + *update_mapping = true; + gpuidx = kfd_process_gpuidx_from_gpuid(p, attrs[i].value); if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { @@ -864,6 +867,37 @@ static void svm_range_debug_dump(struct svm_range_list *svms) } } +static void * +svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements, + uint64_t offset) +{ + unsigned char *dst; + + dst = kvmalloc_array(num_elements, size, GFP_KERNEL); + if (!dst) + return NULL; + memcpy(dst, (unsigned char *)psrc + offset, num_elements * size); + + return (void *)dst; +} + +static int +svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src) +{ + int i; + + for (i = 0; i < MAX_GPU_INSTANCE; i++) { + if (!src->dma_addr[i]) + continue; + dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i], + sizeof(*src->dma_addr[i]), src->npages, 0); + if (!dst->dma_addr[i]) + return -ENOMEM; + } + + return 0; +} + static int svm_range_split_array(void *ppnew, void *ppold, size_t size, uint64_t old_start, uint64_t old_n, @@ -878,22 +912,16 @@ svm_range_split_array(void *ppnew, void *ppold, size_t size, if (!pold) return 0; - new = kvmalloc_array(new_n, size, GFP_KERNEL); + d = (new_start - old_start) * size; + new = svm_range_copy_array(pold, size, new_n, d); if (!new) return -ENOMEM; - - d = (new_start - old_start) * size; - memcpy(new, pold + d, new_n * size); - - old = kvmalloc_array(old_n, size, GFP_KERNEL); + d = (new_start == old_start) ? new_n * size : 0; + old = svm_range_copy_array(pold, size, old_n, d); if (!old) { kvfree(new); return -ENOMEM; } - - d = (new_start == old_start) ? 
new_n * size : 0; - memcpy(old, pold + d, old_n * size); - kvfree(pold); *(void **)ppold = old; *(void **)ppnew = new; @@ -1152,31 +1180,39 @@ svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm, } return 0; } +static bool +svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b) +{ + return (node_a->adev == node_b->adev || + amdgpu_xgmi_same_hive(node_a->adev, node_b->adev)); +} static uint64_t -svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, - int domain) +svm_range_get_pte_flags(struct kfd_node *node, + struct svm_range *prange, int domain) { - struct amdgpu_device *bo_adev; + struct kfd_node *bo_node; uint32_t flags = prange->flags; uint32_t mapping_flags = 0; uint64_t pte_flags; bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; + bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ + unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) - bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); + bo_node = prange->svm_bo->node; - switch (KFD_GC_VERSION(adev->kfd.dev)) { + switch (node->adev->ip_versions[GC_HWIP][0]) { case IP_VERSION(9, 4, 1): if (domain == SVM_RANGE_VRAM_DOMAIN) { - if (bo_adev == adev) { + if (bo_node == node) { mapping_flags |= coherent ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; } else { mapping_flags |= coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; - if (amdgpu_xgmi_same_hive(adev, bo_adev)) + if (svm_nodes_in_same_hive(node, bo_node)) snoop = true; } } else { @@ -1186,15 +1222,15 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, break; case IP_VERSION(9, 4, 2): if (domain == SVM_RANGE_VRAM_DOMAIN) { - if (bo_adev == adev) { + if (bo_node == node) { mapping_flags |= coherent ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; - if (adev->gmc.xgmi.connected_to_cpu) + if (node->adev->gmc.xgmi.connected_to_cpu) snoop = true; } else { mapping_flags |= coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; - if (amdgpu_xgmi_same_hive(adev, bo_adev)) + if (svm_nodes_in_same_hive(node, bo_node)) snoop = true; } } else { @@ -1202,6 +1238,37 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; } break; + case IP_VERSION(9, 4, 3): + mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : + (amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW); + snoop = true; + if (uncached) { + mapping_flags |= AMDGPU_VM_MTYPE_UC; + } else if (domain == SVM_RANGE_VRAM_DOMAIN) { + /* local HBM region close to partition */ + if (bo_node->adev == node->adev && + (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) + mapping_flags |= mtype_local; + /* local HBM region far from partition or remote XGMI GPU */ + else if (svm_nodes_in_same_hive(bo_node, node)) + mapping_flags |= AMDGPU_VM_MTYPE_NC; + /* PCIe P2P */ + else + mapping_flags |= AMDGPU_VM_MTYPE_UC; + /* system memory accessed by the APU */ + } else if (node->adev->flags & AMD_IS_APU) { + /* On NUMA systems, locality is determined per-page + * in amdgpu_gmc_override_vm_pte_flags + */ + if (num_possible_nodes() <= 1) + mapping_flags |= mtype_local; + else + mapping_flags |= AMDGPU_VM_MTYPE_NC; + /* system memory accessed by the dGPU */ + } else { + mapping_flags |= AMDGPU_VM_MTYPE_UC; + } + break; default: mapping_flags |= coherent ? 
AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; @@ -1218,7 +1285,7 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; - pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags); + pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags); return pte_flags; } @@ -1325,7 +1392,7 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n", last_start, prange->start + i, last_domain ? "GPU" : "CPU"); - pte_flags = svm_range_get_pte_flags(adev, prange, last_domain); + pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain); if (readonly) pte_flags &= ~AMDGPU_PTE_WRITEABLE; @@ -1334,6 +1401,10 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, pte_flags); + /* For dGPU mode, we use same vm_manager to allocate VRAM for + * different memory partition based on fpfn/lpfn, we should use + * same vm_manager.vram_base_offset regardless memory partition. + */ r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL, last_start, prange->start + i, pte_flags, @@ -1371,16 +1442,14 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, unsigned long *bitmap, bool wait, bool flush_tlb) { struct kfd_process_device *pdd; - struct amdgpu_device *bo_adev; + struct amdgpu_device *bo_adev = NULL; struct kfd_process *p; struct dma_fence *fence = NULL; uint32_t gpuidx; int r = 0; if (prange->svm_bo && prange->ttm_res) - bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); - else - bo_adev = NULL; + bo_adev = prange->svm_bo->node->adev; p = container_of(prange->svms, struct kfd_process, svms); for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { @@ -1429,37 +1498,34 @@ struct svm_validate_context { struct svm_range *prange; bool intr; DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); - struct ttm_validate_buffer tv[MAX_GPU_INSTANCE]; - struct list_head validate_list; - struct ww_acquire_ctx ticket; + struct drm_exec exec; }; -static int svm_range_reserve_bos(struct svm_validate_context *ctx) +static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr) { struct kfd_process_device *pdd; struct amdgpu_vm *vm; uint32_t gpuidx; int r; - INIT_LIST_HEAD(&ctx->validate_list); - for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { - pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); - if (!pdd) { - pr_debug("failed to find device idx %d\n", gpuidx); - return -EINVAL; - } - vm = drm_priv_to_vm(pdd->drm_priv); - - ctx->tv[gpuidx].bo = &vm->root.bo->tbo; - ctx->tv[gpuidx].num_shared = 4; - list_add(&ctx->tv[gpuidx].head, &ctx->validate_list); - } + drm_exec_init(&ctx->exec, intr ? 
DRM_EXEC_INTERRUPTIBLE_WAIT: 0); + drm_exec_until_all_locked(&ctx->exec) { + for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { + pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx); + if (!pdd) { + pr_debug("failed to find device idx %d\n", gpuidx); + r = -EINVAL; + goto unreserve_out; + } + vm = drm_priv_to_vm(pdd->drm_priv); - r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list, - ctx->intr, NULL); - if (r) { - pr_debug("failed %d to reserve bo\n", r); - return r; + r = amdgpu_vm_lock_pd(vm, &ctx->exec, 2); + drm_exec_retry_on_contention(&ctx->exec); + if (unlikely(r)) { + pr_debug("failed %d to reserve bo\n", r); + goto unreserve_out; + } + } } for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { @@ -1482,13 +1548,13 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx) return 0; unreserve_out: - ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); + drm_exec_fini(&ctx->exec); return r; } static void svm_range_unreserve_bos(struct svm_validate_context *ctx) { - ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list); + drm_exec_fini(&ctx->exec); } static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) @@ -1496,6 +1562,8 @@ static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) struct kfd_process_device *pdd; pdd = kfd_process_device_from_gpuidx(p, gpuidx); + if (!pdd) + return NULL; return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev); } @@ -1528,48 +1596,54 @@ static int svm_range_validate_and_map(struct mm_struct *mm, struct svm_range *prange, int32_t gpuidx, bool intr, bool wait, bool flush_tlb) { - struct svm_validate_context ctx; + struct svm_validate_context *ctx; unsigned long start, end, addr; struct kfd_process *p; void *owner; int32_t idx; int r = 0; - ctx.process = container_of(prange->svms, struct kfd_process, svms); - ctx.prange = prange; - ctx.intr = intr; + ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->process = container_of(prange->svms, struct kfd_process, svms); + ctx->prange = prange; + ctx->intr = intr; if (gpuidx < MAX_GPU_INSTANCE) { - bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE); - bitmap_set(ctx.bitmap, gpuidx, 1); - } else if (ctx.process->xnack_enabled) { - bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); + bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE); + bitmap_set(ctx->bitmap, gpuidx, 1); + } else if (ctx->process->xnack_enabled) { + bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); /* If prefetch range to GPU, or GPU retry fault migrate range to * GPU, which has ACCESS attribute to the range, create mapping * on that GPU. 
*/ if (prange->actual_loc) { - gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process, + gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process, prange->actual_loc); if (gpuidx < 0) { WARN_ONCE(1, "failed get device by id 0x%x\n", prange->actual_loc); - return -EINVAL; + r = -EINVAL; + goto free_ctx; } if (test_bit(gpuidx, prange->bitmap_access)) - bitmap_set(ctx.bitmap, gpuidx, 1); + bitmap_set(ctx->bitmap, gpuidx, 1); } } else { - bitmap_or(ctx.bitmap, prange->bitmap_access, + bitmap_or(ctx->bitmap, prange->bitmap_access, prange->bitmap_aip, MAX_GPU_INSTANCE); } - if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE)) { - if (!prange->mapped_to_gpu) - return 0; - - bitmap_copy(ctx.bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); + if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { + bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); + if (!prange->mapped_to_gpu || + bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { + r = 0; + goto free_ctx; + } } if (prange->actual_loc && !prange->ttm_res) { @@ -1577,15 +1651,16 @@ static int svm_range_validate_and_map(struct mm_struct *mm, * svm_migrate_ram_to_vram after allocating a BO. */ WARN_ONCE(1, "VRAM BO missing during validation\n"); - return -EINVAL; + r = -EINVAL; + goto free_ctx; } - svm_range_reserve_bos(&ctx); + svm_range_reserve_bos(ctx, intr); p = container_of(prange->svms, struct kfd_process, svms); - owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap, + owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap, MAX_GPU_INSTANCE)); - for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) { + for_each_set_bit(idx, ctx->bitmap, MAX_GPU_INSTANCE) { if (kfd_svm_page_owner(p, idx) != owner) { owner = NULL; break; @@ -1594,73 +1669,72 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = prange->start << PAGE_SHIFT; end = (prange->last + 1) << PAGE_SHIFT; - for (addr = start; addr < end && !r; ) { + for (addr = start; !r && addr < end; ) { struct hmm_range *hmm_range; struct vm_area_struct *vma; - unsigned long next; + unsigned long next = 0; unsigned long offset; unsigned long npages; bool readonly; - vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start) { + vma = vma_lookup(mm, addr); + if (vma) { + readonly = !(vma->vm_flags & VM_WRITE); + + next = min(vma->vm_end, end); + npages = (next - addr) >> PAGE_SHIFT; + WRITE_ONCE(p->svms.faulting_task, current); + r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages, + readonly, owner, NULL, + &hmm_range); + WRITE_ONCE(p->svms.faulting_task, NULL); + if (r) { + pr_debug("failed %d to get svm range pages\n", r); + if (r == -EBUSY) + r = -EAGAIN; + } + } else { r = -EFAULT; - goto unreserve_out; - } - readonly = !(vma->vm_flags & VM_WRITE); - - next = min(vma->vm_end, end); - npages = (next - addr) >> PAGE_SHIFT; - WRITE_ONCE(p->svms.faulting_task, current); - r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, - addr, npages, &hmm_range, - readonly, true, owner); - WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { - pr_debug("failed %d to get svm range pages\n", r); - goto unreserve_out; } - offset = (addr - start) >> PAGE_SHIFT; - r = svm_range_dma_map(prange, ctx.bitmap, offset, npages, - hmm_range->hmm_pfns); - if (r) { - pr_debug("failed %d to dma map range\n", r); - goto unreserve_out; + if (!r) { + offset = (addr - start) >> PAGE_SHIFT; + r = svm_range_dma_map(prange, ctx->bitmap, offset, npages, + hmm_range->hmm_pfns); + if (r) + pr_debug("failed %d to dma map range\n", r); } svm_range_lock(prange); - if 
(amdgpu_hmm_range_get_pages_done(hmm_range)) { + if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; - goto unlock_out; } - if (!list_empty(&prange->child_list)) { + + if (!r && !list_empty(&prange->child_list)) { pr_debug("range split by unmap in parallel, validate again\n"); r = -EAGAIN; - goto unlock_out; } - r = svm_range_map_to_gpus(prange, offset, npages, readonly, - ctx.bitmap, wait, flush_tlb); + if (!r) + r = svm_range_map_to_gpus(prange, offset, npages, readonly, + ctx->bitmap, wait, flush_tlb); + + if (!r && next == end) + prange->mapped_to_gpu = true; -unlock_out: svm_range_unlock(prange); addr = next; } - if (addr == end) { - prange->validated_once = true; - prange->mapped_to_gpu = true; - } - -unreserve_out: - svm_range_unreserve_bos(&ctx); - + svm_range_unreserve_bos(ctx); if (!r) prange->validate_timestamp = ktime_get_boottime(); +free_ctx: + kfree(ctx); + return r; } @@ -1789,6 +1863,7 @@ out_reschedule: * @mm: current process mm_struct * @start: starting process queue number * @last: last process queue number + * @event: mmu notifier event when range is evicted or migrated * * Stop all queues of the process to ensure GPU doesn't access the memory, then * return to let CPU evict the buffer and proceed CPU pagetable update. @@ -1887,7 +1962,10 @@ static struct svm_range *svm_range_clone(struct svm_range *old) new = svm_range_new(old->svms, old->start, old->last, false); if (!new) return NULL; - + if (svm_range_copy_dma_addrs(new, old)) { + svm_range_free(new, false); + return NULL; + } if (old->svm_bo) { new->ttm_res = old->ttm_res; new->offset = old->offset; @@ -1912,14 +1990,23 @@ void svm_range_set_max_pages(struct amdgpu_device *adev) { uint64_t max_pages; uint64_t pages, _pages; + uint64_t min_pages = 0; + int i, id; + + for (i = 0; i < adev->kfd.dev->num_nodes; i++) { + if (adev->kfd.dev->nodes[i]->xcp) + id = adev->kfd.dev->nodes[i]->xcp->id; + else + id = -1; + pages = KFD_XCP_MEMORY_SIZE(adev, id) >> 17; + pages = clamp(pages, 1ULL << 9, 1ULL << 18); + pages = rounddown_pow_of_two(pages); + min_pages = min_not_zero(min_pages, pages); + } - /* 1/32 VRAM size in pages */ - pages = adev->gmc.real_vram_size >> 17; - pages = clamp(pages, 1ULL << 9, 1ULL << 18); - pages = rounddown_pow_of_two(pages); do { max_pages = READ_ONCE(max_svm_range_pages); - _pages = min_not_zero(max_pages, pages); + _pages = min_not_zero(max_pages, min_pages); } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages); } @@ -2010,7 +2097,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, next = interval_tree_iter_next(node, start, last); next_start = min(node->last, last) + 1; - if (svm_range_is_same_attrs(p, prange, nattr, attrs)) { + if (svm_range_is_same_attrs(p, prange, nattr, attrs) && + prange->mapped_to_gpu) { /* nothing to do */ } else if (node->start < start || node->last > last) { /* node intersects the update range and its attributes @@ -2178,7 +2266,15 @@ restart: pr_debug("drain retry fault gpu %d svms %p\n", i, svms); amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, - &pdd->dev->adev->irq.ih1); + pdd->dev->adev->irq.retry_cam_enabled ? 
+ &pdd->dev->adev->irq.ih : + &pdd->dev->adev->irq.ih1); + + if (pdd->dev->adev->irq.retry_cam_enabled) + amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, + &pdd->dev->adev->irq.ih_soft); + + pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); } if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain) @@ -2505,29 +2601,31 @@ svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, */ static int32_t svm_range_best_restore_location(struct svm_range *prange, - struct amdgpu_device *adev, + struct kfd_node *node, int32_t *gpuidx) { - struct amdgpu_device *bo_adev, *preferred_adev; + struct kfd_node *bo_node, *preferred_node; struct kfd_process *p; uint32_t gpuid; int r; p = container_of(prange->svms, struct kfd_process, svms); - r = kfd_process_gpuid_from_adev(p, adev, &gpuid, gpuidx); + r = kfd_process_gpuid_from_node(p, node, &gpuid, gpuidx); if (r < 0) { pr_debug("failed to get gpuid from kgd\n"); return -1; } + if (node->adev->gmc.is_app_apu) + return 0; + if (prange->preferred_loc == gpuid || prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { return prange->preferred_loc; } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { - preferred_adev = svm_range_get_adev_by_id(prange, - prange->preferred_loc); - if (amdgpu_xgmi_same_hive(adev, preferred_adev)) + preferred_node = svm_range_get_node_by_id(prange, prange->preferred_loc); + if (preferred_node && svm_nodes_in_same_hive(node, preferred_node)) return prange->preferred_loc; /* fall through */ } @@ -2539,8 +2637,8 @@ svm_range_best_restore_location(struct svm_range *prange, if (!prange->actual_loc) return 0; - bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc); - if (amdgpu_xgmi_same_hive(adev, bo_adev)) + bo_node = svm_range_get_node_by_id(prange, prange->actual_loc); + if (bo_node && svm_nodes_in_same_hive(node, bo_node)) return prange->actual_loc; else return 0; @@ -2558,16 +2656,13 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, struct interval_tree_node *node; unsigned long start_limit, end_limit; - vma = find_vma(p->mm, addr << PAGE_SHIFT); - if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { + vma = vma_lookup(p->mm, addr << PAGE_SHIFT); + if (!vma) { pr_debug("VMA does not exist in address [0x%llx]\n", addr); return -EFAULT; } - *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && - vma->vm_end >= vma->vm_mm->start_brk) || - (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack); + *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); start_limit = max(vma->vm_start >> PAGE_SHIFT, (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); @@ -2657,7 +2752,7 @@ svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, } static struct -svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, +svm_range *svm_range_create_unregistered_range(struct kfd_node *node, struct kfd_process *p, struct mm_struct *mm, int64_t addr) @@ -2692,7 +2787,7 @@ svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev, pr_debug("Failed to create prange in address [0x%llx]\n", addr); return NULL; } - if (kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx)) { + if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) { pr_debug("failed to get gpuid from kgd\n"); svm_range_free(prange, true); return NULL; @@ -2746,7 +2841,7 @@ static bool svm_range_skip_recover(struct svm_range *prange) } static void -svm_range_count_fault(struct amdgpu_device *adev, struct 
kfd_process *p, +svm_range_count_fault(struct kfd_node *node, struct kfd_process *p, int32_t gpuidx) { struct kfd_process_device *pdd; @@ -2759,7 +2854,7 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p, uint32_t gpuid; int r; - r = kfd_process_gpuid_from_adev(p, adev, &gpuid, &gpuidx); + r = kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx); if (r < 0) return; } @@ -2787,6 +2882,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, + uint32_t vmid, uint32_t node_id, uint64_t addr, bool write_fault) { struct mm_struct *mm = NULL; @@ -2794,6 +2890,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, struct svm_range *prange; struct kfd_process *p; ktime_t timestamp = ktime_get_boottime(); + struct kfd_node *node; int32_t best_loc; int32_t gpuidx = MAX_GPU_INSTANCE; bool write_locked = false; @@ -2801,7 +2898,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, bool migration = false; int r = 0; - if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { + if (!KFD_IS_SVM_API_SUPPORTED(adev)) { pr_debug("device does not support SVM\n"); return -EFAULT; } @@ -2837,6 +2934,13 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out; } + node = kfd_node_by_irq_ids(adev, node_id, vmid); + if (!node) { + pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id, + vmid); + r = -EFAULT; + goto out; + } mmap_read_lock(mm); retry_write_locked: mutex_lock(&svms->lock); @@ -2855,7 +2959,7 @@ retry_write_locked: write_locked = true; goto retry_write_locked; } - prange = svm_range_create_unregistered_range(adev, p, mm, addr); + prange = svm_range_create_unregistered_range(node, p, mm, addr); if (!prange) { pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n", svms, addr); @@ -2870,7 +2974,7 @@ retry_write_locked: mutex_lock(&prange->migrate_mutex); if (svm_range_skip_recover(prange)) { - amdgpu_gmc_filter_faults_remove(adev, addr, pasid); + amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid); r = 0; goto out_unlock_range; } @@ -2887,8 +2991,8 @@ retry_write_locked: /* __do_munmap removed VMA, return success as we are handling stale * retry fault. 
*/ - vma = find_vma(mm, addr << PAGE_SHIFT); - if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) { + vma = vma_lookup(mm, addr << PAGE_SHIFT); + if (!vma) { pr_debug("address 0x%llx VMA is removed\n", addr); r = 0; goto out_unlock_range; @@ -2901,7 +3005,7 @@ retry_write_locked: goto out_unlock_range; } - best_loc = svm_range_best_restore_location(prange, adev, &gpuidx); + best_loc = svm_range_best_restore_location(prange, node, &gpuidx); if (best_loc == -1) { pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n", svms, prange->start, prange->last); @@ -2913,7 +3017,7 @@ retry_write_locked: svms, prange->start, prange->last, best_loc, prange->actual_loc); - kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr, + kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr, write_fault, timestamp); if (prange->actual_loc != best_loc) { @@ -2951,7 +3055,7 @@ retry_write_locked: pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", r, svms, prange->start, prange->last); - kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr, + kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, migration); out_unlock_range: @@ -2960,7 +3064,7 @@ out_unlock_svms: mutex_unlock(&svms->lock); mmap_read_unlock(mm); - svm_range_count_fault(adev, p, gpuidx); + svm_range_count_fault(node, p, gpuidx); mmput(mm); out: @@ -2968,7 +3072,7 @@ out: if (r == -EAGAIN) { pr_debug("recover vm fault later\n"); - amdgpu_gmc_filter_faults_remove(adev, addr, pasid); + amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid); r = 0; } return r; @@ -2992,10 +3096,10 @@ svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) size = (pchild->last - pchild->start + 1) << PAGE_SHIFT; if (xnack_enabled) { amdgpu_amdkfd_unreserve_mem_limit(NULL, size, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); } else { r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); if (r) goto out_unlock; reserved_size += size; @@ -3005,10 +3109,10 @@ svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) size = (prange->last - prange->start + 1) << PAGE_SHIFT; if (xnack_enabled) { amdgpu_amdkfd_unreserve_mem_limit(NULL, size, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); } else { r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); if (r) goto out_unlock; reserved_size += size; @@ -3021,7 +3125,7 @@ out_unlock: if (r) amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size, - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); else /* Change xnack mode must be inside svms lock, to avoid race with * svm_range_deferred_list_work unreserve memory in parallel. 
@@ -3079,7 +3183,7 @@ int svm_range_list_init(struct kfd_process *p) spin_lock_init(&svms->deferred_list_lock); for (i = 0; i < p->n_pdds; i++) - if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev)) + if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev)) bitmap_set(svms->bitmap_supported, i, 1); return 0; @@ -3168,9 +3272,8 @@ svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) start <<= PAGE_SHIFT; end = start + (size << PAGE_SHIFT); do { - vma = find_vma(p->mm, start); - if (!vma || start < vma->vm_start || - (vma->vm_flags & device_vma)) + vma = vma_lookup(p->mm, start); + if (!vma || (vma->vm_flags & device_vma)) return -EFAULT; start = min(end, vma->vm_end); } while (start < end); @@ -3211,7 +3314,7 @@ svm_range_best_prefetch_location(struct svm_range *prange) DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); uint32_t best_loc = prange->prefetch_loc; struct kfd_process_device *pdd; - struct amdgpu_device *bo_adev; + struct kfd_node *bo_node; struct kfd_process *p; uint32_t gpuidx; @@ -3220,9 +3323,14 @@ svm_range_best_prefetch_location(struct svm_range *prange) if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) goto out; - bo_adev = svm_range_get_adev_by_id(prange, best_loc); - if (!bo_adev) { - WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc); + bo_node = svm_range_get_node_by_id(prange, best_loc); + if (!bo_node) { + WARN_ONCE(1, "failed to get valid kfd node at id%x\n", best_loc); + best_loc = 0; + goto out; + } + + if (bo_node->adev->gmc.is_app_apu) { best_loc = 0; goto out; } @@ -3240,10 +3348,10 @@ svm_range_best_prefetch_location(struct svm_range *prange) continue; } - if (pdd->dev->adev == bo_adev) + if (pdd->dev->adev == bo_node->adev) continue; - if (!amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) { + if (!svm_nodes_in_same_hive(pdd->dev, bo_node)) { best_loc = 0; break; } @@ -3400,7 +3508,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, struct svm_range *next; bool update_mapping = false; bool flush_tlb; - int r = 0; + int r, ret = 0; pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n", p->pasid, &p->svms, start, start + size - 1, size); @@ -3488,10 +3596,10 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, out_unlock_range: mutex_unlock(&prange->migrate_mutex); if (r) - break; + ret = r; } - svm_range_debug_dump(svms); + dynamic_svm_range_dump(svms); mutex_unlock(&svms->lock); mmap_read_unlock(mm); @@ -3501,7 +3609,7 @@ out: pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid, &p->svms, start, start + size - 1, r); - return r; + return ret ? ret : r; } static int |
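The largest structural change in the diff above is in svm_range_reserve_bos(): the old ttm_validate_buffer list plus ttm_eu_reserve_buffers()/ttm_eu_backoff_reservation() calls are replaced by the drm_exec helper. The following is only a minimal sketch of that locking pattern, not code from the patch — lock_single_bo() and its GEM object argument are hypothetical — and it assumes the two-argument drm_exec_init() present in Linux 6.6:

```c
#include <drm/drm_exec.h>

/* Hypothetical helper: reserve one GEM object with the drm_exec pattern. */
static int lock_single_bo(struct drm_gem_object *bo, bool intr)
{
	struct drm_exec exec;
	int r = 0;

	/* Two-argument drm_exec_init(), matching the 6.6 API used in this diff. */
	drm_exec_init(&exec, intr ? DRM_EXEC_INTERRUPTIBLE_WAIT : 0);
	drm_exec_until_all_locked(&exec) {
		/* Reserve the object and make room for one fence slot. */
		r = drm_exec_prepare_obj(&exec, bo, 1);
		/* On contention, drop everything and restart the loop. */
		drm_exec_retry_on_contention(&exec);
		if (r)
			break;
	}

	if (!r) {
		/* ... object is reserved here: validate, map, attach fences ... */
	}

	/* Releases all held reservations, like the old backoff call. */
	drm_exec_fini(&exec);
	return r;
}
```

In the patch itself the loop locks every per-GPU page-directory BO via amdgpu_vm_lock_pd(), and drm_exec_fini() is deferred to svm_range_unreserve_bos(); the sketch only shows the retry-on-contention shape of the new API.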
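Several hunks also swap find_vma() plus an explicit vm_start check for vma_lookup(). A hedged illustration of why the extra check disappears — addr_is_mapped() is a made-up name, not part of the patch, and the caller is assumed to hold the mmap read lock:

```c
#include <linux/mm.h>

/* Hypothetical helper: does addr fall inside an existing VMA? */
static bool addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	/* Old style needed two checks, because find_vma() returns the first
	 * VMA that ends above addr, which may start above addr too:
	 *
	 *	vma = find_vma(mm, addr);
	 *	if (!vma || addr < vma->vm_start)
	 *		return false;
	 */

	/* vma_lookup() returns NULL unless vm_start <= addr < vm_end. */
	vma = vma_lookup(mm, addr);
	return vma != NULL;
}
```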
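Finally, the new dynamic_svm_range_dump() macro near the top of the file gates the expensive range dump behind dynamic debug, falling back to a compiled-out but still type-checked call otherwise. The idiom generalizes roughly as below; my_dump_state() and my_debug_dump() are illustrative names only, and the _dynamic_func_call_no_desc() usage simply mirrors what the patch does for svm_range_debug_dump():

```c
#include <linux/printk.h>
#include <linux/dynamic_debug.h>

static void my_debug_dump(int state)
{
	pr_debug("state = %d\n", state);	/* stand-in for an expensive dump */
}

#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
/* Callable only when the "my_dump_state" dynamic-debug site is enabled. */
#define my_dump_state(s) \
	_dynamic_func_call_no_desc("my_dump_state", my_debug_dump, s)
#else
/* if (0) keeps the arguments type-checked while the call is compiled out. */
#define my_dump_state(s) \
	do { if (0) my_debug_dump(s); } while (0)
#endif
```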