From 6eb9718567b9a75e2a6646ad362c1f0ceaff68c1 Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Fri, 7 Apr 2023 03:55:34 +0000 Subject: drm/amdkfd: Fixed kfd_process cleanup on module exit. From David Belanger b969838c9554a0e9aab3c3cadfcd23d246bc2abe in linux-6.1.y/6.1.23 20bc9f76b6a2455c6b54b91ae7634f147f64987f in mainline linux --- sys/dev/pci/drm/amd/amdkfd/kfd_module.c | 1 + sys/dev/pci/drm/amd/amdkfd/kfd_priv.h | 1 + sys/dev/pci/drm/amd/amdkfd/kfd_process.c | 67 ++++++++++++++++++++++++++++---- 3 files changed, 62 insertions(+), 7 deletions(-) (limited to 'sys/dev/pci/drm/amd/amdkfd') diff --git a/sys/dev/pci/drm/amd/amdkfd/kfd_module.c b/sys/dev/pci/drm/amd/amdkfd/kfd_module.c index 09b966dc376..aee2212e52f 100644 --- a/sys/dev/pci/drm/amd/amdkfd/kfd_module.c +++ b/sys/dev/pci/drm/amd/amdkfd/kfd_module.c @@ -77,6 +77,7 @@ err_ioctl: static void kfd_exit(void) { + kfd_cleanup_processes(); kfd_debugfs_fini(); kfd_process_destroy_wq(); kfd_procfs_shutdown(); diff --git a/sys/dev/pci/drm/amd/amdkfd/kfd_priv.h b/sys/dev/pci/drm/amd/amdkfd/kfd_priv.h index bf610e3b683..6d6588b9bee 100644 --- a/sys/dev/pci/drm/amd/amdkfd/kfd_priv.h +++ b/sys/dev/pci/drm/amd/amdkfd/kfd_priv.h @@ -928,6 +928,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev); int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); +void kfd_cleanup_processes(void); struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *task); struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid); diff --git a/sys/dev/pci/drm/amd/amdkfd/kfd_process.c b/sys/dev/pci/drm/amd/amdkfd/kfd_process.c index dd351105c1b..7f68d51541e 100644 --- a/sys/dev/pci/drm/amd/amdkfd/kfd_process.c +++ b/sys/dev/pci/drm/amd/amdkfd/kfd_process.c @@ -1167,6 +1167,17 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn) kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier)); } +static void kfd_process_notifier_release_internal(struct 
kfd_process *p) +{ + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + /* Indicate to other users that MM is no longer valid */ + p->mm = NULL; + + mmu_notifier_put(&p->mmu_notifier); +} + static void kfd_process_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -1181,17 +1192,22 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, return; mutex_lock(&kfd_processes_mutex); + /* + * Do early return if table is empty. + * + * This could potentially happen if this function is called concurrently + * by mmu_notifier and by kfd_cleanup_processes. + * + */ + if (hash_empty(kfd_processes_table)) { + mutex_unlock(&kfd_processes_mutex); + return; + } hash_del_rcu(&p->kfd_processes); mutex_unlock(&kfd_processes_mutex); synchronize_srcu(&kfd_processes_srcu); - cancel_delayed_work_sync(&p->eviction_work); - cancel_delayed_work_sync(&p->restore_work); - - /* Indicate to other users that MM is no longer valid */ - p->mm = NULL; - - mmu_notifier_put(&p->mmu_notifier); + kfd_process_notifier_release_internal(p); } static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { @@ -1200,6 +1216,43 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .free_notifier = kfd_process_free_notifier, }; +/* + * This code handles the case when driver is being unloaded before all + * mm_struct are released. We need to safely free the kfd_process and + * avoid race conditions with mmu_notifier that might try to free them. + * + */ +void kfd_cleanup_processes(void) +{ + struct kfd_process *p; + struct hlist_node *p_temp; + unsigned int temp; + HLIST_HEAD(cleanup_list); + + /* + * Move all remaining kfd_process from the process table to a + * temp list for processing. Once done, callback from mmu_notifier + * release will not see the kfd_process in the table and do early return, + * avoiding double free issues. 
+ */ + mutex_lock(&kfd_processes_mutex); + hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) { + hash_del_rcu(&p->kfd_processes); + synchronize_srcu(&kfd_processes_srcu); + hlist_add_head(&p->kfd_processes, &cleanup_list); + } + mutex_unlock(&kfd_processes_mutex); + + hlist_for_each_entry_safe(p, p_temp, &cleanup_list, kfd_processes) + kfd_process_notifier_release_internal(p); + + /* + * Ensures that all outstanding free_notifier get called, triggering + * the release of the kfd_process struct. + */ + mmu_notifier_synchronize(); +} + static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) { unsigned long offset; -- cgit v1.2.3