From 1ab1957d95a38e35958a4be9acbc250a2193f3bd Mon Sep 17 00:00:00 2001 From: Jonathan Gray Date: Tue, 6 Feb 2024 03:36:15 +0000 Subject: drm/amdgpu: Fix ecc irq enable/disable unpaired From Stanley Yang 0a8fc4e007b933d46f079a1d9ab8539a4d8439ef in linux-6.6.y/6.6.16 a32c6f7f5737cc7e31cd7ad5133f0d96fca12ea6 in mainline linux --- sys/dev/pci/drm/amd/amdgpu/aldebaran.c | 26 +++++++++++++++++++++++++- sys/dev/pci/drm/amd/amdgpu/gmc_v10_0.c | 4 ++++ sys/dev/pci/drm/amd/amdgpu/gmc_v11_0.c | 5 +++++ sys/dev/pci/drm/amd/amdgpu/gmc_v9_0.c | 4 ++++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/sys/dev/pci/drm/amd/amdgpu/aldebaran.c b/sys/dev/pci/drm/amd/amdgpu/aldebaran.c index 2b97b8a96fb..fa6193535d4 100644 --- a/sys/dev/pci/drm/amd/amdgpu/aldebaran.c +++ b/sys/dev/pci/drm/amd/amdgpu/aldebaran.c @@ -333,6 +333,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, { struct list_head *reset_device_list = reset_context->reset_device_list; struct amdgpu_device *tmp_adev = NULL; + struct amdgpu_ras *con; int r; if (reset_device_list == NULL) @@ -358,7 +359,30 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, */ amdgpu_register_gpu_instance(tmp_adev); - /* Resume RAS */ + /* Resume RAS, ecc_irq */ + con = amdgpu_ras_get_context(tmp_adev); + if (!amdgpu_sriov_vf(tmp_adev) && con) { + if (tmp_adev->sdma.ras && + tmp_adev->sdma.ras->ras_block.ras_late_init) { + r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev, + &tmp_adev->sdma.ras->ras_block.ras_comm); + if (r) { + dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r); + goto end; + } + } + + if (tmp_adev->gfx.ras && + tmp_adev->gfx.ras->ras_block.ras_late_init) { + r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev, + &tmp_adev->gfx.ras->ras_block.ras_comm); + if (r) { + dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r); + goto end; + } + } + } + amdgpu_ras_resume(tmp_adev); /* Update PSP FW topology after reset */ diff --git a/sys/dev/pci/drm/amd/amdgpu/gmc_v10_0.c b/sys/dev/pci/drm/amd/amdgpu/gmc_v10_0.c index c5aebb866b4..c05cb4cfd68 100644 --- a/sys/dev/pci/drm/amd/amdgpu/gmc_v10_0.c +++ b/sys/dev/pci/drm/amd/amdgpu/gmc_v10_0.c @@ -1141,6 +1141,10 @@ static int gmc_v10_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); + if (adev->gmc.ecc_irq.funcs && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); + return 0; } diff --git a/sys/dev/pci/drm/amd/amdgpu/gmc_v11_0.c b/sys/dev/pci/drm/amd/amdgpu/gmc_v11_0.c index f4f3fa2c677..df78be203f6 100644 --- a/sys/dev/pci/drm/amd/amdgpu/gmc_v11_0.c +++ b/sys/dev/pci/drm/amd/amdgpu/gmc_v11_0.c @@ -974,6 +974,11 @@ static int gmc_v11_0_hw_fini(void *handle) } amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); + + if (adev->gmc.ecc_irq.funcs && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); + gmc_v11_0_gart_disable(adev); return 0; diff --git a/sys/dev/pci/drm/amd/amdgpu/gmc_v9_0.c b/sys/dev/pci/drm/amd/amdgpu/gmc_v9_0.c index e116624ca93..0576e19bf50 100644 --- a/sys/dev/pci/drm/amd/amdgpu/gmc_v9_0.c +++ b/sys/dev/pci/drm/amd/amdgpu/gmc_v9_0.c @@ -2420,6 +2420,10 @@ static int gmc_v9_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); + if (adev->gmc.ecc_irq.funcs && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); + return 0; } -- cgit v1.2.3