src - OpenBSD base system

diff options


context:
space:
mode:

author	Jonathan Gray <jsg@cvs.openbsd.org>	2023-08-17 03:55:05 +0000
committer	Jonathan Gray <jsg@cvs.openbsd.org>	2023-08-17 03:55:05 +0000
commit	2abd4facbf01639d11f3e213574c318c174145b9 (patch)
tree	2fff7884568e109c669c21504a7c4f6bd8629239 /sys
parent	3c920cc2ce4449dd464f574d8f9baa5b5666e062 (diff)

drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

From Evan Quan
0f19195d639764d68f6f316dda363ba29821e5bc in linux-6.1.y/6.1.46
b75efe88b20c2be28b67e2821a794cc183e32374 in mainline linux

Diffstat (limited to 'sys')

-rw-r--r--

sys/dev/pci/drm/amd/amdgpu/amdgpu.h

-rw-r--r--

sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c

-rw-r--r--

sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c

-rw-r--r--

sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h

-rw-r--r--

sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c

-rw-r--r--

sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h

-rw-r--r--

sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c

-rw-r--r--

sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c

8 files changed, 102 insertions, 32 deletions

diff --git a/sys/dev/pci/drm/amd/amdgpu/amdgpu.h b/sys/dev/pci/drm/amd/amdgpu/amdgpu.h
index b15b9dc227a..6bd8895f3d4 100644
--- a/sys/dev/pci/drm/amd/amdgpu/amdgpu.h
+++ b/sys/dev/pci/drm/amd/amdgpu/amdgpu.h

@@ -288,6 +288,9 @@ extern int amdgpu_sg_display;

#define AMDGPU_SMARTSHIFT_MAX_BIAS (100)

#define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)

+/* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */

+#define AMDGPU_SWCTF_EXTRA_DELAY 50

struct amdgpu_device;

struct amdgpu_irq_src;

struct amdgpu_fpriv;

diff --git a/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c b/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c
index 012dbdb193a..aa1b7a4cb26 100644
--- a/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c

@@ -26,6 +26,7 @@

#include <linux/gfp.h>

#include <linux/slab.h>

#include <linux/firmware.h>

+#include <linux/reboot.h>

#include "amd_shared.h"

#include "amd_powerplay.h"

#include "power_state.h"

@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)

return 0;

}

+static void pp_swctf_delayed_work_handler(struct work_struct *work)

+{

+	struct pp_hwmgr *hwmgr =

+		container_of(work, struct pp_hwmgr, swctf_delayed_work.work);

+	struct amdgpu_device *adev = hwmgr->adev;

+	struct amdgpu_dpm_thermal *range =

+				&adev->pm.dpm.thermal;

+	uint32_t gpu_temperature, size;

+	int ret;

+

+	/*

+	 * If the hotspot/edge temperature is confirmed as below SW CTF setting point

+	 * after the delay enforced, nothing will be done.

+	 * Otherwise, a graceful shutdown will be performed to prevent further damage.

+	 */

+	if (range->sw_ctf_threshold &&

+	    hwmgr->hwmgr_func->read_sensor) {

+		ret = hwmgr->hwmgr_func->read_sensor(hwmgr,

+						     AMDGPU_PP_SENSOR_HOTSPOT_TEMP,

+						     &gpu_temperature,

+						     &size);

+		/*

+		 * For some legacy ASICs, hotspot temperature retrieving might be not

+		 * supported. Check the edge temperature instead then.

+		 */

+		if (ret == -EOPNOTSUPP)

+			ret = hwmgr->hwmgr_func->read_sensor(hwmgr,

+							     AMDGPU_PP_SENSOR_EDGE_TEMP,

+							     &gpu_temperature,

+							     &size);

+		if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)

+			return;

+	}

+

+	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");

+	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");

+	orderly_poweroff(true);

+}

+

static int pp_sw_init(void *handle)

{

struct amdgpu_device *adev = handle;

@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)

pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");

+ if (!ret)

+ INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,

+ pp_swctf_delayed_work_handler);

return ret;

}

@@ -136,6 +180,8 @@ static int pp_hw_fini(void *handle)

struct amdgpu_device *adev = handle;

struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;

+ cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);

hwmgr_hw_fini(hwmgr);

return 0;

@@ -222,6 +268,8 @@ static int pp_suspend(void *handle)

struct amdgpu_device *adev = handle;

struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;

+ cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);

return hwmgr_suspend(hwmgr);

}

diff --git a/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c
index bfe80ac0ad8..d0b1ab6c452 100644
--- a/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c
+++ b/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c

@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,

struct amdgpu_irq_src *source,

struct amdgpu_iv_entry *entry)

{

+ struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;

uint32_t client_id = entry->client_id;

uint32_t src_id = entry->src_id;

if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {

if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {

- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");

- /*

- * SW CTF just occurred.

- * Try to do a graceful shutdown to prevent further damage.

- */

- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");

- orderly_poweroff(true);

- } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)

+ schedule_delayed_work(&hwmgr->swctf_delayed_work,

+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));

+ } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {

dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");

- else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {

+ } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {

dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");

* HW CTF just occurred. Shutdown to prevent further damage.

@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,

orderly_poweroff(true);

}

} else if (client_id == SOC15_IH_CLIENTID_THM) {

- if (src_id == 0) {

- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");

- /*

- * SW CTF just occurred.

- * Try to do a graceful shutdown to prevent further damage.

- */

- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");

- orderly_poweroff(true);

- } else

+ if (src_id == 0)

+ schedule_delayed_work(&hwmgr->swctf_delayed_work,

+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));

+ else

dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");

} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {

dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");

diff --git a/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h b/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h
index 06ad0aebfff..713c95725af 100644
--- a/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h
+++ b/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h

@@ -811,6 +811,8 @@ struct pp_hwmgr {

bool gfxoff_state_changed_by_workload;

uint32_t pstate_sclk_peak;

uint32_t pstate_mclk_peak;

+ struct delayed_work swctf_delayed_work;

};

int hwmgr_early_init(struct pp_hwmgr *hwmgr);

diff --git a/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c b/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c
index 7a7d3a5e0d3..f68365e8a89 100644
--- a/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c

@@ -24,6 +24,7 @@

#include <linux/firmware.h>

#include <linux/pci.h>

+#include <linux/reboot.h>

#include "amdgpu.h"

#include "amdgpu_smu.h"

@@ -1061,6 +1062,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)

smu->ppt_funcs->interrupt_work(smu);

}

+static void smu_swctf_delayed_work_handler(struct work_struct *work)

+{

+	struct smu_context *smu =

+		container_of(work, struct smu_context, swctf_delayed_work.work);

+	struct smu_temperature_range *range =

+				&smu->thermal_range;

+	struct amdgpu_device *adev = smu->adev;

+	uint32_t hotspot_tmp, size;

+

+	/*

+	 * If the hotspot temperature is confirmed as below SW CTF setting point

+	 * after the delay enforced, nothing will be done.

+	 * Otherwise, a graceful shutdown will be performed to prevent further damage.

+	 */

+	if (range->software_shutdown_temp &&

+	    smu->ppt_funcs->read_sensor &&

+	    !smu->ppt_funcs->read_sensor(smu,

+					 AMDGPU_PP_SENSOR_HOTSPOT_TEMP,

+					 &hotspot_tmp,

+					 &size) &&

+	    hotspot_tmp / 1000 < range->software_shutdown_temp)

+		return;

+

+	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");

+	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");

+	orderly_poweroff(true);

+}

+

static int smu_sw_init(void *handle)

{

struct amdgpu_device *adev = (struct amdgpu_device *)handle;

@@ -1109,6 +1138,9 @@ static int smu_sw_init(void *handle)

return ret;

}

+ INIT_DELAYED_WORK(&smu->swctf_delayed_work,

+ smu_swctf_delayed_work_handler);

ret = smu_smc_table_sw_init(smu);

if (ret) {

dev_err(adev->dev, "Failed to sw init smc table!\n");

@@ -1581,6 +1613,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)

return ret;

}

+ cancel_delayed_work_sync(&smu->swctf_delayed_work);

ret = smu_disable_dpms(smu);

if (ret) {

dev_err(adev->dev, "Fail to disable dpm features!\n");

diff --git a/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 1bb953c9796..d070ce60fd4 100644
--- a/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h

@@ -573,6 +573,8 @@ struct smu_context

u32 debug_param_reg;

u32 debug_msg_reg;

u32 debug_resp_reg;

+ struct delayed_work swctf_delayed_work;

};

struct i2c_adapter;

diff --git a/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index efd7d1feb0c..6de309a4a60 100644
--- a/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c

@@ -1438,13 +1438,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,

if (client_id == SOC15_IH_CLIENTID_THM) {

switch (src_id) {

case THM_11_0__SRCID__THM_DIG_THERM_L2H:

- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");

- /*

- * SW CTF just occurred.

- * Try to do a graceful shutdown to prevent further damage.

- */

- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");

- orderly_poweroff(true);

+ schedule_delayed_work(&smu->swctf_delayed_work,

+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));

break;

case THM_11_0__SRCID__THM_DIG_THERM_H2L:

dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");

diff --git a/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index a2ad1a06806..7a4c6aef0ef 100644
--- a/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c

@@ -1386,13 +1386,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,

if (client_id == SOC15_IH_CLIENTID_THM) {

switch (src_id) {

case THM_11_0__SRCID__THM_DIG_THERM_L2H:

- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");

- /*

- * SW CTF just occurred.

- * Try to do a graceful shutdown to prevent further damage.

- */

- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");

- orderly_poweroff(true);

+ schedule_delayed_work(&smu->swctf_delayed_work,

+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));

break;

case THM_11_0__SRCID__THM_DIG_THERM_H2L:

dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");