svn commit: r347803 - in stable/11/sys/dev/mlx5: . mlx5_core
Hans Petter Selasky
hselasky at FreeBSD.org
Thu May 16 17:15:42 UTC 2019
Author: hselasky
Date: Thu May 16 17:15:41 2019
New Revision: 347803
URL: https://svnweb.freebsd.org/changeset/base/347803
Log:
MFC r347253:
Protect from infinite sw-reset loop in mlx5core.
Avoid an infinite software firmware reset loop that may be caused by a
hardware bug by limiting how frequently resets may be performed.
The timer between resets is restarted by every request for a reset, and
not only by a successful reset.
The minimum interval, in seconds, between two resets can be configured via sysctl:
hw.mlx5.sw_reset_timeout
which is global to all mlx5 devices in the system.
Submitted by: slavash@
Sponsored by: Mellanox Technologies
Modified:
stable/11/sys/dev/mlx5/driver.h
stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
Directory Properties:
stable/11/ (props changed)
Modified: stable/11/sys/dev/mlx5/driver.h
==============================================================================
--- stable/11/sys/dev/mlx5/driver.h Thu May 16 17:15:00 2019 (r347802)
+++ stable/11/sys/dev/mlx5/driver.h Thu May 16 17:15:41 2019 (r347803)
@@ -534,6 +534,7 @@ struct mlx5_core_health {
unsigned long flags;
struct work_struct work;
struct delayed_work recover_work;
+ unsigned int last_reset_req;
};
#define MLX5_CQ_LINEAR_ARRAY_SIZE 1024
Modified: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c Thu May 16 17:15:00 2019 (r347802)
+++ stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c Thu May 16 17:15:41 2019 (r347803)
@@ -64,6 +64,12 @@ SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLA
&mlx5_fw_reset_enable, 0,
"Enable firmware reset");
+static unsigned int sw_reset_to = 1200;
+SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
+ &sw_reset_to, 0,
+ "Minimum timeout in seconds between two firmware resets");
+
+
static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
{
int ret;
@@ -218,6 +224,32 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
&dev->iseg->cmdq_addr_l_sz);
}
+static bool
+mlx5_health_allow_reset(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_health *health = &dev->priv.health;
+ unsigned int delta;
+ bool ret;
+
+ if (health->last_reset_req != 0) {
+ delta = ticks - health->last_reset_req;
+ delta /= hz;
+ ret = delta >= sw_reset_to;
+ } else {
+ ret = true;
+ }
+
+ /*
+ * In principle, ticks may be 0. Setting it to off by one (-1)
+ * to prevent certain reset in next request.
+ */
+ health->last_reset_req = ticks ? : -1;
+ if (!ret)
+ mlx5_core_warn(dev, "Firmware reset elided due to "
+ "auto-reset frequency threshold.\n");
+ return (ret);
+}
+
#define MLX5_CRDUMP_WAIT_MS 60000
#define MLX5_FW_RESET_WAIT_MS 1000
#define MLX5_NIC_STATE_POLL_MS 5
@@ -243,7 +275,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev,
if (force)
goto err_state_done;
- if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
+ if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
+ mlx5_health_allow_reset(dev)) {
/* Get cr-dump and reset FW semaphore */
if (mlx5_core_is_pf(dev))
lock = lock_sem_sw_reset(dev);
More information about the svn-src-stable
mailing list