svn commit: r331915 - stable/11/sys/dev/mlx5/mlx5_core
Hans Petter Selasky
hselasky at FreeBSD.org
Tue Apr 3 09:28:12 UTC 2018
Author: hselasky
Date: Tue Apr 3 09:28:11 2018
New Revision: 331915
URL: https://svnweb.freebsd.org/changeset/base/331915
Log:
MFC r331822:
Reorganize health recovery in mlx5core.
- Move the semaphore locking and unlocking to the same function.
- The MLX5_SKIP_SW_RESET and MLX5_SW_RESET_SEM_LOCKED flags are no longer
needed now that the reset and crdump are done in the same function.
Submitted by: slavash@
Sponsored by: Mellanox Technologies
Modified:
stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
Directory Properties:
stable/11/ (props changed)
Modified: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c Tue Apr 3 09:27:05 2018 (r331914)
+++ stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c Tue Apr 3 09:28:11 2018 (r331915)
@@ -48,8 +48,6 @@ enum {
enum {
MLX5_DROP_NEW_HEALTH_WORK,
MLX5_DROP_NEW_RECOVERY_WORK,
- MLX5_SKIP_SW_RESET,
- MLX5_SW_RESET_SEM_LOCKED,
};
enum {
@@ -167,7 +165,6 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
{
bool supported = (ioread32be(&dev->iseg->initializing) >>
MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
- struct mlx5_core_health *health = &dev->priv.health;
u32 cmdq_addr, fatal_error;
if (!supported)
@@ -181,8 +178,7 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
fatal_error = check_fatal_sensors(dev);
if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
fatal_error == MLX5_SENSOR_NIC_DISABLED ||
- fatal_error == MLX5_SENSOR_NIC_SW_RESET ||
- test_bit(MLX5_SKIP_SW_RESET, &health->flags)) {
+ fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.\n");
return;
}
@@ -197,26 +193,65 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
&dev->iseg->cmdq_addr_l_sz);
}
+#define MLX5_CRDUMP_WAIT_MS 60000
+#define MLX5_FW_RESET_WAIT_MS 1000
+#define MLX5_NIC_STATE_POLL_MS 5
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
{
+ unsigned long end, delay_ms = MLX5_CRDUMP_WAIT_MS;
+ u32 fatal_error;
+ int lock = -EBUSY;
+
mutex_lock(&dev->intf_state_mutex);
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
goto unlock;
return;
}
- if (!force)
- mlx5_core_err(dev, "internal state error detected\n");
- if (check_fatal_sensors(dev) || force) {
- reset_fw_if_needed(dev);
+ fatal_error = check_fatal_sensors(dev);
+
+ if (fatal_error || force) {
+ if (!force)
+ mlx5_core_err(dev, "internal state error detected\n");
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
mlx5_trigger_cmd_completions(dev);
}
- mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
- if (!force)
- mlx5_core_err(dev, "system error event triggered\n");
+ if (force)
+ goto err_state_done;
+ if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
+ if (mlx5_core_is_pf(dev))
+ lock = lock_sem_sw_reset(dev, LOCK);
+
+ if (lock != -EBUSY) {
+ reset_fw_if_needed(dev);
+ delay_ms = MLX5_FW_RESET_WAIT_MS;
+ }
+ }
+
+ /* Recover from SW reset */
+ end = jiffies + msecs_to_jiffies(delay_ms);
+ do {
+ if (sensor_nic_disabled(dev))
+ break;
+
+ msleep(MLX5_NIC_STATE_POLL_MS);
+ } while (!time_after(jiffies, end));
+
+ if (!sensor_nic_disabled(dev)) {
+ dev_err(&dev->pdev->dev, "NIC IFC still %d after %lums.\n",
+ get_nic_mode(dev), delay_ms);
+ }
+
+ /* Release FW semaphore if you are the lock owner */
+ if (!lock)
+ lock_sem_sw_reset(dev, UNLOCK);
+
+ mlx5_core_err(dev, "system error event triggered\n");
+
+err_state_done:
+ mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
unlock:
mutex_unlock(&dev->intf_state_mutex);
}
@@ -265,7 +300,6 @@ static void health_recover(struct work_struct *work)
if (sensor_pci_no_comm(dev)) {
dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
recover = false;
- goto clear_sem;
}
nic_mode = get_nic_mode(dev);
@@ -281,14 +315,6 @@ static void health_recover(struct work_struct *work)
recover = false;
}
-clear_sem:
- if (test_and_clear_bit(MLX5_SW_RESET_SEM_LOCKED, &health->flags)) {
- mlx5_core_dbg(dev, "Unlocking FW reset semaphore\n");
- lock_sem_sw_reset(dev, UNLOCK);
- }
-
- test_and_clear_bit(MLX5_SKIP_SW_RESET, &health->flags);
-
if (recover) {
dev_err(&dev->pdev->dev, "starting health recovery flow\n");
mlx5_recover_device(dev);
@@ -312,28 +338,10 @@ static void health_care(struct work_struct *work)
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
unsigned long flags;
- int ret;
health = container_of(work, struct mlx5_core_health, work);
priv = container_of(health, struct mlx5_priv, health);
dev = container_of(priv, struct mlx5_core_dev, priv);
-
- if (mlx5_core_is_pf(dev)) {
- ret = lock_sem_sw_reset(dev, LOCK);
- if (!ret) {
- mlx5_core_warn(dev, "Locked FW reset semaphore\n");
- set_bit(MLX5_SW_RESET_SEM_LOCKED, &health->flags);
- }
- else if (ret == -EBUSY) {
- /* sw reset will be skipped only in case we detect the
- * semaphore was already taken. In case of an error
- * while taking the semaphore we prefer to issue a
- * reset since longer cr-dump time and multiple resets
- * are better than a stuck fw.
- */
- set_bit(MLX5_SKIP_SW_RESET, &health->flags);
- }
- }
mlx5_core_warn(dev, "handling bad device here\n");
mlx5_handle_bad_state(dev);
More information about the svn-src-stable-11
mailing list