git: db0c751ed7f1 - stable/14 - ena: Add differentiation for missing TX completions reset
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 31 Oct 2024 16:01:31 UTC
The branch stable/14 has been updated by osamaabb: URL: https://cgit.FreeBSD.org/src/commit/?id=db0c751ed7f1dc1c2c3daeaafa9c5354f48b8589 commit db0c751ed7f1dc1c2c3daeaafa9c5354f48b8589 Author: Osama Abboud <osamaabb@amazon.com> AuthorDate: 2024-08-07 06:24:19 +0000 Commit: Osama Abboud <osamaabb@FreeBSD.org> CommitDate: 2024-10-31 14:54:11 +0000 ena: Add differentiation for missing TX completions reset This commit adds differentiation for a reset caused by missing tx completions, by verifying if the driver didn't receive tx completions caused by missing interrupts. The cleanup_running field was added to ena_ring because cleanup_task.ta_pending is zeroed before ena_cleanup() runs. Also ena_increment_reset_counter() API was added in order to support only incrementing the reset counter. Approved by: cperciva (mentor) Sponsored by: Amazon, Inc. (cherry picked from commit a33ec635d1f6d574d54e6f6d74766d070183be4c) --- sys/dev/ena/ena.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- sys/dev/ena/ena.h | 29 ++++++++++++++++++----------- sys/dev/ena/ena_datapath.c | 21 +++++++++++++++------ 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c index 8c20596d3d23..bbc857004671 100644 --- a/sys/dev/ena/ena.c +++ b/sys/dev/ena/ena.c @@ -169,6 +169,9 @@ static int ena_copy_eni_metrics(struct ena_adapter *); static int ena_copy_srd_metrics(struct ena_adapter *); static int ena_copy_customer_metrics(struct ena_adapter *); static void ena_timer_service(void *); +static enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *, + struct ena_ring *); + static char ena_version[] = ENA_DEVICE_NAME ENA_DRV_MODULE_NAME " v" ENA_DRV_MODULE_VERSION; @@ -3088,6 +3091,31 @@ check_for_rx_interrupt_queue(struct ena_adapter *adapter, return (0); } +static enum ena_regs_reset_reason_types +check_cdesc_in_tx_cq(struct ena_adapter *adapter, + struct ena_ring *tx_ring) +{ + device_t pdev = adapter->pdev; + int rc; + u16 req_id; + + rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id); + /* TX CQ is empty */ + if (rc == ENA_COM_TRY_AGAIN) { + ena_log(pdev, ERR, + "No completion descriptors found in CQ %d\n", + tx_ring->qid); + return ENA_REGS_RESET_MISS_TX_CMPL; + } + + /* TX CQ has cdescs */ + ena_log(pdev, ERR, + "Completion descriptors found in CQ %d", + tx_ring->qid); + + return ENA_REGS_RESET_MISS_INTERRUPT; +} + static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter, struct ena_ring *tx_ring) @@ -3100,6 +3128,8 @@ check_missing_comp_in_tx_queue(struct ena_adapter *adapter, int missing_tx_comp_to; sbintime_t time_offset; int i, rc = 0; + enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL; + bool cleanup_scheduled, cleanup_running; getbinuptime(&curtime); @@ -3155,7 +3185,19 @@ check_missing_comp_in_tx_queue(struct ena_adapter *adapter, "The number of lost tx completion is above the threshold " "(%d > %d). Reset the device\n", missed_tx, adapter->missing_tx_threshold); - ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_TX_CMPL); + /* Set the reset flag to prevent ena_cleanup() from running */ + ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter); + /* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and + * that cleanup_running is visible to check_missing_comp_in_tx_queue() to + * prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq() + */ + mb(); + cleanup_scheduled = !!(atomic_load_16(&tx_ring->que->cleanup_task.ta_pending)); + cleanup_running = !!(atomic_load_8((&tx_ring->cleanup_running))); + if (!(cleanup_scheduled || cleanup_running)) + reset_reason = check_cdesc_in_tx_cq(adapter, tx_ring); + + adapter->reset_reason = reset_reason; rc = EIO; } /* Add the newly discovered missing TX completions */ @@ -3618,6 +3660,7 @@ ena_reset_task(void *arg, int pending) ENA_LOCK_LOCK(); if (likely(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) { + ena_increment_reset_counter(adapter); ena_destroy_device(adapter, false); ena_restore_device(adapter); diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h index 876c3cd258aa..c9eb9e8c43d3 100644 --- a/sys/dev/ena/ena.h +++ b/sys/dev/ena/ena.h @@ -327,6 +327,7 @@ struct ena_ring { }; uint8_t first_interrupt; + uint8_t cleanup_running; uint16_t no_interrupt_event_cnt; struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS]; @@ -584,21 +585,27 @@ ena_mbuf_count(struct mbuf *mbuf) } static inline void -ena_trigger_reset(struct ena_adapter *adapter, - enum ena_regs_reset_reason_types reset_reason) +ena_increment_reset_counter(struct ena_adapter *adapter) { - if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) { - const struct ena_reset_stats_offset *ena_reset_stats_offset = - &resets_to_stats_offset_map[reset_reason]; + enum ena_regs_reset_reason_types reset_reason = adapter->reset_reason; + const struct ena_reset_stats_offset *ena_reset_stats_offset = + &resets_to_stats_offset_map[reset_reason]; - if (ena_reset_stats_offset->has_counter) { - uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats + - ena_reset_stats_offset->stat_offset; + if (ena_reset_stats_offset->has_counter) { + uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats + + ena_reset_stats_offset->stat_offset; - counter_u64_add((counter_u64_t)(*stat_ptr), 1); - } + counter_u64_add((counter_u64_t)(*stat_ptr), 1); + } + + counter_u64_add(adapter->dev_stats.total_resets, 1); +} - counter_u64_add(adapter->dev_stats.total_resets, 1); +static inline void +ena_trigger_reset(struct ena_adapter *adapter, + enum ena_regs_reset_reason_types reset_reason) +{ + if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) { adapter->reset_reason = reset_reason; ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter); } diff --git a/sys/dev/ena/ena_datapath.c b/sys/dev/ena/ena_datapath.c index 6cbe46cead3e..20864d0d2df6 100644 --- a/sys/dev/ena/ena_datapath.c +++ b/sys/dev/ena/ena_datapath.c @@ -77,17 +77,24 @@ ena_cleanup(void *arg, int pending) int qid, ena_qid; int txc, rxc, i; - if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) - return; - - ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n"); - tx_ring = que->tx_ring; rx_ring = que->rx_ring; qid = que->id; ena_qid = ENA_IO_TXQ_IDX(qid); io_cq = &adapter->ena_dev->io_cq_queues[ena_qid]; + atomic_store_8(&tx_ring->cleanup_running, 1); + /* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and + * that cleanup_running is visible to check_missing_comp_in_tx_queue() to + * prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq() + */ + mb(); + if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) || + (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter)))) + return; + + ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n"); + atomic_store_8(&tx_ring->first_interrupt, 1); atomic_store_8(&rx_ring->first_interrupt, 1); @@ -95,7 +102,8 @@ ena_cleanup(void *arg, int pending) rxc = ena_rx_cleanup(rx_ring); txc = ena_tx_cleanup(tx_ring); - if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) + if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) || + (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter)))) return; if ((txc != ENA_TX_BUDGET) && (rxc != ENA_RX_BUDGET)) @@ -107,6 +115,7 @@ ena_cleanup(void *arg, int pending) ENA_TX_IRQ_INTERVAL, true, false); counter_u64_add(tx_ring->tx_stats.unmask_interrupt_num, 1); ena_com_unmask_intr(io_cq, &intr_reg); + atomic_store_8(&tx_ring->cleanup_running, 0); } void