git: 35bbcf091699 - main - mlx5: add fs_counters

From: Konstantin Belousov <kib@FreeBSD.org>
Date: Wed, 15 Nov 2023 23:09:13 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=35bbcf0916992d77fe1521962db42b3106a701fb

commit 35bbcf0916992d77fe1521962db42b3106a701fb
Author:     Raed Salem <raeds@nvidia.com>
AuthorDate: 2023-02-20 16:10:29 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-11-15 23:08:17 +0000

    mlx5: add fs_counters
    
    Signed-off-by: Raed Salem <raeds@nvidia.com>
    Sponsored by:   NVidia networking
    MFC after:      1 week
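    
    For illustration, a minimal consumer sketch of the new API (the
    device handle mdev and the wrapper function are assumptions made
    for the example; only the mlx5_fc_* calls come from this commit):
    
        #include <dev/mlx5/driver.h>
        #include <dev/mlx5/fs.h>
    
        static int
        example_count_flow(struct mlx5_core_dev *mdev)
        {
                struct mlx5_fc *fc;
                u64 bytes, packets, lastuse;
    
                /* Aging counter: sampled periodically by the stats work. */
                fc = mlx5_fc_create(mdev, true);
                if (IS_ERR(fc))
                        return (PTR_ERR(fc));
    
                /* Attach mlx5_fc_id(fc) to a flow rule here. */
    
                /* Deltas accumulated since the previous cached read. */
                mlx5_fc_query_cached(fc, &bytes, &packets, &lastuse);
    
                mlx5_fc_destroy(mdev, fc);
                return (0);
        }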
---
 sys/dev/mlx5/driver.h                     |  43 ++
 sys/dev/mlx5/fs.h                         |  16 +-
 sys/dev/mlx5/mlx5_core/fs_core.h          |   5 +
 sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.c      | 102 ++++
 sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.h      |  54 +++
 sys/dev/mlx5/mlx5_core/mlx5_fs_counters.c | 758 ++++++++++++++++++++++++++++++
 sys/dev/mlx5/mlx5_core/mlx5_fs_tree.c     |   5 +
 sys/dev/mlx5/mlx5_ifc.h                   |  22 +-
 sys/modules/mlx5/Makefile                 |   2 +
 9 files changed, 1003 insertions(+), 4 deletions(-)

diff --git a/sys/dev/mlx5/driver.h b/sys/dev/mlx5/driver.h
index cb1a2907a443..9daa1235bd9c 100644
--- a/sys/dev/mlx5/driver.h
+++ b/sys/dev/mlx5/driver.h
@@ -50,6 +50,7 @@
 #define MLX5_MAX_NUMBER_OF_VFS 128
 
 #define MLX5_INVALID_QUEUE_HANDLE 0xffffffff
+#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
 
 enum {
 	MLX5_BOARD_ID_LEN = 64,
@@ -269,6 +270,36 @@ struct mlx5_traffic_counter {
 	u64         octets;
 };
 
+struct mlx5_fc_pool {
+	struct mlx5_core_dev *dev;
+	struct mutex pool_lock; /* protects pool lists */
+	struct list_head fully_used;
+	struct list_head partially_used;
+	struct list_head unused;
+	int available_fcs;
+	int used_fcs;
+	int threshold;
+};
+
+struct mlx5_fc_stats {
+	spinlock_t counters_idr_lock; /* protects counters_idr */
+	struct idr counters_idr;
+	struct list_head counters;
+	struct llist_head addlist;
+	struct llist_head dellist;
+
+	struct workqueue_struct *wq;
+	struct delayed_work work;
+	unsigned long next_query;
+	unsigned long sampling_interval; /* jiffies */
+	u32 *bulk_query_out;
+	int bulk_query_len;
+	size_t num_counters;
+	bool bulk_query_alloc_failed;
+	unsigned long next_bulk_query_alloc;
+	struct mlx5_fc_pool fc_pool;
+};
+
 enum mlx5_cmd_mode {
 	MLX5_CMD_MODE_POLLING,
 	MLX5_CMD_MODE_EVENTS
@@ -607,6 +638,7 @@ struct mlx5_priv {
 
 	struct mlx5_bfreg_data		bfregs;
 	struct mlx5_uars_page	       *uar;
+	struct mlx5_fc_stats		fc_stats;
 };
 
 enum mlx5_device_state {
@@ -963,6 +995,17 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size,
 		     struct mlx5_async_work *work);
 int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
 		  int out_size);
+#define mlx5_cmd_exec_inout(dev, ifc_cmd, in, out)                             \
+	({                                                                     \
+		mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(ifc_cmd##_in), out,    \
+			      MLX5_ST_SZ_BYTES(ifc_cmd##_out));                \
+	})
+
+#define mlx5_cmd_exec_in(dev, ifc_cmd, in)                                     \
+	({                                                                     \
+		u32 _out[MLX5_ST_SZ_DW(ifc_cmd##_out)] = {};                   \
+		mlx5_cmd_exec_inout(dev, ifc_cmd, in, _out);                   \
+	})
 int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size,
 			  void *out, int out_size);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
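
For reference, the two new helper macros only wire the mlx5_ifc layout
sizes into mlx5_cmd_exec(); e.g. mlx5_cmd_exec_in(dev, dealloc_flow_counter,
in) expands to roughly:

        u32 _out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)] = {};

        mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(dealloc_flow_counter_in),
            _out, MLX5_ST_SZ_BYTES(dealloc_flow_counter_out));
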
diff --git a/sys/dev/mlx5/fs.h b/sys/dev/mlx5/fs.h
index f62716d806d0..65d38b9ee67a 100644
--- a/sys/dev/mlx5/fs.h
+++ b/sys/dev/mlx5/fs.h
@@ -254,7 +254,6 @@ bool fs_match_exact_mask(
 		void *mask1,
 		void *mask2);
 /**********end API for sniffer**********/
-
 struct mlx5_modify_hdr *mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
 						 enum mlx5_flow_namespace_type ns_type,
 						 u8 num_actions,
@@ -275,4 +274,19 @@ struct mlx5_pkt_reformat *mlx5_packet_reformat_alloc(struct mlx5_core_dev *dev,
 						     enum mlx5_flow_namespace_type ns_type);
 void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 					  struct mlx5_pkt_reformat *pkt_reformat);
+/********** Flow counters API **********/
+struct mlx5_fc;
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
+
+/* Like mlx5_fc_create() but does not schedule the stats refresh work. */
+struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging);
+
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter);
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+                          u64 *bytes, u64 *packets, u64 *lastuse);
+int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
+                  u64 *packets, u64 *bytes);
+u32 mlx5_fc_id(struct mlx5_fc *counter);
+/******* End of Flow counters API ******/
 #endif
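
mlx5_fc_create_ex() lets a caller create a batch of counters and kick the
sampling machinery once afterwards. One plausible pairing, sketched with an
illustrative batch array, size N and device handle mdev
(mlx5_fc_queue_stats_work() is the core-internal helper added to
fs_core.h below; error handling omitted):

        struct mlx5_fc *fcs[N];
        int i;

        for (i = 0; i < N; i++)
                fcs[i] = mlx5_fc_create_ex(mdev, true);

        /* Schedule one sampling pass for the whole batch. */
        mlx5_fc_queue_stats_work(mdev, &mdev->priv.fc_stats.work, 0);
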
diff --git a/sys/dev/mlx5/mlx5_core/fs_core.h b/sys/dev/mlx5/mlx5_core/fs_core.h
index a9273fdab61c..05757f493469 100644
--- a/sys/dev/mlx5/mlx5_core/fs_core.h
+++ b/sys/dev/mlx5/mlx5_core/fs_core.h
@@ -323,4 +323,9 @@ int mlx5_cmd_packet_reformat_alloc(struct mlx5_core_dev *dev,
 				   struct mlx5_pkt_reformat *pkt_reformat);
 void mlx5_cmd_packet_reformat_dealloc(struct mlx5_core_dev *dev,
 				      struct mlx5_pkt_reformat *pkt_reformat);
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
+			      struct delayed_work *dwork,
+			      unsigned long delay);
 #endif
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.c b/sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.c
new file mode 100644
index 000000000000..f3410249e67f
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.c
@@ -0,0 +1,102 @@
+/*-
+ * Copyright (c) 2022 NVIDIA corporation & affiliates.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <dev/mlx5/driver.h>
+#include <dev/mlx5/device.h>
+#include <dev/mlx5/mlx5_ifc.h>
+#include <dev/mlx5/mlx5_core/mlx5_fc_cmd.h>
+#include <dev/mlx5/mlx5_core/mlx5_core.h>
+
+int mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev,
+			   enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask,
+			   u32 *id)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {};
+	int err;
+
+	MLX5_SET(alloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+	MLX5_SET(alloc_flow_counter_in, in, flow_counter_bulk, alloc_bitmask);
+
+	err = mlx5_cmd_exec_inout(dev, alloc_flow_counter, in, out);
+	if (!err)
+		*id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+	return err;
+}
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id)
+{
+	return mlx5_cmd_fc_bulk_alloc(dev, 0, id);
+}
+
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {};
+
+	MLX5_SET(dealloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
+	MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id);
+	return mlx5_cmd_exec_in(dev, dealloc_flow_counter, in);
+}
+
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id,
+		      u64 *packets, u64 *bytes)
+{
+	u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+		MLX5_ST_SZ_BYTES(traffic_counter)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {};
+	void *stats;
+	int err = 0;
+
+	MLX5_SET(query_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+	MLX5_SET(query_flow_counter_in, in, op_mod, 0);
+	MLX5_SET(query_flow_counter_in, in, flow_counter_id, id);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics);
+	*packets = MLX5_GET64(traffic_counter, stats, packets);
+	*bytes = MLX5_GET64(traffic_counter, stats, octets);
+	return 0;
+}
+
+int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, u32 base_id, int bulk_len,
+			   u32 *out)
+{
+	int outlen = mlx5_cmd_fc_get_bulk_query_out_len(bulk_len);
+	u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {};
+
+	MLX5_SET(query_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+	MLX5_SET(query_flow_counter_in, in, flow_counter_id, base_id);
+	MLX5_SET(query_flow_counter_in, in, num_of_counters, bulk_len);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
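
The bulk-query output buffer is the fixed query_flow_counter_out header
followed by bulk_len traffic_counter entries, which is what
mlx5_cmd_fc_get_bulk_query_out_len() computes. A caller-side sketch
(base_id, bulk_len and the surrounding code are illustrative; base_id
must be aligned to 4, as noted in mlx5_fs_counters.c below):

        int outlen = mlx5_cmd_fc_get_bulk_query_out_len(bulk_len);
        u32 *out = kzalloc(outlen, GFP_KERNEL);
        void *stats;
        u64 packets, octets;
        int i, err;

        if (out == NULL)
                return (-ENOMEM);
        err = mlx5_cmd_fc_bulk_query(dev, base_id, bulk_len, out);
        if (err == 0) {
                for (i = 0; i < bulk_len; i++) {
                        /* Statistics for counter id base_id + i. */
                        stats = MLX5_ADDR_OF(query_flow_counter_out, out,
                            flow_statistics[i]);
                        packets = MLX5_GET64(traffic_counter, stats, packets);
                        octets = MLX5_GET64(traffic_counter, stats, octets);
                }
        }
        kfree(out);
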
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.h b/sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.h
new file mode 100644
index 000000000000..3adebb3ca94c
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_core/mlx5_fc_cmd.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, NVIDIA Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _MLX5_FC_CMD_
+#define _MLX5_FC_CMD_
+
+#include "fs_core.h"
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id);
+int mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev,
+			   enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask,
+			   u32 *id);
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id);
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id,
+		      u64 *packets, u64 *bytes);
+
+int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, u32 base_id, int bulk_len,
+			   u32 *out);
+static inline int mlx5_cmd_fc_get_bulk_query_out_len(int bulk_len)
+{
+	return MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+		MLX5_ST_SZ_BYTES(traffic_counter) * bulk_len;
+}
+
+#endif
diff --git a/sys/dev/mlx5/mlx5_core/mlx5_fs_counters.c b/sys/dev/mlx5/mlx5_core/mlx5_fs_counters.c
new file mode 100644
index 000000000000..7214c5256388
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_core/mlx5_fs_counters.c
@@ -0,0 +1,758 @@
+/*-
+ * Copyright (c) 2013-2019, Mellanox Technologies, Ltd.  All rights reserved.
+ * Copyright (c) 2022 NVIDIA corporation & affiliates.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <dev/mlx5/driver.h>
+#include <dev/mlx5/fs.h>
+#include <linux/rbtree.h>
+#include <dev/mlx5/mlx5_core/mlx5_core.h>
+#include <dev/mlx5/mlx5_core/fs_core.h>
+#include <dev/mlx5/mlx5_core/mlx5_fc_cmd.h>
+
+#define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000)
+#define MLX5_FC_BULK_QUERY_ALLOC_PERIOD msecs_to_jiffies(180 * 1000)
+/* Max number of counters to query in bulk read is 32K */
+#define MLX5_SW_MAX_COUNTERS_BULK BIT(15)
+#define MLX5_INIT_COUNTERS_BULK 8
+#define MLX5_FC_POOL_MAX_THRESHOLD BIT(18)
+#define MLX5_FC_POOL_USED_BUFF_RATIO 10
+
+struct mlx5_fc_cache {
+	u64 packets;
+	u64 bytes;
+	u64 lastuse;
+};
+
+struct mlx5_fc {
+	struct list_head list;
+	struct llist_node addlist;
+	struct llist_node dellist;
+
+	/* last{packets,bytes} members are used when calculating the delta since
+	 * last reading
+	 */
+	u64 lastpackets;
+	u64 lastbytes;
+
+	struct mlx5_fc_bulk *bulk;
+	u32 id;
+	bool aging;
+
+	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
+};
+
+static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev);
+static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool);
+static struct mlx5_fc *mlx5_fc_pool_acquire_counter(struct mlx5_fc_pool *fc_pool);
+static void mlx5_fc_pool_release_counter(struct mlx5_fc_pool *fc_pool, struct mlx5_fc *fc);
+
+/* locking scheme:
+ *
+ * It is the responsibility of the user to prevent concurrent calls or bad
+ * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference
+ * to struct mlx5_fc.
+ * e.g. en_tc.c is protected by the RTNL lock of its caller, and will never call a
+ * dump (access to struct mlx5_fc) after a counter is destroyed.
+ *
+ * access to counter list:
+ * - create (user context)
+ *   - mlx5_fc_create() only adds to an addlist to be used by
+ *     mlx5_fc_stats_work(). addlist is a lockless singly linked list
+ *     that doesn't require any additional synchronization when adding a
+ *     single node.
+ *   - spawn thread to do the actual add
+ *
+ * - destroy (user context)
+ *   - add a counter to lockless dellist
+ *   - spawn thread to do the actual del
+ *
+ * - dump (user context)
+ *   user should not call dump after destroy
+ *
+ * - query (single thread workqueue context)
+ *   destroy/dump - no conflict (see destroy)
+ *   query/dump - packets and bytes might be inconsistent (since update is not
+ *                atomic)
+ *   query/create - no conflict (see create)
+ *   since every create/destroy spawn the work, only after necessary time has
+ *   elapsed, the thread will actually query the hardware.
+ */
+
+static struct list_head *mlx5_fc_counters_lookup_next(struct mlx5_core_dev *dev,
+						      u32 id)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	int next_id = id + 1;
+
+	rcu_read_lock();
+	/* skip counters that are in idr, but not yet in counters list */
+	while ((counter = idr_get_next(&fc_stats->counters_idr, &next_id)) != NULL &&
+		list_empty(&counter->list))
+		next_id++;
+	rcu_read_unlock();
+
+	return counter ? &counter->list : &fc_stats->counters;
+}
+
+static void mlx5_fc_stats_insert(struct mlx5_core_dev *dev,
+				 struct mlx5_fc *counter)
+{
+	struct list_head *next = mlx5_fc_counters_lookup_next(dev, counter->id);
+
+	list_add_tail(&counter->list, next);
+}
+
+static void mlx5_fc_stats_remove(struct mlx5_core_dev *dev,
+				 struct mlx5_fc *counter)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	list_del(&counter->list);
+
+	spin_lock(&fc_stats->counters_idr_lock);
+	WARN_ON(!idr_remove(&fc_stats->counters_idr, counter->id));
+	spin_unlock(&fc_stats->counters_idr_lock);
+}
+
+static int get_init_bulk_query_len(struct mlx5_core_dev *dev)
+{
+	return min_t(int, MLX5_INIT_COUNTERS_BULK,
+		     (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk)));
+}
+
+static int get_max_bulk_query_len(struct mlx5_core_dev *dev)
+{
+	return min_t(int, MLX5_SW_MAX_COUNTERS_BULK,
+		     (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk)));
+}
+
+static void update_counter_cache(int index, u32 *bulk_raw_data,
+				 struct mlx5_fc_cache *cache)
+{
+	void *stats = MLX5_ADDR_OF(query_flow_counter_out, bulk_raw_data,
+			     flow_statistics[index]);
+	u64 packets = MLX5_GET64(traffic_counter, stats, packets);
+	u64 bytes = MLX5_GET64(traffic_counter, stats, octets);
+
+	if (cache->packets == packets)
+		return;
+
+	cache->packets = packets;
+	cache->bytes = bytes;
+	cache->lastuse = jiffies;
+}
+
+static void mlx5_fc_stats_query_counter_range(struct mlx5_core_dev *dev,
+					      struct mlx5_fc *first,
+					      u32 last_id)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	bool query_more_counters = (first->id <= last_id);
+	int cur_bulk_len = fc_stats->bulk_query_len;
+	u32 *data = fc_stats->bulk_query_out;
+	struct mlx5_fc *counter = first;
+	u32 bulk_base_id;
+	int bulk_len;
+	int err;
+
+	while (query_more_counters) {
+		/* first id must be aligned to 4 when using bulk query */
+		bulk_base_id = counter->id & ~0x3;
+
+		/* number of counters to query, including the last counter */
+		bulk_len = min_t(int, cur_bulk_len,
+				 ALIGN(last_id - bulk_base_id + 1, 4));
+
+		err = mlx5_cmd_fc_bulk_query(dev, bulk_base_id, bulk_len,
+					     data);
+		if (err) {
+			mlx5_core_err(dev, "Error doing bulk query: %d\n", err);
+			return;
+		}
+		query_more_counters = false;
+
+		list_for_each_entry_from(counter, &fc_stats->counters, list) {
+			int counter_index = counter->id - bulk_base_id;
+			struct mlx5_fc_cache *cache = &counter->cache;
+
+			if (counter->id >= bulk_base_id + bulk_len) {
+				query_more_counters = true;
+				break;
+			}
+
+			update_counter_cache(counter_index, data, cache);
+		}
+	}
+}
+
+static void mlx5_fc_free(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
+{
+	mlx5_cmd_fc_free(dev, counter->id);
+	kfree(counter);
+}
+
+static void mlx5_fc_release(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	if (counter->bulk)
+		mlx5_fc_pool_release_counter(&fc_stats->fc_pool, counter);
+	else
+		mlx5_fc_free(dev, counter);
+}
+
+static void mlx5_fc_stats_bulk_query_size_increase(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	int max_bulk_len = get_max_bulk_query_len(dev);
+	unsigned long now = jiffies;
+	u32 *bulk_query_out_tmp;
+	int max_out_len;
+
+	if (fc_stats->bulk_query_alloc_failed &&
+	    time_before(now, fc_stats->next_bulk_query_alloc))
+		return;
+
+	max_out_len = mlx5_cmd_fc_get_bulk_query_out_len(max_bulk_len);
+	bulk_query_out_tmp = kzalloc(max_out_len, GFP_KERNEL);
+	if (!bulk_query_out_tmp) {
+		mlx5_core_warn(dev,
+			       "Can't increase flow counters bulk query buffer size, insufficient memory, bulk_size(%d)\n",
+			       max_bulk_len);
+		fc_stats->bulk_query_alloc_failed = true;
+		fc_stats->next_bulk_query_alloc =
+			now + MLX5_FC_BULK_QUERY_ALLOC_PERIOD;
+		return;
+	}
+
+	kfree(fc_stats->bulk_query_out);
+	fc_stats->bulk_query_out = bulk_query_out_tmp;
+	fc_stats->bulk_query_len = max_bulk_len;
+	if (fc_stats->bulk_query_alloc_failed) {
+		mlx5_core_info(dev,
+			       "Flow counters bulk query buffer size increased, bulk_size(%d)\n",
+			       max_bulk_len);
+		fc_stats->bulk_query_alloc_failed = false;
+	}
+}
+
+static void mlx5_fc_stats_work(struct work_struct *work)
+{
+	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
+						 priv.fc_stats.work.work);
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	/* Take dellist first to ensure that counters cannot be deleted before
+	 * they are inserted.
+	 */
+	struct llist_node *dellist = llist_del_all(&fc_stats->dellist);
+	struct llist_node *addlist = llist_del_all(&fc_stats->addlist);
+	struct mlx5_fc *counter = NULL, *last = NULL, *tmp;
+	unsigned long now = jiffies;
+
+	if (addlist || !list_empty(&fc_stats->counters))
+		queue_delayed_work(fc_stats->wq, &fc_stats->work,
+				   fc_stats->sampling_interval);
+
+	llist_for_each_entry(counter, addlist, addlist) {
+		mlx5_fc_stats_insert(dev, counter);
+		fc_stats->num_counters++;
+	}
+
+	llist_for_each_entry_safe(counter, tmp, dellist, dellist) {
+		mlx5_fc_stats_remove(dev, counter);
+
+		mlx5_fc_release(dev, counter);
+		fc_stats->num_counters--;
+	}
+
+	if (fc_stats->bulk_query_len < get_max_bulk_query_len(dev) &&
+	    fc_stats->num_counters > get_init_bulk_query_len(dev))
+		mlx5_fc_stats_bulk_query_size_increase(dev);
+
+	if (time_before(now, fc_stats->next_query) ||
+	    list_empty(&fc_stats->counters))
+		return;
+	last = list_last_entry(&fc_stats->counters, struct mlx5_fc, list);
+
+	counter = list_first_entry(&fc_stats->counters, struct mlx5_fc,
+				   list);
+	if (counter)
+		mlx5_fc_stats_query_counter_range(dev, counter, last->id);
+
+	fc_stats->next_query = now + fc_stats->sampling_interval;
+}
+
+static struct mlx5_fc *mlx5_fc_single_alloc(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc *counter;
+	int err;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_cmd_fc_alloc(dev, &counter->id);
+	if (err) {
+		kfree(counter);
+		return ERR_PTR(err);
+	}
+
+	return counter;
+}
+
+static struct mlx5_fc *mlx5_fc_acquire(struct mlx5_core_dev *dev, bool aging)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+
+	if (aging && MLX5_CAP_GEN(dev, flow_counter_bulk_alloc) != 0) {
+		counter = mlx5_fc_pool_acquire_counter(&fc_stats->fc_pool);
+		if (!IS_ERR(counter))
+			return counter;
+	}
+
+	return mlx5_fc_single_alloc(dev);
+}
+
+struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging)
+{
+	struct mlx5_fc *counter = mlx5_fc_acquire(dev, aging);
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	int err = 0;
+
+	if (IS_ERR(counter))
+		return counter;
+
+	INIT_LIST_HEAD(&counter->list);
+	counter->aging = aging;
+
+	if (aging) {
+		u32 id = counter->id;
+
+		counter->cache.lastuse = jiffies;
+		counter->lastbytes = counter->cache.bytes;
+		counter->lastpackets = counter->cache.packets;
+
+		idr_preload(GFP_KERNEL);
+		spin_lock(&fc_stats->counters_idr_lock);
+
+		err = idr_alloc(&fc_stats->counters_idr, counter, id, id + 1,
+				GFP_NOWAIT);
+
+		spin_unlock(&fc_stats->counters_idr_lock);
+		idr_preload_end();
+		if (err < 0 || err != id)
+			goto err_out_alloc;
+
+		llist_add(&counter->addlist, &fc_stats->addlist);
+	}
+
+	return counter;
+
+err_out_alloc:
+	mlx5_fc_release(dev, counter);
+	return ERR_PTR(err);
+}
+
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
+{
+	struct mlx5_fc *counter = mlx5_fc_create_ex(dev, aging);
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	if (aging)
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+	return counter;
+}
+EXPORT_SYMBOL(mlx5_fc_create);
+
+u32 mlx5_fc_id(struct mlx5_fc *counter)
+{
+	return counter->id;
+}
+EXPORT_SYMBOL(mlx5_fc_id);
+
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	if (!counter)
+		return;
+
+	if (counter->aging) {
+		llist_add(&counter->dellist, &fc_stats->dellist);
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+		return;
+	}
+
+	mlx5_fc_release(dev, counter);
+}
+EXPORT_SYMBOL(mlx5_fc_destroy);
+
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	int init_bulk_len;
+	int init_out_len;
+
+	spin_lock_init(&fc_stats->counters_idr_lock);
+	idr_init(&fc_stats->counters_idr);
+	INIT_LIST_HEAD(&fc_stats->counters);
+	init_llist_head(&fc_stats->addlist);
+	init_llist_head(&fc_stats->dellist);
+
+	init_bulk_len = get_init_bulk_query_len(dev);
+	init_out_len = mlx5_cmd_fc_get_bulk_query_out_len(init_bulk_len);
+	fc_stats->bulk_query_out = kzalloc(init_out_len, GFP_KERNEL);
+	if (!fc_stats->bulk_query_out)
+		return -ENOMEM;
+	fc_stats->bulk_query_len = init_bulk_len;
+
+	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
+	if (!fc_stats->wq)
+		goto err_wq_create;
+
+	fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD;
+	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
+
+	mlx5_fc_pool_init(&fc_stats->fc_pool, dev);
+	return 0;
+
+err_wq_create:
+	kfree(fc_stats->bulk_query_out);
+	return -ENOMEM;
+}
+
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct llist_node *tmplist;
+	struct mlx5_fc *counter;
+	struct mlx5_fc *tmp;
+
+	if (!dev->priv.fc_stats.wq)
+		return;
+
+	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
+	destroy_workqueue(dev->priv.fc_stats.wq);
+	dev->priv.fc_stats.wq = NULL;
+
+	tmplist = llist_del_all(&fc_stats->addlist);
+	llist_for_each_entry_safe(counter, tmp, tmplist, addlist)
+		mlx5_fc_release(dev, counter);
+
+	list_for_each_entry_safe(counter, tmp, &fc_stats->counters, list)
+		mlx5_fc_release(dev, counter);
+
+	mlx5_fc_pool_cleanup(&fc_stats->fc_pool);
+	idr_destroy(&fc_stats->counters_idr);
+	kfree(fc_stats->bulk_query_out);
+}
+
+int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
+		  u64 *packets, u64 *bytes)
+{
+	return mlx5_cmd_fc_query(dev, counter->id, packets, bytes);
+}
+EXPORT_SYMBOL(mlx5_fc_query);
+
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter)
+{
+	return counter->cache.lastuse;
+}
+
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+			  u64 *bytes, u64 *packets, u64 *lastuse)
+{
+	struct mlx5_fc_cache c;
+
+	c = counter->cache;
+
+	*bytes = c.bytes - counter->lastbytes;
+	*packets = c.packets - counter->lastpackets;
+	*lastuse = c.lastuse;
+
+	counter->lastbytes = c.bytes;
+	counter->lastpackets = c.packets;
+}
+
+void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
+			      struct delayed_work *dwork,
+			      unsigned long delay)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	queue_delayed_work(fc_stats->wq, dwork, delay);
+}
+
+void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
+				      unsigned long interval)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	fc_stats->sampling_interval = min_t(unsigned long, interval,
+					    fc_stats->sampling_interval);
+}
+
+/* Flow counter bulks */
+
+struct mlx5_fc_bulk {
+	struct list_head pool_list;
+	u32 base_id;
+	int bulk_len;
+	unsigned long *bitmask;
+	struct mlx5_fc fcs[];
+};
+
+static void mlx5_fc_init(struct mlx5_fc *counter, struct mlx5_fc_bulk *bulk,
+			 u32 id)
+{
+	counter->bulk = bulk;
+	counter->id = id;
+}
+
+static int mlx5_fc_bulk_get_free_fcs_amount(struct mlx5_fc_bulk *bulk)
+{
+	return bitmap_weight(bulk->bitmask, bulk->bulk_len);
+}
+
+static struct mlx5_fc_bulk *mlx5_fc_bulk_create(struct mlx5_core_dev *dev)
+{
+	enum mlx5_fc_bulk_alloc_bitmask alloc_bitmask;
+	struct mlx5_fc_bulk *bulk;
+	int err = -ENOMEM;
+	int bulk_len;
+	u32 base_id;
+	int i;
+
+	alloc_bitmask = MLX5_CAP_GEN(dev, flow_counter_bulk_alloc);
+	bulk_len = alloc_bitmask > 0 ? MLX5_FC_BULK_NUM_FCS(alloc_bitmask) : 1;
+
+	bulk = kvzalloc(struct_size(bulk, fcs, bulk_len), GFP_KERNEL);
+	if (!bulk)
+		goto err_alloc_bulk;
+
+	bulk->bitmask = kvcalloc(BITS_TO_LONGS(bulk_len), sizeof(unsigned long),
+				 GFP_KERNEL);
+	if (!bulk->bitmask)
+		goto err_alloc_bitmask;
+
+	err = mlx5_cmd_fc_bulk_alloc(dev, alloc_bitmask, &base_id);
+	if (err)
+		goto err_mlx5_cmd_bulk_alloc;
+
+	bulk->base_id = base_id;
+	bulk->bulk_len = bulk_len;
+	for (i = 0; i < bulk_len; i++) {
+		mlx5_fc_init(&bulk->fcs[i], bulk, base_id + i);
+		set_bit(i, bulk->bitmask);
+	}
+
+	return bulk;
+
+err_mlx5_cmd_bulk_alloc:
+	kvfree(bulk->bitmask);
+err_alloc_bitmask:
+	kvfree(bulk);
+err_alloc_bulk:
+	return ERR_PTR(err);
+}
+
+static int
+mlx5_fc_bulk_destroy(struct mlx5_core_dev *dev, struct mlx5_fc_bulk *bulk)
+{
+	if (mlx5_fc_bulk_get_free_fcs_amount(bulk) < bulk->bulk_len) {
+		mlx5_core_err(dev, "Freeing bulk before all counters were released\n");
+		return -EBUSY;
+	}
+
+	mlx5_cmd_fc_free(dev, bulk->base_id);
+	kvfree(bulk->bitmask);
+	kvfree(bulk);
+
+	return 0;
+}
+
+static struct mlx5_fc *mlx5_fc_bulk_acquire_fc(struct mlx5_fc_bulk *bulk)
+{
+	int free_fc_index = find_first_bit(bulk->bitmask, bulk->bulk_len);
+
+	if (free_fc_index >= bulk->bulk_len)
+		return ERR_PTR(-ENOSPC);
+
+	clear_bit(free_fc_index, bulk->bitmask);
+	return &bulk->fcs[free_fc_index];
+}
+
+static int mlx5_fc_bulk_release_fc(struct mlx5_fc_bulk *bulk, struct mlx5_fc *fc)
+{
+	int fc_index = fc->id - bulk->base_id;
+
+	if (test_bit(fc_index, bulk->bitmask))
+		return -EINVAL;
+
+	set_bit(fc_index, bulk->bitmask);
+	return 0;
+}
+
+/* Flow counters pool API */
+
+static void mlx5_fc_pool_init(struct mlx5_fc_pool *fc_pool, struct mlx5_core_dev *dev)
+{
+	fc_pool->dev = dev;
+	mutex_init(&fc_pool->pool_lock);
+	INIT_LIST_HEAD(&fc_pool->fully_used);
+	INIT_LIST_HEAD(&fc_pool->partially_used);
+	INIT_LIST_HEAD(&fc_pool->unused);
+	fc_pool->available_fcs = 0;
+	fc_pool->used_fcs = 0;
+	fc_pool->threshold = 0;
+}
+
+static void mlx5_fc_pool_cleanup(struct mlx5_fc_pool *fc_pool)
+{
+	struct mlx5_core_dev *dev = fc_pool->dev;
+	struct mlx5_fc_bulk *bulk;
+	struct mlx5_fc_bulk *tmp;
+
+	list_for_each_entry_safe(bulk, tmp, &fc_pool->fully_used, pool_list)
+		mlx5_fc_bulk_destroy(dev, bulk);
+	list_for_each_entry_safe(bulk, tmp, &fc_pool->partially_used, pool_list)
+		mlx5_fc_bulk_destroy(dev, bulk);
+	list_for_each_entry_safe(bulk, tmp, &fc_pool->unused, pool_list)
+		mlx5_fc_bulk_destroy(dev, bulk);
+}
+
+static void mlx5_fc_pool_update_threshold(struct mlx5_fc_pool *fc_pool)
+{
+	fc_pool->threshold = min_t(int, MLX5_FC_POOL_MAX_THRESHOLD,
+				   fc_pool->used_fcs / MLX5_FC_POOL_USED_BUFF_RATIO);
+}
+
+static struct mlx5_fc_bulk *
+mlx5_fc_pool_alloc_new_bulk(struct mlx5_fc_pool *fc_pool)
+{
+	struct mlx5_core_dev *dev = fc_pool->dev;
+	struct mlx5_fc_bulk *new_bulk;
+
+	new_bulk = mlx5_fc_bulk_create(dev);
+	if (!IS_ERR(new_bulk))
+		fc_pool->available_fcs += new_bulk->bulk_len;
+	mlx5_fc_pool_update_threshold(fc_pool);
*** 177 LINES SKIPPED ***