git: 787845c0e8e8 - main - Revert "ip_mroute: refactor bw_meter API"
Wojciech Macek
wma at FreeBSD.org
Thu May 20 10:15:11 UTC 2021
The branch main has been updated by wma:
URL: https://cgit.FreeBSD.org/src/commit/?id=787845c0e8e831bf5b2d000950241cb23c16ca45
commit 787845c0e8e831bf5b2d000950241cb23c16ca45
Author: Wojciech Macek <wma at FreeBSD.org>
AuthorDate: 2021-05-20 10:14:58 +0000
Commit: Wojciech Macek <wma at FreeBSD.org>
CommitDate: 2021-05-20 10:14:58 +0000
Revert "ip_mroute: refactor bw_meter API"
This reverts commit d1cd99b147411b331a9bff659533780ef297ef58.
---
sys/netinet/ip_mroute.c | 610 +++++++++++++++++++++++++++++++-----------------
sys/netinet/ip_mroute.h | 10 +-
2 files changed, 400 insertions(+), 220 deletions(-)
diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
index 0cb980b7247e..b8e677ba9af1 100644
--- a/sys/netinet/ip_mroute.c
+++ b/sys/netinet/ip_mroute.c
@@ -49,7 +49,6 @@
* Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
* Modified by Hitoshi Asaeda, WIDE, August 2000
* Modified by Pavlin Radoslavov, ICSI, October 2002
- * Modified by Wojciech Macek, Semihalf, May 2021
*
* MROUTING Revision: 3.5
* and PIM-SMv2 and PIM-DM support, advanced API support,
@@ -203,6 +202,16 @@ VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
* Bandwidth meter variables and constants
*/
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
+/*
+ * Pending timeouts are stored in a hash table, the key being the
+ * expiration time. Periodically, the entries are analysed and processed.
+ */
+#define BW_METER_BUCKETS 1024
+VNET_DEFINE_STATIC(struct bw_meter **, bw_meter_timers);
+#define V_bw_meter_timers VNET(bw_meter_timers)
+VNET_DEFINE_STATIC(struct callout, bw_meter_ch);
+#define V_bw_meter_ch VNET(bw_meter_ch)
+#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */
/*
* Pending upcalls are stored in a vector which is flushed when
@@ -311,13 +320,14 @@ static int add_mfc(struct mfcctl2 *);
static int add_vif(struct vifctl *);
static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
static void bw_meter_process(void);
-static void bw_meter_geq_receive_packet(struct bw_meter *, int,
+static void bw_meter_receive_packet(struct bw_meter *, int,
struct timeval *);
static void bw_upcalls_send(void);
static int del_bw_upcall(struct bw_upcall *);
static int del_mfc(struct mfcctl2 *);
static int del_vif(vifi_t);
static int del_vif_locked(vifi_t);
+static void expire_bw_meter_process(void *);
static void expire_bw_upcalls_send(void *);
static void expire_mfc(struct mfc *);
static void expire_upcalls(void *);
@@ -675,6 +685,8 @@ ip_mrouter_init(struct socket *so, int version)
curvnet);
callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
curvnet);
+ callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
+ curvnet);
V_ip_mrouter = so;
ip_mrouter_cnt++;
@@ -733,6 +745,7 @@ X_ip_mrouter_done(void)
callout_stop(&V_expire_upcalls_ch);
callout_stop(&V_bw_upcalls_ch);
+ callout_stop(&V_bw_meter_ch);
MFC_LOCK();
@@ -753,6 +766,7 @@ X_ip_mrouter_done(void)
bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);
V_bw_upcalls_n = 0;
+ bzero(V_bw_meter_timers, BW_METER_BUCKETS * sizeof(*V_bw_meter_timers));
MFC_UNLOCK();
@@ -1022,8 +1036,7 @@ expire_mfc(struct mfc *rt)
MFC_LOCK_ASSERT();
- free_bw_list(rt->mfc_bw_meter_leq);
- free_bw_list(rt->mfc_bw_meter_geq);
+ free_bw_list(rt->mfc_bw_meter);
TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
m_freem(rte->m);
@@ -1126,8 +1139,7 @@ add_mfc(struct mfcctl2 *mfccp)
rt->mfc_nstall = 0;
rt->mfc_expire = 0;
- rt->mfc_bw_meter_leq = NULL;
- rt->mfc_bw_meter_geq = NULL;
+ rt->mfc_bw_meter = NULL;
/* insert new entry at head of hash chain */
LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
@@ -1167,10 +1179,8 @@ del_mfc(struct mfcctl2 *mfccp)
/*
* free the bw_meter entries
*/
- free_bw_list(rt->mfc_bw_meter_leq);
- rt->mfc_bw_meter_leq = NULL;
- free_bw_list(rt->mfc_bw_meter_geq);
- rt->mfc_bw_meter_geq = NULL;
+ free_bw_list(rt->mfc_bw_meter);
+ rt->mfc_bw_meter = NULL;
LIST_REMOVE(rt, mfc_hash);
free(rt, M_MRTABLE);
@@ -1383,8 +1393,7 @@ fail:
/* clear the RP address */
rt->mfc_rp.s_addr = INADDR_ANY;
- rt->mfc_bw_meter_leq = NULL;
- rt->mfc_bw_meter_geq = NULL;
+ rt->mfc_bw_meter = NULL;
/* initialize pkt counters per src-grp */
rt->mfc_pkt_cnt = 0;
@@ -1450,6 +1459,16 @@ expire_upcalls(void *arg)
if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
continue;
+ /*
+ * free the bw_meter entries
+ */
+ while (rt->mfc_bw_meter != NULL) {
+ struct bw_meter *x = rt->mfc_bw_meter;
+
+ rt->mfc_bw_meter = x->bm_mfc_next;
+ free(x, M_BWMETER);
+ }
+
MRTSTAT_INC(mrts_cache_cleanups);
CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
(u_long)ntohl(rt->mfc_origin.s_addr),
@@ -1583,22 +1602,14 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
/*
* Perform upcall-related bw measuring.
*/
- if ((rt->mfc_bw_meter_geq != NULL) || (rt->mfc_bw_meter_leq != NULL)) {
+ if (rt->mfc_bw_meter != NULL) {
struct bw_meter *x;
struct timeval now;
microtime(&now);
MFC_LOCK_ASSERT();
- /* Process meters for Greater-or-EQual case */
- for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next)
- bw_meter_geq_receive_packet(x, plen, &now);
-
- /* Process meters for Lower-or-EQual case */
- for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) {
- /* Record that a packet is received */
- x->bm_measured.b_packets++;
- x->bm_measured.b_bytes += plen;
- }
+ for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
+ bw_meter_receive_packet(x, plen, &now);
}
return 0;
@@ -1748,139 +1759,84 @@ compute_bw_meter_flags(struct bw_upcall *req)
return flags;
}
-static void
-expire_bw_meter_leq(void *arg)
-{
- struct bw_meter *x = arg;
- struct timeval now;
- /*
- * INFO:
- * callout is always executed with MFC_LOCK taken
- */
-
- CURVNET_SET((struct vnet *)x->arg);
-
- microtime(&now);
-
- /*
- * Test if we should deliver an upcall
- */
- if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
- (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
- ((x->bm_flags & BW_METER_UNIT_BYTES) &&
- (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
- /* Prepare an upcall for delivery */
- bw_meter_prepare_upcall(x, &now);
- }
-
- /* Send all upcalls that are pending delivery */
- bw_upcalls_send();
-
- /* Reset counters */
- x->bm_start_time = now;
- x->bm_measured.b_bytes = 0;
- x->bm_measured.b_packets = 0;
-
- callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time));
-
- CURVNET_RESTORE();
-}
-
/*
* Add a bw_meter entry
*/
static int
add_bw_upcall(struct bw_upcall *req)
{
- struct mfc *mfc;
- struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
- BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
- struct timeval now;
- struct bw_meter *x, **bwm_ptr;
- uint32_t flags;
-
- if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
- return EOPNOTSUPP;
-
- /* Test if the flags are valid */
- if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
- return EINVAL;
- if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
- return EINVAL;
- if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
- == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
- return EINVAL;
-
- /* Test if the threshold time interval is valid */
- if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
- return EINVAL;
+ struct mfc *mfc;
+ struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
+ BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
+ struct timeval now;
+ struct bw_meter *x;
+ uint32_t flags;
- flags = compute_bw_meter_flags(req);
+ if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
+ return EOPNOTSUPP;
- /*
- * Find if we have already same bw_meter entry
- */
- MFC_LOCK();
- mfc = mfc_find(&req->bu_src, &req->bu_dst);
- if (mfc == NULL) {
- MFC_UNLOCK();
- return EADDRNOTAVAIL;
- }
+ /* Test if the flags are valid */
+ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
+ return EINVAL;
+ if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
+ return EINVAL;
+ if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
+ == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
+ return EINVAL;
- /* Choose an appropriate bw_meter list */
- if (req->bu_flags & BW_UPCALL_GEQ)
- bwm_ptr = &mfc->mfc_bw_meter_geq;
- else
- bwm_ptr = &mfc->mfc_bw_meter_leq;
-
- for (x = *bwm_ptr; x != NULL; x = x->bm_mfc_next) {
- if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
- &req->bu_threshold.b_time, ==))
- && (x->bm_threshold.b_packets
- == req->bu_threshold.b_packets)
- && (x->bm_threshold.b_bytes
- == req->bu_threshold.b_bytes)
- && (x->bm_flags & BW_METER_USER_FLAGS)
- == flags) {
- MFC_UNLOCK();
- return 0; /* XXX Already installed */
- }
- }
+ /* Test if the threshold time interval is valid */
+ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
+ return EINVAL;
- /* Allocate the new bw_meter entry */
- x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
- if (x == NULL) {
- MFC_UNLOCK();
- return ENOBUFS;
- }
+ flags = compute_bw_meter_flags(req);
- /* Set the new bw_meter entry */
- x->bm_threshold.b_time = req->bu_threshold.b_time;
- microtime(&now);
- x->bm_start_time = now;
- x->bm_threshold.b_packets = req->bu_threshold.b_packets;
- x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
- x->bm_measured.b_packets = 0;
- x->bm_measured.b_bytes = 0;
- x->bm_flags = flags;
- x->bm_time_next = NULL;
- x->bm_mfc = mfc;
- x->arg = curvnet;
-
- /* For LEQ case create periodic callout */
- if (req->bu_flags & BW_UPCALL_LEQ) {
- callout_init_mtx(&x->bm_meter_callout, &mfc_mtx,0);
- callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time),
- expire_bw_meter_leq, x);
+ /*
+ * Find if we have already same bw_meter entry
+ */
+ MFC_LOCK();
+ mfc = mfc_find(&req->bu_src, &req->bu_dst);
+ if (mfc == NULL) {
+ MFC_UNLOCK();
+ return EADDRNOTAVAIL;
+ }
+ for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
+ if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
+ &req->bu_threshold.b_time, ==)) &&
+ (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
+ (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
+ (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
+ MFC_UNLOCK();
+ return 0; /* XXX Already installed */
}
+ }
- /* Add the new bw_meter entry to the front of entries for this MFC */
- x->bm_mfc_next = *bwm_ptr;
- *bwm_ptr = x;
-
+ /* Allocate the new bw_meter entry */
+ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
+ if (x == NULL) {
MFC_UNLOCK();
+ return ENOBUFS;
+ }
- return 0;
+ /* Set the new bw_meter entry */
+ x->bm_threshold.b_time = req->bu_threshold.b_time;
+ microtime(&now);
+ x->bm_start_time = now;
+ x->bm_threshold.b_packets = req->bu_threshold.b_packets;
+ x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags = flags;
+ x->bm_time_next = NULL;
+ x->bm_time_hash = BW_METER_BUCKETS;
+
+ /* Add the new bw_meter entry to the front of entries for this MFC */
+ x->bm_mfc = mfc;
+ x->bm_mfc_next = mfc->mfc_bw_meter;
+ mfc->mfc_bw_meter = x;
+ schedule_bw_meter(x, &now);
+ MFC_UNLOCK();
+
+ return 0;
}
static void
@@ -1889,11 +1845,8 @@ free_bw_list(struct bw_meter *list)
while (list != NULL) {
struct bw_meter *x = list;
- /* MFC_LOCK must be held here */
- if (x->bm_flags & BW_METER_LEQ)
- callout_drain(&x->bm_meter_callout);
-
list = list->bm_mfc_next;
+ unschedule_bw_meter(x);
free(x, M_BWMETER);
}
}
@@ -1905,7 +1858,7 @@ static int
del_bw_upcall(struct bw_upcall *req)
{
struct mfc *mfc;
- struct bw_meter *x, **bwm_ptr;
+ struct bw_meter *x;
if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
return EOPNOTSUPP;
@@ -1923,14 +1876,8 @@ del_bw_upcall(struct bw_upcall *req)
*/
struct bw_meter *list;
- /* Free LEQ list */
- list = mfc->mfc_bw_meter_leq;
- mfc->mfc_bw_meter_leq = NULL;
- free_bw_list(list);
-
- /* Free GEQ list */
- list = mfc->mfc_bw_meter_geq;
- mfc->mfc_bw_meter_geq = NULL;
+ list = mfc->mfc_bw_meter;
+ mfc->mfc_bw_meter = NULL;
free_bw_list(list);
MFC_UNLOCK();
return 0;
@@ -1940,14 +1887,8 @@ del_bw_upcall(struct bw_upcall *req)
flags = compute_bw_meter_flags(req);
- /* Choose an appropriate bw_meter list */
- if (req->bu_flags & BW_UPCALL_GEQ)
- bwm_ptr = &mfc->mfc_bw_meter_geq;
- else
- bwm_ptr = &mfc->mfc_bw_meter_leq;
-
/* Find the bw_meter entry to delete */
- for (prev = NULL, x = *bwm_ptr; x != NULL;
+ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
prev = x, x = x->bm_mfc_next) {
if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
&req->bu_threshold.b_time, ==)) &&
@@ -1960,11 +1901,9 @@ del_bw_upcall(struct bw_upcall *req)
if (prev != NULL)
prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/
else
- *bwm_ptr = x->bm_mfc_next;/* new head of list */
-
- if (req->bu_flags & BW_UPCALL_LEQ)
- callout_stop(&x->bm_meter_callout);
+ x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
+ unschedule_bw_meter(x);
MFC_UNLOCK();
/* Free the bw_meter entry */
free(x, M_BWMETER);
@@ -1981,15 +1920,16 @@ del_bw_upcall(struct bw_upcall *req)
* Perform bandwidth measurement processing that may result in an upcall
*/
static void
-bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
+bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
- struct timeval delta;
+ struct timeval delta;
- MFC_LOCK_ASSERT();
+ MFC_LOCK_ASSERT();
- delta = *nowp;
- BW_TIMEVALDECR(&delta, &x->bm_start_time);
+ delta = *nowp;
+ BW_TIMEVALDECR(&delta, &x->bm_start_time);
+ if (x->bm_flags & BW_METER_GEQ) {
/*
* Processing for ">=" type of bw_meter entry
*/
@@ -2009,15 +1949,63 @@ bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
* Test if we should deliver an upcall
*/
if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
- if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
- (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
- ((x->bm_flags & BW_METER_UNIT_BYTES) &&
- (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
- /* Prepare an upcall for delivery */
- bw_meter_prepare_upcall(x, nowp);
- x->bm_flags |= BW_METER_UPCALL_DELIVERED;
- }
+ if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+ (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
+ ((x->bm_flags & BW_METER_UNIT_BYTES) &&
+ (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
+ /* Prepare an upcall for delivery */
+ bw_meter_prepare_upcall(x, nowp);
+ x->bm_flags |= BW_METER_UPCALL_DELIVERED;
+ }
+ }
+ } else if (x->bm_flags & BW_METER_LEQ) {
+ /*
+ * Processing for "<=" type of bw_meter entry
+ */
+ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
+ /*
+ * We are behind time with the multicast forwarding table
+ * scanning for "<=" type of bw_meter entries, so test now
+ * if we should deliver an upcall.
+ */
+ if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+ (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
+ ((x->bm_flags & BW_METER_UNIT_BYTES) &&
+ (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
+ /* Prepare an upcall for delivery */
+ bw_meter_prepare_upcall(x, nowp);
+ }
+ /* Reschedule the bw_meter entry */
+ unschedule_bw_meter(x);
+ schedule_bw_meter(x, nowp);
+ }
+
+ /* Record that a packet is received */
+ x->bm_measured.b_packets++;
+ x->bm_measured.b_bytes += plen;
+
+ /*
+ * Test if we should restart the measuring interval
+ */
+ if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
+ x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
+ (x->bm_flags & BW_METER_UNIT_BYTES &&
+ x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
+ /* Don't restart the measuring interval */
+ } else {
+ /* Do restart the measuring interval */
+ /*
+ * XXX: note that we don't unschedule and schedule, because this
+ * might be too much overhead per packet. Instead, when we process
+ * all entries for a given timer hash bin, we check whether it is
+ * really a timeout. If not, we reschedule at that time.
+ */
+ x->bm_start_time = *nowp;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
}
+ }
}
/*
@@ -2026,44 +2014,44 @@ bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
- struct timeval delta;
- struct bw_upcall *u;
+ struct timeval delta;
+ struct bw_upcall *u;
- MFC_LOCK_ASSERT();
+ MFC_LOCK_ASSERT();
- /*
- * Compute the measured time interval
- */
- delta = *nowp;
- BW_TIMEVALDECR(&delta, &x->bm_start_time);
+ /*
+ * Compute the measured time interval
+ */
+ delta = *nowp;
+ BW_TIMEVALDECR(&delta, &x->bm_start_time);
- /*
- * If there are too many pending upcalls, deliver them now
- */
- if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
- bw_upcalls_send();
+ /*
+ * If there are too many pending upcalls, deliver them now
+ */
+ if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
+ bw_upcalls_send();
- /*
- * Set the bw_upcall entry
- */
- u = &V_bw_upcalls[V_bw_upcalls_n++];
- u->bu_src = x->bm_mfc->mfc_origin;
- u->bu_dst = x->bm_mfc->mfc_mcastgrp;
- u->bu_threshold.b_time = x->bm_threshold.b_time;
- u->bu_threshold.b_packets = x->bm_threshold.b_packets;
- u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
- u->bu_measured.b_time = delta;
- u->bu_measured.b_packets = x->bm_measured.b_packets;
- u->bu_measured.b_bytes = x->bm_measured.b_bytes;
- u->bu_flags = 0;
- if (x->bm_flags & BW_METER_UNIT_PACKETS)
- u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
- if (x->bm_flags & BW_METER_UNIT_BYTES)
- u->bu_flags |= BW_UPCALL_UNIT_BYTES;
- if (x->bm_flags & BW_METER_GEQ)
- u->bu_flags |= BW_UPCALL_GEQ;
- if (x->bm_flags & BW_METER_LEQ)
- u->bu_flags |= BW_UPCALL_LEQ;
+ /*
+ * Set the bw_upcall entry
+ */
+ u = &V_bw_upcalls[V_bw_upcalls_n++];
+ u->bu_src = x->bm_mfc->mfc_origin;
+ u->bu_dst = x->bm_mfc->mfc_mcastgrp;
+ u->bu_threshold.b_time = x->bm_threshold.b_time;
+ u->bu_threshold.b_packets = x->bm_threshold.b_packets;
+ u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
+ u->bu_measured.b_time = delta;
+ u->bu_measured.b_packets = x->bm_measured.b_packets;
+ u->bu_measured.b_bytes = x->bm_measured.b_bytes;
+ u->bu_flags = 0;
+ if (x->bm_flags & BW_METER_UNIT_PACKETS)
+ u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
+ if (x->bm_flags & BW_METER_UNIT_BYTES)
+ u->bu_flags |= BW_UPCALL_UNIT_BYTES;
+ if (x->bm_flags & BW_METER_GEQ)
+ u->bu_flags |= BW_UPCALL_GEQ;
+ if (x->bm_flags & BW_METER_LEQ)
+ u->bu_flags |= BW_UPCALL_LEQ;
}
/*
@@ -2115,6 +2103,183 @@ bw_upcalls_send(void)
}
}
+/*
+ * Compute the timeout hash value for the bw_meter entries
+ */
+#define BW_METER_TIMEHASH(bw_meter, hash) \
+ do { \
+ struct timeval next_timeval = (bw_meter)->bm_start_time; \
+ \
+ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
+ (hash) = next_timeval.tv_sec; \
+ if (next_timeval.tv_usec) \
+ (hash)++; /* XXX: make sure we don't timeout early */ \
+ (hash) %= BW_METER_BUCKETS; \
+ } while (0)
+
+/*
+ * Schedule a timer to process periodically bw_meter entry of type "<="
+ * by linking the entry in the proper hash bucket.
+ */
+static void
+schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
+{
+ int time_hash;
+
+ MFC_LOCK_ASSERT();
+
+ if (!(x->bm_flags & BW_METER_LEQ))
+ return; /* XXX: we schedule timers only for "<=" entries */
+
+ /*
+ * Reset the bw_meter entry
+ */
+ x->bm_start_time = *nowp;
+ x->bm_measured.b_packets = 0;
+ x->bm_measured.b_bytes = 0;
+ x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
+
+ /*
+ * Compute the timeout hash value and insert the entry
+ */
+ BW_METER_TIMEHASH(x, time_hash);
+ x->bm_time_next = V_bw_meter_timers[time_hash];
+ V_bw_meter_timers[time_hash] = x;
+ x->bm_time_hash = time_hash;
+}
+
+/*
+ * Unschedule the periodic timer that processes bw_meter entry of type "<="
+ * by removing the entry from the proper hash bucket.
+ */
+static void
+unschedule_bw_meter(struct bw_meter *x)
+{
+ int time_hash;
+ struct bw_meter *prev, *tmp;
+
+ MFC_LOCK_ASSERT();
+
+ if (!(x->bm_flags & BW_METER_LEQ))
+ return; /* XXX: we schedule timers only for "<=" entries */
+
+ /*
+ * Compute the timeout hash value and delete the entry
+ */
+ time_hash = x->bm_time_hash;
+ if (time_hash >= BW_METER_BUCKETS)
+ return; /* Entry was not scheduled */
+
+ for (prev = NULL, tmp = V_bw_meter_timers[time_hash];
+ tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
+ if (tmp == x)
+ break;
+
+ if (tmp == NULL)
+ panic("unschedule_bw_meter: bw_meter entry not found");
+
+ if (prev != NULL)
+ prev->bm_time_next = x->bm_time_next;
+ else
+ V_bw_meter_timers[time_hash] = x->bm_time_next;
+
+ x->bm_time_next = NULL;
+ x->bm_time_hash = BW_METER_BUCKETS;
+}
+
+/*
+ * Process all "<=" type of bw_meter that should be processed now,
+ * and for each entry prepare an upcall if necessary. Each processed
+ * entry is rescheduled again for the (periodic) processing.
+ *
+ * This is run periodically (once per second normally). On each round,
+ * all the potentially matching entries are in the hash slot that we are
+ * looking at.
+ */
+static void
+bw_meter_process()
+{
+ uint32_t loops;
+ int i;
+ struct timeval now, process_endtime;
+
+ microtime(&now);
+ if (V_last_tv_sec == now.tv_sec)
+ return; /* nothing to do */
+
+ loops = now.tv_sec - V_last_tv_sec;
+ V_last_tv_sec = now.tv_sec;
+ if (loops > BW_METER_BUCKETS)
+ loops = BW_METER_BUCKETS;
+
+ MFC_LOCK();
+ /*
+ * Process all bins of bw_meter entries from the one after the last
+ * processed to the current one. On entry, i points to the last bucket
+ * visited, so we need to increment i at the beginning of the loop.
+ */
+ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
+ struct bw_meter *x, *tmp_list;
+
+ if (++i >= BW_METER_BUCKETS)
+ i = 0;
+
+ /* Disconnect the list of bw_meter entries from the bin */
+ tmp_list = V_bw_meter_timers[i];
+ V_bw_meter_timers[i] = NULL;
+
+ /* Process the list of bw_meter entries */
+ while (tmp_list != NULL) {
+ x = tmp_list;
+ tmp_list = tmp_list->bm_time_next;
+
+ /* Test if the time interval is over */
+ process_endtime = x->bm_start_time;
+ BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
+ if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
+ /* Not yet: reschedule, but don't reset */
+ int time_hash;
+
+ BW_METER_TIMEHASH(x, time_hash);
+ if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
+ /*
+ * XXX: somehow the bin processing is a bit ahead of time.
+ * Put the entry in the next bin.
+ */
+ if (++time_hash >= BW_METER_BUCKETS)
+ time_hash = 0;
+ }
+ x->bm_time_next = V_bw_meter_timers[time_hash];
+ V_bw_meter_timers[time_hash] = x;
+ x->bm_time_hash = time_hash;
+
+ continue;
+ }
+
+ /*
+ * Test if we should deliver an upcall
+ */
+ if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+ (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
+ ((x->bm_flags & BW_METER_UNIT_BYTES) &&
+ (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
+ /* Prepare an upcall for delivery */
+ bw_meter_prepare_upcall(x, &now);
+ }
+
+ /*
+ * Reschedule for next processing
+ */
+ schedule_bw_meter(x, &now);
+ }
+ }
+
+ /* Send all upcalls that are pending delivery */
+ bw_upcalls_send();
+
+ MFC_UNLOCK();
+}
+
/*
* A periodic function for sending all upcalls that are pending delivery
*/
@@ -2132,6 +2297,23 @@ expire_bw_upcalls_send(void *arg)
CURVNET_RESTORE();
}
+/*
+ * A periodic function for periodic scanning of the multicast forwarding
+ * table for processing all "<=" bw_meter entries.
+ */
+static void
+expire_bw_meter_process(void *arg)
+{
+ CURVNET_SET((struct vnet *) arg);
+
+ if (V_mrt_api_config & MRT_MFC_BW_UPCALL)
+ bw_meter_process();
+
+ callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
+ curvnet);
+ CURVNET_RESTORE();
+}
+
/*
* End of bandwidth monitoring code
*/
@@ -2653,11 +2835,14 @@ vnet_mroute_init(const void *unused __unused)
V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable),
M_MRTABLE, M_WAITOK|M_ZERO);
+ V_bw_meter_timers = mallocarray(BW_METER_BUCKETS,
+ sizeof(*V_bw_meter_timers), M_MRTABLE, M_WAITOK|M_ZERO);
V_bw_upcalls = mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls),
M_MRTABLE, M_WAITOK|M_ZERO);
callout_init(&V_expire_upcalls_ch, 1);
callout_init(&V_bw_upcalls_ch, 1);
+ callout_init(&V_bw_meter_ch, 1);
}
VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
@@ -2668,6 +2853,7 @@ vnet_mroute_uninit(const void *unused __unused)
{
free(V_bw_upcalls, M_MRTABLE);
+ free(V_bw_meter_timers, M_MRTABLE);
free(V_viftable, M_MRTABLE);
free(V_nexpire, M_MRTABLE);
V_nexpire = NULL;
diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h
index 07d77065de33..6ef99c0172f3 100644
--- a/sys/netinet/ip_mroute.h
+++ b/sys/netinet/ip_mroute.h
@@ -283,10 +283,7 @@ struct mfc {
struct timeval mfc_last_assert; /* last time I sent an assert*/
uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */
struct in_addr mfc_rp; /* the RP address */
- struct bw_meter *mfc_bw_meter_leq; /* list of bandwidth meters
- for Lower-or-EQual case */
- struct bw_meter *mfc_bw_meter_geq; /* list of bandwidth meters
- for Greater-or-EQual case */
+ struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */
u_long mfc_nstall; /* # of packets awaiting mfc */
TAILQ_HEAD(, rtdetq) mfc_stall; /* q of packets awaiting mfc */
};
@@ -330,6 +327,7 @@ struct rtdetq {
struct bw_meter {
struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */
struct bw_meter *bm_time_next; /* next bw meter (same time) */
+ uint32_t bm_time_hash; /* the time hash value */
struct mfc *bm_mfc; /* the corresponding mfc */
uint32_t bm_flags; /* misc flags (see below) */
#define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */
@@ -346,10 +344,6 @@ struct bw_meter {
struct bw_data bm_threshold; /* the upcall threshold */
struct bw_data bm_measured; /* the measured bw */
struct timeval bm_start_time; /* abs. time */
-#ifdef _KERNEL
- struct callout bm_meter_callout; /* Periodic callout */
- void* arg; /* custom argument */
-#endif
};
#ifdef _KERNEL
More information about the dev-commits-src-main mailing list