git: bbec8e698b5b - main - pf: call dummynet directly from the ethernet code

From: Kristof Provost <kp_at_FreeBSD.org>
Date: Fri, 20 May 2022 12:51:17 UTC
The branch main has been updated by kp:

URL: https://cgit.FreeBSD.org/src/commit/?id=bbec8e698b5bfbd568b840fc411b4fd125684045

commit bbec8e698b5bfbd568b840fc411b4fd125684045
Author:     Kristof Provost <kp@FreeBSD.org>
AuthorDate: 2022-05-18 15:49:28 +0000
Commit:     Kristof Provost <kp@FreeBSD.org>
CommitDate: 2022-05-20 12:49:31 +0000

    pf: call dummynet directly from the ethernet code
    
    Until recently, dummynet actions in ethernet rules did not hand packets
    to dummynet directly. Instead, the packets were only marked, and the
    actual interaction with dummynet was left to the layer 3 pf code.
    This worked fine for incoming packets (where we process ethernet rules
    before layer 3 rules), but not for outbound packets (where the order of
    operations is the reverse).
    
    Dummynet does support handling layer 2 traffic, so send the packets
    directly to dummynet.
    
    The main limitation now is that pf does not inspect layer 4 (i.e.
    TCP/UDP) when evaluating ethernet rules, so we don't have protocol
    information or port numbers. Dummynet can use that information to
    separate traffic flows, which will not work for ethernet dummynet
    rules. However, pipes (i.e. adding latency or restricting bandwidth)
    will work exactly as expected.
    
    Sponsored by:   Rubicon Communications, LLC ("Netgate")
    Differential Revision:  https://reviews.freebsd.org/D35257
---
 sys/netpfil/pf/pf.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 70 insertions(+), 10 deletions(-)

diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 5b3bc719ecb6..c613194ce9b5 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -3858,6 +3858,19 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
 
 	SDT_PROBE3(pf, eth, test_rule, entry, dir, kif->pfik_ifp, m);
 
+	mtag = pf_find_mtag(m);
+	if (mtag != NULL && mtag->flags & PF_TAG_DUMMYNET) {
+		/* Dummynet re-injects packets after they've
+		 * completed their delay. We've already
+		 * processed them, so pass unconditionally. */
+
+		/* But only once. We may see the packet multiple times (e.g.
+		 * PFIL_IN/PFIL_OUT). */
+		mtag->flags &= ~PF_TAG_DUMMYNET;
+
+		return (PF_PASS);
+	}
+
 	ruleset = V_pf_keth;
 	rules = ck_pr_load_ptr(&ruleset->active.rules);
 	r = TAILQ_FIRST(rules);
@@ -3989,7 +4002,8 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
 	}
 
 	if (r->tag > 0) {
-		mtag = pf_get_mtag(m);
+		if (mtag == NULL)
+			mtag = pf_get_mtag(m);
 		if (mtag == NULL) {
 			PF_RULES_RUNLOCK();
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
@@ -3999,7 +4013,8 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
 	}
 
 	if (r->qid != 0) {
-		mtag = pf_get_mtag(m);
+		if (mtag == NULL)
+			mtag = pf_get_mtag(m);
 		if (mtag == NULL) {
 			PF_RULES_RUNLOCK();
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
@@ -4010,19 +4025,64 @@ pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
 
 	/* Dummynet */
 	if (r->dnpipe) {
-		/** While dummynet supports handling Ethernet packets directly
-		 * it still wants some L3/L4 information, and we're not set up
-		 * to provide that here. Instead we'll do what we do for ALTQ
-		 * and merely mark the packet with the dummynet queue/pipe number.
-		 **/
-		mtag = pf_get_mtag(m);
+		struct ip_fw_args dnflow;
+
+		/* Drop packet if dummynet is not loaded. */
+		if (ip_dn_io_ptr == NULL) {
+			PF_RULES_RUNLOCK();
+			m_freem(m);
+			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
+			return (PF_DROP);
+		}
+		if (mtag == NULL)
+			mtag = pf_get_mtag(m);
 		if (mtag == NULL) {
 			PF_RULES_RUNLOCK();
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
 			return (PF_DROP);
 		}
-		mtag->dnpipe = r->dnpipe;
-		mtag->dnflags = r->dnflags;
+
+		bzero(&dnflow, sizeof(dnflow));
+
+		/* We don't have port numbers here, so we set 0.  That means
+		 * that we'll be somewhat limited in distinguishing flows (i.e.
+		 * only based on IP addresses, not based on port numbers), but
+		 * it's better than nothing. */
+		dnflow.f_id.dst_port = 0;
+		dnflow.f_id.src_port = 0;
+		dnflow.f_id.proto = 0;
+
+		dnflow.rule.info = r->dnpipe;
+		dnflow.rule.info |= IPFW_IS_DUMMYNET;
+		if (r->dnflags & PFRULE_DN_IS_PIPE)
+			dnflow.rule.info |= IPFW_IS_PIPE;
+
+		dnflow.f_id.extra = dnflow.rule.info;
+
+		dnflow.flags = dir == PF_IN ? IPFW_ARGS_IN : IPFW_ARGS_OUT;
+		dnflow.flags |= IPFW_ARGS_ETHER;
+		dnflow.ifp = kif->pfik_ifp;
+
+		switch (af) {
+		case AF_INET:
+			dnflow.f_id.addr_type = 4;
+			dnflow.f_id.src_ip = src->v4.s_addr;
+			dnflow.f_id.dst_ip = dst->v4.s_addr;
+			break;
+		case AF_INET6:
+			dnflow.flags |= IPFW_ARGS_IP6;
+			dnflow.f_id.addr_type = 6;
+			dnflow.f_id.src_ip6 = src->v6;
+			dnflow.f_id.dst_ip6 = dst->v6;
+			break;
+		default:
+			panic("Unknown address family");
+		}
+
+		mtag->flags |= PF_TAG_DUMMYNET;
+		ip_dn_io_ptr(m0, &dnflow);
+		if (*m0 != NULL)
+			mtag->flags &= ~PF_TAG_DUMMYNET;
 	}
 
 	action = r->action;