[RFC PATCH 2/4] net/ipv4: respect MTU determined by `ndo_lookup_mtu`

leon at is.currently.online leon at is.currently.online
Tue Dec 28 23:45:25 UTC 2021


From: Leon Schuermann <leon at is.currently.online>

This integrates the newly introduced dynamic MTU lookup mechanism with
the IPv4 stack. The stack attempts to query the destination netdevice
for a packet-specific MTU and, if none is found or the device does not
implement the mechanism, falls back to the device MTU.

Note that `ndo_lookup_mtu` is not queried, and hence not respected,
for every packet: some code paths still use only the device MTU. For
instance, flow offloading with netfilter will only take the device MTU
into account.

Signed-off-by: Leon Schuermann <leon at is.currently.online>
---
 include/net/ip.h                   | 34 ++++++++++++++++++++++--------
 net/ipv4/ip_forward.c              |  2 +-
 net/netfilter/nf_flow_table_core.c |  2 +-
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 2d6b985d11cc..5232d0c07dea 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -433,34 +433,50 @@ static inline bool ip_sk_ignore_df(const struct sock *sk)
 }
 
 static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
-						    bool forwarding)
+						    bool forwarding,
+						    const struct sk_buff *skb)
 {
-	struct net *net = dev_net(dst->dev);
+	int err;
 	unsigned int mtu;
+	struct net_device *dev = dst->dev;
 
-	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
-	    ip_mtu_locked(dst) ||
-	    !forwarding)
-		return dst_mtu(dst);
+	err = -ENODATA;
+	if (skb && dev->netdev_ops->ndo_lookup_mtu)
+		err = dev->netdev_ops->ndo_lookup_mtu(skb, dev);
+	mtu = (err >= 0) ? err : READ_ONCE(dst->dev->mtu);
+
+	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+		ip_mtu_locked(dst) ||
+		!forwarding)
+		return min(mtu, dst_mtu(dst));
 
 	/* 'forwarding = true' case should always honour route mtu */
 	mtu = dst_metric_raw(dst, RTAX_MTU);
 	if (mtu)
 		return mtu;
 
-	return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
+	return min(mtu, IP_MAX_MTU);
 }
 
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 					  const struct sk_buff *skb)
 {
+	int err;
+	unsigned int mtu;
+	struct net_device *dev = skb_dst(skb)->dev;
+
 	if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) {
 		bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
 
-		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
+		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding, skb);
 	}
 
-	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
+	err = -ENODATA;
+	if (dev->netdev_ops->ndo_lookup_mtu)
+		err = dev->netdev_ops->ndo_lookup_mtu(skb, dev);
+	mtu = (err >= 0) ? err : READ_ONCE(dev->mtu);
+
+	return min(mtu, IP_MAX_MTU);
 }
 
 struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 00ec819f949b..7a7ec3643c37 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -127,7 +127,7 @@ int ip_forward(struct sk_buff *skb)
 		goto sr_failed;
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true, skb);
 	if (ip_exceeds_mtu(skb, mtu)) {
 		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 513f78db3cb2..95bd7a87066a 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -87,7 +87,7 @@ static int flow_offload_fill_route(struct flow_offload *flow,
 
 	switch (flow_tuple->l3proto) {
 	case NFPROTO_IPV4:
-		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
+		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true, NULL);
 		break;
 	case NFPROTO_IPV6:
 		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
-- 
2.33.1



More information about the WireGuard mailing list