[PATCH] WireGuard: restrict packet handling to non-isolated CPUs.
Charles-Francois Natali
cf.natali at gmail.com
Tue Apr 5 21:21:57 UTC 2022
WireGuard currently uses round-robin to dispatch packet handling across all online CPUs, including isolated ones (isolcpus).
This is unfortunate because it causes significant latency on isolated CPUs - see e.g. the trace below, showing over 240 usec spent in wg_packet_decrypt_worker() on isolated CPU 47:
kworker/47:1-2373323 [047] 243644.756405: funcgraph_entry: | process_one_work() {
kworker/47:1-2373323 [047] 243644.756406: funcgraph_entry: | wg_packet_decrypt_worker() {
[...]
kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: 0.591 us | }
kworker/47:1-2373323 [047] 243644.756647: funcgraph_exit: ! 242.655 us | }
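For context, the current chooser (the wg_cpumask_next_online() lines removed in the diff below) walks cpu_online_mask only, so isolated CPUs are selected like any other - a minimal sketch of that pre-patch behaviour:

static inline int wg_cpumask_next_online(int *next)
{
	int cpu = *next;

	/* Round-robin over every online CPU - isolation is never consulted. */
	while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
		cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
	*next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
	return cpu;
}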
Instead, restrict packet handling to housekeeping (non-isolated) CPUs.
Example:
~# cat /sys/devices/system/cpu/isolated
3
~# /usr/share/doc/wireguard-tools/examples/ncat-client-server/client.sh
~# ping 192.168.4.1
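CPU 3 was isolated at boot - e.g. a kernel command line containing the following would produce the sysfs output above (the exact boot parameters used here are an assumption):

isolcpus=3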
Before - the corresponding work items are executed on all CPUs, including isolated CPU 3:
~# trace-cmd record -p function -l wg_packet_decrypt_worker -- sleep 10
plugin 'function'
CPU0 data recorded at offset=0x7d6000
4096 bytes in size
CPU1 data recorded at offset=0x7d7000
4096 bytes in size
CPU2 data recorded at offset=0x7d8000
4096 bytes in size
CPU3 data recorded at offset=0x7d9000
4096 bytes in size
~# trace-cmd report
cpus=4
kworker/3:1-52 [003] 49.784353: function: wg_packet_decrypt_worker
kworker/0:1-17 [000] 50.782879: function: wg_packet_decrypt_worker
kworker/1:3-162 [001] 51.783044: function: wg_packet_decrypt_worker
kworker/2:1-56 [002] 52.782159: function: wg_packet_decrypt_worker
kworker/3:1-52 [003] 53.780919: function: wg_packet_decrypt_worker
kworker/0:0-6 [000] 54.781755: function: wg_packet_decrypt_worker
kworker/1:3-162 [001] 55.781273: function: wg_packet_decrypt_worker
kworker/2:1-56 [002] 56.781946: function: wg_packet_decrypt_worker
kworker/3:1-52 [003] 57.781010: function: wg_packet_decrypt_worker
kworker/0:0-6 [000] 58.782097: function: wg_packet_decrypt_worker
~#
After - isolated CPU 3 is excluded:
~# trace-cmd record -p function -l wg_packet_decrypt_worker -- sleep 10
plugin 'function'
CPU0 data recorded at offset=0x7d7000
4096 bytes in size
CPU1 data recorded at offset=0x7d8000
4096 bytes in size
CPU2 data recorded at offset=0x7d9000
4096 bytes in size
CPU3 data recorded at offset=0x7da000
0 bytes in size
~# trace-cmd report
CPU 3 is empty
cpus=4
kworker/1:2-66 [001] 291.800063: function: wg_packet_decrypt_worker
kworker/2:2-143 [002] 292.800266: function: wg_packet_decrypt_worker
kworker/0:2-145 [000] 293.801778: function: wg_packet_decrypt_worker
kworker/1:4-261 [001] 294.803411: function: wg_packet_decrypt_worker
kworker/2:2-143 [002] 295.804068: function: wg_packet_decrypt_worker
kworker/0:2-145 [000] 296.806057: function: wg_packet_decrypt_worker
kworker/1:2-66 [001] 297.810686: function: wg_packet_decrypt_worker
kworker/2:2-143 [002] 298.811602: function: wg_packet_decrypt_worker
kworker/0:2-145 [000] 299.812790: function: wg_packet_decrypt_worker
kworker/1:4-261 [001] 300.813076: function: wg_packet_decrypt_worker
~#
Signed-off-by: Charles-Francois Natali <cf.natali at gmail.com>
---
drivers/net/wireguard/queueing.h | 59 +++++++++++++++++++++++++-------
drivers/net/wireguard/receive.c | 2 +-
2 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h
index 583adb37e..106a2686c 100644
--- a/drivers/net/wireguard/queueing.h
+++ b/drivers/net/wireguard/queueing.h
@@ -11,6 +11,7 @@
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/sched/isolation.h>
#include <net/ip_tunnels.h>

struct wg_device;
@@ -102,16 +103,50 @@ static inline void wg_reset_packet(struct sk_buff *skb, bool encapsulating)
skb_reset_inner_headers(skb);
}
-static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
+/* We only want to dispatch work to housekeeping CPUs, ignoring isolated ones.
+ */
+static inline const struct cpumask *wg_cpumask_housekeeping(void)
+{
+ return housekeeping_cpumask(HK_FLAG_DOMAIN);
+}
+
+static inline int wg_cpumask_test_cpu(int cpu)
+{
+ return cpumask_test_cpu(cpu, cpu_online_mask) &&
+ cpumask_test_cpu(cpu, wg_cpumask_housekeeping());
+}
+
+static inline unsigned int wg_cpumask_first(void)
+{
+ return cpumask_first_and(cpu_online_mask, wg_cpumask_housekeeping());
+}
+
+static inline unsigned int wg_cpumask_next(int n)
+{
+ return cpumask_next_and(n, cpu_online_mask, wg_cpumask_housekeeping());
+}
+
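+/* Number of eligible CPUs, i.e. the weight of the intersection of the
+ * online and housekeeping masks.
+ */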
+static inline unsigned int wg_cpumask_weight(void)
+{
+ int cpu;
+ int weight = 0;
+
+ for_each_cpu_and(cpu, cpu_online_mask, wg_cpumask_housekeeping())
+ ++weight;
+
+ return weight;
+}
+
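+/* Map an id to a stable eligible CPU, choosing anew only if no valid
+ * choice has been cached.
+ */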
+static inline int wg_cpumask_choose_eligible(int *stored_cpu, unsigned int id)
{
unsigned int cpu = *stored_cpu, cpu_index, i;

- if (unlikely(cpu == nr_cpumask_bits ||
- !cpumask_test_cpu(cpu, cpu_online_mask))) {
- cpu_index = id % cpumask_weight(cpu_online_mask);
- cpu = cpumask_first(cpu_online_mask);
+ if (unlikely(cpu == nr_cpumask_bits || !wg_cpumask_test_cpu(cpu))) {
+ cpu_index = id % wg_cpumask_weight();
+ cpu = wg_cpumask_first();
for (i = 0; i < cpu_index; ++i)
- cpu = cpumask_next(cpu, cpu_online_mask);
+ cpu = wg_cpumask_next(cpu);
*stored_cpu = cpu;
}
return cpu;
@@ -124,13 +159,13 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id)
* a bit slower, and it doesn't seem like this potential race actually
* introduces any performance loss, so we live with it.
*/
-static inline int wg_cpumask_next_online(int *next)
+static inline int wg_cpumask_next_eligible(int *next)
{
int cpu = *next;

- while (unlikely(!cpumask_test_cpu(cpu, cpu_online_mask)))
- cpu = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
- *next = cpumask_next(cpu, cpu_online_mask) % nr_cpumask_bits;
+ while (unlikely(!wg_cpumask_test_cpu(cpu)))
+ cpu = wg_cpumask_next(cpu) % nr_cpumask_bits;
+ *next = wg_cpumask_next(cpu) % nr_cpumask_bits;
return cpu;
}
@@ -173,7 +208,7 @@ static inline int wg_queue_enqueue_per_device_and_peer(
/* Then we queue it up in the device queue, which consumes the
* packet as soon as it can.
*/
- cpu = wg_cpumask_next_online(next_cpu);
+ cpu = wg_cpumask_next_eligible(next_cpu);
if (unlikely(ptr_ring_produce_bh(&device_queue->ring, skb)))
return -EPIPE;
queue_work_on(cpu, wq, &per_cpu_ptr(device_queue->worker, cpu)->work);
@@ -188,7 +223,7 @@ static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet
struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));

atomic_set_release(&PACKET_CB(skb)->state, state);
- queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
+ queue_work_on(wg_cpumask_choose_eligible(&peer->serial_work_cpu, peer->internal_id),
peer->device->packet_crypt_wq, &peer->transmit_packet_work);
wg_peer_put(peer);
}
diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c
index 7b8df406c..2d5d903d0 100644
--- a/drivers/net/wireguard/receive.c
+++ b/drivers/net/wireguard/receive.c
@@ -572,7 +572,7 @@ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb)
goto err;
}
atomic_inc(&wg->handshake_queue_len);
- cpu = wg_cpumask_next_online(&wg->handshake_queue.last_cpu);
+ cpu = wg_cpumask_next_eligible(&wg->handshake_queue.last_cpu);
/* Queues up a call to packet_process_queued_handshake_packets(skb): */
queue_work_on(cpu, wg->handshake_receive_wq,
&per_cpu_ptr(wg->handshake_queue.worker, cpu)->work);
--
2.30.2