[WireGuard] [PATCHv2] Add support for platforms which have no efficient unaligned memory access
Jason A. Donenfeld
Jason at zx2c4.com
Tue Sep 20 21:58:54 CEST 2016
Hey René,
This is an excellent find. Thanks. Pretty significant speed improvements. I
wonder where else this is happening too.
Have you tested this on both endians?
The main thing I'm wondering here is why exactly the compiler can't
generate more efficient code itself.
I'll review this and merge soon if it looks good.
Regards,
Jason
On Sun, Sep 11, 2016 at 2:06 PM, René van Dorst <opensource at vdorst.com>
wrote:
> Typo fixed: HAVE_EFFICIENT_UNALIGNED_ACCESS --> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
>
> From 13fae657624aac6b9c1f411aa6472a91aae7fcc3 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Ren=C3=A9=20van=20Dorst?= <opensource at vdorst.com>
> Date: Sat, 10 Sep 2016 10:58:58 +0200
> Subject: [PATCH] Add support for platforms which have no efficient unaligned
> memory access
>
> Without this change, there was a 55.2% slowdown in throughput on a
> TP-Link WR1043ND, MIPS32r2 at 400 MHz.
>
> Simply check for CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS at compile time.
>
> Tested on a TP-Link WR1043ND, MIPS32r2 at 400 MHz.
> Setup: https://lists.zx2c4.com/pipermail/wireguard/2016-August/000331.html
>
> Benchmarks before:
>
> root at lede:~# iperf3 -c 10.0.0.1 -i 10
> [ ID] Interval Transfer Bandwidth Retr Cwnd
> [ 4] 0.00-10.13 sec 28.8 MBytes 23.8 Mbits/sec 0 202 KBytes
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval Transfer Bandwidth Retr
> [ 4] 0.00-10.13 sec 28.8 MBytes 23.8 Mbits/sec 0
> sender
> [ 4] 0.00-10.13 sec 28.8 MBytes 23.8 Mbits/sec
> receiver
>
> root at lede:~# iperf3 -c 10.0.0.1 -i 10 -u -b 1G
> [ ID] Interval Transfer Bandwidth Total Datagrams
> [ 4] 0.00-10.00 sec 31.1 MBytes 26.1 Mbits/sec 3982
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval Transfer Bandwidth Jitter Lost/Total
> Datagrams
> [ 4] 0.00-10.00 sec 31.1 MBytes 26.1 Mbits/sec 0.049 ms 0/3982 (0%)
> [ 4] Sent 3982 datagrams
>
> Benchmarks with aligned memory fetching:
>
> root at lede:~# iperf3 -c 10.0.0.1 -i 10
> [ ID] Interval Transfer Bandwidth Retr Cwnd
> [ 4] 0.00-10.22 sec 52.5 MBytes 43.1 Mbits/sec 0 145 KBytes
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval Transfer Bandwidth Retr
> [ 4] 0.00-10.22 sec 52.5 MBytes 43.1 Mbits/sec 0
> sender
> [ 4] 0.00-10.22 sec 52.5 MBytes 43.1 Mbits/sec
> receiver
>
> iperf Done.
> root at lede:~# iperf3 -c 10.0.0.1 -i 10 -u -b 1G
> [ ID] Interval Transfer Bandwidth Total Datagrams
> [ 4] 0.00-10.00 sec 56.3 MBytes 47.2 Mbits/sec 7207
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval Transfer Bandwidth Jitter Lost/Total
> Datagrams
> [ 4] 0.00-10.00 sec 56.3 MBytes 47.2 Mbits/sec 0.041 ms 0/7207 (0%)
> [ 4] Sent 7207 datagrams
> ---
> src/crypto/chacha20poly1305.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/src/crypto/chacha20poly1305.c b/src/crypto/chacha20poly1305.c
> index 5190894..294cbf6 100644
> --- a/src/crypto/chacha20poly1305.c
> +++ b/src/crypto/chacha20poly1305.c
> @@ -248,13 +248,29 @@ struct poly1305_ctx {
>
> static void poly1305_init(struct poly1305_ctx *ctx, const u8 key[static
> POLY1305_KEY_SIZE])
> {
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + u32 t0, t1, t2, t3;
> +#endif
> +
> memset(ctx, 0, sizeof(struct poly1305_ctx));
> /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> ctx->r[0] = (le32_to_cpuvp(key + 0) >> 0) & 0x3ffffff;
> ctx->r[1] = (le32_to_cpuvp(key + 3) >> 2) & 0x3ffff03;
> ctx->r[2] = (le32_to_cpuvp(key + 6) >> 4) & 0x3ffc0ff;
> ctx->r[3] = (le32_to_cpuvp(key + 9) >> 6) & 0x3f03fff;
> ctx->r[4] = (le32_to_cpuvp(key + 12) >> 8) & 0x00fffff;
> +#else
> + t0 = le32_to_cpuvp(key + 0);
> + t1 = le32_to_cpuvp(key + 4);
> + t2 = le32_to_cpuvp(key + 8);
> + t3 = le32_to_cpuvp(key +12);
> + ctx->r[0] = t0 & 0x3ffffff; t0 >>= 26; t0 |= t1 << 6;
> + ctx->r[1] = t0 & 0x3ffff03; t1 >>= 20; t1 |= t2 << 12;
> + ctx->r[2] = t1 & 0x3ffc0ff; t2 >>= 14; t2 |= t3 << 18;
> + ctx->r[3] = t2 & 0x3f03fff; t3 >>= 8;
> + ctx->r[4] = t3 & 0x00fffff;
> +#endif
> ctx->s[0] = le32_to_cpuvp(key + 16);
> ctx->s[1] = le32_to_cpuvp(key + 20);
> ctx->s[2] = le32_to_cpuvp(key + 24);
> @@ -267,6 +283,9 @@ static unsigned int poly1305_generic_blocks(struct
> poly1305_ctx *ctx, const u8 *
> u32 s1, s2, s3, s4;
> u32 h0, h1, h2, h3, h4;
> u64 d0, d1, d2, d3, d4;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + u32 t0, t1, t2, t3;
> +#endif
>
> r0 = ctx->r[0];
> r1 = ctx->r[1];
> @@ -287,11 +306,23 @@ static unsigned int poly1305_generic_blocks(struct
> poly1305_ctx *ctx, const u8 *
>
> while (likely(srclen >= POLY1305_BLOCK_SIZE)) {
> /* h += m[i] */
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> h0 += (le32_to_cpuvp(src + 0) >> 0) & 0x3ffffff;
> h1 += (le32_to_cpuvp(src + 3) >> 2) & 0x3ffffff;
> h2 += (le32_to_cpuvp(src + 6) >> 4) & 0x3ffffff;
> h3 += (le32_to_cpuvp(src + 9) >> 6) & 0x3ffffff;
> h4 += (le32_to_cpuvp(src + 12) >> 8) | hibit;
> +#else
> + t0 = le32_to_cpuvp(src + 0);
> + t1 = le32_to_cpuvp(src + 4);
> + t2 = le32_to_cpuvp(src + 8);
> + t3 = le32_to_cpuvp(src + 12);
> + h0 += t0 & 0x3ffffff;
> + h1 += sr((((u64)t1 << 32) | t0), 26) & 0x3ffffff;
> + h2 += sr((((u64)t2 << 32) | t1), 20) & 0x3ffffff;
> + h3 += sr((((u64)t3 << 32) | t2), 14) & 0x3ffffff;
> + h4 += (t3 >> 8) | hibit;
> +#endif
>
> /* h *= r */
> d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) + mlt(h3, s2)
> + mlt(h4, s1);
> --
> 2.5.5
>
>
> _______________________________________________
> WireGuard mailing list
> WireGuard at lists.zx2c4.com
> http://lists.zx2c4.com/mailman/listinfo/wireguard
>
--
Jason A. Donenfeld
Deep Space Explorer
fr: +33 6 51 90 82 66
us: +1 513 476 1200
www.jasondonenfeld.com
www.zx2c4.com
zx2c4.com/keys/AB9942E6D4A4CFC3412620A749FC7012A5DE03AE.asc
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.zx2c4.com/pipermail/wireguard/attachments/20160920/67aae5b3/attachment.html>
More information about the WireGuard
mailing list