From patchwork Tue Oct 29 13:50:53 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Matteo Croce X-Patchwork-Id: 1186123 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming-netdev@ozlabs.org Delivered-To: patchwork-incoming-netdev@ozlabs.org Authentication-Results: ozlabs.org; spf=none (no SPF record) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=netdev-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=redhat.com header.i=@redhat.com header.b="hzqfoOYB"; dkim-atps=neutral Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 472Y065ypLz9sQv for ; Wed, 30 Oct 2019 00:51:18 +1100 (AEDT) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S2389049AbfJ2NvR (ORCPT ); Tue, 29 Oct 2019 09:51:17 -0400 Received: from us-smtp-delivery-1.mimecast.com ([207.211.31.120]:24340 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S2388802AbfJ2NvR (ORCPT ); Tue, 29 Oct 2019 09:51:17 -0400 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1572357075; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=y0QBgHuOvRpTy/yL3LM7SM8CMC4Q19dnHgWxMUVqqv8=; b=hzqfoOYBCrA+8Lp0bq5OgOq1FhJw2xraui97bSTFGLwdV7OJWeWtboWgqyjmAcUYp9M4/f 7oqjkhCuqbAvfK297cBpXIfn+My+zDcXmxuMYp3dOe7A3AJ8ElJ9vxQxOl2GvhjWcw0ITB YN4J0jc+oHJUtTMgKFQiPoIaoaCGkAQ= Received: from mail-wr1-f69.google.com (mail-wr1-f69.google.com [209.85.221.69]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-405-OllKlYiONbKn_ovuTqWcBQ-1; Tue, 29 Oct 2019 09:51:14 -0400 Received: by mail-wr1-f69.google.com with SMTP id h4so8433800wrx.15 for ; Tue, 29 Oct 2019 06:51:13 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=Pci9O/Y1uGzGl4VbZQrWCqVtKu7mWU+rWUzr+zqpUus=; b=fwUIm6RpA1rzYr24j9yHtupvPxxvU1H9tPo+OSEap8ttYIjhBTjTwM0k4iCUVj8yRi 8Vm5TywaAt1iZHk72PMRTJMYnh0v/Q9ULUsCdzA75jWeZhWIxoQ4G0MMfC+mMePHlD9X NrKvevnmK6rWcqAxwj5XGPscyk89WRxm4L3UKeld4Yf1x/O7TcNi13Rxjd5hPf2s0MD5 YEYlBcCDyp4dd5DmgSmExoZ1TmNKiEuwoKPPo/IbIIxdMX2Rczwlc+YYuPQ2ntdqK7dN t4pnrmMSwfWTOyWeBAAvWn2i1GXFXKCixSGCRXiQbFG7uZLerTOV8nsad1XbL6slVBrn STNQ== X-Gm-Message-State: APjAAAWqCMPt5fxJR1HsAyF63oilmSxEU9eoOVbNrfkTxV7rtvABIm6E Q47MsPPamZmNpdL1MHgjQwaxeBVgNlyt8ufjvL4J718Xddd5I1BX478yMqJa7VFqKgGUPtKmYEj dQFo6L10bGT5hSm6S X-Received: by 2002:adf:fc10:: with SMTP id i16mr19093180wrr.157.1572357072803; Tue, 29 Oct 2019 06:51:12 -0700 (PDT) X-Google-Smtp-Source: APXvYqxG5wt0T0t9KCOCTG4DWyEMa2fhof6y77Xio3y/jp2lKJcEoqrdOFiVxJqSgiyOI0edNNmXKA== X-Received: by 2002:adf:fc10:: with SMTP id i16mr19093157wrr.157.1572357072538; Tue, 29 Oct 2019 06:51:12 -0700 (PDT) Received: from mcroce-redhat.mxp.redhat.com (nat-pool-mxp-t.redhat.com. [149.6.153.186]) by smtp.gmail.com with ESMTPSA id 189sm2556920wmc.7.2019.10.29.06.51.11 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 29 Oct 2019 06:51:11 -0700 (PDT) From: Matteo Croce To: netdev@vger.kernel.org Cc: Jay Vosburgh , Veaceslav Falico , Andy Gospodarek , "David S . Miller " , Stanislav Fomichev , Daniel Borkmann , Song Liu , Alexei Starovoitov , Paul Blakey , linux-kernel@vger.kernel.org Subject: [PATCH net-next v2 4/4] bonding: balance ICMP echoes in layer3+4 mode Date: Tue, 29 Oct 2019 14:50:53 +0100 Message-Id: <20191029135053.10055-5-mcroce@redhat.com> X-Mailer: git-send-email 2.21.0 In-Reply-To: <20191029135053.10055-1-mcroce@redhat.com> References: <20191029135053.10055-1-mcroce@redhat.com> MIME-Version: 1.0 X-MC-Unique: OllKlYiONbKn_ovuTqWcBQ-1 X-Mimecast-Spam-Score: 0 Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org The bonding uses the L4 ports to balance flows between slaves. As the ICMP protocol has no ports, those packets are sent all to the same device: # tcpdump -qltnni veth0 ip |sed 's/^/0: /' & # tcpdump -qltnni veth1 ip |sed 's/^/1: /' & # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64 # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64 # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64 But some ICMP packets have an Identifier field which is used to match packets within sessions, let's use this value in the hash function to balance these packets between bond slaves: # ping -qc1 192.168.0.2 0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64 0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64 # ping -qc1 192.168.0.2 1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64 Aso, let's use a flow_dissector_key which defines FLOW_DISSECTOR_KEY_ICMP, so we can balance pings encapsulated in a tunnel when using mode encap3+4: # ping -q 192.168.1.2 -c1 0: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 585, seq 1, length 64 0: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 585, seq 1, length 64 # ping -q 192.168.1.2 -c1 1: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 586, seq 1, length 64 1: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 586, seq 1, length 64 Signed-off-by: Matteo Croce Reviewed-by: Nikolay Aleksandrov --- drivers/net/bonding/bond_main.c | 77 ++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 21d8fcc83c9c..3e496e746cc6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -200,6 +200,51 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0); unsigned int bond_net_id __read_mostly; +static const struct flow_dissector_key flow_keys_bonding_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC, + .offset = offsetof(struct flow_keys, addrs.tipckey), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, + { + .key_id = FLOW_DISSECTOR_KEY_ICMP, + .offset = offsetof(struct flow_keys, icmp), + }, + { + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), + }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, +}; + +static struct flow_dissector flow_keys_bonding __read_mostly; + /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); @@ -3263,10 +3308,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const struct iphdr *iph; int noff, proto = -1; - if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) - return skb_flow_dissect_flow_keys(skb, fk, 0); + if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) { + memset(fk, 0, sizeof(*fk)); + return __skb_flow_dissect(NULL, skb, &flow_keys_bonding, + fk, NULL, 0, 0, 0, 0); + } fk->ports.ports = 0; + memset(&fk->icmp, 0, sizeof(fk->icmp)); noff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP)) { if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) @@ -3286,8 +3335,14 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, } else { return false; } - if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) - fk->ports.ports = skb_flow_get_ports(skb, noff, proto); + if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) { + if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) + skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data, + skb_transport_offset(skb), + skb_headlen(skb)); + else + fk->ports.ports = skb_flow_get_ports(skb, noff, proto); + } return true; } @@ -3314,10 +3369,14 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) return bond_eth_hash(skb); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || - bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) + bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) { hash = bond_eth_hash(skb); - else - hash = (__force u32)flow.ports.ports; + } else { + if (flow.icmp.id) + memcpy(&hash, &flow.icmp, sizeof(hash)); + else + memcpy(&hash, &flow.ports.ports, sizeof(hash)); + } hash ^= (__force u32)flow_get_u32_dst(&flow) ^ (__force u32)flow_get_u32_src(&flow); hash ^= (hash >> 16); @@ -4901,6 +4960,10 @@ static int __init bonding_init(void) goto err; } + skb_flow_dissector_init(&flow_keys_bonding, + flow_keys_bonding_keys, + ARRAY_SIZE(flow_keys_bonding_keys)); + register_netdevice_notifier(&bond_netdev_notifier); out: return res;