From patchwork Tue Oct 27 00:03:02 2009 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Benjamin LaHaise X-Patchwork-Id: 36949 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.176.167]) by ozlabs.org (Postfix) with ESMTP id B2E03B7BB8 for ; Tue, 27 Oct 2009 11:03:10 +1100 (EST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755216AbZJ0ADA (ORCPT ); Mon, 26 Oct 2009 20:03:00 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1755056AbZJ0AC7 (ORCPT ); Mon, 26 Oct 2009 20:02:59 -0400 Received: from kanga.kvack.org ([205.233.56.17]:59276 "EHLO kanga.kvack.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755035AbZJ0AC7 (ORCPT ); Mon, 26 Oct 2009 20:02:59 -0400 Received: by kanga.kvack.org (Postfix, from userid 63042) id 107386B005A; Mon, 26 Oct 2009 20:03:03 -0400 (EDT) Date: Mon, 26 Oct 2009 20:03:02 -0400 From: Benjamin LaHaise To: netdev@vger.kernel.org Subject: [RFC PATCH] fib_hash: improve route deletion scaling on interface drop with lots of interfaces Message-ID: <20091027000302.GA3141@kvack.org> Mime-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.4.2.2i Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org Hi folks, Below is a patch to improve the scaling of interface destruction in fib_hash. The general idea is to tie the fib_alias structure into a list off of net_device and walk that list during a fib_flush() caused by an interface drop. This makes the resulting flush only have to walk the number of routes attached to an interface rather than the number of routes attached to all interfaces at the expense of a couple of additional pointers in struct fib_alias. This patch is against Linus' tree. I'll post against net-next after a bit more testing and feedback. With 20,000 interfaces & routes, interface deletion time improves from 53s to 40s. Note that this is with other changes applied to improve sysfs and procfs scaling, as otherwise those are the bottleneck. Next up in the network code is rt_cache_flush(). Comments? -ben --- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 812a5f3..982045b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -856,6 +856,7 @@ struct net_device /* delayed register/unregister */ struct list_head todo_list; + struct list_head fib_list; /* device index hash chain */ struct hlist_node index_hlist; diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ef91fe9..0c32193 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -149,7 +149,7 @@ struct fib_table { int (*tb_delete)(struct fib_table *, struct fib_config *); int (*tb_dump)(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); - int (*tb_flush)(struct fib_table *table); + int (*tb_flush)(struct fib_table *table, struct net_device *dev); void (*tb_select_default)(struct fib_table *table, const struct flowi *flp, struct fib_result *res); diff --git a/net/core/dev.c b/net/core/dev.c index b8f74cf..9f6f736 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5173,6 +5173,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->fib_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e2f9505..0283b1f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -128,18 +128,19 @@ void fib_select_default(struct net *net, tb->tb_select_default(tb, flp, res); } -static void fib_flush(struct net *net) +static void fib_flush(struct net_device *dev) { int flushed = 0; struct fib_table *tb; struct hlist_node *node; struct hlist_head *head; unsigned int h; + struct net *net = dev_net(dev); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry(tb, node, head, tb_hlist) - flushed += tb->tb_flush(tb); + flushed += tb->tb_flush(tb, dev); } if (flushed) @@ -805,7 +806,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) - fib_flush(dev_net(dev)); + fib_flush(dev); } } #undef LOCAL_OK @@ -895,7 +896,7 @@ static void nl_fib_lookup_exit(struct net *net) static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) - fib_flush(dev_net(dev)); + fib_flush(dev); rt_cache_flush(dev_net(dev), 0); arp_ifdown(dev); } @@ -1009,7 +1010,7 @@ static void __net_exit ip_fib_net_exit(struct net *net) head = &net->ipv4.fib_table_hash[i]; hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); - tb->tb_flush(tb); + tb->tb_flush(tb, NULL); kfree(tb); } } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index ecd3945..d08ba2f 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -377,6 +377,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) u8 tos = cfg->fc_tos; __be32 key; int err; + struct net_device *dev; if (cfg->fc_dst_len > 32) return -EINVAL; @@ -516,6 +517,10 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_type = cfg->fc_type; new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; + new_fa->fa_fib_node = f; + new_fa->fa_fz = fz; + + dev = fi->fib_dev; /* * Insert new entry to the list. @@ -527,6 +532,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : &f->fn_alias)); fib_hash_genid++; + list_add_tail(&new_fa->fa_dev_list, &dev->fib_list); write_unlock_bh(&fib_hash_lock); if (new_f) @@ -605,6 +611,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) kill_fn = 0; write_lock_bh(&fib_hash_lock); list_del(&fa->fa_list); + list_del(&fa->fa_dev_list); if (list_empty(&f->fn_alias)) { hlist_del(&f->fn_hash); kill_fn = 1; @@ -643,6 +650,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx) if (fi && (fi->fib_flags&RTNH_F_DEAD)) { write_lock_bh(&fib_hash_lock); list_del(&fa->fa_list); + list_del(&fa->fa_dev_list); if (list_empty(&f->fn_alias)) { hlist_del(&f->fn_hash); kill_f = 1; @@ -662,17 +670,69 @@ static int fn_flush_list(struct fn_zone *fz, int idx) return found; } -static int fn_hash_flush(struct fib_table *tb) +static int fn_flush_alias(struct fn_hash *table, struct fib_alias *fa) +{ + int kill_f = 0; + struct fib_info *fi = fa->fa_info; + int found = 0; + + if (!fi) + BUG(); + + if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + struct fib_node *f = fa->fa_fib_node; + struct fn_zone *fz = fa->fa_fz; + + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + list_del(&fa->fa_dev_list); + if (list_empty(&f->fn_alias)) { + hlist_del(&f->fn_hash); + kill_f = 1; + } + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fn_free_alias(fa, f); + found++; + + if (kill_f) + fn_free_node(f); + fz->fz_nent--; + } + + return found; +} + +static int fn_flush_dev(struct fn_hash *table, struct net_device *dev) +{ + int found = 0; + struct list_head *pos, *next; + + list_for_each_safe(pos, next, &dev->fib_list) { + struct fib_alias *fa = + container_of(pos, struct fib_alias, fa_dev_list); + found += fn_flush_alias(table, fa); + } + + return found; +} + +static int fn_hash_flush(struct fib_table *tb, struct net_device *dev) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; int found = 0; - for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { - int i; + if (dev) { + found = fn_flush_dev(table, dev); + } else { + for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + int i; - for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); + for (i = fz->fz_divisor - 1; i >= 0; i--) + found += fn_flush_list(fz, i); + } } return found; } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 637b133..9f2fad1 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -5,9 +5,17 @@ #include #include +struct fib_node; +struct fn_zone; + struct fib_alias { struct list_head fa_list; + struct list_head fa_dev_list; struct fib_info *fa_info; +#ifdef CONFIG_IP_FIB_HASH + struct fib_node *fa_fib_node; + struct fn_zone *fa_fz; +#endif u8 fa_tos; u8 fa_type; u8 fa_scope; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 291bdf5..4805772 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1786,7 +1786,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index) /* * Caller must hold RTNL. */ -static int fn_trie_flush(struct fib_table *tb) +static int fn_trie_flush(struct fib_table *tb, struct net_device *dev) { struct trie *t = (struct trie *) tb->tb_data; struct leaf *l, *ll = NULL;