inet: frags: use rhashtables for reassembly units
commit 648700f76b03b7e8149d13cc2bdb3355035258a9 upstream.
Some applications still rely on IP fragmentation, and to be fair, the Linux
reassembly unit does not work under any serious load.
It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)
A work queue is supposed to garbage collect items when the host is under memory
pressure, and to do a hash rebuild, changing the seed used in hash computations.
This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if host is under fire.
Then there is the problem of sharing this hash table for all netns.
It is time to switch to rhashtables, and allocate one of them per netns
to speed up netns dismantle, since this is a critical metric these days.
Lookup is now using RCU. A followup patch will even remove
the refcount hold/release left from prior implementation and save
a couple of atomic operations.
Before this patch, 16 cpus (16 RX queue NIC) could not handle more
than 1 Mpps frags DDOS.
After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB
of storage for the fragments (exact number depends on frags being evicted
after timeout)
$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608
A followup patch will change the limits for 64bit arches.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Florian Westphal <fw@strlen.de>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Alexander Aring <alex.aring@gmail.com>
Cc: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backported to 4.4: adjust context]
Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 12589f0..623eb82 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -1,7 +1,11 @@
#ifndef __NET_FRAG_H__
#define __NET_FRAG_H__
+#include <linux/rhashtable.h>
+
struct netns_frags {
+ struct rhashtable rhashtable ____cacheline_aligned_in_smp;
+
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem ____cacheline_aligned_in_smp;
/* sysctls */
@@ -24,12 +28,30 @@
INET_FRAG_COMPLETE = BIT(2),
};
+struct frag_v4_compare_key {
+ __be32 saddr;
+ __be32 daddr;
+ u32 user;
+ u32 vif;
+ __be16 id;
+ u16 protocol;
+};
+
+struct frag_v6_compare_key {
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ u32 user;
+ __be32 id;
+ u32 iif;
+};
+
/**
* struct inet_frag_queue - fragment queue
*
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
* @timer: queue expiration timer
- * @list: hash bucket list
+ * @lock: spinlock protecting this frag
* @refcnt: reference count of the queue
* @fragments: received fragments head
* @fragments_tail: received fragments tail
@@ -39,12 +61,16 @@
* @flags: fragment queue flags
* @max_size: maximum received fragment size
* @net: namespace that this frag belongs to
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
+ * @rcu: rcu head for freeing deferral
*/
struct inet_frag_queue {
- spinlock_t lock;
+ struct rhash_head node;
+ union {
+ struct frag_v4_compare_key v4;
+ struct frag_v6_compare_key v6;
+ } key;
struct timer_list timer;
- struct hlist_node list;
+ spinlock_t lock;
atomic_t refcnt;
struct sk_buff *fragments;
struct sk_buff *fragments_tail;
@@ -53,45 +79,13 @@
int meat;
__u8 flags;
u16 max_size;
- struct netns_frags *net;
- struct hlist_node list_evictor;
-};
-
-#define INETFRAGS_HASHSZ 1024
-
-/* averaged:
- * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
- * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
- * struct frag_queue))
- */
-#define INETFRAGS_MAXDEPTH 128
-
-struct inet_frag_bucket {
- struct hlist_head chain;
- spinlock_t chain_lock;
+ struct netns_frags *net;
+ struct rcu_head rcu;
};
struct inet_frags {
- struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
-
- struct work_struct frags_work;
- unsigned int next_bucket;
- unsigned long last_rebuild_jiffies;
- bool rebuild;
-
- /* The first call to hashfn is responsible to initialize
- * rnd. This is best done with net_get_random_once.
- *
- * rnd_seqlock is used to let hash insertion detect
- * when it needs to re-lookup the hash chain to use.
- */
- u32 rnd;
- seqlock_t rnd_seqlock;
int qsize;
- unsigned int (*hashfn)(const struct inet_frag_queue *);
- bool (*match)(const struct inet_frag_queue *q,
- const void *arg);
void (*constructor)(struct inet_frag_queue *q,
const void *arg);
void (*destructor)(struct inet_frag_queue *);
@@ -99,6 +93,7 @@
void (*frag_expire)(unsigned long data);
struct kmem_cache *frags_cachep;
const char *frags_cache_name;
+ struct rhashtable_params rhash_params;
};
int inet_frags_init(struct inet_frags *);
@@ -107,15 +102,13 @@
static inline int inet_frags_init_net(struct netns_frags *nf)
{
atomic_set(&nf->mem, 0);
- return 0;
+ return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
}
void inet_frags_exit_net(struct netns_frags *nf);
void inet_frag_kill(struct inet_frag_queue *q);
void inet_frag_destroy(struct inet_frag_queue *q);
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
- struct inet_frags *f, void *key, unsigned int hash);
-
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
const char *prefix);
@@ -127,7 +120,7 @@
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
{
- return !hlist_unhashed(&q->list_evictor);
+ return false;
}
/* Memory Tracking Functions. */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 9d8eace..2067bbe 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -505,17 +505,8 @@
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
};
-struct ip6_create_arg {
- __be32 id;
- u32 user;
- const struct in6_addr *src;
- const struct in6_addr *dst;
- int iif;
- u8 ecn;
-};
-
void ip6_frag_init(struct inet_frag_queue *q, const void *a);
-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+extern const struct rhashtable_params ip6_rhash_params;
/*
* Equivalent of ipv4 struct ip
@@ -523,11 +514,6 @@
struct frag_queue {
struct inet_frag_queue q;
- __be32 id; /* fragment id */
- u32 user;
- struct in6_addr saddr;
- struct in6_addr daddr;
-
int iif;
unsigned int csum;
__u16 nhoffset;