udp: introduce sk_for_each_rcu_safenext()
Corey Minyard found a race added in commit 271b72c7fa82c2c7a795bc16896149933110672d
(udp: RCU handling for Unicast packets.)
"If the socket is moved from one list to another list in-between the
time the hash is calculated and the next field is accessed, and the
socket has moved to the end of the new list, the traversal will not
complete properly on the list it should have, since the socket will
be on the end of the new list and there's not a way to tell it's on a
new list and restart the list traversal. I think that this can be
solved by pre-fetching the "next" field (with proper barriers) before
checking the hash."
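This patch corrects the problem by introducing a new
sk_for_each_rcu_safenext() macro, which fetches the "next" pointer of
each socket before the loop body runs, and by using it in place of
sk_for_each_rcu() in the IPv4 and IPv6 UDP lookup loops.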
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e649bd3..3ba2998 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -383,5 +383,22 @@
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
pos = rcu_dereference(pos->next))
+/**
+ * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_node to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ * @next: the &struct hlist_node to use as a next cursor
+ *
+ * Special version of hlist_for_each_entry_rcu that loads @pos->next into
+ * @next (followed by smp_rmb()) before the loop body runs for each entry.
+ */
+#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
+ for (pos = rcu_dereference((head)->first); \
+ pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \
+ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
+ pos = rcu_dereference(next))
+
#endif /* __KERNEL__ */
#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 0bea25d..a4f6d3f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,8 +419,8 @@
#define sk_for_each(__sk, node, list) \
hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu(__sk, node, list) \
- hlist_for_each_entry_rcu(__sk, node, list, sk_node)
+#define sk_for_each_rcu_safenext(__sk, node, list, next) \
+ hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
#define sk_for_each_from(__sk, node) \
if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
hlist_for_each_entry_from(__sk, node, sk_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ced8203..c3ecec8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -256,7 +256,7 @@
int dif, struct udp_table *udptable)
{
struct sock *sk, *result;
- struct hlist_node *node;
+ struct hlist_node *node, *next;
unsigned short hnum = ntohs(dport);
unsigned int hash = udp_hashfn(net, hnum);
struct udp_hslot *hslot = &udptable->hash[hash];
@@ -266,7 +266,7 @@
begin:
result = NULL;
badness = -1;
- sk_for_each_rcu(sk, node, &hslot->head) {
+ sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
/*
* lockless reader, and SLAB_DESTROY_BY_RCU items:
* We must check this item was not moved to another chain
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1d9790e..32d914d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@
int dif, struct udp_table *udptable)
{
struct sock *sk, *result;
- struct hlist_node *node;
+ struct hlist_node *node, *next;
unsigned short hnum = ntohs(dport);
unsigned int hash = udp_hashfn(net, hnum);
struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,7 +108,7 @@
begin:
result = NULL;
badness = -1;
- sk_for_each_rcu(sk, node, &hslot->head) {
+ sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
/*
* lockless reader, and SLAB_DESTROY_BY_RCU items:
* We must check this item was not moved to another chain