[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE = 512 slots) for
each LISTEN socket, regardless of parameters such as the listen backlog.
On x86_64, this means order-1 allocations (which might fail), even for 'small'
sockets expecting few connections. Conversely, a huge server wanting a backlog
of 50000 is slowed down a bit by this fixed limit.
This patch makes the size of the listen hash table a dynamic parameter,
depending on (a sketch of the sizing logic follows the list):
- net.core.somaxconn tunable (default is 128)
- net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128, depending
  on available memory)
- backlog value given by user application (2nd parameter of listen())
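As a rough sketch of that sizing (the helper name below is made up for
illustration, and the exact clamping constants are assumptions rather than a
verbatim copy of this patch), the backlog hint is bounded by
tcp_max_syn_backlog, given a small floor, and rounded up to a power of two so
the hash function can mask with (size - 1):

	#include <linux/kernel.h>
	#include <linux/log2.h>

	extern int sysctl_max_syn_backlog;

	/* Illustrative only: derive a SYN-queue hash size from the backlog
	 * hint.  The backlog passed to listen() has already been capped to
	 * net.core.somaxconn by sys_listen().
	 */
	static u32 synq_size_from_backlog(u32 backlog)
	{
		u32 nr_table_entries;

		/* Never allocate more than the global SYN backlog allows. */
		nr_table_entries = min_t(u32, backlog, sysctl_max_syn_backlog);
		/* Keep a small floor so tiny backlogs get a usable table. */
		nr_table_entries = max_t(u32, nr_table_entries, 8);
		/* Power-of-two size: inet_synq_hash() masks with (size - 1). */
		return roundup_pow_of_two(nr_table_entries + 1);
	}

Under this sketch, a plain listen(fd, 128) gets a 256-slot table instead of
the previous fixed 512 slots.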
For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of
kmalloc().
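A minimal sketch of that allocation choice (the size computation assumes
struct listen_sock ends with the syn_table pointer array used in the diff
below; the exact GFP flags are an assumption, not necessarily what this patch
uses):

	struct listen_sock *lopt;
	size_t lopt_size = sizeof(struct listen_sock) +
			   nr_table_entries * sizeof(struct request_sock *);

	if (lopt_size > PAGE_SIZE)
		/* A high-order kmalloc() may fail under fragmentation;
		 * vmalloc() only needs order-0 pages.
		 */
		lopt = __vmalloc(lopt_size, GFP_KERNEL | __GFP_ZERO,
				 PAGE_KERNEL);
	else
		lopt = kzalloc(lopt_size, GFP_KERNEL);
	if (lopt == NULL)
		return -ENOMEM;

Either way the table is zeroed, so empty buckets start out as NULL.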
We still limit memory allocation with the two existing tunables (somaxconn &
tcp_max_syn_backlog), so for standard setups this patch actually reduces RAM
usage.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index edcf093..4a81d54 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -204,7 +204,7 @@
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+ err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 96bbe2a..9d68837 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -343,7 +343,7 @@
EXPORT_SYMBOL_GPL(inet_csk_route_req);
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u16 synq_hsize)
+ const u32 rnd, const u32 synq_hsize)
{
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 22ef8bd..5fbf965 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -715,7 +715,7 @@
return dopt;
}
-struct request_sock_ops tcp_request_sock_ops = {
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_v4_send_synack,
@@ -1385,7 +1385,7 @@
if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
- icsk = inet_csk(st->syn_wait_sk);
+ icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next;
while (1) {
while (req) {
@@ -1395,7 +1395,7 @@
}
req = req->dl_next;
}
- if (++st->sbucket >= TCP_SYNQ_HSIZE)
+ if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];