net: listening_hash get a spinlock per bucket

This patch prepares RCU migration of listening_hash table for
TCP/DCCP protocols.

listening_hash table being small (32 slots per protocol), we add
a spinlock for each slot, instead of a single rwlock for whole table.

This should reduce hold time of readers, and writers concurrency.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4818960..62d2dd0d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -99,6 +99,11 @@
 	struct hlist_head	chain;
 };
 
+struct inet_listen_hashbucket {
+	spinlock_t		lock;
+	struct hlist_head	head;
+};
+
 /* This is for listening sockets, thus all sockets which possess wildcards. */
 #define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
 
@@ -123,22 +128,21 @@
 	unsigned int			bhash_size;
 	/* Note : 4 bytes padding on 64 bit arches */
 
-	/* All sockets in TCP_LISTEN state will be in here.  This is the only
-	 * table where wildcard'd TCP sockets can exist.  Hash function here
-	 * is just local port number.
-	 */
-	struct hlist_head		listening_hash[INET_LHTABLE_SIZE];
+	struct kmem_cache		*bind_bucket_cachep;
 
 	/* All the above members are written once at bootup and
 	 * never written again _or_ are predominantly read-access.
 	 *
 	 * Now align to a new cache line as all the following members
-	 * are often dirty.
+	 * might be often dirty.
 	 */
-	rwlock_t			lhash_lock ____cacheline_aligned;
-	atomic_t			lhash_users;
-	wait_queue_head_t		lhash_wait;
-	struct kmem_cache			*bind_bucket_cachep;
+	/* All sockets in TCP_LISTEN state will be in here.  This is the only
+	 * table where wildcard'd TCP sockets can exist.  Hash function here
+	 * is just local port number.
+	 */
+	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
+					____cacheline_aligned_in_smp;
+
 };
 
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
@@ -236,26 +240,7 @@
 
 extern void inet_put_port(struct sock *sk);
 
-extern void inet_listen_wlock(struct inet_hashinfo *hashinfo);
-
-/*
- * - We may sleep inside this lock.
- * - If sleeping is not required (or called from BH),
- *   use plain read_(un)lock(&inet_hashinfo.lhash_lock).
- */
-static inline void inet_listen_lock(struct inet_hashinfo *hashinfo)
-{
-	/* read_lock synchronizes to candidates to writers */
-	read_lock(&hashinfo->lhash_lock);
-	atomic_inc(&hashinfo->lhash_users);
-	read_unlock(&hashinfo->lhash_lock);
-}
-
-static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo)
-{
-	if (atomic_dec_and_test(&hashinfo->lhash_users))
-		wake_up(&hashinfo->lhash_wait);
-}
+void inet_hashinfo_init(struct inet_hashinfo *h);
 
 extern void __inet_hash_nolisten(struct sock *sk);
 extern void inet_hash(struct sock *sk);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index bdf784c..8b63394 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -44,12 +44,7 @@
 
 EXPORT_SYMBOL_GPL(dccp_orphan_count);
 
-struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
-	.lhash_lock	= RW_LOCK_UNLOCKED,
-	.lhash_users	= ATOMIC_INIT(0),
-	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-};
-
+struct inet_hashinfo dccp_hashinfo;
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
 
 /* the maximum queue length for tx in packets. 0 is no limit */
@@ -1030,6 +1025,7 @@
 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
 		     FIELD_SIZEOF(struct sk_buff, cb));
 
+	inet_hashinfo_init(&dccp_hashinfo);
 	dccp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("dccp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 41b3672..1cb154e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@
 		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
 			goto skip_listen_ht;
 
-		inet_listen_lock(hashinfo);
 		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 			struct sock *sk;
 			struct hlist_node *node;
+			struct inet_listen_hashbucket *ilb;
 
 			num = 0;
-			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+			ilb = &hashinfo->listening_hash[i];
+			spin_lock_bh(&ilb->lock);
+			sk_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
 				if (num < s_num) {
@@ -742,7 +744,7 @@
 					goto syn_recv;
 
 				if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-					inet_listen_unlock(hashinfo);
+					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
 
@@ -751,7 +753,7 @@
 					goto next_listen;
 
 				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
-					inet_listen_unlock(hashinfo);
+					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
 
@@ -760,12 +762,12 @@
 				cb->args[4] = 0;
 				++num;
 			}
+			spin_unlock_bh(&ilb->lock);
 
 			s_num = 0;
 			cb->args[3] = 0;
 			cb->args[4] = 0;
 		}
-		inet_listen_unlock(hashinfo);
 skip_listen_ht:
 		cb->args[0] = 1;
 		s_i = num = s_num = 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fd269cf..377d004 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -111,35 +111,6 @@
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
 /*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
-	__acquires(hashinfo->lhash_lock)
-{
-	write_lock(&hashinfo->lhash_lock);
-
-	if (atomic_read(&hashinfo->lhash_users)) {
-		DEFINE_WAIT(wait);
-
-		for (;;) {
-			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
-						  &wait, TASK_UNINTERRUPTIBLE);
-			if (!atomic_read(&hashinfo->lhash_users))
-				break;
-			write_unlock_bh(&hashinfo->lhash_lock);
-			schedule();
-			write_lock_bh(&hashinfo->lhash_lock);
-		}
-
-		finish_wait(&hashinfo->lhash_wait, &wait);
-	}
-}
-
-/*
  * Don't inline this cruft. Here are some nice properties to exploit here. The
  * BSD API does not allow a listening sock to specify the remote port nor the
  * remote address for the connection. So always assume those are both
@@ -191,25 +162,25 @@
 				    const int dif)
 {
 	struct sock *sk = NULL;
-	const struct hlist_head *head;
+	struct inet_listen_hashbucket *ilb;
 
-	read_lock(&hashinfo->lhash_lock);
-	head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-	if (!hlist_empty(head)) {
-		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
+	spin_lock(&ilb->lock);
+	if (!hlist_empty(&ilb->head)) {
+		const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
 
 		if (inet->num == hnum && !sk->sk_node.next &&
 		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 		    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
 			goto sherry_cache;
-		sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
+		sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
 	}
 	if (sk) {
 sherry_cache:
 		sock_hold(sk);
 	}
-	read_unlock(&hashinfo->lhash_lock);
+	spin_unlock(&ilb->lock);
 	return sk;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -389,8 +360,7 @@
 static void __inet_hash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
-	rwlock_t *lock;
+	struct inet_listen_hashbucket *ilb;
 
 	if (sk->sk_state != TCP_LISTEN) {
 		__inet_hash_nolisten(sk);
@@ -398,14 +368,12 @@
 	}
 
 	WARN_ON(!sk_unhashed(sk));
-	list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-	lock = &hashinfo->lhash_lock;
+	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
-	inet_listen_wlock(hashinfo);
-	__sk_add_node(sk, list);
+	spin_lock(&ilb->lock);
+	__sk_add_node(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
-	wake_up(&hashinfo->lhash_wait);
+	spin_unlock(&ilb->lock);
 }
 
 void inet_hash(struct sock *sk)
@@ -420,29 +388,27 @@
 
 void inet_unhash(struct sock *sk)
 {
-	rwlock_t *lock;
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 
 	if (sk_unhashed(sk))
-		goto out;
+		return;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		local_bh_disable();
-		inet_listen_wlock(hashinfo);
-		lock = &hashinfo->lhash_lock;
+		struct inet_listen_hashbucket *ilb;
+
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		spin_lock_bh(&ilb->lock);
 		if (__sk_del_node_init(sk))
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		spin_unlock_bh(&ilb->lock);
 	} else {
-		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+		rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
 		write_lock_bh(lock);
 		if (__sk_nulls_del_node_init_rcu(sk))
 			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		write_unlock_bh(lock);
 	}
-
-	write_unlock_bh(lock);
-out:
-	if (sk->sk_state == TCP_LISTEN)
-		wake_up(&hashinfo->lhash_wait);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
@@ -556,3 +522,13 @@
 }
 
 EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+	int i;
+
+	for (i = 0; i < INET_LHTABLE_SIZE; i++)
+		spin_lock_init(&h->listening_hash[i].lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5559fea..330b08a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,11 +97,7 @@
 }
 #endif
 
-struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
-	.lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
-	.lhash_users = ATOMIC_INIT(0),
-	.lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-};
+struct inet_hashinfo tcp_hashinfo;
 
 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 {
@@ -1874,15 +1870,18 @@
 	struct inet_connection_sock *icsk;
 	struct hlist_node *node;
 	struct sock *sk = cur;
+	struct inet_listen_hashbucket *ilb;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 
 	if (!sk) {
 		st->bucket = 0;
-		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
+		ilb = &tcp_hashinfo.listening_hash[0];
+		spin_lock_bh(&ilb->lock);
+		sk = sk_head(&ilb->head);
 		goto get_sk;
 	}
-
+	ilb = &tcp_hashinfo.listening_hash[st->bucket];
 	++st->num;
 
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1932,8 +1931,11 @@
 		}
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
+	spin_unlock_bh(&ilb->lock);
 	if (++st->bucket < INET_LHTABLE_SIZE) {
-		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
+		ilb = &tcp_hashinfo.listening_hash[st->bucket];
+		spin_lock_bh(&ilb->lock);
+		sk = sk_head(&ilb->head);
 		goto get_sk;
 	}
 	cur = NULL;
@@ -2066,12 +2068,10 @@
 	void *rc;
 	struct tcp_iter_state *st = seq->private;
 
-	inet_listen_lock(&tcp_hashinfo);
 	st->state = TCP_SEQ_STATE_LISTENING;
 	rc	  = listening_get_idx(seq, &pos);
 
 	if (!rc) {
-		inet_listen_unlock(&tcp_hashinfo);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		rc	  = established_get_idx(seq, pos);
 	}
@@ -2103,7 +2103,6 @@
 	case TCP_SEQ_STATE_LISTENING:
 		rc = listening_get_next(seq, v);
 		if (!rc) {
-			inet_listen_unlock(&tcp_hashinfo);
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			rc	  = established_get_first(seq);
 		}
@@ -2130,7 +2129,7 @@
 		}
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
-			inet_listen_unlock(&tcp_hashinfo);
+			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
 		break;
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
@@ -2405,6 +2404,7 @@
 
 void __init tcp_v4_init(void)
 {
+	inet_hashinfo_init(&tcp_hashinfo);
 	if (register_pernet_device(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index c1b4d40..21544b9 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -25,30 +25,30 @@
 void __inet6_hash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	rwlock_t *lock;
 
 	WARN_ON(!sk_unhashed(sk));
 
 	if (sk->sk_state == TCP_LISTEN) {
-		struct hlist_head *list;
+		struct inet_listen_hashbucket *ilb;
 
-		list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-		lock = &hashinfo->lhash_lock;
-		inet_listen_wlock(hashinfo);
-		__sk_add_node(sk, list);
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		spin_lock(&ilb->lock);
+		__sk_add_node(sk, &ilb->head);
+		spin_unlock(&ilb->lock);
 	} else {
 		unsigned int hash;
 		struct hlist_nulls_head *list;
+		rwlock_t *lock;
 
 		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
 		list = &inet_ehash_bucket(hashinfo, hash)->chain;
 		lock = inet_ehash_lockp(hashinfo, hash);
 		write_lock(lock);
 		__sk_nulls_add_node_rcu(sk, list);
+		write_unlock(lock);
 	}
 
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-	write_unlock(lock);
 }
 EXPORT_SYMBOL(__inet6_hash);
 
@@ -126,10 +126,11 @@
 	const struct hlist_node *node;
 	struct sock *result = NULL;
 	int score, hiscore = 0;
+	struct inet_listen_hashbucket *ilb;
 
-	read_lock(&hashinfo->lhash_lock);
-	sk_for_each(sk, node,
-			&hashinfo->listening_hash[inet_lhashfn(net, hnum)]) {
+	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
+	spin_lock(&ilb->lock);
+	sk_for_each(sk, node, &ilb->head) {
 		if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
 				sk->sk_family == PF_INET6) {
 			const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -157,7 +158,7 @@
 	}
 	if (result)
 		sock_hold(result);
-	read_unlock(&hashinfo->lhash_lock);
+	spin_unlock(&ilb->lock);
 	return result;
 }