net: introduce SO_INCOMING_CPU

An alternative to RPS/RFS is to use hardware support for multiple queues.

Then split a set of millions of sockets into worker threads, each one using epoll() to manage events on its own socket pool.

Ideally, we want one thread per RX/TX queue/cpu, but we have no way to know after accept() or connect() on which queue/cpu a socket is managed.

We normally use one cpu per RX queue (IRQ smp_affinity being properly set), so remembering in the socket structure which cpu delivered the last packet is enough to solve the problem.

After accept(), connect(), or even file descriptor passing between processes, applications can use:

    int cpu;
    socklen_t len = sizeof(cpu);

    getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);

And use this information to put the socket into the right silo for optimal performance, as the whole networking stack should then run on the appropriate cpu, without needing to send IPIs (RPS/RFS).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 2c8c56e15df3d4c2af3d656e44feb18789f75837 [log] [tgz]
author: Eric Dumazet <edumazet@google.com> Tue Nov 11 05:54:28 2014 -0800
committer: David S. Miller <davem@davemloft.net> Tue Nov 11 13:00:06 2014 -0500
tree: e3c81c868a7c14ca2bac7efd69b6b21e25c355d4
parent: 3d97379a67486bc481ab5b8f7aa5b7ceb6154a95 [diff]
diff --git a/net/core/sock.c b/net/core/sock.c
index ac56dd0..0725cf0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c

@@ -1213,6 +1213,10 @@
 		v.val = sk->sk_max_pacing_rate;
 		break;
 
+	case SO_INCOMING_CPU:
+		v.val = sk->sk_incoming_cpu;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -1517,6 +1521,7 @@
 
 		newsk->sk_err	   = 0;
 		newsk->sk_priority = 0;
+		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8893598..2c6a955 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c

@@ -1663,6 +1663,7 @@
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
+	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5d0fdca..d137516 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c

@@ -1445,6 +1445,7 @@
 	if (inet_sk(sk)->inet_daddr) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
 	}
 
 	rc = sock_queue_rcv_skb(sk, skb);

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fd8e50b..1985b49 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c

@@ -1456,6 +1456,7 @@
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
+	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b756355..d1fe362 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c

@@ -577,6 +577,7 @@
 	if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
 	}
 
 	rc = sock_queue_rcv_skb(sk, skb);

diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d49dc2ed..ce469d6 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c

@@ -205,9 +205,10 @@
 	if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
 		goto out_free;
 
-	if (!sctp_ulpevent_is_notification(event))
+	if (!sctp_ulpevent_is_notification(event)) {
 		sk_mark_napi_id(sk, skb);
-
+		sk_incoming_cpu_update(sk);
+	}
 	/* Check if the user wishes to receive this event.  */
 	if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
 		goto out_free;
commit	2c8c56e15df3d4c2af3d656e44feb18789f75837	[log] [tgz]
author	Eric Dumazet <edumazet@google.com>	Tue Nov 11 05:54:28 2014 -0800
committer	David S. Miller <davem@davemloft.net>	Tue Nov 11 13:00:06 2014 -0500
tree	e3c81c868a7c14ca2bac7efd69b6b21e25c355d4
parent	3d97379a67486bc481ab5b8f7aa5b7ceb6154a95 [diff]