Use of percpu_counter structure to track count of orphaned
sockets is causing problems on modern hosts with 256 cpus
or more.
Stefan Bach reported serious spinlock contention in real workloads,
which I was able to reproduce with a netfilter rule dropping
incoming FIN packets.
    53.56%  server  [kernel.kallsyms]      [k] queued_spin_lock_slowpath
            |
            ---queued_spin_lock_slowpath
               |
                --53.51%--_raw_spin_lock_irqsave
                          |
                           --53.51%--__percpu_counter_sum
                                     tcp_check_oom
                                     |
                                     |--39.03%--__tcp_close
                                     |          tcp_close
                                     |          inet_release
                                     |          inet6_release
                                     |          sock_close
                                     |          __fput
                                     |          ____fput
                                     |          task_work_run
                                     |          exit_to_usermode_loop
                                     |          do_syscall_64
                                     |          entry_SYSCALL_64_after_hwframe
                                     |          __GI___libc_close
                                     |
                                      --14.48%--tcp_out_of_resources
                                                tcp_write_timeout
                                                tcp_retransmit_timer
                                                tcp_write_timer_handler
                                                tcp_write_timer
                                                call_timer_fn
                                                expire_timers
                                                __run_timers
                                                run_timer_softirq
                                                __softirqentry_text_start
As explained in commit 
cf86a086a180 ("net/dst: use a smaller percpu_counter
batch for dst entries accounting"), default batch size is too big
for the default value of tcp_max_orphans (262144).
But even if we reduce batch sizes, there would still be cases
where the estimated count of orphans is beyond the limit,
and where tcp_too_many_orphans() has to call the expensive
percpu_counter_sum_positive().
One solution is to use plain per-cpu counters, cache their sum,
and have a timer periodically refresh this cache.
Updating this cache every 100ms seems about right, tcp pressure
state is not radically changing over shorter periods.
percpu_counter was nice 15 years ago when hosts had fewer
than 16 cpus, but it no longer is by current standards.
v2: Fix the build issue for CONFIG_CRYPTO_DEV_CHELSIO_TLS=m,
    reported by kernel test robot <lkp@intel.com>
    Remove unused socket argument from tcp_too_many_orphans()
Fixes: dd24c00191d5 ("net: Use a percpu_counter for orphan_count")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bach <sfb@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
                 * created only after 3 way handshake is done.
                 */
                sock_orphan(child);
-               percpu_counter_inc((child)->sk_prot->orphan_count);
+               INC_ORPHAN_COUNT(child);
                chtls_release_resources(child);
                chtls_conn_done(child);
        } else {
 
 #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok)
 #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok)
 #define SACK_OK(tp) ((tp)->rx_opt.sack_ok)
-#define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
+#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count)
 
 /* TLS SKB */
 #define skb_ulp_tls_inline(skb)      (ULP_SKB_CB(skb)->ulp.tls.ofld)
 
 {
        /* The below has to be done to allow calling inet_csk_destroy_sock */
        sock_set_flag(sk, SOCK_DEAD);
-       percpu_counter_inc(sk->sk_prot->orphan_count);
+       this_cpu_inc(*sk->sk_prot->orphan_count);
 }
 
 void inet_csk_destroy_sock(struct sock *sk);
 
        unsigned int            useroffset;     /* Usercopy region offset */
        unsigned int            usersize;       /* Usercopy region size */
 
-       struct percpu_counter   *orphan_count;
+       unsigned int __percpu   *orphan_count;
 
        struct request_sock_ops *rsk_prot;
        struct timewait_sock_ops *twsk_prot;
 
 
 extern struct inet_hashinfo tcp_hashinfo;
 
-extern struct percpu_counter tcp_orphan_count;
+DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
+int tcp_orphan_count_sum(void);
+
 void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER)
 
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
-static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
-{
-       struct percpu_counter *ocp = sk->sk_prot->orphan_count;
-       int orphans = percpu_counter_read_positive(ocp);
-
-       if (orphans << shift > sysctl_tcp_max_orphans) {
-               orphans = percpu_counter_sum_positive(ocp);
-               if (orphans << shift > sysctl_tcp_max_orphans)
-                       return true;
-       }
-       return false;
-}
-
 bool tcp_check_oom(struct sock *sk, int shift);
 
 
 
 
 extern struct inet_hashinfo dccp_hashinfo;
 
-extern struct percpu_counter dccp_orphan_count;
+DECLARE_PER_CPU(unsigned int, dccp_orphan_count);
 
 void dccp_time_wait(struct sock *sk, int state, int timeo);
 
 
 
 EXPORT_SYMBOL_GPL(dccp_statistics);
 
-struct percpu_counter dccp_orphan_count;
-EXPORT_SYMBOL_GPL(dccp_orphan_count);
+DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);
 
 struct inet_hashinfo dccp_hashinfo;
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
        bh_lock_sock(sk);
        WARN_ON(sock_owned_by_user(sk));
 
-       percpu_counter_inc(sk->sk_prot->orphan_count);
+       this_cpu_inc(dccp_orphan_count);
 
        /* Have we already been destroyed by a softirq or backlog? */
        if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
 
        BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
                     sizeof_field(struct sk_buff, cb));
-       rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
-       if (rc)
-               goto out_fail;
        inet_hashinfo_init(&dccp_hashinfo);
        rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
        if (rc)
-               goto out_free_percpu;
+               goto out_fail;
        rc = -ENOBUFS;
        dccp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("dccp_bind_bucket",
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 out_free_hashinfo2:
        inet_hashinfo2_free_mod(&dccp_hashinfo);
-out_free_percpu:
-       percpu_counter_destroy(&dccp_orphan_count);
 out_fail:
        dccp_hashinfo.bhash = NULL;
        dccp_hashinfo.ehash = NULL;
        dccp_ackvec_exit();
        dccp_sysctl_exit();
        inet_hashinfo2_free_mod(&dccp_hashinfo);
-       percpu_counter_destroy(&dccp_orphan_count);
 }
 
 module_init(dccp_init);
 
 
        sk_refcnt_debug_release(sk);
 
-       percpu_counter_dec(sk->sk_prot->orphan_count);
+       this_cpu_dec(*sk->sk_prot->orphan_count);
 
        sock_put(sk);
 }
 
        sock_orphan(child);
 
-       percpu_counter_inc(sk->sk_prot->orphan_count);
+       this_cpu_inc(*sk->sk_prot->orphan_count);
 
        if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
                BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
 
        if (ok) {
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        } else {
-               percpu_counter_inc(sk->sk_prot->orphan_count);
+               this_cpu_inc(*sk->sk_prot->orphan_count);
                inet_sk_set_state(sk, TCP_CLOSE);
                sock_set_flag(sk, SOCK_DEAD);
                inet_csk_destroy_sock(sk);
 
        struct net *net = seq->private;
        int orphans, sockets;
 
-       orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+       orphans = tcp_orphan_count_sum();
        sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
 
        socket_seq_show(seq);
 
        TCP_CMSG_TS = 2
 };
 
-struct percpu_counter tcp_orphan_count;
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
 
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
 }
 EXPORT_SYMBOL(tcp_shutdown);
 
+/* Add up the tcp_orphan_count per-cpu counters of every possible cpu.
+ *
+ * Returns the total, or 0 when the total computes as negative.
+ */
+int tcp_orphan_count_sum(void)
+{
+       int sum = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               sum += per_cpu(tcp_orphan_count, cpu);
+
+       /* Never report a negative number of orphans to callers. */
+       if (sum < 0)
+               return 0;
+
+       return sum;
+}
+
+/* Last orphan total computed by tcp_orphan_update(); accessed locklessly. */
+static int tcp_orphan_cache;
+/* Timer that re-runs tcp_orphan_update() every TCP_ORPHAN_TIMER_PERIOD. */
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+/* Timer callback: publish a fresh orphan total into tcp_orphan_cache,
+ * then re-arm tcp_orphan_timer one period from now.
+ * The timer_list argument is not used.
+ */
+static void tcp_orphan_update(struct timer_list *unused)
+{
+       int orphans = tcp_orphan_count_sum();
+
+       WRITE_ONCE(tcp_orphan_cache, orphans);
+       mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+/* Tell whether the cached orphan count, left-shifted by @shift, is above
+ * the sysctl_tcp_max_orphans limit.
+ *
+ * Both operands are read without any lock: tcp_orphan_cache is rewritten
+ * by tcp_orphan_update(), and the sysctl value may be changed concurrently,
+ * so each one is loaded exactly once through READ_ONCE().
+ */
+static bool tcp_too_many_orphans(int shift)
+{
+       return READ_ONCE(tcp_orphan_cache) << shift >
+               READ_ONCE(sysctl_tcp_max_orphans);
+}
+
 bool tcp_check_oom(struct sock *sk, int shift)
 {
        bool too_many_orphans, out_of_socket_memory;
 
-       too_many_orphans = tcp_too_many_orphans(sk, shift);
+       too_many_orphans = tcp_too_many_orphans(shift);
        out_of_socket_memory = tcp_out_of_memory(sk);
 
        if (too_many_orphans)
        /* remove backlog if any, without releasing ownership. */
        __release_sock(sk);
 
-       percpu_counter_inc(sk->sk_prot->orphan_count);
+       this_cpu_inc(tcp_orphan_count);
 
        /* Have we already been destroyed by a softirq or backlog? */
        if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
                     sizeof_field(struct sk_buff, cb));
 
        percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
-       percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+
+       timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+       mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
        inet_hashinfo_init(&tcp_hashinfo);
        inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
                            thash_entries, 21,  /* one slot per 2 MB*/