[PATCH 3/3] Convert the UDP hash lock to RCU

View: New views
20 Messages — Rating Filter:   Alert me  
< Prev | 1 - 2 | Next >

[PATCH 3/3] Convert the UDP hash lock to RCU

by Corey Minyard :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Change the UDP hash lock from an rwlock to RCU.

Signed-off-by: Corey Minyard <cminyard@...>
---
 include/net/udp.h |    9 +++++----
 net/ipv4/udp.c    |   47 +++++++++++++++++++++++++++--------------------
 net/ipv6/udp.c    |   17 +++++++++--------
 3 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index addcdc6..35aa104 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -51,7 +51,7 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
 
 extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-extern rwlock_t udp_hash_lock;
+extern spinlock_t udp_hash_wlock;
 
 
 /* Note: this must match 'valbool' in sock_setsockopt */
@@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
 
 static inline void udp_lib_unhash(struct sock *sk)
 {
- write_lock_bh(&udp_hash_lock);
- if (sk_del_node_init(sk)) {
+ spin_lock_bh(&udp_hash_wlock);
+ if (sk_del_node_init_rcu(sk)) {
  inet_sk(sk)->num = 0;
  sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  }
- write_unlock_bh(&udp_hash_lock);
+ spin_unlock_bh(&udp_hash_wlock);
+ synchronize_rcu();
 }
 
 static inline void udp_lib_close(struct sock *sk, long timeout)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 57e26fa..1b65cb6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -112,7 +112,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
 EXPORT_SYMBOL(udp_stats_in6);
 
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-DEFINE_RWLOCK(udp_hash_lock);
+DEFINE_SPINLOCK(udp_hash_wlock);
+EXPORT_SYMBOL(udp_hash_wlock);
 
 int sysctl_udp_mem[3] __read_mostly;
 int sysctl_udp_rmem_min __read_mostly;
@@ -155,7 +156,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
  int    error = 1;
  struct net *net = sock_net(sk);
 
- write_lock_bh(&udp_hash_lock);
+ spin_lock_bh(&udp_hash_wlock);
 
  if (!snum) {
  int i, low, high, remaining;
@@ -225,12 +226,12 @@ gotit:
  sk->sk_hash = snum;
  if (sk_unhashed(sk)) {
  head = &udptable[udp_hashfn(net, snum)];
- sk_add_node(sk, head);
+ sk_add_node_rcu(sk, head);
  sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  }
  error = 0;
 fail:
- write_unlock_bh(&udp_hash_lock);
+ spin_unlock_bh(&udp_hash_wlock);
  return error;
 }
 
@@ -260,8 +261,8 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
  unsigned short hnum = ntohs(dport);
  int badness = -1;
 
- read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
+ rcu_read_lock();
+ sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
  struct inet_sock *inet = inet_sk(sk);
 
  if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
@@ -296,9 +297,17 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
  }
  }
  }
+ /*
+ * Note that this is safe, even with an RCU lock.
+ * udp_lib_unhash() is the removal function, it calls
+ * synchronize_rcu() and the socket counter cannot go to
+ * zero until it returns.  So if we increment it inside the
+ * RCU read lock, it should never go to zero and then be
+ * incremented again.
+ */
  if (result)
  sock_hold(result);
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
  return result;
 }
 
@@ -311,7 +320,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
  struct sock *s = sk;
  unsigned short hnum = ntohs(loc_port);
 
- sk_for_each_from(s, node) {
+ sk_for_each_from_rcu(s, node) {
  struct inet_sock *inet = inet_sk(s);
 
  if (s->sk_hash != hnum ||
@@ -1094,8 +1103,8 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  struct sock *sk;
  int dif;
 
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+ rcu_read_lock();
+ sk = sk_head_rcu(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
  dif = skb->dev->ifindex;
  sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
  if (sk) {
@@ -1104,8 +1113,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  do {
  struct sk_buff *skb1 = skb;
 
- sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
-   uh->source, saddr, dif);
+ sknext = udp_v4_mcast_next(sk_next_rcu(sk), uh->dest,
+   daddr, uh->source, saddr,
+   dif);
  if (sknext)
  skb1 = skb_clone(skb, GFP_ATOMIC);
 
@@ -1120,7 +1130,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  } while (sknext);
  } else
  kfree_skb(skb);
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
  return 0;
 }
 
@@ -1543,13 +1553,13 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
  struct net *net = seq_file_net(seq);
 
  do {
- sk = sk_next(sk);
+ sk = sk_next_rcu(sk);
 try_again:
  ;
  } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
  if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
- sk = sk_head(state->hashtable + state->bucket);
+ sk = sk_head_rcu(state->hashtable + state->bucket);
  goto try_again;
  }
  return sk;
@@ -1566,9 +1576,8 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(udp_hash_lock)
 {
- read_lock(&udp_hash_lock);
+ rcu_read_lock();
  return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
 
@@ -1586,9 +1595,8 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void udp_seq_stop(struct seq_file *seq, void *v)
- __releases(udp_hash_lock)
 {
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
 }
 
 static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1732,7 +1740,6 @@ void __init udp_init(void)
 
 EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
-EXPORT_SYMBOL(udp_hash_lock);
 EXPORT_SYMBOL(udp_ioctl);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a6aecf7..b807de7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -64,8 +64,8 @@ static struct sock *__udp6_lib_lookup(struct net *net,
  unsigned short hnum = ntohs(dport);
  int badness = -1;
 
- read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
+ rcu_read_lock();
+ sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
  struct inet_sock *inet = inet_sk(sk);
 
  if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
@@ -101,9 +101,10 @@ static struct sock *__udp6_lib_lookup(struct net *net,
  }
  }
  }
+ /* See comment in __udp4_lib_lookup on why this is safe. */
  if (result)
  sock_hold(result);
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
  return result;
 }
 
@@ -322,7 +323,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
  struct sock *s = sk;
  unsigned short num = ntohs(loc_port);
 
- sk_for_each_from(s, node) {
+ sk_for_each_from_rcu(s, node) {
  struct inet_sock *inet = inet_sk(s);
 
  if (sock_net(s) != sock_net(sk))
@@ -365,8 +366,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  const struct udphdr *uh = udp_hdr(skb);
  int dif;
 
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+ rcu_read_lock();
+ sk = sk_head_rcu(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
  dif = inet6_iif(skb);
  sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
  if (!sk) {
@@ -375,7 +376,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  }
 
  sk2 = sk;
- while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
+ while ((sk2 = udp_v6_mcast_next(sk_next_rcu(sk2), uh->dest, daddr,
  uh->source, saddr, dif))) {
  struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
  if (buff) {
@@ -394,7 +395,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  sk_add_backlog(sk, skb);
  bh_unlock_sock(sk);
 out:
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
  return 0;
 }
 
--
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Eric Dumazet :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Corey Minyard a écrit :

> Change the UDP hash lock from an rwlock to RCU.
>
> Signed-off-by: Corey Minyard <cminyard@...>
> ---
>  include/net/udp.h |    9 +++++----
>  net/ipv4/udp.c    |   47 +++++++++++++++++++++++++++--------------------
>  net/ipv6/udp.c    |   17 +++++++++--------
>  3 files changed, 41 insertions(+), 32 deletions(-)
>
> diff --git a/include/net/udp.h b/include/net/udp.h
> index addcdc6..35aa104 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -51,7 +51,7 @@ struct udp_skb_cb {
>  #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
>  
>  extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
> -extern rwlock_t udp_hash_lock;
> +extern spinlock_t udp_hash_wlock;
>  
>  
>  /* Note: this must match 'valbool' in sock_setsockopt */
> @@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
>  
>  static inline void udp_lib_unhash(struct sock *sk)
>  {
> - write_lock_bh(&udp_hash_lock);
> - if (sk_del_node_init(sk)) {
> + spin_lock_bh(&udp_hash_wlock);
> + if (sk_del_node_init_rcu(sk)) {
>   inet_sk(sk)->num = 0;
>   sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>   }
> - write_unlock_bh(&udp_hash_lock);
> + spin_unlock_bh(&udp_hash_wlock);
> + synchronize_rcu();

UDP central rwlock can hurt performance, because of cache line ping pong,
so your patch really makes sense.

Me wondering what impact this synchronize_rcu() can have on mono-threaded
VOIP applications using lot of UDP sockets. What is the maximum delay of
this function ?

For "struct file" freeing, we chose call_rcu() instead of synchronize_rcu()

Maybe we could add a generic rcu head to struct sock, and use call_rcu() in
sk_prot_free() for sockets needing RCU (udp after your patch is applied, maybe
tcp on future patches, while I believe previous work on the subject concluded
RCU was not giving good results for short lived http sessions) ?

Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free()
done in sk_prot_free() can defer freeing to RCU...




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by David Miller-13 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

From: Eric Dumazet <dada1@...>
Date: Mon, 06 Oct 2008 23:22:31 +0200

> Me wondering what impact this synchronize_rcu() can have on mono-threaded
> VOIP applications using lot of UDP sockets. What is the maximum delay of
> this function ?

The cost is enormous, we really can't use it here.

I have a patch that did top-level socket destruction using RCU,
and that didn't use synchronize_rcu(), and that killed connection
rates by up to %20.

I can only imagine what the cost would be if I had to add such a call
in there.

Really, I can't consider these changes seriously, as-is.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Corey Minyard :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Eric Dumazet wrote:

> Corey Minyard a écrit :
>> Change the UDP hash lock from an rwlock to RCU.
>>
>> Signed-off-by: Corey Minyard <cminyard@...>
>> ---
>>  include/net/udp.h |    9 +++++----
>>  net/ipv4/udp.c    |   47
>> +++++++++++++++++++++++++++--------------------
>>  net/ipv6/udp.c    |   17 +++++++++--------
>>  3 files changed, 41 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index addcdc6..35aa104 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -51,7 +51,7 @@ struct udp_skb_cb {
>>  #define UDP_SKB_CB(__skb)    ((struct udp_skb_cb *)((__skb)->cb))
>>  
>>  extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
>> -extern rwlock_t udp_hash_lock;
>> +extern spinlock_t udp_hash_wlock;
>>  
>>  
>>  /* Note: this must match 'valbool' in sock_setsockopt */
>> @@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
>>  
>>  static inline void udp_lib_unhash(struct sock *sk)
>>  {
>> -    write_lock_bh(&udp_hash_lock);
>> -    if (sk_del_node_init(sk)) {
>> +    spin_lock_bh(&udp_hash_wlock);
>> +    if (sk_del_node_init_rcu(sk)) {
>>          inet_sk(sk)->num = 0;
>>          sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>>      }
>> -    write_unlock_bh(&udp_hash_lock);
>> +    spin_unlock_bh(&udp_hash_wlock);
>> +    synchronize_rcu();
>
> UDP central rwlock can hurt performance, because of cache line ping pong,
> so your patch really makes sense.
>
> Me wondering what impact this synchronize_rcu() can have on mono-threaded
> VOIP applications using lot of UDP sockets. What is the maximum delay of
> this function ?
It delays until all currently executing RCU read-side sections have
executed (new ones don't count, just currently executing ones).  I'm not
sure what this delay is, but I would expect it to be fairly small.  This
function is only called when a socket is closed, too, so it's not a
high-runner.  Paul would certainly know better than me.

>
> For "struct file" freeing, we chose call_rcu() instead of
> synchronize_rcu()
I'd prefer that, too, but that would mean adding another member to the
socket structure.

>
> Maybe we could add a generic rcu head to struct sock, and use
> call_rcu() in
> sk_prot_free() for sockets needing RCU (udp after your patch is
> applied, maybe
> tcp on future patches, while I believe previous work on the subject
> concluded
> RCU was not giving good results for short lived http sessions) ?
RCU probably wouldn't be a good choice for short-lived http sessions,
since you will only get a couple of messages that would matter.  I'm not
against adding an item to struct sock, but this is not a common thing
and struct sock was already big and ugly.

>
> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
> in sk_prot_free() can defer freeing to RCU...
That's an interesting thought; I didn't know that capability was there.  
I can look at that.  With this, the short-lived TCP sessions might not
matter, though that's a different issue.

-corey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Corey Minyard :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

David Miller wrote:

> From: Eric Dumazet <dada1@...>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
>  
>> Me wondering what impact this synchronize_rcu() can have on mono-threaded
>> VOIP applications using lot of UDP sockets. What is the maximum delay of
>> this function ?
>>    
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
>
> I can only imagine what the cost would be if I had to add such a call
> in there.
>
> Really, I can't consider these changes seriously, as-is.
>
>  
Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?

-corey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Eric Dumazet :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

David Miller a écrit :

> From: Eric Dumazet <dada1@...>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
>> Me wondering what impact this synchronize_rcu() can have on mono-threaded
>> VOIP applications using lot of UDP sockets. What is the maximum delay of
>> this function ?
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
>
> I can only imagine what the cost would be if I had to add such a call
> in there.
>
> Really, I can't consider these changes seriously, as-is.

Yes, I suppose you are right about TCP sessions, that should stay as they are.

Then if we use call_rcu() RCU freeing only for UDP sockets, we should get rid of
taking this rwlock each time we handle an incoming datagram, and introduce
no extra cost for other sockets.

Most UDP sockets are setup for long periods (RTP trafic), or if an application really
wants to {open/send or receive one UDP frame/close} many sockets, it already hits
RCU handling of its file structures and should not be slowed down that much.

By 'long period' I mean thousand of packets sent/received by each RTP session, being
voice (50 packets/second) or even worse video...




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Peter Zijlstra :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Mon, 2008-10-06 at 23:22 +0200, Eric Dumazet wrote:

> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free()
> done in sk_prot_free() can defer freeing to RCU...

Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
RCU-freed, this means that slab object pointers stay pointing to valid
memory, but it does _NOT_ mean those slab objects themselves remain
valid.

The slab allocator is free to re-use those objects at any time -
irrespective of the rcu-grace period. Therefore you will have to be able
to validate that the object you point to is indeed the object you
expect, otherwise strange and wonderful things will happen.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Peter Zijlstra :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:

> From: Eric Dumazet <dada1@...>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
> > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > VOIP applications using lot of UDP sockets. What is the maximum delay of
> > this function ?
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.

Did you ever figure out why you lost those 20% ?

> I can only imagine what the cost would be if I had to add such a call
> in there.

Yeah, sync_rcu() is rediculously expensive, at best 3 jiffies IIRC.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Evgeniy Polyakov :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@...) wrote:
> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?

I tested skb destruction via RCU path, and got 2.5 times worse numbers
with small-packets-bulk-transfer workload.

For more details
http://tservice.net.ru/~s0mbre/blog/devel/networking/2006_12_05_1.html

--
        Evgeniy Polyakov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Benny Amorsen :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Eric Dumazet <dada1@...> writes:

> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> RCU handling of its file structures and should not be slowed down that much.
>
> By 'long period' I mean thousand of packets sent/received by each RTP session, being
> voice (50 packets/second) or even worse video...

Does DNS with port randomization need short lived sockets?


/Benny

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Eric Dumazet :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Peter Zijlstra a écrit :

> On Mon, 2008-10-06 at 23:22 +0200, Eric Dumazet wrote:
>
>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free()
>> done in sk_prot_free() can defer freeing to RCU...
>
> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> RCU-freed, this means that slab object pointers stay pointing to valid
> memory, but it does _NOT_ mean those slab objects themselves remain
> valid.
>
> The slab allocator is free to re-use those objects at any time -
> irrespective of the rcu-grace period. Therefore you will have to be able
> to validate that the object you point to is indeed the object you
> expect, otherwise strange and wonderful things will happen.
>
Thanks for this clarification. I guess we really need a rcu head then :)



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Eric Dumazet :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Benny Amorsen a écrit :
> Eric Dumazet <dada1@...> writes:
>
>> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
>> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
>> RCU handling of its file structures and should not be slowed down that much.
>>

I should have say 'Many' instead of 'Most' :)

>> By 'long period' I mean thousand of packets sent/received by each RTP session, being
>> voice (50 packets/second) or even worse video...
>
> Does DNS with port randomization need short lived sockets?
>

Yes very true, but current allocation of a random port can be very expensive,
since we scan all the UDP hash table to select the smaller hash chain.

We stop the scan if we find an empty slot, but on machines with say more than 200
bound UDP sockets, they are probably no empty slots. (UDP_HTABLE_SIZE is 128)

bind(NULL port) algo is then O(N), N being number of bound UDP sockets.

So heavy DNS servers/proxies probably use a pool/range of pre-allocated sockets
to avoid costs of allocating/freeing them ? If they dont care about that cost,
the extra call_rcu() will be unnoticed.

For pathological (yet very common :) ) cases like single DNS query/answer, RCU
would mean :

Pros :
- one few rwlock hit when receiving the answer (if any)
Cons :
- one call_rcu() to delay socket freeing/reuse after RCU period.

So it might be a litle bit more expensive than without RCU

I agree I am more interested in optimizing UDP stack for heavy users like RTP
servers/proxies handling xxx.000 packets/second than DNS users/servers.
Shame on me :)

(2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a 16-way machine,
that sounds promising)





--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Stephen Hemminger-6 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Tue, 07 Oct 2008 14:59:20 +0200
Eric Dumazet <dada1@...> wrote:

> Benny Amorsen a écrit :
> > Eric Dumazet <dada1@...> writes:
> >
> >> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> >> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> >> RCU handling of its file structures and should not be slowed down that much.
> >>
>
> I should have say 'Many' instead of 'Most' :)
>
> >> By 'long period' I mean thousand of packets sent/received by each RTP session, being
> >> voice (50 packets/second) or even worse video...
> >
> > Does DNS with port randomization need short lived sockets?
> >
>
> Yes very true, but current allocation of a random port can be very expensive,
> since we scan all the UDP hash table to select the smaller hash chain.
>
> We stop the scan if we find an empty slot, but on machines with say more than 200
> bound UDP sockets, they are probably no empty slots. (UDP_HTABLE_SIZE is 128)
>
> bind(NULL port) algo is then O(N), N being number of bound UDP sockets.
>
> So heavy DNS servers/proxies probably use a pool/range of pre-allocated sockets
> to avoid costs of allocating/freeing them ? If they dont care about that cost,
> the extra call_rcu() will be unnoticed.
>
> For pathological (yet very common :) ) cases like single DNS query/answer, RCU
> would mean :
>
> Pros :
> - one few rwlock hit when receiving the answer (if any)
> Cons :
> - one call_rcu() to delay socket freeing/reuse after RCU period.
>
> So it might be a litle bit more expensive than without RCU
>
> I agree I am more interested in optimizing UDP stack for heavy users like RTP
> servers/proxies handling xxx.000 packets/second than DNS users/servers.
> Shame on me :)
>
> (2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a 16-way machine,
> that sounds promising)

The idea of keeping chains short is the problem. That code should just be pulled because
it doesn't help that much, and also creates bias on the port randomization.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Christoph Lameter-7 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Eric Dumazet wrote:

>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>> in sk_prot_free() can defer freeing to RCU...
>>
>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>> RCU-freed, this means that slab object pointers stay pointing to valid
>> memory, but it does _NOT_ mean those slab objects themselves remain
>> valid.
>>
>> The slab allocator is free to re-use those objects at any time -
>> irrespective of the rcu-grace period. Therefore you will have to be able
>> to validate that the object you point to is indeed the object you
>> expect, otherwise strange and wonderful things will happen.
>>
> Thanks for this clarification. I guess we really need a rcu head then :)

No you just need to make sure that the object you located is still active
(f.e. refcount > 0) and that it is really a match (hash pointers may be
updated asynchronously and therefore point to the object that has been reused
for something else).

Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
cache hot advantages of the objects. Regular RCU freeing will let the object
expire for a tick or so which will result in the cacheline cooling down.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Christoph Lameter-7 :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Evgeniy Polyakov wrote:
> On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@...) wrote:
>> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
>
> I tested skb destruction via RCU path, and got 2.5 times worse numbers
> with small-packets-bulk-transfer workload.

Was this with regular RCU freeing? This will cool down the cacheline before
frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU

by Evgeniy Polyakov :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter (cl@...) wrote:
> > I tested skb destruction via RCU path, and got 2.5 times worse numbers
> > with small-packets-bulk-transfer workload.
>
> Was this with regular RCU freeing? This will cool down the cacheline before
> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.

I believe there were no SLAB_DESTROY_BY_RCU 2 yars ago :)

It was pure call_rcu(&skb->rcu, real_skb_freeing), where
real_skb_freeing() just did usual kfree().

--
        Evgeniy Polyakov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] Convert the UDP hash lock to RCU