Bug #326

Fwd: [PATCH net-next] sch_sfq: rehash queues in perturb timer

Added by David Taht on Dec 21, 2011. Updated on Apr 21, 2012.
Closed Normal Dave Täht

Description

Eric, I think I love you.

I’d like to slam this into cerowrt, which is based on 3.1.5 for at
least another month. Are there dependencies on the other work you’ve
been doing in this area I should worry about?

———- Forwarded message ———-
From: Eric Dumazet eric.dumazet@gmail.com
Date: Wed, Dec 21, 2011 at 2:30 PM
Subject: [PATCH net-next] sch_sfq: rehash queues in perturb timer
To: David Miller davem@davemloft.net
Cc: netdev netdev@vger.kernel.org

A known Out Of Order (OOO) problem hurts SFQ when timer changes
perturbation value, since all new packets delivered to SFQ enqueue might
end on different slots than previous in-flight packets.

With round robin delivery, we can thus deliver packets in a different
order.

Since SFQ is limited to small amount of in-flight packets, we can rehash
packets so that this OOO problem is fixed.

This rehashing is performed only if internal flow classifier is in use.

We now store in skb->cb[] the “struct flow_keys” so that we dont call
skb_flow_dissect() again while rehashing.

Signed-off-by: Eric Dumazet eric.dumazet@gmail.com

 net/sched/sch_sfq.c |   87 **+—
 1 file changed, 81 insertions(+), 6 deletions(-)

diff –git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 30cda70..d329a8a 100644
— a/net/sched/sch_sfq.c
**+ b/net/sched/sch_sfq.c
@ -136,16 +136,30@ static inline struct sfq_head
*sfq_dep_head(struct sfq_sched_data *q, sfq_index
       return &q->dep[val - SFQ_SLOTS];
 }

+/*
+ * In order to be able to quickly rehash our queue when timer changes
+ * q->perturbation, we store flow_keys in skb->cb[]
+ */
+struct sfq_skb_cb {
+       struct flow_keys        keys;
+};
+
+static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
+{
+       BUILD_BUG_ON(sizeof(skb->cb) <
+               sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
+       return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
 static unsigned int sfq_hash(const struct sfq_sched_data *q,
                            const struct sk_buff *skb)
 {
-       struct flow_keys keys;
+       const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
       unsigned int hash;

  •       skb_flow_dissect(skb, &keys);
  •       hash = jhash_3words((__force u32)keys.dst,
  •                           (__force u32)keys.src \^ keys.ip_proto,
  •                           (__force u32)keys.ports, q->perturbation);
  •       hash = jhash_3words((__force u32)keys->dst,
  •                           (__force u32)keys->src \^ keys->ip_proto,
  •                           (__force u32)keys->ports, q->perturbation);
           return hash & (q->divisor - 1);
     }

@ -161,8 +175,10@ static unsigned int sfq_classify(struct sk_buff
*skb, struct Qdisc *sch,
           TC_H_MIN(skb->priority) <= q->divisor)
               return TC_H_MIN(skb->priority);

  •       if (!q->filter_list)
  •       if (!q->filter_list) {
  •               skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
                   return sfq_hash(q, skb) + 1;
  •       }

       *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
       result = tc_classify(skb, q->filter_list, &res);
@ -423,12 +439,71@ sfq_reset(struct Qdisc *sch)
               kfree_skb(skb);
 }

+/*
+ * When q->perturbation is changed, we rehash all queued skbs
+ * to avoid OOO (Out Of Order) effects.
+ * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
+ * counters.
+ */
+static void sfq_rehash(struct sfq_sched_data *q)
+{
+       struct sk_buff *skb;
+       int i;
+       struct sfq_slot *slot;
+       struct sk_buff_head list;
+
+       __skb_queue_head_init(&list);
+
+       for (i = 0; i < SFQ_SLOTS; i++) {
+               slot = &q->slots[i];
+               if (!slot->qlen)
+                       continue;
+               while (slot->qlen) {
+                       skb = slot_dequeue_head(slot);
+                       sfq_dec(q, i);
+                       __skb_queue_tail(&list, skb);
+               }
+               q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+       }
+       q->tail = NULL;
+
+       while ((skb = __skb_dequeue(&list)) != NULL) {
+               unsigned int hash = sfq_hash(q, skb);
+               sfq_index x = q->ht[hash];
+
+               slot = &q->slots[x];
+               if (x SFQ_EMPTY_SLOT) { +                       x = q->dep[0].next; /* get a free slot */ +                       q->ht[hash] = x; +                       slot = &q->slots[x]; +                       slot->hash = hash; +               } +               slot_queue_add(slot, skb); +               sfq_inc(q, x); +               if (slot->qlen 1) {          /* The flow is new */
+                       if (q->tail == NULL) {  /* It is the first flow */
+                               slot->next = x;
+                       } else {
+                               slot->next = q->tail->next;
+                               q->tail->next = x;
+                       }
+                       q->tail = slot;
+                       slot->allot = q->scaled_quantum;
+               }
+       }
+}
+
 static void sfq_perturbation(unsigned long arg)
 {
       struct Qdisc sch = (struct Qdisc)arg;
       struct sfq_sched_data *q = qdisc_priv(sch);
+       spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

  •       spin_lock(root_lock);
           q->perturbation = net_random();
  •       if (!q->filter_list && q->tail)
  •               sfq_rehash(q);
  •       spin_unlock(root_lock);

       if (q->perturb_period)
               mod_timer(&q->perturb_timer, jiffies + q->perturb_period);

History

Updated by David Taht on Dec 21, 2011.
On Thu, Dec 22, 2011 at 5:19 AM, Eric Dumazet eric.dumazet@gmail.com wrote:
> Le jeudi 22 décembre 2011 à 05:02 +0100, Dave Taht a écrit :
>> Eric, I think I love you.
>>
>> I’d like to slam this into cerowrt, which is based on 3.1.5 for at
>> least another month. Are there dependencies on the other work you’ve
>> been doing in this area I should worry about?
>>
>
> Hmm, this work depends on skb_flow_dissect() patches, included in
> net-next only for the moment…
>

(sorry for cc’ing the bug system on this one)

OK. I’m really looking forward to the upcoming next generation kernel,
but it’s very hard to keep an embedded system this close to something
so bleeding edge. :/

That said, it looked plausible to to extract the flow_dissect and
adaptive-red work you’ve done recently, safely. I’ll think upon it.

(I thought the backporting bql would be easy too, but thus far no luck)

I’ve moved most of my own AQM prototyping to x86, which is now
showing useful results, thx to you, tom, and bql.

Updated by Dave Täht on Jan 29, 2012.
I abstracted out BQL, this patch, etc and backported to 3.2.2.

I note that the permute option only works in the background with the
default filters (I think), and I think that I need to use the pre-nat
ips in order to get saner SFQ out the other end… but we’ll see.

Certainly the old method would put in a nasty 10 second disturbance. Perhaps we’ll
see different behavior now, but traces are needed to see what happens.

Updated by Dave Täht on Apr 21, 2012.

This is a static export of the original bufferbloat.net issue database. As such, no further commenting is possible; the information is solely here for archival purposes.
RSS feed

Recent Updates

Jul 21, 2024 Wiki page
cake-autorate
Jul 21, 2024 Wiki page
What Can I Do About Bufferbloat?
Jul 21, 2024 Wiki page
Tests for Bufferbloat
Jul 1, 2024 Wiki page
RRUL Chart Explanation
Dec 3, 2022 Wiki page
Codel Wiki

Find us elsewhere

Bufferbloat Mailing Lists
#bufferbloat on Twitter
Google+ group
Archived Bufferbloat pages from the Wayback Machine

Sponsors

Comcast Research Innovation Fund
Nlnet Foundation
Shuttleworth Foundation
GoFundMe

Bufferbloat Related Projects

OpenWrt Project
Congestion Control Blog
Flent Network Test Suite
Sqm-Scripts
The Cake shaper
AQMs in BSD
IETF AQM WG
CeroWrt (where it all started)

Network Performance Related Resources


Jim Gettys' Blog - The chairman of the Fjord
Toke's Blog - Karlstad University's work on bloat
Voip Users Conference - Weekly Videoconference mostly about voip
Candelatech - A wifi testing company that "gets it".