1 /*
2  * net/sched/sch_cbq.c  Class-Based Queueing discipline.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  */
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
24 /*      Class-Based Queueing (CBQ) algorithm.
25         =======================================
27         Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28                  Management Models for Packet Networks",
29                  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
31                  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
33                  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34                  Parameters", 1996
36                  [4] Sally Floyd and Michael Speer, "Experimental Results
37                  for Class-Based Queueing", 1998, not published.
39         -----------------------------------------------------------------------
41         The algorithm skeleton was taken from the NS simulator cbq.cc.
42         If someone wants to check this code against the LBL version,
43         he should take into account that ONLY the skeleton was borrowed;
44         the implementation is different. Particularly:
46         --- The WRR algorithm is different. Our version looks more
47         reasonable (I hope) and works when quanta are allowed to be
48         less than MTU, which is always the case when real-time classes
49         have small rates. Note that the statement of [3] is
50         incomplete: delay may actually be estimated even if the class
51         per-round allotment is less than MTU. Namely, if the per-round
52         allotment is W*r_i, and r_1+...+r_k = r < 1, then
54         delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
56         In the worst case we have the IntServ estimate with D = W*r + k*MTU
57         and C = MTU*r. The proof (if correct at all) is trivial.
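        For example (illustrative numbers only): with B = 10 Mbit/s,
        MTU = 1500 bytes, k = 3 classes, W = 1000 bytes and each
        r_i = 0.1 (so r = 0.3), the per-round allotment is
        W*r_i = 100 bytes < MTU, and the bound gives

        delay_i <= (15*300 + 300 + 3*1500) bytes / B
                 =  9300 bytes / 10 Mbit/s  ~=  7.4 ms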
60         --- It seems that cbq-2.0 is not very accurate. At least, I cannot
61         interpret some places that look like wrong translations
62         from NS. Anyone is advised to find these differences
63         and explain to me why I am wrong 8).
65         --- Linux has no EOI event, so we cannot estimate true class
66         idle time. The workaround is to consider the next dequeue event
67         as a sign that the previous packet is finished. This is wrong because of
68         internal device queueing, but on a permanently loaded link it is true.
69         Moreover, combined with clock integrator, this scheme looks
70         very close to an ideal solution.  */
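/* A minimal sketch (not part of the original file) of the idle-time
 * estimator described above: the next dequeue is treated as the end of
 * the previous transmission, and the resulting idle time feeds a scaled
 * EWMA, mirroring what cbq_update() does below with avgidle and ewma_log.
 * The names are illustrative only.
 */
#if 0
static long toy_avgidle_update(long avgidle, long now, long last,
			       long xmit_time, int ewma_log)
{
	long idle = (now - last) - xmit_time;	/* estimated idle interval */

	/* scaled EWMA: avgidle == true_avgidle / W, with W = 2^-ewma_log */
	return avgidle + idle - (avgidle >> ewma_log);
}
#endif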
72 struct cbq_sched_data;
75 struct cbq_class {
76         struct Qdisc_class_common common;
77         struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
79 /* Parameters */
80         unsigned char           priority;       /* class priority */
81         unsigned char           priority2;      /* priority to be used after overlimit */
82         unsigned char           ewma_log;       /* time constant for idle time calculation */
83         unsigned char           ovl_strategy;
84 #ifdef CONFIG_NET_CLS_ACT
85         unsigned char           police;
86 #endif
88         u32                     defmap;
90         /* Link-sharing scheduler parameters */
91         long                    maxidle;        /* Class parameters: see below. */
92         long                    offtime;
93         long                    minidle;
94         u32                     avpkt;
95         struct qdisc_rate_table *R_tab;
97         /* Overlimit strategy parameters */
98         void                    (*overlimit)(struct cbq_class *cl);
99         psched_tdiff_t          penalty;
101         /* General scheduler (WRR) parameters */
102         long                    allot;
103         long                    quantum;        /* Allotment per WRR round */
104         long                    weight;         /* Relative allotment: see below */
106         struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
107         struct cbq_class        *split;         /* Ptr to split node */
108         struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
109         struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
110         struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
111                                                    parent otherwise */
112         struct cbq_class        *sibling;       /* Sibling chain */
113         struct cbq_class        *children;      /* Pointer to children chain */
115         struct Qdisc            *q;             /* Elementary queueing discipline */
118 /* Variables */
119         unsigned char           cpriority;      /* Effective priority */
120         unsigned char           delayed;
121         unsigned char           level;          /* level of the class in hierarchy:
122                                                    0 for leaf classes, and maximal
123                                                    level of children + 1 for nodes.
124                                                  */
126         psched_time_t           last;           /* Last end of service */
127         psched_time_t           undertime;
128         long                    avgidle;
129         long                    deficit;        /* Saved deficit for WRR */
130         psched_time_t           penalized;
131         struct gnet_stats_basic_packed bstats;
132         struct gnet_stats_queue qstats;
133         struct gnet_stats_rate_est rate_est;
134         struct tc_cbq_xstats    xstats;
136         struct tcf_proto        *filter_list;
138         int                     refcnt;
139         int                     filters;
141         struct cbq_class        *defaults[TC_PRIO_MAX + 1];
142 };
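/* An illustrative (not exhaustive) picture of how the pointers above tie a
 * small hierarchy together.  Suppose the root (q->link) is 1:1 with children
 * 1:10 and 1:20, and 1:20 has a child 1:21:
 *
 *                1:1   (level 2)
 *               /   \
 *           1:10     1:20    (leaves 1:10 and 1:21 are level 0,
 *                      |       1:20 is level 1)
 *                    1:21
 *
 * tparent points up the tree (1:21 -> 1:20 -> 1:1), children points to one
 * child, and sibling closes the ring of children of the same parent.  For a
 * non-bounded class, borrow == tparent; for a bounded class it is NULL.
 * split/defaults are used only when the class acts as a split node.
 */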
144 struct cbq_sched_data {
145         struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
146         int                     nclasses[TC_CBQ_MAXPRIO + 1];
147         unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
149         struct cbq_class        link;
151         unsigned int            activemask;
152         struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
153                                                                    with backlog */
155 #ifdef CONFIG_NET_CLS_ACT
156         struct cbq_class        *rx_class;
157 #endif
158         struct cbq_class        *tx_class;
159         struct cbq_class        *tx_borrowed;
160         int                     tx_len;
161         psched_time_t           now;            /* Cached timestamp */
162         psched_time_t           now_rt;         /* Cached real time */
163         unsigned int            pmask;
165         struct hrtimer          delay_timer;
166         struct qdisc_watchdog   watchdog;       /* Watchdog timer,
167                                                    started when CBQ has
168                                                    backlog, but cannot
169                                                    transmit just now */
170         psched_tdiff_t          wd_expires;
171         int                     toplevel;
172         u32                     hgenerator;
173 };
176 #define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
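/* L2T ("length to time") maps a packet length to its transmission time at
 * the class rate, in psched ticks, via the user-supplied rate table.  As a
 * rough example, a 1500-byte packet on a 1 Mbit/s class corresponds to
 * about 12 ms of link time.
 */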
178 static inline struct cbq_class *
179 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
181         struct Qdisc_class_common *clc;
183         clc = qdisc_class_find(&q->clhash, classid);
184         if (clc == NULL)
185                 return NULL;
186         return container_of(clc, struct cbq_class, common);
189 #ifdef CONFIG_NET_CLS_ACT
191 static struct cbq_class *
192 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
194         struct cbq_class *cl;
196         for (cl = this->tparent; cl; cl = cl->tparent) {
197                 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
199                 if (new != NULL && new != this)
200                         return new;
201         }
202         return NULL;
205 #endif
207 /* Classify packet. The procedure is pretty complicated, but
208  * it allows us to combine link sharing and priority scheduling
209  * transparently.
210  *
211  * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
212  * so that it resolves to split nodes. Then packets are classified
213  * by logical priority, or a more specific classifier may be attached
214  * to the split node.
215  */
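/* For example (a hypothetical setup): if 1:1 is a split node whose
 * defaults[TC_PRIO_BESTEFFORT] points to class 1:20, then a packet that
 * matches no filter, whose skb->priority does not name one of our classes,
 * and that maps to no more specific default ends up in 1:20 via the
 * fallback path below.
 */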
217 static struct cbq_class *
218 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
220         struct cbq_sched_data *q = qdisc_priv(sch);
221         struct cbq_class *head = &q->link;
222         struct cbq_class **defmap;
223         struct cbq_class *cl = NULL;
224         u32 prio = skb->priority;
225         struct tcf_result res;
227         /*
228          *  Step 1. If skb->priority points to one of our classes, use it.
229          */
230         if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
231             (cl = cbq_class_lookup(q, prio)) != NULL)
232                 return cl;
234         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
235         for (;;) {
236                 int result = 0;
237                 defmap = head->defaults;
239                 /*
240                  * Step 2+n. Apply classifier.
241                  */
242                 if (!head->filter_list ||
243                     (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
244                         goto fallback;
246                 cl = (void *)res.class;
247                 if (!cl) {
248                         if (TC_H_MAJ(res.classid))
249                                 cl = cbq_class_lookup(q, res.classid);
250                         else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
251                                 cl = defmap[TC_PRIO_BESTEFFORT];
253                         if (cl == NULL)
254                                 goto fallback;
255                 }
256                 if (cl->level >= head->level)
257                         goto fallback;
258 #ifdef CONFIG_NET_CLS_ACT
259                 switch (result) {
260                 case TC_ACT_QUEUED:
261                 case TC_ACT_STOLEN:
262                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
263                 case TC_ACT_SHOT:
264                         return NULL;
265                 case TC_ACT_RECLASSIFY:
266                         return cbq_reclassify(skb, cl);
267                 }
268 #endif
269                 if (cl->level == 0)
270                         return cl;
272                 /*
273                  * Step 3+n. If classifier selected a link sharing class,
274                  *         apply its agency-specific classifier.
275                  *         Repeat this procedure until we hit a leaf node.
276                  */
277                 head = cl;
278         }
280 fallback:
281         cl = head;
283         /*
284          * Step 4. No success...
285          */
286         if (TC_H_MAJ(prio) == 0 &&
287             !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
288             !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
289                 return head;
291         return cl;
294 /*
295  * A packet has just been enqueued on an empty class.
296  * cbq_activate_class adds it to the tail of the active class list
297  * of its priority band.
298  */
300 static inline void cbq_activate_class(struct cbq_class *cl)
302         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
303         int prio = cl->cpriority;
304         struct cbq_class *cl_tail;
306         cl_tail = q->active[prio];
307         q->active[prio] = cl;
309         if (cl_tail != NULL) {
310                 cl->next_alive = cl_tail->next_alive;
311                 cl_tail->next_alive = cl;
312         } else {
313                 cl->next_alive = cl;
314                 q->activemask |= (1<<prio);
315         }
318 /*
319  * Unlink class from active chain.
320  * Note that this same procedure is done directly in cbq_dequeue*
321  * during round-robin procedure.
322  */
324 static void cbq_deactivate_class(struct cbq_class *this)
326         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
327         int prio = this->cpriority;
328         struct cbq_class *cl;
329         struct cbq_class *cl_prev = q->active[prio];
331         do {
332                 cl = cl_prev->next_alive;
333                 if (cl == this) {
334                         cl_prev->next_alive = cl->next_alive;
335                         cl->next_alive = NULL;
337                         if (cl == q->active[prio]) {
338                                 q->active[prio] = cl_prev;
339                                 if (cl == q->active[prio]) {
340                                         q->active[prio] = NULL;
341                                         q->activemask &= ~(1<<prio);
342                                         return;
343                                 }
344                         }
345                         return;
346                 }
347         } while ((cl_prev = cl) != q->active[prio]);
350 static void
351 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
353         int toplevel = q->toplevel;
355         if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
356                 psched_time_t now;
357                 psched_tdiff_t incr;
359                 now = psched_get_time();
360                 incr = now - q->now_rt;
361                 now = q->now + incr;
363                 do {
364                         if (cl->undertime < now) {
365                                 q->toplevel = cl->level;
366                                 return;
367                         }
368                 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
369         }
372 static int
373 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
375         struct cbq_sched_data *q = qdisc_priv(sch);
376         int uninitialized_var(ret);
377         struct cbq_class *cl = cbq_classify(skb, sch, &ret);
379 #ifdef CONFIG_NET_CLS_ACT
380         q->rx_class = cl;
381 #endif
382         if (cl == NULL) {
383                 if (ret & __NET_XMIT_BYPASS)
384                         sch->qstats.drops++;
385                 kfree_skb(skb);
386                 return ret;
387         }
389 #ifdef CONFIG_NET_CLS_ACT
390         cl->q->__parent = sch;
391 #endif
392         ret = qdisc_enqueue(skb, cl->q);
393         if (ret == NET_XMIT_SUCCESS) {
394                 sch->q.qlen++;
395                 cbq_mark_toplevel(q, cl);
396                 if (!cl->next_alive)
397                         cbq_activate_class(cl);
398                 return ret;
399         }
401         if (net_xmit_drop_count(ret)) {
402                 sch->qstats.drops++;
403                 cbq_mark_toplevel(q, cl);
404                 cl->qstats.drops++;
405         }
406         return ret;
409 /* Overlimit actions */
411 /* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
413 static void cbq_ovl_classic(struct cbq_class *cl)
415         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
416         psched_tdiff_t delay = cl->undertime - q->now;
418         if (!cl->delayed) {
419                 delay += cl->offtime;
421                 /*
422                  * Class goes to sleep, so that it will have no
423                  * chance to work avgidle. Let's forgive it 8)
424                  *
425                  * BTW cbq-2.0 has a bug in this
426                  * place; apparently they forgot to shift it by cl->ewma_log.
427                  */
428                 if (cl->avgidle < 0)
429                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
430                 if (cl->avgidle < cl->minidle)
431                         cl->avgidle = cl->minidle;
432                 if (delay <= 0)
433                         delay = 1;
434                 cl->undertime = q->now + delay;
436                 cl->xstats.overactions++;
437                 cl->delayed = 1;
438         }
439         if (q->wd_expires == 0 || q->wd_expires > delay)
440                 q->wd_expires = delay;
442         /* Dirty work! We must schedule wakeups based on
443          * real available rate, rather than leaf rate,
444          * which may be tiny (even zero).
445          */
446         if (q->toplevel == TC_CBQ_MAXLEVEL) {
447                 struct cbq_class *b;
448                 psched_tdiff_t base_delay = q->wd_expires;
450                 for (b = cl->borrow; b; b = b->borrow) {
451                         delay = b->undertime - q->now;
452                         if (delay < base_delay) {
453                                 if (delay <= 0)
454                                         delay = 1;
455                                 base_delay = delay;
456                         }
457                 }
459                 q->wd_expires = base_delay;
460         }
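/* A worked example (illustrative numbers): with offtime = 10000 ticks,
 * avgidle = -3200 and ewma_log = 5, a class that is not yet marked delayed
 * gets delay = (undertime - now) + 10000 - (3200 - 3200/32), i.e. the
 * configured offtime reduced by the "forgiven" negative avgidle.
 */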
463 /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
464  * they go overlimit
465  */
467 static void cbq_ovl_rclassic(struct cbq_class *cl)
469         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
470         struct cbq_class *this = cl;
472         do {
473                 if (cl->level > q->toplevel) {
474                         cl = NULL;
475                         break;
476                 }
477         } while ((cl = cl->borrow) != NULL);
479         if (cl == NULL)
480                 cl = this;
481         cbq_ovl_classic(cl);
484 /* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
486 static void cbq_ovl_delay(struct cbq_class *cl)
488         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
489         psched_tdiff_t delay = cl->undertime - q->now;
491         if (test_bit(__QDISC_STATE_DEACTIVATED,
492                      &qdisc_root_sleeping(cl->qdisc)->state))
493                 return;
495         if (!cl->delayed) {
496                 psched_time_t sched = q->now;
497                 ktime_t expires;
499                 delay += cl->offtime;
500                 if (cl->avgidle < 0)
501                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
502                 if (cl->avgidle < cl->minidle)
503                         cl->avgidle = cl->minidle;
504                 cl->undertime = q->now + delay;
506                 if (delay > 0) {
507                         sched += delay + cl->penalty;
508                         cl->penalized = sched;
509                         cl->cpriority = TC_CBQ_MAXPRIO;
510                         q->pmask |= (1<<TC_CBQ_MAXPRIO);
512                         expires = ns_to_ktime(PSCHED_TICKS2NS(sched));
513                         if (hrtimer_try_to_cancel(&q->delay_timer) &&
514                             ktime_to_ns(ktime_sub(
515                                         hrtimer_get_expires(&q->delay_timer),
516                                         expires)) > 0)
517                                 hrtimer_set_expires(&q->delay_timer, expires);
518                         hrtimer_restart(&q->delay_timer);
519                         cl->delayed = 1;
520                         cl->xstats.overactions++;
521                         return;
522                 }
523                 delay = 1;
524         }
525         if (q->wd_expires == 0 || q->wd_expires > delay)
526                 q->wd_expires = delay;
529 /* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
531 static void cbq_ovl_lowprio(struct cbq_class *cl)
533         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
535         cl->penalized = q->now + cl->penalty;
537         if (cl->cpriority != cl->priority2) {
538                 cl->cpriority = cl->priority2;
539                 q->pmask |= (1<<cl->cpriority);
540                 cl->xstats.overactions++;
541         }
542         cbq_ovl_classic(cl);
545 /* TC_CBQ_OVL_DROP: penalize class by dropping */
547 static void cbq_ovl_drop(struct cbq_class *cl)
549         if (cl->q->ops->drop)
550                 if (cl->q->ops->drop(cl->q))
551                         cl->qdisc->q.qlen--;
552         cl->xstats.overactions++;
553         cbq_ovl_classic(cl);
556 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
557                                        psched_time_t now)
559         struct cbq_class *cl;
560         struct cbq_class *cl_prev = q->active[prio];
561         psched_time_t sched = now;
563         if (cl_prev == NULL)
564                 return 0;
566         do {
567                 cl = cl_prev->next_alive;
568                 if (now - cl->penalized > 0) {
569                         cl_prev->next_alive = cl->next_alive;
570                         cl->next_alive = NULL;
571                         cl->cpriority = cl->priority;
572                         cl->delayed = 0;
573                         cbq_activate_class(cl);
575                         if (cl == q->active[prio]) {
576                                 q->active[prio] = cl_prev;
577                                 if (cl == q->active[prio]) {
578                                         q->active[prio] = NULL;
579                                         return 0;
580                                 }
581                         }
583                         cl = cl_prev->next_alive;
584                 } else if (sched - cl->penalized > 0)
585                         sched = cl->penalized;
586         } while ((cl_prev = cl) != q->active[prio]);
588         return sched - now;
591 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
593         struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
594                                                 delay_timer);
595         struct Qdisc *sch = q->watchdog.qdisc;
596         psched_time_t now;
597         psched_tdiff_t delay = 0;
598         unsigned int pmask;
600         now = psched_get_time();
602         pmask = q->pmask;
603         q->pmask = 0;
605         while (pmask) {
606                 int prio = ffz(~pmask);
607                 psched_tdiff_t tmp;
609                 pmask &= ~(1<<prio);
611                 tmp = cbq_undelay_prio(q, prio, now);
612                 if (tmp > 0) {
613                         q->pmask |= 1<<prio;
614                         if (tmp < delay || delay == 0)
615                                 delay = tmp;
616                 }
617         }
619         if (delay) {
620                 ktime_t time;
622                 time = ktime_set(0, 0);
623                 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
624                 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
625         }
627         qdisc_unthrottled(sch);
628         __netif_schedule(qdisc_root(sch));
629         return HRTIMER_NORESTART;
632 #ifdef CONFIG_NET_CLS_ACT
633 static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
635         struct Qdisc *sch = child->__parent;
636         struct cbq_sched_data *q = qdisc_priv(sch);
637         struct cbq_class *cl = q->rx_class;
639         q->rx_class = NULL;
641         if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
642                 int ret;
644                 cbq_mark_toplevel(q, cl);
646                 q->rx_class = cl;
647                 cl->q->__parent = sch;
649                 ret = qdisc_enqueue(skb, cl->q);
650                 if (ret == NET_XMIT_SUCCESS) {
651                         sch->q.qlen++;
652                         if (!cl->next_alive)
653                                 cbq_activate_class(cl);
654                         return 0;
655                 }
656                 if (net_xmit_drop_count(ret))
657                         sch->qstats.drops++;
658                 return 0;
659         }
661         sch->qstats.drops++;
662         return -1;
664 #endif
666 /*
667  * This is a mission-critical procedure.
668  *
669  * We "regenerate" the toplevel cutoff if the transmitting class
670  * has backlog and is not regulated. It is not part of the
671  * original CBQ description, but looks more reasonable.
672  * Probably it is wrong. This question needs further investigation.
673  */
675 static inline void
676 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
677                     struct cbq_class *borrowed)
679         if (cl && q->toplevel >= borrowed->level) {
680                 if (cl->q->q.qlen > 1) {
681                         do {
682                                 if (borrowed->undertime == PSCHED_PASTPERFECT) {
683                                         q->toplevel = borrowed->level;
684                                         return;
685                                 }
686                         } while ((borrowed = borrowed->borrow) != NULL);
687                 }
688 #if 0
689         /* It is not necessary now. Uncommenting it
690            will save CPU cycles, but decrease fairness.
691          */
692                 q->toplevel = TC_CBQ_MAXLEVEL;
693 #endif
694         }
697 static void
698 cbq_update(struct cbq_sched_data *q)
700         struct cbq_class *this = q->tx_class;
701         struct cbq_class *cl = this;
702         int len = q->tx_len;
704         q->tx_class = NULL;
706         for ( ; cl; cl = cl->share) {
707                 long avgidle = cl->avgidle;
708                 long idle;
710                 cl->bstats.packets++;
711                 cl->bstats.bytes += len;
713                 /*
714                  * (now - last) is total time between packet right edges.
715                  * (last_pktlen/rate) is "virtual" busy time, so that
716                  *
717                  *      idle = (now - last) - last_pktlen/rate
718                  */
720                 idle = q->now - cl->last;
721                 if ((unsigned long)idle > 128*1024*1024) {
722                         avgidle = cl->maxidle;
723                 } else {
724                         idle -= L2T(cl, len);
726                 /* true_avgidle := (1-W)*true_avgidle + W*idle,
727                  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
728                  * cl->avgidle == true_avgidle/W,
729                  * hence:
730                  */
731                         avgidle += idle - (avgidle>>cl->ewma_log);
732                 }
734                 if (avgidle <= 0) {
735                         /* Overlimit or at-limit */
737                         if (avgidle < cl->minidle)
738                                 avgidle = cl->minidle;
740                         cl->avgidle = avgidle;
742                         /* Calculate the expected time when this class
743                          * will be allowed to send.
744                          * It will occur when:
745                          * (1-W)*true_avgidle + W*delay = 0, i.e.
746                          * idle = (1/W - 1)*(-true_avgidle)
747                          * or
748                          * idle = (1 - W)*(-cl->avgidle);
749                          */
750                         idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
752                         /*
753                          * That is not all.
754                          * To maintain the rate allocated to the class,
755                          * we add to undertime virtual clock,
756                          * necessary to complete transmitted packet.
757                          * (len/phys_bandwidth has already been passed
758                          * to the moment of cbq_update)
759                          */
761                         idle -= L2T(&q->link, len);
762                         idle += L2T(cl, len);
764                         cl->undertime = q->now + idle;
765                 } else {
766                         /* Underlimit */
768                         cl->undertime = PSCHED_PASTPERFECT;
769                         if (avgidle > cl->maxidle)
770                                 cl->avgidle = cl->maxidle;
771                         else
772                                 cl->avgidle = avgidle;
773                 }
774                 cl->last = q->now;
775         }
777         cbq_update_toplevel(q, this, q->tx_borrowed);
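/* A worked example (illustrative numbers) for the undertime computation
 * above: with ewma_log = 5 (W = 1/32) and avgidle = -3200 ticks after a
 * transmission,
 *
 *	idle      = 3200 - (3200 >> 5) = 3100 ticks
 *	undertime = q->now + 3100 - L2T(&q->link, len) + L2T(cl, len)
 *
 * so the class stays overlimit for roughly (1 - W) * |avgidle| virtual
 * ticks, corrected by the time the just-sent packet occupies on the link
 * versus at the class rate.
 */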
780 static inline struct cbq_class *
781 cbq_under_limit(struct cbq_class *cl)
783         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
784         struct cbq_class *this_cl = cl;
786         if (cl->tparent == NULL)
787                 return cl;
789         if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
790                 cl->delayed = 0;
791                 return cl;
792         }
794         do {
795                 /* This is a very suspicious place. Currently the overlimit
796                  * action is generated for non-bounded classes
797                  * only if the link is completely congested.
798                  * Though this agrees with the ancestor-only paradigm,
799                  * it looks very stupid. In particular,
800                  * it means that this chunk of code will either
801                  * never be called or result in strong amplification
802                  * of burstiness. Dangerous, silly, and yet
803                  * no other solution exists.
804                  */
805                 cl = cl->borrow;
806                 if (!cl) {
807                         this_cl->qstats.overlimits++;
808                         this_cl->overlimit(this_cl);
809                         return NULL;
810                 }
811                 if (cl->level > q->toplevel)
812                         return NULL;
813         } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
815         cl->delayed = 0;
816         return cl;
819 static inline struct sk_buff *
820 cbq_dequeue_prio(struct Qdisc *sch, int prio)
822         struct cbq_sched_data *q = qdisc_priv(sch);
823         struct cbq_class *cl_tail, *cl_prev, *cl;
824         struct sk_buff *skb;
825         int deficit;
827         cl_tail = cl_prev = q->active[prio];
828         cl = cl_prev->next_alive;
830         do {
831                 deficit = 0;
833                 /* Start round */
834                 do {
835                         struct cbq_class *borrow = cl;
837                         if (cl->q->q.qlen &&
838                             (borrow = cbq_under_limit(cl)) == NULL)
839                                 goto skip_class;
841                         if (cl->deficit <= 0) {
842                                 /* Class exhausted its allotment per
843                                  * this round. Switch to the next one.
844                                  */
845                                 deficit = 1;
846                                 cl->deficit += cl->quantum;
847                                 goto next_class;
848                         }
850                         skb = cl->q->dequeue(cl->q);
852                         /* Class did not give us any skb :-(
853                          * It can occur even if cl->q->q.qlen != 0,
854                          * e.g. if cl->q == "tbf"
855                          */
856                         if (skb == NULL)
857                                 goto skip_class;
859                         cl->deficit -= qdisc_pkt_len(skb);
860                         q->tx_class = cl;
861                         q->tx_borrowed = borrow;
862                         if (borrow != cl) {
863 #ifndef CBQ_XSTATS_BORROWS_BYTES
864                                 borrow->xstats.borrows++;
865                                 cl->xstats.borrows++;
866 #else
867                                 borrow->xstats.borrows += qdisc_pkt_len(skb);
868                                 cl->xstats.borrows += qdisc_pkt_len(skb);
869 #endif
870                         }
871                         q->tx_len = qdisc_pkt_len(skb);
873                         if (cl->deficit <= 0) {
874                                 q->active[prio] = cl;
875                                 cl = cl->next_alive;
876                                 cl->deficit += cl->quantum;
877                         }
878                         return skb;
880 skip_class:
881                         if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
882                                 /* Class is empty or penalized.
883                                  * Unlink it from active chain.
884                                  */
885                                 cl_prev->next_alive = cl->next_alive;
886                                 cl->next_alive = NULL;
888                                 /* Did cl_tail point to it? */
889                                 if (cl == cl_tail) {
890                                         /* Repair it! */
891                                         cl_tail = cl_prev;
893                                         /* Was it the last class in this band? */
894                                         if (cl == cl_tail) {
895                                                 /* Kill the band! */
896                                                 q->active[prio] = NULL;
897                                                 q->activemask &= ~(1<<prio);
898                                                 if (cl->q->q.qlen)
899                                                         cbq_activate_class(cl);
900                                                 return NULL;
901                                         }
903                                         q->active[prio] = cl_tail;
904                                 }
905                                 if (cl->q->q.qlen)
906                                         cbq_activate_class(cl);
908                                 cl = cl_prev;
909                         }
911 next_class:
912                         cl_prev = cl;
913                         cl = cl->next_alive;
914                 } while (cl_prev != cl_tail);
915         } while (deficit);
917         q->active[prio] = cl_prev;
919         return NULL;
922 static inline struct sk_buff *
923 cbq_dequeue_1(struct Qdisc *sch)
925         struct cbq_sched_data *q = qdisc_priv(sch);
926         struct sk_buff *skb;
927         unsigned int activemask;
929         activemask = q->activemask & 0xFF;
930         while (activemask) {
931                 int prio = ffz(~activemask);
932                 activemask &= ~(1<<prio);
933                 skb = cbq_dequeue_prio(sch, prio);
934                 if (skb)
935                         return skb;
936         }
937         return NULL;
940 static struct sk_buff *
941 cbq_dequeue(struct Qdisc *sch)
943         struct sk_buff *skb;
944         struct cbq_sched_data *q = qdisc_priv(sch);
945         psched_time_t now;
946         psched_tdiff_t incr;
948         now = psched_get_time();
949         incr = now - q->now_rt;
951         if (q->tx_class) {
952                 psched_tdiff_t incr2;
953                 /* Time integrator. We calculate the EOS time
954                  * by adding the expected packet transmission time.
955                  * If real time is greater, we warp the artificial clock,
956                  * so that:
957                  *
958                  * cbq_time = max(real_time, work);
959                  */
960                 incr2 = L2T(&q->link, q->tx_len);
961                 q->now += incr2;
962                 cbq_update(q);
963                 if ((incr -= incr2) < 0)
964                         incr = 0;
965                 q->now += incr;
966         } else {
967                 if (now > q->now)
968                         q->now = now;
969         }
970         q->now_rt = now;
972         for (;;) {
973                 q->wd_expires = 0;
975                 skb = cbq_dequeue_1(sch);
976                 if (skb) {
977                         qdisc_bstats_update(sch, skb);
978                         sch->q.qlen--;
979                         qdisc_unthrottled(sch);
980                         return skb;
981                 }
983                 /* All the classes are overlimit.
984                  *
985                  * It is possible, if:
986                  *
987                  * 1. Scheduler is empty.
988                  * 2. Toplevel cutoff inhibited borrowing.
989                  * 3. Root class is overlimit.
990                  *
991                  * Reset the 2nd and 3rd conditions and retry.
992                  *
993                  * Note that NS and cbq-2.0 are buggy: peeking
994                  * an arbitrary class is appropriate for ancestor-only
995                  * sharing, but not for the top-level algorithm.
996                  *
997                  * Our version is better but slower, because it requires
998                  * two passes; this is unavoidable with top-level sharing.
999                  */
1001                 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1002                     q->link.undertime == PSCHED_PASTPERFECT)
1003                         break;
1005                 q->toplevel = TC_CBQ_MAXLEVEL;
1006                 q->link.undertime = PSCHED_PASTPERFECT;
1007         }
1009         /* No packets in the scheduler, or nobody wants to give them to us :-(
1010          * Sigh... start the watchdog timer in the latter case.
1011          */
1013         if (sch->q.qlen) {
1014                 sch->qstats.overlimits++;
1015                 if (q->wd_expires)
1016                         qdisc_watchdog_schedule(&q->watchdog,
1017                                                 now + q->wd_expires);
1018         }
1019         return NULL;
1022 /* CBQ class maintenance routines */
1024 static void cbq_adjust_levels(struct cbq_class *this)
1026         if (this == NULL)
1027                 return;
1029         do {
1030                 int level = 0;
1031                 struct cbq_class *cl;
1033                 cl = this->children;
1034                 if (cl) {
1035                         do {
1036                                 if (cl->level > level)
1037                                         level = cl->level;
1038                         } while ((cl = cl->sibling) != this->children);
1039                 }
1040                 this->level = level + 1;
1041         } while ((this = this->tparent) != NULL);
1044 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1046         struct cbq_class *cl;
1047         struct hlist_node *n;
1048         unsigned int h;
1050         if (q->quanta[prio] == 0)
1051                 return;
1053         for (h = 0; h < q->clhash.hashsize; h++) {
1054                 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1055                         /* BUGGGG... Beware! This expression suffers from
1056                          * arithmetic overflows!
1057                          */
1058                         if (cl->priority == prio) {
1059                                 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1060                                         q->quanta[prio];
1061                         }
1062                         if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1063                                 pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
1064                                            cl->common.classid, cl->quantum);
1065                                 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1066                         }
1067                 }
1068         }
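/* A worked example (illustrative numbers): three classes in one band with
 * equal allot A and weights 1, 2 and 3 give q->nclasses[prio] = 3 and
 * q->quanta[prio] = 6, so the computed quanta are A/2, A and 3A/2 - each
 * class's per-round share is proportional to its weight, and the average
 * quantum equals the average allot.
 */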
1071 static void cbq_sync_defmap(struct cbq_class *cl)
1073         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1074         struct cbq_class *split = cl->split;
1075         unsigned int h;
1076         int i;
1078         if (split == NULL)
1079                 return;
1081         for (i = 0; i <= TC_PRIO_MAX; i++) {
1082                 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1083                         split->defaults[i] = NULL;
1084         }
1086         for (i = 0; i <= TC_PRIO_MAX; i++) {
1087                 int level = split->level;
1089                 if (split->defaults[i])
1090                         continue;
1092                 for (h = 0; h < q->clhash.hashsize; h++) {
1093                         struct hlist_node *n;
1094                         struct cbq_class *c;
1096                         hlist_for_each_entry(c, n, &q->clhash.hash[h],
1097                                              common.hnode) {
1098                                 if (c->split == split && c->level < level &&
1099                                     c->defmap & (1<<i)) {
1100                                         split->defaults[i] = c;
1101                                         level = c->level;
1102                                 }
1103                         }
1104                 }
1105         }
1108 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1110         struct cbq_class *split = NULL;
1112         if (splitid == 0) {
1113                 split = cl->split;
1114                 if (!split)
1115                         return;
1116                 splitid = split->common.classid;
1117         }
1119         if (split == NULL || split->common.classid != splitid) {
1120                 for (split = cl->tparent; split; split = split->tparent)
1121                         if (split->common.classid == splitid)
1122                                 break;
1123         }
1125         if (split == NULL)
1126                 return;
1128         if (cl->split != split) {
1129                 cl->defmap = 0;
1130                 cbq_sync_defmap(cl);
1131                 cl->split = split;
1132                 cl->defmap = def & mask;
1133         } else
1134                 cl->defmap = (cl->defmap & ~mask) | (def & mask);
1136         cbq_sync_defmap(cl);
1139 static void cbq_unlink_class(struct cbq_class *this)
1141         struct cbq_class *cl, **clp;
1142         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1144         qdisc_class_hash_remove(&q->clhash, &this->common);
1146         if (this->tparent) {
1147                 clp = &this->sibling;
1148                 cl = *clp;
1149                 do {
1150                         if (cl == this) {
1151                                 *clp = cl->sibling;
1152                                 break;
1153                         }
1154                         clp = &cl->sibling;
1155                 } while ((cl = *clp) != this->sibling);
1157                 if (this->tparent->children == this) {
1158                         this->tparent->children = this->sibling;
1159                         if (this->sibling == this)
1160                                 this->tparent->children = NULL;
1161                 }
1162         } else {
1163                 WARN_ON(this->sibling != this);
1164         }
1167 static void cbq_link_class(struct cbq_class *this)
1169         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1170         struct cbq_class *parent = this->tparent;
1172         this->sibling = this;
1173         qdisc_class_hash_insert(&q->clhash, &this->common);
1175         if (parent == NULL)
1176                 return;
1178         if (parent->children == NULL) {
1179                 parent->children = this;
1180         } else {
1181                 this->sibling = parent->children->sibling;
1182                 parent->children->sibling = this;
1183         }
1186 static unsigned int cbq_drop(struct Qdisc *sch)
1188         struct cbq_sched_data *q = qdisc_priv(sch);
1189         struct cbq_class *cl, *cl_head;
1190         int prio;
1191         unsigned int len;
1193         for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1194                 cl_head = q->active[prio];
1195                 if (!cl_head)
1196                         continue;
1198                 cl = cl_head;
1199                 do {
1200                         if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1201                                 sch->q.qlen--;
1202                                 if (!cl->q->q.qlen)
1203                                         cbq_deactivate_class(cl);
1204                                 return len;
1205                         }
1206                 } while ((cl = cl->next_alive) != cl_head);
1207         }
1208         return 0;
1211 static void
1212 cbq_reset(struct Qdisc *sch)
1214         struct cbq_sched_data *q = qdisc_priv(sch);
1215         struct cbq_class *cl;
1216         struct hlist_node *n;
1217         int prio;
1218         unsigned int h;
1220         q->activemask = 0;
1221         q->pmask = 0;
1222         q->tx_class = NULL;
1223         q->tx_borrowed = NULL;
1224         qdisc_watchdog_cancel(&q->watchdog);
1225         hrtimer_cancel(&q->delay_timer);
1226         q->toplevel = TC_CBQ_MAXLEVEL;
1227         q->now = psched_get_time();
1228         q->now_rt = q->now;
1230         for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1231                 q->active[prio] = NULL;
1233         for (h = 0; h < q->clhash.hashsize; h++) {
1234                 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1235                         qdisc_reset(cl->q);
1237                         cl->next_alive = NULL;
1238                         cl->undertime = PSCHED_PASTPERFECT;
1239                         cl->avgidle = cl->maxidle;
1240                         cl->deficit = cl->quantum;
1241                         cl->cpriority = cl->priority;
1242                 }
1243         }
1244         sch->q.qlen = 0;
1248 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1250         if (lss->change & TCF_CBQ_LSS_FLAGS) {
1251                 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1252                 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1253         }
1254         if (lss->change & TCF_CBQ_LSS_EWMA)
1255                 cl->ewma_log = lss->ewma_log;
1256         if (lss->change & TCF_CBQ_LSS_AVPKT)
1257                 cl->avpkt = lss->avpkt;
1258         if (lss->change & TCF_CBQ_LSS_MINIDLE)
1259                 cl->minidle = -(long)lss->minidle;
1260         if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1261                 cl->maxidle = lss->maxidle;
1262                 cl->avgidle = lss->maxidle;
1263         }
1264         if (lss->change & TCF_CBQ_LSS_OFFTIME)
1265                 cl->offtime = lss->offtime;
1266         return 0;
1269 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1271         q->nclasses[cl->priority]--;
1272         q->quanta[cl->priority] -= cl->weight;
1273         cbq_normalize_quanta(q, cl->priority);
1276 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1278         q->nclasses[cl->priority]++;
1279         q->quanta[cl->priority] += cl->weight;
1280         cbq_normalize_quanta(q, cl->priority);
1283 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1285         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1287         if (wrr->allot)
1288                 cl->allot = wrr->allot;
1289         if (wrr->weight)
1290                 cl->weight = wrr->weight;
1291         if (wrr->priority) {
1292                 cl->priority = wrr->priority - 1;
1293                 cl->cpriority = cl->priority;
1294                 if (cl->priority >= cl->priority2)
1295                         cl->priority2 = TC_CBQ_MAXPRIO - 1;
1296         }
1298         cbq_addprio(q, cl);
1299         return 0;
1302 static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1304         switch (ovl->strategy) {
1305         case TC_CBQ_OVL_CLASSIC:
1306                 cl->overlimit = cbq_ovl_classic;
1307                 break;
1308         case TC_CBQ_OVL_DELAY:
1309                 cl->overlimit = cbq_ovl_delay;
1310                 break;
1311         case TC_CBQ_OVL_LOWPRIO:
1312                 if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
1313                     ovl->priority2 - 1 <= cl->priority)
1314                         return -EINVAL;
1315                 cl->priority2 = ovl->priority2 - 1;
1316                 cl->overlimit = cbq_ovl_lowprio;
1317                 break;
1318         case TC_CBQ_OVL_DROP:
1319                 cl->overlimit = cbq_ovl_drop;
1320                 break;
1321         case TC_CBQ_OVL_RCLASSIC:
1322                 cl->overlimit = cbq_ovl_rclassic;
1323                 break;
1324         default:
1325                 return -EINVAL;
1326         }
1327         cl->penalty = ovl->penalty;
1328         return 0;
1331 #ifdef CONFIG_NET_CLS_ACT
1332 static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1334         cl->police = p->police;
1336         if (cl->q->handle) {
1337                 if (p->police == TC_POLICE_RECLASSIFY)
1338                         cl->q->reshape_fail = cbq_reshape_fail;
1339                 else
1340                         cl->q->reshape_fail = NULL;
1341         }
1342         return 0;
1344 #endif
1346 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1348         cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1349         return 0;
1352 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1353         [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1354         [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1355         [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1356         [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1357         [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1358         [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1359         [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1360 };
1362 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1364         struct cbq_sched_data *q = qdisc_priv(sch);
1365         struct nlattr *tb[TCA_CBQ_MAX + 1];
1366         struct tc_ratespec *r;
1367         int err;
1369         err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1370         if (err < 0)
1371                 return err;
1373         if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1374                 return -EINVAL;
1376         r = nla_data(tb[TCA_CBQ_RATE]);
1378         if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1379                 return -EINVAL;
1381         err = qdisc_class_hash_init(&q->clhash);
1382         if (err < 0)
1383                 goto put_rtab;
1385         q->link.refcnt = 1;
1386         q->link.sibling = &q->link;
1387         q->link.common.classid = sch->handle;
1388         q->link.qdisc = sch;
1389         q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1390                                       sch->handle);
1391         if (!q->link.q)
1392                 q->link.q = &noop_qdisc;
1394         q->link.priority = TC_CBQ_MAXPRIO - 1;
1395         q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1396         q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1397         q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1398         q->link.overlimit = cbq_ovl_classic;
1399         q->link.allot = psched_mtu(qdisc_dev(sch));
1400         q->link.quantum = q->link.allot;
1401         q->link.weight = q->link.R_tab->rate.rate;
1403         q->link.ewma_log = TC_CBQ_DEF_EWMA;
1404         q->link.avpkt = q->link.allot/2;
1405         q->link.minidle = -0x7FFFFFFF;
1407         qdisc_watchdog_init(&q->watchdog, sch);
1408         hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1409         q->delay_timer.function = cbq_undelay;
1410         q->toplevel = TC_CBQ_MAXLEVEL;
1411         q->now = psched_get_time();
1412         q->now_rt = q->now;
1414         cbq_link_class(&q->link);
1416         if (tb[TCA_CBQ_LSSOPT])
1417                 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1419         cbq_addprio(q, &q->link);
1420         return 0;
1422 put_rtab:
1423         qdisc_put_rtab(q->link.R_tab);
1424         return err;
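/* For reference, a typical user-space configuration that exercises this
 * qdisc looks roughly like the following (illustrative device name and
 * rates; tc builds the TCA_CBQ_RATE/TCA_CBQ_RTAB attributes parsed above
 * from the rate and bandwidth parameters):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:10 cbq bandwidth 100Mbit \
 *		rate 10Mbit allot 1514 prio 5 avpkt 1000 bounded
 */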
1427 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1429         unsigned char *b = skb_tail_pointer(skb);
1431         if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1432                 goto nla_put_failure;
1433         return skb->len;
1435 nla_put_failure:
1436         nlmsg_trim(skb, b);
1437         return -1;
1440 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1442         unsigned char *b = skb_tail_pointer(skb);
1443         struct tc_cbq_lssopt opt;
1445         opt.flags = 0;
1446         if (cl->borrow == NULL)
1447                 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1448         if (cl->share == NULL)
1449                 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1450         opt.ewma_log = cl->ewma_log;
1451         opt.level = cl->level;
1452         opt.avpkt = cl->avpkt;
1453         opt.maxidle = cl->maxidle;
1454         opt.minidle = (u32)(-cl->minidle);
1455         opt.offtime = cl->offtime;
1456         opt.change = ~0;
1457         if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1458                 goto nla_put_failure;
1459         return skb->len;
1461 nla_put_failure:
1462         nlmsg_trim(skb, b);
1463         return -1;
1466 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1468         unsigned char *b = skb_tail_pointer(skb);
1469         struct tc_cbq_wrropt opt;
1471         opt.flags = 0;
1472         opt.allot = cl->allot;
1473         opt.priority = cl->priority + 1;
1474         opt.cpriority = cl->cpriority + 1;
1475         opt.weight = cl->weight;
1476         if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1477                 goto nla_put_failure;
1478         return skb->len;
1480 nla_put_failure:
1481         nlmsg_trim(skb, b);
1482         return -1;
1485 static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1487         unsigned char *b = skb_tail_pointer(skb);
1488         struct tc_cbq_ovl opt;
1490         opt.strategy = cl->ovl_strategy;
1491         opt.priority2 = cl->priority2 + 1;
1492         opt.pad = 0;
1493         opt.penalty = cl->penalty;
1494         if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt))
1495                 goto nla_put_failure;
1496         return skb->len;
1498 nla_put_failure:
1499         nlmsg_trim(skb, b);
1500         return -1;
1503 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1505         unsigned char *b = skb_tail_pointer(skb);
1506         struct tc_cbq_fopt opt;
1508         if (cl->split || cl->defmap) {
1509                 opt.split = cl->split ? cl->split->common.classid : 0;
1510                 opt.defmap = cl->defmap;
1511                 opt.defchange = ~0;
1512                 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1513                         goto nla_put_failure;
1514         }
1515         return skb->len;
1517 nla_put_failure:
1518         nlmsg_trim(skb, b);
1519         return -1;
1522 #ifdef CONFIG_NET_CLS_ACT
1523 static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1525         unsigned char *b = skb_tail_pointer(skb);
1526         struct tc_cbq_police opt;
1528         if (cl->police) {
1529                 opt.police = cl->police;
1530                 opt.__res1 = 0;
1531                 opt.__res2 = 0;
1532                 if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
1533                         goto nla_put_failure;
1534         }
1535         return skb->len;
1537 nla_put_failure:
1538         nlmsg_trim(skb, b);
1539         return -1;
1541 #endif
1543 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1545         if (cbq_dump_lss(skb, cl) < 0 ||
1546             cbq_dump_rate(skb, cl) < 0 ||
1547             cbq_dump_wrr(skb, cl) < 0 ||
1548             cbq_dump_ovl(skb, cl) < 0 ||
1549 #ifdef CONFIG_NET_CLS_ACT
1550             cbq_dump_police(skb, cl) < 0 ||
1551 #endif
1552             cbq_dump_fopt(skb, cl) < 0)
1553                 return -1;
1554         return 0;
1555 }
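/* Dump the root (link) class parameters inside a TCA_OPTIONS nest. */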
1557 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1558 {
1559         struct cbq_sched_data *q = qdisc_priv(sch);
1560         struct nlattr *nest;
1562         nest = nla_nest_start(skb, TCA_OPTIONS);
1563         if (nest == NULL)
1564                 goto nla_put_failure;
1565         if (cbq_dump_attr(skb, &q->link) < 0)
1566                 goto nla_put_failure;
1567         nla_nest_end(skb, nest);
1568         return skb->len;
1570 nla_put_failure:
1571         nla_nest_cancel(skb, nest);
1572         return -1;
1573 }
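/* Copy the qdisc-level extended statistics (current avgidle of the link
 * class) to the stats dump.
 */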
1575 static int
1576 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1577 {
1578         struct cbq_sched_data *q = qdisc_priv(sch);
1580         q->link.xstats.avgidle = q->link.avgidle;
1581         return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1582 }
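/* Fill in the tcmsg header (parent, handle, leaf qdisc) for a class and
 * dump its attributes inside a TCA_OPTIONS nest.
 */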
1584 static int
1585 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1586                struct sk_buff *skb, struct tcmsg *tcm)
1587 {
1588         struct cbq_class *cl = (struct cbq_class *)arg;
1589         struct nlattr *nest;
1591         if (cl->tparent)
1592                 tcm->tcm_parent = cl->tparent->common.classid;
1593         else
1594                 tcm->tcm_parent = TC_H_ROOT;
1595         tcm->tcm_handle = cl->common.classid;
1596         tcm->tcm_info = cl->q->handle;
1598         nest = nla_nest_start(skb, TCA_OPTIONS);
1599         if (nest == NULL)
1600                 goto nla_put_failure;
1601         if (cbq_dump_attr(skb, cl) < 0)
1602                 goto nla_put_failure;
1603         nla_nest_end(skb, nest);
1604         return skb->len;
1606 nla_put_failure:
1607         nla_nest_cancel(skb, nest);
1608         return -1;
1611 static int
1612 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1613         struct gnet_dump *d)
1614 {
1615         struct cbq_sched_data *q = qdisc_priv(sch);
1616         struct cbq_class *cl = (struct cbq_class *)arg;
1618         cl->qstats.qlen = cl->q->q.qlen;
1619         cl->xstats.avgidle = cl->avgidle;
1620         cl->xstats.undertime = 0;
1622         if (cl->undertime != PSCHED_PASTPERFECT)
1623                 cl->xstats.undertime = cl->undertime - q->now;
1625         if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1626             gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1627             gnet_stats_copy_queue(d, &cl->qstats) < 0)
1628                 return -1;
1630         return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1631 }
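/* Attach a new leaf qdisc to a class, creating a default pfifo when none
 * is supplied; the old qdisc is reset and returned via *old.
 */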
1633 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1634                      struct Qdisc **old)
1635 {
1636         struct cbq_class *cl = (struct cbq_class *)arg;
1638         if (new == NULL) {
1639                 new = qdisc_create_dflt(sch->dev_queue,
1640                                         &pfifo_qdisc_ops, cl->common.classid);
1641                 if (new == NULL)
1642                         return -ENOBUFS;
1643         } else {
1644 #ifdef CONFIG_NET_CLS_ACT
1645                 if (cl->police == TC_POLICE_RECLASSIFY)
1646                         new->reshape_fail = cbq_reshape_fail;
1647 #endif
1648         }
1649         sch_tree_lock(sch);
1650         *old = cl->q;
1651         cl->q = new;
1652         qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1653         qdisc_reset(*old);
1654         sch_tree_unlock(sch);
1656         return 0;
1657 }
1659 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1660 {
1661         struct cbq_class *cl = (struct cbq_class *)arg;
1663         return cl->q;
1664 }
1666 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1667 {
1668         struct cbq_class *cl = (struct cbq_class *)arg;
1670         if (cl->q->q.qlen == 0)
1671                 cbq_deactivate_class(cl);
1672 }
1674 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1675 {
1676         struct cbq_sched_data *q = qdisc_priv(sch);
1677         struct cbq_class *cl = cbq_class_lookup(q, classid);
1679         if (cl) {
1680                 cl->refcnt++;
1681                 return (unsigned long)cl;
1682         }
1683         return 0;
1684 }
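/* Release everything owned by a class: filter chain, leaf qdisc, rate
 * table and rate estimator.  The embedded link class is not kfree()d.
 */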
1686 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1687 {
1688         struct cbq_sched_data *q = qdisc_priv(sch);
1690         WARN_ON(cl->filters);
1692         tcf_destroy_chain(&cl->filter_list);
1693         qdisc_destroy(cl->q);
1694         qdisc_put_rtab(cl->R_tab);
1695         gen_kill_estimator(&cl->bstats, &cl->rate_est);
1696         if (cl != &q->link)
1697                 kfree(cl);
1698 }
1700 static void cbq_destroy(struct Qdisc *sch)
1701 {
1702         struct cbq_sched_data *q = qdisc_priv(sch);
1703         struct hlist_node *n, *next;
1704         struct cbq_class *cl;
1705         unsigned int h;
1707 #ifdef CONFIG_NET_CLS_ACT
1708         q->rx_class = NULL;
1709 #endif
1710         /*
1711          * Filters must be destroyed first because we don't destroy the
1712          * classes from root to leaves, which means that filters can still
1713          * be bound to classes which have been destroyed already. --TGR '04
1714          */
1715         for (h = 0; h < q->clhash.hashsize; h++) {
1716                 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode)
1717                         tcf_destroy_chain(&cl->filter_list);
1718         }
1719         for (h = 0; h < q->clhash.hashsize; h++) {
1720                 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h],
1721                                           common.hnode)
1722                         cbq_destroy_class(sch, cl);
1723         }
1724         qdisc_class_hash_destroy(&q->clhash);
1725 }
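/* Drop a reference taken by cbq_get(); the class is destroyed when the
 * last reference goes away.
 */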
1727 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1728 {
1729         struct cbq_class *cl = (struct cbq_class *)arg;
1731         if (--cl->refcnt == 0) {
1732 #ifdef CONFIG_NET_CLS_ACT
1733                 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1734                 struct cbq_sched_data *q = qdisc_priv(sch);
1736                 spin_lock_bh(root_lock);
1737                 if (q->rx_class == cl)
1738                         q->rx_class = NULL;
1739                 spin_unlock_bh(root_lock);
1740 #endif
1742                 cbq_destroy_class(sch, cl);
1743         }
1744 }
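/* Create a new class or change an existing one from the netlink
 * attributes supplied by userspace (roughly what something like
 * "tc class add ... cbq rate ... allot ... prio ... avpkt ..." sends;
 * that command line is only an illustrative example).  For an existing
 * class only the supplied parameters are updated; for a new class a
 * classid is chosen or validated, defaults are inherited from the
 * parent/link class, and the class is linked into the hash.
 */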
1746 static int
1747 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1748                  unsigned long *arg)
1749 {
1750         int err;
1751         struct cbq_sched_data *q = qdisc_priv(sch);
1752         struct cbq_class *cl = (struct cbq_class *)*arg;
1753         struct nlattr *opt = tca[TCA_OPTIONS];
1754         struct nlattr *tb[TCA_CBQ_MAX + 1];
1755         struct cbq_class *parent;
1756         struct qdisc_rate_table *rtab = NULL;
1758         if (opt == NULL)
1759                 return -EINVAL;
1761         err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1762         if (err < 0)
1763                 return err;
1765         if (cl) {
1766                 /* Check parent */
1767                 if (parentid) {
1768                         if (cl->tparent &&
1769                             cl->tparent->common.classid != parentid)
1770                                 return -EINVAL;
1771                         if (!cl->tparent && parentid != TC_H_ROOT)
1772                                 return -EINVAL;
1773                 }
1775                 if (tb[TCA_CBQ_RATE]) {
1776                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1777                                               tb[TCA_CBQ_RTAB]);
1778                         if (rtab == NULL)
1779                                 return -EINVAL;
1780                 }
1782                 if (tca[TCA_RATE]) {
1783                         err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1784                                                     qdisc_root_sleeping_lock(sch),
1785                                                     tca[TCA_RATE]);
1786                         if (err) {
1787                                 if (rtab)
1788                                         qdisc_put_rtab(rtab);
1789                                 return err;
1790                         }
1791                 }
1793                 /* Change class parameters */
1794                 sch_tree_lock(sch);
1796                 if (cl->next_alive != NULL)
1797                         cbq_deactivate_class(cl);
1799                 if (rtab) {
1800                         qdisc_put_rtab(cl->R_tab);
1801                         cl->R_tab = rtab;
1802                 }
1804                 if (tb[TCA_CBQ_LSSOPT])
1805                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1807                 if (tb[TCA_CBQ_WRROPT]) {
1808                         cbq_rmprio(q, cl);
1809                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1810                 }
1812                 if (tb[TCA_CBQ_OVL_STRATEGY])
1813                         cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1815 #ifdef CONFIG_NET_CLS_ACT
1816                 if (tb[TCA_CBQ_POLICE])
1817                         cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1818 #endif
1820                 if (tb[TCA_CBQ_FOPT])
1821                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1823                 if (cl->q->q.qlen)
1824                         cbq_activate_class(cl);
1826                 sch_tree_unlock(sch);
1828                 return 0;
1829         }
1831         if (parentid == TC_H_ROOT)
1832                 return -EINVAL;
1834         if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1835             tb[TCA_CBQ_LSSOPT] == NULL)
1836                 return -EINVAL;
1838         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1839         if (rtab == NULL)
1840                 return -EINVAL;
1842         if (classid) {
1843                 err = -EINVAL;
1844                 if (TC_H_MAJ(classid ^ sch->handle) ||
1845                     cbq_class_lookup(q, classid))
1846                         goto failure;
1847         } else {
1848                 int i;
1849                 classid = TC_H_MAKE(sch->handle, 0x8000);
1851                 for (i = 0; i < 0x8000; i++) {
1852                         if (++q->hgenerator >= 0x8000)
1853                                 q->hgenerator = 1;
1854                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1855                                 break;
1856                 }
1857                 err = -ENOSR;
1858                 if (i >= 0x8000)
1859                         goto failure;
1860                 classid = classid|q->hgenerator;
1861         }
1863         parent = &q->link;
1864         if (parentid) {
1865                 parent = cbq_class_lookup(q, parentid);
1866                 err = -EINVAL;
1867                 if (parent == NULL)
1868                         goto failure;
1869         }
1871         err = -ENOBUFS;
1872         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1873         if (cl == NULL)
1874                 goto failure;
1876         if (tca[TCA_RATE]) {
1877                 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1878                                         qdisc_root_sleeping_lock(sch),
1879                                         tca[TCA_RATE]);
1880                 if (err) {
1881                         kfree(cl);
1882                         goto failure;
1883                 }
1884         }
1886         cl->R_tab = rtab;
1887         rtab = NULL;
1888         cl->refcnt = 1;
1889         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1890         if (!cl->q)
1891                 cl->q = &noop_qdisc;
1892         cl->common.classid = classid;
1893         cl->tparent = parent;
1894         cl->qdisc = sch;
1895         cl->allot = parent->allot;
1896         cl->quantum = cl->allot;
1897         cl->weight = cl->R_tab->rate.rate;
1899         sch_tree_lock(sch);
1900         cbq_link_class(cl);
1901         cl->borrow = cl->tparent;
1902         if (cl->tparent != &q->link)
1903                 cl->share = cl->tparent;
1904         cbq_adjust_levels(parent);
1905         cl->minidle = -0x7FFFFFFF;
1906         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1907         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1908         if (cl->ewma_log == 0)
1909                 cl->ewma_log = q->link.ewma_log;
1910         if (cl->maxidle == 0)
1911                 cl->maxidle = q->link.maxidle;
1912         if (cl->avpkt == 0)
1913                 cl->avpkt = q->link.avpkt;
1914         cl->overlimit = cbq_ovl_classic;
1915         if (tb[TCA_CBQ_OVL_STRATEGY])
1916                 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY]));
1917 #ifdef CONFIG_NET_CLS_ACT
1918         if (tb[TCA_CBQ_POLICE])
1919                 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
1920 #endif
1921         if (tb[TCA_CBQ_FOPT])
1922                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1923         sch_tree_unlock(sch);
1925         qdisc_class_hash_grow(sch, &q->clhash);
1927         *arg = (unsigned long)cl;
1928         return 0;
1930 failure:
1931         qdisc_put_rtab(rtab);
1932         return err;
1935 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1936 {
1937         struct cbq_sched_data *q = qdisc_priv(sch);
1938         struct cbq_class *cl = (struct cbq_class *)arg;
1939         unsigned int qlen;
1941         if (cl->filters || cl->children || cl == &q->link)
1942                 return -EBUSY;
1944         sch_tree_lock(sch);
1946         qlen = cl->q->q.qlen;
1947         qdisc_reset(cl->q);
1948         qdisc_tree_decrease_qlen(cl->q, qlen);
1950         if (cl->next_alive)
1951                 cbq_deactivate_class(cl);
1953         if (q->tx_borrowed == cl)
1954                 q->tx_borrowed = q->tx_class;
1955         if (q->tx_class == cl) {
1956                 q->tx_class = NULL;
1957                 q->tx_borrowed = NULL;
1958         }
1959 #ifdef CONFIG_NET_CLS_ACT
1960         if (q->rx_class == cl)
1961                 q->rx_class = NULL;
1962 #endif
1964         cbq_unlink_class(cl);
1965         cbq_adjust_levels(cl->tparent);
1966         cl->defmap = 0;
1967         cbq_sync_defmap(cl);
1969         cbq_rmprio(q, cl);
1970         sch_tree_unlock(sch);
1972         BUG_ON(--cl->refcnt == 0);
1973         /*
1974          * This shouldn't happen: we "hold" one cops->get() when called
1975          * from tc_ctl_tclass; the destroy method is done from cops->put().
1976          */
1978         return 0;
1979 }
1981 static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1982 {
1983         struct cbq_sched_data *q = qdisc_priv(sch);
1984         struct cbq_class *cl = (struct cbq_class *)arg;
1986         if (cl == NULL)
1987                 cl = &q->link;
1989         return &cl->filter_list;
1990 }
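/* Bind a filter to a class: look the classid up and take a filter
 * reference, refusing binds that would violate the level ordering.
 */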
1992 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1993                                      u32 classid)
1994 {
1995         struct cbq_sched_data *q = qdisc_priv(sch);
1996         struct cbq_class *p = (struct cbq_class *)parent;
1997         struct cbq_class *cl = cbq_class_lookup(q, classid);
1999         if (cl) {
2000                 if (p && p->level <= cl->level)
2001                         return 0;
2002                 cl->filters++;
2003                 return (unsigned long)cl;
2004         }
2005         return 0;
2006 }
2008 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2009 {
2010         struct cbq_class *cl = (struct cbq_class *)arg;
2012         cl->filters--;
2013 }
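/* Walk all classes in the hash, invoking the callback for each one and
 * honouring the skip/stop bookkeeping in the walker.
 */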
2015 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2016 {
2017         struct cbq_sched_data *q = qdisc_priv(sch);
2018         struct cbq_class *cl;
2019         struct hlist_node *n;
2020         unsigned int h;
2022         if (arg->stop)
2023                 return;
2025         for (h = 0; h < q->clhash.hashsize; h++) {
2026                 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
2027                         if (arg->count < arg->skip) {
2028                                 arg->count++;
2029                                 continue;
2030                         }
2031                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2032                                 arg->stop = 1;
2033                                 return;
2034                         }
2035                         arg->count++;
2036                 }
2037         }
2038 }
2040 static const struct Qdisc_class_ops cbq_class_ops = {
2041         .graft          =       cbq_graft,
2042         .leaf           =       cbq_leaf,
2043         .qlen_notify    =       cbq_qlen_notify,
2044         .get            =       cbq_get,
2045         .put            =       cbq_put,
2046         .change         =       cbq_change_class,
2047         .delete         =       cbq_delete,
2048         .walk           =       cbq_walk,
2049         .tcf_chain      =       cbq_find_tcf,
2050         .bind_tcf       =       cbq_bind_filter,
2051         .unbind_tcf     =       cbq_unbind_filter,
2052         .dump           =       cbq_dump_class,
2053         .dump_stats     =       cbq_dump_class_stats,
2054 };
2056 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
2057         .next           =       NULL,
2058         .cl_ops         =       &cbq_class_ops,
2059         .id             =       "cbq",
2060         .priv_size      =       sizeof(struct cbq_sched_data),
2061         .enqueue        =       cbq_enqueue,
2062         .dequeue        =       cbq_dequeue,
2063         .peek           =       qdisc_peek_dequeued,
2064         .drop           =       cbq_drop,
2065         .init           =       cbq_init,
2066         .reset          =       cbq_reset,
2067         .destroy        =       cbq_destroy,
2068         .change         =       NULL,
2069         .dump           =       cbq_dump,
2070         .dump_stats     =       cbq_dump_stats,
2071         .owner          =       THIS_MODULE,
2072 };
2074 static int __init cbq_module_init(void)
2075 {
2076         return register_qdisc(&cbq_qdisc_ops);
2077 }
2078 static void __exit cbq_module_exit(void)
2079 {
2080         unregister_qdisc(&cbq_qdisc_ops);
2081 }
2082 module_init(cbq_module_init)
2083 module_exit(cbq_module_exit)
2084 MODULE_LICENSE("GPL");