diff options
Diffstat (limited to 'net/sched')
47 files changed, 1514 insertions, 245 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2590e91..e7a8976 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -250,6 +250,28 @@ config NET_SCH_QFQ If unsure, say N. +config NET_SCH_CODEL + tristate "Controlled Delay AQM (CODEL)" + help + Say Y here if you want to use the Controlled Delay (CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_codel. + + If unsure, say N. + +config NET_SCH_FQ_CODEL + tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)" + help + Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_fq_codel. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT @@ -260,6 +282,32 @@ config NET_SCH_INGRESS To compile this code as a module, choose M here: the module will be called sch_ingress. +config NET_SCH_PLUG + tristate "Plug network traffic until release (PLUG)" + ---help--- + + This queuing discipline allows userspace to plug/unplug a network + output queue, using the netlink interface. When it receives an + enqueue command it inserts a plug into the outbound queue that + causes following packets to enqueue until a dequeue command arrives + over netlink, causing the plug to be removed and resuming the normal + packet flow. + + This module also provides a generic "network output buffering" + functionality (aka output commit), wherein upon arrival of a dequeue + command, only packets up to the first plug are released for delivery. + The Remus HA project uses this module to enable speculative execution + of virtual machines by allowing the generated network output to be rolled + back if needed. + + For more information, please refer to http://wiki.xensource.com/xenwiki/Remus + + Say Y here if you are using this kernel for Xen dom0 and + want to protect Xen guests with Remus. + + To compile this code as a module, choose M here: the + module will be called sch_plug. + comment "Classification" config NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index dc5889c..5940a19 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,9 +33,12 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o +obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o +obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 93fdf13..5cfb160 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -127,7 +127,8 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; - NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; for (i = 0; i < (hinfo->hmask + 1); i++) { p = hinfo->htab[tcf_hash(i, hinfo->hmask)]; @@ -139,7 +140,8 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, p = s_p; } } - NLA_PUT_U32(skb, TCA_FCNT, n_i); + if (nla_put_u32(skb, TCA_FCNT, n_i)) + goto nla_put_failure; nla_nest_end(skb, nest); return n_i; @@ -437,7 +439,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) if (a->ops == NULL || a->ops->dump == NULL) return err; - NLA_PUT_STRING(skb, TCA_KIND, a->ops->kind); + if (nla_put_string(skb, TCA_KIND, a->ops->kind)) + goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; nest = nla_nest_start(skb, TCA_OPTIONS); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 453a734..2c8ad7c 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -397,7 +397,7 @@ static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, while (len > 1) { switch (xh[off]) { - case IPV6_TLV_PAD0: + case IPV6_TLV_PAD1: optlen = 1; break; case IPV6_TLV_JUMBO: @@ -550,11 +550,13 @@ static int tcf_csum_dump(struct sk_buff *skb, }; struct tcf_t t; - NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); + if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b77f5a0..f10fb82 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -162,7 +162,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int }; struct tcf_t t; - NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; #ifdef CONFIG_GACT_PROB if (gact->tcfg_ptype) { struct tc_gact_p p_opt = { @@ -171,13 +172,15 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int .ptype = gact->tcfg_ptype, }; - NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt)) + goto nla_put_failure; } #endif t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); - NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 60f8f61..60e281a 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -1,5 +1,5 @@ /* - * net/sched/ipt.c iptables target interface + * net/sched/ipt.c iptables target interface * *TODO: Add other tables. For now we only support the ipv4 table targets * @@ -235,9 +235,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, result = TC_ACT_PIPE; break; default: - if (net_ratelimit()) - pr_notice("tc filter: Bogus netfilter code" - " %d assume ACCEPT\n", ret); + net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n", + ret); result = TC_POLICE_OK; break; } @@ -267,15 +266,17 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int c.refcnt = ipt->tcf_refcnt - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); - NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t); - NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index); - NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook); - NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); - NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname); + if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || + nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) || + nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) || + nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) || + nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname)) + goto nla_put_failure; tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); - NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) + goto nla_put_failure; kfree(t); return skb->len; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e051398..fe81cc1 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -174,9 +174,8 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } if (!(dev->flags & IFF_UP)) { - if (net_ratelimit()) - pr_notice("tc mirred to Houston: device %s is down\n", - dev->name); + net_notice_ratelimited("tc mirred to Houston: device %s is down\n", + dev->name); goto out; } @@ -227,11 +226,13 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i }; struct tcf_t t; - NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); - NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 001d1b3..b5d029e 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -284,11 +284,13 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 10d3aed..26aa2f6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -215,11 +215,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->refcnt = p->tcf_refcnt - ref; opt->bindcnt = p->tcf_bindcnt - bind; - NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt); + if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; kfree(opt); return skb->len; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 6fb3f5a..a9de232 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -356,11 +356,14 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) opt.rate = police->tcfp_R_tab->rate; if (police->tcfp_P_tab) opt.peakrate = police->tcfp_P_tab->rate; - NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); - if (police->tcfp_result) - NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result); - if (police->tcfp_ewma_rate) - NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate); + if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) + goto nla_put_failure; + if (police->tcfp_result && + nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result)) + goto nla_put_failure; + if (police->tcfp_ewma_rate && + nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 73e0a3a..3922f2a 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -172,12 +172,14 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt); - NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata); + if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || + nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t); + if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 35dbbe9..476e0fa 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -166,20 +166,25 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); - if (d->flags & SKBEDIT_F_PRIORITY) - NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), - &d->priority); - if (d->flags & SKBEDIT_F_QUEUE_MAPPING) - NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING, - sizeof(d->queue_mapping), &d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) - NLA_PUT(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), - &d->mark); + if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_PRIORITY) && + nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING, + sizeof(d->queue_mapping), &d->queue_mapping)) + goto nla_put_failure; + if ((d->flags & SKBEDIT_F_MARK) && + nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark), + &d->mark)) + goto nla_put_failure; t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t); + if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a69d44f..f452f69 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -357,7 +357,8 @@ static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; tcm->tcm_parent = tp->classid; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); - NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind); + if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) + goto nla_put_failure; tcm->tcm_handle = fh; if (RTM_DELTFILTER != event) { tcm->tcm_handle = 0; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index ea1f70b..590960a 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -257,8 +257,9 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (f->res.classid) - NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) + goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3..7743ea8 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -22,23 +22,6 @@ #include <net/sock.h> #include <net/cls_cgroup.h> -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); -static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); - -struct cgroup_subsys net_cls_subsys = { - .name = "net_cls", - .create = cgrp_create, - .destroy = cgrp_destroy, - .populate = cgrp_populate, -#ifdef CONFIG_NET_CLS_CGROUP - .subsys_id = net_cls_subsys_id, -#endif - .module = THIS_MODULE, -}; - - static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) { return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), @@ -51,8 +34,7 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) struct cgroup_cls_state, css); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_cls_state *cs; @@ -66,7 +48,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { kfree(cgrp_cls_state(cgrp)); } @@ -88,12 +70,19 @@ static struct cftype ss_files[] = { .read_u64 = read_classid, .write_u64 = write_classid, }, + { } /* terminate */ }; -static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); -} +struct cgroup_subsys net_cls_subsys = { + .name = "net_cls", + .create = cgrp_create, + .destroy = cgrp_destroy, +#ifdef CONFIG_NET_CLS_CGROUP + .subsys_id = net_cls_subsys_id, +#endif + .base_cftypes = ss_files, + .module = THIS_MODULE, +}; struct cls_cgroup_head { u32 handle; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 1d8bd0d..ccd08c8 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -572,25 +572,32 @@ static int flow_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask); - NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode); + if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) || + nla_put_u32(skb, TCA_FLOW_MODE, f->mode)) + goto nla_put_failure; if (f->mask != ~0 || f->xor != 0) { - NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask); - NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor); + if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) || + nla_put_u32(skb, TCA_FLOW_XOR, f->xor)) + goto nla_put_failure; } - if (f->rshift) - NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift); - if (f->addend) - NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend); + if (f->rshift && + nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift)) + goto nla_put_failure; + if (f->addend && + nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend)) + goto nla_put_failure; - if (f->divisor) - NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor); - if (f->baseclass) - NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass); + if (f->divisor && + nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor)) + goto nla_put_failure; + if (f->baseclass && + nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass)) + goto nla_put_failure; - if (f->perturb_period) - NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ); + if (f->perturb_period && + nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ)) + goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0) goto nla_put_failure; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 389af15..8384a47 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -346,14 +346,17 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (f->res.classid) - NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) + goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(f->indev)) - NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev); + if (strlen(f->indev) && + nla_put_string(skb, TCA_FW_INDEV, f->indev)) + goto nla_put_failure; #endif /* CONFIG_NET_CLS_IND */ - if (head->mask != 0xFFFFFFFF) - NLA_PUT_U32(skb, TCA_FW_MASK, head->mask); + if (head->mask != 0xFFFFFFFF && + nla_put_u32(skb, TCA_FW_MASK, head->mask)) + goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) goto nla_put_failure; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 13ab66e..36fec42 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -571,17 +571,21 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, if (!(f->handle & 0x8000)) { id = f->id & 0xFF; - NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); + if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) + goto nla_put_failure; } if (f->handle & 0x80000000) { - if ((f->handle >> 16) != 0xFFFF) - NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); + if ((f->handle >> 16) != 0xFFFF && + nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) + goto nla_put_failure; } else { id = f->id >> 16; - NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); + if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) + goto nla_put_failure; } - if (f->res.classid) - NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid); + if (f->res.classid && + nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) + goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) goto nla_put_failure; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index b014279..18ab93e 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -615,18 +615,22 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) + goto nla_put_failure; pinfo.dpi = s->dpi; pinfo.spi = f->spi; pinfo.protocol = s->protocol; pinfo.tunnelid = s->tunnelid; pinfo.tunnelhdr = f->tunnelhdr; pinfo.pad = 0; - NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); - if (f->res.classid) - NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); - if (((f->handle >> 8) & 0xFF) != 16) - NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); + if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) + goto nla_put_failure; + if (f->res.classid && + nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) + goto nla_put_failure; + if (((f->handle >> 8) & 0xFF) != 16 && + nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) + goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) goto nla_put_failure; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index dbe1992..fe29420 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -438,10 +438,11 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, if (!fh) { t->tcm_handle = ~0; /* whatever ... */ - NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash); - NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask); - NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift); - NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through); + if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || + nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || + nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || + nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) + goto nla_put_failure; nla_nest_end(skb, nest); } else { if (p->perfect) { @@ -460,8 +461,9 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, } } pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class) - NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid); + if (r->res.class && + nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) + goto nla_put_failure; if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) goto nla_put_failure; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 939b627..d45373f 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -234,8 +234,7 @@ out: return -1; deadloop: - if (net_ratelimit()) - pr_warning("cls_u32: dead loop\n"); + net_warn_ratelimited("cls_u32: dead loop\n"); return -1; } @@ -733,36 +732,44 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; u32 divisor = ht->divisor + 1; - NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); + if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) + goto nla_put_failure; } else { - NLA_PUT(skb, TCA_U32_SEL, - sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), - &n->sel); + if (nla_put(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel)) + goto nla_put_failure; if (n->ht_up) { u32 htid = n->handle & 0xFFFFF000; - NLA_PUT_U32(skb, TCA_U32_HASH, htid); + if (nla_put_u32(skb, TCA_U32_HASH, htid)) + goto nla_put_failure; } - if (n->res.classid) - NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid); - if (n->ht_down) - NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle); + if (n->res.classid && + nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) + goto nla_put_failure; + if (n->ht_down && + nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle)) + goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK - if (n->mark.val || n->mark.mask) - NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); + if ((n->mark.val || n->mark.mask) && + nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark)) + goto nla_put_failure; #endif if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if (strlen(n->indev)) - NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); + if (strlen(n->indev) && + nla_put_string(skb, TCA_U32_INDEV, n->indev)) + goto nla_put_failure; #endif #ifdef CONFIG_CLS_U32_PERF - NLA_PUT(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - n->pf); + if (nla_put(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf)) + goto nla_put_failure; #endif } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 1363bf1..4790c69 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -585,8 +585,9 @@ static void meta_var_apply_extras(struct meta_value *v, static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->val && v->len) - NLA_PUT(skb, tlv, v->len, (void *) v->val); + if (v->val && v->len && + nla_put(skb, tlv, v->len, (void *) v->val)) + goto nla_put_failure; return 0; nla_put_failure: @@ -636,10 +637,13 @@ static void meta_int_apply_extras(struct meta_value *v, static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { - if (v->len == sizeof(unsigned long)) - NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); - else if (v->len == sizeof(u32)) - NLA_PUT_U32(skb, tlv, v->val); + if (v->len == sizeof(unsigned long)) { + if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) + goto nla_put_failure; + } else if (v->len == sizeof(u32)) { + if (nla_put_u32(skb, tlv, v->val)) + goto nla_put_failure; + } return 0; @@ -831,7 +835,8 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); - NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) + goto nla_put_failure; ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 88d93eb..3a633de 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -441,7 +441,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) if (top_start == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) + goto nla_put_failure; list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) @@ -457,7 +458,8 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) .flags = em->flags }; - NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr); + if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) + goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) @@ -535,9 +537,7 @@ pop_stack: return res; stack_overflow: - if (net_ratelimit()) - pr_warning("tc ematch: local stack overflow," - " increase NET_EMATCH_STACK\n"); + net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3d8981f..085ce53 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -426,7 +426,8 @@ static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) nest = nla_nest_start(skb, TCA_STAB); if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts); + if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) + goto nla_put_failure; nla_nest_end(skb, nest); return skb->len; @@ -1201,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = atomic_read(&q->refcnt); - NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; q->qstats.qlen = q->q.qlen; @@ -1505,7 +1507,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; - NLA_PUT_STRING(skb, TCA_KIND, q->ops->id); + if (nla_put_string(skb, TCA_KIND, q->ops->id)) + goto nla_put_failure; if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) goto nla_put_failure; @@ -1688,12 +1691,10 @@ reclassify: tp = otp; if (verd++ >= MAX_REC_LOOP) { - if (net_ratelimit()) - pr_notice("%s: packet reclassify loop" - " rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, - ntohs(tp->protocol)); + net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", + tp->q->ops->id, + tp->prio & 0xffff, + ntohs(tp->protocol)); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e25e490..8522a47 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -423,8 +423,6 @@ drop: __maybe_unused } return ret; } - qdisc_bstats_update(sch, skb); - bstats_update(&flow->bstats, skb); /* * Okay, this may seem weird. We pretend we've dropped the packet if * it goes via ATM. The reason for this is that the outer qdisc @@ -472,6 +470,8 @@ static void sch_atm_dequeue(unsigned long data) if (unlikely(!skb)) break; + qdisc_bstats_update(sch, skb); + bstats_update(&flow->bstats, skb); pr_debug("atm_tc_dequeue: sending on class %p\n", flow); /* remove any LL header somebody else has attached */ skb_pull(skb, skb_network_offset(skb)); @@ -601,7 +601,8 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr); + if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) + goto nla_put_failure; if (flow->vcc) { struct sockaddr_atmpvc pvc; int state; @@ -610,15 +611,19 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; pvc.sap_addr.vpi = flow->vcc->vpi; pvc.sap_addr.vci = flow->vcc->vci; - NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc); + if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) + goto nla_put_failure; state = ATM_VF2VS(flow->vcc->flags); - NLA_PUT_U32(skb, TCA_ATM_STATE, state); + if (nla_put_u32(skb, TCA_ATM_STATE, state)) + goto nla_put_failure; + } + if (flow->excess) { + if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid)) + goto nla_put_failure; + } else { + if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) + goto nla_put_failure; } - if (flow->excess) - NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); - else - NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); - nla_nest_end(skb, nest); return skb->len; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 24d94c0..6aabd77 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1425,7 +1425,8 @@ static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); - NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1450,7 +1451,8 @@ static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) opt.minidle = (u32)(-cl->minidle); opt.offtime = cl->offtime; opt.change = ~0; - NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1468,7 +1470,8 @@ static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) opt.priority = cl->priority + 1; opt.cpriority = cl->cpriority + 1; opt.weight = cl->weight; - NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1485,7 +1488,8 @@ static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) opt.priority2 = cl->priority2 + 1; opt.pad = 0; opt.penalty = cl->penalty; - NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1502,7 +1506,8 @@ static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) opt.split = cl->split ? cl->split->common.classid : 0; opt.defmap = cl->defmap; opt.defchange = ~0; - NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; @@ -1521,7 +1526,8 @@ static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) opt.police = cl->police; opt.__res1 = 0; opt.__res2 = 0; - NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt)) + goto nla_put_failure; } return skb->len; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 7e267d7..cc37dd5 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -332,15 +332,13 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch) } q->stats.pdrop++; - sch->qstats.drops++; - kfree_skb(skb); - return NET_XMIT_DROP; + return qdisc_drop(skb, sch); - congestion_drop: +congestion_drop: qdisc_drop(skb, sch); return NET_XMIT_CN; - other_drop: +other_drop: if (ret & __NET_XMIT_BYPASS) sch->qstats.drops++; kfree_skb(skb); @@ -515,8 +513,9 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); - NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P); + if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P)) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c new file mode 100644 index 0000000..2f9ab17 --- /dev/null +++ b/net/sched/sch_codel.c @@ -0,0 +1,276 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * + * Implemented on linux by : + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/prefetch.h> +#include <net/pkt_sched.h> +#include <net/codel.h> + + +#define DEFAULT_CODEL_LIMIT 1000 + +struct codel_sched_data { + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + u32 drop_overlimit; +}; + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct sk_buff *skb = __skb_dequeue(&sch->q); + + prefetch(&skb->end); /* we'll need skb_shinfo() */ + return skb; +} + +static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->stats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->stats.drop_count); + q->stats.drop_count = 0; + } + if (skb) + qdisc_bstats_update(sch, skb); + return skb; +} + +static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct codel_sched_data *q; + + if (likely(qdisc_qlen(sch) < sch->limit)) { + codel_set_enqueue_time(skb); + return qdisc_enqueue_tail(skb, sch); + } + q = qdisc_priv(sch); + q->drop_overlimit++; + return qdisc_drop(skb, sch); +} + +static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { + [TCA_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_CODEL_ECN] = { .type = NLA_U32 }, +}; + +static int codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CODEL_MAX + 1]; + unsigned int qlen; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + if (err < 0) + return err; + + sch_tree_lock(sch); + + if (tb[TCA_CODEL_TARGET]) { + u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]); + + q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_INTERVAL]) { + u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]); + + q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]); + + if (tb[TCA_CODEL_ECN]) + q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]); + + qlen = sch->q.qlen; + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = __skb_dequeue(&sch->q); + + sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_drop(skb, sch); + } + qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen); + + sch_tree_unlock(sch); + return 0; +} + +static int codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + sch->limit = DEFAULT_CODEL_LIMIT; + + codel_params_init(&q->params); + codel_vars_init(&q->vars); + codel_stats_init(&q->stats); + + if (opt) { + int err = codel_change(sch, opt); + + if (err) + return err; + } + + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + + return 0; +} + +static int codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CODEL_TARGET, + codel_time_to_us(q->params.target)) || + nla_put_u32(skb, TCA_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_CODEL_INTERVAL, + codel_time_to_us(q->params.interval)) || + nla_put_u32(skb, TCA_CODEL_ECN, + q->params.ecn)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + const struct codel_sched_data *q = qdisc_priv(sch); + struct tc_codel_xstats st = { + .maxpacket = q->stats.maxpacket, + .count = q->vars.count, + .lastcount = q->vars.lastcount, + .drop_overlimit = q->drop_overlimit, + .ldelay = codel_time_to_us(q->vars.ldelay), + .dropping = q->vars.dropping, + .ecn_mark = q->stats.ecn_mark, + }; + + if (q->vars.dropping) { + codel_tdiff_t delta = q->vars.drop_next - codel_get_time(); + + if (delta >= 0) + st.drop_next = codel_time_to_us(delta); + else + st.drop_next = -codel_time_to_us(-delta); + } + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static void codel_reset(struct Qdisc *sch) +{ + struct codel_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + codel_vars_init(&q->vars); +} + +static struct Qdisc_ops codel_qdisc_ops __read_mostly = { + .id = "codel", + .priv_size = sizeof(struct codel_sched_data), + + .enqueue = codel_qdisc_enqueue, + .dequeue = codel_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = codel_init, + .reset = codel_reset, + .change = codel_change, + .dump = codel_dump, + .dump_stats = codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init codel_module_init(void) +{ + return register_qdisc(&codel_qdisc_ops); +} + +static void __exit codel_module_exit(void) +{ + unregister_qdisc(&codel_qdisc_ops); +} + +module_init(codel_module_init) +module_exit(codel_module_exit) + +MODULE_DESCRIPTION("Controlled Delay queue discipline"); +MODULE_AUTHOR("Dave Taht"); +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 6b7fe4a..9ce0b4f 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -260,7 +260,8 @@ static int drr_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum); + if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum)) + goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: @@ -375,8 +376,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl->deficit = cl->quantum; } - bstats_update(&cl->bstats, skb); - sch->q.qlen++; return err; } @@ -402,6 +401,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) skb = qdisc_dequeue_peeked(cl->qdisc); if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); + + bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2c79020..3886365 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -265,8 +265,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; drop: - kfree_skb(skb); - sch->qstats.drops++; + qdisc_drop(skb, sch); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } @@ -429,8 +428,9 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]); - NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]); + if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) || + nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1])) + goto nla_put_failure; return nla_nest_end(skb, opts); @@ -447,13 +447,16 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices); + if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) + goto nla_put_failure; - if (p->default_index != NO_DEFAULT_INDEX) - NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index); + if (p->default_index != NO_DEFAULT_INDEX && + nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) + goto nla_put_failure; - if (p->set_tc_index) - NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX); + if (p->set_tc_index && + nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) + goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 66effe2..e15a9eb 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -85,7 +85,8 @@ static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_fifo_qopt opt = { .limit = sch->limit }; - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c new file mode 100644 index 0000000..9fc1c62 --- /dev/null +++ b/net/sched/sch_fq_codel.c @@ -0,0 +1,626 @@ +/* + * Fair Queue CoDel discipline + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/flow_keys.h> +#include <net/codel.h> + +/* Fair Queue CoDel. + * + * Principles : + * Packets are classified (internal classifier or external) on flows. + * This is a Stochastic model (as we use a hash, several flows + * might be hashed on same slot) + * Each flow has a CoDel managed queue. + * Flows are linked onto two (Round Robin) lists, + * so that new flows have priority on old ones. + * + * For a given flow, packets are not reordered (CoDel uses a FIFO) + * head drops only. + * ECN capability is on by default. + * Low memory footprint (64 bytes per flow) + */ + +struct fq_codel_flow { + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + int deficit; + u32 dropped; /* number of drops (or ECN marks) on this flow */ + struct codel_vars cvars; +}; /* please try to keep this structure <= 64 bytes */ + +struct fq_codel_sched_data { + struct tcf_proto *filter_list; /* optional external classifier */ + struct fq_codel_flow *flows; /* Flows table [flows_cnt] */ + u32 *backlogs; /* backlog table [flows_cnt] */ + u32 flows_cnt; /* number of flows */ + u32 perturbation; /* hash perturbation */ + u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + struct codel_params cparams; + struct codel_stats cstats; + u32 drop_overlimit; + u32 new_flow_count; + + struct list_head new_flows; /* list of new flows */ + struct list_head old_flows; /* list of old flows */ +}; + +static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q, + const struct sk_buff *skb) +{ + struct flow_keys keys; + unsigned int hash; + + skb_flow_dissect(skb, &keys); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src ^ keys.ip_proto, + (__force u32)keys.ports, q->perturbation); + return ((u64)hash * q->flows_cnt) >> 32; +} + +static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->flows_cnt) + return TC_H_MIN(skb->priority); + + if (!q->filter_list) + return fq_codel_hash(q, skb) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= q->flows_cnt) + return TC_H_MIN(res.classid); + } + return 0; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ + +/* remove one skb from head of slot queue */ +static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) +{ + struct sk_buff *skb = flow->head; + + flow->head = skb->next; + skb->next = NULL; + return skb; +} + +/* add skb to flow queue (tail add) */ +static inline void flow_queue_add(struct fq_codel_flow *flow, + struct sk_buff *skb) +{ + if (flow->head == NULL) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static unsigned int fq_codel_drop(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned int maxbacklog = 0, idx = 0, i, len; + struct fq_codel_flow *flow; + + /* Queue is full! Find the fat flow and drop packet from it. + * This might sound expensive, but with 1024 flows, we scan + * 4KB of memory, and we dont need to handle a complex tree + * in fast path (packet queue/enqueue) with many cache misses. + */ + for (i = 0; i < q->flows_cnt; i++) { + if (q->backlogs[i] > maxbacklog) { + maxbacklog = q->backlogs[i]; + idx = i; + } + } + flow = &q->flows[idx]; + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + q->backlogs[idx] -= len; + kfree_skb(skb); + sch->q.qlen--; + sch->qstats.drops++; + sch->qstats.backlog -= len; + flow->dropped++; + return idx; +} + +static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int idx; + struct fq_codel_flow *flow; + int uninitialized_var(ret); + + idx = fq_codel_classify(skb, sch, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + idx--; + + codel_set_enqueue_time(skb); + flow = &q->flows[idx]; + flow_queue_add(flow, skb); + q->backlogs[idx] += qdisc_pkt_len(skb); + sch->qstats.backlog += qdisc_pkt_len(skb); + + if (list_empty(&flow->flowchain)) { + list_add_tail(&flow->flowchain, &q->new_flows); + codel_vars_init(&flow->cvars); + q->new_flow_count++; + flow->deficit = q->quantum; + flow->dropped = 0; + } + if (++sch->q.qlen < sch->limit) + return NET_XMIT_SUCCESS; + + q->drop_overlimit++; + /* Return Congestion Notification only if we dropped a packet + * from this flow. + */ + if (fq_codel_drop(sch) == idx) + return NET_XMIT_CN; + + /* As we dropped a packet, better let upper stack know this */ + qdisc_tree_decrease_qlen(sch, 1); + return NET_XMIT_SUCCESS; +} + +/* This is the specific function called from codel_dequeue() + * to dequeue a packet from queue. Note: backlog is handled in + * codel, we dont need to reduce it here. + */ +static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct fq_codel_flow *flow; + struct sk_buff *skb = NULL; + + flow = container_of(vars, struct fq_codel_flow, cvars); + if (flow->head) { + skb = dequeue_head(flow); + q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + sch->q.qlen--; + } + return skb; +} + +static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + struct fq_codel_flow *flow; + struct list_head *head; + u32 prev_drop_count, prev_ecn_mark; + +begin: + head = &q->new_flows; + if (list_empty(head)) { + head = &q->old_flows; + if (list_empty(head)) + return NULL; + } + flow = list_first_entry(head, struct fq_codel_flow, flowchain); + + if (flow->deficit <= 0) { + flow->deficit += q->quantum; + list_move_tail(&flow->flowchain, &q->old_flows); + goto begin; + } + + prev_drop_count = q->cstats.drop_count; + prev_ecn_mark = q->cstats.ecn_mark; + + skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, + dequeue); + + flow->dropped += q->cstats.drop_count - prev_drop_count; + flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; + + if (!skb) { + /* force a pass through old_flows to prevent starvation */ + if ((head == &q->new_flows) && !list_empty(&q->old_flows)) + list_move_tail(&flow->flowchain, &q->old_flows); + else + list_del_init(&flow->flowchain); + goto begin; + } + qdisc_bstats_update(sch, skb); + flow->deficit -= qdisc_pkt_len(skb); + /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0, + * or HTB crashes. Defer it for next round. + */ + if (q->cstats.drop_count && sch->q.qlen) { + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + } + return skb; +} + +static void fq_codel_reset(struct Qdisc *sch) +{ + struct sk_buff *skb; + + while ((skb = fq_codel_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { + [TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_ECN] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, +}; + +static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy); + if (err < 0) + return err; + if (tb[TCA_FQ_CODEL_FLOWS]) { + if (q->flows) + return -EINVAL; + q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]); + if (!q->flows_cnt || + q->flows_cnt > 65536) + return -EINVAL; + } + sch_tree_lock(sch); + + if (tb[TCA_FQ_CODEL_TARGET]) { + u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]); + + q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_INTERVAL]) { + u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); + + q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT; + } + + if (tb[TCA_FQ_CODEL_LIMIT]) + sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]); + + if (tb[TCA_FQ_CODEL_ECN]) + q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); + + if (tb[TCA_FQ_CODEL_QUANTUM]) + q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + + while (sch->q.qlen > sch->limit) { + struct sk_buff *skb = fq_codel_dequeue(sch); + + kfree_skb(skb); + q->cstats.drop_count++; + } + qdisc_tree_decrease_qlen(sch, q->cstats.drop_count); + q->cstats.drop_count = 0; + + sch_tree_unlock(sch); + return 0; +} + +static void *fq_codel_zalloc(size_t sz) +{ + void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); + + if (!ptr) + ptr = vzalloc(sz); + return ptr; +} + +static void fq_codel_free(void *addr) +{ + if (addr) { + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); + } +} + +static void fq_codel_destroy(struct Qdisc *sch) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + fq_codel_free(q->backlogs); + fq_codel_free(q->flows); +} + +static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + int i; + + sch->limit = 10*1024; + q->flows_cnt = 1024; + q->quantum = psched_mtu(qdisc_dev(sch)); + q->perturbation = net_random(); + INIT_LIST_HEAD(&q->new_flows); + INIT_LIST_HEAD(&q->old_flows); + codel_params_init(&q->cparams); + codel_stats_init(&q->cstats); + q->cparams.ecn = true; + + if (opt) { + int err = fq_codel_change(sch, opt); + if (err) + return err; + } + + if (!q->flows) { + q->flows = fq_codel_zalloc(q->flows_cnt * + sizeof(struct fq_codel_flow)); + if (!q->flows) + return -ENOMEM; + q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + if (!q->backlogs) { + fq_codel_free(q->flows); + return -ENOMEM; + } + for (i = 0; i < q->flows_cnt; i++) { + struct fq_codel_flow *flow = q->flows + i; + + INIT_LIST_HEAD(&flow->flowchain); + } + } + if (sch->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET, + codel_time_to_us(q->cparams.target)) || + nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, + sch->limit) || + nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL, + codel_time_to_us(q->cparams.interval)) || + nla_put_u32(skb, TCA_FQ_CODEL_ECN, + q->cparams.ecn) || + nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, + q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, + q->flows_cnt)) + goto nla_put_failure; + + nla_nest_end(skb, opts); + return skb->len; + +nla_put_failure: + return -1; +} + +static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + struct tc_fq_codel_xstats st = { + .type = TCA_FQ_CODEL_XSTATS_QDISC, + }; + struct list_head *pos; + + st.qdisc_stats.maxpacket = q->cstats.maxpacket; + st.qdisc_stats.drop_overlimit = q->drop_overlimit; + st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; + st.qdisc_stats.new_flow_count = q->new_flow_count; + + list_for_each(pos, &q->new_flows) + st.qdisc_stats.new_flows_len++; + + list_for_each(pos, &q->old_flows) + st.qdisc_stats.old_flows_len++; + + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; +} + +static void fq_codel_put(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + u32 idx = cl - 1; + struct gnet_stats_queue qs = { 0 }; + struct tc_fq_codel_xstats xstats; + + if (idx < q->flows_cnt) { + const struct fq_codel_flow *flow = &q->flows[idx]; + const struct sk_buff *skb = flow->head; + + memset(&xstats, 0, sizeof(xstats)); + xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; + xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.ldelay = + codel_time_to_us(flow->cvars.ldelay); + xstats.class_stats.count = flow->cvars.count; + xstats.class_stats.lastcount = flow->cvars.lastcount; + xstats.class_stats.dropping = flow->cvars.dropping; + if (flow->cvars.dropping) { + codel_tdiff_t delta = flow->cvars.drop_next - + codel_get_time(); + + xstats.class_stats.drop_next = (delta >= 0) ? + codel_time_to_us(delta) : + -codel_time_to_us(-delta); + } + while (skb) { + qs.qlen++; + skb = skb->next; + } + qs.backlog = q->backlogs[idx]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, &qs) < 0) + return -1; + if (idx < q->flows_cnt) + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); + return 0; +} + +static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct fq_codel_sched_data *q = qdisc_priv(sch); + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->flows_cnt; i++) { + if (list_empty(&q->flows[i].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops fq_codel_class_ops = { + .leaf = fq_codel_leaf, + .get = fq_codel_get, + .put = fq_codel_put, + .tcf_chain = fq_codel_find_tcf, + .bind_tcf = fq_codel_bind, + .unbind_tcf = fq_codel_put, + .dump = fq_codel_dump_class, + .dump_stats = fq_codel_dump_class_stats, + .walk = fq_codel_walk, +}; + +static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { + .cl_ops = &fq_codel_class_ops, + .id = "fq_codel", + .priv_size = sizeof(struct fq_codel_sched_data), + .enqueue = fq_codel_enqueue, + .dequeue = fq_codel_dequeue, + .peek = qdisc_peek_dequeued, + .drop = fq_codel_drop, + .init = fq_codel_init, + .reset = fq_codel_reset, + .destroy = fq_codel_destroy, + .change = fq_codel_change, + .dump = fq_codel_dump, + .dump_stats = fq_codel_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init fq_codel_module_init(void) +{ + return register_qdisc(&fq_codel_qdisc_ops); +} + +static void __exit fq_codel_module_exit(void) +{ + unregister_qdisc(&fq_codel_qdisc_ops); +} + +module_init(fq_codel_module_init) +module_exit(fq_codel_module_exit) +MODULE_AUTHOR("Eric Dumazet"); +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 67fc573..511323e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -86,9 +86,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * deadloop is detected. Return OK to try the next skb. */ kfree_skb(skb); - if (net_ratelimit()) - pr_warning("Dead loop on netdevice %s, fix it urgently!\n", - dev_queue->dev->name); + net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", + dev_queue->dev->name); ret = qdisc_qlen(q); } else { /* @@ -136,9 +135,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, ret = handle_dev_cpu_collision(skb, txq, q); } else { /* Driver returned NETDEV_TX_BUSY - requeue skb */ - if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) - pr_warning("BUG %s code %d qlen %d\n", - dev->name, ret, q->q.qlen); + if (unlikely(ret != NETDEV_TX_BUSY)) + net_warn_ratelimited("BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); ret = dev_requeue_skb(skb, q); } @@ -512,7 +511,8 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 0b15236..e901583 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -255,10 +255,8 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - pr_warning("GRED: Unable to relocate VQ 0x%x " - "after dequeue, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n", + tc_index_to_dp(skb)); } else { q->backlog -= qdisc_pkt_len(skb); @@ -287,10 +285,8 @@ static unsigned int gred_drop(struct Qdisc *sch) u16 dp = tc_index_to_dp(skb); if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { - if (net_ratelimit()) - pr_warning("GRED: Unable to relocate VQ 0x%x " - "while dropping, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n", + tc_index_to_dp(skb)); } else { q->backlog -= len; q->stats.other++; @@ -521,14 +517,16 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt)) + goto nla_put_failure; for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; max_p[i] = q ? q->parms.max_P : 0; } - NLA_PUT(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p); + if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p)) + goto nla_put_failure; parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) @@ -565,11 +563,8 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) opt.packets = q->packetsin; opt.bytesin = q->bytesin; - if (gred_wred_mode(table)) { - q->vars.qidlestart = - table->tab[table->def]->vars.qidlestart; - q->vars.qavg = table->tab[table->def]->vars.qavg; - } + if (gred_wred_mode(table)) + gred_load_wred_set(table, q); opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9bdca2e..6c2ec45 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1305,7 +1305,8 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) tsc.m1 = sm2m(sc->sm1); tsc.d = dx2d(sc->dx); tsc.m2 = sm2m(sc->sm2); - NLA_PUT(skb, attr, sizeof(tsc), &tsc); + if (nla_put(skb, attr, sizeof(tsc), &tsc)) + goto nla_put_failure; return skb->len; @@ -1573,7 +1574,8 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) } qopt.defcls = q->defcls; - NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; return skb->len; nla_put_failure: @@ -1607,7 +1609,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl->qdisc->q.qlen == 1) set_active(cl, qdisc_pkt_len(skb)); - bstats_update(&cl->bstats, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1655,6 +1656,7 @@ hfsc_dequeue(struct Qdisc *sch) return NULL; } + bstats_update(&cl->bstats, skb); update_vf(cl, qdisc_pkt_len(skb), cur_time); if (realtime) cl->cl_cumul += qdisc_pkt_len(skb); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 29b942c..9d75b77 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -558,9 +558,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) __skb_queue_tail(&q->direct_queue, skb); q->direct_pkts++; } else { - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; + return qdisc_drop(skb, sch); } #ifdef CONFIG_NET_CLS_ACT } else if (!cl) { @@ -576,7 +574,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } return ret; } else { - bstats_update(&cl->bstats, skb); htb_activate(q, cl); } @@ -837,6 +834,7 @@ next: } while (cl != start); if (likely(skb != NULL)) { + bstats_update(&cl->bstats, skb); cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb); if (cl->un.leaf.deficit[level] < 0) { cl->un.leaf.deficit[level] += cl->quantum; @@ -1051,7 +1049,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); + if (nla_put(skb, TCA_HTB_INIT, sizeof(gopt), &gopt)) + goto nla_put_failure; nla_nest_end(skb, nest); spin_unlock_bh(root_lock); @@ -1090,7 +1089,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, opt.quantum = cl->quantum; opt.prio = cl->prio; opt.level = cl->level; - NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; nla_nest_end(skb, nest); spin_unlock_bh(root_lock); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 28de430..d1831ca 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -247,7 +247,8 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = dev->tc_to_txq[i].offset; } - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; nla_put_failure: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 49131d7..2a2b096 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -284,7 +284,8 @@ static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) opt.bands = q->bands; opt.max_bands = q->max_bands; - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5da548f..a2a95aa 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -26,6 +26,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> +#include <net/inet_ecn.h> #define VERSION "1.3" @@ -78,6 +79,7 @@ struct netem_sched_data { psched_tdiff_t jitter; u32 loss; + u32 ecn; u32 limit; u32 counter; u32 gap; @@ -374,9 +376,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) ++count; /* Drop packet? */ - if (loss_event(q)) - --count; - + if (loss_event(q)) { + if (q->ecn && INET_ECN_set_ce(skb)) + sch->qstats.drops++; /* mark packet */ + else + --count; + } if (count == 0) { sch->qstats.drops++; kfree_skb(skb); @@ -408,10 +413,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) { - sch->qstats.drops++; - return NET_XMIT_DROP; - } + skb_checksum_help(skb))) + return qdisc_drop(skb, sch); skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); } @@ -706,6 +709,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, + [TCA_NETEM_ECN] = { .type = NLA_U32 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, @@ -776,6 +780,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_RATE]) get_rate(sch, tb[TCA_NETEM_RATE]); + if (tb[TCA_NETEM_ECN]) + q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); + q->loss_model = CLG_RANDOM; if (tb[TCA_NETEM_LOSS]) ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); @@ -834,7 +841,8 @@ static int dump_loss_model(const struct netem_sched_data *q, .p23 = q->clg.a5, }; - NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi); + if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) + goto nla_put_failure; break; } case CLG_GILB_ELL: { @@ -845,7 +853,8 @@ static int dump_loss_model(const struct netem_sched_data *q, .k1 = q->clg.a4, }; - NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge); + if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) + goto nla_put_failure; break; } } @@ -874,26 +883,34 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; - NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) + goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; - NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) + goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; - NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) + goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; - NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) + goto nla_put_failure; rate.rate = q->rate; rate.packet_overhead = q->packet_overhead; rate.cell_size = q->cell_size; rate.cell_overhead = q->cell_overhead; - NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate); + if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) + goto nla_put_failure; + + if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) + goto nla_put_failure; if (dump_loss_model(q, skb) != 0) goto nla_put_failure; diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c new file mode 100644 index 0000000..89f8fcf --- /dev/null +++ b/net/sched/sch_plug.c @@ -0,0 +1,233 @@ +/* + * sch_plug.c Queue traffic until an explicit release command + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * There are two ways to use this qdisc: + * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating + * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. + * + * 2. For network output buffering (a.k.a output commit) functionality. + * Output commit property is commonly used by applications using checkpoint + * based fault-tolerance to ensure that the checkpoint from which a system + * is being restored is consistent w.r.t outside world. + * + * Consider for e.g. Remus - a Virtual Machine checkpointing system, + * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated + * asynchronously to the backup host, while the VM continues executing the + * next epoch speculatively. + * + * The following is a typical sequence of output buffer operations: + * 1.At epoch i, start_buffer(i) + * 2. At end of epoch i (i.e. after 50ms): + * 2.1 Stop VM and take checkpoint(i). + * 2.2 start_buffer(i+1) and Resume VM + * 3. While speculatively executing epoch(i+1), asynchronously replicate + * checkpoint(i) to backup host. + * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) + * Thus, this Qdisc would receive the following sequence of commands: + * TCQ_PLUG_BUFFER (epoch i) + * .. TCQ_PLUG_BUFFER (epoch i+1) + * ....TCQ_PLUG_RELEASE_ONE (epoch i) + * ......TCQ_PLUG_BUFFER (epoch i+2) + * ........ + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +/* + * State of the queue, when used for network output buffering: + * + * plug(i+1) plug(i) head + * ------------------+--------------------+----------------> + * | | + * | | + * pkts_current_epoch| pkts_last_epoch |pkts_to_release + * ----------------->|<--------+--------->|+---------------> + * v v + * + */ + +struct plug_sched_data { + /* If true, the dequeue function releases all packets + * from head to end of the queue. The queue turns into + * a pass-through queue for newly arriving packets. + */ + bool unplug_indefinite; + + /* Queue Limit in bytes */ + u32 limit; + + /* Number of packets (output) from the current speculatively + * executing epoch. + */ + u32 pkts_current_epoch; + + /* Number of packets corresponding to the recently finished + * epoch. These will be released when we receive a + * TCQ_PLUG_RELEASE_ONE command. This command is typically + * issued after committing a checkpoint at the target. + */ + u32 pkts_last_epoch; + + /* + * Number of packets from the head of the queue, that can + * be released (committed checkpoint). + */ + u32 pkts_to_release; +}; + +static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (likely(sch->qstats.backlog + skb->len <= q->limit)) { + if (!q->unplug_indefinite) + q->pkts_current_epoch++; + return qdisc_enqueue_tail(skb, sch); + } + + return qdisc_reshape_fail(skb, sch); +} + +static struct sk_buff *plug_dequeue(struct Qdisc *sch) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + if (qdisc_is_throttled(sch)) + return NULL; + + if (!q->unplug_indefinite) { + if (!q->pkts_to_release) { + /* No more packets to dequeue. Block the queue + * and wait for the next release command. + */ + qdisc_throttled(sch); + return NULL; + } + q->pkts_to_release--; + } + + return qdisc_dequeue_head(sch); +} + +static int plug_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + + q->pkts_current_epoch = 0; + q->pkts_last_epoch = 0; + q->pkts_to_release = 0; + q->unplug_indefinite = false; + + if (opt == NULL) { + /* We will set a default limit of 100 pkts (~150kB) + * in case tx_queue_len is not available. The + * default value is completely arbitrary. + */ + u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; + q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + } else { + struct tc_plug_qopt *ctl = nla_data(opt); + + if (nla_len(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } + + qdisc_throttled(sch); + return 0; +} + +/* Receives 4 types of messages: + * TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ +static int plug_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct plug_sched_data *q = qdisc_priv(sch); + struct tc_plug_qopt *msg; + + if (opt == NULL) + return -EINVAL; + + msg = nla_data(opt); + if (nla_len(opt) < sizeof(*msg)) + return -EINVAL; + + switch (msg->action) { + case TCQ_PLUG_BUFFER: + /* Save size of the current buffer */ + q->pkts_last_epoch = q->pkts_current_epoch; + q->pkts_current_epoch = 0; + if (q->unplug_indefinite) + qdisc_throttled(sch); + q->unplug_indefinite = false; + break; + case TCQ_PLUG_RELEASE_ONE: + /* Add packets from the last complete buffer to the + * packets to be released set. + */ + q->pkts_to_release += q->pkts_last_epoch; + q->pkts_last_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_RELEASE_INDEFINITE: + q->unplug_indefinite = true; + q->pkts_to_release = 0; + q->pkts_last_epoch = 0; + q->pkts_current_epoch = 0; + qdisc_unthrottled(sch); + netif_schedule_queue(sch->dev_queue); + break; + case TCQ_PLUG_LIMIT: + /* Limit is supplied in bytes */ + q->limit = msg->limit; + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct Qdisc_ops plug_qdisc_ops __read_mostly = { + .id = "plug", + .priv_size = sizeof(struct plug_sched_data), + .enqueue = plug_enqueue, + .dequeue = plug_dequeue, + .peek = qdisc_peek_head, + .init = plug_init, + .change = plug_change, + .owner = THIS_MODULE, +}; + +static int __init plug_module_init(void) +{ + return register_qdisc(&plug_qdisc_ops); +} + +static void __exit plug_module_exit(void) +{ + unregister_qdisc(&plug_qdisc_ops); +} +module_init(plug_module_init) +module_exit(plug_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index b5d56a2..79359b6 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -247,7 +247,8 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index e68cb44..9af01f3 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -429,8 +429,9 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w); - NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax); + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) || + nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax)) + goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index a5cc301..633e32d 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -272,8 +272,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - NLA_PUT_U32(skb, TCA_RED_MAX_P, q->parms.max_P); + if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || + nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P)) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index d7eea99..74305c8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -570,7 +570,8 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) sch->qstats.backlog = q->qdisc->qstats.backlog; opts = nla_nest_start(skb, TCA_OPTIONS); - NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 02a21ab..d3a1bc2 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -812,7 +812,8 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); opt.flags = q->flags; - NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + goto nla_put_failure; return skb->len; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b8e1563..4b056c15 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -359,7 +359,8 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&opt.peakrate, 0, sizeof(opt.peakrate)); opt.mtu = q->mtu; opt.buffer = q->buffer; - NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; nla_nest_end(skb, nest); return skb->len; diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 4532659..ca0c296 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -88,9 +88,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; + return qdisc_drop(skb, sch); } static struct sk_buff * |