summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/fc.c2
-rw-r--r--net/802/fddi.c8
-rw-r--r--net/802/hippi.c6
-rw-r--r--net/802/p8022.c3
-rw-r--r--net/802/p8023.c1
-rw-r--r--net/802/psnap.c7
-rw-r--r--net/802/sysctl_net_802.c3
-rw-r--r--net/802/tr.c22
-rw-r--r--net/8021q/Kconfig19
-rw-r--r--net/8021q/vlan.c8
-rw-r--r--net/8021q/vlan.h2
-rw-r--r--net/8021q/vlan_dev.c2
-rw-r--r--net/Kconfig476
-rw-r--r--net/Makefile3
-rw-r--r--net/appletalk/aarp.c11
-rw-r--r--net/appletalk/ddp.c10
-rw-r--r--net/atm/Kconfig74
-rw-r--r--net/atm/br2684.c3
-rw-r--r--net/atm/ioctl.c1
-rw-r--r--net/atm/ipcommon.c3
-rw-r--r--net/atm/svc.c4
-rw-r--r--net/ax25/af_ax25.c36
-rw-r--r--net/ax25/ax25_addr.c3
-rw-r--r--net/ax25/ax25_ds_in.c3
-rw-r--r--net/ax25/ax25_ds_timer.c2
-rw-r--r--net/ax25/ax25_in.c17
-rw-r--r--net/ax25/ax25_route.c19
-rw-r--r--net/ax25/ax25_std_in.c3
-rw-r--r--net/ax25/ax25_std_timer.c2
-rw-r--r--net/ax25/ax25_subr.c4
-rw-r--r--net/ax25/ax25_uid.c87
-rw-r--r--net/bluetooth/cmtp/core.c6
-rw-r--r--net/bluetooth/hci_core.c20
-rw-r--r--net/bluetooth/hci_event.c83
-rw-r--r--net/bluetooth/hci_sock.c26
-rw-r--r--net/bluetooth/hidp/core.c5
-rw-r--r--net/bluetooth/l2cap.c2
-rw-r--r--net/bluetooth/lib.c25
-rw-r--r--net/bluetooth/rfcomm/core.c77
-rw-r--r--net/bluetooth/rfcomm/sock.c9
-rw-r--r--net/bluetooth/rfcomm/tty.c208
-rw-r--r--net/bluetooth/sco.c2
-rw-r--r--net/bridge/Kconfig31
-rw-r--r--net/bridge/br_fdb.c2
-rw-r--r--net/bridge/br_forward.c3
-rw-r--r--net/bridge/br_input.c4
-rw-r--r--net/bridge/br_netfilter.c40
-rw-r--r--net/bridge/netfilter/Kconfig2
-rw-r--r--net/bridge/netfilter/ebt_log.c6
-rw-r--r--net/bridge/netfilter/ebt_mark.c5
-rw-r--r--net/bridge/netfilter/ebt_ulog.c9
-rw-r--r--net/bridge/netfilter/ebtables.c21
-rw-r--r--net/compat.c53
-rw-r--r--net/core/Makefile7
-rw-r--r--net/core/datagram.c6
-rw-r--r--net/core/dev.c221
-rw-r--r--net/core/dst.c15
-rw-r--r--net/core/ethtool.c49
-rw-r--r--net/core/filter.c110
-rw-r--r--net/core/flow.c2
-rw-r--r--net/core/neighbour.c357
-rw-r--r--net/core/netfilter.c786
-rw-r--r--net/core/netpoll.c133
-rw-r--r--net/core/pktgen.c31
-rw-r--r--net/core/request_sock.c90
-rw-r--r--net/core/rtnetlink.c44
-rw-r--r--net/core/skbuff.c342
-rw-r--r--net/core/sock.c199
-rw-r--r--net/core/sysctl_net_core.c68
-rw-r--r--net/core/utils.c39
-rw-r--r--net/core/wireless.c141
-rw-r--r--net/dccp/Kconfig50
-rw-r--r--net/dccp/Makefile10
-rw-r--r--net/dccp/ccid.c139
-rw-r--r--net/dccp/ccid.h180
-rw-r--r--net/dccp/ccids/Kconfig29
-rw-r--r--net/dccp/ccids/Makefile5
-rw-r--r--net/dccp/ccids/ccid3.c1221
-rw-r--r--net/dccp/ccids/ccid3.h137
-rw-r--r--net/dccp/ccids/lib/Makefile3
-rw-r--r--net/dccp/ccids/lib/loss_interval.c144
-rw-r--r--net/dccp/ccids/lib/loss_interval.h61
-rw-r--r--net/dccp/ccids/lib/packet_history.c398
-rw-r--r--net/dccp/ccids/lib/packet_history.h199
-rw-r--r--net/dccp/ccids/lib/tfrc.h22
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c644
-rw-r--r--net/dccp/dccp.h493
-rw-r--r--net/dccp/diag.c71
-rw-r--r--net/dccp/input.c600
-rw-r--r--net/dccp/ipv4.c1356
-rw-r--r--net/dccp/minisocks.c264
-rw-r--r--net/dccp/options.c855
-rw-r--r--net/dccp/output.c528
-rw-r--r--net/dccp/proto.c826
-rw-r--r--net/dccp/timer.c255
-rw-r--r--net/decnet/Kconfig23
-rw-r--r--net/decnet/af_decnet.c61
-rw-r--r--net/decnet/dn_dev.c17
-rw-r--r--net/decnet/dn_fib.c3
-rw-r--r--net/decnet/dn_neigh.c3
-rw-r--r--net/decnet/dn_nsp_in.c2
-rw-r--r--net/decnet/dn_nsp_out.c68
-rw-r--r--net/decnet/dn_route.c13
-rw-r--r--net/decnet/dn_rules.c7
-rw-r--r--net/decnet/dn_table.c14
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c11
-rw-r--r--net/econet/Kconfig36
-rw-r--r--net/econet/af_econet.c8
-rw-r--r--net/ethernet/eth.c12
-rw-r--r--net/ethernet/sysctl_net_ether.c1
-rw-r--r--net/ieee80211/Kconfig68
-rw-r--r--net/ieee80211/Makefile11
-rw-r--r--net/ieee80211/ieee80211_crypt.c258
-rw-r--r--net/ieee80211/ieee80211_crypt_ccmp.c455
-rw-r--r--net/ieee80211/ieee80211_crypt_tkip.c683
-rw-r--r--net/ieee80211/ieee80211_crypt_wep.c258
-rw-r--r--net/ieee80211/ieee80211_module.c297
-rw-r--r--net/ieee80211/ieee80211_rx.c1193
-rw-r--r--net/ieee80211/ieee80211_tx.c428
-rw-r--r--net/ieee80211/ieee80211_wx.c468
-rw-r--r--net/ipv4/Kconfig171
-rw-r--r--net/ipv4/Makefile22
-rw-r--r--net/ipv4/af_inet.c203
-rw-r--r--net/ipv4/ah4.c20
-rw-r--r--net/ipv4/arp.c8
-rw-r--r--net/ipv4/datagram.c3
-rw-r--r--net/ipv4/devinet.c18
-rw-r--r--net/ipv4/esp4.c38
-rw-r--r--net/ipv4/fib_frontend.c55
-rw-r--r--net/ipv4/fib_hash.c7
-rw-r--r--net/ipv4/fib_lookup.h4
-rw-r--r--net/ipv4/fib_rules.c7
-rw-r--r--net/ipv4/fib_semantics.c26
-rw-r--r--net/ipv4/fib_trie.c2528
-rw-r--r--net/ipv4/icmp.c39
-rw-r--r--net/ipv4/igmp.c98
-rw-r--r--net/ipv4/inet_connection_sock.c641
-rw-r--r--net/ipv4/inet_diag.c868
-rw-r--r--net/ipv4/inet_hashtables.c165
-rw-r--r--net/ipv4/inet_timewait_sock.c384
-rw-r--r--net/ipv4/inetpeer.c16
-rw-r--r--net/ipv4/ip_forward.c6
-rw-r--r--net/ipv4/ip_fragment.c14
-rw-r--r--net/ipv4/ip_gre.c21
-rw-r--r--net/ipv4/ip_input.c148
-rw-r--r--net/ipv4/ip_options.c52
-rw-r--r--net/ipv4/ip_output.c54
-rw-r--r--net/ipv4/ip_sockglue.c23
-rw-r--r--net/ipv4/ipcomp.c20
-rw-r--r--net/ipv4/ipconfig.c13
-rw-r--r--net/ipv4/ipip.c56
-rw-r--r--net/ipv4/ipmr.c19
-rw-r--r--net/ipv4/ipvs/Kconfig4
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c1
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c33
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c9
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c21
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c4
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c3
-rw-r--r--net/ipv4/multipath_drr.c4
-rw-r--r--net/ipv4/multipath_random.c2
-rw-r--r--net/ipv4/multipath_rr.c2
-rw-r--r--net/ipv4/multipath_wrandom.c2
-rw-r--r--net/ipv4/netfilter.c139
-rw-r--r--net/ipv4/netfilter/Kconfig90
-rw-r--r--net/ipv4/netfilter/Makefile10
-rw-r--r--net/ipv4/netfilter/arp_tables.c1
-rw-r--r--net/ipv4/netfilter/ip_conntrack_amanda.c34
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c552
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c43
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c23
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netbios_ns.c131
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c1584
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c73
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_sctp.c32
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c76
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c15
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c81
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c9
-rw-r--r--net/ipv4/netfilter/ip_nat_amanda.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c136
-rw-r--r--net/ipv4/netfilter/ip_nat_ftp.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_helper.c21
-rw-r--r--net/ipv4/netfilter/ip_nat_irc.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c30
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c27
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c26
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c13
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c25
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c21
-rw-r--r--net/ipv4/netfilter/ip_nat_tftp.c4
-rw-r--r--net/ipv4/netfilter/ip_queue.c58
-rw-r--r--net/ipv4/netfilter/ip_tables.c6
-rw-r--r--net/ipv4/netfilter/ipt_CLASSIFY.c4
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c62
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c15
-rw-r--r--net/ipv4/netfilter/ipt_DSCP.c3
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c23
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c86
-rw-r--r--net/ipv4/netfilter/ipt_MARK.c22
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c15
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c8
-rw-r--r--net/ipv4/netfilter/ipt_NFQUEUE.c70
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c14
-rw-r--r--net/ipv4/netfilter/ipt_TCPMSS.c10
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c3
-rw-r--r--net/ipv4/netfilter/ipt_TTL.c119
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c68
-rw-r--r--net/ipv4/netfilter/ipt_connbytes.c162
-rw-r--r--net/ipv4/netfilter/ipt_connmark.c7
-rw-r--r--net/ipv4/netfilter/ipt_dccp.c176
-rw-r--r--net/ipv4/netfilter/ipt_hashlimit.c19
-rw-r--r--net/ipv4/netfilter/ipt_helper.c4
-rw-r--r--net/ipv4/netfilter/ipt_mark.c7
-rw-r--r--net/ipv4/netfilter/ipt_owner.c132
-rw-r--r--net/ipv4/netfilter/ipt_recent.c10
-rw-r--r--net/ipv4/netfilter/ipt_string.c91
-rw-r--r--net/ipv4/proc.c5
-rw-r--r--net/ipv4/protocol.c1
-rw-r--r--net/ipv4/raw.c29
-rw-r--r--net/ipv4/route.c164
-rw-r--r--net/ipv4/syncookies.c51
-rw-r--r--net/ipv4/sysctl_net_ipv4.c160
-rw-r--r--net/ipv4/tcp.c540
-rw-r--r--net/ipv4/tcp_bic.c337
-rw-r--r--net/ipv4/tcp_cong.c245
-rw-r--r--net/ipv4/tcp_diag.c797
-rw-r--r--net/ipv4/tcp_highspeed.c184
-rw-r--r--net/ipv4/tcp_htcp.c298
-rw-r--r--net/ipv4/tcp_hybla.c188
-rw-r--r--net/ipv4/tcp_input.c1299
-rw-r--r--net/ipv4/tcp_ipv4.c1054
-rw-r--r--net/ipv4/tcp_minisocks.c659
-rw-r--r--net/ipv4/tcp_output.c771
-rw-r--r--net/ipv4/tcp_scalable.c70
-rw-r--r--net/ipv4/tcp_timer.c262
-rw-r--r--net/ipv4/tcp_vegas.c413
-rw-r--r--net/ipv4/tcp_westwood.c263
-rw-r--r--net/ipv4/udp.c39
-rw-r--r--net/ipv4/utils.c59
-rw-r--r--net/ipv4/xfrm4_output.c8
-rw-r--r--net/ipv4/xfrm4_state.c11
-rw-r--r--net/ipv4/xfrm4_tunnel.c5
-rw-r--r--net/ipv6/Kconfig23
-rw-r--r--net/ipv6/Makefile4
-rw-r--r--net/ipv6/addrconf.c128
-rw-r--r--net/ipv6/af_inet6.c66
-rw-r--r--net/ipv6/ah6.c33
-rw-r--r--net/ipv6/anycast.c4
-rw-r--r--net/ipv6/datagram.c11
-rw-r--r--net/ipv6/esp6.c29
-rw-r--r--net/ipv6/exthdrs.c8
-rw-r--r--net/ipv6/icmp.c39
-rw-r--r--net/ipv6/inet6_hashtables.c81
-rw-r--r--net/ipv6/ip6_fib.c21
-rw-r--r--net/ipv6/ip6_flowlabel.c1
-rw-r--r--net/ipv6/ip6_input.c15
-rw-r--r--net/ipv6/ip6_output.c65
-rw-r--r--net/ipv6/ip6_tunnel.c38
-rw-r--r--net/ipv6/ipcomp6.c14
-rw-r--r--net/ipv6/ipv6_sockglue.c33
-rw-r--r--net/ipv6/ipv6_syms.c3
-rw-r--r--net/ipv6/mcast.c95
-rw-r--r--net/ipv6/ndisc.c8
-rw-r--r--net/ipv6/netfilter.c104
-rw-r--r--net/ipv6/netfilter/Kconfig37
-rw-r--r--net/ipv6/netfilter/Makefile3
-rw-r--r--net/ipv6/netfilter/ip6_queue.c57
-rw-r--r--net/ipv6/netfilter/ip6_tables.c6
-rw-r--r--net/ipv6/netfilter/ip6t_HL.c118
-rw-r--r--net/ipv6/netfilter/ip6t_LOG.c148
-rw-r--r--net/ipv6/netfilter/ip6t_MARK.c5
-rw-r--r--net/ipv6/netfilter/ip6t_NFQUEUE.c70
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c284
-rw-r--r--net/ipv6/netfilter/ip6t_owner.c90
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c6
-rw-r--r--net/ipv6/raw.c30
-rw-r--r--net/ipv6/reassembly.c4
-rw-r--r--net/ipv6/route.c93
-rw-r--r--net/ipv6/sit.c23
-rw-r--r--net/ipv6/sysctl_net_ipv6.c3
-rw-r--r--net/ipv6/tcp_ipv6.c586
-rw-r--r--net/ipv6/udp.c16
-rw-r--r--net/ipv6/xfrm6_tunnel.c4
-rw-r--r--net/ipx/Kconfig33
-rw-r--r--net/ipx/af_ipx.c10
-rw-r--r--net/ipx/ipx_proc.c2
-rw-r--r--net/irda/af_irda.c2
-rw-r--r--net/irda/irlan/irlan_filter.c1
-rw-r--r--net/irda/irlap.c3
-rw-r--r--net/irda/irlap_event.c14
-rw-r--r--net/irda/irlap_frame.c16
-rw-r--r--net/irda/irlmp.c3
-rw-r--r--net/irda/irmod.c2
-rw-r--r--net/irda/irnet/irnet.h3
-rw-r--r--net/irda/irnet/irnet_ppp.c2
-rw-r--r--net/irda/irqueue.c1
-rw-r--r--net/irda/irttp.c2
-rw-r--r--net/irda/qos.c1
-rw-r--r--net/key/af_key.c385
-rw-r--r--net/lapb/Kconfig22
-rw-r--r--net/lapb/lapb_subr.c2
-rw-r--r--net/llc/af_llc.c4
-rw-r--r--net/llc/llc_c_ev.c2
-rw-r--r--net/llc/llc_conn.c8
-rw-r--r--net/llc/llc_core.c3
-rw-r--r--net/llc/llc_if.c2
-rw-r--r--net/llc/llc_input.c4
-rw-r--r--net/llc/llc_sap.c2
-rw-r--r--net/netfilter/Kconfig24
-rw-r--r--net/netfilter/Makefile7
-rw-r--r--net/netfilter/core.c216
-rw-r--r--net/netfilter/nf_internals.h39
-rw-r--r--net/netfilter/nf_log.c178
-rw-r--r--net/netfilter/nf_queue.c343
-rw-r--r--net/netfilter/nf_sockopt.c132
-rw-r--r--net/netfilter/nfnetlink.c376
-rw-r--r--net/netfilter/nfnetlink_log.c1055
-rw-r--r--net/netfilter/nfnetlink_queue.c1127
-rw-r--r--net/netlink/af_netlink.c365
-rw-r--r--net/netrom/af_netrom.c45
-rw-r--r--net/netrom/nr_dev.c5
-rw-r--r--net/netrom/nr_in.c3
-rw-r--r--net/netrom/nr_route.c8
-rw-r--r--net/netrom/nr_subr.c4
-rw-r--r--net/netrom/nr_timer.c2
-rw-r--r--net/packet/Kconfig26
-rw-r--r--net/packet/af_packet.c26
-rw-r--r--net/rose/af_rose.c35
-rw-r--r--net/rose/rose_in.c3
-rw-r--r--net/rose/rose_route.c22
-rw-r--r--net/rose/rose_subr.c9
-rw-r--r--net/rose/rose_timer.c2
-rw-r--r--net/rxrpc/krxiod.c2
-rw-r--r--net/rxrpc/krxsecd.c2
-rw-r--r--net/rxrpc/krxtimod.c2
-rw-r--r--net/rxrpc/transport.c2
-rw-r--r--net/sched/Kconfig51
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c38
-rw-r--r--net/sched/cls_api.c9
-rw-r--r--net/sched/cls_rsvp.h1
-rw-r--r--net/sched/em_meta.c68
-rw-r--r--net/sched/em_text.c154
-rw-r--r--net/sched/gact.c2
-rw-r--r--net/sched/ipt.c2
-rw-r--r--net/sched/mirred.c2
-rw-r--r--net/sched/pedit.c2
-rw-r--r--net/sched/police.c3
-rw-r--r--net/sched/sch_api.c79
-rw-r--r--net/sched/sch_blackhole.c54
-rw-r--r--net/sched/sch_cbq.c3
-rw-r--r--net/sched/sch_dsmark.c357
-rw-r--r--net/sched/sch_fifo.c152
-rw-r--r--net/sched/sch_generic.c139
-rw-r--r--net/sched/sch_red.c2
-rw-r--r--net/sched/simple.c2
-rw-r--r--net/sctp/associola.c164
-rw-r--r--net/sctp/bind_addr.c16
-rw-r--r--net/sctp/chunk.c2
-rw-r--r--net/sctp/endpointola.c23
-rw-r--r--net/sctp/input.c126
-rw-r--r--net/sctp/inqueue.c18
-rw-r--r--net/sctp/ipv6.c50
-rw-r--r--net/sctp/objcnt.c6
-rw-r--r--net/sctp/output.c22
-rw-r--r--net/sctp/outqueue.c61
-rw-r--r--net/sctp/proc.c195
-rw-r--r--net/sctp/protocol.c28
-rw-r--r--net/sctp/sm_make_chunk.c56
-rw-r--r--net/sctp/sm_sideeffect.c118
-rw-r--r--net/sctp/sm_statefuns.c162
-rw-r--r--net/sctp/sm_statetable.c6
-rw-r--r--net/sctp/socket.c434
-rw-r--r--net/sctp/ssnmap.c3
-rw-r--r--net/sctp/sysctl.c14
-rw-r--r--net/sctp/transport.c10
-rw-r--r--net/sctp/ulpevent.c19
-rw-r--r--net/sctp/ulpqueue.c72
-rw-r--r--net/socket.c56
-rw-r--r--net/sunrpc/auth.c6
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c18
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c7
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c9
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_mech.c12
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c8
-rw-r--r--net/sunrpc/cache.c8
-rw-r--r--net/sunrpc/clnt.c205
-rw-r--r--net/sunrpc/pmap_clnt.c9
-rw-r--r--net/sunrpc/rpc_pipe.c6
-rw-r--r--net/sunrpc/sched.c92
-rw-r--r--net/sunrpc/stats.c16
-rw-r--r--net/sunrpc/sunrpc_syms.c13
-rw-r--r--net/sunrpc/svc.c36
-rw-r--r--net/sunrpc/svcauth.c1
-rw-r--r--net/sunrpc/svcauth_unix.c12
-rw-r--r--net/sunrpc/svcsock.c19
-rw-r--r--net/sunrpc/xdr.c299
-rw-r--r--net/sunrpc/xprt.c73
-rw-r--r--net/sysctl_net.c8
-rw-r--r--net/unix/Kconfig21
-rw-r--r--net/unix/af_unix.c14
-rw-r--r--net/unix/garbage.c14
-rw-r--r--net/unix/sysctl_net_unix.c2
-rw-r--r--net/wanrouter/Kconfig29
-rw-r--r--net/wanrouter/af_wanpipe.c2
-rw-r--r--net/wanrouter/wanmain.c6
-rw-r--r--net/x25/Kconfig36
-rw-r--r--net/x25/af_x25.c112
-rw-r--r--net/x25/x25_dev.c2
-rw-r--r--net/x25/x25_facilities.c34
-rw-r--r--net/x25/x25_in.c2
-rw-r--r--net/x25/x25_subr.c45
-rw-r--r--net/x25/x25_timer.c2
-rw-r--r--net/xfrm/Kconfig15
-rw-r--r--net/xfrm/xfrm_input.c2
-rw-r--r--net/xfrm/xfrm_policy.c11
-rw-r--r--net/xfrm/xfrm_state.c118
-rw-r--r--net/xfrm/xfrm_user.c319
423 files changed, 36347 insertions, 10250 deletions
diff --git a/net/802/fc.c b/net/802/fc.c
index 640d34e..282c4ab 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb)
struct fch_hdr *fch=(struct fch_hdr *)skb->data;
struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
if(fcllc->ethertype != htons(ETH_P_IP)) {
- printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype));
+ printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype));
return 0;
}
#ifdef CONFIG_INET
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf483..ac242a4b 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff *skb)
else
#endif
{
- printk("%s: Don't know how to resolve type %02X addresses.\n",
- skb->dev->name, htons(fddi->hdr.llc_snap.ethertype));
+ printk("%s: Don't know how to resolve type %04X addresses.\n",
+ skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
return(0);
}
}
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb)
* the proper pointer to the start of packet data (skb->data).
*/
-unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct fddihdr *fddi = (struct fddihdr *)skb->data;
- unsigned short type;
+ __be16 type;
/*
* Set mac.raw field to point to FC byte, set data field to point
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 051e8af..6d7fed3 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
unsigned len)
{
struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+ struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
if (!len){
len = skb->len - HIPPI_HLEN;
@@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
if (daddr)
{
memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
- memcpy(&skb->private.ifield, daddr + 2, 4);
+ memcpy(&hcb->ifield, daddr + 2, 4);
return HIPPI_HLEN;
}
+ hcb->ifield = 0;
return -((int)HIPPI_HLEN);
}
@@ -122,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb)
* Determine the packet's protocol ID.
*/
-unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct hippi_hdr *hip;
diff --git a/net/802/p8022.c b/net/802/p8022.c
index 5ae6341..b24817c 100644
--- a/net/802/p8022.c
+++ b/net/802/p8022.c
@@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
struct datalink_proto *register_8022_client(unsigned char type,
int (*func)(struct sk_buff *skb,
struct net_device *dev,
- struct packet_type *pt))
+ struct packet_type *pt,
+ struct net_device *orig_dev))
{
struct datalink_proto *proto;
diff --git a/net/802/p8023.c b/net/802/p8023.c
index a0b61b4..6368d3d 100644
--- a/net/802/p8023.c
+++ b/net/802/p8023.c
@@ -20,6 +20,7 @@
#include <linux/skbuff.h>
#include <net/datalink.h>
+#include <net/p8022.h>
/*
* Place an 802.3 header on a packet. The driver will do the mac
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 1053821..ab80b1f 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc)
* A SNAP packet has arrived
*/
static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
int rc = 1;
struct datalink_proto *proto;
@@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
/* Pass the frame on. */
skb->h.raw += 5;
skb_pull(skb, 5);
- rc = proto->rcvfunc(skb, dev, &snap_packet_type);
+ rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
} else {
skb->sk = NULL;
kfree_skb(skb);
@@ -118,7 +118,8 @@ module_exit(snap_exit);
struct datalink_proto *register_snap_client(unsigned char *desc,
int (*rcvfunc)(struct sk_buff *,
struct net_device *,
- struct packet_type *))
+ struct packet_type *,
+ struct net_device *))
{
struct datalink_proto *proto = NULL;
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c
index 3607963..7001295 100644
--- a/net/802/sysctl_net_802.c
+++ b/net/802/sysctl_net_802.c
@@ -10,9 +10,10 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/config.h>
#include <linux/mm.h>
+#include <linux/if_tr.h>
#include <linux/sysctl.h>
-#include <linux/config.h>
#ifdef CONFIG_TR
extern int sysctl_tr_rif_timeout;
diff --git a/net/802/tr.c b/net/802/tr.c
index a755e88..1bb7dc1 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -251,10 +251,11 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *
unsigned int hash;
struct rif_cache *entry;
unsigned char *olddata;
+ unsigned long flags;
static const unsigned char mcast_func_addr[]
= {0xC0,0x00,0x00,0x04,0x00,0x00};
- spin_lock_bh(&rif_lock);
+ spin_lock_irqsave(&rif_lock, flags);
/*
* Broadcasts are single route as stated in RFC 1042
@@ -323,7 +324,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
else
slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8);
olddata = skb->data;
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
skb_pull(skb, slack);
memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack);
@@ -337,10 +338,11 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
{
unsigned int hash, rii_p = 0;
+ unsigned long flags;
struct rif_cache *entry;
- spin_lock_bh(&rif_lock);
+ spin_lock_irqsave(&rif_lock, flags);
/*
* Firstly see if the entry exists
@@ -378,7 +380,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
if(!entry)
{
printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n");
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
return;
}
@@ -420,7 +422,7 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
}
entry->last_used=jiffies;
}
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
}
/*
@@ -430,9 +432,9 @@ printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
static void rif_check_expire(unsigned long dummy)
{
int i;
- unsigned long next_interval = jiffies + sysctl_tr_rif_timeout/2;
+ unsigned long flags, next_interval = jiffies + sysctl_tr_rif_timeout/2;
- spin_lock_bh(&rif_lock);
+ spin_lock_irqsave(&rif_lock, flags);
for(i =0; i < RIF_TABLE_SIZE; i++) {
struct rif_cache *entry, **pentry;
@@ -454,7 +456,7 @@ static void rif_check_expire(unsigned long dummy)
}
}
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irqrestore(&rif_lock, flags);
mod_timer(&rif_timer, next_interval);
@@ -485,7 +487,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
{
- spin_lock_bh(&rif_lock);
+ spin_lock_irq(&rif_lock);
return *pos ? rif_get_idx(*pos - 1) : SEQ_START_TOKEN;
}
@@ -516,7 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void rif_seq_stop(struct seq_file *seq, void *v)
{
- spin_unlock_bh(&rif_lock);
+ spin_unlock_irq(&rif_lock);
}
static int rif_seq_show(struct seq_file *seq, void *v)
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 0000000..c4a382e
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,19 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+ tristate "802.1Q VLAN Support"
+ ---help---
+ Select this and you will be able to create 802.1Q VLAN interfaces
+ on your ethernet interfaces. 802.1Q VLAN supports almost
+ everything a regular ethernet interface does, including
+ firewalling, bridging, and of course IP traffic. You will need
+ the 'vconfig' tool from the VLAN project in order to effectively
+ use VLANs. See the VLAN web page for more information:
+ <http://www.candelatech.com/~greear/vlan.html>
+
+ To compile this code as a module, choose M here: the module
+ will be called 8021q.
+
+ If unsure, say N.
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1f6d316..91e412b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
if (!vlandev)
continue;
+ if (netif_carrier_ok(dev)) {
+ if (!netif_carrier_ok(vlandev))
+ netif_carrier_on(vlandev);
+ } else {
+ if (netif_carrier_ok(vlandev))
+ netif_carrier_off(vlandev);
+ }
+
if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) {
vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK)
| flgs;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 508b1fa..9ae3a14 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev,
/* found in vlan_dev.c */
int vlan_dev_rebuild_header(struct sk_buff *skb);
int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type* ptype);
+ struct packet_type *ptype, struct net_device *orig_dev);
int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type, void *daddr, void *saddr,
unsigned len);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 49c4874..145f5cd 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
*
*/
int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type* ptype)
+ struct packet_type* ptype, struct net_device *orig_dev)
{
unsigned char *rawp = NULL;
struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data);
diff --git a/net/Kconfig b/net/Kconfig
index 9251b28..2bdd562 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -2,7 +2,7 @@
# Network configuration
#
-menu "Networking support"
+menu "Networking"
config NET
bool "Networking support"
@@ -10,7 +10,9 @@ config NET
Unless you really know what you are doing, you should say Y here.
The reason is that some programs need kernel networking support even
when running on a stand-alone machine that isn't connected to any
- other computer. If you are upgrading from an older kernel, you
+ other computer.
+
+ If you are upgrading from an older kernel, you
should consider updating your networking tools too because changes
in the kernel and the tools often go hand in hand. The tools are
contained in the package net-tools, the location and version number
@@ -20,57 +22,14 @@ config NET
recommended to read the NET-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
-menu "Networking options"
- depends on NET
-
-config PACKET
- tristate "Packet socket"
- ---help---
- The Packet protocol is used by applications which communicate
- directly with network devices without an intermediate network
- protocol implemented in the kernel, e.g. tcpdump. If you want them
- to work, choose Y.
+# Make sure that all config symbols are dependent on NET
+if NET
- To compile this driver as a module, choose M here: the module will
- be called af_packet.
-
- If unsure, say Y.
-
-config PACKET_MMAP
- bool "Packet socket: mmapped IO"
- depends on PACKET
- help
- If you say Y here, the Packet protocol driver will use an IO
- mechanism that results in faster communication.
-
- If unsure, say N.
-
-config UNIX
- tristate "Unix domain sockets"
- ---help---
- If you say Y here, you will include support for Unix domain sockets;
- sockets are the standard Unix mechanism for establishing and
- accessing network connections. Many commonly used programs such as
- the X Window system and syslog use these sockets even if your
- machine is not connected to any network. Unless you are working on
- an embedded system or something similar, you therefore definitely
- want to say Y here.
-
- To compile this driver as a module, choose M here: the module will be
- called unix. Note that several important services won't work
- correctly if you say M here and then neglect to load the module.
-
- Say Y unless you know what you are doing.
-
-config NET_KEY
- tristate "PF_KEY sockets"
- select XFRM
- ---help---
- PF_KEYv2 socket family, compatible to KAME ones.
- They are required if you are going to use IPsec tools ported
- from KAME.
+menu "Networking options"
- Say Y unless you know what you are doing.
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
config INET
bool "TCP/IP networking"
@@ -94,30 +53,12 @@ config INET
Short answer: say Y.
+if INET
source "net/ipv4/Kconfig"
-
-# IPv6 as module will cause a CRASH if you try to unload it
-config IPV6
- tristate "The IPv6 protocol"
- depends on INET
- default m
- select CRYPTO if IPV6_PRIVACY
- select CRYPTO_MD5 if IPV6_PRIVACY
- ---help---
- This is complemental support for the IP version 6.
- You will still be able to do traditional IPv4 networking as well.
-
- For general information about IPv6, see
- <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
- For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
- For specific information about IPv6 under Linux, read the HOWTO at
- <http://www.bieringer.de/linux/IPv6/>.
-
- To compile this protocol support as a module, choose M here: the
- module will be called ipv6.
-
source "net/ipv6/Kconfig"
+endif # if INET
+
menuconfig NETFILTER
bool "Network packet filtering (replaces ipchains)"
---help---
@@ -206,269 +147,17 @@ source "net/bridge/netfilter/Kconfig"
endif
-config XFRM
- bool
- depends on NET
-
-source "net/xfrm/Kconfig"
-
+source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
-
-config ATM
- tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- ---help---
- ATM is a high-speed networking technology for Local Area Networks
- and Wide Area Networks. It uses a fixed packet size and is
- connection oriented, allowing for the negotiation of minimum
- bandwidth requirements.
-
- In order to participate in an ATM network, your Linux box needs an
- ATM networking card. If you have that, say Y here and to the driver
- of your ATM card below.
-
- Note that you need a set of user-space programs to actually make use
- of ATM. See the file <file:Documentation/networking/atm.txt> for
- further details.
-
-config ATM_CLIP
- tristate "Classical IP over ATM (EXPERIMENTAL)"
- depends on ATM && INET
- help
- Classical IP over ATM for PVCs and SVCs, supporting InARP and
- ATMARP. If you want to communication with other IP hosts on your ATM
- network, you will typically either say Y here or to "LAN Emulation
- (LANE)" below.
-
-config ATM_CLIP_NO_ICMP
- bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
- depends on ATM_CLIP
- help
- Normally, an "ICMP host unreachable" message is sent if a neighbour
- cannot be reached because there is no VC to it in the kernel's
- ATMARP table. This may cause problems when ATMARP table entries are
- briefly removed during revalidation. If you say Y here, packets to
- such neighbours are silently discarded instead.
-
-config ATM_LANE
- tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
- depends on ATM
- help
- LAN Emulation emulates services of existing LANs across an ATM
- network. Besides operating as a normal ATM end station client, Linux
- LANE client can also act as an proxy client bridging packets between
- ELAN and Ethernet segments. You need LANE if you want to try MPOA.
-
-config ATM_MPOA
- tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
- depends on ATM && INET && ATM_LANE!=n
- help
- Multi-Protocol Over ATM allows ATM edge devices such as routers,
- bridges and ATM attached hosts establish direct ATM VCs across
- subnetwork boundaries. These shortcut connections bypass routers
- enhancing overall network performance.
-
-config ATM_BR2684
- tristate "RFC1483/2684 Bridged protocols"
- depends on ATM && INET
- help
- ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
- This device will act like an ethernet from the kernels point of view,
- with the traffic being carried by ATM PVCs (currently 1 PVC/device).
- This is sometimes used over DSL lines. If in doubt, say N.
-
-config ATM_BR2684_IPFILTER
- bool "Per-VC IP filter kludge"
- depends on ATM_BR2684
- help
- This is an experimental mechanism for users who need to terminating a
- large number of IP-only vcc's. Do not enable this unless you are sure
- you know what you are doing.
-
-config BRIDGE
- tristate "802.1d Ethernet Bridging"
- ---help---
- If you say Y here, then your Linux box will be able to act as an
- Ethernet bridge, which means that the different Ethernet segments it
- is connected to will appear as one Ethernet to the participants.
- Several such bridges can work together to create even larger
- networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
- As this is a standard, Linux bridges will cooperate properly with
- other third party bridge products.
-
- In order to use the Ethernet bridge, you'll need the bridge
- configuration tools; see <file:Documentation/networking/bridge.txt>
- for location. Please read the Bridge mini-HOWTO for more
- information.
-
- If you enable iptables support along with the bridge support then you
- turn your bridge into a bridging IP firewall.
- iptables will then see the IP packets being bridged, so you need to
- take this into account when setting up your firewall rules.
- Enabling arptables support when bridging will let arptables see
- bridged ARP traffic in the arptables FORWARD chain.
-
- To compile this code as a module, choose M here: the module
- will be called bridge.
-
- If unsure, say N.
-
-config VLAN_8021Q
- tristate "802.1Q VLAN Support"
- ---help---
- Select this and you will be able to create 802.1Q VLAN interfaces
- on your ethernet interfaces. 802.1Q VLAN supports almost
- everything a regular ethernet interface does, including
- firewalling, bridging, and of course IP traffic. You will need
- the 'vconfig' tool from the VLAN project in order to effectively
- use VLANs. See the VLAN web page for more information:
- <http://www.candelatech.com/~greear/vlan.html>
-
- To compile this code as a module, choose M here: the module
- will be called 8021q.
-
- If unsure, say N.
-
-config DECNET
- tristate "DECnet Support"
- ---help---
- The DECnet networking protocol was used in many products made by
- Digital (now Compaq). It provides reliable stream and sequenced
- packet communications over which run a variety of services similar
- to those which run over TCP/IP.
-
- To find some tools to use with the kernel layer support, please
- look at Patrick Caulfield's web site:
- <http://linux-decnet.sourceforge.net/>.
-
- More detailed documentation is available in
- <file:Documentation/networking/decnet.txt>.
-
- Be sure to say Y to "/proc file system support" and "Sysctl support"
- below when using DECnet, since you will need sysctl support to aid
- in configuration at run time.
-
- The DECnet code is also available as a module ( = code which can be
- inserted in and removed from the running kernel whenever you want).
- The module is called decnet.
-
+source "net/atm/Kconfig"
+source "net/bridge/Kconfig"
+source "net/8021q/Kconfig"
source "net/decnet/Kconfig"
-
source "net/llc/Kconfig"
-
-config IPX
- tristate "The IPX protocol"
- select LLC
- ---help---
- This is support for the Novell networking protocol, IPX, commonly
- used for local networks of Windows machines. You need it if you
- want to access Novell NetWare file or print servers using the Linux
- Novell client ncpfs (available from
- <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
- within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
- available from <http://www.tldp.org/docs.html#howto>). In order
- to do the former, you'll also have to say Y to "NCP file system
- support", below.
-
- IPX is similar in scope to IP, while SPX, which runs on top of IPX,
- is similar to TCP. There is also experimental support for SPX in
- Linux (see "SPX networking", below).
-
- To turn your Linux box into a fully featured NetWare file server and
- IPX router, say Y here and fetch either lwared from
- <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
- mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
- information, read the IPX-HOWTO available from
- <http://www.tldp.org/docs.html#howto>.
-
- General information about how to connect Linux, Windows machines and
- Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
- The IPX driver would enlarge your kernel by about 16 KB. To compile
- this driver as a module, choose M here: the module will be called ipx.
- Unless you want to integrate your Linux box with a local Novell
- network, say N.
-
source "net/ipx/Kconfig"
-
-config ATALK
- tristate "Appletalk protocol support"
- select LLC
- ---help---
- AppleTalk is the protocol that Apple computers can use to communicate
- on a network. If your Linux box is connected to such a network and you
- wish to connect to it, say Y. You will need to use the netatalk package
- so that your Linux box can act as a print and file server for Macs as
- well as access AppleTalk printers. Check out
- <http://www.zettabyte.net/netatalk/> on the WWW for details.
- EtherTalk is the name used for AppleTalk over Ethernet and the
- cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
- network using serial links. EtherTalk and LocalTalk are fully
- supported by Linux.
-
- General information about how to connect Linux, Windows machines and
- Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
- NET-3-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>, contains valuable
- information as well.
-
- To compile this driver as a module, choose M here: the module will be
- called appletalk. You almost certainly want to compile it as a
- module so you can restart your AppleTalk stack without rebooting
- your machine. I hear that the GNU boycott of Apple is over, so
- even politically correct people are allowed to say Y here.
-
source "drivers/net/appletalk/Kconfig"
-
-config X25
- tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- ---help---
- X.25 is a set of standardized network protocols, similar in scope to
- frame relay; the one physical line from your box to the X.25 network
- entry point can carry several logical point-to-point connections
- (called "virtual circuits") to other computers connected to the X.25
- network. Governments, banks, and other organizations tend to use it
- to connect to each other or to form Wide Area Networks (WANs). Many
- countries have public X.25 networks. X.25 consists of two
- protocols: the higher level Packet Layer Protocol (PLP) (say Y here
- if you want that) and the lower level data link layer protocol LAPB
- (say Y to "LAPB Data Link Driver" below if you want that).
-
- You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
- <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
- Information about X.25 for Linux is contained in the files
- <file:Documentation/networking/x25.txt> and
- <file:Documentation/networking/x25-iface.txt>.
-
- One connects to an X.25 network either with a dedicated network card
- using the X.21 protocol (not yet supported by Linux) or one can do
- X.25 over a standard telephone line using an ordinary modem (say Y
- to "X.25 async driver" below) or over Ethernet using an ordinary
- Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
- Driver" and "LAPB over Ethernet driver" below).
-
- To compile this driver as a module, choose M here: the module
- will be called x25. If unsure, say N.
-
-config LAPB
- tristate "LAPB Data Link Driver (EXPERIMENTAL)"
- depends on EXPERIMENTAL
- ---help---
- Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
- the lower) part of the X.25 protocol. It offers a reliable
- connection service to exchange data frames with one other host, and
- it is used to transport higher level protocols (mostly X.25 Packet
- Layer, the higher part of X.25, but others are possible as well).
- Usually, LAPB is used with specialized X.21 network cards, but Linux
- currently supports LAPB only over Ethernet connections. If you want
- to use LAPB connections over Ethernet, say Y here and to "LAPB over
- Ethernet driver" below. Read
- <file:Documentation/networking/lapb-module.txt> for technical
- details.
-
- To compile this driver as a module, choose M here: the
- module will be called lapb. If unsure, say N.
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
config NET_DIVERT
bool "Frame Diverter (EXPERIMENTAL)"
@@ -496,107 +185,10 @@ config NET_DIVERT
If unsure, say N.
-config ECONET
- tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
- depends on EXPERIMENTAL && INET
- ---help---
- Econet is a fairly old and slow networking protocol mainly used by
- Acorn computers to access file and print servers. It uses native
- Econet network cards. AUN is an implementation of the higher level
- parts of Econet that runs over ordinary Ethernet connections, on
- top of the UDP packet protocol, which in turn runs on top of the
- Internet protocol IP.
-
- If you say Y here, you can choose with the next two options whether
- to send Econet/AUN traffic over a UDP Ethernet connection or over
- a native Econet network card.
-
- To compile this driver as a module, choose M here: the module
- will be called econet.
-
-config ECONET_AUNUDP
- bool "AUN over UDP"
- depends on ECONET
- help
- Say Y here if you want to send Econet/AUN traffic over a UDP
- connection (UDP is a packet based protocol that runs on top of the
- Internet protocol IP) using an ordinary Ethernet network card.
-
-config ECONET_NATIVE
- bool "Native Econet"
- depends on ECONET
- help
- Say Y here if you have a native Econet network card installed in
- your computer.
-
-config WAN_ROUTER
- tristate "WAN router"
- depends on EXPERIMENTAL
- ---help---
- Wide Area Networks (WANs), such as X.25, frame relay and leased
- lines, are used to interconnect Local Area Networks (LANs) over vast
- distances with data transfer rates significantly higher than those
- achievable with commonly used asynchronous modem connections.
- Usually, a quite expensive external device called a `WAN router' is
- needed to connect to a WAN.
-
- As an alternative, WAN routing can be built into the Linux kernel.
- With relatively inexpensive WAN interface cards available on the
- market, a perfectly usable router can be built for less than half
- the price of an external router. If you have one of those cards and
- wish to use your Linux box as a WAN router, say Y here and also to
- the WAN driver for your card, below. You will then need the
- wan-tools package which is available from <ftp://ftp.sangoma.com/>.
- Read <file:Documentation/networking/wan-router.txt> for more
- information.
-
- To compile WAN routing support as a module, choose M here: the
- module will be called wanrouter.
-
- If unsure, say N.
-
-menu "QoS and/or fair queueing"
-
-config NET_SCHED
- bool "QoS and/or fair queueing"
- ---help---
- When the kernel has several packets to send out over a network
- device, it has to decide which ones to send first, which ones to
- delay, and which ones to drop. This is the job of the packet
- scheduler, and several different algorithms for how to do this
- "fairly" have been proposed.
-
- If you say N here, you will get the standard packet scheduler, which
- is a FIFO (first come, first served). If you say Y here, you will be
- able to choose from among several alternative algorithms which can
- then be attached to different network devices. This is useful for
- example if some of your network devices are real time devices that
- need a certain minimum data flow rate, or if you need to limit the
- maximum data flow rate for traffic which matches specified criteria.
- This code is considered to be experimental.
-
- To administer these schedulers, you'll need the user-level utilities
- from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
- That package also contains some documentation; for more, check out
- <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
-
- This Quality of Service (QoS) support will enable you to use
- Differentiated Services (diffserv) and Resource Reservation Protocol
- (RSVP) on your Linux router if you also say Y to "QoS support",
- "Packet classifier API" and to some classifiers below. Documentation
- and software is at <http://diffserv.sourceforge.net/>.
-
- If you say Y here and to "/proc file system" below, you will be able
- to read status information about packet schedulers from the file
- /proc/net/psched.
-
- The available schedulers are listed in the following questions; you
- can say Y to as many as you like. If unsure, say N now.
-
+source "net/econet/Kconfig"
+source "net/wanrouter/Kconfig"
source "net/sched/Kconfig"
-endmenu
-
menu "Network testing"
config NET_PKTGEN
@@ -614,33 +206,17 @@ config NET_PKTGEN
To compile this code as a module, choose M here: the
module will be called pktgen.
-endmenu
+source "net/netfilter/Kconfig"
endmenu
-config NETPOLL
- def_bool NETCONSOLE
-
-config NETPOLL_RX
- bool "Netpoll support for trapping incoming packets"
- default n
- depends on NETPOLL
-
-config NETPOLL_TRAP
- bool "Netpoll traffic trapping"
- default n
- depends on NETPOLL
-
-config NET_POLL_CONTROLLER
- def_bool NETPOLL
+endmenu
source "net/ax25/Kconfig"
-
source "net/irda/Kconfig"
-
source "net/bluetooth/Kconfig"
+source "net/ieee80211/Kconfig"
-source "drivers/net/Kconfig"
-
-endmenu
+endif # if NET
+endmenu # Networking
diff --git a/net/Makefile b/net/Makefile
index 8e2bdc0..4aa2f46 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
obj-$(CONFIG_LLC) += llc/
obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
obj-$(CONFIG_INET) += ipv4/
+obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
ifneq ($(CONFIG_IPV6),)
@@ -41,7 +42,9 @@ obj-$(CONFIG_ATM) += atm/
obj-$(CONFIG_DECNET) += decnet/
obj-$(CONFIG_ECONET) += econet/
obj-$(CONFIG_VLAN_8021Q) += 8021q/
+obj-$(CONFIG_IP_DCCP) += dccp/
obj-$(CONFIG_IP_SCTP) += sctp/
+obj-$(CONFIG_IEEE80211) += ieee80211/
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 54640c0..7076097 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -35,6 +35,7 @@
#include <net/datalink.h>
#include <net/psnap.h>
#include <linux/atalk.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif)
aarp_send_probe(atif->dev, &atif->address);
/* Defer 1/10th */
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ / 10);
+ msleep(100);
if (atif->status & ATIF_PROBE_FAIL)
break;
@@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
aarp_send_probe(atif->dev, sa);
/* Defer 1/10th */
- current->state = TASK_INTERRUPTIBLE;
write_unlock_bh(&aarp_lock);
- schedule_timeout(HZ / 10);
+ msleep(100);
write_lock_bh(&aarp_lock);
if (entry->status & ATIF_PROBE_FAIL)
@@ -565,7 +564,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
* numbers we just happen to need. Now put the
* length in the lower two.
*/
- *((__u16 *)skb->data) = htons(skb->len);
+ *((__be16 *)skb->data) = htons(skb->len);
ft = 1;
}
/*
@@ -699,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a,
* frame. We currently only support Ethernet.
*/
static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct elapaarp *ea = aarp_hdr(skb);
int hash, ret = 0;
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 876dbac..1d31b3a 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -53,12 +53,12 @@
#include <linux/config.h>
#include <linux/module.h>
-#include <linux/tcp.h>
#include <linux/if_arp.h>
#include <linux/termios.h> /* For TIOCOUTQ/INQ */
#include <net/datalink.h>
#include <net/psnap.h>
#include <net/sock.h>
+#include <net/tcp_states.h>
#include <net/route.h>
#include <linux/atalk.h>
@@ -401,7 +401,7 @@ out_err:
}
/* Find a match for a specific network:node pair */
-static struct atalk_iface *atalk_find_interface(int net, int node)
+static struct atalk_iface *atalk_find_interface(__be16 net, int node)
{
struct atalk_iface *iface;
@@ -1390,7 +1390,7 @@ free_it:
* [ie ARPHRD_ETHERTALK]
*/
static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct ddpehdr *ddp;
struct sock *sock;
@@ -1482,7 +1482,7 @@ freeit:
* header and append a long one.
*/
static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
/* Expand any short form frames */
if (skb->mac.raw[2] == 1) {
@@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
}
skb->h.raw = skb->data;
- return atalk_rcv(skb, dev, pt);
+ return atalk_rcv(skb, dev, pt, orig_dev);
freeit:
kfree_skb(skb);
return 0;
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 0000000..21ff276
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,74 @@
+#
+# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
+#
+
+config ATM
+ tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ ---help---
+ ATM is a high-speed networking technology for Local Area Networks
+ and Wide Area Networks. It uses a fixed packet size and is
+ connection oriented, allowing for the negotiation of minimum
+ bandwidth requirements.
+
+ In order to participate in an ATM network, your Linux box needs an
+ ATM networking card. If you have that, say Y here and to the driver
+ of your ATM card below.
+
+ Note that you need a set of user-space programs to actually make use
+ of ATM. See the file <file:Documentation/networking/atm.txt> for
+ further details.
+
+config ATM_CLIP
+ tristate "Classical IP over ATM (EXPERIMENTAL)"
+ depends on ATM && INET
+ help
+ Classical IP over ATM for PVCs and SVCs, supporting InARP and
+ ATMARP. If you want to communication with other IP hosts on your ATM
+ network, you will typically either say Y here or to "LAN Emulation
+ (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+ bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
+ depends on ATM_CLIP
+ help
+ Normally, an "ICMP host unreachable" message is sent if a neighbour
+ cannot be reached because there is no VC to it in the kernel's
+ ATMARP table. This may cause problems when ATMARP table entries are
+ briefly removed during revalidation. If you say Y here, packets to
+ such neighbours are silently discarded instead.
+
+config ATM_LANE
+ tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
+ depends on ATM
+ help
+ LAN Emulation emulates services of existing LANs across an ATM
+ network. Besides operating as a normal ATM end station client, Linux
+ LANE client can also act as an proxy client bridging packets between
+ ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+ tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
+ depends on ATM && INET && ATM_LANE!=n
+ help
+ Multi-Protocol Over ATM allows ATM edge devices such as routers,
+ bridges and ATM attached hosts establish direct ATM VCs across
+ subnetwork boundaries. These shortcut connections bypass routers
+ enhancing overall network performance.
+
+config ATM_BR2684
+ tristate "RFC1483/2684 Bridged protocols"
+ depends on ATM && INET
+ help
+ ATM PVCs can carry ethernet PDUs according to RFC2684 (formerly 1483)
+ This device will act like an ethernet from the kernels point of view,
+ with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+ This is sometimes used over DSL lines. If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+ bool "Per-VC IP filter kludge"
+ depends on ATM_BR2684
+ help
+ This is an experimental mechanism for users who need to terminate a
+ large number of IP-only vcc's. Do not enable this unless you are sure
+ you know what you are doing.
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf..289956c 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
* This is similar to eth_type_trans, which cannot be used because of
* our dev->hard_header_len
*/
-static inline unsigned short br_type_trans(struct sk_buff *skb,
- struct net_device *dev)
+static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct ethhdr *eth;
unsigned char *rawp;
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 4dbb5af..d89056e 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -21,6 +21,7 @@
#include "resources.h"
#include "signaling.h" /* for WAITING and sigd_attach */
+#include "common.h"
static DECLARE_MUTEX(ioctl_mutex);
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c
index 181a300..4b1faca 100644
--- a/net/atm/ipcommon.c
+++ b/net/atm/ipcommon.c
@@ -34,7 +34,6 @@
void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
{
- struct sk_buff *skb;
unsigned long flags;
struct sk_buff *skb_from = (struct sk_buff *) from;
struct sk_buff *skb_to = (struct sk_buff *) to;
@@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
prev->next = skb_to;
to->prev->next = from->next;
to->prev = from->prev;
- for (skb = from->next; skb != skb_to; skb = skb->next)
- skb->list = to;
to->qlen += from->qlen;
spin_unlock(&to->lock);
from->prev = skb_from;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 02f5374..08e4605 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -118,10 +118,6 @@ static int svc_bind(struct socket *sock,struct sockaddr *sockaddr,
goto out;
}
vcc = ATM_SD(sock);
- if (test_bit(ATM_VF_SESSION, &vcc->flags)) {
- error = -EINVAL;
- goto out;
- }
addr = (struct sockaddr_atmsvc *) sockaddr;
if (addr->sas_family != AF_ATMSVC) {
error = -EAFNOSUPPORT;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 707097d..ed705dd 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -45,7 +45,7 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/spinlock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -875,12 +875,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
sk->sk_sleep = osk->sk_sleep;
-
- if (sock_flag(osk, SOCK_DBG))
- sock_set_flag(sk, SOCK_DBG);
-
- if (sock_flag(osk, SOCK_ZAPPED))
- sock_set_flag(sk, SOCK_ZAPPED);
+ sock_copy_flags(sk, osk);
oax25 = ax25_sk(osk);
@@ -1007,7 +1002,8 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sock *sk = sock->sk;
struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
ax25_dev *ax25_dev = NULL;
- ax25_address *call;
+ ax25_uid_assoc *user;
+ ax25_address call;
ax25_cb *ax25;
int err = 0;
@@ -1026,9 +1022,15 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr->fsa_ax25.sax25_family != AF_AX25)
return -EINVAL;
- call = ax25_findbyuid(current->euid);
- if (call == NULL && ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
- return -EACCES;
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ call = user->call;
+ ax25_uid_put(user);
+ } else {
+ if (ax25_uid_policy && !capable(CAP_NET_ADMIN))
+ return -EACCES;
+
+ call = addr->fsa_ax25.sax25_call;
}
lock_sock(sk);
@@ -1039,10 +1041,7 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- if (call == NULL)
- ax25->source_addr = addr->fsa_ax25.sax25_call;
- else
- ax25->source_addr = *call;
+ ax25->source_addr = call;
/*
* User already set interface with SO_BINDTODEVICE
@@ -1875,6 +1874,7 @@ static void ax25_info_stop(struct seq_file *seq, void *v)
static int ax25_info_show(struct seq_file *seq, void *v)
{
ax25_cb *ax25 = v;
+ char buf[11];
int k;
@@ -1886,13 +1886,13 @@ static int ax25_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "%8.8lx %s %s%s ",
(long) ax25,
ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
- ax2asc(&ax25->source_addr),
+ ax2asc(buf, &ax25->source_addr),
ax25->iamdigi? "*":"");
- seq_printf(seq, "%s", ax2asc(&ax25->dest_addr));
+ seq_printf(seq, "%s", ax2asc(buf, &ax25->dest_addr));
for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) {
seq_printf(seq, ",%s%s",
- ax2asc(&ax25->digipeat->calls[k]),
+ ax2asc(buf, &ax25->digipeat->calls[k]),
ax25->digipeat->repeated[k]? "*":"");
}
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index f4fa6df..dca179d 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -36,9 +36,8 @@ ax25_address null_ax25_address = {{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00}};
/*
* ax25 -> ascii conversion
*/
-char *ax2asc(ax25_address *a)
+char *ax2asc(char *buf, ax25_address *a)
{
- static char buf[11];
char c, *s;
int n;
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 8adc002..edcaa89 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,8 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 3a8b673..061083e 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -18,7 +18,7 @@
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 3dc808f..810c9c7 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -9,7 +9,6 @@
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
* Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
*/
-#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -26,9 +25,7 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
-#include <net/arp.h> /* For arp_rcv */
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
pid = *skb->data;
-#ifdef CONFIG_INET
if (pid == AX25_P_IP) {
/* working around a TCP bug to keep additional listeners
* happy. TCP re-uses the buffer and destroys the original
@@ -132,10 +128,9 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
skb->dev = ax25->ax25_dev->dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_IP);
- ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */
+ netif_rx(skb);
return 1;
}
-#endif
if (pid == AX25_P_SEGMENT) {
skb_pull(skb, 1); /* Remove PID */
return ax25_rx_fragment(ax25, skb);
@@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
/* Now we are pointing at the pid byte */
switch (skb->data[1]) {
-#ifdef CONFIG_INET
case AX25_P_IP:
skb_pull(skb,2); /* drop PID/CTRL */
skb->h.raw = skb->data;
@@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_IP);
- ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */
+ netif_rx(skb);
break;
case AX25_P_ARP:
@@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
skb->protocol = htons(ETH_P_ARP);
- arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */
+ netif_rx(skb);
break;
-#endif
case AX25_P_TEXT:
/* Now find a suitable dgram socket */
sk = ax25_get_socket(&dest, &src, SOCK_DGRAM);
@@ -454,7 +447,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
* Receive an AX.25 frame via a SLIP interface.
*/
int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype)
+ struct packet_type *ptype, struct net_device *orig_dev)
{
skb->sk = NULL; /* Initially we don't know who it's for */
skb->destructor = NULL; /* Who initializes this, dammit?! */
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 44b99b1f..26b77d9 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -298,6 +298,8 @@ static void ax25_rt_seq_stop(struct seq_file *seq, void *v)
static int ax25_rt_seq_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_puts(seq, "callsign dev mode digipeaters\n");
else {
@@ -308,7 +310,7 @@ static int ax25_rt_seq_show(struct seq_file *seq, void *v)
if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0)
callsign = "default";
else
- callsign = ax2asc(&ax25_rt->callsign);
+ callsign = ax2asc(buf, &ax25_rt->callsign);
seq_printf(seq, "%-9s %-4s",
callsign,
@@ -328,7 +330,8 @@ static int ax25_rt_seq_show(struct seq_file *seq, void *v)
if (ax25_rt->digipeat != NULL)
for (i = 0; i < ax25_rt->digipeat->ndigi; i++)
- seq_printf(seq, " %s", ax2asc(&ax25_rt->digipeat->calls[i]));
+ seq_printf(seq, " %s",
+ ax2asc(buf, &ax25_rt->digipeat->calls[i]));
seq_puts(seq, "\n");
}
@@ -422,8 +425,8 @@ static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
*/
int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
{
+ ax25_uid_assoc *user;
ax25_route *ax25_rt;
- ax25_address *call;
int err;
if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
@@ -434,16 +437,18 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
goto put;
}
- if ((call = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ ax25->source_addr = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
err = -EPERM;
goto put;
}
- call = (ax25_address *)ax25->ax25_dev->dev->dev_addr;
+ ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
}
- ax25->source_addr = *call;
-
if (ax25_rt->digipeat != NULL) {
if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
err = -ENOMEM;
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 7131873..f6ed283 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,8 +29,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 066897b..a29c480 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 99694b5..c41dbe5 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -24,7 +24,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25)
if (skb_prev == NULL)
skb_queue_head(&ax25->write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &ax25->write_queue);
skb_prev = skb;
}
}
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index cea6b7d..d53cc86 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -28,6 +28,7 @@
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -41,38 +42,41 @@
* Callsign/UID mapper. This is in kernel space for security on multi-amateur machines.
*/
-static ax25_uid_assoc *ax25_uid_list;
+HLIST_HEAD(ax25_uid_list);
static DEFINE_RWLOCK(ax25_uid_lock);
int ax25_uid_policy = 0;
-ax25_address *ax25_findbyuid(uid_t uid)
+ax25_uid_assoc *ax25_findbyuid(uid_t uid)
{
- ax25_uid_assoc *ax25_uid;
- ax25_address *res = NULL;
+ ax25_uid_assoc *ax25_uid, *res = NULL;
+ struct hlist_node *node;
read_lock(&ax25_uid_lock);
- for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) {
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
if (ax25_uid->uid == uid) {
- res = &ax25_uid->call;
+ ax25_uid_hold(ax25_uid);
+ res = ax25_uid;
break;
}
}
read_unlock(&ax25_uid_lock);
- return NULL;
+ return res;
}
int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
{
- ax25_uid_assoc *s, *ax25_uid;
+ ax25_uid_assoc *ax25_uid;
+ struct hlist_node *node;
+ ax25_uid_assoc *user;
unsigned long res;
switch (cmd) {
case SIOCAX25GETUID:
res = -ENOENT;
read_lock(&ax25_uid_lock);
- for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) {
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
res = ax25_uid->uid;
break;
@@ -85,19 +89,22 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
case SIOCAX25ADDUID:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (ax25_findbyuid(sax->sax25_uid))
+ user = ax25_findbyuid(sax->sax25_uid);
+ if (user) {
+ ax25_uid_put(user);
return -EEXIST;
+ }
if (sax->sax25_uid == 0)
return -EINVAL;
if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL)
return -ENOMEM;
+ atomic_set(&ax25_uid->refcount, 1);
ax25_uid->uid = sax->sax25_uid;
ax25_uid->call = sax->sax25_call;
write_lock(&ax25_uid_lock);
- ax25_uid->next = ax25_uid_list;
- ax25_uid_list = ax25_uid;
+ hlist_add_head(&ax25_uid->uid_node, &ax25_uid_list);
write_unlock(&ax25_uid_lock);
return 0;
@@ -106,34 +113,21 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
if (!capable(CAP_NET_ADMIN))
return -EPERM;
+ ax25_uid = NULL;
write_lock(&ax25_uid_lock);
- for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) {
- if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) {
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+ if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0)
break;
- }
}
if (ax25_uid == NULL) {
write_unlock(&ax25_uid_lock);
return -ENOENT;
}
- if ((s = ax25_uid_list) == ax25_uid) {
- ax25_uid_list = s->next;
- write_unlock(&ax25_uid_lock);
- kfree(ax25_uid);
- return 0;
- }
- while (s != NULL && s->next != NULL) {
- if (s->next == ax25_uid) {
- s->next = ax25_uid->next;
- write_unlock(&ax25_uid_lock);
- kfree(ax25_uid);
- return 0;
- }
- s = s->next;
- }
+ hlist_del_init(&ax25_uid->uid_node);
+ ax25_uid_put(ax25_uid);
write_unlock(&ax25_uid_lock);
- return -ENOENT;
+ return 0;
default:
return -EINVAL;
@@ -147,13 +141,11 @@ int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax)
static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ax25_uid_assoc *pt;
- int i = 1;
+ struct hlist_node *node;
+ int i = 0;
read_lock(&ax25_uid_lock);
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- for (pt = ax25_uid_list; pt != NULL; pt = pt->next) {
+ ax25_uid_for_each(pt, node, &ax25_uid_list) {
if (i == *pos)
return pt;
++i;
@@ -164,8 +156,9 @@ static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos)
static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
- return (v == SEQ_START_TOKEN) ? ax25_uid_list :
- ((struct ax25_uid_assoc *) v)->next;
+
+ return hlist_entry(((ax25_uid_assoc *)v)->uid_node.next,
+ ax25_uid_assoc, uid_node);
}
static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
@@ -175,13 +168,14 @@ static void ax25_uid_seq_stop(struct seq_file *seq, void *v)
static int ax25_uid_seq_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_printf(seq, "Policy: %d\n", ax25_uid_policy);
else {
struct ax25_uid_assoc *pt = v;
-
- seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call));
+ seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(buf, &pt->call));
}
return 0;
}
@@ -213,16 +207,13 @@ struct file_operations ax25_uid_fops = {
*/
void __exit ax25_uid_free(void)
{
- ax25_uid_assoc *s, *ax25_uid;
+ ax25_uid_assoc *ax25_uid;
+ struct hlist_node *node;
write_lock(&ax25_uid_lock);
- ax25_uid = ax25_uid_list;
- while (ax25_uid != NULL) {
- s = ax25_uid;
- ax25_uid = ax25_uid->next;
-
- kfree(s);
+ ax25_uid_for_each(ax25_uid, node, &ax25_uid_list) {
+ hlist_del_init(&ax25_uid->uid_node);
+ ax25_uid_put(ax25_uid);
}
- ax25_uid_list = NULL;
write_unlock(&ax25_uid_lock);
}
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 2e341de..901eff7 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
{
struct sk_buff *skb, *nskb;
unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session)
if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
BT_ERR("Can't allocate memory for new frame");
- return -ENOMEM;
+ return;
}
while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session)
cmtp_send_frame(session, nskb->data, nskb->len);
kfree_skb(nskb);
-
- return skb_queue_len(&session->transmit);
}
static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index fb55243..55dc42e 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
/* Special commands */
while ((skb = skb_dequeue(&hdev->driver_init))) {
- skb->pkt_type = HCI_COMMAND_PKT;
+ bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
skb->dev = (void *) hdev;
skb_queue_tail(&hdev->cmd_q, skb);
hci_sched_cmd(hdev);
@@ -299,7 +299,6 @@ struct hci_dev *hci_dev_get(int index)
read_unlock(&hci_dev_list_lock);
return hdev;
}
-EXPORT_SYMBOL(hci_dev_get);
/* ---- Inquiry support ---- */
static void inquiry_cache_flush(struct hci_dev *hdev)
@@ -996,11 +995,11 @@ static int hci_send_frame(struct sk_buff *skb)
return -ENODEV;
}
- BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len);
+ BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
if (atomic_read(&hdev->promisc)) {
/* Time stamp */
- do_gettimeofday(&skb->stamp);
+ __net_timestamp(skb);
hci_send_to_sock(hdev, skb);
}
@@ -1035,14 +1034,13 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p
BT_DBG("skb len %d", skb->len);
- skb->pkt_type = HCI_COMMAND_PKT;
+ bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
skb->dev = (void *) hdev;
skb_queue_tail(&hdev->cmd_q, skb);
hci_sched_cmd(hdev);
return 0;
}
-EXPORT_SYMBOL(hci_send_cmd);
/* Get data from the previously sent command */
void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf)
@@ -1083,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags);
skb->dev = (void *) hdev;
- skb->pkt_type = HCI_ACLDATA_PKT;
+ bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
hci_add_acl_hdr(skb, conn->handle, flags | ACL_START);
if (!(list = skb_shinfo(skb)->frag_list)) {
@@ -1105,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
skb = list; list = list->next;
skb->dev = (void *) hdev;
- skb->pkt_type = HCI_ACLDATA_PKT;
+ bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT);
BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
@@ -1141,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE);
skb->dev = (void *) hdev;
- skb->pkt_type = HCI_SCODATA_PKT;
+ bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
skb_queue_tail(&conn->data_q, skb);
hci_sched_tx(hdev);
return 0;
@@ -1371,7 +1369,7 @@ void hci_rx_task(unsigned long arg)
if (test_bit(HCI_INIT, &hdev->flags)) {
/* Don't process data packets in this states. */
- switch (skb->pkt_type) {
+ switch (bt_cb(skb)->pkt_type) {
case HCI_ACLDATA_PKT:
case HCI_SCODATA_PKT:
kfree_skb(skb);
@@ -1380,7 +1378,7 @@ void hci_rx_task(unsigned long arg)
}
/* Process frame */
- switch (skb->pkt_type) {
+ switch (bt_cb(skb)->pkt_type) {
case HCI_EVENT_PKT:
hci_event_packet(hdev, skb);
break;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c4b592b..d6da093 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff
/* Inquiry Result */
static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
+ struct inquiry_data data;
struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1);
int num_rsp = *((__u8 *) skb->data);
BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ if (!num_rsp)
+ return;
+
hci_dev_lock(hdev);
+
for (; num_rsp; num_rsp--) {
- struct inquiry_data data;
bacpy(&data.bdaddr, &info->bdaddr);
data.pscan_rep_mode = info->pscan_rep_mode;
data.pscan_period_mode = info->pscan_period_mode;
@@ -502,30 +506,55 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *
info++;
hci_inquiry_cache_update(hdev, &data);
}
+
hci_dev_unlock(hdev);
}
/* Inquiry Result With RSSI */
static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1);
+ struct inquiry_data data;
int num_rsp = *((__u8 *) skb->data);
BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ if (!num_rsp)
+ return;
+
hci_dev_lock(hdev);
- for (; num_rsp; num_rsp--) {
- struct inquiry_data data;
- bacpy(&data.bdaddr, &info->bdaddr);
- data.pscan_rep_mode = info->pscan_rep_mode;
- data.pscan_period_mode = info->pscan_period_mode;
- data.pscan_mode = 0x00;
- memcpy(data.dev_class, info->dev_class, 3);
- data.clock_offset = info->clock_offset;
- data.rssi = info->rssi;
- info++;
- hci_inquiry_cache_update(hdev, &data);
+
+ if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
+ struct inquiry_info_with_rssi_and_pscan_mode *info =
+ (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1);
+
+ for (; num_rsp; num_rsp--) {
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = info->pscan_mode;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ info++;
+ hci_inquiry_cache_update(hdev, &data);
+ }
+ } else {
+ struct inquiry_info_with_rssi *info =
+ (struct inquiry_info_with_rssi *) (skb->data + 1);
+
+ for (; num_rsp; num_rsp--) {
+ bacpy(&data.bdaddr, &info->bdaddr);
+ data.pscan_rep_mode = info->pscan_rep_mode;
+ data.pscan_period_mode = info->pscan_period_mode;
+ data.pscan_mode = 0x00;
+ memcpy(data.dev_class, info->dev_class, 3);
+ data.clock_offset = info->clock_offset;
+ data.rssi = info->rssi;
+ info++;
+ hci_inquiry_cache_update(hdev, &data);
+ }
}
+
hci_dev_unlock(hdev);
}
@@ -865,6 +894,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk
hci_dev_unlock(hdev);
}
+/* Page Scan Repetition Mode */
+static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data;
+ struct inquiry_entry *ie;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) {
+ ie->data.pscan_rep_mode = ev->pscan_rep_mode;
+ ie->timestamp = jiffies;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data;
@@ -937,6 +984,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
hci_clock_offset_evt(hdev, skb);
break;
+ case HCI_EV_PSCAN_REP_MODE:
+ hci_pscan_rep_mode_evt(hdev, skb);
+ break;
+
case HCI_EV_CMD_STATUS:
cs = (struct hci_ev_cmd_status *) skb->data;
skb_pull(skb, sizeof(cs));
@@ -1035,9 +1086,11 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
ev->type = type;
memcpy(ev->data, data, dlen);
- skb->pkt_type = HCI_EVENT_PKT;
+ bt_cb(skb)->incoming = 1;
+ __net_timestamp(skb);
+
+ bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
skb->dev = (void *) hdev;
hci_send_to_sock(hdev, skb);
kfree_skb(skb);
}
-EXPORT_SYMBOL(hci_si_event);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index ebdcce5..32ef797 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
/* Apply filter */
flt = &hci_pi(sk)->filter;
- if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ?
- 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
+ if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ?
+ 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
continue;
- if (skb->pkt_type == HCI_EVENT_PKT) {
+ if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) {
register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
if (!hci_test_bit(evt, &flt->event_mask))
@@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
/* Put type byte before the data */
- memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1);
+ memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1);
if (sock_queue_rcv_skb(sk, nskb))
kfree_skb(nskb);
@@ -327,11 +327,17 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_
{
__u32 mask = hci_pi(sk)->cmsg_mask;
- if (mask & HCI_CMSG_DIR)
- put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming);
+ if (mask & HCI_CMSG_DIR) {
+ int incoming = bt_cb(skb)->incoming;
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming);
+ }
+
+ if (mask & HCI_CMSG_TSTAMP) {
+ struct timeval tv;
- if (mask & HCI_CMSG_TSTAMP)
- put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp);
+ skb_get_timestamp(skb, &tv);
+ put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv);
+ }
}
static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -405,11 +411,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
goto drop;
}
- skb->pkt_type = *((unsigned char *) skb->data);
+ bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
skb_pull(skb, 1);
skb->dev = (void *) hdev;
- if (skb->pkt_type == HCI_COMMAND_PKT) {
+ if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data));
u16 ogf = hci_opcode_ogf(opcode);
u16 ocf = hci_opcode_ocf(opcode);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index affbc554..de8af5f 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
{
struct sk_buff *skb;
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session)
hidp_set_timer(session);
kfree_skb(skb);
}
-
- return skb_queue_len(&session->ctrl_transmit) +
- skb_queue_len(&session->intr_transmit);
}
static int hidp_session(void *arg)
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 32fccfb..d3d6bc5 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -372,7 +372,7 @@ static struct proto l2cap_proto = {
.obj_size = sizeof(struct l2cap_pinfo)
};
-static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
{
struct sock *sk;
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 9efb0a0..ee6a669 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -34,31 +34,6 @@
#include <net/bluetooth/bluetooth.h>
-void bt_dump(char *pref, __u8 *buf, int count)
-{
- char *ptr;
- char line[100];
- unsigned int i;
-
- printk(KERN_INFO "%s: dump, len %d\n", pref, count);
-
- ptr = line;
- *ptr = 0;
- for (i = 0; i < count; i++) {
- ptr += sprintf(ptr, " %2.2X", buf[i]);
-
- if (i && !((i + 1) % 20)) {
- printk(KERN_INFO "%s:%s\n", pref, line);
- ptr = line;
- *ptr = 0;
- }
- }
-
- if (line[0])
- printk(KERN_INFO "%s:%s\n", pref, line);
-}
-EXPORT_SYMBOL(bt_dump);
-
void baswap(bdaddr_t *dst, bdaddr_t *src)
{
unsigned char *d = (unsigned char *) dst;
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index e9e6fda..173f46e 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -21,10 +21,6 @@
SOFTWARE IS DISCLAIMED.
*/
-/*
- RPN support - Dirk Husemann <hud@zurich.ibm.com>
-*/
-
/*
* Bluetooth RFCOMM core.
*
@@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
#define __get_mcc_len(b) ((b & 0xfe) >> 1)
/* RPN macros */
-#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3))
+#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3))
#define __get_rpn_data_bits(line) ((line) & 0x3)
#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
-#define __get_rpn_parity(line) (((line) >> 3) & 0x3)
+#define __get_rpn_parity(line) (((line) >> 3) & 0x7)
static inline void rfcomm_schedule(uint event)
{
@@ -233,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
d->rx_credits = RFCOMM_DEFAULT_CREDITS;
}
-struct rfcomm_dlc *rfcomm_dlc_alloc(int prio)
+struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio)
{
struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio);
if (!d)
@@ -389,8 +385,6 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
rfcomm_dlc_unlock(d);
skb_queue_purge(&d->tx_queue);
- rfcomm_session_put(s);
-
rfcomm_dlc_unlink(d);
}
@@ -600,8 +594,6 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
goto failed;
}
- rfcomm_session_hold(s);
-
s->initiator = 1;
bacpy(&addr.l2_bdaddr, dst);
@@ -784,10 +776,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d
return rfcomm_send_frame(s, buf, ptr - buf);
}
-static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
- u8 bit_rate, u8 data_bits, u8 stop_bits,
- u8 parity, u8 flow_ctrl_settings,
- u8 xon_char, u8 xoff_char, u16 param_mask)
+int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
+ u8 bit_rate, u8 data_bits, u8 stop_bits,
+ u8 parity, u8 flow_ctrl_settings,
+ u8 xon_char, u8 xoff_char, u16 param_mask)
{
struct rfcomm_hdr *hdr;
struct rfcomm_mcc *mcc;
@@ -795,9 +787,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
u8 buf[16], *ptr = buf;
BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
- "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
- s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
- flow_ctrl_settings, xon_char, xoff_char, param_mask);
+ " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x",
+ s, cr, dlci, bit_rate, data_bits, stop_bits, parity,
+ flow_ctrl_settings, xon_char, xoff_char, param_mask);
hdr = (void *) ptr; ptr += sizeof(*hdr);
hdr->addr = __addr(s->initiator, 0);
@@ -1269,16 +1261,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
u8 xon_char = 0;
u8 xoff_char = 0;
u16 rpn_mask = RFCOMM_RPN_PM_ALL;
-
- BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
- dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
- rpn->xon_char, rpn->xoff_char, rpn->param_mask);
-
- if (!cr)
+
+ BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
+ dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
+ rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+
+ if (!cr)
return 0;
-
+
if (len == 1) {
- /* request: return default setting */
+ /* This is a request, return default settings */
bit_rate = RFCOMM_RPN_BR_115200;
data_bits = RFCOMM_RPN_DATA_8;
stop_bits = RFCOMM_RPN_STOP_1;
@@ -1286,11 +1278,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
flow_ctrl = RFCOMM_RPN_FLOW_NONE;
xon_char = RFCOMM_RPN_XON_CHAR;
xoff_char = RFCOMM_RPN_XOFF_CHAR;
-
goto rpn_out;
}
- /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity,
- no flow control lines, normal XON/XOFF chars */
+
+ /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
+ * no parity, no flow control lines, normal XON/XOFF chars */
+
if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) {
bit_rate = rpn->bit_rate;
if (bit_rate != RFCOMM_RPN_BR_115200) {
@@ -1299,6 +1292,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_DATA) {
data_bits = __get_rpn_data_bits(rpn->line_settings);
if (data_bits != RFCOMM_RPN_DATA_8) {
@@ -1307,6 +1301,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_DATA;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_STOP) {
stop_bits = __get_rpn_stop_bits(rpn->line_settings);
if (stop_bits != RFCOMM_RPN_STOP_1) {
@@ -1315,6 +1310,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_STOP;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) {
parity = __get_rpn_parity(rpn->line_settings);
if (parity != RFCOMM_RPN_PARITY_NONE) {
@@ -1323,6 +1319,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_PARITY;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) {
flow_ctrl = rpn->flow_ctrl;
if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) {
@@ -1331,6 +1328,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_FLOW;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_XON) {
xon_char = rpn->xon_char;
if (xon_char != RFCOMM_RPN_XON_CHAR) {
@@ -1339,6 +1337,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
rpn_mask ^= RFCOMM_RPN_PM_XON;
}
}
+
if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) {
xoff_char = rpn->xoff_char;
if (xoff_char != RFCOMM_RPN_XOFF_CHAR) {
@@ -1349,9 +1348,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
}
rpn_out:
- rfcomm_send_rpn(s, 0, dlci,
- bit_rate, data_bits, stop_bits, parity, flow_ctrl,
- xon_char, xoff_char, rpn_mask);
+ rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits,
+ parity, flow_ctrl, xon_char, xoff_char, rpn_mask);
return 0;
}
@@ -1362,14 +1360,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb
u8 dlci = __get_dlci(rls->dlci);
BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
-
+
if (!cr)
return 0;
- /* FIXME: We should probably do something with this
- information here. But for now it's sufficient just
- to reply -- Bluetooth 1.1 says it's mandatory to
- recognise and respond to RLS */
+ /* We should probably do something with this information here. But
+ * for now it's sufficient just to reply -- Bluetooth 1.1 says it's
+ * mandatory to recognise and respond to RLS */
rfcomm_send_rls(s, 0, dlci, rls->status);
@@ -1385,7 +1382,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
d = rfcomm_dlc_get(s, dlci);
- if (!d)
+ if (!d)
return 0;
if (cr) {
@@ -1393,7 +1390,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
set_bit(RFCOMM_TX_THROTTLED, &d->flags);
else
clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
-
+
rfcomm_dlc_lock(d);
if (d->modem_status)
d->modem_status(d, msc->v24_sig);
@@ -1402,7 +1399,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
d->mscex |= RFCOMM_MSCEX_RX;
- } else
+ } else
d->mscex |= RFCOMM_MSCEX_TX;
return 0;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f3f6355..90e19eb 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -284,7 +284,7 @@ static struct proto rfcomm_proto = {
.obj_size = sizeof(struct rfcomm_pinfo)
};
-static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
{
struct rfcomm_dlc *d;
struct sock *sk;
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
- signal_pending(current) || !timeo)
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ sk->sk_err ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current) ||
+ !timeo)
break;
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d68920..1bca860 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de
skb->destructor = rfcomm_wfree;
}
-static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority)
+static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority)
{
if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) {
struct sk_buff *skb = alloc_skb(size, priority);
@@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
struct rfcomm_dev *dev = dlc->owner;
if (!dev)
return;
-
+
BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
+ if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
+ if (dev->tty && !C_CLOCAL(dev->tty))
+ tty_hangup(dev->tty);
+ }
+
dev->modem_status =
((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
@@ -740,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
return -ENOIOCTLCMD;
}
-#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
-
static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
{
- BT_DBG("tty %p", tty);
+ struct termios *new = (struct termios *) tty->termios;
+ int old_baud_rate = tty_termios_baud_rate(old);
+ int new_baud_rate = tty_termios_baud_rate(new);
- if ((tty->termios->c_cflag == old->c_cflag) &&
- (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag)))
- return;
+ u8 baud, data_bits, stop_bits, parity, x_on, x_off;
+ u16 changes = 0;
+
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+ BT_DBG("tty %p termios %p", tty, old);
+
+ /* Handle turning off CRTSCTS */
+ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS))
+ BT_DBG("Turning off CRTSCTS unsupported");
+
+ /* Parity on/off and when on, odd/even */
+ if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
+ ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) {
+ changes |= RFCOMM_RPN_PM_PARITY;
+ BT_DBG("Parity change detected.");
+ }
+
+ /* Mark and space parity are not supported! */
+ if (new->c_cflag & PARENB) {
+ if (new->c_cflag & PARODD) {
+ BT_DBG("Parity is ODD");
+ parity = RFCOMM_RPN_PARITY_ODD;
+ } else {
+ BT_DBG("Parity is EVEN");
+ parity = RFCOMM_RPN_PARITY_EVEN;
+ }
+ } else {
+ BT_DBG("Parity is OFF");
+ parity = RFCOMM_RPN_PARITY_NONE;
+ }
+
+ /* Setting the x_on / x_off characters */
+ if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) {
+ BT_DBG("XOFF custom");
+ x_on = new->c_cc[VSTOP];
+ changes |= RFCOMM_RPN_PM_XON;
+ } else {
+ BT_DBG("XOFF default");
+ x_on = RFCOMM_RPN_XON_CHAR;
+ }
+
+ if (old->c_cc[VSTART] != new->c_cc[VSTART]) {
+ BT_DBG("XON custom");
+ x_off = new->c_cc[VSTART];
+ changes |= RFCOMM_RPN_PM_XOFF;
+ } else {
+ BT_DBG("XON default");
+ x_off = RFCOMM_RPN_XOFF_CHAR;
+ }
+
+ /* Handle setting of stop bits */
+ if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB))
+ changes |= RFCOMM_RPN_PM_STOP;
+
+ /* POSIX does not support 1.5 stop bits and RFCOMM does not
+ * support 2 stop bits. So a request for 2 stop bits gets
+ * translated to 1.5 stop bits */
+ if (new->c_cflag & CSTOPB) {
+ stop_bits = RFCOMM_RPN_STOP_15;
+ } else {
+ stop_bits = RFCOMM_RPN_STOP_1;
+ }
+
+ /* Handle number of data bits [5-8] */
+ if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE))
+ changes |= RFCOMM_RPN_PM_DATA;
+
+ switch (new->c_cflag & CSIZE) {
+ case CS5:
+ data_bits = RFCOMM_RPN_DATA_5;
+ break;
+ case CS6:
+ data_bits = RFCOMM_RPN_DATA_6;
+ break;
+ case CS7:
+ data_bits = RFCOMM_RPN_DATA_7;
+ break;
+ case CS8:
+ data_bits = RFCOMM_RPN_DATA_8;
+ break;
+ default:
+ data_bits = RFCOMM_RPN_DATA_8;
+ break;
+ }
+
+ /* Handle baudrate settings */
+ if (old_baud_rate != new_baud_rate)
+ changes |= RFCOMM_RPN_PM_BITRATE;
- /* handle turning off CRTSCTS */
- if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) {
- BT_DBG("turning off CRTSCTS");
+ switch (new_baud_rate) {
+ case 2400:
+ baud = RFCOMM_RPN_BR_2400;
+ break;
+ case 4800:
+ baud = RFCOMM_RPN_BR_4800;
+ break;
+ case 7200:
+ baud = RFCOMM_RPN_BR_7200;
+ break;
+ case 9600:
+ baud = RFCOMM_RPN_BR_9600;
+ break;
+ case 19200:
+ baud = RFCOMM_RPN_BR_19200;
+ break;
+ case 38400:
+ baud = RFCOMM_RPN_BR_38400;
+ break;
+ case 57600:
+ baud = RFCOMM_RPN_BR_57600;
+ break;
+ case 115200:
+ baud = RFCOMM_RPN_BR_115200;
+ break;
+ case 230400:
+ baud = RFCOMM_RPN_BR_230400;
+ break;
+ default:
+ /* 9600 is standard accordinag to the RFCOMM specification */
+ baud = RFCOMM_RPN_BR_9600;
+ break;
+
}
+
+ if (changes)
+ rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
+ data_bits, stop_bits, parity,
+ RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
+
+ return;
}
static void rfcomm_tty_throttle(struct tty_struct *tty)
@@ -761,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty)
struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
-
+
rfcomm_dlc_throttle(dev->dlc);
}
@@ -770,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty)
struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
-
+
rfcomm_dlc_unthrottle(dev->dlc);
}
@@ -781,7 +909,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
BT_DBG("tty %p dev %p", tty, dev);
- if (skb_queue_len(&dlc->tx_queue))
+ if (!skb_queue_empty(&dlc->tx_queue))
return dlc->mtu;
return 0;
@@ -841,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp)
static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- struct rfcomm_dlc *dlc = dev->dlc;
- u8 v24_sig;
+ struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dlc *dlc = dev->dlc;
+ u8 v24_sig;
BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear);
- rfcomm_dlc_get_modem_status(dlc, &v24_sig);
-
- if (set & TIOCM_DSR || set & TIOCM_DTR)
- v24_sig |= RFCOMM_V24_RTC;
- if (set & TIOCM_RTS || set & TIOCM_CTS)
- v24_sig |= RFCOMM_V24_RTR;
- if (set & TIOCM_RI)
- v24_sig |= RFCOMM_V24_IC;
- if (set & TIOCM_CD)
- v24_sig |= RFCOMM_V24_DV;
-
- if (clear & TIOCM_DSR || clear & TIOCM_DTR)
- v24_sig &= ~RFCOMM_V24_RTC;
- if (clear & TIOCM_RTS || clear & TIOCM_CTS)
- v24_sig &= ~RFCOMM_V24_RTR;
- if (clear & TIOCM_RI)
- v24_sig &= ~RFCOMM_V24_IC;
- if (clear & TIOCM_CD)
- v24_sig &= ~RFCOMM_V24_DV;
-
- rfcomm_dlc_set_modem_status(dlc, v24_sig);
-
- return 0;
+ rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+ if (set & TIOCM_DSR || set & TIOCM_DTR)
+ v24_sig |= RFCOMM_V24_RTC;
+ if (set & TIOCM_RTS || set & TIOCM_CTS)
+ v24_sig |= RFCOMM_V24_RTR;
+ if (set & TIOCM_RI)
+ v24_sig |= RFCOMM_V24_IC;
+ if (set & TIOCM_CD)
+ v24_sig |= RFCOMM_V24_DV;
+
+ if (clear & TIOCM_DSR || clear & TIOCM_DTR)
+ v24_sig &= ~RFCOMM_V24_RTC;
+ if (clear & TIOCM_RTS || clear & TIOCM_CTS)
+ v24_sig &= ~RFCOMM_V24_RTR;
+ if (clear & TIOCM_RI)
+ v24_sig &= ~RFCOMM_V24_IC;
+ if (clear & TIOCM_CD)
+ v24_sig &= ~RFCOMM_V24_DV;
+
+ rfcomm_dlc_set_modem_status(dlc, v24_sig);
+
+ return 0;
}
/* ---- TTY structure ---- */
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 746c11f..ce7ab7d 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -418,7 +418,7 @@ static struct proto sco_proto = {
.obj_size = sizeof(struct sco_pinfo)
};
-static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
{
struct sock *sk;
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 0000000..db23d59
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,31 @@
+#
+# 802.1d Ethernet Bridging
+#
+
+config BRIDGE
+ tristate "802.1d Ethernet Bridging"
+ ---help---
+ If you say Y here, then your Linux box will be able to act as an
+ Ethernet bridge, which means that the different Ethernet segments it
+ is connected to will appear as one Ethernet to the participants.
+ Several such bridges can work together to create even larger
+ networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
+ As this is a standard, Linux bridges will cooperate properly with
+ other third party bridge products.
+
+ In order to use the Ethernet bridge, you'll need the bridge
+ configuration tools; see <file:Documentation/networking/bridge.txt>
+ for location. Please read the Bridge mini-HOWTO for more
+ information.
+
+ If you enable iptables support along with the bridge support then you
+ turn your bridge into a bridging IP firewall.
+ iptables will then see the IP packets being bridged, so you need to
+ take this into account when setting up your firewall rules.
+ Enabling arptables support when bridging will let arptables see
+ bridged ARP traffic in the arptables FORWARD chain.
+
+ To compile this code as a module, choose M here: the module
+ will be called bridge.
+
+ If unsure, say N.
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e6c2200..24396b9 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,7 +23,7 @@
#include <asm/atomic.h>
#include "br_private.h"
-static kmem_cache_t *br_fdb_cache;
+static kmem_cache_t *br_fdb_cache __read_mostly;
static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr);
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ef9f209..069253f 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb)
static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
{
skb->dev = to->dev;
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug = 0;
-#endif
NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
br_forward_finish);
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 8f5f2e7..9a45e62 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
static int br_pass_frame_up_finish(struct sk_buff *skb)
{
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug = 0;
-#endif
netif_receive_skb(skb);
-
return 0;
}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index be03d3a..2d52fee 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
{
struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
if (nf_bridge->mask & BRNF_PKT_TYPE) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -182,10 +178,6 @@ static void __br_dnat_complain(void)
* --Bart, 20021007 (updated) */
static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
{
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD);
-#endif
-
if (skb->pkt_type == PACKET_OTHERHOST) {
skb->pkt_type = PACKET_HOST;
skb->nf_bridge->mask |= BRNF_PKT_TYPE;
@@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
struct iphdr *iph = skb->nh.iph;
struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
if (nf_bridge->mask & BRNF_PKT_TYPE) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
goto inhdr_error;
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING);
-#endif
if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
return NF_DROP;
setup_pre_routing(skb);
@@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
skb->ip_summed = CHECKSUM_NONE;
}
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-#endif
if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
return NF_DROP;
setup_pre_routing(skb);
@@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb)
struct net_device *in;
struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) {
in = nf_bridge->physindev;
if (nf_bridge->mask & BRNF_PKT_TYPE) {
@@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
(*pskb)->nh.raw += VLAN_HLEN;
}
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
nf_bridge = skb->nf_bridge;
if (skb->pkt_type == PACKET_OTHERHOST) {
skb->pkt_type = PACKET_HOST;
@@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
(*pskb)->nh.raw += VLAN_HLEN;
}
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
if (skb->nh.arph->ar_pln != 4) {
if (IS_VLAN_ARP) {
skb_push(*pskb, VLAN_HLEN);
@@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
/* PF_BRIDGE/LOCAL_OUT ***********************************************/
static int br_nf_local_out_finish(struct sk_buff *skb)
{
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT);
-#endif
if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
skb_push(skb, VLAN_HLEN);
skb->nh.raw -= VLAN_HLEN;
@@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
realoutdev, br_nf_local_out_finish,
NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
} else {
-#ifdef CONFIG_NETFILTER_DEBUG
- skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-#endif
-
NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
realoutdev, br_nf_local_out_finish,
NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
@@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
printk(KERN_CRIT "br_netfilter: skb->dst == NULL.");
goto print_error;
}
-
- skb->nf_debug ^= (1 << NF_IP_POST_ROUTING);
#endif
/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
@@ -882,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
* doesn't use the bridge parent of the indev by using
* the BRNF_DONT_TAKE_PARENT mask. */
if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
- nf_bridge->mask &= BRNF_DONT_TAKE_PARENT;
+ nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
nf_bridge->physindev = (struct net_device *)in;
}
#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 68ccef5..c70b3be 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -138,7 +138,7 @@ config BRIDGE_EBT_VLAN
#
config BRIDGE_EBT_ARPREPLY
tristate "ebt: arp reply target support"
- depends on BRIDGE_NF_EBTABLES
+ depends on BRIDGE_NF_EBTABLES && INET
help
This option adds the arp reply target, which allows
automatically sending arp replies to arp requests.
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b..662975b 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
{
struct ebt_log_info *info = (struct ebt_log_info *)data;
char level_string[4] = "< >";
- union {struct iphdr iph; struct tcpudphdr ports;
- struct arphdr arph; struct arppayload arpp;} u;
level_string[1] = '0' + info->loglevel;
spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
}
printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
- printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos,
+ printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
ih->protocol);
if (ih->protocol == IPPROTO_TCP ||
ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
ah->ar_pln == sizeof(uint32_t)) {
struct arppayload _arpp, *ap;
- ap = skb_header_pointer(skb, sizeof(u.arph),
+ ap = skb_header_pointer(skb, sizeof(_arph),
sizeof(_arpp), &_arpp);
if (ap == NULL) {
printk(" INCOMPLETE ARP payload");
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 02c632b..c93d35a 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr,
{
struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data;
- if ((*pskb)->nfmark != info->mark) {
+ if ((*pskb)->nfmark != info->mark)
(*pskb)->nfmark = info->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return info->target;
}
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 01af4fc..aae26ae 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup)
if (ub->qlen > 1)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
- NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup;
- netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC);
+ NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
+ netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
pm->version = EBT_ULOG_VERSION;
do_gettimeofday(&pm->stamp);
if (ub->qlen == 1)
- ub->skb->stamp = pm->stamp;
+ skb_set_timestamp(ub->skb, &pm->stamp);
pm->data_len = copy_len;
pm->mark = skb->nfmark;
pm->hook = hooknr;
@@ -258,7 +258,8 @@ static int __init init(void)
spin_lock_init(&ulog_buffers[i].lock);
}
- ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+ ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS,
+ NULL, THIS_MODULE);
if (!ebtulognl)
ret = -ENOMEM;
else if ((ret = ebt_register_watcher(&ulog)))
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 18ebc66..c454014 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl,
if (repl->valid_hooks & (1 << i))
if (check_chainloops(newinfo->hook_entry[i],
cl_s, udc_cnt, i, newinfo->entries)) {
- if (cl_s)
- vfree(cl_s);
+ vfree(cl_s);
return -EINVAL;
}
@@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl,
EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
ebt_cleanup_entry, &i);
}
- if (cl_s)
- vfree(cl_s);
+ vfree(cl_s);
return ret;
}
@@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len)
}
vfree(table);
- if (counterstmp)
- vfree(counterstmp);
+ vfree(counterstmp);
return ret;
free_unlock:
@@ -1040,8 +1037,7 @@ free_iterate:
EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
ebt_cleanup_entry, NULL);
free_counterstmp:
- if (counterstmp)
- vfree(counterstmp);
+ vfree(counterstmp);
/* can be initialized in translate_table() */
if (newinfo->chainstack) {
for (i = 0; i < num_possible_cpus(); i++)
@@ -1049,11 +1045,9 @@ free_counterstmp:
vfree(newinfo->chainstack);
}
free_entries:
- if (newinfo->entries)
- vfree(newinfo->entries);
+ vfree(newinfo->entries);
free_newinfo:
- if (newinfo)
- vfree(newinfo);
+ vfree(newinfo);
return ret;
}
@@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table)
down(&ebt_mutex);
LIST_DELETE(&ebt_tables, table);
up(&ebt_mutex);
- if (table->private->entries)
- vfree(table->private->entries);
+ vfree(table->private->entries);
if (table->private->chainstack) {
for (i = 0; i < num_possible_cpus(); i++)
vfree(table->private->chainstack[i]);
diff --git a/net/compat.c b/net/compat.c
index be5d936..e593dac 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -91,20 +91,11 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
} else
kern_msg->msg_name = NULL;
- if(kern_msg->msg_iovlen > UIO_FASTIOV) {
- kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec),
- GFP_KERNEL);
- if(!kern_iov)
- return -ENOMEM;
- }
-
tot_len = iov_from_user_compat_to_kern(kern_iov,
(struct compat_iovec __user *)kern_msg->msg_iov,
kern_msg->msg_iovlen);
if(tot_len >= 0)
kern_msg->msg_iov = kern_iov;
- else if(kern_msg->msg_iovlen > UIO_FASTIOV)
- kfree(kern_iov);
return tot_len;
}
@@ -144,13 +135,14 @@ static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *ms
* thus placement) of cmsg headers and length are different for
* 32-bit apps. -DaveM
*/
-int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
+int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
unsigned char *stackbuf, int stackbuf_size)
{
struct compat_cmsghdr __user *ucmsg;
struct cmsghdr *kcmsg, *kcmsg_base;
compat_size_t ucmlen;
__kernel_size_t kcmlen, tmp;
+ int err = -EFAULT;
kcmlen = 0;
kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf;
@@ -165,6 +157,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
CMSG_ALIGN(sizeof(struct cmsghdr)));
+ tmp = CMSG_ALIGN(tmp);
kcmlen += tmp;
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
@@ -176,30 +169,34 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
* until we have successfully copied over all of the data
* from the user.
*/
- if(kcmlen > stackbuf_size)
- kcmsg_base = kcmsg = kmalloc(kcmlen, GFP_KERNEL);
- if(kcmsg == NULL)
+ if (kcmlen > stackbuf_size)
+ kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
+ if (kcmsg == NULL)
return -ENOBUFS;
/* Now copy them over neatly. */
memset(kcmsg, 0, kcmlen);
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
while(ucmsg != NULL) {
- __get_user(ucmlen, &ucmsg->cmsg_len);
+ if (__get_user(ucmlen, &ucmsg->cmsg_len))
+ goto Efault;
+ if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+ goto Einval;
tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) +
CMSG_ALIGN(sizeof(struct cmsghdr)));
+ if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
+ goto Einval;
kcmsg->cmsg_len = tmp;
- __get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level);
- __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type);
-
- /* Copy over the data. */
- if(copy_from_user(CMSG_DATA(kcmsg),
- CMSG_COMPAT_DATA(ucmsg),
- (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
- goto out_free_efault;
+ tmp = CMSG_ALIGN(tmp);
+ if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
+ __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
+ copy_from_user(CMSG_DATA(kcmsg),
+ CMSG_COMPAT_DATA(ucmsg),
+ (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg)))))
+ goto Efault;
/* Advance. */
- kcmsg = (struct cmsghdr *)((char *)kcmsg + CMSG_ALIGN(tmp));
+ kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
}
@@ -208,10 +205,12 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg,
kmsg->msg_controllen = kcmlen;
return 0;
-out_free_efault:
- if(kcmsg_base != (struct cmsghdr *)stackbuf)
- kfree(kcmsg_base);
- return -EFAULT;
+Einval:
+ err = -EINVAL;
+Efault:
+ if (kcmsg_base != (struct cmsghdr *)stackbuf)
+ sock_kfree_s(sk, kcmsg_base, kcmlen);
+ return err;
}
int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
diff --git a/net/core/Makefile b/net/core/Makefile
index 81f0324..630da0f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -2,15 +2,16 @@
# Makefile for the Linux networking core.
#
-obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o
+obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
+ gen_stats.o gen_estimator.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \
+obj-y += dev.o ethtool.o dev_mcast.o dst.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+obj-$(CONFIG_XFRM) += flow.o
obj-$(CONFIG_SYSFS) += net-sysfs.o
-obj-$(CONFIG_NETFILTER) += netfilter.o
obj-$(CONFIG_NET_DIVERT) += dv.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NET_RADIO) += wireless.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fcee054..da9bf71 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -43,7 +43,6 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
-#include <linux/tcp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
@@ -51,9 +50,10 @@
#include <net/protocol.h>
#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/checksum.h>
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
/*
* Is a socket 'connection oriented' ?
diff --git a/net/core/dev.c b/net/core/dev.c
index ab93577..c01511e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
#endif /* CONFIG_NET_RADIO */
#include <asm/current.h>
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate. It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[16]; /* 16 way hashed list */
static struct list_head ptype_all; /* Taps */
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
/*
* The @dev_base list is protected by @dev_base_lock and the rtln
* semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
#ifdef CONFIG_SYSFS
extern int netdev_sysfs_init(void);
@@ -284,10 +267,6 @@ void dev_add_pack(struct packet_type *pt)
spin_unlock_bh(&ptype_lock);
}
-extern void linkwatch_run_queue(void);
-
-
-
/**
* __dev_remove_pack - remove packet handler
* @pt: packet type declaration
@@ -918,8 +897,7 @@ int dev_close(struct net_device *dev)
smp_mb__after_clear_bit(); /* Commit netif_running(). */
while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
/* No hurry. */
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(1);
+ msleep(1);
}
/*
@@ -1027,13 +1005,22 @@ void net_disable_timestamp(void)
atomic_dec(&netstamp_needed);
}
-static inline void net_timestamp(struct timeval *stamp)
+void __net_timestamp(struct sk_buff *skb)
+{
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ skb_set_timestamp(skb, &tv);
+}
+EXPORT_SYMBOL(__net_timestamp);
+
+static inline void net_timestamp(struct sk_buff *skb)
{
if (atomic_read(&netstamp_needed))
- do_gettimeofday(stamp);
+ __net_timestamp(skb);
else {
- stamp->tv_sec = 0;
- stamp->tv_usec = 0;
+ skb->tstamp.off_sec = 0;
+ skb->tstamp.off_usec = 0;
}
}
@@ -1045,7 +1032,8 @@ static inline void net_timestamp(struct timeval *stamp)
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
- net_timestamp(&skb->stamp);
+
+ net_timestamp(skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1076,7 +1064,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
skb2->h.raw = skb2->nh.raw;
skb2->pkt_type = PACKET_OUTGOING;
- ptype->func(skb2, skb->dev, ptype);
+ ptype->func(skb2, skb->dev, ptype, skb->dev);
}
}
rcu_read_unlock();
@@ -1141,10 +1129,8 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
#define illegal_highdma(dev, skb) (0)
#endif
-extern void skb_release_data(struct sk_buff *);
-
/* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, int gfp_mask)
+int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
unsigned int size;
u8 *data;
@@ -1363,71 +1349,13 @@ out:
Receiver routines
=======================================================================*/
-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
int weight_p = 64; /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
- unsigned long rd;
- int rq;
-#endif
- struct softnet_data *sd = &per_cpu(softnet_data, cpu);
- int blog = sd->input_pkt_queue.qlen;
- int avg_blog = sd->avg_blog;
-
- avg_blog = (avg_blog >> 1) + (blog >> 1);
-
- if (avg_blog > mod_cong) {
- /* Above moderate congestion levels. */
- sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
- rd = net_random();
- rq = rd % netdev_max_backlog;
- if (rq < avg_blog) /* unlucky bastard */
- sd->cng_level = NET_RX_DROP;
-#endif
- } else if (avg_blog > lo_cong) {
- sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
- rd = net_random();
- rq = rd % netdev_max_backlog;
- if (rq < avg_blog) /* unlucky bastard */
- sd->cng_level = NET_RX_CN_HIGH;
-#endif
- } else if (avg_blog > no_cong)
- sd->cng_level = NET_RX_CN_LOW;
- else /* no congestion */
- sd->cng_level = NET_RX_SUCCESS;
-
- sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
- int next_tick = 1;
- int cpu = smp_processor_id();
-
- get_sample_stats(cpu);
- next_tick += jiffies;
- mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
@@ -1448,7 +1376,6 @@ static void sample_queue(unsigned long dummy)
int netif_rx(struct sk_buff *skb)
{
- int this_cpu;
struct softnet_data *queue;
unsigned long flags;
@@ -1456,46 +1383,30 @@ int netif_rx(struct sk_buff *skb)
if (netpoll_rx(skb))
return NET_RX_DROP;
- if (!skb->stamp.tv_sec)
- net_timestamp(&skb->stamp);
+ if (!skb->tstamp.off_sec)
+ net_timestamp(skb);
/*
* The code is rearranged so that the path is the most
* short when CPU is congested, but is still operating.
*/
local_irq_save(flags);
- this_cpu = smp_processor_id();
queue = &__get_cpu_var(softnet_data);
__get_cpu_var(netdev_rx_stat).total++;
if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
if (queue->input_pkt_queue.qlen) {
- if (queue->throttle)
- goto drop;
-
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
- get_sample_stats(this_cpu);
-#endif
local_irq_restore(flags);
- return queue->cng_level;
+ return NET_RX_SUCCESS;
}
- if (queue->throttle)
- queue->throttle = 0;
-
netif_rx_schedule(&queue->backlog_dev);
goto enqueue;
}
- if (!queue->throttle) {
- queue->throttle = 1;
- __get_cpu_var(netdev_rx_stat).throttled++;
- }
-
-drop:
__get_cpu_var(netdev_rx_stat).dropped++;
local_irq_restore(flags);
@@ -1518,14 +1429,14 @@ int netif_rx_ni(struct sk_buff *skb)
EXPORT_SYMBOL(netif_rx_ni);
-static __inline__ void skb_bond(struct sk_buff *skb)
+static inline struct net_device *skb_bond(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- if (dev->master) {
- skb->real_dev = skb->dev;
+ if (dev->master)
skb->dev = dev->master;
- }
+
+ return dev;
}
static void net_tx_action(struct softirq_action *h)
@@ -1575,10 +1486,11 @@ static void net_tx_action(struct softirq_action *h)
}
static __inline__ int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev)
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
atomic_inc(&skb->users);
- return pt_prev->func(skb, skb->dev, pt_prev);
+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
@@ -1589,7 +1501,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
static __inline__ int handle_bridge(struct sk_buff **pskb,
- struct packet_type **pt_prev, int *ret)
+ struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev)
{
struct net_bridge_port *port;
@@ -1598,14 +1511,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb,
return 0;
if (*pt_prev) {
- *ret = deliver_skb(*pskb, *pt_prev);
+ *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
return br_handle_frame_hook(port, pskb);
}
#else
-#define handle_bridge(skb, pt_prev, ret) (0)
+#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
#endif
#ifdef CONFIG_NET_CLS_ACT
@@ -1627,17 +1540,14 @@ static int ing_filter(struct sk_buff *skb)
__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
if (MAX_RED_LOOP < ttl++) {
printk("Redir loop detected Dropping packet (%s->%s)\n",
- skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
+ skb->input_dev->name, skb->dev->name);
return TC_ACT_SHOT;
}
skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
- if (NULL == skb->input_dev) {
- skb->input_dev = skb->dev;
- printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name);
- }
+
spin_lock(&dev->ingress_lock);
if ((q = dev->qdisc_ingress) != NULL)
result = q->enqueue(skb, q);
@@ -1652,6 +1562,7 @@ static int ing_filter(struct sk_buff *skb)
int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
+ struct net_device *orig_dev;
int ret = NET_RX_DROP;
unsigned short type;
@@ -1659,10 +1570,13 @@ int netif_receive_skb(struct sk_buff *skb)
if (skb->dev->poll && netpoll_rx(skb))
return NET_RX_DROP;
- if (!skb->stamp.tv_sec)
- net_timestamp(&skb->stamp);
+ if (!skb->tstamp.off_sec)
+ net_timestamp(skb);
+
+ if (!skb->input_dev)
+ skb->input_dev = skb->dev;
- skb_bond(skb);
+ orig_dev = skb_bond(skb);
__get_cpu_var(netdev_rx_stat).total++;
@@ -1683,14 +1597,14 @@ int netif_receive_skb(struct sk_buff *skb)
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
- ret = deliver_skb(skb, pt_prev);
+ ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
if (pt_prev) {
- ret = deliver_skb(skb, pt_prev);
+ ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL; /* noone else should process this after*/
} else {
skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -1709,7 +1623,7 @@ ncls:
handle_diverter(skb);
- if (handle_bridge(&skb, &pt_prev, &ret))
+ if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
goto out;
type = skb->protocol;
@@ -1717,13 +1631,13 @@ ncls:
if (ptype->type == type &&
(!ptype->dev || ptype->dev == skb->dev)) {
if (pt_prev)
- ret = deliver_skb(skb, pt_prev);
+ ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
- ret = pt_prev->func(skb, skb->dev, pt_prev);
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
@@ -1780,8 +1694,6 @@ job_done:
smp_mb__before_clear_bit();
netif_poll_enable(backlog_dev);
- if (queue->throttle)
- queue->throttle = 0;
local_irq_enable();
return 0;
}
@@ -1790,9 +1702,9 @@ static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
- int budget = netdev_max_backlog;
+ int budget = netdev_budget;
+ void *have;
-
local_irq_disable();
while (!list_empty(&queue->poll_list)) {
@@ -1805,10 +1717,10 @@ static void net_rx_action(struct softirq_action *h)
dev = list_entry(queue->poll_list.next,
struct net_device, poll_list);
- netpoll_poll_lock(dev);
+ have = netpoll_poll_lock(dev);
if (dev->quota <= 0 || dev->poll(dev, &budget)) {
- netpoll_poll_unlock(dev);
+ netpoll_poll_unlock(have);
local_irq_disable();
list_del(&dev->poll_list);
list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1817,7 +1729,7 @@ static void net_rx_action(struct softirq_action *h)
else
dev->quota = dev->weight;
} else {
- netpoll_poll_unlock(dev);
+ netpoll_poll_unlock(have);
dev_put(dev);
local_irq_disable();
}
@@ -2055,15 +1967,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
struct netif_rx_stats *s = v;
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- s->total, s->dropped, s->time_squeeze, s->throttled,
- s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
- s->fastroute_deferred_out,
-#if 0
- s->fastroute_latency_reduction
-#else
- s->cpu_collision
-#endif
- );
+ s->total, s->dropped, s->time_squeeze, 0,
+ 0, 0, 0, 0, /* was fastroute */
+ s->cpu_collision );
return 0;
}
@@ -2190,10 +2096,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
{
unsigned short old_flags = dev->flags;
- dev->flags |= IFF_PROMISC;
if ((dev->promiscuity += inc) == 0)
dev->flags &= ~IFF_PROMISC;
- if (dev->flags ^ old_flags) {
+ else
+ dev->flags |= IFF_PROMISC;
+ if (dev->flags != old_flags) {
dev_mc_upload(dev);
printk(KERN_INFO "device %s %s promiscuous mode\n",
dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
@@ -3305,9 +3212,6 @@ static int __init net_dev_init(void)
queue = &per_cpu(softnet_data, i);
skb_queue_head_init(&queue->input_pkt_queue);
- queue->throttle = 0;
- queue->cng_level = 0;
- queue->avg_blog = 10; /* arbitrary non-zero */
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3316,11 +3220,6 @@ static int __init net_dev_init(void)
atomic_set(&queue->backlog_dev.refcnt, 1);
}
-#ifdef OFFLINE_SAMPLE
- samp_timer.expires = jiffies + (10 * HZ);
- add_timer(&samp_timer);
-#endif
-
dev_boot_phase = 0;
open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/dst.c b/net/core/dst.c
index fc434ad..334790d 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -45,6 +45,7 @@ static struct timer_list dst_gc_timer =
static void dst_run_gc(unsigned long dummy)
{
int delayed = 0;
+ int work_performed;
struct dst_entry * dst, **dstp;
if (!spin_trylock(&dst_lock)) {
@@ -52,9 +53,9 @@ static void dst_run_gc(unsigned long dummy)
return;
}
-
del_timer(&dst_gc_timer);
dstp = &dst_garbage_list;
+ work_performed = 0;
while ((dst = *dstp) != NULL) {
if (atomic_read(&dst->__refcnt)) {
dstp = &dst->next;
@@ -62,6 +63,7 @@ static void dst_run_gc(unsigned long dummy)
continue;
}
*dstp = dst->next;
+ work_performed = 1;
dst = dst_destroy(dst);
if (dst) {
@@ -86,9 +88,14 @@ static void dst_run_gc(unsigned long dummy)
dst_gc_timer_inc = DST_GC_MAX;
goto out;
}
- if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
- dst_gc_timer_expires = DST_GC_MAX;
- dst_gc_timer_inc += DST_GC_INC;
+ if (!work_performed) {
+ if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
+ dst_gc_timer_expires = DST_GC_MAX;
+ dst_gc_timer_inc += DST_GC_INC;
+ } else {
+ dst_gc_timer_inc = DST_GC_INC;
+ dst_gc_timer_expires = DST_GC_MIN;
+ }
dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
#if RT_CACHE_DEBUG >= 2
printk("dst_total: %d/%d %ld\n",
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index a3eeb88..404b761 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -81,6 +81,18 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data)
return 0;
}
+int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data)
+{
+ unsigned char len = dev->addr_len;
+ if ( addr->size < len )
+ return -ETOOSMALL;
+
+ addr->size = len;
+ memcpy(data, dev->perm_addr, len);
+ return 0;
+}
+
+
/* Handlers for each ethtool command */
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
@@ -683,6 +695,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return ret;
}
+static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_perm_addr epaddr;
+ u8 *data;
+ int ret;
+
+ if (!dev->ethtool_ops->get_perm_addr)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&epaddr,useraddr,sizeof(epaddr)))
+ return -EFAULT;
+
+ data = kmalloc(epaddr.size, GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data);
+ if (ret)
+ return ret;
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+ goto out;
+ useraddr += sizeof(epaddr);
+ if (copy_to_user(useraddr, data, epaddr.size))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
/* The main entry point in this file. Called from net/core/dev.c */
int dev_ethtool(struct ifreq *ifr)
@@ -806,6 +851,9 @@ int dev_ethtool(struct ifreq *ifr)
case ETHTOOL_GSTATS:
rc = ethtool_get_stats(dev, useraddr);
break;
+ case ETHTOOL_GPERMADDR:
+ rc = ethtool_get_perm_addr(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
@@ -826,6 +874,7 @@ int dev_ethtool(struct ifreq *ifr)
EXPORT_SYMBOL(dev_ethtool);
EXPORT_SYMBOL(ethtool_op_get_link);
+EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
EXPORT_SYMBOL(ethtool_op_get_sg);
EXPORT_SYMBOL(ethtool_op_get_tso);
EXPORT_SYMBOL(ethtool_op_get_tx_csum);
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b8820..079c2ed 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
#include <linux/filter.h>
/* No hurry in this branch */
-static u8 *load_pointer(struct sk_buff *skb, int k)
+static void *__load_pointer(struct sk_buff *skb, int k)
{
u8 *ptr = NULL;
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
return NULL;
}
+static inline void *load_pointer(struct sk_buff *skb, int k,
+ unsigned int size, void *buffer)
+{
+ if (k >= 0)
+ return skb_header_pointer(skb, k, size, buffer);
+ else {
+ if (k >= SKF_AD_OFF)
+ return NULL;
+ return __load_pointer(skb, k);
+ }
+}
+
/**
* sk_run_filter - run a filter on a socket
* @skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
{
- unsigned char *data = skb->data;
- /* len is UNSIGNED. Byte wide insns relies only on implicit
- type casts to prevent reading arbitrary memory locations.
- */
- unsigned int len = skb->len-skb->data_len;
struct sock_filter *fentry; /* We walk down these */
+ void *ptr;
u32 A = 0; /* Accumulator */
u32 X = 0; /* Index Register */
u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */
+ u32 tmp;
int k;
int pc;
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
case BPF_LD|BPF_W|BPF_ABS:
k = fentry->k;
load_w:
- if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
- A = ntohl(*(u32*)&data[k]);
+ ptr = load_pointer(skb, k, 4, &tmp);
+ if (ptr != NULL) {
+ A = ntohl(*(u32 *)ptr);
continue;
}
- if (k < 0) {
- u8 *ptr;
-
- if (k >= SKF_AD_OFF)
- break;
- ptr = load_pointer(skb, k);
- if (ptr) {
- A = ntohl(*(u32*)ptr);
- continue;
- }
- } else {
- u32 _tmp, *p;
- p = skb_header_pointer(skb, k, 4, &_tmp);
- if (p != NULL) {
- A = ntohl(*p);
- continue;
- }
- }
- return 0;
+ break;
case BPF_LD|BPF_H|BPF_ABS:
k = fentry->k;
load_h:
- if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
- A = ntohs(*(u16*)&data[k]);
+ ptr = load_pointer(skb, k, 2, &tmp);
+ if (ptr != NULL) {
+ A = ntohs(*(u16 *)ptr);
continue;
}
- if (k < 0) {
- u8 *ptr;
-
- if (k >= SKF_AD_OFF)
- break;
- ptr = load_pointer(skb, k);
- if (ptr) {
- A = ntohs(*(u16*)ptr);
- continue;
- }
- } else {
- u16 _tmp, *p;
- p = skb_header_pointer(skb, k, 2, &_tmp);
- if (p != NULL) {
- A = ntohs(*p);
- continue;
- }
- }
- return 0;
+ break;
case BPF_LD|BPF_B|BPF_ABS:
k = fentry->k;
load_b:
- if (k >= 0 && (unsigned int)k < len) {
- A = data[k];
+ ptr = load_pointer(skb, k, 1, &tmp);
+ if (ptr != NULL) {
+ A = *(u8 *)ptr;
continue;
}
- if (k < 0) {
- u8 *ptr;
-
- if (k >= SKF_AD_OFF)
- break;
- ptr = load_pointer(skb, k);
- if (ptr) {
- A = *ptr;
- continue;
- }
- } else {
- u8 _tmp, *p;
- p = skb_header_pointer(skb, k, 1, &_tmp);
- if (p != NULL) {
- A = *p;
- continue;
- }
- }
- return 0;
+ break;
case BPF_LD|BPF_W|BPF_LEN:
- A = len;
+ A = skb->len;
continue;
case BPF_LDX|BPF_W|BPF_LEN:
- X = len;
+ X = skb->len;
continue;
case BPF_LD|BPF_W|BPF_IND:
k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
k = X + fentry->k;
goto load_b;
case BPF_LDX|BPF_B|BPF_MSH:
- if (fentry->k >= len)
- return 0;
- X = (data[fentry->k] & 0xf) << 2;
- continue;
+ ptr = load_pointer(skb, fentry->k, 1, &tmp);
+ if (ptr != NULL) {
+ X = (*(u8 *)ptr & 0xf) << 2;
+ continue;
+ }
+ return 0;
case BPF_LD|BPF_IMM:
A = fentry->k;
continue;
diff --git a/net/core/flow.c b/net/core/flow.c
index f289570..7e95b39 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-static kmem_cache_t *flow_cachep;
+static kmem_cache_t *flow_cachep __read_mostly;
static int flow_lwm, flow_hwm;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 43bdc52..39fc55e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/random.h>
+#include <linux/string.h>
#define NEIGH_DEBUG 1
@@ -1216,7 +1217,7 @@ static void neigh_proxy_process(unsigned long arg)
while (skb != (struct sk_buff *)&tbl->proxy_queue) {
struct sk_buff *back = skb;
- long tdif = back->stamp.tv_usec - now;
+ long tdif = NEIGH_CB(back)->sched_next - now;
skb = skb->next;
if (tdif <= 0) {
@@ -1247,8 +1248,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
kfree_skb(skb);
return;
}
- skb->stamp.tv_sec = LOCALLY_ENQUEUED;
- skb->stamp.tv_usec = sched_next;
+
+ NEIGH_CB(skb)->sched_next = sched_next;
+ NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
if (del_timer(&tbl->proxy_timer)) {
@@ -1276,9 +1278,14 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
INIT_RCU_HEAD(&p->rcu_head);
p->reachable_time =
neigh_rand_reach_time(p->base_reachable_time);
- if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) {
- kfree(p);
- return NULL;
+ if (dev) {
+ if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
+ kfree(p);
+ return NULL;
+ }
+
+ dev_hold(dev);
+ p->dev = dev;
}
p->sysctl_table = NULL;
write_lock_bh(&tbl->lock);
@@ -1309,6 +1316,8 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
*p = parms->next;
parms->dead = 1;
write_unlock_bh(&tbl->lock);
+ if (parms->dev)
+ dev_put(parms->dev);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
return;
}
@@ -1546,21 +1555,330 @@ out:
return err;
}
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+ struct rtattr *nest = NULL;
+
+ nest = RTA_NEST(skb, NDTA_PARMS);
+
+ if (parms->dev)
+ RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
+
+ RTA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
+ RTA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+ RTA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
+ RTA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
+ RTA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
+ RTA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
+ RTA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
+ RTA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
+ parms->base_reachable_time);
+ RTA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
+ RTA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
+ RTA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
+ RTA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
+ RTA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
+ RTA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
+
+ return RTA_NEST_END(skb, nest);
+
+rtattr_failure:
+ return RTA_NEST_CANCEL(skb, nest);
+}
+
+static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlmsghdr *nlh;
+ struct ndtmsg *ndtmsg;
+
+ nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+ NLM_F_MULTI);
+
+ ndtmsg = NLMSG_DATA(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+
+ RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+ RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
+ RTA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
+ RTA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
+ RTA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
+
+ {
+ unsigned long now = jiffies;
+ unsigned int flush_delta = now - tbl->last_flush;
+ unsigned int rand_delta = now - tbl->last_rand;
+
+ struct ndt_config ndc = {
+ .ndtc_key_len = tbl->key_len,
+ .ndtc_entry_size = tbl->entry_size,
+ .ndtc_entries = atomic_read(&tbl->entries),
+ .ndtc_last_flush = jiffies_to_msecs(flush_delta),
+ .ndtc_last_rand = jiffies_to_msecs(rand_delta),
+ .ndtc_hash_rnd = tbl->hash_rnd,
+ .ndtc_hash_mask = tbl->hash_mask,
+ .ndtc_hash_chain_gc = tbl->hash_chain_gc,
+ .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ };
+
+ RTA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+ }
+
+ {
+ int cpu;
+ struct ndt_stats ndst;
+
+ memset(&ndst, 0, sizeof(ndst));
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct neigh_statistics *st;
+
+ if (!cpu_possible(cpu))
+ continue;
+
+ st = per_cpu_ptr(tbl->stats, cpu);
+ ndst.ndts_allocs += st->allocs;
+ ndst.ndts_destroys += st->destroys;
+ ndst.ndts_hash_grows += st->hash_grows;
+ ndst.ndts_res_failed += st->res_failed;
+ ndst.ndts_lookups += st->lookups;
+ ndst.ndts_hits += st->hits;
+ ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
+ ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
+ ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
+ ndst.ndts_forced_gc_runs += st->forced_gc_runs;
+ }
+
+ RTA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+ }
+
+ BUG_ON(tbl->parms.dev);
+ if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+ goto rtattr_failure;
+
+ read_unlock_bh(&tbl->lock);
+ return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+ read_unlock_bh(&tbl->lock);
+ return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+ return -1;
+}
+
+static int neightbl_fill_param_info(struct neigh_table *tbl,
+ struct neigh_parms *parms,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ndtmsg *ndtmsg;
+ struct nlmsghdr *nlh;
+
+ nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+ NLM_F_MULTI);
+
+ ndtmsg = NLMSG_DATA(nlh);
+
+ read_lock_bh(&tbl->lock);
+ ndtmsg->ndtm_family = tbl->family;
+ ndtmsg->ndtm_pad1 = 0;
+ ndtmsg->ndtm_pad2 = 0;
+ RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+
+ if (neightbl_fill_parms(skb, parms) < 0)
+ goto rtattr_failure;
+
+ read_unlock_bh(&tbl->lock);
+ return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+ read_unlock_bh(&tbl->lock);
+ return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+ return -1;
+}
+
+static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
+ int ifindex)
+{
+ struct neigh_parms *p;
+
+ for (p = &tbl->parms; p; p = p->next)
+ if ((p->dev && p->dev->ifindex == ifindex) ||
+ (!p->dev && !ifindex))
+ return p;
+
+ return NULL;
+}
+
+int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+ struct neigh_table *tbl;
+ struct ndtmsg *ndtmsg = NLMSG_DATA(nlh);
+ struct rtattr **tb = arg;
+ int err = -EINVAL;
+
+ if (!tb[NDTA_NAME - 1] || !RTA_PAYLOAD(tb[NDTA_NAME - 1]))
+ return -EINVAL;
+
+ read_lock(&neigh_tbl_lock);
+ for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+ continue;
+
+ if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id))
+ break;
+ }
+
+ if (tbl == NULL) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ /*
+ * We acquire tbl->lock to be nice to the periodic timers and
+ * make sure they always see a consistent set of values.
+ */
+ write_lock_bh(&tbl->lock);
+
+ if (tb[NDTA_THRESH1 - 1])
+ tbl->gc_thresh1 = RTA_GET_U32(tb[NDTA_THRESH1 - 1]);
+
+ if (tb[NDTA_THRESH2 - 1])
+ tbl->gc_thresh2 = RTA_GET_U32(tb[NDTA_THRESH2 - 1]);
+
+ if (tb[NDTA_THRESH3 - 1])
+ tbl->gc_thresh3 = RTA_GET_U32(tb[NDTA_THRESH3 - 1]);
+
+ if (tb[NDTA_GC_INTERVAL - 1])
+ tbl->gc_interval = RTA_GET_MSECS(tb[NDTA_GC_INTERVAL - 1]);
+
+ if (tb[NDTA_PARMS - 1]) {
+ struct rtattr *tbp[NDTPA_MAX];
+ struct neigh_parms *p;
+ u32 ifindex = 0;
+
+ if (rtattr_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS - 1]) < 0)
+ goto rtattr_failure;
+
+ if (tbp[NDTPA_IFINDEX - 1])
+ ifindex = RTA_GET_U32(tbp[NDTPA_IFINDEX - 1]);
+
+ p = lookup_neigh_params(tbl, ifindex);
+ if (p == NULL) {
+ err = -ENOENT;
+ goto rtattr_failure;
+ }
+
+ if (tbp[NDTPA_QUEUE_LEN - 1])
+ p->queue_len = RTA_GET_U32(tbp[NDTPA_QUEUE_LEN - 1]);
+
+ if (tbp[NDTPA_PROXY_QLEN - 1])
+ p->proxy_qlen = RTA_GET_U32(tbp[NDTPA_PROXY_QLEN - 1]);
+
+ if (tbp[NDTPA_APP_PROBES - 1])
+ p->app_probes = RTA_GET_U32(tbp[NDTPA_APP_PROBES - 1]);
+
+ if (tbp[NDTPA_UCAST_PROBES - 1])
+ p->ucast_probes =
+ RTA_GET_U32(tbp[NDTPA_UCAST_PROBES - 1]);
+
+ if (tbp[NDTPA_MCAST_PROBES - 1])
+ p->mcast_probes =
+ RTA_GET_U32(tbp[NDTPA_MCAST_PROBES - 1]);
+
+ if (tbp[NDTPA_BASE_REACHABLE_TIME - 1])
+ p->base_reachable_time =
+ RTA_GET_MSECS(tbp[NDTPA_BASE_REACHABLE_TIME - 1]);
+
+ if (tbp[NDTPA_GC_STALETIME - 1])
+ p->gc_staletime =
+ RTA_GET_MSECS(tbp[NDTPA_GC_STALETIME - 1]);
+
+ if (tbp[NDTPA_DELAY_PROBE_TIME - 1])
+ p->delay_probe_time =
+ RTA_GET_MSECS(tbp[NDTPA_DELAY_PROBE_TIME - 1]);
+
+ if (tbp[NDTPA_RETRANS_TIME - 1])
+ p->retrans_time =
+ RTA_GET_MSECS(tbp[NDTPA_RETRANS_TIME - 1]);
+
+ if (tbp[NDTPA_ANYCAST_DELAY - 1])
+ p->anycast_delay =
+ RTA_GET_MSECS(tbp[NDTPA_ANYCAST_DELAY - 1]);
+
+ if (tbp[NDTPA_PROXY_DELAY - 1])
+ p->proxy_delay =
+ RTA_GET_MSECS(tbp[NDTPA_PROXY_DELAY - 1]);
+
+ if (tbp[NDTPA_LOCKTIME - 1])
+ p->locktime = RTA_GET_MSECS(tbp[NDTPA_LOCKTIME - 1]);
+ }
+
+ err = 0;
+
+rtattr_failure:
+ write_unlock_bh(&tbl->lock);
+errout:
+ read_unlock(&neigh_tbl_lock);
+ return err;
+}
+
+int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx, family;
+ int s_idx = cb->args[0];
+ struct neigh_table *tbl;
+
+ family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
+
+ read_lock(&neigh_tbl_lock);
+ for (tbl = neigh_tables, idx = 0; tbl; tbl = tbl->next) {
+ struct neigh_parms *p;
+
+ if (idx < s_idx || (family && tbl->family != family))
+ continue;
+
+ if (neightbl_fill_info(tbl, skb, cb) <= 0)
+ break;
+
+ for (++idx, p = tbl->parms.next; p; p = p->next, idx++) {
+ if (idx < s_idx)
+ continue;
+
+ if (neightbl_fill_param_info(tbl, p, skb, cb) <= 0)
+ goto out;
+ }
+
+ }
+out:
+ read_unlock(&neigh_tbl_lock);
+ cb->args[0] = idx;
+
+ return skb->len;
+}
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, unsigned int flags)
{
unsigned long now = jiffies;
unsigned char *b = skb->tail;
struct nda_cacheinfo ci;
int locked = 0;
u32 probes;
- struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event,
- sizeof(struct ndmsg));
+ struct nlmsghdr *nlh = NLMSG_NEW(skb, pid, seq, event,
+ sizeof(struct ndmsg), flags);
struct ndmsg *ndm = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0;
ndm->ndm_family = n->ops->family;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
ndm->ndm_flags = n->flags;
ndm->ndm_type = n->type;
ndm->ndm_ifindex = n->dev->ifindex;
@@ -1609,7 +1927,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
continue;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH) <= 0) {
+ RTM_NEWNEIGH,
+ NLM_F_MULTI) <= 0) {
read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
@@ -2018,14 +2337,14 @@ void neigh_app_ns(struct neighbour *n)
if (!skb)
return;
- if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) {
+ if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH, 0) < 0) {
kfree_skb(skb);
return;
}
nlh = (struct nlmsghdr *)skb->data;
nlh->nlmsg_flags = NLM_F_REQUEST;
- NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
}
static void neigh_app_notify(struct neighbour *n)
@@ -2037,13 +2356,13 @@ static void neigh_app_notify(struct neighbour *n)
if (!skb)
return;
- if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) {
+ if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH, 0) < 0) {
kfree_skb(skb);
return;
}
nlh = (struct nlmsghdr *)skb->data;
- NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
}
#endif /* CONFIG_ARPD */
@@ -2281,7 +2600,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
t->neigh_vars[17].extra1 = dev;
}
- dev_name = net_sysctl_strdup(dev_name_source);
+ dev_name = kstrdup(dev_name_source, GFP_KERNEL);
if (!dev_name) {
err = -ENOBUFS;
goto free;
@@ -2352,6 +2671,8 @@ EXPORT_SYMBOL(neigh_update);
EXPORT_SYMBOL(neigh_update_hhs);
EXPORT_SYMBOL(pneigh_enqueue);
EXPORT_SYMBOL(pneigh_lookup);
+EXPORT_SYMBOL(neightbl_dump_info);
+EXPORT_SYMBOL(neightbl_set);
#ifdef CONFIG_ARPD
EXPORT_SYMBOL(neigh_app_ns);
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
deleted file mode 100644
index 22a8f12..0000000
--- a/net/core/netfilter.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/* netfilter.c: look after the filters for various protocols.
- * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
- *
- * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
- * way.
- *
- * Rusty Russell (C)2000 -- This code is GPL.
- *
- * February 2000: Modified by James Morris to have 1 queue per protocol.
- * 15-Mar-2000: Added NF_REPEAT --RR.
- * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
- */
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/wait.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <linux/ip.h>
-
-/* In this code, we can be waiting indefinitely for userspace to
- * service a packet if a hook returns NF_QUEUE. We could keep a count
- * of skbuffs queued for userspace, and not deregister a hook unless
- * this is zero, but that sucks. Now, we simply check when the
- * packets come back: if the hook is gone, the packet is discarded. */
-#ifdef CONFIG_NETFILTER_DEBUG
-#define NFDEBUG(format, args...) printk(format , ## args)
-#else
-#define NFDEBUG(format, args...)
-#endif
-
-/* Sockopts only registered and called from user context, so
- net locking would be overkill. Also, [gs]etsockopt calls may
- sleep. */
-static DECLARE_MUTEX(nf_sockopt_mutex);
-
-struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
-static LIST_HEAD(nf_sockopts);
-static DEFINE_SPINLOCK(nf_hook_lock);
-
-/*
- * A queue handler may be registered for each protocol. Each is protected by
- * long term mutex. The handler must provide an an outfn() to accept packets
- * for queueing and must reinject all packets it receives, no matter what.
- */
-static struct nf_queue_handler_t {
- nf_queue_outfn_t outfn;
- void *data;
-} queue_handler[NPROTO];
-static DEFINE_RWLOCK(queue_handler_lock);
-
-int nf_register_hook(struct nf_hook_ops *reg)
-{
- struct list_head *i;
-
- spin_lock_bh(&nf_hook_lock);
- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
- if (reg->priority < ((struct nf_hook_ops *)i)->priority)
- break;
- }
- list_add_rcu(&reg->list, i->prev);
- spin_unlock_bh(&nf_hook_lock);
-
- synchronize_net();
- return 0;
-}
-
-void nf_unregister_hook(struct nf_hook_ops *reg)
-{
- spin_lock_bh(&nf_hook_lock);
- list_del_rcu(&reg->list);
- spin_unlock_bh(&nf_hook_lock);
-
- synchronize_net();
-}
-
-/* Do exclusive ranges overlap? */
-static inline int overlap(int min1, int max1, int min2, int max2)
-{
- return max1 > min2 && min1 < max2;
-}
-
-/* Functions to register sockopt ranges (exclusive). */
-int nf_register_sockopt(struct nf_sockopt_ops *reg)
-{
- struct list_head *i;
- int ret = 0;
-
- if (down_interruptible(&nf_sockopt_mutex) != 0)
- return -EINTR;
-
- list_for_each(i, &nf_sockopts) {
- struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
- if (ops->pf == reg->pf
- && (overlap(ops->set_optmin, ops->set_optmax,
- reg->set_optmin, reg->set_optmax)
- || overlap(ops->get_optmin, ops->get_optmax,
- reg->get_optmin, reg->get_optmax))) {
- NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
- ops->set_optmin, ops->set_optmax,
- ops->get_optmin, ops->get_optmax,
- reg->set_optmin, reg->set_optmax,
- reg->get_optmin, reg->get_optmax);
- ret = -EBUSY;
- goto out;
- }
- }
-
- list_add(&reg->list, &nf_sockopts);
-out:
- up(&nf_sockopt_mutex);
- return ret;
-}
-
-void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
-{
- /* No point being interruptible: we're probably in cleanup_module() */
- restart:
- down(&nf_sockopt_mutex);
- if (reg->use != 0) {
- /* To be woken by nf_sockopt call... */
- /* FIXME: Stuart Young's name appears gratuitously. */
- set_current_state(TASK_UNINTERRUPTIBLE);
- reg->cleanup_task = current;
- up(&nf_sockopt_mutex);
- schedule();
- goto restart;
- }
- list_del(&reg->list);
- up(&nf_sockopt_mutex);
-}
-
-#ifdef CONFIG_NETFILTER_DEBUG
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4.h>
-
-static void debug_print_hooks_ip(unsigned int nf_debug)
-{
- if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
- printk("PRE_ROUTING ");
- nf_debug ^= (1 << NF_IP_PRE_ROUTING);
- }
- if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
- printk("LOCAL_IN ");
- nf_debug ^= (1 << NF_IP_LOCAL_IN);
- }
- if (nf_debug & (1 << NF_IP_FORWARD)) {
- printk("FORWARD ");
- nf_debug ^= (1 << NF_IP_FORWARD);
- }
- if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
- printk("LOCAL_OUT ");
- nf_debug ^= (1 << NF_IP_LOCAL_OUT);
- }
- if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
- printk("POST_ROUTING ");
- nf_debug ^= (1 << NF_IP_POST_ROUTING);
- }
- if (nf_debug)
- printk("Crap bits: 0x%04X", nf_debug);
- printk("\n");
-}
-
-static void nf_dump_skb(int pf, struct sk_buff *skb)
-{
- printk("skb: pf=%i %s dev=%s len=%u\n",
- pf,
- skb->sk ? "(owned)" : "(unowned)",
- skb->dev ? skb->dev->name : "(no dev)",
- skb->len);
- switch (pf) {
- case PF_INET: {
- const struct iphdr *ip = skb->nh.iph;
- __u32 *opt = (__u32 *) (ip + 1);
- int opti;
- __u16 src_port = 0, dst_port = 0;
-
- if (ip->protocol == IPPROTO_TCP
- || ip->protocol == IPPROTO_UDP) {
- struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
- src_port = ntohs(tcp->source);
- dst_port = ntohs(tcp->dest);
- }
-
- printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
- " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
- ip->protocol, NIPQUAD(ip->saddr),
- src_port, NIPQUAD(ip->daddr),
- dst_port,
- ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
- ntohs(ip->frag_off), ip->ttl);
-
- for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
- printk(" O=0x%8.8X", *opt++);
- printk("\n");
- }
- }
-}
-
-void nf_debug_ip_local_deliver(struct sk_buff *skb)
-{
- /* If it's a loopback packet, it must have come through
- * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
- * NF_IP_LOCAL_IN. Otherwise, must have gone through
- * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */
- if (!skb->dev) {
- printk("ip_local_deliver: skb->dev is NULL.\n");
- } else {
- if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
- | (1<<NF_IP_LOCAL_IN))) {
- printk("ip_local_deliver: bad skb: ");
- debug_print_hooks_ip(skb->nf_debug);
- nf_dump_skb(PF_INET, skb);
- }
- }
-}
-
-void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
-{
- if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
- | (1 << NF_IP_POST_ROUTING))) {
- printk("ip_dev_loopback_xmit: bad owned skb = %p: ",
- newskb);
- debug_print_hooks_ip(newskb->nf_debug);
- nf_dump_skb(PF_INET, newskb);
- }
-}
-
-void nf_debug_ip_finish_output2(struct sk_buff *skb)
-{
- /* If it's owned, it must have gone through the
- * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
- * Otherwise, must have gone through
- * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
- */
- if (skb->sk) {
- if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
- | (1 << NF_IP_POST_ROUTING))) {
- printk("ip_finish_output: bad owned skb = %p: ", skb);
- debug_print_hooks_ip(skb->nf_debug);
- nf_dump_skb(PF_INET, skb);
- }
- } else {
- if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
- | (1 << NF_IP_FORWARD)
- | (1 << NF_IP_POST_ROUTING))) {
- /* Fragments, entunnelled packets, TCP RSTs
- generated by ipt_REJECT will have no
- owners, but still may be local */
- if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
- | (1 << NF_IP_POST_ROUTING))){
- printk("ip_finish_output:"
- " bad unowned skb = %p: ",skb);
- debug_print_hooks_ip(skb->nf_debug);
- nf_dump_skb(PF_INET, skb);
- }
- }
- }
-}
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, int pf, int val,
- char __user *opt, int *len, int get)
-{
- struct list_head *i;
- struct nf_sockopt_ops *ops;
- int ret;
-
- if (down_interruptible(&nf_sockopt_mutex) != 0)
- return -EINTR;
-
- list_for_each(i, &nf_sockopts) {
- ops = (struct nf_sockopt_ops *)i;
- if (ops->pf == pf) {
- if (get) {
- if (val >= ops->get_optmin
- && val < ops->get_optmax) {
- ops->use++;
- up(&nf_sockopt_mutex);
- ret = ops->get(sk, val, opt, len);
- goto out;
- }
- } else {
- if (val >= ops->set_optmin
- && val < ops->set_optmax) {
- ops->use++;
- up(&nf_sockopt_mutex);
- ret = ops->set(sk, val, opt, *len);
- goto out;
- }
- }
- }
- }
- up(&nf_sockopt_mutex);
- return -ENOPROTOOPT;
-
- out:
- down(&nf_sockopt_mutex);
- ops->use--;
- if (ops->cleanup_task)
- wake_up_process(ops->cleanup_task);
- up(&nf_sockopt_mutex);
- return ret;
-}
-
-int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
- int len)
-{
- return nf_sockopt(sk, pf, val, opt, &len, 0);
-}
-
-int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
-{
- return nf_sockopt(sk, pf, val, opt, len, 1);
-}
-
-static unsigned int nf_iterate(struct list_head *head,
- struct sk_buff **skb,
- int hook,
- const struct net_device *indev,
- const struct net_device *outdev,
- struct list_head **i,
- int (*okfn)(struct sk_buff *),
- int hook_thresh)
-{
- unsigned int verdict;
-
- /*
- * The caller must not block between calls to this
- * function because of risk of continuing from deleted element.
- */
- list_for_each_continue_rcu(*i, head) {
- struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
-
- if (hook_thresh > elem->priority)
- continue;
-
- /* Optimization: we don't need to hold module
- reference here, since function can't sleep. --RR */
- verdict = elem->hook(hook, skb, indev, outdev, okfn);
- if (verdict != NF_ACCEPT) {
-#ifdef CONFIG_NETFILTER_DEBUG
- if (unlikely(verdict > NF_MAX_VERDICT)) {
- NFDEBUG("Evil return from %p(%u).\n",
- elem->hook, hook);
- continue;
- }
-#endif
- if (verdict != NF_REPEAT)
- return verdict;
- *i = (*i)->prev;
- }
- }
- return NF_ACCEPT;
-}
-
-int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
-{
- int ret;
-
- write_lock_bh(&queue_handler_lock);
- if (queue_handler[pf].outfn)
- ret = -EBUSY;
- else {
- queue_handler[pf].outfn = outfn;
- queue_handler[pf].data = data;
- ret = 0;
- }
- write_unlock_bh(&queue_handler_lock);
-
- return ret;
-}
-
-/* The caller must flush their queue before this */
-int nf_unregister_queue_handler(int pf)
-{
- write_lock_bh(&queue_handler_lock);
- queue_handler[pf].outfn = NULL;
- queue_handler[pf].data = NULL;
- write_unlock_bh(&queue_handler_lock);
-
- return 0;
-}
-
-/*
- * Any packet that leaves via this function must come back
- * through nf_reinject().
- */
-static int nf_queue(struct sk_buff *skb,
- struct list_head *elem,
- int pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *))
-{
- int status;
- struct nf_info *info;
-#ifdef CONFIG_BRIDGE_NETFILTER
- struct net_device *physindev = NULL;
- struct net_device *physoutdev = NULL;
-#endif
-
- /* QUEUE == DROP if noone is waiting, to be safe. */
- read_lock(&queue_handler_lock);
- if (!queue_handler[pf].outfn) {
- read_unlock(&queue_handler_lock);
- kfree_skb(skb);
- return 1;
- }
-
- info = kmalloc(sizeof(*info), GFP_ATOMIC);
- if (!info) {
- if (net_ratelimit())
- printk(KERN_ERR "OOM queueing packet %p\n",
- skb);
- read_unlock(&queue_handler_lock);
- kfree_skb(skb);
- return 1;
- }
-
- *info = (struct nf_info) {
- (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
-
- /* If it's going away, ignore hook. */
- if (!try_module_get(info->elem->owner)) {
- read_unlock(&queue_handler_lock);
- kfree(info);
- return 0;
- }
-
- /* Bump dev refs so they don't vanish while packet is out */
- if (indev) dev_hold(indev);
- if (outdev) dev_hold(outdev);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- physindev = skb->nf_bridge->physindev;
- if (physindev) dev_hold(physindev);
- physoutdev = skb->nf_bridge->physoutdev;
- if (physoutdev) dev_hold(physoutdev);
- }
-#endif
-
- status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
- read_unlock(&queue_handler_lock);
-
- if (status < 0) {
- /* James M doesn't say fuck enough. */
- if (indev) dev_put(indev);
- if (outdev) dev_put(outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (physindev) dev_put(physindev);
- if (physoutdev) dev_put(physoutdev);
-#endif
- module_put(info->elem->owner);
- kfree(info);
- kfree_skb(skb);
- return 1;
- }
- return 1;
-}
-
-/* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. */
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- int hook_thresh)
-{
- struct list_head *elem;
- unsigned int verdict;
- int ret = 0;
-
- /* We may already have this, but read-locks nest anyway */
- rcu_read_lock();
-
-#ifdef CONFIG_NETFILTER_DEBUG
- if (unlikely((*pskb)->nf_debug & (1 << hook))) {
- printk("nf_hook: hook %i already set.\n", hook);
- nf_dump_skb(pf, *pskb);
- }
- (*pskb)->nf_debug |= (1 << hook);
-#endif
-
- elem = &nf_hooks[pf][hook];
-next_hook:
- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
- outdev, &elem, okfn, hook_thresh);
- if (verdict == NF_ACCEPT || verdict == NF_STOP) {
- ret = 1;
- goto unlock;
- } else if (verdict == NF_DROP) {
- kfree_skb(*pskb);
- ret = -EPERM;
- } else if (verdict == NF_QUEUE) {
- NFDEBUG("nf_hook: Verdict = QUEUE.\n");
- if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
- goto next_hook;
- }
-unlock:
- rcu_read_unlock();
- return ret;
-}
-
-void nf_reinject(struct sk_buff *skb, struct nf_info *info,
- unsigned int verdict)
-{
- struct list_head *elem = &info->elem->list;
- struct list_head *i;
-
- rcu_read_lock();
-
- /* Release those devices we held, or Alexey will kill me. */
- if (info->indev) dev_put(info->indev);
- if (info->outdev) dev_put(info->outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- if (skb->nf_bridge->physindev)
- dev_put(skb->nf_bridge->physindev);
- if (skb->nf_bridge->physoutdev)
- dev_put(skb->nf_bridge->physoutdev);
- }
-#endif
-
- /* Drop reference to owner of hook which queued us. */
- module_put(info->elem->owner);
-
- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
- if (i == elem)
- break;
- }
-
- if (elem == &nf_hooks[info->pf][info->hook]) {
- /* The module which sent it to userspace is gone. */
- NFDEBUG("%s: module disappeared, dropping packet.\n",
- __FUNCTION__);
- verdict = NF_DROP;
- }
-
- /* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT) {
- elem = elem->prev;
- verdict = NF_ACCEPT;
- }
-
- if (verdict == NF_ACCEPT) {
- next_hook:
- verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
- &skb, info->hook,
- info->indev, info->outdev, &elem,
- info->okfn, INT_MIN);
- }
-
- switch (verdict) {
- case NF_ACCEPT:
- info->okfn(skb);
- break;
-
- case NF_QUEUE:
- if (!nf_queue(skb, elem, info->pf, info->hook,
- info->indev, info->outdev, info->okfn))
- goto next_hook;
- break;
- }
- rcu_read_unlock();
-
- if (verdict == NF_DROP)
- kfree_skb(skb);
-
- kfree(info);
- return;
-}
-
-#ifdef CONFIG_INET
-/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff **pskb)
-{
- struct iphdr *iph = (*pskb)->nh.iph;
- struct rtable *rt;
- struct flowi fl = {};
- struct dst_entry *odst;
- unsigned int hh_len;
-
- /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
- * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
- */
- if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
- fl.nl_u.ip4_u.daddr = iph->daddr;
- fl.nl_u.ip4_u.saddr = iph->saddr;
- fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
- fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
-#ifdef CONFIG_IP_ROUTE_FWMARK
- fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
-#endif
- fl.proto = iph->protocol;
- if (ip_route_output_key(&rt, &fl) != 0)
- return -1;
-
- /* Drop old route. */
- dst_release((*pskb)->dst);
- (*pskb)->dst = &rt->u.dst;
- } else {
- /* non-local src, find valid iif to satisfy
- * rp-filter when calling ip_route_input. */
- fl.nl_u.ip4_u.daddr = iph->saddr;
- if (ip_route_output_key(&rt, &fl) != 0)
- return -1;
-
- odst = (*pskb)->dst;
- if (ip_route_input(*pskb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
- dst_release(&rt->u.dst);
- return -1;
- }
- dst_release(&rt->u.dst);
- dst_release(odst);
- }
-
- if ((*pskb)->dst->error)
- return -1;
-
- /* Change in oif may mean change in hh_len. */
- hh_len = (*pskb)->dst->dev->hard_header_len;
- if (skb_headroom(*pskb) < hh_len) {
- struct sk_buff *nskb;
-
- nskb = skb_realloc_headroom(*pskb, hh_len);
- if (!nskb)
- return -1;
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(ip_route_me_harder);
-
-int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len)
-{
- struct sk_buff *nskb;
-
- if (writable_len > (*pskb)->len)
- return 0;
-
- /* Not exclusive use of packet? Must copy. */
- if (skb_shared(*pskb) || skb_cloned(*pskb))
- goto copy_skb;
-
- return pskb_may_pull(*pskb, writable_len);
-
-copy_skb:
- nskb = skb_copy(*pskb, GFP_ATOMIC);
- if (!nskb)
- return 0;
- BUG_ON(skb_is_nonlinear(nskb));
-
- /* Rest of kernel will get very unhappy if we pass it a
- suddenly-orphaned skbuff */
- if ((*pskb)->sk)
- skb_set_owner_w(nskb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = nskb;
- return 1;
-}
-EXPORT_SYMBOL(skb_ip_make_writable);
-#endif /*CONFIG_INET*/
-
-/* Internal logging interface, which relies on the real
- LOG target modules */
-
-#define NF_LOG_PREFIXLEN 128
-
-static nf_logfn *nf_logging[NPROTO]; /* = NULL */
-static int reported = 0;
-static DEFINE_SPINLOCK(nf_log_lock);
-
-int nf_log_register(int pf, nf_logfn *logfn)
-{
- int ret = -EBUSY;
-
- /* Any setup of logging members must be done before
- * substituting pointer. */
- spin_lock(&nf_log_lock);
- if (!nf_logging[pf]) {
- rcu_assign_pointer(nf_logging[pf], logfn);
- ret = 0;
- }
- spin_unlock(&nf_log_lock);
- return ret;
-}
-
-void nf_log_unregister(int pf, nf_logfn *logfn)
-{
- spin_lock(&nf_log_lock);
- if (nf_logging[pf] == logfn)
- nf_logging[pf] = NULL;
- spin_unlock(&nf_log_lock);
-
- /* Give time to concurrent readers. */
- synchronize_net();
-}
-
-void nf_log_packet(int pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *fmt, ...)
-{
- va_list args;
- char prefix[NF_LOG_PREFIXLEN];
- nf_logfn *logfn;
-
- rcu_read_lock();
- logfn = rcu_dereference(nf_logging[pf]);
- if (logfn) {
- va_start(args, fmt);
- vsnprintf(prefix, sizeof(prefix), fmt, args);
- va_end(args);
- /* We must read logging before nf_logfn[pf] */
- logfn(hooknum, skb, in, out, prefix);
- } else if (!reported) {
- printk(KERN_WARNING "nf_log_packet: can\'t log yet, "
- "no backend logging module loaded in!\n");
- reported++;
- }
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(nf_log_register);
-EXPORT_SYMBOL(nf_log_unregister);
-EXPORT_SYMBOL(nf_log_packet);
-
-/* This does not belong here, but locally generated errors need it if connection
- tracking in use: without this, connection may not be in hash table, and hence
- manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
-
-void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
-{
- void (*attach)(struct sk_buff *, struct sk_buff *);
-
- if (skb->nfct && (attach = ip_ct_attach) != NULL) {
- mb(); /* Just to be sure: must be read before executing this */
- attach(new, skb);
- }
-}
-
-void __init netfilter_init(void)
-{
- int i, h;
-
- for (i = 0; i < NPROTO; i++) {
- for (h = 0; h < NF_MAX_HOOKS; h++)
- INIT_LIST_HEAD(&nf_hooks[i][h]);
- }
-}
-
-EXPORT_SYMBOL(ip_ct_attach);
-EXPORT_SYMBOL(nf_ct_attach);
-EXPORT_SYMBOL(nf_getsockopt);
-EXPORT_SYMBOL(nf_hook_slow);
-EXPORT_SYMBOL(nf_hooks);
-EXPORT_SYMBOL(nf_register_hook);
-EXPORT_SYMBOL(nf_register_queue_handler);
-EXPORT_SYMBOL(nf_register_sockopt);
-EXPORT_SYMBOL(nf_reinject);
-EXPORT_SYMBOL(nf_setsockopt);
-EXPORT_SYMBOL(nf_unregister_hook);
-EXPORT_SYMBOL(nf_unregister_queue_handler);
-EXPORT_SYMBOL(nf_unregister_sockopt);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696..a1a9a7a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -33,6 +33,7 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
#define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define MAX_RETRIES 20000
static DEFINE_SPINLOCK(skb_list_lock);
static int nr_skbs;
@@ -130,19 +131,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
*/
static void poll_napi(struct netpoll *np)
{
+ struct netpoll_info *npinfo = np->dev->npinfo;
int budget = 16;
if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
- np->poll_owner != smp_processor_id() &&
- spin_trylock(&np->poll_lock)) {
- np->rx_flags |= NETPOLL_RX_DROP;
+ npinfo->poll_owner != smp_processor_id() &&
+ spin_trylock(&npinfo->poll_lock)) {
+ npinfo->rx_flags |= NETPOLL_RX_DROP;
atomic_inc(&trapped);
np->dev->poll(np->dev, &budget);
atomic_dec(&trapped);
- np->rx_flags &= ~NETPOLL_RX_DROP;
- spin_unlock(&np->poll_lock);
+ npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+ spin_unlock(&npinfo->poll_lock);
}
}
@@ -245,16 +247,18 @@ repeat:
static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
int status;
+ struct netpoll_info *npinfo;
-repeat:
- if(!np || !np->dev || !netif_running(np->dev)) {
+ if (!np || !np->dev || !netif_running(np->dev)) {
__kfree_skb(skb);
return;
}
+ npinfo = np->dev->npinfo;
+
/* avoid recursion */
- if(np->poll_owner == smp_processor_id() ||
- np->dev->xmit_lock_owner == smp_processor_id()) {
+ if (npinfo->poll_owner == smp_processor_id() ||
+ np->dev->xmit_lock_owner == smp_processor_id()) {
if (np->drop)
np->drop(skb);
else
@@ -262,30 +266,37 @@ repeat:
return;
}
- spin_lock(&np->dev->xmit_lock);
- np->dev->xmit_lock_owner = smp_processor_id();
+ do {
+ npinfo->tries--;
+ spin_lock(&np->dev->xmit_lock);
+ np->dev->xmit_lock_owner = smp_processor_id();
- /*
- * network drivers do not expect to be called if the queue is
- * stopped.
- */
- if (netif_queue_stopped(np->dev)) {
+ /*
+ * network drivers do not expect to be called if the queue is
+ * stopped.
+ */
+ if (netif_queue_stopped(np->dev)) {
+ np->dev->xmit_lock_owner = -1;
+ spin_unlock(&np->dev->xmit_lock);
+ netpoll_poll(np);
+ udelay(50);
+ continue;
+ }
+
+ status = np->dev->hard_start_xmit(skb, np->dev);
np->dev->xmit_lock_owner = -1;
spin_unlock(&np->dev->xmit_lock);
- netpoll_poll(np);
- goto repeat;
- }
-
- status = np->dev->hard_start_xmit(skb, np->dev);
- np->dev->xmit_lock_owner = -1;
- spin_unlock(&np->dev->xmit_lock);
+ /* success */
+ if(!status) {
+ npinfo->tries = MAX_RETRIES; /* reset */
+ return;
+ }
- /* transmit busy */
- if(status) {
+ /* transmit busy */
netpoll_poll(np);
- goto repeat;
- }
+ udelay(50);
+ } while (npinfo->tries > 0);
}
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -341,14 +352,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
static void arp_reply(struct sk_buff *skb)
{
+ struct netpoll_info *npinfo = skb->dev->npinfo;
struct arphdr *arp;
unsigned char *arp_ptr;
int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
u32 sip, tip;
struct sk_buff *send_skb;
- struct netpoll *np = skb->dev->np;
+ struct netpoll *np = NULL;
- if (!np) return;
+ if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+ np = npinfo->rx_np;
+ if (!np)
+ return;
/* No arp on this interface */
if (skb->dev->flags & IFF_NOARP)
@@ -429,9 +444,9 @@ int __netpoll_rx(struct sk_buff *skb)
int proto, len, ulen;
struct iphdr *iph;
struct udphdr *uh;
- struct netpoll *np = skb->dev->np;
+ struct netpoll *np = skb->dev->npinfo->rx_np;
- if (!np->rx_hook)
+ if (!np)
goto out;
if (skb->dev->type != ARPHRD_ETHER)
goto out;
@@ -611,9 +626,8 @@ int netpoll_setup(struct netpoll *np)
{
struct net_device *ndev = NULL;
struct in_device *in_dev;
-
- np->poll_lock = SPIN_LOCK_UNLOCKED;
- np->poll_owner = -1;
+ struct netpoll_info *npinfo;
+ unsigned long flags;
if (np->dev_name)
ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +638,19 @@ int netpoll_setup(struct netpoll *np)
}
np->dev = ndev;
- ndev->np = np;
+ if (!ndev->npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo)
+ goto release;
+
+ npinfo->rx_flags = 0;
+ npinfo->rx_np = NULL;
+ npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+ npinfo->poll_owner = -1;
+ npinfo->tries = MAX_RETRIES;
+ npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
+ } else
+ npinfo = ndev->npinfo;
if (!ndev->poll_controller) {
printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -692,13 +718,27 @@ int netpoll_setup(struct netpoll *np)
np->name, HIPQUAD(np->local_ip));
}
- if(np->rx_hook)
- np->rx_flags = NETPOLL_RX_ENABLED;
+ if (np->rx_hook) {
+ spin_lock_irqsave(&npinfo->rx_lock, flags);
+ npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+ npinfo->rx_np = np;
+ spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ }
+
+ /* fill up the skb queue */
+ refill_skbs();
+
+ /* last thing to do is link it to the net device structure */
+ ndev->npinfo = npinfo;
+
+ /* avoid racing with NAPI reading npinfo */
+ synchronize_rcu();
return 0;
release:
- ndev->np = NULL;
+ if (!ndev->npinfo)
+ kfree(npinfo);
np->dev = NULL;
dev_put(ndev);
return -1;
@@ -706,9 +746,20 @@ int netpoll_setup(struct netpoll *np)
void netpoll_cleanup(struct netpoll *np)
{
- if (np->dev)
- np->dev->np = NULL;
- dev_put(np->dev);
+ struct netpoll_info *npinfo;
+ unsigned long flags;
+
+ if (np->dev) {
+ npinfo = np->dev->npinfo;
+ if (npinfo && npinfo->rx_np == np) {
+ spin_lock_irqsave(&npinfo->rx_lock, flags);
+ npinfo->rx_np = NULL;
+ npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+ spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ }
+ dev_put(np->dev);
+ }
+
np->dev = NULL;
}
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06b..8eb083b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
#include <asm/timex.h>
-#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n"
+#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
/* #define PG_DEBUG(a) a */
#define PG_DEBUG(a)
@@ -363,7 +363,7 @@ struct pktgen_thread {
* All Rights Reserved.
*
*/
-inline static s64 divremdi3(s64 x, s64 y, int type)
+static inline s64 divremdi3(s64 x, s64 y, int type)
{
u64 a = (x < 0) ? -x : x;
u64 b = (y < 0) ? -y : y;
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
struct iphdr *iph;
struct pktgen_hdr *pgh = NULL;
+ /* Update any of the values, used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+
skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
- /* Update any of the values, used when we're incrementing various
- * fields.
- */
- mod_cur_headers(pkt_dev);
-
memcpy(eth, pkt_dev->hh, 12);
*(u16*)&eth[12] = __constant_htons(ETH_P_IP);
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
int datalen;
struct ipv6hdr *iph;
struct pktgen_hdr *pgh = NULL;
-
+
+ /* Update any of the values, used when we're incrementing various
+ * fields.
+ */
+ mod_cur_headers(pkt_dev);
+
skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
if (!skb) {
sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
-
- /* Update any of the values, used when we're incrementing various
- * fields.
- */
- mod_cur_headers(pkt_dev);
-
-
memcpy(eth, pkt_dev->hh, 12);
*(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
-
-
+
datalen = pkt_dev->cur_pkt_size-14-
sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 0000000..b8203de
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,90 @@
+/*
+ * NET Generic infrastructure for Network protocols.
+ *
+ * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * From code originally in include/net/tcp.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <net/request_sock.h>
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80bytes on a 32bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show, that
+ * it is absolutely not enough even at 100conn/sec. 256 cures most
+ * of problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Further increasing requires to change hash table size.
+ */
+int sysctl_max_syn_backlog = 256;
+
+int reqsk_queue_alloc(struct request_sock_queue *queue,
+ const int nr_table_entries)
+{
+ const int lopt_size = sizeof(struct listen_sock) +
+ nr_table_entries * sizeof(struct request_sock *);
+ struct listen_sock *lopt = kmalloc(lopt_size, GFP_KERNEL);
+
+ if (lopt == NULL)
+ return -ENOMEM;
+
+ memset(lopt, 0, lopt_size);
+
+ for (lopt->max_qlen_log = 6;
+ (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+ lopt->max_qlen_log++);
+
+ get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
+ rwlock_init(&queue->syn_wait_lock);
+ queue->rskq_accept_head = queue->rskq_accept_head = NULL;
+ queue->rskq_defer_accept = 0;
+ lopt->nr_table_entries = nr_table_entries;
+
+ write_lock_bh(&queue->syn_wait_lock);
+ queue->listen_opt = lopt;
+ write_unlock_bh(&queue->syn_wait_lock);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(reqsk_queue_alloc);
+
+void reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+ /* make all the listen_opt local to us */
+ struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+
+ if (lopt->qlen != 0) {
+ int i;
+
+ for (i = 0; i < lopt->nr_table_entries; i++) {
+ struct request_sock *req;
+
+ while ((req = lopt->syn_table[i]) != NULL) {
+ lopt->syn_table[i] = req->dl_next;
+ lopt->qlen--;
+ reqsk_free(req);
+ }
+ }
+ }
+
+ BUG_TRAP(lopt->qlen == 0);
+ kfree(lopt);
+}
+
+EXPORT_SYMBOL(reqsk_queue_destroy);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 00caf4b..9bed756 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -100,6 +100,7 @@ static const int rtm_min[RTM_NR_FAMILIES] =
[RTM_FAM(RTM_NEWPREFIX)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
[RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
[RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+ [RTM_FAM(RTM_NEWNEIGHTBL)] = NLMSG_LENGTH(sizeof(struct ndtmsg)),
};
static const int rta_max[RTM_NR_FAMILIES] =
@@ -113,6 +114,7 @@ static const int rta_max[RTM_NR_FAMILIES] =
[RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX,
[RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX,
[RTM_FAM(RTM_NEWACTION)] = TCAA_MAX,
+ [RTM_FAM(RTM_NEWNEIGHTBL)] = NDTA_MAX,
};
void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
@@ -124,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
rta->rta_type = attrtype;
rta->rta_len = size;
memcpy(RTA_DATA(rta), data, attrlen);
+ memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
}
size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -145,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
{
int err = 0;
- NETLINK_CB(skb).dst_groups = group;
+ NETLINK_CB(skb).dst_group = group;
if (echo)
atomic_inc(&skb->users);
netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
@@ -176,16 +179,17 @@ rtattr_failure:
static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
- int type, u32 pid, u32 seq, u32 change)
+ int type, u32 pid, u32 seq, u32 change,
+ unsigned int flags)
{
struct ifinfomsg *r;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
r = NLMSG_DATA(nlh);
r->ifi_family = AF_UNSPEC;
+ r->__ifi_pad = 0;
r->ifi_type = dev->type;
r->ifi_index = dev->ifindex;
r->ifi_flags = dev_get_flags(dev);
@@ -273,7 +277,10 @@ static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *c
for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
if (idx < s_idx)
continue;
- if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0)
+ if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, 0,
+ NLM_F_MULTI) <= 0)
break;
}
read_unlock(&dev_base_lock);
@@ -447,12 +454,12 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
if (!skb)
return;
- if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) {
+ if (rtnetlink_fill_ifinfo(skb, dev, type, current->pid, 0, change, 0) < 0) {
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL);
+ NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
}
static int rtnetlink_done(struct netlink_callback *cb)
@@ -649,14 +656,16 @@ static void rtnetlink_rcv(struct sock *sk, int len)
static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
{
- [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
- [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink },
- [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
- [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
- [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
- [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
- [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info },
- [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
+ [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
+ [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink },
+ [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
+ [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
+ [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add },
+ [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete },
+ [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info },
+ [RTM_GETRULE - RTM_BASE] = { .dumpit = rtnetlink_dump_all },
+ [RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info },
+ [RTM_SETNEIGHTBL - RTM_BASE] = { .doit = neightbl_set },
};
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
@@ -699,7 +708,8 @@ void __init rtnetlink_init(void)
if (!rta_buf)
panic("rtnetlink_init: cannot allocate rta_buf\n");
- rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv);
+ rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
+ THIS_MODULE);
if (rtnl == NULL)
panic("rtnetlink_init: cannot initialize rtnetlink\n");
netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f65b3de..f80a287 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -68,7 +68,10 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-static kmem_cache_t *skbuff_head_cache;
+static kmem_cache_t *skbuff_head_cache __read_mostly;
+static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+
+struct timeval __read_mostly skb_tv_base;
/*
* Keep out-of-line to prevent kernel bloat.
@@ -118,7 +121,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
*/
/**
- * alloc_skb - allocate a network buffer
+ * __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
*
@@ -129,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
-struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask,
+ int fclone)
{
struct sk_buff *skb;
u8 *data;
/* Get the HEAD */
- skb = kmem_cache_alloc(skbuff_head_cache,
- gfp_mask & ~__GFP_DMA);
+ if (fclone)
+ skb = kmem_cache_alloc(skbuff_fclone_cache,
+ gfp_mask & ~__GFP_DMA);
+ else
+ skb = kmem_cache_alloc(skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA);
+
if (!skb)
goto out;
@@ -153,7 +162,15 @@ struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
skb->data = data;
skb->tail = data;
skb->end = data + size;
+ if (fclone) {
+ struct sk_buff *child = skb + 1;
+ atomic_t *fclone_ref = (atomic_t *) (child + 1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ atomic_set(fclone_ref, 1);
+ child->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
atomic_set(&(skb_shinfo(skb)->dataref), 1);
skb_shinfo(skb)->nr_frags = 0;
skb_shinfo(skb)->tso_size = 0;
@@ -182,7 +199,8 @@ nodata:
* %GFP_ATOMIC.
*/
struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
- unsigned int size, int gfp_mask)
+ unsigned int size,
+ unsigned int __nocast gfp_mask)
{
struct sk_buff *skb;
u8 *data;
@@ -265,8 +283,34 @@ void skb_release_data(struct sk_buff *skb)
*/
void kfree_skbmem(struct sk_buff *skb)
{
+ struct sk_buff *other;
+ atomic_t *fclone_ref;
+
skb_release_data(skb);
- kmem_cache_free(skbuff_head_cache, skb);
+ switch (skb->fclone) {
+ case SKB_FCLONE_UNAVAILABLE:
+ kmem_cache_free(skbuff_head_cache, skb);
+ break;
+
+ case SKB_FCLONE_ORIG:
+ fclone_ref = (atomic_t *) (skb + 2);
+ if (atomic_dec_and_test(fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, skb);
+ break;
+
+ case SKB_FCLONE_CLONE:
+ fclone_ref = (atomic_t *) (skb + 1);
+ other = skb - 1;
+
+ /* The clone portion is available for
+ * fast-cloning again.
+ */
+ skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+ if (atomic_dec_and_test(fclone_ref))
+ kmem_cache_free(skbuff_fclone_cache, other);
+ break;
+ };
}
/**
@@ -280,8 +324,6 @@ void kfree_skbmem(struct sk_buff *skb)
void __kfree_skb(struct sk_buff *skb)
{
- BUG_ON(skb->list != NULL);
-
dst_release(skb->dst);
#ifdef CONFIG_XFRM
secpath_put(skb->sp);
@@ -301,7 +343,6 @@ void __kfree_skb(struct sk_buff *skb)
skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
- skb->tc_classid = 0;
#endif
#endif
@@ -322,21 +363,29 @@ void __kfree_skb(struct sk_buff *skb)
* %GFP_ATOMIC.
*/
-struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
- struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
-
- if (!n)
- return NULL;
+ struct sk_buff *n;
+
+ n = skb + 1;
+ if (skb->fclone == SKB_FCLONE_ORIG &&
+ n->fclone == SKB_FCLONE_UNAVAILABLE) {
+ atomic_t *fclone_ref = (atomic_t *) (n + 1);
+ n->fclone = SKB_FCLONE_CLONE;
+ atomic_inc(fclone_ref);
+ } else {
+ n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ if (!n)
+ return NULL;
+ n->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
#define C(x) n->x = skb->x
n->next = n->prev = NULL;
- n->list = NULL;
n->sk = NULL;
- C(stamp);
+ C(tstamp);
C(dev);
- C(real_dev);
C(h);
C(nh);
C(mac);
@@ -357,33 +406,24 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
C(ip_summed);
C(priority);
C(protocol);
- C(security);
n->destructor = NULL;
#ifdef CONFIG_NETFILTER
C(nfmark);
- C(nfcache);
C(nfct);
nf_conntrack_get(skb->nfct);
C(nfctinfo);
-#ifdef CONFIG_NETFILTER_DEBUG
- C(nf_debug);
-#endif
#ifdef CONFIG_BRIDGE_NETFILTER
C(nf_bridge);
nf_bridge_get(skb->nf_bridge);
#endif
#endif /*CONFIG_NETFILTER*/
-#if defined(CONFIG_HIPPI)
- C(private);
-#endif
#ifdef CONFIG_NET_SCHED
C(tc_index);
#ifdef CONFIG_NET_CLS_ACT
n->tc_verd = SET_TC_VERD(skb->tc_verd,0);
- n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd);
- n->tc_verd = CLR_TC_MUNGED(skb->tc_verd);
+ n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
+ n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
C(input_dev);
- C(tc_classid);
#endif
#endif
@@ -407,10 +447,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
*/
unsigned long offset = new->data - old->data;
- new->list = NULL;
new->sk = NULL;
new->dev = old->dev;
- new->real_dev = old->real_dev;
new->priority = old->priority;
new->protocol = old->protocol;
new->dst = dst_clone(old->dst);
@@ -422,19 +460,15 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->mac.raw = old->mac.raw + offset;
memcpy(new->cb, old->cb, sizeof(old->cb));
new->local_df = old->local_df;
+ new->fclone = SKB_FCLONE_UNAVAILABLE;
new->pkt_type = old->pkt_type;
- new->stamp = old->stamp;
+ new->tstamp = old->tstamp;
new->destructor = NULL;
- new->security = old->security;
#ifdef CONFIG_NETFILTER
new->nfmark = old->nfmark;
- new->nfcache = old->nfcache;
new->nfct = old->nfct;
nf_conntrack_get(old->nfct);
new->nfctinfo = old->nfctinfo;
-#ifdef CONFIG_NETFILTER_DEBUG
- new->nf_debug = old->nf_debug;
-#endif
#ifdef CONFIG_BRIDGE_NETFILTER
new->nf_bridge = old->nf_bridge;
nf_bridge_get(old->nf_bridge);
@@ -468,7 +502,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
* header is going to be modified. Use pskb_copy() instead.
*/
-struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
int headerlen = skb->data - skb->head;
/*
@@ -507,7 +541,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
{
/*
* Allocate the copy buffer
@@ -565,7 +599,8 @@ out:
* reloaded after call to this function.
*/
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+ unsigned int __nocast gfp_mask)
{
int i;
u8 *data;
@@ -655,7 +690,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
* only by netfilter in the cases when checksum is recalculated? --ANK
*/
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
- int newheadroom, int newtailroom, int gfp_mask)
+ int newheadroom, int newtailroom,
+ unsigned int __nocast gfp_mask)
{
/*
* Allocate the copy buffer
@@ -1349,50 +1385,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
__skb_queue_tail(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
+
/**
* skb_unlink - remove a buffer from a list
* @skb: buffer to remove
+ * @list: list to use
*
- * Place a packet after a given packet in a list. The list locks are taken
- * and this function is atomic with respect to other list locked calls
+ * Remove a packet from a list. The list locks are taken and this
+ * function is atomic with respect to other list locked calls
*
- * Works even without knowing the list it is sitting on, which can be
- * handy at times. It also means that THE LIST MUST EXIST when you
- * unlink. Thus a list must have its contents unlinked before it is
- * destroyed.
+ * You must know what list the SKB is on.
*/
-void skb_unlink(struct sk_buff *skb)
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
- struct sk_buff_head *list = skb->list;
-
- if (list) {
- unsigned long flags;
+ unsigned long flags;
- spin_lock_irqsave(&list->lock, flags);
- if (skb->list == list)
- __skb_unlink(skb, skb->list);
- spin_unlock_irqrestore(&list->lock, flags);
- }
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_unlink(skb, list);
+ spin_unlock_irqrestore(&list->lock, flags);
}
-
/**
* skb_append - append a buffer
* @old: buffer to insert after
* @newsk: buffer to insert
+ * @list: list to use
*
* Place a packet after a given packet in a list. The list locks are taken
* and this function is atomic with respect to other list locked calls.
* A buffer cannot be placed on two lists at the same time.
*/
-
-void skb_append(struct sk_buff *old, struct sk_buff *newsk)
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
unsigned long flags;
- spin_lock_irqsave(&old->list->lock, flags);
- __skb_append(old, newsk);
- spin_unlock_irqrestore(&old->list->lock, flags);
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_append(old, newsk, list);
+ spin_unlock_irqrestore(&list->lock, flags);
}
@@ -1400,19 +1429,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk)
* skb_insert - insert a buffer
* @old: buffer to insert before
* @newsk: buffer to insert
+ * @list: list to use
+ *
+ * Place a packet before a given packet in a list. The list locks are
+ * taken and this function is atomic with respect to other list locked
+ * calls.
*
- * Place a packet before a given packet in a list. The list locks are taken
- * and this function is atomic with respect to other list locked calls
* A buffer cannot be placed on two lists at the same time.
*/
-
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk)
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
unsigned long flags;
- spin_lock_irqsave(&old->list->lock, flags);
- __skb_insert(newsk, old->prev, old, old->list);
- spin_unlock_irqrestore(&old->list->lock, flags);
+ spin_lock_irqsave(&list->lock, flags);
+ __skb_insert(newsk, old->prev, old, list);
+ spin_unlock_irqrestore(&list->lock, flags);
}
#if 0
@@ -1506,6 +1537,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
skb_split_no_header(skb, skb1, len, pos);
}
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable
+ *
+ * Initializes the specified state variable. Must be called before
+ * invoking skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+ unsigned int to, struct skb_seq_state *st)
+{
+ st->lower_offset = from;
+ st->upper_offset = to;
+ st->root_skb = st->cur_skb = skb;
+ st->frag_idx = st->stepped_offset = 0;
+ st->frag_data = NULL;
+}
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at &consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to &data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. &consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Note: The size of each block of data returned can be arbitary,
+ * this limitation is the cost for zerocopy seqeuental
+ * reads of potentially non linear data.
+ *
+ * Note: Fragment lists within fragments are not implemented
+ * at the moment, state->root_skb could be replaced with
+ * a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+ struct skb_seq_state *st)
+{
+ unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+ skb_frag_t *frag;
+
+ if (unlikely(abs_offset >= st->upper_offset))
+ return 0;
+
+next_skb:
+ block_limit = skb_headlen(st->cur_skb);
+
+ if (abs_offset < block_limit) {
+ *data = st->cur_skb->data + abs_offset;
+ return block_limit - abs_offset;
+ }
+
+ if (st->frag_idx == 0 && !st->frag_data)
+ st->stepped_offset += skb_headlen(st->cur_skb);
+
+ while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+ block_limit = frag->size + st->stepped_offset;
+
+ if (abs_offset < block_limit) {
+ if (!st->frag_data)
+ st->frag_data = kmap_skb_frag(frag);
+
+ *data = (u8 *) st->frag_data + frag->page_offset +
+ (abs_offset - st->stepped_offset);
+
+ return block_limit - abs_offset;
+ }
+
+ if (st->frag_data) {
+ kunmap_skb_frag(st->frag_data);
+ st->frag_data = NULL;
+ }
+
+ st->frag_idx++;
+ st->stepped_offset += frag->size;
+ }
+
+ if (st->cur_skb->next) {
+ st->cur_skb = st->cur_skb->next;
+ st->frag_idx = 0;
+ goto next_skb;
+ } else if (st->root_skb == st->cur_skb &&
+ skb_shinfo(st->root_skb)->frag_list) {
+ st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+ goto next_skb;
+ }
+
+ return 0;
+}
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Must be called if skb_seq_read() was not called until it
+ * returned 0.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+ if (st->frag_data)
+ kunmap_skb_frag(st->frag_data);
+}
+
+#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
+
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+ struct ts_config *conf,
+ struct ts_state *state)
+{
+ return skb_seq_read(offset, text, TS_SKB_CB(state));
+}
+
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{
+ skb_abort_seq_read(TS_SKB_CB(state));
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration
+ * @state: uninitialized textsearch state variable
+ *
+ * Finds a pattern in the skb data according to the specified
+ * textsearch configuration. Use textsearch_next() to retrieve
+ * subsequent occurrences of the pattern. Returns the offset
+ * to the first occurrence or UINT_MAX if no match was found.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+ unsigned int to, struct ts_config *config,
+ struct ts_state *state)
+{
+ config->get_next_block = skb_ts_get_next_block;
+ config->finish = skb_ts_finish;
+
+ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
+
+ return textsearch_find(config, state);
+}
+
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1515,12 +1699,23 @@ void __init skb_init(void)
NULL, NULL);
if (!skbuff_head_cache)
panic("cannot create skbuff cache");
+
+ skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ (2*sizeof(struct sk_buff)) +
+ sizeof(atomic_t),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (!skbuff_fclone_cache)
+ panic("cannot create skbuff cache");
+
+ do_gettimeofday(&skb_tv_base);
}
EXPORT_SYMBOL(___pskb_trim);
EXPORT_SYMBOL(__kfree_skb);
EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(alloc_skb);
+EXPORT_SYMBOL(__alloc_skb);
EXPORT_SYMBOL(pskb_copy);
EXPORT_SYMBOL(pskb_expand_head);
EXPORT_SYMBOL(skb_checksum);
@@ -1544,3 +1739,8 @@ EXPORT_SYMBOL(skb_queue_tail);
EXPORT_SYMBOL(skb_unlink);
EXPORT_SYMBOL(skb_append);
EXPORT_SYMBOL(skb_split);
+EXPORT_SYMBOL(skb_prepare_seq_read);
+EXPORT_SYMBOL(skb_seq_read);
+EXPORT_SYMBOL(skb_abort_seq_read);
+EXPORT_SYMBOL(skb_find_text);
+EXPORT_SYMBOL(skb_tv_base);
diff --git a/net/core/sock.c b/net/core/sock.c
index 96e00b0..ac63b56 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,6 +118,7 @@
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
@@ -205,13 +206,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
*/
#ifdef SO_DONTLINGER /* Compatibility item... */
- switch (optname) {
- case SO_DONTLINGER:
- sock_reset_flag(sk, SOCK_LINGER);
- return 0;
+ if (optname == SO_DONTLINGER) {
+ lock_sock(sk);
+ sock_reset_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+ return 0;
}
-#endif
-
+#endif
+
if(optlen<sizeof(int))
return(-EINVAL);
@@ -258,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (val > sysctl_wmem_max)
val = sysctl_wmem_max;
-
+set_sndbuf:
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
if ((val * 2) < SOCK_MIN_SNDBUF)
sk->sk_sndbuf = SOCK_MIN_SNDBUF;
@@ -272,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_write_space(sk);
break;
+ case SO_SNDBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_sndbuf;
+
case SO_RCVBUF:
/* Don't error on this BSD doesn't and if you think
about it this is right. Otherwise apps have to
@@ -280,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (val > sysctl_rmem_max)
val = sysctl_rmem_max;
-
+set_rcvbuf:
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
/* FIXME: is this lower bound the right one? */
if ((val * 2) < SOCK_MIN_RCVBUF)
@@ -289,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_rcvbuf = val * 2;
break;
+ case SO_RCVBUFFORCE:
+ if (!capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+ goto set_rcvbuf;
+
case SO_KEEPALIVE:
#ifdef CONFIG_INET
if (sk->sk_protocol == IPPROTO_TCP)
@@ -325,11 +341,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sock_reset_flag(sk, SOCK_LINGER);
else {
#if (BITS_PER_LONG == 32)
- if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+ if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
else
#endif
- sk->sk_lingertime = ling.l_linger * HZ;
+ sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
sock_set_flag(sk, SOCK_LINGER);
}
break;
@@ -621,7 +637,8 @@ lenout:
* @prot: struct proto associated with this new sock instance
* @zero_it: if we should zero the newly allocated sock
*/
-struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
+struct sock *sk_alloc(int family, unsigned int __nocast priority,
+ struct proto *prot, int zero_it)
{
struct sock *sk = NULL;
kmem_cache_t *slab = prot->slab;
@@ -683,6 +700,80 @@ void sk_free(struct sock *sk)
module_put(owner);
}
+struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
+{
+ struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+
+ if (newsk != NULL) {
+ struct sk_filter *filter;
+
+ memcpy(newsk, sk, sk->sk_prot->obj_size);
+
+ /* SANITY */
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+
+ atomic_set(&newsk->sk_rmem_alloc, 0);
+ atomic_set(&newsk->sk_wmem_alloc, 0);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ skb_queue_head_init(&newsk->sk_receive_queue);
+ skb_queue_head_init(&newsk->sk_write_queue);
+
+ rwlock_init(&newsk->sk_dst_lock);
+ rwlock_init(&newsk->sk_callback_lock);
+
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_send_head = NULL;
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+ sock_reset_flag(newsk, SOCK_DONE);
+ skb_queue_head_init(&newsk->sk_error_queue);
+
+ filter = newsk->sk_filter;
+ if (filter != NULL)
+ sk_filter_charge(newsk, filter);
+
+ if (unlikely(xfrm_sk_clone_policy(newsk))) {
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
+ }
+
+ newsk->sk_err = 0;
+ newsk->sk_priority = 0;
+ atomic_set(&newsk->sk_refcnt, 2);
+
+ /*
+ * Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ newsk->sk_socket = NULL;
+ newsk->sk_sleep = NULL;
+
+ if (newsk->sk_prot->sockets_allocated)
+ atomic_inc(newsk->sk_prot->sockets_allocated);
+ }
+out:
+ return newsk;
+}
+
+EXPORT_SYMBOL_GPL(sk_clone);
+
void __init sk_init(void)
{
if (num_physpages <= 4096) {
@@ -749,7 +840,8 @@ unsigned long sock_i_ino(struct sock *sk)
/*
* Allocate a skb from the socket's send buffer.
*/
-struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+ unsigned int __nocast priority)
{
if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff * skb = alloc_skb(size, priority);
@@ -764,7 +856,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
/*
* Allocate a skb from the socket's receive buffer.
*/
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+ unsigned int __nocast priority)
{
if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
@@ -779,7 +872,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
/*
* Allocate a memory block from the socket's option memory buffer.
*/
-void *sock_kmalloc(struct sock *sk, int size, int priority)
+void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
{
if ((unsigned)size <= sysctl_optmem_max &&
atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
@@ -1348,11 +1441,7 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&sk->sk_refcnt) != 1)
- printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
- sk, atomic_read(&sk->sk_refcnt));
-#endif
+ sk_refcnt_debug_release(sk);
sock_put(sk);
}
@@ -1363,6 +1452,8 @@ static LIST_HEAD(proto_list);
int proto_register(struct proto *prot, int alloc_slab)
{
+ char *request_sock_slab_name = NULL;
+ char *timewait_sock_slab_name;
int rc = -ENOBUFS;
if (alloc_slab) {
@@ -1374,6 +1465,42 @@ int proto_register(struct proto *prot, int alloc_slab)
prot->name);
goto out;
}
+
+ if (prot->rsk_prot != NULL) {
+ static const char mask[] = "request_sock_%s";
+
+ request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+ if (request_sock_slab_name == NULL)
+ goto out_free_sock_slab;
+
+ sprintf(request_sock_slab_name, mask, prot->name);
+ prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+ prot->rsk_prot->obj_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+ if (prot->rsk_prot->slab == NULL) {
+ printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+ prot->name);
+ goto out_free_request_sock_slab_name;
+ }
+ }
+
+ if (prot->twsk_obj_size) {
+ static const char mask[] = "tw_sock_%s";
+
+ timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+
+ if (timewait_sock_slab_name == NULL)
+ goto out_free_request_sock_slab;
+
+ sprintf(timewait_sock_slab_name, mask, prot->name);
+ prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
+ prot->twsk_obj_size,
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (prot->twsk_slab == NULL)
+ goto out_free_timewait_sock_slab_name;
+ }
}
write_lock(&proto_list_lock);
@@ -1382,6 +1509,19 @@ int proto_register(struct proto *prot, int alloc_slab)
rc = 0;
out:
return rc;
+out_free_timewait_sock_slab_name:
+ kfree(timewait_sock_slab_name);
+out_free_request_sock_slab:
+ if (prot->rsk_prot && prot->rsk_prot->slab) {
+ kmem_cache_destroy(prot->rsk_prot->slab);
+ prot->rsk_prot->slab = NULL;
+ }
+out_free_request_sock_slab_name:
+ kfree(request_sock_slab_name);
+out_free_sock_slab:
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ goto out;
}
EXPORT_SYMBOL(proto_register);
@@ -1389,14 +1529,29 @@ EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
write_lock(&proto_list_lock);
+ list_del(&prot->node);
+ write_unlock(&proto_list_lock);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
}
- list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
+ const char *name = kmem_cache_name(prot->rsk_prot->slab);
+
+ kmem_cache_destroy(prot->rsk_prot->slab);
+ kfree(name);
+ prot->rsk_prot->slab = NULL;
+ }
+
+ if (prot->twsk_slab != NULL) {
+ const char *name = kmem_cache_name(prot->twsk_slab);
+
+ kmem_cache_destroy(prot->twsk_slab);
+ kfree(name);
+ prot->twsk_slab = NULL;
+ }
}
EXPORT_SYMBOL(proto_unregister);
@@ -1563,8 +1718,8 @@ EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
-#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_optmem_max);
+#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646c..2f278c8 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -9,45 +9,23 @@
#include <linux/sysctl.h>
#include <linux/config.h>
#include <linux/module.h>
+#include <linux/socket.h>
+#include <net/sock.h>
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
-extern int net_msg_cost;
-extern int net_msg_burst;
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
-extern __u32 sysctl_wmem_default;
-extern __u32 sysctl_rmem_default;
extern int sysctl_core_destroy_delay;
-extern int sysctl_optmem_max;
-extern int sysctl_somaxconn;
#ifdef CONFIG_NET_DIVERT
extern char sysctl_divert_version[];
#endif /* CONFIG_NET_DIVERT */
-/*
- * This strdup() is used for creating copies of network
- * device names to be handed over to sysctl.
- */
-
-char *net_sysctl_strdup(const char *s)
-{
- char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
- if (rv)
- strcpy(rv, s);
- return rv;
-}
-
ctl_table core_table[] = {
#ifdef CONFIG_NET
{
@@ -99,38 +77,6 @@ ctl_table core_table[] = {
.proc_handler = &proc_dointvec
},
{
- .ctl_name = NET_CORE_NO_CONG_THRESH,
- .procname = "no_cong_thresh",
- .data = &no_cong_thresh,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_NO_CONG,
- .procname = "no_cong",
- .data = &no_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_LO_CONG,
- .procname = "lo_cong",
- .data = &lo_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
- .ctl_name = NET_CORE_MOD_CONG,
- .procname = "mod_cong",
- .data = &mod_cong,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec
- },
- {
.ctl_name = NET_CORE_MSG_COST,
.procname = "message_cost",
.data = &net_msg_cost,
@@ -174,9 +120,15 @@ ctl_table core_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec
},
+ {
+ .ctl_name = NET_CORE_BUDGET,
+ .procname = "netdev_budget",
+ .data = &netdev_budget,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
{ .ctl_name = 0 }
};
-EXPORT_SYMBOL(net_sysctl_strdup);
-
#endif
diff --git a/net/core/utils.c b/net/core/utils.c
index e11a865..7b5970f 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -16,17 +16,19 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/inet.h>
#include <linux/mm.h>
+#include <linux/net.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <linux/init.h>
+#include <asm/byteorder.h>
#include <asm/system.h>
#include <asm/uaccess.h>
-
/*
This is a maximally equidistributed combined Tausworthe generator
based on code from GNU Scientific Library 1.5 (30 Jun 2004)
@@ -153,3 +155,38 @@ int net_ratelimit(void)
EXPORT_SYMBOL(net_random);
EXPORT_SYMBOL(net_ratelimit);
EXPORT_SYMBOL(net_srandom);
+
+/*
+ * Convert an ASCII string to binary IP.
+ * This is outside of net/ipv4/ because various code that uses IP addresses
+ * is otherwise not dependent on the TCP/IP stack.
+ */
+
+__u32 in_aton(const char *str)
+{
+ unsigned long l;
+ unsigned int val;
+ int i;
+
+ l = 0;
+ for (i = 0; i < 4; i++)
+ {
+ l <<= 8;
+ if (*str != '\0')
+ {
+ val = 0;
+ while (*str != '\0' && *str != '.')
+ {
+ val *= 10;
+ val += *str - '0';
+ str++;
+ }
+ l |= val;
+ if (*str != '\0')
+ str++;
+ }
+ }
+ return(htonl(l));
+}
+
+EXPORT_SYMBOL(in_aton);
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 750cc5d..d17f158 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -2,7 +2,7 @@
* This file implement the Wireless Extensions APIs.
*
* Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved.
+ * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
*
* (As all part of the Linux kernel, this file is GPL)
*/
@@ -58,6 +58,13 @@
* o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus
* Based on patch from Pavel Roskin <proski@gnu.org> :
* o Fix kernel data leak to user space in private handler handling
+ *
+ * v7 - 18.3.05 - Jean II
+ * o Remove (struct iw_point *)->pointer from events and streams
+ * o Remove spy_offset from struct iw_handler_def
+ * o Start deprecating dev->get_wireless_stats, output a warning
+ * o If IW_QUAL_DBM is set, show dBm values in /proc/net/wireless
+ * o Don't loose INVALID/DBM flags when clearing UPDATED flags (iwstats)
*/
/***************************** INCLUDES *****************************/
@@ -187,6 +194,12 @@ static const struct iw_ioctl_description standard_ioctl[] = {
.header_type = IW_HEADER_TYPE_ADDR,
.flags = IW_DESCR_FLAG_DUMP,
},
+ [SIOCSIWMLME - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_mlme),
+ .max_tokens = sizeof(struct iw_mlme),
+ },
[SIOCGIWAPLIST - SIOCIWFIRST] = {
.header_type = IW_HEADER_TYPE_POINT,
.token_size = sizeof(struct sockaddr) +
@@ -195,7 +208,10 @@ static const struct iw_ioctl_description standard_ioctl[] = {
.flags = IW_DESCR_FLAG_NOMAX,
},
[SIOCSIWSCAN - SIOCIWFIRST] = {
- .header_type = IW_HEADER_TYPE_PARAM,
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = 0,
+ .max_tokens = sizeof(struct iw_scan_req),
},
[SIOCGIWSCAN - SIOCIWFIRST] = {
.header_type = IW_HEADER_TYPE_POINT,
@@ -273,6 +289,42 @@ static const struct iw_ioctl_description standard_ioctl[] = {
[SIOCGIWPOWER - SIOCIWFIRST] = {
.header_type = IW_HEADER_TYPE_PARAM,
},
+ [SIOCSIWGENIE - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [SIOCGIWGENIE - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [SIOCSIWAUTH - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [SIOCGIWAUTH - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_PARAM,
+ },
+ [SIOCSIWENCODEEXT - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_encode_ext),
+ .max_tokens = sizeof(struct iw_encode_ext) +
+ IW_ENCODING_TOKEN_MAX,
+ },
+ [SIOCGIWENCODEEXT - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_encode_ext),
+ .max_tokens = sizeof(struct iw_encode_ext) +
+ IW_ENCODING_TOKEN_MAX,
+ },
+ [SIOCSIWPMKSA - SIOCIWFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .min_tokens = sizeof(struct iw_pmksa),
+ .max_tokens = sizeof(struct iw_pmksa),
+ },
};
static const int standard_ioctl_num = (sizeof(standard_ioctl) /
sizeof(struct iw_ioctl_description));
@@ -299,6 +351,31 @@ static const struct iw_ioctl_description standard_event[] = {
[IWEVEXPIRED - IWEVFIRST] = {
.header_type = IW_HEADER_TYPE_ADDR,
},
+ [IWEVGENIE - IWEVFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IWEVMICHAELMICFAILURE - IWEVFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = sizeof(struct iw_michaelmicfailure),
+ },
+ [IWEVASSOCREQIE - IWEVFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IWEVASSOCRESPIE - IWEVFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = IW_GENERIC_IE_MAX,
+ },
+ [IWEVPMKIDCAND - IWEVFIRST] = {
+ .header_type = IW_HEADER_TYPE_POINT,
+ .token_size = 1,
+ .max_tokens = sizeof(struct iw_pmkid_cand),
+ },
};
static const int standard_event_num = (sizeof(standard_event) /
sizeof(struct iw_ioctl_description));
@@ -376,10 +453,14 @@ static inline struct iw_statistics *get_wireless_stats(struct net_device *dev)
(dev->wireless_handlers->get_wireless_stats != NULL))
return dev->wireless_handlers->get_wireless_stats(dev);
- /* Old location, will be phased out in next WE */
- return (dev->get_wireless_stats ?
- dev->get_wireless_stats(dev) :
- (struct iw_statistics *) NULL);
+ /* Old location, field to be removed in next WE */
+ if(dev->get_wireless_stats) {
+ printk(KERN_DEBUG "%s (WE) : Driver using old /proc/net/wireless support, please fix driver !\n",
+ dev->name);
+ return dev->get_wireless_stats(dev);
+ }
+ /* Not found */
+ return (struct iw_statistics *) NULL;
}
/* ---------------------------------------------------------------- */
@@ -471,16 +552,18 @@ static __inline__ void wireless_seq_printf_stats(struct seq_file *seq,
dev->name, stats->status, stats->qual.qual,
stats->qual.updated & IW_QUAL_QUAL_UPDATED
? '.' : ' ',
- ((__u8) stats->qual.level),
+ ((__s32) stats->qual.level) -
+ ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
stats->qual.updated & IW_QUAL_LEVEL_UPDATED
? '.' : ' ',
- ((__u8) stats->qual.noise),
+ ((__s32) stats->qual.noise) -
+ ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0),
stats->qual.updated & IW_QUAL_NOISE_UPDATED
? '.' : ' ',
stats->discard.nwid, stats->discard.code,
stats->discard.fragment, stats->discard.retries,
stats->discard.misc, stats->miss.beacon);
- stats->qual.updated = 0;
+ stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
}
}
@@ -501,10 +584,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v)
return 0;
}
-extern void *dev_seq_start(struct seq_file *seq, loff_t *pos);
-extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
-extern void dev_seq_stop(struct seq_file *seq, void *v);
-
static struct seq_operations wireless_seq_ops = {
.start = dev_seq_start,
.next = dev_seq_next,
@@ -527,6 +606,7 @@ static struct file_operations wireless_seq_fops = {
int __init wireless_proc_init(void)
{
+ /* Create /proc/net/wireless entry */
if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops))
return -ENOMEM;
@@ -561,9 +641,9 @@ static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr)
sizeof(struct iw_statistics)))
return -EFAULT;
- /* Check if we need to clear the update flag */
+ /* Check if we need to clear the updated flag */
if(wrq->u.data.flags != 0)
- stats->qual.updated = 0;
+ stats->qual.updated &= ~IW_QUAL_ALL_UPDATED;
return 0;
} else
return -EOPNOTSUPP;
@@ -1032,6 +1112,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
r = NLMSG_DATA(nlh);
r->ifi_family = AF_UNSPEC;
+ r->__ifi_pad = 0;
r->ifi_type = dev->type;
r->ifi_index = dev->ifindex;
r->ifi_flags = dev->flags;
@@ -1073,8 +1154,8 @@ static inline void rtmsg_iwinfo(struct net_device * dev,
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
}
#endif /* WE_EVENT_NETLINK */
@@ -1094,10 +1175,11 @@ void wireless_send_event(struct net_device * dev,
struct iw_event *event; /* Mallocated whole event */
int event_len; /* Its size */
int hdr_len; /* Size of the event header */
+ int wrqu_off = 0; /* Offset in wrqu */
/* Don't "optimise" the following variable, it will crash */
unsigned cmd_index; /* *MUST* be unsigned */
- /* Get the description of the IOCTL */
+ /* Get the description of the Event */
if(cmd <= SIOCIWLAST) {
cmd_index = cmd - SIOCIWFIRST;
if(cmd_index < standard_ioctl_num)
@@ -1140,6 +1222,8 @@ void wireless_send_event(struct net_device * dev,
/* Calculate extra_len - extra is NULL for restricted events */
if(extra != NULL)
extra_len = wrqu->data.length * descr->token_size;
+ /* Always at an offset in wrqu */
+ wrqu_off = IW_EV_POINT_OFF;
#ifdef WE_EVENT_DEBUG
printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len);
#endif /* WE_EVENT_DEBUG */
@@ -1150,7 +1234,7 @@ void wireless_send_event(struct net_device * dev,
event_len = hdr_len + extra_len;
#ifdef WE_EVENT_DEBUG
- printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len);
+ printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, wrqu_off %d, event_len %d\n", dev->name, cmd, hdr_len, wrqu_off, event_len);
#endif /* WE_EVENT_DEBUG */
/* Create temporary buffer to hold the event */
@@ -1161,7 +1245,7 @@ void wireless_send_event(struct net_device * dev,
/* Fill event */
event->len = event_len;
event->cmd = cmd;
- memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN);
+ memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN);
if(extra != NULL)
memcpy(((char *) event) + hdr_len, extra, extra_len);
@@ -1182,7 +1266,7 @@ void wireless_send_event(struct net_device * dev,
* Now, the driver can delegate this task to Wireless Extensions.
* It needs to use those standard spy iw_handler in struct iw_handler_def,
* push data to us via wireless_spy_update() and include struct iw_spy_data
- * in its private part (and advertise it in iw_handler_def->spy_offset).
+ * in its private part (and export it in net_device->wireless_data->spy_data).
* One of the main advantage of centralising spy support here is that
* it becomes much easier to improve and extend it without having to touch
* the drivers. One example is the addition of the Spy-Threshold events.
@@ -1199,10 +1283,7 @@ static inline struct iw_spy_data * get_spydata(struct net_device *dev)
/* This is the new way */
if(dev->wireless_data)
return(dev->wireless_data->spy_data);
-
- /* This is the old way. Doesn't work for multi-headed drivers.
- * It will be removed in the next version of WE. */
- return (dev->priv + dev->wireless_handlers->spy_offset);
+ return NULL;
}
/*------------------------------------------------------------------*/
@@ -1217,10 +1298,6 @@ int iw_handler_set_spy(struct net_device * dev,
struct iw_spy_data * spydata = get_spydata(dev);
struct sockaddr * address = (struct sockaddr *) extra;
- if(!dev->wireless_data)
- /* Help user know that driver needs updating */
- printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n",
- dev->name);
/* Make sure driver is not buggy or using the old API */
if(!spydata)
return -EOPNOTSUPP;
@@ -1251,7 +1328,7 @@ int iw_handler_set_spy(struct net_device * dev,
sizeof(struct iw_quality) * IW_MAX_SPY);
#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "iw_handler_set_spy() : offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length);
+ printk(KERN_DEBUG "iw_handler_set_spy() : wireless_data %p, spydata %p, num %d\n", dev->wireless_data, spydata, wrqu->data.length);
for (i = 0; i < wrqu->data.length; i++)
printk(KERN_DEBUG
"%02X:%02X:%02X:%02X:%02X:%02X \n",
@@ -1304,7 +1381,7 @@ int iw_handler_get_spy(struct net_device * dev,
sizeof(struct iw_quality) * spydata->spy_number);
/* Reset updated flags. */
for(i = 0; i < spydata->spy_number; i++)
- spydata->spy_stat[i].updated = 0;
+ spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED;
return 0;
}
@@ -1419,7 +1496,7 @@ void wireless_spy_update(struct net_device * dev,
return;
#ifdef WE_SPY_DEBUG
- printk(KERN_DEBUG "wireless_spy_update() : offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
+ printk(KERN_DEBUG "wireless_spy_update() : wireless_data %p, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_data, spydata, address[0], address[1], address[2], address[3], address[4], address[5]);
#endif /* WE_SPY_DEBUG */
/* Update all records that match */
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
new file mode 100644
index 0000000..187ac18
--- /dev/null
+++ b/net/dccp/Kconfig
@@ -0,0 +1,50 @@
+menu "DCCP Configuration (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+
+config IP_DCCP
+ tristate "The DCCP Protocol (EXPERIMENTAL)"
+ ---help---
+ Datagram Congestion Control Protocol
+
+ From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>.
+
+ The Datagram Congestion Control Protocol (DCCP) is a transport
+ protocol that implements bidirectional, unicast connections of
+ congestion-controlled, unreliable datagrams. It should be suitable
+ for use by applications such as streaming media, Internet telephony,
+ and on-line games
+
+ To compile this protocol support as a module, choose M here: the
+ module will be called dccp.
+
+ If in doubt, say N.
+
+config INET_DCCP_DIAG
+ depends on IP_DCCP && INET_DIAG
+ def_tristate y if (IP_DCCP = y && INET_DIAG = y)
+ def_tristate m
+
+source "net/dccp/ccids/Kconfig"
+
+menu "DCCP Kernel Hacking"
+ depends on IP_DCCP && DEBUG_KERNEL=y
+
+config IP_DCCP_DEBUG
+ bool "DCCP debug messages"
+ ---help---
+ Only use this if you're hacking DCCP.
+
+ Just say N.
+
+config IP_DCCP_UNLOAD_HACK
+ depends on IP_DCCP=m && IP_DCCP_CCID3=m
+ bool "DCCP control sock unload hack"
+ ---help---
+ Enable this to be able to unload the dccp module when the it
+ has only one refcount held, the control sock one. Just execute
+ "rmmod dccp_ccid3 dccp"
+
+ Just say N.
+endmenu
+
+endmenu
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
new file mode 100644
index 0000000..fb97bb0
--- /dev/null
+++ b/net/dccp/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_IP_DCCP) += dccp.o
+
+dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
+ timer.o
+
+obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+
+dccp_diag-y := diag.o
+
+obj-y += ccids/
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
new file mode 100644
index 0000000..9d8fc0e
--- /dev/null
+++ b/net/dccp/ccid.c
@@ -0,0 +1,139 @@
+/*
+ * net/dccp/ccid.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * CCID infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "ccid.h"
+
+static struct ccid *ccids[CCID_MAX];
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+static atomic_t ccids_lockct = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(ccids_lock);
+
+/*
+ * The strategy is: modifications ccids vector are short, do not sleep and
+ * veeery rare, but read access should be free of any exclusive locks.
+ */
+static void ccids_write_lock(void)
+{
+ spin_lock(&ccids_lock);
+ while (atomic_read(&ccids_lockct) != 0) {
+ spin_unlock(&ccids_lock);
+ yield();
+ spin_lock(&ccids_lock);
+ }
+}
+
+static inline void ccids_write_unlock(void)
+{
+ spin_unlock(&ccids_lock);
+}
+
+static inline void ccids_read_lock(void)
+{
+ atomic_inc(&ccids_lockct);
+ spin_unlock_wait(&ccids_lock);
+}
+
+static inline void ccids_read_unlock(void)
+{
+ atomic_dec(&ccids_lockct);
+}
+
+#else
+#define ccids_write_lock() do { } while(0)
+#define ccids_write_unlock() do { } while(0)
+#define ccids_read_lock() do { } while(0)
+#define ccids_read_unlock() do { } while(0)
+#endif
+
+int ccid_register(struct ccid *ccid)
+{
+ int err;
+
+ if (ccid->ccid_init == NULL)
+ return -1;
+
+ ccids_write_lock();
+ err = -EEXIST;
+ if (ccids[ccid->ccid_id] == NULL) {
+ ccids[ccid->ccid_id] = ccid;
+ err = 0;
+ }
+ ccids_write_unlock();
+ if (err == 0)
+ pr_info("CCID: Registered CCID %d (%s)\n",
+ ccid->ccid_id, ccid->ccid_name);
+ return err;
+}
+
+EXPORT_SYMBOL_GPL(ccid_register);
+
+int ccid_unregister(struct ccid *ccid)
+{
+ ccids_write_lock();
+ ccids[ccid->ccid_id] = NULL;
+ ccids_write_unlock();
+ pr_info("CCID: Unregistered CCID %d (%s)\n",
+ ccid->ccid_id, ccid->ccid_name);
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(ccid_unregister);
+
+struct ccid *ccid_init(unsigned char id, struct sock *sk)
+{
+ struct ccid *ccid;
+
+#ifdef CONFIG_KMOD
+ if (ccids[id] == NULL)
+ request_module("net-dccp-ccid-%d", id);
+#endif
+ ccids_read_lock();
+
+ ccid = ccids[id];
+ if (ccid == NULL)
+ goto out;
+
+ if (!try_module_get(ccid->ccid_owner))
+ goto out_err;
+
+ if (ccid->ccid_init(sk) != 0)
+ goto out_module_put;
+out:
+ ccids_read_unlock();
+ return ccid;
+out_module_put:
+ module_put(ccid->ccid_owner);
+out_err:
+ ccid = NULL;
+ goto out;
+}
+
+EXPORT_SYMBOL_GPL(ccid_init);
+
+void ccid_exit(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid == NULL)
+ return;
+
+ ccids_read_lock();
+
+ if (ccids[ccid->ccid_id] != NULL) {
+ if (ccid->ccid_exit != NULL)
+ ccid->ccid_exit(sk);
+ module_put(ccid->ccid_owner);
+ }
+
+ ccids_read_unlock();
+}
+
+EXPORT_SYMBOL_GPL(ccid_exit);
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
new file mode 100644
index 0000000..962f1e9
--- /dev/null
+++ b/net/dccp/ccid.h
@@ -0,0 +1,180 @@
+#ifndef _CCID_H
+#define _CCID_H
+/*
+ * net/dccp/ccid.h
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * CCID infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <linux/dccp.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+#define CCID_MAX 255
+
+struct ccid {
+ unsigned char ccid_id;
+ const char *ccid_name;
+ struct module *ccid_owner;
+ int (*ccid_init)(struct sock *sk);
+ void (*ccid_exit)(struct sock *sk);
+ int (*ccid_hc_rx_init)(struct sock *sk);
+ int (*ccid_hc_tx_init)(struct sock *sk);
+ void (*ccid_hc_rx_exit)(struct sock *sk);
+ void (*ccid_hc_tx_exit)(struct sock *sk);
+ void (*ccid_hc_rx_packet_recv)(struct sock *sk,
+ struct sk_buff *skb);
+ int (*ccid_hc_rx_parse_options)(struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value);
+ void (*ccid_hc_rx_insert_options)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_insert_options)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_packet_recv)(struct sock *sk,
+ struct sk_buff *skb);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value);
+ int (*ccid_hc_tx_send_packet)(struct sock *sk,
+ struct sk_buff *skb, int len);
+ void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more,
+ int len);
+ void (*ccid_hc_rx_get_info)(struct sock *sk,
+ struct tcp_info *info);
+ void (*ccid_hc_tx_get_info)(struct sock *sk,
+ struct tcp_info *info);
+};
+
+extern int ccid_register(struct ccid *ccid);
+extern int ccid_unregister(struct ccid *ccid);
+
+extern struct ccid *ccid_init(unsigned char id, struct sock *sk);
+extern void ccid_exit(struct ccid *ccid, struct sock *sk);
+
+static inline void __ccid_get(struct ccid *ccid)
+{
+ __module_get(ccid->ccid_owner);
+}
+
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb, int len)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_tx_send_packet != NULL)
+ rc = ccid->ccid_hc_tx_send_packet(sk, skb, len);
+ return rc;
+}
+
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+ int more, int len)
+{
+ if (ccid->ccid_hc_tx_packet_sent != NULL)
+ ccid->ccid_hc_tx_packet_sent(sk, more, len);
+}
+
+static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_rx_init != NULL)
+ rc = ccid->ccid_hc_rx_init(sk);
+ return rc;
+}
+
+static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_tx_init != NULL)
+ rc = ccid->ccid_hc_tx_init(sk);
+ return rc;
+}
+
+static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid->ccid_hc_rx_exit != NULL &&
+ dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL)
+ ccid->ccid_hc_rx_exit(sk);
+}
+
+static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid->ccid_hc_tx_exit != NULL &&
+ dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL)
+ ccid->ccid_hc_tx_exit(sk);
+}
+
+static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_rx_packet_recv != NULL)
+ ccid->ccid_hc_rx_packet_recv(sk, skb);
+}
+
+static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_tx_packet_recv != NULL)
+ ccid->ccid_hc_tx_packet_recv(sk, skb);
+}
+
+static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_tx_parse_options != NULL)
+ rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx,
+ value);
+ return rc;
+}
+
+static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
+ unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char* value)
+{
+ int rc = 0;
+ if (ccid->ccid_hc_rx_parse_options != NULL)
+ rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value);
+ return rc;
+}
+
+static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_tx_insert_options != NULL)
+ ccid->ccid_hc_tx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (ccid->ccid_hc_rx_insert_options != NULL)
+ ccid->ccid_hc_rx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
+ struct tcp_info *info)
+{
+ if (ccid->ccid_hc_rx_get_info != NULL)
+ ccid->ccid_hc_rx_get_info(sk, info);
+}
+
+static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
+ struct tcp_info *info)
+{
+ if (ccid->ccid_hc_tx_get_info != NULL)
+ ccid->ccid_hc_tx_get_info(sk, info);
+}
+#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
new file mode 100644
index 0000000..7684d83
--- /dev/null
+++ b/net/dccp/ccids/Kconfig
@@ -0,0 +1,29 @@
+menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+ depends on IP_DCCP && EXPERIMENTAL
+
+config IP_DCCP_CCID3
+ tristate "CCID3 (TFRC) (EXPERIMENTAL)"
+ depends on IP_DCCP
+ ---help---
+ CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+ rate-controlled congestion control mechanism. TFRC is designed to
+ be reasonably fair when competing for bandwidth with TCP-like flows,
+ where a flow is "reasonably fair" if its sending rate is generally
+ within a factor of two of the sending rate of a TCP flow under the
+ same conditions. However, TFRC has a much lower variation of
+ throughput over time compared with TCP, which makes CCID 3 more
+ suitable than CCID 2 for applications such streaming media where a
+ relatively smooth sending rate is of importance.
+
+ CCID 3 is further described in [CCID 3 PROFILE]. The TFRC
+ congestion control algorithms were initially described in RFC 3448.
+
+ This text was extracted from draft-ietf-dccp-spec-11.txt.
+
+ If in doubt, say M.
+
+config IP_DCCP_TFRC_LIB
+ depends on IP_DCCP_CCID3
+ def_tristate IP_DCCP_CCID3
+
+endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
new file mode 100644
index 0000000..956f79f
--- /dev/null
+++ b/net/dccp/ccids/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
+
+dccp_ccid3-y := ccid3.o
+
+obj-y += lib/
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
new file mode 100644
index 0000000..7bf3b3a
--- /dev/null
+++ b/net/dccp/ccids/ccid3.c
@@ -0,0 +1,1221 @@
+/*
+ * net/dccp/ccids/ccid3.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include "../ccid.h"
+#include "../dccp.h"
+#include "lib/packet_history.h"
+#include "lib/loss_interval.h"
+#include "lib/tfrc.h"
+#include "ccid3.h"
+
+/*
+ * Reason for maths with 10 here is to avoid 32 bit overflow when a is big.
+ */
+static inline u32 usecs_div(const u32 a, const u32 b)
+{
+ const u32 tmp = a * (USEC_PER_SEC / 10);
+ return b > 20 ? tmp / (b / 10) : tmp;
+}
+
+static int ccid3_debug;
+
+#ifdef CCID3_DEBUG
+#define ccid3_pr_debug(format, a...) \
+ do { if (ccid3_debug) \
+ printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
+ } while (0)
+#else
+#define ccid3_pr_debug(format, a...)
+#endif
+
+static struct dccp_tx_hist *ccid3_tx_hist;
+static struct dccp_rx_hist *ccid3_rx_hist;
+static struct dccp_li_hist *ccid3_li_hist;
+
+static int ccid3_init(struct sock *sk)
+{
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+ return 0;
+}
+
+static void ccid3_exit(struct sock *sk)
+{
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+}
+
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+ TFRC_SSTATE_NO_SENT = 1,
+ TFRC_SSTATE_NO_FBACK,
+ TFRC_SSTATE_FBACK,
+ TFRC_SSTATE_TERM,
+};
+
+#ifdef CCID3_DEBUG
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+ static char *ccid3_state_names[] = {
+ [TFRC_SSTATE_NO_SENT] = "NO_SENT",
+ [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+ [TFRC_SSTATE_FBACK] = "FBACK",
+ [TFRC_SSTATE_TERM] = "TERM",
+ };
+
+ return ccid3_state_names[state];
+}
+#endif
+
+static inline void ccid3_hc_tx_set_state(struct sock *sk,
+ enum ccid3_hc_tx_states state)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+ enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+
+ ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+ dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
+ ccid3_tx_state_name(state));
+ WARN_ON(state == oldstate);
+ hctx->ccid3hctx_state = state;
+}
+
+/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */
+static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx)
+{
+ /*
+ * If no feedback spec says t_ipi is 1 second (set elsewhere and then
+ * doubles after every no feedback timer (separate function)
+ */
+ if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+ hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_x);
+}
+
+/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
+static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx)
+{
+ hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
+ TFRC_OPSYS_HALF_TIME_GRAN);
+}
+
+/*
+ * Update X by
+ * If (p > 0)
+ * x_calc = calcX(s, R, p);
+ * X = max(min(X_calc, 2 * X_recv), s / t_mbi);
+ * Else
+ * If (now - tld >= R)
+ * X = max(min(2 * X, 2 * X_recv), s / R);
+ * tld = now;
+ */
+static void ccid3_hc_tx_update_x(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+ /* To avoid large error in calcX */
+ if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
+ hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_rtt,
+ hctx->ccid3hctx_p);
+ hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc,
+ 2 * hctx->ccid3hctx_x_recv),
+ (hctx->ccid3hctx_s /
+ TFRC_MAX_BACK_OFF_TIME));
+ } else {
+ struct timeval now;
+
+ do_gettimeofday(&now);
+ if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >=
+ hctx->ccid3hctx_rtt) {
+ hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv,
+ hctx->ccid3hctx_x) * 2,
+ usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_rtt));
+ hctx->ccid3hctx_t_ld = now;
+ }
+ }
+}
+
+static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct dccp_sock *dp = dccp_sk(sk);
+ unsigned long next_tmout = 0;
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ /* XXX: set some sensible MIB */
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + HZ / 5);
+ goto out;
+ }
+
+ ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk,
+ ccid3_tx_state_name(hctx->ccid3hctx_state));
+
+ switch (hctx->ccid3hctx_state) {
+ case TFRC_SSTATE_TERM:
+ goto out;
+ case TFRC_SSTATE_NO_FBACK:
+ /* Halve send rate */
+ hctx->ccid3hctx_x /= 2;
+ if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s /
+ TFRC_MAX_BACK_OFF_TIME))
+ hctx->ccid3hctx_x = (hctx->ccid3hctx_s /
+ TFRC_MAX_BACK_OFF_TIME);
+
+ ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d "
+ "bytes/s\n",
+ dccp_role(sk), sk,
+ ccid3_tx_state_name(hctx->ccid3hctx_state),
+ hctx->ccid3hctx_x);
+ next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_x),
+ TFRC_INITIAL_TIMEOUT);
+ /*
+ * FIXME - not sure above calculation is correct. See section
+ * 5 of CCID3 11 should adjust tx_t_ipi and double that to
+ * achieve it really
+ */
+ break;
+ case TFRC_SSTATE_FBACK:
+ /*
+ * Check if IDLE since last timeout and recv rate is less than
+ * 4 packets per RTT
+ */
+ if (!hctx->ccid3hctx_idle ||
+ (hctx->ccid3hctx_x_recv >=
+ 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) {
+ ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n",
+ dccp_role(sk), sk,
+ ccid3_tx_state_name(hctx->ccid3hctx_state));
+ /* Halve sending rate */
+
+ /* If (X_calc > 2 * X_recv)
+ * X_recv = max(X_recv / 2, s / (2 * t_mbi));
+ * Else
+ * X_recv = X_calc / 4;
+ */
+ BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
+ hctx->ccid3hctx_x_calc == 0);
+
+ /* check also if p is zero -> x_calc is infinity? */
+ if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
+ hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
+ hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
+ hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME));
+ else
+ hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4;
+
+ /* Update sending rate */
+ ccid3_hc_tx_update_x(sk);
+ }
+ /*
+ * Schedule no feedback timer to expire in
+ * max(4 * R, 2 * s / X)
+ */
+ next_tmout = max_t(u32, hctx->ccid3hctx_t_rto,
+ 2 * usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_x));
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+ dump_stack();
+ goto out;
+ }
+
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+ hctx->ccid3hctx_idle = 1;
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+static int ccid3_hc_tx_send_packet(struct sock *sk,
+ struct sk_buff *skb, int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+ struct dccp_tx_hist_entry *new_packet;
+ struct timeval now;
+ long delay;
+ int rc = -ENOTCONN;
+
+ /* Check if pure ACK or Terminating*/
+
+ /*
+ * XXX: We only call this function for DATA and DATAACK, on, these
+ * packets can have zero length, but why the comment about "pure ACK"?
+ */
+ if (hctx == NULL || len == 0 ||
+ hctx->ccid3hctx_state == TFRC_SSTATE_TERM)
+ goto out;
+
+ /* See if last packet allocated was not sent */
+ new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+ if (new_packet == NULL || new_packet->dccphtx_sent) {
+ new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
+ SLAB_ATOMIC);
+
+ rc = -ENOBUFS;
+ if (new_packet == NULL) {
+ ccid3_pr_debug("%s, sk=%p, not enough mem to add "
+ "to history, send refused\n",
+ dccp_role(sk), sk);
+ goto out;
+ }
+
+ dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
+ }
+
+ do_gettimeofday(&now);
+
+ switch (hctx->ccid3hctx_state) {
+ case TFRC_SSTATE_NO_SENT:
+ ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n",
+ dccp_role(sk), sk, dp->dccps_gss);
+
+ hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
+ hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk;
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
+ hctx->ccid3hctx_last_win_count = 0;
+ hctx->ccid3hctx_t_last_win_count = now;
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+ hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT;
+
+ /* Set nominal send time for initial packet */
+ hctx->ccid3hctx_t_nom = now;
+ timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+ hctx->ccid3hctx_t_ipi);
+ ccid3_calc_new_delta(hctx);
+ rc = 0;
+ break;
+ case TFRC_SSTATE_NO_FBACK:
+ case TFRC_SSTATE_FBACK:
+ delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) -
+ hctx->ccid3hctx_delta);
+ ccid3_pr_debug("send_packet delay=%ld\n", delay);
+ delay /= -1000;
+ /* divide by -1000 is to convert to ms and get sign right */
+ rc = delay > 0 ? delay : 0;
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+ dump_stack();
+ rc = -EINVAL;
+ break;
+ }
+
+ /* Can we send? if so add options and add to packet history */
+ if (rc == 0)
+ new_packet->dccphtx_ccval =
+ DCCP_SKB_CB(skb)->dccpd_ccval =
+ hctx->ccid3hctx_last_win_count;
+out:
+ return rc;
+}
+
+static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+ struct timeval now;
+
+ BUG_ON(hctx == NULL);
+
+ if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
+ ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n",
+ dccp_role(sk), sk);
+ return;
+ }
+
+ do_gettimeofday(&now);
+
+ /* check if we have sent a data packet */
+ if (len > 0) {
+ unsigned long quarter_rtt;
+ struct dccp_tx_hist_entry *packet;
+
+ packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+ if (packet == NULL) {
+ printk(KERN_CRIT "%s: packet doesn't exists in "
+ "history!\n", __FUNCTION__);
+ return;
+ }
+ if (packet->dccphtx_sent) {
+ printk(KERN_CRIT "%s: no unsent packet in history!\n",
+ __FUNCTION__);
+ return;
+ }
+ packet->dccphtx_tstamp = now;
+ packet->dccphtx_seqno = dp->dccps_gss;
+ /*
+ * Check if win_count have changed
+ * Algorithm in "8.1. Window Counter Valuer" in
+ * draft-ietf-dccp-ccid3-11.txt
+ */
+ quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
+ if (likely(hctx->ccid3hctx_rtt > 8))
+ quarter_rtt /= hctx->ccid3hctx_rtt / 4;
+
+ if (quarter_rtt > 0) {
+ hctx->ccid3hctx_t_last_win_count = now;
+ hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count +
+ min_t(unsigned long, quarter_rtt, 5)) % 16;
+ ccid3_pr_debug("%s, sk=%p, window changed from "
+ "%u to %u!\n",
+ dccp_role(sk), sk,
+ packet->dccphtx_ccval,
+ hctx->ccid3hctx_last_win_count);
+ }
+
+ hctx->ccid3hctx_idle = 0;
+ packet->dccphtx_rtt = hctx->ccid3hctx_rtt;
+ packet->dccphtx_sent = 1;
+ } else
+ ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n",
+ dccp_role(sk), sk, dp->dccps_gss);
+
+ switch (hctx->ccid3hctx_state) {
+ case TFRC_SSTATE_NO_SENT:
+ /* if first wasn't pure ack */
+ if (len != 0)
+ printk(KERN_CRIT "%s: %s, First packet sent is noted "
+ "as a data packet\n",
+ __FUNCTION__, dccp_role(sk));
+ return;
+ case TFRC_SSTATE_NO_FBACK:
+ case TFRC_SSTATE_FBACK:
+ if (len > 0) {
+ hctx->ccid3hctx_t_nom = now;
+ ccid3_calc_new_t_ipi(hctx);
+ ccid3_calc_new_delta(hctx);
+ timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+ hctx->ccid3hctx_t_ipi);
+ }
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+ dump_stack();
+ break;
+ }
+}
+
+static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+ struct ccid3_options_received *opt_recv;
+ struct dccp_tx_hist_entry *packet;
+ unsigned long next_tmout;
+ u32 t_elapsed;
+ u32 pinv;
+ u32 x_recv;
+ u32 r_sample;
+
+ if (hctx == NULL)
+ return;
+
+ if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
+ ccid3_pr_debug("%s, sk=%p, received a packet when "
+ "terminating!\n", dccp_role(sk), sk);
+ return;
+ }
+
+ /* we are only interested in ACKs */
+ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
+ DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
+ return;
+
+ opt_recv = &hctx->ccid3hctx_options_received;
+
+ t_elapsed = dp->dccps_options_received.dccpor_elapsed_time;
+ x_recv = opt_recv->ccid3or_receive_rate;
+ pinv = opt_recv->ccid3or_loss_event_rate;
+
+ switch (hctx->ccid3hctx_state) {
+ case TFRC_SSTATE_NO_SENT:
+ /* FIXME: what to do here? */
+ return;
+ case TFRC_SSTATE_NO_FBACK:
+ case TFRC_SSTATE_FBACK:
+ /* Calculate new round trip sample by
+ * R_sample = (now - t_recvdata) - t_delay */
+ /* get t_recvdata from history */
+ packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ if (packet == NULL) {
+ ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't "
+ "exist in history!\n",
+ dccp_role(sk), sk,
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
+ return;
+ }
+
+ /* Update RTT */
+ r_sample = timeval_now_delta(&packet->dccphtx_tstamp);
+ /* FIXME: */
+ // r_sample -= usecs_to_jiffies(t_elapsed * 10);
+
+ /* Update RTT estimate by
+ * If (No feedback recv)
+ * R = R_sample;
+ * Else
+ * R = q * R + (1 - q) * R_sample;
+ *
+ * q is a constant, RFC 3448 recomments 0.9
+ */
+ if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+ hctx->ccid3hctx_rtt = r_sample;
+ } else
+ hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
+ r_sample / 10;
+
+ ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
+ "r_sample=%us\n", dccp_role(sk), sk,
+ hctx->ccid3hctx_rtt, r_sample);
+
+ /* Update timeout interval */
+ hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
+ USEC_PER_SEC);
+
+ /* Update receive rate */
+ hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
+
+ /* Update loss event rate */
+ if (pinv == ~0 || pinv == 0)
+ hctx->ccid3hctx_p = 0;
+ else {
+ hctx->ccid3hctx_p = 1000000 / pinv;
+
+ if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
+ hctx->ccid3hctx_p = TFRC_SMALLEST_P;
+ ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
+ dccp_role(sk), sk);
+ }
+ }
+
+ /* unschedule no feedback timer */
+ sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+ /* Update sending rate */
+ ccid3_hc_tx_update_x(sk);
+
+ /* Update next send time */
+ timeval_sub_usecs(&hctx->ccid3hctx_t_nom,
+ hctx->ccid3hctx_t_ipi);
+ ccid3_calc_new_t_ipi(hctx);
+ timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+ hctx->ccid3hctx_t_ipi);
+ ccid3_calc_new_delta(hctx);
+
+ /* remove all packets older than the one acked from history */
+ dccp_tx_hist_purge_older(ccid3_tx_hist,
+ &hctx->ccid3hctx_hist, packet);
+ /*
+ * As we have calculated new ipi, delta, t_nom it is possible that
+ * we now can send a packet, so wake up dccp_wait_for_ccids.
+ */
+ sk->sk_write_space(sk);
+
+ /*
+ * Schedule no feedback timer to expire in
+ * max(4 * R, 2 * s / X)
+ */
+ next_tmout = max(hctx->ccid3hctx_t_rto,
+ 2 * usecs_div(hctx->ccid3hctx_s,
+ hctx->ccid3hctx_x));
+
+ ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
+ "expire in %lu jiffies (%luus)\n",
+ dccp_role(sk), sk,
+ usecs_to_jiffies(next_tmout), next_tmout);
+
+ sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+ jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+
+ /* set idle flag */
+ hctx->ccid3hctx_idle = 1;
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+ dump_stack();
+ break;
+ }
+}
+
+static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+ if (hctx == NULL || !(sk->sk_state == DCCP_OPEN ||
+ sk->sk_state == DCCP_PARTOPEN))
+ return;
+
+ DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
+}
+
+static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
+ unsigned char len, u16 idx,
+ unsigned char *value)
+{
+ int rc = 0;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+ struct ccid3_options_received *opt_recv;
+
+ if (hctx == NULL)
+ return 0;
+
+ opt_recv = &hctx->ccid3hctx_options_received;
+
+ if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
+ opt_recv->ccid3or_seqno = dp->dccps_gsr;
+ opt_recv->ccid3or_loss_event_rate = ~0;
+ opt_recv->ccid3or_loss_intervals_idx = 0;
+ opt_recv->ccid3or_loss_intervals_len = 0;
+ opt_recv->ccid3or_receive_rate = 0;
+ }
+
+ switch (option) {
+ case TFRC_OPT_LOSS_EVENT_RATE:
+ if (len != 4) {
+ ccid3_pr_debug("%s, sk=%p, invalid len for "
+ "TFRC_OPT_LOSS_EVENT_RATE\n",
+ dccp_role(sk), sk);
+ rc = -EINVAL;
+ } else {
+ opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value);
+ ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
+ dccp_role(sk), sk,
+ opt_recv->ccid3or_loss_event_rate);
+ }
+ break;
+ case TFRC_OPT_LOSS_INTERVALS:
+ opt_recv->ccid3or_loss_intervals_idx = idx;
+ opt_recv->ccid3or_loss_intervals_len = len;
+ ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
+ dccp_role(sk), sk,
+ opt_recv->ccid3or_loss_intervals_idx,
+ opt_recv->ccid3or_loss_intervals_len);
+ break;
+ case TFRC_OPT_RECEIVE_RATE:
+ if (len != 4) {
+ ccid3_pr_debug("%s, sk=%p, invalid len for "
+ "TFRC_OPT_RECEIVE_RATE\n",
+ dccp_role(sk), sk);
+ rc = -EINVAL;
+ } else {
+ opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value);
+ ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
+ dccp_role(sk), sk,
+ opt_recv->ccid3or_receive_rate);
+ }
+ break;
+ }
+
+ return rc;
+}
+
+static int ccid3_hc_tx_init(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx;
+
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+ hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx),
+ gfp_any());
+ if (hctx == NULL)
+ return -ENOMEM;
+
+ memset(hctx, 0, sizeof(*hctx));
+
+ if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
+ dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
+ hctx->ccid3hctx_s = dp->dccps_packet_size;
+ else
+ hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE;
+
+ /* Set transmission rate to 1 packet per second */
+ hctx->ccid3hctx_x = hctx->ccid3hctx_s;
+ hctx->ccid3hctx_t_rto = USEC_PER_SEC;
+ hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
+ INIT_LIST_HEAD(&hctx->ccid3hctx_hist);
+ init_timer(&hctx->ccid3hctx_no_feedback_timer);
+
+ return 0;
+}
+
+static void ccid3_hc_tx_exit(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+ BUG_ON(hctx == NULL);
+
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
+ sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+ /* Empty packet history */
+ dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
+
+ kfree(dp->dccps_hc_tx_ccid_private);
+ dp->dccps_hc_tx_ccid_private = NULL;
+}
+
+/*
+ * RX Half Connection methods
+ */
+
+/* TFRC receiver states */
+enum ccid3_hc_rx_states {
+ TFRC_RSTATE_NO_DATA = 1,
+ TFRC_RSTATE_DATA,
+ TFRC_RSTATE_TERM = 127,
+};
+
+#ifdef CCID3_DEBUG
+static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
+{
+ static char *ccid3_rx_state_names[] = {
+ [TFRC_RSTATE_NO_DATA] = "NO_DATA",
+ [TFRC_RSTATE_DATA] = "DATA",
+ [TFRC_RSTATE_TERM] = "TERM",
+ };
+
+ return ccid3_rx_state_names[state];
+}
+#endif
+
+static inline void ccid3_hc_rx_set_state(struct sock *sk,
+ enum ccid3_hc_rx_states state)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+ enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
+
+ ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+ dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
+ ccid3_rx_state_name(state));
+ WARN_ON(state == oldstate);
+ hcrx->ccid3hcrx_state = state;
+}
+
+static void ccid3_hc_rx_send_feedback(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+ struct dccp_rx_hist_entry *packet;
+ struct timeval now;
+
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+ do_gettimeofday(&now);
+
+ switch (hcrx->ccid3hcrx_state) {
+ case TFRC_RSTATE_NO_DATA:
+ hcrx->ccid3hcrx_x_recv = 0;
+ break;
+ case TFRC_RSTATE_DATA: {
+ const u32 delta = timeval_delta(&now,
+ &hcrx->ccid3hcrx_tstamp_last_feedback);
+
+ hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv *
+ USEC_PER_SEC);
+ if (likely(delta > 1))
+ hcrx->ccid3hcrx_x_recv /= delta;
+ }
+ break;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
+ dump_stack();
+ return;
+ }
+
+ packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
+ if (packet == NULL) {
+ printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n",
+ __FUNCTION__, dccp_role(sk), sk);
+ dump_stack();
+ return;
+ }
+
+ hcrx->ccid3hcrx_tstamp_last_feedback = now;
+ hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval;
+ hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno;
+ hcrx->ccid3hcrx_bytes_recv = 0;
+
+ /* Convert to multiples of 10us */
+ hcrx->ccid3hcrx_elapsed_time =
+ timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
+ if (hcrx->ccid3hcrx_p == 0)
+ hcrx->ccid3hcrx_pinv = ~0;
+ else
+ hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
+ dccp_send_ack(sk);
+}
+
+static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ u32 x_recv, pinv;
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+ if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN ||
+ sk->sk_state == DCCP_PARTOPEN))
+ return;
+
+ DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;
+
+ if (dccp_packet_without_ack(skb))
+ return;
+
+ if (hcrx->ccid3hcrx_elapsed_time != 0)
+ dccp_insert_option_elapsed_time(sk, skb,
+ hcrx->ccid3hcrx_elapsed_time);
+ dccp_insert_option_timestamp(sk, skb);
+ x_recv = htonl(hcrx->ccid3hcrx_x_recv);
+ pinv = htonl(hcrx->ccid3hcrx_pinv);
+ dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
+ &pinv, sizeof(pinv));
+ dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
+ &x_recv, sizeof(x_recv));
+}
+
+/* calculate first loss interval
+ *
+ * returns estimated loss interval in usecs */
+
+static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+ struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
+ u32 rtt, delta, x_recv, fval, p, tmp2;
+ struct timeval tstamp = { 0, };
+ int interval = 0;
+ int win_count = 0;
+ int step = 0;
+ u64 tmp1;
+
+ list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
+ dccphrx_node) {
+ if (dccp_rx_hist_entry_data_packet(entry)) {
+ tail = entry;
+
+ switch (step) {
+ case 0:
+ tstamp = entry->dccphrx_tstamp;
+ win_count = entry->dccphrx_ccval;
+ step = 1;
+ break;
+ case 1:
+ interval = win_count - entry->dccphrx_ccval;
+ if (interval < 0)
+ interval += TFRC_WIN_COUNT_LIMIT;
+ if (interval > 4)
+ goto found;
+ break;
+ }
+ }
+ }
+
+ if (step == 0) {
+ printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no "
+ "data packets!\n",
+ __FUNCTION__, dccp_role(sk), sk);
+ return ~0;
+ }
+
+ if (interval == 0) {
+ ccid3_pr_debug("%s, sk=%p, Could not find a win_count "
+ "interval > 0. Defaulting to 1\n",
+ dccp_role(sk), sk);
+ interval = 1;
+ }
+found:
+ rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
+ ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
+ dccp_role(sk), sk, rtt);
+ if (rtt == 0)
+ rtt = 1;
+
+ delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback);
+ x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC;
+ if (likely(delta > 1))
+ x_recv /= delta;
+
+ tmp1 = (u64)x_recv * (u64)rtt;
+ do_div(tmp1,10000000);
+ tmp2 = (u32)tmp1;
+ fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
+ /* do not alter order above or you will get overflow on 32 bit */
+ p = tfrc_calc_x_reverse_lookup(fval);
+ ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
+ "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
+
+ if (p == 0)
+ return ~0;
+ else
+ return 1000000 / p;
+}
+
+static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+ if (seq_loss != DCCP_MAX_SEQNO + 1 &&
+ list_empty(&hcrx->ccid3hcrx_li_hist)) {
+ struct dccp_li_hist_entry *li_tail;
+
+ li_tail = dccp_li_hist_interval_new(ccid3_li_hist,
+ &hcrx->ccid3hcrx_li_hist,
+ seq_loss, win_loss);
+ if (li_tail == NULL)
+ return;
+ li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
+ }
+ /* FIXME: find end of interval */
+}
+
+static void ccid3_hc_rx_detect_loss(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+ u8 win_loss;
+ const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
+ &hcrx->ccid3hcrx_li_hist,
+ &win_loss);
+
+ ccid3_hc_rx_update_li(sk, seq_loss, win_loss);
+}
+
+static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+ const struct dccp_options_received *opt_recv;
+ struct dccp_rx_hist_entry *packet;
+ struct timeval now;
+ u8 win_count;
+ u32 p_prev;
+ int ins;
+
+ if (hcrx == NULL)
+ return;
+
+ BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
+ hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA));
+
+ opt_recv = &dp->dccps_options_received;
+
+ switch (DCCP_SKB_CB(skb)->dccpd_type) {
+ case DCCP_PKT_ACK:
+ if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
+ return;
+ case DCCP_PKT_DATAACK:
+ if (opt_recv->dccpor_timestamp_echo == 0)
+ break;
+ p_prev = hcrx->ccid3hcrx_rtt;
+ do_gettimeofday(&now);
+ hcrx->ccid3hcrx_rtt = timeval_usecs(&now) -
+ (opt_recv->dccpor_timestamp_echo -
+ opt_recv->dccpor_elapsed_time) * 10;
+ if (p_prev != hcrx->ccid3hcrx_rtt)
+ ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
+ dccp_role(sk), hcrx->ccid3hcrx_rtt,
+ opt_recv->dccpor_elapsed_time);
+ break;
+ case DCCP_PKT_DATA:
+ break;
+ default:
+ ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n",
+ dccp_role(sk), sk,
+ dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
+ return;
+ }
+
+ packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp,
+ skb, SLAB_ATOMIC);
+ if (packet == NULL) {
+ ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet "
+ "to history (consider it lost)!",
+ dccp_role(sk), sk);
+ return;
+ }
+
+ win_count = packet->dccphrx_ccval;
+
+ ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
+ &hcrx->ccid3hcrx_li_hist, packet);
+
+ if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
+ return;
+
+ switch (hcrx->ccid3hcrx_state) {
+ case TFRC_RSTATE_NO_DATA:
+ ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial "
+ "feedback\n",
+ dccp_role(sk), sk,
+ dccp_state_name(sk->sk_state), skb);
+ ccid3_hc_rx_send_feedback(sk);
+ ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+ return;
+ case TFRC_RSTATE_DATA:
+ hcrx->ccid3hcrx_bytes_recv += skb->len -
+ dccp_hdr(skb)->dccph_doff * 4;
+ if (ins != 0)
+ break;
+
+ do_gettimeofday(&now);
+ if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
+ hcrx->ccid3hcrx_rtt) {
+ hcrx->ccid3hcrx_tstamp_last_ack = now;
+ ccid3_hc_rx_send_feedback(sk);
+ }
+ return;
+ default:
+ printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+ __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
+ dump_stack();
+ return;
+ }
+
+ /* Dealing with packet loss */
+ ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
+ dccp_role(sk), sk, dccp_state_name(sk->sk_state));
+
+ ccid3_hc_rx_detect_loss(sk);
+ p_prev = hcrx->ccid3hcrx_p;
+
+ /* Calculate loss event rate */
+ if (!list_empty(&hcrx->ccid3hcrx_li_hist))
+ /* Scaling up by 1000000 as fixed decimal */
+ hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
+
+ if (hcrx->ccid3hcrx_p > p_prev) {
+ ccid3_hc_rx_send_feedback(sk);
+ return;
+ }
+}
+
+static int ccid3_hc_rx_init(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx;
+
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+ hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx),
+ gfp_any());
+ if (hcrx == NULL)
+ return -ENOMEM;
+
+ memset(hcrx, 0, sizeof(*hcrx));
+
+ if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
+ dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
+ hcrx->ccid3hcrx_s = dp->dccps_packet_size;
+ else
+ hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE;
+
+ hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
+ INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
+ INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
+ /*
+ * XXX this seems to be paranoid, need to think more about this, for
+ * now start with something different than zero. -acme
+ */
+ hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5;
+ return 0;
+}
+
+static void ccid3_hc_rx_exit(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+ ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+ if (hcrx == NULL)
+ return;
+
+ ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
+
+ /* Empty packet history */
+ dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist);
+
+ /* Empty loss interval history */
+ dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
+
+ kfree(dp->dccps_hc_rx_ccid_private);
+ dp->dccps_hc_rx_ccid_private = NULL;
+}
+
+static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+ if (hcrx == NULL)
+ return;
+
+ info->tcpi_ca_state = hcrx->ccid3hcrx_state;
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
+}
+
+static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+ if (hctx == NULL)
+ return;
+
+ info->tcpi_rto = hctx->ccid3hctx_t_rto;
+ info->tcpi_rtt = hctx->ccid3hctx_rtt;
+}
+
+static struct ccid ccid3 = {
+ .ccid_id = 3,
+ .ccid_name = "ccid3",
+ .ccid_owner = THIS_MODULE,
+ .ccid_init = ccid3_init,
+ .ccid_exit = ccid3_exit,
+ .ccid_hc_tx_init = ccid3_hc_tx_init,
+ .ccid_hc_tx_exit = ccid3_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
+ .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
+ .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
+ .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
+ .ccid_hc_rx_init = ccid3_hc_rx_init,
+ .ccid_hc_rx_exit = ccid3_hc_rx_exit,
+ .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
+ .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
+ .ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
+ .ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
+};
+
+module_param(ccid3_debug, int, 0444);
+MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
+
+static __init int ccid3_module_init(void)
+{
+ int rc = -ENOBUFS;
+
+ ccid3_rx_hist = dccp_rx_hist_new("ccid3");
+ if (ccid3_rx_hist == NULL)
+ goto out;
+
+ ccid3_tx_hist = dccp_tx_hist_new("ccid3");
+ if (ccid3_tx_hist == NULL)
+ goto out_free_rx;
+
+ ccid3_li_hist = dccp_li_hist_new("ccid3");
+ if (ccid3_li_hist == NULL)
+ goto out_free_tx;
+
+ rc = ccid_register(&ccid3);
+ if (rc != 0)
+ goto out_free_loss_interval_history;
+out:
+ return rc;
+
+out_free_loss_interval_history:
+ dccp_li_hist_delete(ccid3_li_hist);
+ ccid3_li_hist = NULL;
+out_free_tx:
+ dccp_tx_hist_delete(ccid3_tx_hist);
+ ccid3_tx_hist = NULL;
+out_free_rx:
+ dccp_rx_hist_delete(ccid3_rx_hist);
+ ccid3_rx_hist = NULL;
+ goto out;
+}
+module_init(ccid3_module_init);
+
+static __exit void ccid3_module_exit(void)
+{
+#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
+ /*
+ * Hack to use while developing, so that we get rid of the control
+ * sock, that is what keeps a refcount on dccp.ko -acme
+ */
+ extern void dccp_ctl_sock_exit(void);
+
+ dccp_ctl_sock_exit();
+#endif
+ ccid_unregister(&ccid3);
+
+ if (ccid3_tx_hist != NULL) {
+ dccp_tx_hist_delete(ccid3_tx_hist);
+ ccid3_tx_hist = NULL;
+ }
+ if (ccid3_rx_hist != NULL) {
+ dccp_rx_hist_delete(ccid3_rx_hist);
+ ccid3_rx_hist = NULL;
+ }
+ if (ccid3_li_hist != NULL) {
+ dccp_li_hist_delete(ccid3_li_hist);
+ ccid3_li_hist = NULL;
+ }
+}
+module_exit(ccid3_module_exit);
+
+MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
+ "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
+MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
new file mode 100644
index 0000000..ee8cbac
--- /dev/null
+++ b/net/dccp/ccids/ccid3.h
@@ -0,0 +1,137 @@
+/*
+ * net/dccp/ccids/ccid3.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID3_H_
+#define _DCCP_CCID3_H_
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/time.h>
+#include <linux/types.h>
+
+#define TFRC_MIN_PACKET_SIZE 16
+#define TFRC_STD_PACKET_SIZE 256
+#define TFRC_MAX_PACKET_SIZE 65535
+
+/* Two seconds as per CCID3 spec */
+#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
+
+/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
+#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
+
+/* In seconds */
+#define TFRC_MAX_BACK_OFF_TIME 64
+
+#define TFRC_SMALLEST_P 40
+
+enum ccid3_options {
+ TFRC_OPT_LOSS_EVENT_RATE = 192,
+ TFRC_OPT_LOSS_INTERVALS = 193,
+ TFRC_OPT_RECEIVE_RATE = 194,
+};
+
+struct ccid3_options_received {
+ u64 ccid3or_seqno:48,
+ ccid3or_loss_intervals_idx:16;
+ u16 ccid3or_loss_intervals_len;
+ u32 ccid3or_loss_event_rate;
+ u32 ccid3or_receive_rate;
+};
+
+/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock
+ *
+ * @ccid3hctx_state - Sender state
+ * @ccid3hctx_x - Current sending rate
+ * @ccid3hctx_x_recv - Receive rate
+ * @ccid3hctx_x_calc - Calculated send (?) rate
+ * @ccid3hctx_s - Packet size
+ * @ccid3hctx_rtt - Estimate of current round trip time in usecs
+ * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
+ * @ccid3hctx_last_win_count - Last window counter sent
+ * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
+ * with last_win_count value sent
+ * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
+ * @ccid3hctx_idle - FIXME
+ * @ccid3hctx_t_ld - Time last doubled during slow start
+ * @ccid3hctx_t_nom - Nominal send time of next packet
+ * @ccid3hctx_t_ipi - Interpacket (send) interval
+ * @ccid3hctx_delta - Send timer delta
+ * @ccid3hctx_hist - Packet history
+ */
+struct ccid3_hc_tx_sock {
+ u32 ccid3hctx_x;
+ u32 ccid3hctx_x_recv;
+ u32 ccid3hctx_x_calc;
+ u16 ccid3hctx_s;
+ u32 ccid3hctx_rtt;
+ u32 ccid3hctx_p;
+ u8 ccid3hctx_state;
+ u8 ccid3hctx_last_win_count;
+ u8 ccid3hctx_idle;
+ struct timeval ccid3hctx_t_last_win_count;
+ struct timer_list ccid3hctx_no_feedback_timer;
+ struct timeval ccid3hctx_t_ld;
+ struct timeval ccid3hctx_t_nom;
+ u32 ccid3hctx_t_rto;
+ u32 ccid3hctx_t_ipi;
+ u32 ccid3hctx_delta;
+ struct list_head ccid3hctx_hist;
+ struct ccid3_options_received ccid3hctx_options_received;
+};
+
+struct ccid3_hc_rx_sock {
+ u64 ccid3hcrx_seqno_last_counter:48,
+ ccid3hcrx_state:8,
+ ccid3hcrx_last_counter:4;
+ unsigned long ccid3hcrx_rtt;
+ u32 ccid3hcrx_p;
+ u32 ccid3hcrx_bytes_recv;
+ struct timeval ccid3hcrx_tstamp_last_feedback;
+ struct timeval ccid3hcrx_tstamp_last_ack;
+ struct list_head ccid3hcrx_hist;
+ struct list_head ccid3hcrx_li_hist;
+ u16 ccid3hcrx_s;
+ u32 ccid3hcrx_pinv;
+ u32 ccid3hcrx_elapsed_time;
+ u32 ccid3hcrx_x_recv;
+};
+
+#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \
+ ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field)
+
+#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \
+ ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field)
+
+#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
new file mode 100644
index 0000000..5f940a6
--- /dev/null
+++ b/net/dccp/ccids/lib/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
+
+dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
new file mode 100644
index 0000000..4c01a54
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -0,0 +1,144 @@
+/*
+ * net/dccp/ccids/lib/loss_interval.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include "loss_interval.h"
+
+struct dccp_li_hist *dccp_li_hist_new(const char *name)
+{
+ struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+ static const char dccp_li_hist_mask[] = "li_hist_%s";
+ char *slab_name;
+
+ if (hist == NULL)
+ goto out;
+
+ slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1,
+ GFP_ATOMIC);
+ if (slab_name == NULL)
+ goto out_free_hist;
+
+ sprintf(slab_name, dccp_li_hist_mask, name);
+ hist->dccplih_slab = kmem_cache_create(slab_name,
+ sizeof(struct dccp_li_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (hist->dccplih_slab == NULL)
+ goto out_free_slab_name;
+out:
+ return hist;
+out_free_slab_name:
+ kfree(slab_name);
+out_free_hist:
+ kfree(hist);
+ hist = NULL;
+ goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_new);
+
+void dccp_li_hist_delete(struct dccp_li_hist *hist)
+{
+ const char* name = kmem_cache_name(hist->dccplih_slab);
+
+ kmem_cache_destroy(hist->dccplih_slab);
+ kfree(name);
+ kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_delete);
+
+void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list)
+{
+ struct dccp_li_hist_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, list, dccplih_node) {
+ list_del_init(&entry->dccplih_node);
+ kmem_cache_free(hist->dccplih_slab, entry);
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
+
+/* Weights used to calculate loss event rate */
+/*
+ * These are integers as per section 8 of RFC3448. We can then divide by 4 *
+ * when we use it.
+ */
+static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
+ 4, 4, 4, 4, 3, 2, 1, 1,
+};
+
+u32 dccp_li_hist_calc_i_mean(struct list_head *list)
+{
+ struct dccp_li_hist_entry *li_entry, *li_next;
+ int i = 0;
+ u32 i_tot;
+ u32 i_tot0 = 0;
+ u32 i_tot1 = 0;
+ u32 w_tot = 0;
+
+ list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
+ if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
+ i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
+ w_tot += dccp_li_hist_w[i];
+ }
+
+ if (i != 0)
+ i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
+
+ if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
+ break;
+ }
+
+ if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
+ return 0;
+
+ i_tot = max(i_tot0, i_tot1);
+
+ /* FIXME: Why do we do this? -Ian McDonald */
+ if (i_tot * 4 < w_tot)
+ i_tot = w_tot * 4;
+
+ return i_tot * 4 / w_tot;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
+
+struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+ struct list_head *list,
+ const u64 seq_loss,
+ const u8 win_loss)
+{
+ struct dccp_li_hist_entry *tail = NULL, *entry;
+ int i;
+
+ for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
+ entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
+ if (entry == NULL) {
+ dccp_li_hist_purge(hist, list);
+ return NULL;
+ }
+ if (tail == NULL)
+ tail = entry;
+ list_add(&entry->dccplih_node, list);
+ }
+
+ entry->dccplih_seqno = seq_loss;
+ entry->dccplih_win_count = win_loss;
+ return tail;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
new file mode 100644
index 0000000..13ad47b
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -0,0 +1,61 @@
+#ifndef _DCCP_LI_HIST_
+#define _DCCP_LI_HIST_
+/*
+ * net/dccp/ccids/lib/loss_interval.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#define DCCP_LI_HIST_IVAL_F_LENGTH 8
+
+struct dccp_li_hist {
+ kmem_cache_t *dccplih_slab;
+};
+
+extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
+extern void dccp_li_hist_delete(struct dccp_li_hist *hist);
+
+struct dccp_li_hist_entry {
+ struct list_head dccplih_node;
+ u64 dccplih_seqno:48,
+ dccplih_win_count:4;
+ u32 dccplih_interval;
+};
+
+static inline struct dccp_li_hist_entry *
+ dccp_li_hist_entry_new(struct dccp_li_hist *hist,
+ const unsigned int __nocast prio)
+{
+ return kmem_cache_alloc(hist->dccplih_slab, prio);
+}
+
+static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
+ struct dccp_li_hist_entry *entry)
+{
+ if (entry != NULL)
+ kmem_cache_free(hist->dccplih_slab, entry);
+}
+
+extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
+ struct list_head *list);
+
+extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
+
+extern struct dccp_li_hist_entry *
+ dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+ struct list_head *list,
+ const u64 seq_loss,
+ const u8 win_loss);
+#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
new file mode 100644
index 0000000..d3f9d20
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -0,0 +1,398 @@
+/*
+ * net/dccp/packet_history.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include "packet_history.h"
+
+struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
+{
+ struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+ static const char dccp_rx_hist_mask[] = "rx_hist_%s";
+ char *slab_name;
+
+ if (hist == NULL)
+ goto out;
+
+ slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
+ GFP_ATOMIC);
+ if (slab_name == NULL)
+ goto out_free_hist;
+
+ sprintf(slab_name, dccp_rx_hist_mask, name);
+ hist->dccprxh_slab = kmem_cache_create(slab_name,
+ sizeof(struct dccp_rx_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (hist->dccprxh_slab == NULL)
+ goto out_free_slab_name;
+out:
+ return hist;
+out_free_slab_name:
+ kfree(slab_name);
+out_free_hist:
+ kfree(hist);
+ hist = NULL;
+ goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_new);
+
+void dccp_rx_hist_delete(struct dccp_rx_hist *hist)
+{
+ const char* name = kmem_cache_name(hist->dccprxh_slab);
+
+ kmem_cache_destroy(hist->dccprxh_slab);
+ kfree(name);
+ kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_delete);
+
+void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list)
+{
+ struct dccp_rx_hist_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, list, dccphrx_node) {
+ list_del_init(&entry->dccphrx_node);
+ kmem_cache_free(hist->dccprxh_slab, entry);
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_purge);
+
+struct dccp_rx_hist_entry *
+ dccp_rx_hist_find_data_packet(const struct list_head *list)
+{
+ struct dccp_rx_hist_entry *entry, *packet = NULL;
+
+ list_for_each_entry(entry, list, dccphrx_node)
+ if (entry->dccphrx_type == DCCP_PKT_DATA ||
+ entry->dccphrx_type == DCCP_PKT_DATAACK) {
+ packet = entry;
+ break;
+ }
+
+ return packet;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
+
+int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+ struct list_head *rx_list,
+ struct list_head *li_list,
+ struct dccp_rx_hist_entry *packet)
+{
+ struct dccp_rx_hist_entry *entry, *next, *iter;
+ u8 num_later = 0;
+
+ iter = dccp_rx_hist_head(rx_list);
+ if (iter == NULL)
+ dccp_rx_hist_add_entry(rx_list, packet);
+ else {
+ const u64 seqno = packet->dccphrx_seqno;
+
+ if (after48(seqno, iter->dccphrx_seqno))
+ dccp_rx_hist_add_entry(rx_list, packet);
+ else {
+ if (dccp_rx_hist_entry_data_packet(iter))
+ num_later = 1;
+
+ list_for_each_entry_continue(iter, rx_list,
+ dccphrx_node) {
+ if (after48(seqno, iter->dccphrx_seqno)) {
+ dccp_rx_hist_add_entry(&iter->dccphrx_node,
+ packet);
+ goto trim_history;
+ }
+
+ if (dccp_rx_hist_entry_data_packet(iter))
+ num_later++;
+
+ if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
+ dccp_rx_hist_entry_delete(hist, packet);
+ return 1;
+ }
+ }
+
+ if (num_later < TFRC_RECV_NUM_LATE_LOSS)
+ dccp_rx_hist_add_entry(rx_list, packet);
+ /*
+ * FIXME: else what? should we destroy the packet
+ * like above?
+ */
+ }
+ }
+
+trim_history:
+ /*
+ * Trim history (remove all packets after the NUM_LATE_LOSS + 1
+ * data packets)
+ */
+ num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
+
+ if (!list_empty(li_list)) {
+ list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+ if (num_later == 0) {
+ list_del_init(&entry->dccphrx_node);
+ dccp_rx_hist_entry_delete(hist, entry);
+ } else if (dccp_rx_hist_entry_data_packet(entry))
+ --num_later;
+ }
+ } else {
+ int step = 0;
+ u8 win_count = 0; /* Not needed, but lets shut up gcc */
+ int tmp;
+ /*
+ * We have no loss interval history so we need at least one
+ * rtt:s of data packets to approximate rtt.
+ */
+ list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+ if (num_later == 0) {
+ switch (step) {
+ case 0:
+ step = 1;
+ /* OK, find next data packet */
+ num_later = 1;
+ break;
+ case 1:
+ step = 2;
+ /* OK, find next data packet */
+ num_later = 1;
+ win_count = entry->dccphrx_ccval;
+ break;
+ case 2:
+ tmp = win_count - entry->dccphrx_ccval;
+ if (tmp < 0)
+ tmp += TFRC_WIN_COUNT_LIMIT;
+ if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
+ /*
+ * We have found a packet older
+ * than one rtt remove the rest
+ */
+ step = 3;
+ } else /* OK, find next data packet */
+ num_later = 1;
+ break;
+ case 3:
+ list_del_init(&entry->dccphrx_node);
+ dccp_rx_hist_entry_delete(hist, entry);
+ break;
+ }
+ } else if (dccp_rx_hist_entry_data_packet(entry))
+ --num_later;
+ }
+ }
+
+ return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
+
+u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
+ struct list_head *li_list, u8 *win_loss)
+{
+ struct dccp_rx_hist_entry *entry, *next, *packet;
+ struct dccp_rx_hist_entry *a_loss = NULL;
+ struct dccp_rx_hist_entry *b_loss = NULL;
+ u64 seq_loss = DCCP_MAX_SEQNO + 1;
+ u8 num_later = TFRC_RECV_NUM_LATE_LOSS;
+
+ list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+ if (num_later == 0) {
+ b_loss = entry;
+ break;
+ } else if (dccp_rx_hist_entry_data_packet(entry))
+ --num_later;
+ }
+
+ if (b_loss == NULL)
+ goto out;
+
+ num_later = 1;
+ list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
+ if (num_later == 0) {
+ a_loss = entry;
+ break;
+ } else if (dccp_rx_hist_entry_data_packet(entry))
+ --num_later;
+ }
+
+ if (a_loss == NULL) {
+ if (list_empty(li_list)) {
+ /* no loss event have occured yet */
+ LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
+ "comparing to initial seqno\n",
+ __FUNCTION__);
+ goto out;
+ } else {
+ LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!",
+ __FUNCTION__);
+ goto out;
+ }
+ }
+
+ /* Locate a lost data packet */
+ entry = packet = b_loss;
+ list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
+ u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
+ packet->dccphrx_seqno);
+
+ if (delta != 0) {
+ if (dccp_rx_hist_entry_data_packet(packet))
+ --delta;
+ /*
+ * FIXME: check this, probably this % usage is because
+ * in earlier drafts the ndp count was just 8 bits
+ * long, but now it cam be up to 24 bits long.
+ */
+#if 0
+ if (delta % DCCP_NDP_LIMIT !=
+ (packet->dccphrx_ndp -
+ entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
+#endif
+ if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
+ seq_loss = entry->dccphrx_seqno;
+ dccp_inc_seqno(&seq_loss);
+ }
+ }
+ packet = entry;
+ if (packet == a_loss)
+ break;
+ }
+out:
+ if (seq_loss != DCCP_MAX_SEQNO + 1)
+ *win_loss = a_loss->dccphrx_ccval;
+ else
+ *win_loss = 0; /* Paranoia */
+
+ return seq_loss;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
+
+struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
+{
+ struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+ static const char dccp_tx_hist_mask[] = "tx_hist_%s";
+ char *slab_name;
+
+ if (hist == NULL)
+ goto out;
+
+ slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
+ GFP_ATOMIC);
+ if (slab_name == NULL)
+ goto out_free_hist;
+
+ sprintf(slab_name, dccp_tx_hist_mask, name);
+ hist->dccptxh_slab = kmem_cache_create(slab_name,
+ sizeof(struct dccp_tx_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if (hist->dccptxh_slab == NULL)
+ goto out_free_slab_name;
+out:
+ return hist;
+out_free_slab_name:
+ kfree(slab_name);
+out_free_hist:
+ kfree(hist);
+ hist = NULL;
+ goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_new);
+
+void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
+{
+ const char* name = kmem_cache_name(hist->dccptxh_slab);
+
+ kmem_cache_destroy(hist->dccptxh_slab);
+ kfree(name);
+ kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_delete);
+
+struct dccp_tx_hist_entry *
+ dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
+{
+ struct dccp_tx_hist_entry *packet = NULL, *entry;
+
+ list_for_each_entry(entry, list, dccphtx_node)
+ if (entry->dccphtx_seqno == seq) {
+ packet = entry;
+ break;
+ }
+
+ return packet;
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry);
+
+void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
+ struct list_head *list,
+ struct dccp_tx_hist_entry *packet)
+{
+ struct dccp_tx_hist_entry *next;
+
+ list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) {
+ list_del_init(&packet->dccphtx_node);
+ dccp_tx_hist_entry_delete(hist, packet);
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
+
+void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list)
+{
+ struct dccp_tx_hist_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, list, dccphtx_node) {
+ list_del_init(&entry->dccphtx_node);
+ dccp_tx_hist_entry_delete(hist, entry);
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_purge);
+
+MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
+ "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
+MODULE_DESCRIPTION("DCCP TFRC library");
+MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
new file mode 100644
index 0000000..fb90a91
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -0,0 +1,199 @@
+/*
+ * net/dccp/packet_history.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ * An implementation of the DCCP protocol
+ *
+ * This code has been developed by the University of Waikato WAND
+ * research group. For further information please see http://www.wand.net.nz/
+ * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ * This code also uses code from Lulea University, rereleased as GPL by its
+ * authors:
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ * and to make it work as a loadable module in the DCCP stack written by
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DCCP_PKT_HIST_
+#define _DCCP_PKT_HIST_
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include "../../dccp.h"
+
+/* Number of later packets received before one is considered lost */
+#define TFRC_RECV_NUM_LATE_LOSS 3
+
+#define TFRC_WIN_COUNT_PER_RTT 4
+#define TFRC_WIN_COUNT_LIMIT 16
+
+struct dccp_tx_hist_entry {
+ struct list_head dccphtx_node;
+ u64 dccphtx_seqno:48,
+ dccphtx_ccval:4,
+ dccphtx_sent:1;
+ u32 dccphtx_rtt;
+ struct timeval dccphtx_tstamp;
+};
+
+struct dccp_rx_hist_entry {
+ struct list_head dccphrx_node;
+ u64 dccphrx_seqno:48,
+ dccphrx_ccval:4,
+ dccphrx_type:4;
+ u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
+ struct timeval dccphrx_tstamp;
+};
+
+struct dccp_tx_hist {
+ kmem_cache_t *dccptxh_slab;
+};
+
+extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
+extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
+
+struct dccp_rx_hist {
+ kmem_cache_t *dccprxh_slab;
+};
+
+extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
+extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
+extern struct dccp_rx_hist_entry *
+ dccp_rx_hist_find_data_packet(const struct list_head *list);
+
+static inline struct dccp_tx_hist_entry *
+ dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
+ const unsigned int __nocast prio)
+{
+ struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab,
+ prio);
+
+ if (entry != NULL)
+ entry->dccphtx_sent = 0;
+
+ return entry;
+}
+
+static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist,
+ struct dccp_tx_hist_entry *entry)
+{
+ if (entry != NULL)
+ kmem_cache_free(hist->dccptxh_slab, entry);
+}
+
+extern struct dccp_tx_hist_entry *
+ dccp_tx_hist_find_entry(const struct list_head *list,
+ const u64 seq);
+
+static inline void dccp_tx_hist_add_entry(struct list_head *list,
+ struct dccp_tx_hist_entry *entry)
+{
+ list_add(&entry->dccphtx_node, list);
+}
+
+extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
+ struct list_head *list,
+ struct dccp_tx_hist_entry *next);
+
+extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist,
+ struct list_head *list);
+
+static inline struct dccp_tx_hist_entry *
+ dccp_tx_hist_head(struct list_head *list)
+{
+ struct dccp_tx_hist_entry *head = NULL;
+
+ if (!list_empty(list))
+ head = list_entry(list->next, struct dccp_tx_hist_entry,
+ dccphtx_node);
+ return head;
+}
+
+static inline struct dccp_rx_hist_entry *
+ dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
+ const u32 ndp,
+ const struct sk_buff *skb,
+ const unsigned int __nocast prio)
+{
+ struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab,
+ prio);
+
+ if (entry != NULL) {
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+
+ entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ entry->dccphrx_ccval = dh->dccph_ccval;
+ entry->dccphrx_type = dh->dccph_type;
+ entry->dccphrx_ndp = ndp;
+ do_gettimeofday(&(entry->dccphrx_tstamp));
+ }
+
+ return entry;
+}
+
+static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
+ struct dccp_rx_hist_entry *entry)
+{
+ if (entry != NULL)
+ kmem_cache_free(hist->dccprxh_slab, entry);
+}
+
+extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
+ struct list_head *list);
+
+static inline void dccp_rx_hist_add_entry(struct list_head *list,
+ struct dccp_rx_hist_entry *entry)
+{
+ list_add(&entry->dccphrx_node, list);
+}
+
+static inline struct dccp_rx_hist_entry *
+ dccp_rx_hist_head(struct list_head *list)
+{
+ struct dccp_rx_hist_entry *head = NULL;
+
+ if (!list_empty(list))
+ head = list_entry(list->next, struct dccp_rx_hist_entry,
+ dccphrx_node);
+ return head;
+}
+
+static inline int
+ dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
+{
+ return entry->dccphrx_type == DCCP_PKT_DATA ||
+ entry->dccphrx_type == DCCP_PKT_DATAACK;
+}
+
+extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+ struct list_head *rx_list,
+ struct list_head *li_list,
+ struct dccp_rx_hist_entry *packet);
+
+extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
+ struct list_head *li_list, u8 *win_loss);
+
+#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
new file mode 100644
index 0000000..130c4c4
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -0,0 +1,22 @@
+#ifndef _TFRC_H_
+#define _TFRC_H_
+/*
+ * net/dccp/ccids/lib/tfrc.h
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/types.h>
+
+extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+
+#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
new file mode 100644
index 0000000..d2b5933
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -0,0 +1,644 @@
+/*
+ * net/dccp/ccids/lib/tfrc_equation.c
+ *
+ * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <asm/bug.h>
+#include <asm/div64.h>
+
+#include "tfrc.h"
+
+#define TFRC_CALC_X_ARRSIZE 500
+
+#define TFRC_CALC_X_SPLIT 50000
+/* equivalent to 0.05 */
+
+static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
+ { 37172, 8172 },
+ { 53499, 11567 },
+ { 66664, 14180 },
+ { 78298, 16388 },
+ { 89021, 18339 },
+ { 99147, 20108 },
+ { 108858, 21738 },
+ { 118273, 23260 },
+ { 127474, 24693 },
+ { 136520, 26052 },
+ { 145456, 27348 },
+ { 154316, 28589 },
+ { 163130, 29783 },
+ { 171919, 30935 },
+ { 180704, 32049 },
+ { 189502, 33130 },
+ { 198328, 34180 },
+ { 207194, 35202 },
+ { 216114, 36198 },
+ { 225097, 37172 },
+ { 234153, 38123 },
+ { 243294, 39055 },
+ { 252527, 39968 },
+ { 261861, 40864 },
+ { 271305, 41743 },
+ { 280866, 42607 },
+ { 290553, 43457 },
+ { 300372, 44293 },
+ { 310333, 45117 },
+ { 320441, 45929 },
+ { 330705, 46729 },
+ { 341131, 47518 },
+ { 351728, 48297 },
+ { 362501, 49066 },
+ { 373460, 49826 },
+ { 384609, 50577 },
+ { 395958, 51320 },
+ { 407513, 52054 },
+ { 419281, 52780 },
+ { 431270, 53499 },
+ { 443487, 54211 },
+ { 455940, 54916 },
+ { 468635, 55614 },
+ { 481581, 56306 },
+ { 494785, 56991 },
+ { 508254, 57671 },
+ { 521996, 58345 },
+ { 536019, 59014 },
+ { 550331, 59677 },
+ { 564939, 60335 },
+ { 579851, 60988 },
+ { 595075, 61636 },
+ { 610619, 62279 },
+ { 626491, 62918 },
+ { 642700, 63553 },
+ { 659253, 64183 },
+ { 676158, 64809 },
+ { 693424, 65431 },
+ { 711060, 66050 },
+ { 729073, 66664 },
+ { 747472, 67275 },
+ { 766266, 67882 },
+ { 785464, 68486 },
+ { 805073, 69087 },
+ { 825103, 69684 },
+ { 845562, 70278 },
+ { 866460, 70868 },
+ { 887805, 71456 },
+ { 909606, 72041 },
+ { 931873, 72623 },
+ { 954614, 73202 },
+ { 977839, 73778 },
+ { 1001557, 74352 },
+ { 1025777, 74923 },
+ { 1050508, 75492 },
+ { 1075761, 76058 },
+ { 1101544, 76621 },
+ { 1127867, 77183 },
+ { 1154739, 77741 },
+ { 1182172, 78298 },
+ { 1210173, 78852 },
+ { 1238753, 79405 },
+ { 1267922, 79955 },
+ { 1297689, 80503 },
+ { 1328066, 81049 },
+ { 1359060, 81593 },
+ { 1390684, 82135 },
+ { 1422947, 82675 },
+ { 1455859, 83213 },
+ { 1489430, 83750 },
+ { 1523671, 84284 },
+ { 1558593, 84817 },
+ { 1594205, 85348 },
+ { 1630518, 85878 },
+ { 1667543, 86406 },
+ { 1705290, 86932 },
+ { 1743770, 87457 },
+ { 1782994, 87980 },
+ { 1822973, 88501 },
+ { 1863717, 89021 },
+ { 1905237, 89540 },
+ { 1947545, 90057 },
+ { 1990650, 90573 },
+ { 2034566, 91087 },
+ { 2079301, 91600 },
+ { 2124869, 92111 },
+ { 2171279, 92622 },
+ { 2218543, 93131 },
+ { 2266673, 93639 },
+ { 2315680, 94145 },
+ { 2365575, 94650 },
+ { 2416371, 95154 },
+ { 2468077, 95657 },
+ { 2520707, 96159 },
+ { 2574271, 96660 },
+ { 2628782, 97159 },
+ { 2684250, 97658 },
+ { 2740689, 98155 },
+ { 2798110, 98651 },
+ { 2856524, 99147 },
+ { 2915944, 99641 },
+ { 2976382, 100134 },
+ { 3037850, 100626 },
+ { 3100360, 101117 },
+ { 3163924, 101608 },
+ { 3228554, 102097 },
+ { 3294263, 102586 },
+ { 3361063, 103073 },
+ { 3428966, 103560 },
+ { 3497984, 104045 },
+ { 3568131, 104530 },
+ { 3639419, 105014 },
+ { 3711860, 105498 },
+ { 3785467, 105980 },
+ { 3860253, 106462 },
+ { 3936229, 106942 },
+ { 4013410, 107422 },
+ { 4091808, 107902 },
+ { 4171435, 108380 },
+ { 4252306, 108858 },
+ { 4334431, 109335 },
+ { 4417825, 109811 },
+ { 4502501, 110287 },
+ { 4588472, 110762 },
+ { 4675750, 111236 },
+ { 4764349, 111709 },
+ { 4854283, 112182 },
+ { 4945564, 112654 },
+ { 5038206, 113126 },
+ { 5132223, 113597 },
+ { 5227627, 114067 },
+ { 5324432, 114537 },
+ { 5422652, 115006 },
+ { 5522299, 115474 },
+ { 5623389, 115942 },
+ { 5725934, 116409 },
+ { 5829948, 116876 },
+ { 5935446, 117342 },
+ { 6042439, 117808 },
+ { 6150943, 118273 },
+ { 6260972, 118738 },
+ { 6372538, 119202 },
+ { 6485657, 119665 },
+ { 6600342, 120128 },
+ { 6716607, 120591 },
+ { 6834467, 121053 },
+ { 6953935, 121514 },
+ { 7075025, 121976 },
+ { 7197752, 122436 },
+ { 7322131, 122896 },
+ { 7448175, 123356 },
+ { 7575898, 123815 },
+ { 7705316, 124274 },
+ { 7836442, 124733 },
+ { 7969291, 125191 },
+ { 8103877, 125648 },
+ { 8240216, 126105 },
+ { 8378321, 126562 },
+ { 8518208, 127018 },
+ { 8659890, 127474 },
+ { 8803384, 127930 },
+ { 8948702, 128385 },
+ { 9095861, 128840 },
+ { 9244875, 129294 },
+ { 9395760, 129748 },
+ { 9548529, 130202 },
+ { 9703198, 130655 },
+ { 9859782, 131108 },
+ { 10018296, 131561 },
+ { 10178755, 132014 },
+ { 10341174, 132466 },
+ { 10505569, 132917 },
+ { 10671954, 133369 },
+ { 10840345, 133820 },
+ { 11010757, 134271 },
+ { 11183206, 134721 },
+ { 11357706, 135171 },
+ { 11534274, 135621 },
+ { 11712924, 136071 },
+ { 11893673, 136520 },
+ { 12076536, 136969 },
+ { 12261527, 137418 },
+ { 12448664, 137867 },
+ { 12637961, 138315 },
+ { 12829435, 138763 },
+ { 13023101, 139211 },
+ { 13218974, 139658 },
+ { 13417071, 140106 },
+ { 13617407, 140553 },
+ { 13819999, 140999 },
+ { 14024862, 141446 },
+ { 14232012, 141892 },
+ { 14441465, 142339 },
+ { 14653238, 142785 },
+ { 14867346, 143230 },
+ { 15083805, 143676 },
+ { 15302632, 144121 },
+ { 15523842, 144566 },
+ { 15747453, 145011 },
+ { 15973479, 145456 },
+ { 16201939, 145900 },
+ { 16432847, 146345 },
+ { 16666221, 146789 },
+ { 16902076, 147233 },
+ { 17140429, 147677 },
+ { 17381297, 148121 },
+ { 17624696, 148564 },
+ { 17870643, 149007 },
+ { 18119154, 149451 },
+ { 18370247, 149894 },
+ { 18623936, 150336 },
+ { 18880241, 150779 },
+ { 19139176, 151222 },
+ { 19400759, 151664 },
+ { 19665007, 152107 },
+ { 19931936, 152549 },
+ { 20201564, 152991 },
+ { 20473907, 153433 },
+ { 20748982, 153875 },
+ { 21026807, 154316 },
+ { 21307399, 154758 },
+ { 21590773, 155199 },
+ { 21876949, 155641 },
+ { 22165941, 156082 },
+ { 22457769, 156523 },
+ { 22752449, 156964 },
+ { 23049999, 157405 },
+ { 23350435, 157846 },
+ { 23653774, 158287 },
+ { 23960036, 158727 },
+ { 24269236, 159168 },
+ { 24581392, 159608 },
+ { 24896521, 160049 },
+ { 25214642, 160489 },
+ { 25535772, 160929 },
+ { 25859927, 161370 },
+ { 26187127, 161810 },
+ { 26517388, 162250 },
+ { 26850728, 162690 },
+ { 27187165, 163130 },
+ { 27526716, 163569 },
+ { 27869400, 164009 },
+ { 28215234, 164449 },
+ { 28564236, 164889 },
+ { 28916423, 165328 },
+ { 29271815, 165768 },
+ { 29630428, 166208 },
+ { 29992281, 166647 },
+ { 30357392, 167087 },
+ { 30725779, 167526 },
+ { 31097459, 167965 },
+ { 31472452, 168405 },
+ { 31850774, 168844 },
+ { 32232445, 169283 },
+ { 32617482, 169723 },
+ { 33005904, 170162 },
+ { 33397730, 170601 },
+ { 33792976, 171041 },
+ { 34191663, 171480 },
+ { 34593807, 171919 },
+ { 34999428, 172358 },
+ { 35408544, 172797 },
+ { 35821174, 173237 },
+ { 36237335, 173676 },
+ { 36657047, 174115 },
+ { 37080329, 174554 },
+ { 37507197, 174993 },
+ { 37937673, 175433 },
+ { 38371773, 175872 },
+ { 38809517, 176311 },
+ { 39250924, 176750 },
+ { 39696012, 177190 },
+ { 40144800, 177629 },
+ { 40597308, 178068 },
+ { 41053553, 178507 },
+ { 41513554, 178947 },
+ { 41977332, 179386 },
+ { 42444904, 179825 },
+ { 42916290, 180265 },
+ { 43391509, 180704 },
+ { 43870579, 181144 },
+ { 44353520, 181583 },
+ { 44840352, 182023 },
+ { 45331092, 182462 },
+ { 45825761, 182902 },
+ { 46324378, 183342 },
+ { 46826961, 183781 },
+ { 47333531, 184221 },
+ { 47844106, 184661 },
+ { 48358706, 185101 },
+ { 48877350, 185541 },
+ { 49400058, 185981 },
+ { 49926849, 186421 },
+ { 50457743, 186861 },
+ { 50992759, 187301 },
+ { 51531916, 187741 },
+ { 52075235, 188181 },
+ { 52622735, 188622 },
+ { 53174435, 189062 },
+ { 53730355, 189502 },
+ { 54290515, 189943 },
+ { 54854935, 190383 },
+ { 55423634, 190824 },
+ { 55996633, 191265 },
+ { 56573950, 191706 },
+ { 57155606, 192146 },
+ { 57741621, 192587 },
+ { 58332014, 193028 },
+ { 58926806, 193470 },
+ { 59526017, 193911 },
+ { 60129666, 194352 },
+ { 60737774, 194793 },
+ { 61350361, 195235 },
+ { 61967446, 195677 },
+ { 62589050, 196118 },
+ { 63215194, 196560 },
+ { 63845897, 197002 },
+ { 64481179, 197444 },
+ { 65121061, 197886 },
+ { 65765563, 198328 },
+ { 66414705, 198770 },
+ { 67068508, 199213 },
+ { 67726992, 199655 },
+ { 68390177, 200098 },
+ { 69058085, 200540 },
+ { 69730735, 200983 },
+ { 70408147, 201426 },
+ { 71090343, 201869 },
+ { 71777343, 202312 },
+ { 72469168, 202755 },
+ { 73165837, 203199 },
+ { 73867373, 203642 },
+ { 74573795, 204086 },
+ { 75285124, 204529 },
+ { 76001380, 204973 },
+ { 76722586, 205417 },
+ { 77448761, 205861 },
+ { 78179926, 206306 },
+ { 78916102, 206750 },
+ { 79657310, 207194 },
+ { 80403571, 207639 },
+ { 81154906, 208084 },
+ { 81911335, 208529 },
+ { 82672880, 208974 },
+ { 83439562, 209419 },
+ { 84211402, 209864 },
+ { 84988421, 210309 },
+ { 85770640, 210755 },
+ { 86558080, 211201 },
+ { 87350762, 211647 },
+ { 88148708, 212093 },
+ { 88951938, 212539 },
+ { 89760475, 212985 },
+ { 90574339, 213432 },
+ { 91393551, 213878 },
+ { 92218133, 214325 },
+ { 93048107, 214772 },
+ { 93883493, 215219 },
+ { 94724314, 215666 },
+ { 95570590, 216114 },
+ { 96422343, 216561 },
+ { 97279594, 217009 },
+ { 98142366, 217457 },
+ { 99010679, 217905 },
+ { 99884556, 218353 },
+ { 100764018, 218801 },
+ { 101649086, 219250 },
+ { 102539782, 219698 },
+ { 103436128, 220147 },
+ { 104338146, 220596 },
+ { 105245857, 221046 },
+ { 106159284, 221495 },
+ { 107078448, 221945 },
+ { 108003370, 222394 },
+ { 108934074, 222844 },
+ { 109870580, 223294 },
+ { 110812910, 223745 },
+ { 111761087, 224195 },
+ { 112715133, 224646 },
+ { 113675069, 225097 },
+ { 114640918, 225548 },
+ { 115612702, 225999 },
+ { 116590442, 226450 },
+ { 117574162, 226902 },
+ { 118563882, 227353 },
+ { 119559626, 227805 },
+ { 120561415, 228258 },
+ { 121569272, 228710 },
+ { 122583219, 229162 },
+ { 123603278, 229615 },
+ { 124629471, 230068 },
+ { 125661822, 230521 },
+ { 126700352, 230974 },
+ { 127745083, 231428 },
+ { 128796039, 231882 },
+ { 129853241, 232336 },
+ { 130916713, 232790 },
+ { 131986475, 233244 },
+ { 133062553, 233699 },
+ { 134144966, 234153 },
+ { 135233739, 234608 },
+ { 136328894, 235064 },
+ { 137430453, 235519 },
+ { 138538440, 235975 },
+ { 139652876, 236430 },
+ { 140773786, 236886 },
+ { 141901190, 237343 },
+ { 143035113, 237799 },
+ { 144175576, 238256 },
+ { 145322604, 238713 },
+ { 146476218, 239170 },
+ { 147636442, 239627 },
+ { 148803298, 240085 },
+ { 149976809, 240542 },
+ { 151156999, 241000 },
+ { 152343890, 241459 },
+ { 153537506, 241917 },
+ { 154737869, 242376 },
+ { 155945002, 242835 },
+ { 157158929, 243294 },
+ { 158379673, 243753 },
+ { 159607257, 244213 },
+ { 160841704, 244673 },
+ { 162083037, 245133 },
+ { 163331279, 245593 },
+ { 164586455, 246054 },
+ { 165848586, 246514 },
+ { 167117696, 246975 },
+ { 168393810, 247437 },
+ { 169676949, 247898 },
+ { 170967138, 248360 },
+ { 172264399, 248822 },
+ { 173568757, 249284 },
+ { 174880235, 249747 },
+ { 176198856, 250209 },
+ { 177524643, 250672 },
+ { 178857621, 251136 },
+ { 180197813, 251599 },
+ { 181545242, 252063 },
+ { 182899933, 252527 },
+ { 184261908, 252991 },
+ { 185631191, 253456 },
+ { 187007807, 253920 },
+ { 188391778, 254385 },
+ { 189783129, 254851 },
+ { 191181884, 255316 },
+ { 192588065, 255782 },
+ { 194001698, 256248 },
+ { 195422805, 256714 },
+ { 196851411, 257181 },
+ { 198287540, 257648 },
+ { 199731215, 258115 },
+ { 201182461, 258582 },
+ { 202641302, 259050 },
+ { 204107760, 259518 },
+ { 205581862, 259986 },
+ { 207063630, 260454 },
+ { 208553088, 260923 },
+ { 210050262, 261392 },
+ { 211555174, 261861 },
+ { 213067849, 262331 },
+ { 214588312, 262800 },
+ { 216116586, 263270 },
+ { 217652696, 263741 },
+ { 219196666, 264211 },
+ { 220748520, 264682 },
+ { 222308282, 265153 },
+ { 223875978, 265625 },
+ { 225451630, 266097 },
+ { 227035265, 266569 },
+ { 228626905, 267041 },
+ { 230226576, 267514 },
+ { 231834302, 267986 },
+ { 233450107, 268460 },
+ { 235074016, 268933 },
+ { 236706054, 269407 },
+ { 238346244, 269881 },
+ { 239994613, 270355 },
+ { 241651183, 270830 },
+ { 243315981, 271305 }
+};
+
+/* Calculate the send rate as per section 3.1 of RFC3448
+
+Returns send rate in bytes per second
+
+Integer maths and lookups are used as not allowed floating point in kernel
+
+The function for Xcalc as per section 3.1 of RFC3448 is:
+
+X = s
+ -------------------------------------------------------------
+ R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
+
+where
+X is the trasmit rate in bytes/second
+s is the packet size in bytes
+R is the round trip time in seconds
+p is the loss event rate, between 0 and 1.0, of the number of loss events
+ as a fraction of the number of packets transmitted
+t_RTO is the TCP retransmission timeout value in seconds
+b is the number of packets acknowledged by a single TCP acknowledgement
+
+we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
+
+X = s
+ -----------------------------------------------------------------------
+ R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
+
+
+which we can break down into:
+
+X = s
+ --------
+ R * f(p)
+
+where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
+
+Function parameters:
+s - bytes
+R - RTT in usecs
+p - loss rate (decimal fraction multiplied by 1,000,000)
+
+Returns Xcalc in bytes per second
+
+DON'T alter this code unless you run test cases against it as the code
+has been manipulated to stop underflow/overlow.
+
+*/
+u32 tfrc_calc_x(u16 s, u32 R, u32 p)
+{
+ int index;
+ u32 f;
+ u64 tmp1, tmp2;
+
+ if (p < TFRC_CALC_X_SPLIT)
+ index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
+ else
+ index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
+
+ if (index < 0)
+ /* p should be 0 unless there is a bug in my code */
+ index = 0;
+
+ if (R == 0)
+ R = 1; /* RTT can't be zero or else divide by zero */
+
+ BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
+
+ if (p >= TFRC_CALC_X_SPLIT)
+ f = tfrc_calc_x_lookup[index][0];
+ else
+ f = tfrc_calc_x_lookup[index][1];
+
+ tmp1 = ((u64)s * 100000000);
+ tmp2 = ((u64)R * (u64)f);
+ do_div(tmp2, 10000);
+ do_div(tmp1, tmp2);
+ /* Don't alter above math unless you test due to overflow on 32 bit */
+
+ return (u32)tmp1;
+}
+
+EXPORT_SYMBOL_GPL(tfrc_calc_x);
+
+/*
+ * args: fvalue - function value to match
+ * returns: p closest to that value
+ *
+ * both fvalue and p are multiplied by 1,000,000 to use ints
+ */
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
+{
+ int ctr = 0;
+ int small;
+
+ if (fvalue < tfrc_calc_x_lookup[0][1])
+ return 0;
+
+ if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
+ small = 1;
+ else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
+ return 1000000;
+ else
+ small = 0;
+
+ while (fvalue > tfrc_calc_x_lookup[ctr][small])
+ ctr++;
+
+ if (small)
+ return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
+ else
+ return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
+}
+
+EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
new file mode 100644
index 0000000..33456c0
--- /dev/null
+++ b/net/dccp/dccp.h
@@ -0,0 +1,493 @@
+#ifndef _DCCP_H
+#define _DCCP_H
+/*
+ * net/dccp/dccp.h
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern int dccp_debug;
+
+#define dccp_pr_debug(format, a...) \
+ do { if (dccp_debug) \
+ printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \
+ } while (0)
+#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \
+ printk(format, ##a); } while (0)
+#else
+#define dccp_pr_debug(format, a...)
+#define dccp_pr_debug_cat(format, a...)
+#endif
+
+extern struct inet_hashinfo dccp_hashinfo;
+
+extern atomic_t dccp_orphan_count;
+extern int dccp_tw_count;
+extern void dccp_tw_deschedule(struct inet_timewait_sock *tw);
+
+extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+
+/* FIXME: Right size this */
+#define DCCP_MAX_OPT_LEN 128
+
+#define DCCP_MAX_PACKET_HDR 32
+
+#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER)
+
+#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
+ * state, about 60 seconds */
+
+/* draft-ietf-dccp-spec-11.txt initial RTO value */
+#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+
+/* Maximal interval between probes for local resources. */
+#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
+
+#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
+
+extern struct proto dccp_v4_prot;
+
+/* is seq1 < seq2 ? */
+static inline int before48(const u64 seq1, const u64 seq2)
+{
+ return (s64)((seq1 << 16) - (seq2 << 16)) < 0;
+}
+
+/* is seq1 > seq2 ? */
+static inline int after48(const u64 seq1, const u64 seq2)
+{
+ return (s64)((seq2 << 16) - (seq1 << 16)) < 0;
+}
+
+/* is seq2 <= seq1 <= seq3 ? */
+static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
+{
+ return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
+}
+
+static inline u64 max48(const u64 seq1, const u64 seq2)
+{
+ return after48(seq1, seq2) ? seq1 : seq2;
+}
+
+enum {
+ DCCP_MIB_NUM = 0,
+ DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
+ DCCP_MIB_ESTABRESETS, /* EstabResets */
+ DCCP_MIB_CURRESTAB, /* CurrEstab */
+ DCCP_MIB_OUTSEGS, /* OutSegs */
+ DCCP_MIB_OUTRSTS,
+ DCCP_MIB_ABORTONTIMEOUT,
+ DCCP_MIB_TIMEOUTS,
+ DCCP_MIB_ABORTFAILED,
+ DCCP_MIB_PASSIVEOPENS,
+ DCCP_MIB_ATTEMPTFAILS,
+ DCCP_MIB_OUTDATAGRAMS,
+ DCCP_MIB_INERRS,
+ DCCP_MIB_OPTMANDATORYERROR,
+ DCCP_MIB_INVALIDOPT,
+ __DCCP_MIB_MAX
+};
+
+#define DCCP_MIB_MAX __DCCP_MIB_MAX
+struct dccp_mib {
+ unsigned long mibs[DCCP_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
+DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
+#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
+#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
+#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
+#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
+#define DCCP_ADD_STATS_BH(field, val) \
+ SNMP_ADD_STATS_BH(dccp_statistics, field, val)
+#define DCCP_ADD_STATS_USER(field, val) \
+ SNMP_ADD_STATS_USER(dccp_statistics, field, val)
+
+extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb);
+extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
+
+extern int dccp_send_response(struct sock *sk);
+extern void dccp_send_ack(struct sock *sk);
+extern void dccp_send_delayed_ack(struct sock *sk);
+extern void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type);
+
+extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo);
+extern void dccp_write_space(struct sock *sk);
+
+extern void dccp_init_xmit_timers(struct sock *sk);
+static inline void dccp_clear_xmit_timers(struct sock *sk)
+{
+ inet_csk_clear_xmit_timers(sk);
+}
+
+extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+
+extern const char *dccp_packet_name(const int type);
+extern const char *dccp_state_name(const int state);
+
+static inline void dccp_set_state(struct sock *sk, const int state)
+{
+ const int oldstate = sk->sk_state;
+
+ dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
+ dccp_role(sk), sk,
+ dccp_state_name(oldstate), dccp_state_name(state));
+ WARN_ON(state == oldstate);
+
+ switch (state) {
+ case DCCP_OPEN:
+ if (oldstate != DCCP_OPEN)
+ DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+ break;
+
+ case DCCP_CLOSED:
+ if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
+ DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash != NULL &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(&dccp_hashinfo, sk);
+ /* fall through */
+ default:
+ if (oldstate == DCCP_OPEN)
+ DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+}
+
+static inline void dccp_done(struct sock *sk)
+{
+ dccp_set_state(sk, DCCP_CLOSED);
+ dccp_clear_xmit_timers(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk);
+ else
+ inet_csk_destroy_sock(sk);
+}
+
+static inline void dccp_openreq_init(struct request_sock *req,
+ struct dccp_sock *dp,
+ struct sk_buff *skb)
+{
+ /*
+ * FIXME: fill in the other req fields from the DCCP options
+ * received
+ */
+ inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
+ inet_rsk(req)->acked = 0;
+ req->rcv_wnd = 0;
+}
+
+extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+
+extern struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
+
+extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+extern void dccp_v4_err(struct sk_buff *skb, u32);
+
+extern int dccp_v4_rcv(struct sk_buff *skb);
+
+extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev);
+
+extern int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb);
+extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned len);
+extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned len);
+
+extern void dccp_close(struct sock *sk, long timeout);
+extern struct sk_buff *dccp_make_response(struct sock *sk,
+ struct dst_entry *dst,
+ struct request_sock *req);
+extern struct sk_buff *dccp_make_reset(struct sock *sk,
+ struct dst_entry *dst,
+ enum dccp_reset_codes code);
+
+extern int dccp_connect(struct sock *sk);
+extern int dccp_disconnect(struct sock *sk, int flags);
+extern int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+extern int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen);
+extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t size);
+extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t len, int nonblock,
+ int flags, int *addr_len);
+extern void dccp_shutdown(struct sock *sk, int how);
+
+extern int dccp_v4_checksum(const struct sk_buff *skb,
+ const u32 saddr, const u32 daddr);
+
+extern int dccp_v4_send_reset(struct sock *sk,
+ enum dccp_reset_codes code);
+extern void dccp_send_close(struct sock *sk, const int active);
+
+struct dccp_skb_cb {
+ __u8 dccpd_type;
+ __u8 dccpd_reset_code;
+ __u8 dccpd_service;
+ __u8 dccpd_ccval;
+ __u64 dccpd_seq;
+ __u64 dccpd_ack_seq;
+ int dccpd_opt_len;
+};
+
+#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
+
+static inline int dccp_non_data_packet(const struct sk_buff *skb)
+{
+ const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+ return type == DCCP_PKT_ACK ||
+ type == DCCP_PKT_CLOSE ||
+ type == DCCP_PKT_CLOSEREQ ||
+ type == DCCP_PKT_RESET ||
+ type == DCCP_PKT_SYNC ||
+ type == DCCP_PKT_SYNCACK;
+}
+
+static inline int dccp_packet_without_ack(const struct sk_buff *skb)
+{
+ const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+ return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
+}
+
+#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1)
+#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2)
+
+static inline void dccp_set_seqno(u64 *seqno, u64 value)
+{
+ if (value > DCCP_MAX_SEQNO)
+ value -= DCCP_MAX_SEQNO + 1;
+ *seqno = value;
+}
+
+static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2)
+{
+ return ((seqno2 << 16) - (seqno1 << 16)) >> 16;
+}
+
+static inline void dccp_inc_seqno(u64 *seqno)
+{
+ if (++*seqno > DCCP_MAX_SEQNO)
+ *seqno = 0;
+}
+
+static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
+{
+ struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
+ sizeof(*dh));
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ dh->dccph_seq = htonl((gss >> 32)) >> 8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ dh->dccph_seq = htonl((gss >> 32));
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+ dhx->dccph_seq_low = htonl(gss & 0xffffffff);
+}
+
+static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
+ const u64 gsr)
+{
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ dhack->dccph_ack_nr_high = htonl((gsr >> 32));
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+ dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
+}
+
+static inline void dccp_update_gsr(struct sock *sk, u64 seq)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_gsr = seq;
+ dccp_set_seqno(&dp->dccps_swl,
+ (dp->dccps_gsr + 1 -
+ (dp->dccps_options.dccpo_sequence_window / 4)));
+ dccp_set_seqno(&dp->dccps_swh,
+ (dp->dccps_gsr +
+ (3 * dp->dccps_options.dccpo_sequence_window) / 4));
+}
+
+static inline void dccp_update_gss(struct sock *sk, u64 seq)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_awh = dp->dccps_gss = seq;
+ dccp_set_seqno(&dp->dccps_awl,
+ (dp->dccps_gss -
+ dp->dccps_options.dccpo_sequence_window + 1));
+}
+
+extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+extern void dccp_insert_option_elapsed_time(struct sock *sk,
+ struct sk_buff *skb,
+ u32 elapsed_time);
+extern void dccp_insert_option_timestamp(struct sock *sk,
+ struct sk_buff *skb);
+extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
+ unsigned char option,
+ const void *value, unsigned char len);
+
+extern struct socket *dccp_ctl_socket;
+
+#define DCCP_ACKPKTS_STATE_RECEIVED 0
+#define DCCP_ACKPKTS_STATE_ECN_MARKED (1 << 6)
+#define DCCP_ACKPKTS_STATE_NOT_RECEIVED (3 << 6)
+
+#define DCCP_ACKPKTS_STATE_MASK 0xC0 /* 11000000 */
+#define DCCP_ACKPKTS_LEN_MASK 0x3F /* 00111111 */
+
+/** struct dccp_ackpkts - acknowledgeable packets
+ *
+ * This data structure is the one defined in the DCCP draft
+ * Appendix A.
+ *
+ * @dccpap_buf_head - circular buffer head
+ * @dccpap_buf_tail - circular buffer tail
+ * @dccpap_buf_ackno - ack # of the most recent packet acknowledgeable in the
+ * buffer (i.e. %dccpap_buf_head)
+ * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
+ * by the buffer with State 0
+ *
+ * Additionally, the HC-Receiver must keep some information about the
+ * Ack Vectors it has recently sent. For each packet sent carrying an
+ * Ack Vector, it remembers four variables:
+ *
+ * @dccpap_ack_seqno - the Sequence Number used for the packet
+ * (HC-Receiver seqno)
+ * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement.
+ * @dccpap_ack_ackno - the Acknowledgement Number used for the packet
+ * (HC-Sender seqno)
+ * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ *
+ * @dccpap_buf_len - circular buffer length
+ * @dccpap_time - the time in usecs
+ * @dccpap_buf - circular buffer of acknowledgeable packets
+ */
+struct dccp_ackpkts {
+ unsigned int dccpap_buf_head;
+ unsigned int dccpap_buf_tail;
+ u64 dccpap_buf_ackno;
+ u64 dccpap_ack_seqno;
+ u64 dccpap_ack_ackno;
+ unsigned int dccpap_ack_ptr;
+ unsigned int dccpap_buf_vector_len;
+ unsigned int dccpap_ack_vector_len;
+ unsigned int dccpap_buf_len;
+ struct timeval dccpap_time;
+ u8 dccpap_buf_nonce;
+ u8 dccpap_ack_nonce;
+ u8 dccpap_buf[0];
+};
+
+extern struct dccp_ackpkts *
+ dccp_ackpkts_alloc(unsigned int len,
+ const unsigned int __nocast priority);
+extern void dccp_ackpkts_free(struct dccp_ackpkts *ap);
+extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state);
+extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap,
+ struct sock *sk, u64 ackno);
+
+static inline suseconds_t timeval_usecs(const struct timeval *tv)
+{
+ return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
+}
+
+static inline suseconds_t timeval_delta(const struct timeval *large,
+ const struct timeval *small)
+{
+ time_t secs = large->tv_sec - small->tv_sec;
+ suseconds_t usecs = large->tv_usec - small->tv_usec;
+
+ if (usecs < 0) {
+ secs--;
+ usecs += USEC_PER_SEC;
+ }
+ return secs * USEC_PER_SEC + usecs;
+}
+
+static inline void timeval_add_usecs(struct timeval *tv,
+ const suseconds_t usecs)
+{
+ tv->tv_usec += usecs;
+ while (tv->tv_usec >= USEC_PER_SEC) {
+ tv->tv_sec++;
+ tv->tv_usec -= USEC_PER_SEC;
+ }
+}
+
+static inline void timeval_sub_usecs(struct timeval *tv,
+ const suseconds_t usecs)
+{
+ tv->tv_usec -= usecs;
+ while (tv->tv_usec < 0) {
+ tv->tv_sec--;
+ tv->tv_usec += USEC_PER_SEC;
+ }
+}
+
+/*
+ * Returns the difference in usecs between timeval
+ * passed in and current time
+ */
+static inline suseconds_t timeval_now_delta(const struct timeval *tv)
+{
+ struct timeval now;
+ do_gettimeofday(&now);
+ return timeval_delta(&now, tv);
+}
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern void dccp_ackvector_print(const u64 ackno,
+ const unsigned char *vector, int len);
+extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap);
+#else
+static inline void dccp_ackvector_print(const u64 ackno,
+ const unsigned char *vector,
+ int len) { }
+static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { }
+#endif
+
+#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
new file mode 100644
index 0000000..f675d8e6
--- /dev/null
+++ b/net/dccp/diag.c
@@ -0,0 +1,71 @@
+/*
+ * net/dccp/diag.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_get_info(struct sock *sk, struct tcp_info *info)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ memset(info, 0, sizeof(*info));
+
+ info->tcpi_state = sk->sk_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
+ info->tcpi_pmtu = dp->dccps_pmtu_cookie;
+
+ if (dp->dccps_options.dccpo_send_ack_vector)
+ info->tcpi_options |= TCPI_OPT_SACK;
+
+ ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+ ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+}
+
+static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
+{
+ r->idiag_rqueue = r->idiag_wqueue = 0;
+
+ if (_info != NULL)
+ dccp_get_info(sk, _info);
+}
+
+static struct inet_diag_handler dccp_diag_handler = {
+ .idiag_hashinfo = &dccp_hashinfo,
+ .idiag_get_info = dccp_diag_get_info,
+ .idiag_type = DCCPDIAG_GETSOCK,
+ .idiag_info_size = sizeof(struct tcp_info),
+};
+
+static int __init dccp_diag_init(void)
+{
+ return inet_diag_register(&dccp_diag_handler);
+}
+
+static void __exit dccp_diag_fini(void)
+{
+ inet_diag_unregister(&dccp_diag_handler);
+}
+
+module_init(dccp_diag_init);
+module_exit(dccp_diag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP inet_diag handler");
diff --git a/net/dccp/input.c b/net/dccp/input.c
new file mode 100644
index 0000000..ef29cef
--- /dev/null
+++ b/net/dccp/input.c
@@ -0,0 +1,600 @@
+/*
+ * net/dccp/input.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_fin(struct sock *sk, struct sk_buff *skb)
+{
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(sk, SOCK_DONE);
+ __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ sk->sk_data_ready(sk, 0);
+}
+
+static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
+{
+ dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+ dccp_fin(sk, skb);
+ dccp_set_state(sk, DCCP_CLOSED);
+ sk_wake_async(sk, 1, POLL_HUP);
+}
+
+static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+{
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == CloseReq)
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+ return;
+ }
+
+ dccp_set_state(sk, DCCP_CLOSING);
+ dccp_send_close(sk, 0);
+}
+
+static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dp->dccps_options.dccpo_send_ack_vector)
+ dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk,
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+}
+
+static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ struct dccp_sock *dp = dccp_sk(sk);
+ u64 lswl, lawl;
+
+ /*
+ * Step 5: Prepare sequence numbers for Sync
+ * If P.type == Sync or P.type == SyncAck,
+ * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
+ * / * P is valid, so update sequence number variables
+ * accordingly. After this update, P will pass the tests
+ * in Step 6. A SyncAck is generated if necessary in
+ * Step 15 * /
+ * Update S.GSR, S.SWL, S.SWH
+ * Otherwise,
+ * Drop packet and return
+ */
+ if (dh->dccph_type == DCCP_PKT_SYNC ||
+ dh->dccph_type == DCCP_PKT_SYNCACK) {
+ if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dp->dccps_awl, dp->dccps_awh) &&
+ !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl))
+ dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+ else
+ return -1;
+ }
+
+ /*
+ * Step 6: Check sequence numbers
+ * Let LSWL = S.SWL and LAWL = S.AWL
+ * If P.type == CloseReq or P.type == Close or P.type == Reset,
+ * LSWL := S.GSR + 1, LAWL := S.GAR
+ * If LSWL <= P.seqno <= S.SWH
+ * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
+ * Update S.GSR, S.SWL, S.SWH
+ * If P.type != Sync,
+ * Update S.GAR
+ * Otherwise,
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ lswl = dp->dccps_swl;
+ lawl = dp->dccps_awl;
+
+ if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
+ dh->dccph_type == DCCP_PKT_CLOSE ||
+ dh->dccph_type == DCCP_PKT_RESET) {
+ lswl = dp->dccps_gsr;
+ dccp_inc_seqno(&lswl);
+ lawl = dp->dccps_gar;
+ }
+
+ if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) &&
+ (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ ||
+ between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ lawl, dp->dccps_awh))) {
+ dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+
+ if (dh->dccph_type != DCCP_PKT_SYNC &&
+ (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
+ DCCP_PKT_WITHOUT_ACK_SEQ))
+ dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+ } else {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, "
+ "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
+ "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
+ "sending SYNC...\n",
+ dccp_packet_name(dh->dccph_type),
+ (unsigned long long) lswl,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ (unsigned long long) dp->dccps_swh,
+ (DCCP_SKB_CB(skb)->dccpd_ack_seq ==
+ DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists",
+ (unsigned long long) lawl,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long) dp->dccps_awh);
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+ return -1;
+ }
+
+ return 0;
+}
+
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ if (dccp_parse_options(sk, skb))
+ goto discard;
+
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_event_ack_recv(sk, skb);
+
+ /*
+ * FIXME: check ECN to see if we should use
+ * DCCP_ACKPKTS_STATE_ECN_MARKED
+ */
+ if (dp->dccps_options.dccpo_send_ack_vector) {
+ struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
+
+ if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKPKTS_STATE_RECEIVED)) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable "
+ "packets buffer full!\n");
+ ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MIN,
+ DCCP_RTO_MAX);
+ goto discard;
+ }
+
+ /*
+ * FIXME: this activation is probably wrong, have to study more
+ * TCP delack machinery and how it fits into DCCP draft, but
+ * for now it kinda "works" 8)
+ */
+ if (!inet_csk_ack_scheduled(sk)) {
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ,
+ DCCP_RTO_MAX);
+ }
+ }
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+ switch (dccp_hdr(skb)->dccph_type) {
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_DATA:
+ /*
+ * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
+ * option if it is.
+ */
+ __skb_pull(skb, dh->dccph_doff * 4);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, sk);
+ sk->sk_data_ready(sk, 0);
+ return 0;
+ case DCCP_PKT_ACK:
+ goto discard;
+ case DCCP_PKT_RESET:
+ /*
+ * Step 9: Process Reset
+ * If P.type == Reset,
+ * Tear down connection
+ * S.state := TIMEWAIT
+ * Set TIMEWAIT timer
+ * Drop packet and return
+ */
+ dccp_fin(sk, skb);
+ dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+ return 0;
+ case DCCP_PKT_CLOSEREQ:
+ dccp_rcv_closereq(sk, skb);
+ goto discard;
+ case DCCP_PKT_CLOSE:
+ dccp_rcv_close(sk, skb);
+ return 0;
+ case DCCP_PKT_REQUEST:
+ /* Step 7
+ * or (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state >= OPEN and P.type == Request
+ * and P.seqno >= S.OSR)
+ * or (S.state >= OPEN and P.type == Response
+ * and P.seqno >= S.OSR)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if (dp->dccps_role != DCCP_ROLE_LISTEN)
+ goto send_sync;
+ goto check_seq;
+ case DCCP_PKT_RESPONSE:
+ if (dp->dccps_role != DCCP_ROLE_CLIENT)
+ goto send_sync;
+check_seq:
+ if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
+send_sync:
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNC);
+ }
+ break;
+ case DCCP_PKT_SYNC:
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNCACK);
+ /*
+ * From the draft:
+ *
+ * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
+ * MAY have non-zero-length application data areas, whose
+ * contents * receivers MUST ignore.
+ */
+ goto discard;
+ }
+
+ DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
+discard:
+ __kfree_skb(skb);
+ return 0;
+}
+
+static int dccp_rcv_request_sent_state_process(struct sock *sk,
+ struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned len)
+{
+ /*
+ * Step 4: Prepare sequence numbers in REQUEST
+ * If S.state == REQUEST,
+ * If (P.type == Response or P.type == Reset)
+ * and S.AWL <= P.ackno <= S.AWH,
+ * / * Set sequence number variables corresponding to the
+ * other endpoint, so P will pass the tests in Step 6 * /
+ * Set S.GSR, S.ISR, S.SWL, S.SWH
+ * / * Response processing continues in Step 10; Reset
+ * processing continues in Step 9 * /
+ */
+ if (dh->dccph_type == DCCP_PKT_RESPONSE) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ /* Stop the REQUEST timer */
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ BUG_TRAP(sk->sk_send_head != NULL);
+ __kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+
+ if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dp->dccps_awl, dp->dccps_awh)) {
+ dccp_pr_debug("invalid ackno: S.AWL=%llu, "
+ "P.ackno=%llu, S.AWH=%llu \n",
+ (unsigned long long)dp->dccps_awl,
+ (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long)dp->dccps_awh);
+ goto out_invalid_packet;
+ }
+
+ dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dccp_update_gsr(sk, dp->dccps_isr);
+ /*
+ * SWL and AWL are initially adjusted so that they are not less than
+ * the initial Sequence Numbers received and sent, respectively:
+ * SWL := max(GSR + 1 - floor(W/4), ISR),
+ * AWL := max(GSS - W' + 1, ISS).
+ * These adjustments MUST be applied only at the beginning of the
+ * connection.
+ *
+ * AWL was adjusted in dccp_v4_connect -acme
+ */
+ dccp_set_seqno(&dp->dccps_swl,
+ max48(dp->dccps_swl, dp->dccps_isr));
+
+ if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
+ ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
+ ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+ /* FIXME: send appropriate RESET code */
+ goto out_invalid_packet;
+ }
+
+ dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+
+ /*
+ * Step 10: Process REQUEST state (second part)
+ * If S.state == REQUEST,
+ * / * If we get here, P is a valid Response from the
+ * server (see Step 4), and we should move to
+ * PARTOPEN state. PARTOPEN means send an Ack,
+ * don't send Data packets, retransmit Acks
+ * periodically, and always include any Init Cookie
+ * from the Response * /
+ * S.state := PARTOPEN
+ * Set PARTOPEN timer
+ * Continue with S.state == PARTOPEN
+ * / * Step 12 will send the Ack completing the
+ * three-way handshake * /
+ */
+ dccp_set_state(sk, DCCP_PARTOPEN);
+
+ /* Make sure socket is routed, for correct metrics. */
+ inet_sk_rebuild_header(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, 0, POLL_OUT);
+ }
+
+ if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
+ icsk->icsk_accept_queue.rskq_defer_accept) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+ * It may be deleted, but with this feature tcpdumps
+ * look so _wonderfully_ clever, that I was not able
+ * to stand against the temptation 8) --ANK
+ */
+ /*
+ * OK, in DCCP we can as well do a similar trick, its
+ * even in the draft, but there is no need for us to
+ * schedule an ack here, as dccp_sendmsg does this for
+ * us, also stated in the draft. -acme
+ */
+ __kfree_skb(skb);
+ return 0;
+ }
+ dccp_send_ack(sk);
+ return -1;
+ }
+
+out_invalid_packet:
+ return 1; /* dccp_v4_do_rcv will send a reset, but...
+ FIXME: the reset code should be
+ DCCP_RESET_CODE_PACKET_ERROR */
+}
+
+static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
+ struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ const unsigned len)
+{
+ int queued = 0;
+
+ switch (dh->dccph_type) {
+ case DCCP_PKT_RESET:
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ break;
+ case DCCP_PKT_DATAACK:
+ case DCCP_PKT_ACK:
+ /*
+ * FIXME: we should be reseting the PARTOPEN (DELACK) timer
+ * here but only if we haven't used the DELACK timer for
+ * something else, like sending a delayed ack for a TIMESTAMP
+ * echo, etc, for now were not clearing it, sending an extra
+ * ACK when there is nothing else to do in DELACK is not a big
+ * deal after all.
+ */
+
+ /* Stop the PARTOPEN timer */
+ if (sk->sk_state == DCCP_PARTOPEN)
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+
+ dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dccp_set_state(sk, DCCP_OPEN);
+
+ if (dh->dccph_type == DCCP_PKT_DATAACK) {
+ dccp_rcv_established(sk, skb, dh, len);
+ queued = 1; /* packet was queued
+ (by dccp_rcv_established) */
+ }
+ break;
+ }
+
+ return queued;
+}
+
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ const int old_state = sk->sk_state;
+ int queued = 0;
+
+ /*
+ * Step 3: Process LISTEN state
+ * (Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv)
+ *
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie
+ * option,
+ * * Must scan the packet's options to check for an Init
+ * Cookie. Only the Init Cookie is processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *
+ * * Generate a new socket and switch to that socket *
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookie
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ * Continue with S.state == RESPOND
+ * * A Response packet will be generated in Step 11 *
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ if (dh->dccph_type == DCCP_PKT_REQUEST) {
+ if (dccp_v4_conn_request(sk, skb) < 0)
+ return 1;
+
+ /* FIXME: do congestion control initialization */
+ goto discard;
+ }
+ if (dh->dccph_type == DCCP_PKT_RESET)
+ goto discard;
+
+ /* Caller (dccp_v4_do_rcv) will send Reset(No Connection)*/
+ return 1;
+ }
+
+ if (sk->sk_state != DCCP_REQUESTING) {
+ if (dccp_check_seqno(sk, skb))
+ goto discard;
+
+ /*
+ * Step 8: Process options and mark acknowledgeable
+ */
+ if (dccp_parse_options(sk, skb))
+ goto discard;
+
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
+ DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_event_ack_recv(sk, skb);
+
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+ /*
+ * FIXME: check ECN to see if we should use
+ * DCCP_ACKPKTS_STATE_ECN_MARKED
+ */
+ if (dp->dccps_options.dccpo_send_ack_vector) {
+ if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
+ DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_ACKPKTS_STATE_RECEIVED))
+ goto discard;
+ /*
+ * FIXME: this activation is probably wrong, have to
+ * study more TCP delack machinery and how it fits into
+ * DCCP draft, but for now it kinda "works" 8)
+ */
+ if ((dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno ==
+ DCCP_MAX_SEQNO + 1) &&
+ !inet_csk_ack_scheduled(sk)) {
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MIN,
+ DCCP_RTO_MAX);
+ }
+ }
+ }
+
+ /*
+ * Step 9: Process Reset
+ * If P.type == Reset,
+ * Tear down connection
+ * S.state := TIMEWAIT
+ * Set TIMEWAIT timer
+ * Drop packet and return
+ */
+ if (dh->dccph_type == DCCP_PKT_RESET) {
+ /*
+ * Queue the equivalent of TCP fin so that dccp_recvmsg
+ * exits the loop
+ */
+ dccp_fin(sk, skb);
+ dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+ return 0;
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == CloseReq)
+ * or (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+ (dh->dccph_type == DCCP_PKT_RESPONSE ||
+ dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
+ (dp->dccps_role == DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_REQUEST) ||
+ (sk->sk_state == DCCP_RESPOND &&
+ dh->dccph_type == DCCP_PKT_DATA)) {
+ dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+ DCCP_PKT_SYNC);
+ goto discard;
+ } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+ dccp_rcv_closereq(sk, skb);
+ goto discard;
+ } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+ dccp_rcv_close(sk, skb);
+ return 0;
+ }
+
+ switch (sk->sk_state) {
+ case DCCP_CLOSED:
+ return 1;
+
+ case DCCP_REQUESTING:
+ /* FIXME: do congestion control initialization */
+
+ queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
+ if (queued >= 0)
+ return queued;
+
+ __kfree_skb(skb);
+ return 0;
+
+ case DCCP_RESPOND:
+ case DCCP_PARTOPEN:
+ queued = dccp_rcv_respond_partopen_state_process(sk, skb,
+ dh, len);
+ break;
+ }
+
+ if (dh->dccph_type == DCCP_PKT_ACK ||
+ dh->dccph_type == DCCP_PKT_DATAACK) {
+ switch (old_state) {
+ case DCCP_PARTOPEN:
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, 0, POLL_OUT);
+ break;
+ }
+ }
+
+ if (!queued) {
+discard:
+ __kfree_skb(skb);
+ }
+ return 0;
+}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
new file mode 100644
index 0000000..3fc75db
--- /dev/null
+++ b/net/dccp/ipv4.c
@@ -0,0 +1,1356 @@
+/*
+ * net/dccp/ipv4.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/icmp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
+ .lhash_lock = RW_LOCK_UNLOCKED,
+ .lhash_users = ATOMIC_INIT(0),
+ .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
+ .portalloc_lock = SPIN_LOCK_UNLOCKED,
+ .port_rover = 1024 - 1,
+};
+
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
+{
+ return inet_csk_get_port(&dccp_hashinfo, sk, snum);
+}
+
+static void dccp_v4_hash(struct sock *sk)
+{
+ inet_hash(&dccp_hashinfo, sk);
+}
+
+static void dccp_v4_unhash(struct sock *sk)
+{
+ inet_unhash(&dccp_hashinfo, sk);
+}
+
+/* called with local bh disabled */
+static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ const u32 daddr = inet->rcv_saddr;
+ const u32 saddr = inet->daddr;
+ const int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport,
+ dccp_hashinfo.ehash_size);
+ struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash];
+ const struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
+ write_lock(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
+ tw = inet_twsk(sk2);
+
+ if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+ tw = NULL;
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+ if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
+ }
+
+ /* Must record num and sport now. Otherwise we will see
+ * in hash table socket with a funny identity. */
+ inet->num = lport;
+ inet->sport = htons(lport);
+ sk->sk_hashent = hash;
+ BUG_TRAP(sk_unhashed(sk));
+ __sk_add_node(sk, &head->chain);
+ sock_prot_inc_use(sk->sk_prot);
+ write_unlock(&head->lock);
+
+ if (twp != NULL) {
+ *twp = tw;
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+ } else if (tw != NULL) {
+ /* Silly. Should hash-dance instead... */
+ inet_twsk_deschedule(tw, &dccp_death_row);
+ NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+ inet_twsk_put(tw);
+ }
+
+ return 0;
+
+not_unique:
+ write_unlock(&head->lock);
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+static int dccp_v4_hash_connect(struct sock *sk)
+{
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ if (snum == 0) {
+ int rover;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ struct hlist_node *node;
+ struct inet_timewait_sock *tw = NULL;
+
+ local_bh_disable();
+
+ /* TODO. Actually it is not so bad idea to remove
+ * dccp_hashinfo.portalloc_lock before next submission to
+ * Linus.
+ * As soon as we touch this place at all it is time to think.
+ *
+ * Now it protects single _advisory_ variable
+ * dccp_hashinfo.port_rover, hence it is mostly useless.
+ * Code will work nicely if we just delete it, but
+ * I am afraid in contented case it will work not better or
+ * even worse: another cpu just will hit the same bucket
+ * and spin there.
+ * So some cpu salt could remove both contention and
+ * memory pingpong. Any ideas how to do this in a nice way?
+ */
+ spin_lock(&dccp_hashinfo.portalloc_lock);
+ rover = dccp_hashinfo.port_rover;
+
+ do {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
+ dccp_hashinfo.bhash_size)];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+ * because the established check is already
+ * unique enough.
+ */
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (tb->port == rover) {
+ BUG_TRAP(!hlist_empty(&tb->owners));
+ if (tb->fastreuse >= 0)
+ goto next_port;
+ if (!__dccp_v4_check_established(sk,
+ rover,
+ &tw))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
+ head, rover);
+ if (tb == NULL) {
+ spin_unlock(&head->lock);
+ break;
+ }
+ tb->fastreuse = -1;
+ goto ok;
+
+ next_port:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ dccp_hashinfo.port_rover = rover;
+ spin_unlock(&dccp_hashinfo.portalloc_lock);
+
+ local_bh_enable();
+
+ return -EADDRNOTAVAIL;
+
+ok:
+ /* All locks still held and bhs disabled */
+ dccp_hashinfo.port_rover = rover;
+ spin_unlock(&dccp_hashinfo.portalloc_lock);
+
+ inet_bind_hash(sk, tb, rover);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->sport = htons(rover);
+ __inet_hash(&dccp_hashinfo, sk, 0);
+ }
+ spin_unlock(&head->lock);
+
+ if (tw != NULL) {
+ inet_twsk_deschedule(tw, &dccp_death_row);
+ inet_twsk_put(tw);
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
+ dccp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
+ __inet_hash(&dccp_hashinfo, sk, 0);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = __dccp_v4_check_established(sk, snum, NULL);
+out:
+ local_bh_enable();
+ return ret;
+ }
+}
+
+static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct rtable *rt;
+ u32 daddr, nexthop;
+ int tmp;
+ int err;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT;
+
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ if (usin->sin_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ nexthop = daddr = usin->sin_addr.s_addr;
+ if (inet->opt != NULL && inet->opt->srr) {
+ if (daddr == 0)
+ return -EINVAL;
+ nexthop = inet->opt->faddr;
+ }
+
+ tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ inet->sport, usin->sin_port, sk);
+ if (tmp < 0)
+ return tmp;
+
+ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+ ip_rt_put(rt);
+ return -ENETUNREACH;
+ }
+
+ if (inet->opt == NULL || !inet->opt->srr)
+ daddr = rt->rt_dst;
+
+ if (inet->saddr == 0)
+ inet->saddr = rt->rt_src;
+ inet->rcv_saddr = inet->saddr;
+
+ inet->dport = usin->sin_port;
+ inet->daddr = daddr;
+
+ dp->dccps_ext_header_len = 0;
+ if (inet->opt != NULL)
+ dp->dccps_ext_header_len = inet->opt->optlen;
+ /*
+ * Socket identity is still unknown (sport may be zero).
+ * However we set state to DCCP_REQUESTING and not releasing socket
+ * lock select source port, enter ourselves into the hash tables and
+ * complete initialization after this.
+ */
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = dccp_v4_hash_connect(sk);
+ if (err != 0)
+ goto failure;
+
+ err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
+ if (err != 0)
+ goto failure;
+
+ /* OK, now commit destination to socket. */
+ sk_setup_caps(sk, &rt->u.dst);
+
+ dp->dccps_gar =
+ dp->dccps_iss = secure_dccp_sequence_number(inet->saddr,
+ inet->daddr,
+ inet->sport,
+ usin->sin_port);
+ dccp_update_gss(sk, dp->dccps_iss);
+
+ /*
+ * SWL and AWL are initially adjusted so that they are not less than
+ * the initial Sequence Numbers received and sent, respectively:
+ * SWL := max(GSR + 1 - floor(W/4), ISR),
+ * AWL := max(GSS - W' + 1, ISS).
+ * These adjustments MUST be applied only at the beginning of the
+ * connection.
+ */
+ dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
+
+ inet->id = dp->dccps_iss ^ jiffies;
+
+ err = dccp_connect(sk);
+ rt = NULL;
+ if (err != 0)
+ goto failure;
+out:
+ return err;
+failure:
+ /*
+ * This unhashes the socket and releases the local port, if necessary.
+ */
+ dccp_set_state(sk, DCCP_CLOSED);
+ ip_rt_put(rt);
+ sk->sk_route_caps = 0;
+ inet->dport = 0;
+ goto out;
+}
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void dccp_do_pmtu_discovery(struct sock *sk,
+ const struct iphdr *iph,
+ u32 mtu)
+{
+ struct dst_entry *dst;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
+ * send out by Linux are always < 576bytes so they should go through
+ * unfragmented).
+ */
+ if (sk->sk_state == DCCP_LISTEN)
+ return;
+
+ /* We don't check in the destentry if pmtu discovery is forbidden
+ * on this route. We just assume that no packet_to_big packets
+ * are send back when pmtu discovery is not active.
+ * There is a small race when the user changes this flag in the
+ * route, but I think that's acceptable.
+ */
+ if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ return;
+
+ dst->ops->update_pmtu(dst, mtu);
+
+ /* Something is about to be wrong... Remember soft error
+ * for the case, if this connection will not able to recover.
+ */
+ if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ sk->sk_err_soft = EMSGSIZE;
+
+ mtu = dst_mtu(dst);
+
+ if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ dp->dccps_pmtu_cookie > mtu) {
+ dccp_sync_mss(sk, mtu);
+
+ /*
+ * From: draft-ietf-dccp-spec-11.txt
+ *
+ * DCCP-Sync packets are the best choice for upward
+ * probing, since DCCP-Sync probes do not risk application
+ * data loss.
+ */
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+ } /* else let the usual retransmit timer handle it */
+}
+
+static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb)
+{
+ int err;
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_ack_bits);
+ struct sk_buff *skb;
+
+ if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+ return;
+
+ skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+
+ skb->dst = dst_clone(rxskb->dst);
+
+ skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_ack_len);
+
+ /* Build DCCP header and checksum it. */
+ dh->dccph_type = DCCP_PKT_ACK;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_ack_len / 4;
+ dh->dccph_x = 1;
+
+ dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ bh_lock_sock(dccp_ctl_socket->sk);
+ err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
+ rxskb->nh.iph->daddr,
+ rxskb->nh.iph->saddr, NULL);
+ bh_unlock_sock(dccp_ctl_socket->sk);
+
+ if (err == NET_XMIT_CN || err == 0) {
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ }
+}
+
+static void dccp_v4_reqsk_send_ack(struct sk_buff *skb,
+ struct request_sock *req)
+{
+ dccp_v4_ctl_send_ack(skb);
+}
+
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
+ struct dst_entry *dst)
+{
+ int err = -1;
+ struct sk_buff *skb;
+
+ /* First, grab a route. */
+
+ if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
+ goto out;
+
+ skb = dccp_make_response(sk, dst, req);
+ if (skb != NULL) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+
+ err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+ ireq->rmt_addr,
+ ireq->opt);
+ if (err == NET_XMIT_CN)
+ err = 0;
+ }
+
+out:
+ dst_release(dst);
+ return err;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some sort of error
+ * condition. If err < 0 then the socket should be closed and the error
+ * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
+ * After adjustment header points to the first 8 bytes of the tcp header. We
+ * need to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When someone else
+ * accesses the socket the ICMP is just dropped and for some paths there is no
+ * check at all. A more general error queue to queue errors for later handling
+ * is probably better.
+ */
+void dccp_v4_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (struct iphdr *)skb->data;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
+ (iph->ihl << 2));
+ struct dccp_sock *dp;
+ struct inet_sock *inet;
+ const int type = skb->h.icmph->type;
+ const int code = skb->h.icmph->code;
+ struct sock *sk;
+ __u64 seq;
+ int err;
+
+ if (skb->len < (iph->ihl << 2) + 8) {
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ return;
+ }
+
+ sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
+ iph->saddr, dh->dccph_sport, inet_iif(skb));
+ if (sk == NULL) {
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ return;
+ }
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ return;
+ }
+
+ bh_lock_sock(sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ if (sock_owned_by_user(sk))
+ NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
+ dp = dccp_sk(sk);
+ seq = dccp_hdr_seq(skb);
+ if (sk->sk_state != DCCP_LISTEN &&
+ !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
+ NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
+ switch (type) {
+ case ICMP_SOURCE_QUENCH:
+ /* Just silently ignore these. */
+ goto out;
+ case ICMP_PARAMETERPROB:
+ err = EPROTO;
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code > NR_ICMP_UNREACH)
+ goto out;
+
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ if (!sock_owned_by_user(sk))
+ dccp_do_pmtu_discovery(sk, iph, info);
+ goto out;
+ }
+
+ err = icmp_err_convert[code].errno;
+ break;
+ case ICMP_TIME_EXCEEDED:
+ err = EHOSTUNREACH;
+ break;
+ default:
+ goto out;
+ }
+
+ switch (sk->sk_state) {
+ struct request_sock *req , **prev;
+ case DCCP_LISTEN:
+ if (sock_owned_by_user(sk))
+ goto out;
+ req = inet_csk_search_req(sk, &prev, dh->dccph_dport,
+ iph->daddr, iph->saddr);
+ if (!req)
+ goto out;
+
+ /*
+ * ICMPs are not backlogged, hence we cannot get an established
+ * socket here.
+ */
+ BUG_TRAP(!req->sk);
+
+ if (seq != dccp_rsk(req)->dreq_iss) {
+ NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+ /*
+ * Still in RESPOND, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
+ inet_csk_reqsk_queue_drop(sk, req, prev);
+ goto out;
+
+ case DCCP_REQUESTING:
+ case DCCP_RESPOND:
+ if (!sock_owned_by_user(sk)) {
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+
+ dccp_done(sk);
+ } else
+ sk->sk_err_soft = err;
+ goto out;
+ }
+
+ /* If we've already connected we will keep trying
+ * until we time out, or the user gives up.
+ *
+ * rfc1122 4.2.3.9 allows to consider as hard errors
+ * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+ * but it is obsoleted by pmtu discovery).
+ *
+ * Note, that in modern internet, where routing is unreliable
+ * and in each dark corner broken firewalls sit, sending random
+ * errors ordered by their masters even this two messages finally lose
+ * their original sense (even Linux sends invalid PORT_UNREACHs)
+ *
+ * Now we are in compliance with RFCs.
+ * --ANK (980905)
+ */
+
+ inet = inet_sk(sk);
+ if (!sock_owned_by_user(sk) && inet->recverr) {
+ sk->sk_err = err;
+ sk->sk_error_report(sk);
+ } else /* Only an error on timeout */
+ sk->sk_err_soft = err;
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
+{
+ struct sk_buff *skb;
+ /*
+ * FIXME: what if rebuild_header fails?
+ * Should we be doing a rebuild_header here?
+ */
+ int err = inet_sk_rebuild_header(sk);
+
+ if (err != 0)
+ return err;
+
+ skb = dccp_make_reset(sk, sk->sk_dst_cache, code);
+ if (skb != NULL) {
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+
+ err = ip_build_and_send_pkt(skb, sk,
+ inet->saddr, inet->daddr, NULL);
+ if (err == NET_XMIT_CN)
+ err = 0;
+
+ ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+ }
+
+ return err;
+}
+
+static inline u64 dccp_v4_init_sequence(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ return secure_dccp_sequence_number(skb->nh.iph->daddr,
+ skb->nh.iph->saddr,
+ dccp_hdr(skb)->dccph_dport,
+ dccp_hdr(skb)->dccph_sport);
+}
+
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct dccp_sock dp;
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ const __u32 saddr = skb->nh.iph->saddr;
+ const __u32 daddr = skb->nh.iph->daddr;
+ struct dst_entry *dst = NULL;
+
+ /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
+ if (((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ /*
+ * TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ /*
+ * Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = reqsk_alloc(sk->sk_prot->rsk_prot);
+ if (req == NULL)
+ goto drop;
+
+ /* FIXME: process options */
+
+ dccp_openreq_init(req, &dp, skb);
+
+ ireq = inet_rsk(req);
+ ireq->loc_addr = daddr;
+ ireq->rmt_addr = saddr;
+ /* FIXME: Merge Aristeu's option parsing code when ready */
+ req->rcv_wnd = 100; /* Fake, option parsing will get the
+ right value */
+ ireq->opt = NULL;
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * In fact we defer setting S.GSR, S.SWL, S.SWH to
+ * dccp_create_openreq_child.
+ */
+ dreq = dccp_rsk(req);
+ dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+ dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
+ dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service;
+
+ if (dccp_v4_send_response(sk, req, dst))
+ goto drop_and_free;
+
+ inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ /*
+ * FIXME: should be reqsk_free after implementing req->rsk_ops
+ */
+ __reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ return -1;
+}
+
+/*
+ * The three way handshake has completed - we got a valid ACK or DATAACK -
+ * now create the new socket.
+ *
+ * This is the equivalent of TCP's tcp_v4_syn_recv_sock
+ */
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
+ struct dccp_sock *newdp;
+ struct sock *newsk;
+
+ if (sk_acceptq_is_full(sk))
+ goto exit_overflow;
+
+ if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
+ goto exit;
+
+ newsk = dccp_create_openreq_child(sk, req, skb);
+ if (newsk == NULL)
+ goto exit;
+
+ sk_setup_caps(newsk, dst);
+
+ newdp = dccp_sk(newsk);
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
+ newinet->daddr = ireq->rmt_addr;
+ newinet->rcv_saddr = ireq->loc_addr;
+ newinet->saddr = ireq->loc_addr;
+ newinet->opt = ireq->opt;
+ ireq->opt = NULL;
+ newinet->mc_index = inet_iif(skb);
+ newinet->mc_ttl = skb->nh.iph->ttl;
+ newinet->id = jiffies;
+
+ dccp_sync_mss(newsk, dst_mtu(dst));
+
+ __inet_hash(&dccp_hashinfo, newsk, 0);
+ __inet_inherit_port(&dccp_hashinfo, sk, newsk);
+
+ return newsk;
+
+exit_overflow:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+exit:
+ NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ dst_release(dst);
+ return NULL;
+}
+
+static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const struct iphdr *iph = skb->nh.iph;
+ struct sock *nsk;
+ struct request_sock **prev;
+ /* Find possible connection requests. */
+ struct request_sock *req = inet_csk_search_req(sk, &prev,
+ dh->dccph_sport,
+ iph->saddr, iph->daddr);
+ if (req != NULL)
+ return dccp_check_req(sk, skb, req, prev);
+
+ nsk = __inet_lookup_established(&dccp_hashinfo,
+ iph->saddr, dh->dccph_sport,
+ iph->daddr, ntohs(dh->dccph_dport),
+ inet_iif(skb));
+ if (nsk != NULL) {
+ if (nsk->sk_state != DCCP_TIME_WAIT) {
+ bh_lock_sock(nsk);
+ return nsk;
+ }
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
+ return NULL;
+ }
+
+ return sk;
+}
+
+int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
+ const u32 daddr)
+{
+ const struct dccp_hdr* dh = dccp_hdr(skb);
+ int checksum_len;
+ u32 tmp;
+
+ if (dh->dccph_cscov == 0)
+ checksum_len = skb->len;
+ else {
+ checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
+ checksum_len = checksum_len < skb->len ? checksum_len :
+ skb->len;
+ }
+
+ tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
+ return csum_tcpudp_magic(saddr, daddr, checksum_len,
+ IPPROTO_DCCP, tmp);
+}
+
+static int dccp_v4_verify_checksum(struct sk_buff *skb,
+ const u32 saddr, const u32 daddr)
+{
+ struct dccp_hdr *dh = dccp_hdr(skb);
+ int checksum_len;
+ u32 tmp;
+
+ if (dh->dccph_cscov == 0)
+ checksum_len = skb->len;
+ else {
+ checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
+ checksum_len = checksum_len < skb->len ? checksum_len :
+ skb->len;
+ }
+ tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
+ return csum_tcpudp_magic(saddr, daddr, checksum_len,
+ IPPROTO_DCCP, tmp) == 0 ? 0 : -1;
+}
+
+static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct rtable *rt;
+ struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif,
+ .nl_u = { .ip4_u =
+ { .daddr = skb->nh.iph->saddr,
+ .saddr = skb->nh.iph->daddr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = sk->sk_protocol,
+ .uli_u = { .ports =
+ { .sport = dccp_hdr(skb)->dccph_dport,
+ .dport = dccp_hdr(skb)->dccph_sport }
+ }
+ };
+
+ if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+
+ return &rt->u.dst;
+}
+
+static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb)
+{
+ int err;
+ struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+ const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct sk_buff *skb;
+ struct dst_entry *dst;
+ u64 seqno;
+
+ /* Never send a reset in response to a reset. */
+ if (rxdh->dccph_type == DCCP_PKT_RESET)
+ return;
+
+ if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+ return;
+
+ dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb);
+ if (dst == NULL)
+ return;
+
+ skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+ if (skb == NULL)
+ goto out;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+ skb->dst = dst_clone(dst);
+
+ skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_hdr_reset_len);
+
+ /* Build DCCP header and checksum it. */
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_reset_len / 4;
+ dh->dccph_x = 1;
+ dccp_hdr_reset(skb)->dccph_reset_code =
+ DCCP_SKB_CB(rxskb)->dccpd_reset_code;
+
+ /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
+ seqno = 0;
+ if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
+
+ dccp_hdr_set_seq(dh, seqno);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+ DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr,
+ rxskb->nh.iph->daddr);
+
+ bh_lock_sock(dccp_ctl_socket->sk);
+ err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
+ rxskb->nh.iph->daddr,
+ rxskb->nh.iph->saddr, NULL);
+ bh_unlock_sock(dccp_ctl_socket->sk);
+
+ if (err == NET_XMIT_CN || err == 0) {
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+ }
+out:
+ dst_release(dst);
+}
+
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+ if (dccp_rcv_established(sk, skb, dh, skb->len))
+ goto reset;
+ return 0;
+ }
+
+ /*
+ * Step 3: Process LISTEN state
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie
+ * option,
+ * * Must scan the packet's options to check for an Init
+ * Cookie. Only the Init Cookie is processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *
+ * * Generate a new socket and switch to that socket *
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookie
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ * Continue with S.state == RESPOND
+ * * A Response packet will be generated in Step 11 *
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
+ struct sock *nsk = dccp_v4_hnd_req(sk, skb);
+
+ if (nsk == NULL)
+ goto discard;
+
+ if (nsk != sk) {
+ if (dccp_child_process(sk, nsk, skb))
+ goto reset;
+ return 0;
+ }
+ }
+
+ if (dccp_rcv_state_process(sk, skb, dh, skb->len))
+ goto reset;
+ return 0;
+
+reset:
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v4_ctl_send_reset(skb);
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static inline int dccp_invalid_packet(struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh;
+
+ if (skb->pkt_type != PACKET_HOST)
+ return 1;
+
+ if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n");
+ return 1;
+ }
+
+ dh = dccp_hdr(skb);
+
+ /* If the packet type is not understood, drop packet and return */
+ if (dh->dccph_type >= DCCP_PKT_INVALID) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n");
+ return 1;
+ }
+
+ /*
+ * If P.Data Offset is too small for packet type, or too large for
+ * packet, drop packet and return
+ */
+ if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
+ "too small 1\n",
+ dh->dccph_doff);
+ return 1;
+ }
+
+ if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
+ "too small 2\n",
+ dh->dccph_doff);
+ return 1;
+ }
+
+ dh = dccp_hdr(skb);
+
+ /*
+ * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
+ * has short sequence numbers), drop packet and return
+ */
+ if (dh->dccph_x == 0 &&
+ dh->dccph_type != DCCP_PKT_DATA &&
+ dh->dccph_type != DCCP_PKT_ACK &&
+ dh->dccph_type != DCCP_PKT_DATAACK) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack "
+ "nor DataAck and P.X == 0\n",
+ dccp_packet_name(dh->dccph_type));
+ return 1;
+ }
+
+ /* If the header checksum is incorrect, drop packet and return */
+ if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
+ skb->nh.iph->daddr) < 0) {
+ LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
+ "incorrect\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/* this is called when real data arrives */
+int dccp_v4_rcv(struct sk_buff *skb)
+{
+ const struct dccp_hdr *dh;
+ struct sock *sk;
+ int rc;
+
+ /* Step 1: Check header basics: */
+
+ if (dccp_invalid_packet(skb))
+ goto discard_it;
+
+ dh = dccp_hdr(skb);
+#if 0
+ /*
+ * Use something like this to simulate some DATA/DATAACK loss to test
+ * dccp_ackpkts_add, you'll get something like this on a session that
+ * sends 10 DATA/DATAACK packets:
+ *
+ * ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1|
+ *
+ * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet
+ * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets
+ * with the same state
+ * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet
+ *
+ * So...
+ *
+ * 281473596467422 was received
+ * 281473596467421 was not received
+ * 281473596467420 was received
+ * 281473596467419 was not received
+ * 281473596467418 was received
+ * 281473596467417 was not received
+ * 281473596467416 was received
+ * 281473596467415 was not received
+ * 281473596467414 was received
+ * 281473596467413 was received (this one was the 3way handshake
+ * RESPONSE)
+ *
+ */
+ if (dh->dccph_type == DCCP_PKT_DATA ||
+ dh->dccph_type == DCCP_PKT_DATAACK) {
+ static int discard = 0;
+
+ if (discard) {
+ discard = 0;
+ goto discard_it;
+ }
+ discard = 1;
+ }
+#endif
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
+ DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+ dccp_pr_debug("%8.8s "
+ "src=%u.%u.%u.%u@%-5d "
+ "dst=%u.%u.%u.%u@%-5d seq=%llu",
+ dccp_packet_name(dh->dccph_type),
+ NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
+ NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
+ (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+ if (dccp_packet_without_ack(skb)) {
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+ dccp_pr_debug_cat("\n");
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+ dccp_pr_debug_cat(", ack=%llu\n",
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ }
+
+ /* Step 2:
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet_lookup(&dccp_hashinfo,
+ skb->nh.iph->saddr, dh->dccph_sport,
+ skb->nh.iph->daddr, ntohs(dh->dccph_dport),
+ inet_iif(skb));
+
+ /*
+ * Step 2:
+ * If no socket ...
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (sk == NULL) {
+ dccp_pr_debug("failed to look up flow ID in table and "
+ "get corresponding socket\n");
+ goto no_dccp_socket;
+ }
+
+ /*
+ * Step 2:
+ * ... or S.state == TIMEWAIT,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: "
+ "do_time_wait\n");
+ goto do_time_wait;
+ }
+
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ dccp_pr_debug("xfrm4_policy_check failed\n");
+ goto discard_and_relse;
+ }
+
+ if (sk_filter(sk, skb, 0)) {
+ dccp_pr_debug("sk_filter failed\n");
+ goto discard_and_relse;
+ }
+
+ skb->dev = NULL;
+
+ bh_lock_sock(sk);
+ rc = 0;
+ if (!sock_owned_by_user(sk))
+ rc = dccp_v4_do_rcv(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+ return rc;
+
+no_dccp_socket:
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+ goto discard_it;
+ /*
+ * Step 2:
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ */
+ if (dh->dccph_type != DCCP_PKT_RESET) {
+ DCCP_SKB_CB(skb)->dccpd_reset_code =
+ DCCP_RESET_CODE_NO_CONNECTION;
+ dccp_v4_ctl_send_reset(skb);
+ }
+
+discard_it:
+ /* Discard frame. */
+ kfree_skb(skb);
+ return 0;
+
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
+do_time_wait:
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ goto no_dccp_socket;
+}
+
+static int dccp_v4_init_sock(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ static int dccp_ctl_socket_init = 1;
+
+ dccp_options_init(&dp->dccps_options);
+
+ if (dp->dccps_options.dccpo_send_ack_vector) {
+ dp->dccps_hc_rx_ackpkts =
+ dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
+ GFP_KERNEL);
+
+ if (dp->dccps_hc_rx_ackpkts == NULL)
+ return -ENOMEM;
+ }
+
+ /*
+ * FIXME: We're hardcoding the CCID, and doing this at this point makes
+ * the listening (master) sock get CCID control blocks, which is not
+ * necessary, but for now, to not mess with the test userspace apps,
+ * lets leave it here, later the real solution is to do this in a
+ * setsockopt(CCIDs-I-want/accept). -acme
+ */
+ if (likely(!dccp_ctl_socket_init)) {
+ dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
+ sk);
+ dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
+ sk);
+ if (dp->dccps_hc_rx_ccid == NULL ||
+ dp->dccps_hc_tx_ccid == NULL) {
+ ccid_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_exit(dp->dccps_hc_tx_ccid, sk);
+ dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
+ dp->dccps_hc_rx_ackpkts = NULL;
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+ return -ENOMEM;
+ }
+ } else
+ dccp_ctl_socket_init = 0;
+
+ dccp_init_xmit_timers(sk);
+ inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+ sk->sk_state = DCCP_CLOSED;
+ sk->sk_write_space = dccp_write_space;
+ dp->dccps_mss_cache = 536;
+ dp->dccps_role = DCCP_ROLE_UNDEFINED;
+
+ return 0;
+}
+
+static int dccp_v4_destroy_sock(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ /*
+ * DCCP doesn't use sk_qrite_queue, just sk_send_head
+ * for retransmissions
+ */
+ if (sk->sk_send_head != NULL) {
+ kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+ }
+
+ /* Clean up a referenced DCCP bind bucket. */
+ if (inet_csk(sk)->icsk_bind_hash != NULL)
+ inet_put_port(&dccp_hashinfo, sk);
+
+ ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+ dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
+ dp->dccps_hc_rx_ackpkts = NULL;
+ ccid_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_exit(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+
+ return 0;
+}
+
+static void dccp_v4_reqsk_destructor(struct request_sock *req)
+{
+ kfree(inet_rsk(req)->opt);
+}
+
+static struct request_sock_ops dccp_request_sock_ops = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct dccp_request_sock),
+ .rtx_syn_ack = dccp_v4_send_response,
+ .send_ack = dccp_v4_reqsk_send_ack,
+ .destructor = dccp_v4_reqsk_destructor,
+ .send_reset = dccp_v4_ctl_send_reset,
+};
+
+struct proto dccp_v4_prot = {
+ .name = "DCCP",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v4_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v4_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v4_do_rcv,
+ .hash = dccp_v4_hash,
+ .unhash = dccp_v4_unhash,
+ .accept = inet_csk_accept,
+ .get_port = dccp_v4_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_v4_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp_sock),
+ .rsk_prot = &dccp_request_sock_ops,
+ .twsk_obj_size = sizeof(struct inet_timewait_sock),
+};
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
new file mode 100644
index 0000000..ce5dff4
--- /dev/null
+++ b/net/dccp/minisocks.c
@@ -0,0 +1,264 @@
+/*
+ * net/dccp/minisocks.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/inet_timewait_sock.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+struct inet_timewait_death_row dccp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+ .death_lock = SPIN_LOCK_UNLOCKED,
+ .hashinfo = &dccp_hashinfo,
+ .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
+ (unsigned long)&dccp_death_row),
+ .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
+ inet_twdr_twkill_work,
+ &dccp_death_row),
+/* Short-time timewait calendar */
+
+ .twcal_hand = -1,
+ .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+ (unsigned long)&dccp_death_row),
+};
+
+void dccp_time_wait(struct sock *sk, int state, int timeo)
+{
+ struct inet_timewait_sock *tw = NULL;
+
+ if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
+ tw = inet_twsk_alloc(sk, state);
+
+ if (tw != NULL) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+
+ /* Get the TIME_WAIT timeout firing. */
+ if (timeo < rto)
+ timeo = rto;
+
+ tw->tw_timeout = DCCP_TIMEWAIT_LEN;
+ if (state == DCCP_TIME_WAIT)
+ timeo = DCCP_TIMEWAIT_LEN;
+
+ inet_twsk_schedule(tw, &dccp_death_row, timeo,
+ DCCP_TIMEWAIT_LEN);
+ inet_twsk_put(tw);
+ } else {
+ /* Sorry, if we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket "
+ "table overflow\n");
+ }
+
+ dccp_done(sk);
+}
+
+struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * // Generate a new socket and switch to that socket
+ * Set S := new socket for this port pair
+ */
+ struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+
+ if (newsk != NULL) {
+ const struct dccp_request_sock *dreq = dccp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(sk);
+ struct dccp_sock *newdp = dccp_sk(newsk);
+
+ newdp->dccps_hc_rx_ackpkts = NULL;
+ newdp->dccps_role = DCCP_ROLE_SERVER;
+ newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
+
+ if (newdp->dccps_options.dccpo_send_ack_vector) {
+ newdp->dccps_hc_rx_ackpkts =
+ dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
+ GFP_ATOMIC);
+ /*
+ * XXX: We're using the same CCIDs set on the parent,
+ * i.e. sk_clone copied the master sock and left the
+ * CCID pointers for this child, that is why we do the
+ * __ccid_get calls.
+ */
+ if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL))
+ goto out_free;
+ }
+
+ if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid,
+ newsk) != 0 ||
+ ccid_hc_tx_init(newdp->dccps_hc_tx_ccid,
+ newsk) != 0)) {
+ dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts);
+ ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk);
+ ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk);
+out_free:
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ return NULL;
+ }
+
+ __ccid_get(newdp->dccps_hc_rx_ccid);
+ __ccid_get(newdp->dccps_hc_tx_ccid);
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Choose S.ISS (initial seqno) or set from Init Cookie
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
+ * Cookie
+ */
+
+ /* See dccp_v4_conn_request */
+ newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd;
+
+ newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
+ dccp_update_gsr(newsk, dreq->dreq_isr);
+
+ newdp->dccps_iss = dreq->dreq_iss;
+ dccp_update_gss(newsk, dreq->dreq_iss);
+
+ /*
+ * SWL and AWL are initially adjusted so that they are not less than
+ * the initial Sequence Numbers received and sent, respectively:
+ * SWL := max(GSR + 1 - floor(W/4), ISR),
+ * AWL := max(GSS - W' + 1, ISS).
+ * These adjustments MUST be applied only at the beginning of the
+ * connection.
+ */
+ dccp_set_seqno(&newdp->dccps_swl,
+ max48(newdp->dccps_swl, newdp->dccps_isr));
+ dccp_set_seqno(&newdp->dccps_awl,
+ max48(newdp->dccps_awl, newdp->dccps_iss));
+
+ dccp_init_xmit_timers(newsk);
+
+ DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
+ }
+ return newsk;
+}
+
+/*
+ * Process an incoming packet for RESPOND sockets represented
+ * as an request_sock.
+ */
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev)
+{
+ struct sock *child = NULL;
+
+ /* Check for retransmitted REQUEST */
+ if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
+ if (after48(DCCP_SKB_CB(skb)->dccpd_seq,
+ dccp_rsk(req)->dreq_isr)) {
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+
+ dccp_pr_debug("Retransmitted REQUEST\n");
+ /* Send another RESPONSE packet */
+ dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1);
+ dccp_set_seqno(&dreq->dreq_isr,
+ DCCP_SKB_CB(skb)->dccpd_seq);
+ req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ }
+ /* Network Duplicate, discard packet */
+ return NULL;
+ }
+
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+
+ if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
+ dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
+ goto drop;
+
+ /* Invalid ACK */
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
+ dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
+ "dreq_iss=%llu\n",
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ (unsigned long long)
+ dccp_rsk(req)->dreq_iss);
+ goto drop;
+ }
+
+ child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
+ if (child == NULL)
+ goto listen_overflow;
+
+ /* FIXME: deal with options */
+
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
+ inet_csk_reqsk_queue_add(sk, req, child);
+out:
+ return child;
+listen_overflow:
+ dccp_pr_debug("listen_overflow!\n");
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+drop:
+ if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
+ req->rsk_ops->send_reset(skb);
+
+ inet_csk_reqsk_queue_drop(sk, req, prev);
+ goto out;
+}
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket.
+ */
+int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+{
+ int ret = 0;
+ const int state = child->sk_state;
+
+ if (!sock_owned_by_user(child)) {
+ ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
+ skb->len);
+
+ /* Wakeup parent, send SIGIO */
+ if (state == DCCP_RESPOND && child->sk_state != state)
+ parent->sk_data_ready(parent, 0);
+ } else {
+ /* Alas, it is possible again, because we do lookup
+ * in main socket hash table and lock on listening
+ * socket does not protect us more.
+ */
+ sk_add_backlog(child, skb);
+ }
+
+ bh_unlock_sock(child);
+ sock_put(child);
+ return ret;
+}
diff --git a/net/dccp/options.c b/net/dccp/options.c
new file mode 100644
index 0000000..382c589
--- /dev/null
+++ b/net/dccp/options.c
@@ -0,0 +1,855 @@
+/*
+ * net/dccp/options.c
+ *
+ * An implementation of the DCCP protocol
+ * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
+ struct sock *sk,
+ const u64 ackno,
+ const unsigned char len,
+ const unsigned char *vector);
+
+/* stores the default values for new connection. may be changed with sysctl */
+static const struct dccp_options dccpo_default_values = {
+ .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW,
+ .dccpo_ccid = DCCPF_INITIAL_CCID,
+ .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR,
+ .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT,
+};
+
+void dccp_options_init(struct dccp_options *dccpo)
+{
+ memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo));
+}
+
+static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
+{
+ u32 value = 0;
+
+ if (len > 3)
+ value += *bf++ << 24;
+ if (len > 2)
+ value += *bf++ << 16;
+ if (len > 1)
+ value += *bf++ << 8;
+ if (len > 0)
+ value += *bf;
+
+ return value;
+}
+
+int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+ const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT rx opt: " : "server rx opt: ";
+#endif
+ const struct dccp_hdr *dh = dccp_hdr(skb);
+ const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+ unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+ unsigned char *opt_ptr = options;
+ const unsigned char *opt_end = (unsigned char *)dh +
+ (dh->dccph_doff * 4);
+ struct dccp_options_received *opt_recv = &dp->dccps_options_received;
+ unsigned char opt, len;
+ unsigned char *value;
+
+ memset(opt_recv, 0, sizeof(*opt_recv));
+
+ while (opt_ptr != opt_end) {
+ opt = *opt_ptr++;
+ len = 0;
+ value = NULL;
+
+ /* Check if this isn't a single byte option */
+ if (opt > DCCPO_MAX_RESERVED) {
+ if (opt_ptr == opt_end)
+ goto out_invalid_option;
+
+ len = *opt_ptr++;
+ if (len < 3)
+ goto out_invalid_option;
+ /*
+ * Remove the type and len fields, leaving
+ * just the value size
+ */
+ len -= 2;
+ value = opt_ptr;
+ opt_ptr += len;
+
+ if (opt_ptr > opt_end)
+ goto out_invalid_option;
+ }
+
+ switch (opt) {
+ case DCCPO_PADDING:
+ break;
+ case DCCPO_NDP_COUNT:
+ if (len > 3)
+ goto out_invalid_option;
+
+ opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
+ dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
+ opt_recv->dccpor_ndp);
+ break;
+ case DCCPO_ACK_VECTOR_0:
+ if (len > DCCP_MAX_ACK_VECTOR_LEN)
+ goto out_invalid_option;
+
+ if (pkt_type == DCCP_PKT_DATA)
+ continue;
+
+ opt_recv->dccpor_ack_vector_len = len;
+ opt_recv->dccpor_ack_vector_idx = value - options;
+
+ dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n",
+ debug_prefix, len,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ value, len);
+ dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts,
+ sk,
+ DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ len, value);
+ break;
+ case DCCPO_TIMESTAMP:
+ if (len != 4)
+ goto out_invalid_option;
+
+ opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);
+
+ dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
+ do_gettimeofday(&dp->dccps_timestamp_time);
+
+ dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
+ debug_prefix, opt_recv->dccpor_timestamp,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ break;
+ case DCCPO_TIMESTAMP_ECHO:
+ if (len != 4 && len != 6 && len != 8)
+ goto out_invalid_option;
+
+ opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value);
+
+ dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
+ debug_prefix,
+ opt_recv->dccpor_timestamp_echo,
+ len + 2,
+ (unsigned long long)
+ DCCP_SKB_CB(skb)->dccpd_ack_seq);
+
+ if (len > 4) {
+ if (len == 6)
+ opt_recv->dccpor_elapsed_time =
+ ntohs(*(u16 *)(value + 4));
+ else
+ opt_recv->dccpor_elapsed_time =
+ ntohl(*(u32 *)(value + 4));
+
+ dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n",
+ debug_prefix,
+ opt_recv->dccpor_elapsed_time);
+ }
+ break;
+ case DCCPO_ELAPSED_TIME:
+ if (len != 2 && len != 4)
+ goto out_invalid_option;
+
+ if (pkt_type == DCCP_PKT_DATA)
+ continue;
+
+ if (len == 2)
+ opt_recv->dccpor_elapsed_time =
+ ntohs(*(u16 *)value);
+ else
+ opt_recv->dccpor_elapsed_time =
+ ntohl(*(u32 *)value);
+
+ dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
+ opt_recv->dccpor_elapsed_time);
+ break;
+ /*
+ * From draft-ietf-dccp-spec-11.txt:
+ *
+ * Option numbers 128 through 191 are for
+ * options sent from the HC-Sender to the
+ * HC-Receiver; option numbers 192 through 255
+ * are for options sent from the HC-Receiver to
+ * the HC-Sender.
+ */
+ case 128 ... 191: {
+ const u16 idx = value - options;
+
+ if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
+ opt, len, idx,
+ value) != 0)
+ goto out_invalid_option;
+ }
+ break;
+ case 192 ... 255: {
+ const u16 idx = value - options;
+
+ if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
+ opt, len, idx,
+ value) != 0)
+ goto out_invalid_option;
+ }
+ break;
+ default:
+ pr_info("DCCP(%p): option %d(len=%d) not "
+ "implemented, ignoring\n",
+ sk, opt, len);
+ break;
+ }
+ }
+
+ return 0;
+
+out_invalid_option:
+ DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
+ pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len);
+ return -1;
+}
+
+static void dccp_encode_value_var(const u32 value, unsigned char *to,
+ const unsigned int len)
+{
+ if (len > 3)
+ *to++ = (value & 0xFF000000) >> 24;
+ if (len > 2)
+ *to++ = (value & 0xFF0000) >> 16;
+ if (len > 1)
+ *to++ = (value & 0xFF00) >> 8;
+ if (len > 0)
+ *to++ = (value & 0xFF);
+}
+
+static inline int dccp_ndp_len(const int ndp)
+{
+ return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3;
+}
+
+void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
+ const unsigned char option,
+ const void *value, const unsigned char len)
+{
+ unsigned char *to;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) {
+ LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
+ "%d option!\n", option);
+ return;
+ }
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
+
+ to = skb_push(skb, len + 2);
+ *to++ = option;
+ *to++ = len + 2;
+
+ memcpy(to, value, len);
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option);
+
+static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ int ndp = dp->dccps_ndp_count;
+
+ if (dccp_non_data_packet(skb))
+ ++dp->dccps_ndp_count;
+ else
+ dp->dccps_ndp_count = 0;
+
+ if (ndp > 0) {
+ unsigned char *ptr;
+ const int ndp_len = dccp_ndp_len(ndp);
+ const int len = ndp_len + 2;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+ return;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ ptr = skb_push(skb, len);
+ *ptr++ = DCCPO_NDP_COUNT;
+ *ptr++ = len;
+ dccp_encode_value_var(ndp, ptr, ndp_len);
+ }
+}
+
+static inline int dccp_elapsed_time_len(const u32 elapsed_time)
+{
+ return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
+}
+
+void dccp_insert_option_elapsed_time(struct sock *sk,
+ struct sk_buff *skb,
+ u32 elapsed_time)
+{
+#ifdef CONFIG_IP_DCCP_DEBUG
+ struct dccp_sock *dp = dccp_sk(sk);
+ const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT TX opt: " : "server TX opt: ";
+#endif
+ const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+ const int len = 2 + elapsed_time_len;
+ unsigned char *to;
+
+ if (elapsed_time_len == 0)
+ return;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
+ "insert elapsed time!\n");
+ return;
+ }
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ *to++ = DCCPO_ELAPSED_TIME;
+ *to++ = len;
+
+ if (elapsed_time_len == 2) {
+ const u16 var16 = htons((u16)elapsed_time);
+ memcpy(to, &var16, 2);
+ } else {
+ const u32 var32 = htonl(elapsed_time);
+ memcpy(to, &var32, 4);
+ }
+
+ dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n",
+ debug_prefix, elapsed_time,
+ len,
+ (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
+
+static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+ const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT TX opt: " : "server TX opt: ";
+#endif
+ struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
+ int len = ap->dccpap_buf_vector_len + 2;
+ const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10;
+ unsigned char *to, *from;
+
+ if (elapsed_time != 0)
+ dccp_insert_option_elapsed_time(sk, skb, elapsed_time);
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
+ "insert ACK Vector!\n");
+ return;
+ }
+
+ /*
+ * XXX: now we have just one ack vector sent record, so
+ * we have to wait for it to be cleared.
+ *
+ * Of course this is not acceptable, but this is just for
+ * basic testing now.
+ */
+ if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1)
+ return;
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ *to++ = DCCPO_ACK_VECTOR_0;
+ *to++ = len;
+
+ len = ap->dccpap_buf_vector_len;
+ from = ap->dccpap_buf + ap->dccpap_buf_head;
+
+ /* Check if buf_head wraps */
+ if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) {
+ const unsigned int tailsize = (ap->dccpap_buf_len -
+ ap->dccpap_buf_head);
+
+ memcpy(to, from, tailsize);
+ to += tailsize;
+ len -= tailsize;
+ from = ap->dccpap_buf;
+ }
+
+ memcpy(to, from, len);
+ /*
+ * From draft-ietf-dccp-spec-11.txt:
+ *
+ * For each acknowledgement it sends, the HC-Receiver will add an
+ * acknowledgement record. ack_seqno will equal the HC-Receiver
+ * sequence number it used for the ack packet; ack_ptr will equal
+ * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
+ * equal buf_nonce.
+ *
+ * This implemention uses just one ack record for now.
+ */
+ ap->dccpap_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ ap->dccpap_ack_ptr = ap->dccpap_buf_head;
+ ap->dccpap_ack_ackno = ap->dccpap_buf_ackno;
+ ap->dccpap_ack_nonce = ap->dccpap_buf_nonce;
+ ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len;
+
+ dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, "
+ "ack_ackno=%llu\n",
+ debug_prefix, ap->dccpap_ack_vector_len,
+ (unsigned long long) ap->dccpap_ack_seqno,
+ (unsigned long long) ap->dccpap_ack_ackno);
+}
+
+void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+ struct timeval tv;
+ u32 now;
+
+ do_gettimeofday(&tv);
+ now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10;
+ /* yes this will overflow but that is the point as we want a
+ * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
+
+ now = htonl(now);
+ dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
+
+static void dccp_insert_option_timestamp_echo(struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+ const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT TX opt: " : "server TX opt: ";
+#endif
+ u32 tstamp_echo;
+ const u32 elapsed_time =
+ timeval_now_delta(&dp->dccps_timestamp_time) / 10;
+ const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+ const int len = 6 + elapsed_time_len;
+ unsigned char *to;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
+ "timestamp echo!\n");
+ return;
+ }
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ *to++ = DCCPO_TIMESTAMP_ECHO;
+ *to++ = len;
+
+ tstamp_echo = htonl(dp->dccps_timestamp_echo);
+ memcpy(to, &tstamp_echo, 4);
+ to += 4;
+
+ if (elapsed_time_len == 2) {
+ const u16 var16 = htons((u16)elapsed_time);
+ memcpy(to, &var16, 2);
+ } else if (elapsed_time_len == 4) {
+ const u32 var32 = htonl(elapsed_time);
+ memcpy(to, &var32, 4);
+ }
+
+ dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n",
+ debug_prefix, dp->dccps_timestamp_echo,
+ len,
+ (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+ dp->dccps_timestamp_echo = 0;
+ dp->dccps_timestamp_time.tv_sec = 0;
+ dp->dccps_timestamp_time.tv_usec = 0;
+}
+
+void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+
+ if (dp->dccps_options.dccpo_send_ndp_count)
+ dccp_insert_option_ndp(sk, skb);
+
+ if (!dccp_packet_without_ack(skb)) {
+ if (dp->dccps_options.dccpo_send_ack_vector &&
+ (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno !=
+ DCCP_MAX_SEQNO + 1))
+ dccp_insert_option_ack_vector(sk, skb);
+
+ if (dp->dccps_timestamp_echo != 0)
+ dccp_insert_option_timestamp_echo(sk, skb);
+ }
+
+ ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
+ ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
+
+ /* XXX: insert other options when appropriate */
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
+ /* The length of all options has to be a multiple of 4 */
+ int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
+
+ if (padding != 0) {
+ padding = 4 - padding;
+ memset(skb_push(skb, padding), 0, padding);
+ DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
+ }
+ }
+}
+
+struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len,
+ const unsigned int __nocast priority)
+{
+ struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority);
+
+ if (ap != NULL) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+ memset(ap->dccpap_buf, 0xFF, len);
+#endif
+ ap->dccpap_buf_len = len;
+ ap->dccpap_buf_head =
+ ap->dccpap_buf_tail =
+ ap->dccpap_buf_len - 1;
+ ap->dccpap_buf_ackno =
+ ap->dccpap_ack_ackno =
+ ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+ ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0;
+ ap->dccpap_ack_ptr = 0;
+ ap->dccpap_time.tv_sec = 0;
+ ap->dccpap_time.tv_usec = 0;
+ ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0;
+ }
+
+ return ap;
+}
+
+void dccp_ackpkts_free(struct dccp_ackpkts *ap)
+{
+ if (ap != NULL) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+ memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len);
+#endif
+ kfree(ap);
+ }
+}
+
+static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap,
+ const unsigned int index)
+{
+ return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK;
+}
+
+static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap,
+ const unsigned int index)
+{
+ return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK;
+}
+
+/*
+ * If several packets are missing, the HC-Receiver may prefer to enter multiple
+ * bytes with run length 0, rather than a single byte with a larger run length;
+ * this simplifies table updates if one of the missing packets arrives.
+ */
+static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap,
+ const unsigned int packets,
+ const unsigned char state)
+{
+ unsigned int gap;
+ signed long new_head;
+
+ if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len)
+ return -ENOBUFS;
+
+ gap = packets - 1;
+ new_head = ap->dccpap_buf_head - packets;
+
+ if (new_head < 0) {
+ if (gap > 0) {
+ memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED,
+ gap + new_head + 1);
+ gap = -new_head;
+ }
+ new_head += ap->dccpap_buf_len;
+ }
+
+ ap->dccpap_buf_head = new_head;
+
+ if (gap > 0)
+ memset(ap->dccpap_buf + ap->dccpap_buf_head + 1,
+ DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap);
+
+ ap->dccpap_buf[ap->dccpap_buf_head] = state;
+ ap->dccpap_buf_vector_len += packets;
+ return 0;
+}
+
+/*
+ * Implements the draft-ietf-dccp-spec-11.txt Appendix A
+ */
+int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state)
+{
+ /*
+ * Check at the right places if the buffer is full, if it is, tell the
+ * caller to start dropping packets till the HC-Sender acks our ACK
+ * vectors, when we will free up space in dccpap_buf.
+ *
+ * We may well decide to do buffer compression, etc, but for now lets
+ * just drop.
+ *
+ * From Appendix A:
+ *
+ * Of course, the circular buffer may overflow, either when the
+ * HC-Sender is sending data at a very high rate, when the
+ * HC-Receiver's acknowledgements are not reaching the HC-Sender,
+ * or when the HC-Sender is forgetting to acknowledge those acks
+ * (so the HC-Receiver is unable to clean up old state). In this
+ * case, the HC-Receiver should either compress the buffer (by
+ * increasing run lengths when possible), transfer its state to
+ * a larger buffer, or, as a last resort, drop all received
+ * packets, without processing them whatsoever, until its buffer
+ * shrinks again.
+ */
+
+ /* See if this is the first ackno being inserted */
+ if (ap->dccpap_buf_vector_len == 0) {
+ ap->dccpap_buf[ap->dccpap_buf_head] = state;
+ ap->dccpap_buf_vector_len = 1;
+ } else if (after48(ackno, ap->dccpap_buf_ackno)) {
+ const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno,
+ ackno);
+
+ /*
+ * Look if the state of this packet is the same as the
+ * previous ackno and if so if we can bump the head len.
+ */
+ if (delta == 1 &&
+ dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state &&
+ (dccp_ackpkts_len(ap, ap->dccpap_buf_head) <
+ DCCP_ACKPKTS_LEN_MASK))
+ ap->dccpap_buf[ap->dccpap_buf_head]++;
+ else if (dccp_ackpkts_set_buf_head_state(ap, delta, state))
+ return -ENOBUFS;
+ } else {
+ /*
+ * A.1.2. Old Packets
+ *
+ * When a packet with Sequence Number S arrives, and
+ * S <= buf_ackno, the HC-Receiver will scan the table
+ * for the byte corresponding to S. (Indexing structures
+ * could reduce the complexity of this scan.)
+ */
+ u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno);
+ unsigned int index = ap->dccpap_buf_head;
+
+ while (1) {
+ const u8 len = dccp_ackpkts_len(ap, index);
+ const u8 state = dccp_ackpkts_state(ap, index);
+ /*
+ * valid packets not yet in dccpap_buf have a reserved
+ * entry, with a len equal to 0.
+ */
+ if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED &&
+ len == 0 && delta == 0) { /* Found our
+ reserved seat! */
+ dccp_pr_debug("Found %llu reserved seat!\n",
+ (unsigned long long) ackno);
+ ap->dccpap_buf[index] = state;
+ goto out;
+ }
+ /* len == 0 means one packet */
+ if (delta < len + 1)
+ goto out_duplicate;
+
+ delta -= len + 1;
+ if (++index == ap->dccpap_buf_len)
+ index = 0;
+ }
+ }
+
+ ap->dccpap_buf_ackno = ackno;
+ do_gettimeofday(&ap->dccpap_time);
+out:
+ dccp_pr_debug("");
+ dccp_ackpkts_print(ap);
+ return 0;
+
+out_duplicate:
+ /* Duplicate packet */
+ dccp_pr_debug("Received a dup or already considered lost "
+ "packet: %llu\n", (unsigned long long) ackno);
+ return -EILSEQ;
+}
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+void dccp_ackvector_print(const u64 ackno, const unsigned char *vector,
+ int len)
+{
+ if (!dccp_debug)
+ return;
+
+ printk("ACK vector len=%d, ackno=%llu |", len,
+ (unsigned long long) ackno);
+
+ while (len--) {
+ const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6;
+ const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
+
+ printk("%d,%d|", state, rl);
+ ++vector;
+ }
+
+ printk("\n");
+}
+
+void dccp_ackpkts_print(const struct dccp_ackpkts *ap)
+{
+ dccp_ackvector_print(ap->dccpap_buf_ackno,
+ ap->dccpap_buf + ap->dccpap_buf_head,
+ ap->dccpap_buf_vector_len);
+}
+#endif
+
+static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap)
+{
+ /*
+ * As we're keeping track of the ack vector size
+ * (dccpap_buf_vector_len) and the sent ack vector size
+ * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but
+ * keep this code here as in the future we'll implement a vector of
+ * ack records, as suggested in draft-ietf-dccp-spec-11.txt
+ * Appendix A. -acme
+ */
+#if 0
+ ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1;
+ if (ap->dccpap_buf_tail >= ap->dccpap_buf_len)
+ ap->dccpap_buf_tail -= ap->dccpap_buf_len;
+#endif
+ ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len;
+}
+
+void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk,
+ u64 ackno)
+{
+ /* Check if we actually sent an ACK vector */
+ if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
+ return;
+
+ if (ackno == ap->dccpap_ack_seqno) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+ struct dccp_sock *dp = dccp_sk(sk);
+ const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT rx ack: " : "server rx ack: ";
+#endif
+ dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, "
+ "ack_ackno=%llu, ACKED!\n",
+ debug_prefix, 1,
+ (unsigned long long) ap->dccpap_ack_seqno,
+ (unsigned long long) ap->dccpap_ack_ackno);
+ dccp_ackpkts_trow_away_ack_record(ap);
+ ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+ }
+}
+
+static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
+ struct sock *sk, u64 ackno,
+ const unsigned char len,
+ const unsigned char *vector)
+{
+ unsigned char i;
+
+ /* Check if we actually sent an ACK vector */
+ if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
+ return;
+ /*
+ * We're in the receiver half connection, so if the received an ACK
+ * vector ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're
+ * not interested.
+ *
+ * Extra explanation with example:
+ *
+ * if we received an ACK vector with ackno 50, it can only be acking
+ * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent).
+ */
+ /* dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); */
+ if (before48(ackno, ap->dccpap_ack_seqno)) {
+ /* dccp_pr_debug_cat("yes\n"); */
+ return;
+ }
+ /* dccp_pr_debug_cat("no\n"); */
+
+ i = len;
+ while (i--) {
+ const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
+ u64 ackno_end_rl;
+
+ dccp_set_seqno(&ackno_end_rl, ackno - rl);
+
+ /*
+ * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl,
+ * ap->dccpap_ack_seqno, ackno);
+ */
+ if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) {
+ const u8 state = (*vector &
+ DCCP_ACKPKTS_STATE_MASK) >> 6;
+ /* dccp_pr_debug_cat("yes\n"); */
+
+ if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+ struct dccp_sock *dp = dccp_sk(sk);
+ const char *debug_prefix =
+ dp->dccps_role == DCCP_ROLE_CLIENT ?
+ "CLIENT rx ack: " : "server rx ack: ";
+#endif
+ dccp_pr_debug("%sACK vector 0, len=%d, "
+ "ack_seqno=%llu, ack_ackno=%llu, "
+ "ACKED!\n",
+ debug_prefix, len,
+ (unsigned long long)
+ ap->dccpap_ack_seqno,
+ (unsigned long long)
+ ap->dccpap_ack_ackno);
+ dccp_ackpkts_trow_away_ack_record(ap);
+ }
+ /*
+ * If dccpap_ack_seqno was not received, no problem
+ * we'll send another ACK vector.
+ */
+ ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+ break;
+ }
+ /* dccp_pr_debug_cat("no\n"); */
+
+ dccp_set_seqno(&ackno, ackno_end_rl - 1);
+ ++vector;
+ }
+}
diff --git a/net/dccp/output.c b/net/dccp/output.c
new file mode 100644
index 0000000..28de157
--- /dev/null
+++ b/net/dccp/output.c
@@ -0,0 +1,528 @@
+/*
+ * net/dccp/output.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static inline void dccp_event_ack_sent(struct sock *sk)
+{
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+/*
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the DCCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ */
+int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (likely(skb != NULL)) {
+ const struct inet_sock *inet = inet_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ struct dccp_hdr *dh;
+ /* XXX For now we're using only 48 bits sequence numbers */
+ const int dccp_header_size = sizeof(*dh) +
+ sizeof(struct dccp_hdr_ext) +
+ dccp_packet_hdr_len(dcb->dccpd_type);
+ int err, set_ack = 1;
+ u64 ackno = dp->dccps_gsr;
+
+ dccp_inc_seqno(&dp->dccps_gss);
+
+ switch (dcb->dccpd_type) {
+ case DCCP_PKT_DATA:
+ set_ack = 0;
+ break;
+ case DCCP_PKT_SYNC:
+ case DCCP_PKT_SYNCACK:
+ ackno = dcb->dccpd_seq;
+ break;
+ }
+
+ dcb->dccpd_seq = dp->dccps_gss;
+ dccp_insert_options(sk, skb);
+
+ skb->h.raw = skb_push(skb, dccp_header_size);
+ dh = dccp_hdr(skb);
+ /*
+ * Data packets are not cloned as they are never retransmitted
+ */
+ if (skb_cloned(skb))
+ skb_set_owner_w(skb, sk);
+
+ /* Build DCCP header and checksum it. */
+ memset(dh, 0, dccp_header_size);
+ dh->dccph_type = dcb->dccpd_type;
+ dh->dccph_sport = inet->sport;
+ dh->dccph_dport = inet->dport;
+ dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
+ dh->dccph_ccval = dcb->dccpd_ccval;
+ /* XXX For now we're using only 48 bits sequence numbers */
+ dh->dccph_x = 1;
+
+ dp->dccps_awh = dp->dccps_gss;
+ dccp_hdr_set_seq(dh, dp->dccps_gss);
+ if (set_ack)
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
+
+ switch (dcb->dccpd_type) {
+ case DCCP_PKT_REQUEST:
+ dccp_hdr_request(skb)->dccph_req_service =
+ dcb->dccpd_service;
+ break;
+ case DCCP_PKT_RESET:
+ dccp_hdr_reset(skb)->dccph_reset_code =
+ dcb->dccpd_reset_code;
+ break;
+ }
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
+ inet->daddr);
+
+ if (set_ack)
+ dccp_event_ack_sent(sk);
+
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+
+ err = ip_queue_xmit(skb, 0);
+ if (err <= 0)
+ return err;
+
+ /* NET_XMIT_CN is special. It does not guarantee,
+ * that this packet is lost. It tells that device
+ * is about to start to drop packets or already
+ * drops some packets of the same priority and
+ * invokes us to send less aggressively.
+ */
+ return err == NET_XMIT_CN ? 0 : err;
+ }
+ return -ENOBUFS;
+}
+
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ int mss_now;
+
+ /*
+ * FIXME: we really should be using the af_specific thing to support
+ * IPv6.
+ * mss_now = pmtu - tp->af_specific->net_header_len -
+ * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
+ */
+ mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
+ sizeof(struct dccp_hdr_ext);
+
+ /* Now subtract optional transport overhead */
+ mss_now -= dp->dccps_ext_header_len;
+
+ /*
+ * FIXME: this should come from the CCID infrastructure, where, say,
+ * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
+ * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
+ * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
+ * make it a multiple of 4
+ */
+
+ mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
+
+ /* And store cached results */
+ dp->dccps_pmtu_cookie = pmtu;
+ dp->dccps_mss_cache = mss_now;
+
+ return mss_now;
+}
+
+void dccp_write_space(struct sock *sk)
+{
+ read_lock(&sk->sk_callback_lock);
+
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible(sk->sk_sleep);
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sk_wake_async(sk, 2, POLL_OUT);
+
+ read_unlock(&sk->sk_callback_lock);
+}
+
+/**
+ * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * @sk: socket to wait for
+ * @timeo: for how long
+ */
+static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
+ long *timeo)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ DEFINE_WAIT(wait);
+ long delay;
+ int rc;
+
+ while (1) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+ goto do_error;
+ if (!*timeo)
+ goto do_nonblock;
+ if (signal_pending(current))
+ goto do_interrupted;
+
+ rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
+ skb->len);
+ if (rc <= 0)
+ break;
+ delay = msecs_to_jiffies(rc);
+ if (delay > *timeo || delay < 0)
+ goto do_nonblock;
+
+ sk->sk_write_pending++;
+ release_sock(sk);
+ *timeo -= schedule_timeout(delay);
+ lock_sock(sk);
+ sk->sk_write_pending--;
+ }
+out:
+ finish_wait(sk->sk_sleep, &wait);
+ return rc;
+
+do_error:
+ rc = -EPIPE;
+ goto out;
+do_nonblock:
+ rc = -EAGAIN;
+ goto out;
+do_interrupted:
+ rc = sock_intr_errno(*timeo);
+ goto out;
+}
+
+int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
+ skb->len);
+
+ if (err > 0)
+ err = dccp_wait_for_ccid(sk, skb, timeo);
+
+ if (err == 0) {
+ const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const int len = skb->len;
+
+ if (sk->sk_state == DCCP_PARTOPEN) {
+ /* See 8.1.5. Handshake Completion */
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+ dcb->dccpd_type = DCCP_PKT_DATAACK;
+ /*
+ * FIXME: we really should have a
+ * dccps_ack_pending or use icsk.
+ */
+ } else if (inet_csk_ack_scheduled(sk) ||
+ dp->dccps_timestamp_echo != 0 ||
+ (dp->dccps_options.dccpo_send_ack_vector &&
+ ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 &&
+ ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1))
+ dcb->dccpd_type = DCCP_PKT_DATAACK;
+ else
+ dcb->dccpd_type = DCCP_PKT_DATA;
+
+ err = dccp_transmit_skb(sk, skb);
+ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
+ }
+
+ return err;
+}
+
+int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (inet_sk_rebuild_header(sk) != 0)
+ return -EHOSTUNREACH; /* Routing failure or similar. */
+
+ return dccp_transmit_skb(sk, (skb_cloned(skb) ?
+ pskb_copy(skb, GFP_ATOMIC):
+ skb_clone(skb, GFP_ATOMIC)));
+}
+
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req)
+{
+ struct dccp_hdr *dh;
+ const int dccp_header_size = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_response);
+ struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
+ dccp_header_size, 1,
+ GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+
+ skb->dst = dst_clone(dst);
+ skb->csum = 0;
+
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_rsk(req)->dreq_iss;
+ dccp_insert_options(sk, skb);
+
+ skb->h.raw = skb_push(skb, dccp_header_size);
+
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_header_size);
+
+ dh->dccph_sport = inet_sk(sk)->sport;
+ dh->dccph_dport = inet_rsk(req)->rmt_port;
+ dh->dccph_doff = (dccp_header_size +
+ DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+ dh->dccph_type = DCCP_PKT_RESPONSE;
+ dh->dccph_x = 1;
+ dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr);
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr,
+ inet_rsk(req)->rmt_addr);
+
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ return skb;
+}
+
+struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
+ const enum dccp_reset_codes code)
+
+{
+ struct dccp_hdr *dh;
+ struct dccp_sock *dp = dccp_sk(sk);
+ const int dccp_header_size = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
+ dccp_header_size, 1,
+ GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+
+ skb->dst = dst_clone(dst);
+ skb->csum = 0;
+
+ dccp_inc_seqno(&dp->dccps_gss);
+
+ DCCP_SKB_CB(skb)->dccpd_reset_code = code;
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
+ DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss;
+ dccp_insert_options(sk, skb);
+
+ skb->h.raw = skb_push(skb, dccp_header_size);
+
+ dh = dccp_hdr(skb);
+ memset(dh, 0, dccp_header_size);
+
+ dh->dccph_sport = inet_sk(sk)->sport;
+ dh->dccph_dport = inet_sk(sk)->dport;
+ dh->dccph_doff = (dccp_header_size +
+ DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_x = 1;
+ dccp_hdr_set_seq(dh, dp->dccps_gss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);
+
+ dccp_hdr_reset(skb)->dccph_reset_code = code;
+
+ dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr,
+ inet_sk(sk)->daddr);
+
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ return skb;
+}
+
+/*
+ * Do all connect socket setups that can be done AF independent.
+ */
+static inline void dccp_connect_init(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk->sk_err = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ dccp_sync_mss(sk, dst_mtu(dst));
+
+ /*
+ * FIXME: set dp->{dccps_swh,dccps_swl}, with
+ * something like dccp_inc_seq
+ */
+
+ icsk->icsk_retransmits = 0;
+}
+
+int dccp_connect(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ dccp_connect_init(sk);
+
+ skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation);
+ if (unlikely(skb == NULL))
+ return -ENOBUFS;
+
+ /* Reserve space for headers. */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
+ /* FIXME: set service to something meaningful, coming
+ * from userspace*/
+ DCCP_SKB_CB(skb)->dccpd_service = 0;
+ skb->csum = 0;
+ skb_set_owner_w(skb, sk);
+
+ BUG_TRAP(sk->sk_send_head == NULL);
+ sk->sk_send_head = skb;
+ dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+ DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
+
+ /* Timer for repeating the REQUEST until an answer. */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ icsk->icsk_rto, DCCP_RTO_MAX);
+ return 0;
+}
+
+void dccp_send_ack(struct sock *sk)
+{
+ /* If we have been reset, we may not send again. */
+ if (sk->sk_state != DCCP_CLOSED) {
+ struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+
+ if (skb == NULL) {
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX,
+ DCCP_RTO_MAX);
+ return;
+ }
+
+ /* Reserve space for headers */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+ skb->csum = 0;
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
+ skb_set_owner_w(skb, sk);
+ dccp_transmit_skb(sk, skb);
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_ack);
+
+void dccp_send_delayed_ack(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ /*
+ * FIXME: tune this timer. elapsed time fixes the skew, so no problem
+ * with using 2s, and active senders also piggyback the ACK into a
+ * DATAACK packet, so this is really for quiescent senders.
+ */
+ unsigned long timeout = jiffies + 2 * HZ;
+
+ /* Use new timeout only if there wasn't a older one earlier. */
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+ /* If delack timer was blocked or is about to expire,
+ * send ACK now.
+ *
+ * FIXME: check the "about to expire" part
+ */
+ if (icsk->icsk_ack.blocked) {
+ dccp_send_ack(sk);
+ return;
+ }
+
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
+ }
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type)
+{
+ /*
+ * We are not putting this on the write queue, so
+ * dccp_transmit_skb() will set the ownership to this
+ * sock.
+ */
+ struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+
+ if (skb == NULL)
+ /* FIXME: how to make sure the sync is sent? */
+ return;
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_DCCP_HEADER);
+ skb->csum = 0;
+ DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
+ DCCP_SKB_CB(skb)->dccpd_seq = seq;
+
+ skb_set_owner_w(skb, sk);
+ dccp_transmit_skb(sk, skb);
+}
+
+/*
+ * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
+ * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
+ * any circumstances.
+ */
+void dccp_send_close(struct sock *sk, const int active)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+ const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC;
+
+ skb = alloc_skb(sk->sk_prot->max_header, prio);
+ if (skb == NULL)
+ return;
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+ skb->csum = 0;
+ DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ?
+ DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ;
+
+ skb_set_owner_w(skb, sk);
+ if (active) {
+ BUG_TRAP(sk->sk_send_head == NULL);
+ sk->sk_send_head = skb;
+ dccp_transmit_skb(sk, skb_clone(skb, prio));
+ } else
+ dccp_transmit_skb(sk, skb);
+
+ ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
new file mode 100644
index 0000000..18a0e69
--- /dev/null
+++ b/net/dccp/proto.c
@@ -0,0 +1,826 @@
+/*
+ * net/dccp/proto.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <net/checksum.h>
+
+#include <net/inet_common.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+
+#include <asm/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/poll.h>
+#include <linux/dccp.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+
+atomic_t dccp_orphan_count = ATOMIC_INIT(0);
+
+static struct net_protocol dccp_protocol = {
+ .handler = dccp_v4_rcv,
+ .err_handler = dccp_v4_err,
+};
+
+const char *dccp_packet_name(const int type)
+{
+ static const char *dccp_packet_names[] = {
+ [DCCP_PKT_REQUEST] = "REQUEST",
+ [DCCP_PKT_RESPONSE] = "RESPONSE",
+ [DCCP_PKT_DATA] = "DATA",
+ [DCCP_PKT_ACK] = "ACK",
+ [DCCP_PKT_DATAACK] = "DATAACK",
+ [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PKT_CLOSE] = "CLOSE",
+ [DCCP_PKT_RESET] = "RESET",
+ [DCCP_PKT_SYNC] = "SYNC",
+ [DCCP_PKT_SYNCACK] = "SYNCACK",
+ };
+
+ if (type >= DCCP_NR_PKT_TYPES)
+ return "INVALID";
+ else
+ return dccp_packet_names[type];
+}
+
+EXPORT_SYMBOL_GPL(dccp_packet_name);
+
+const char *dccp_state_name(const int state)
+{
+ static char *dccp_state_names[] = {
+ [DCCP_OPEN] = "OPEN",
+ [DCCP_REQUESTING] = "REQUESTING",
+ [DCCP_PARTOPEN] = "PARTOPEN",
+ [DCCP_LISTEN] = "LISTEN",
+ [DCCP_RESPOND] = "RESPOND",
+ [DCCP_CLOSING] = "CLOSING",
+ [DCCP_TIME_WAIT] = "TIME_WAIT",
+ [DCCP_CLOSED] = "CLOSED",
+ };
+
+ if (state >= DCCP_MAX_STATES)
+ return "INVALID STATE!";
+ else
+ return dccp_state_names[state];
+}
+
+EXPORT_SYMBOL_GPL(dccp_state_name);
+
+static inline int dccp_listen_start(struct sock *sk)
+{
+ dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
+ return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+}
+
+int dccp_disconnect(struct sock *sk, int flags)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ int err = 0;
+ const int old_state = sk->sk_state;
+
+ if (old_state != DCCP_CLOSED)
+ dccp_set_state(sk, DCCP_CLOSED);
+
+ /* ABORT function of RFC793 */
+ if (old_state == DCCP_LISTEN) {
+ inet_csk_listen_stop(sk);
+ /* FIXME: do the active reset thing */
+ } else if (old_state == DCCP_REQUESTING)
+ sk->sk_err = ECONNRESET;
+
+ dccp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ if (sk->sk_send_head != NULL) {
+ __kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+ }
+
+ inet->dport = 0;
+
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ inet_reset_saddr(sk);
+
+ sk->sk_shutdown = 0;
+ sock_reset_flag(sk, SOCK_DONE);
+
+ icsk->icsk_backoff = 0;
+ inet_csk_delack_init(sk);
+ __sk_dst_reset(sk);
+
+ BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+
+ sk->sk_error_report(sk);
+ return err;
+}
+
+/*
+ * Wait for a DCCP event.
+ *
+ * Note that we don't need to lock the socket, as the upper poll layers
+ * take care of normal races (between the test and the event) and we don't
+ * go look at any of the socket buffers directly.
+ */
+static unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+
+ poll_wait(file, sk->sk_sleep, wait);
+ if (sk->sk_state == DCCP_LISTEN)
+ return inet_csk_listen_poll(sk);
+
+ /* Socket is not locked. We are protected from async events
+ by poll logic and correct handling of state changes
+ made by another threads is impossible in any case.
+ */
+
+ mask = 0;
+ if (sk->sk_err)
+ mask = POLLERR;
+
+ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM;
+
+ /* Connected? */
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+ if (atomic_read(&sk->sk_rmem_alloc) > 0)
+ mask |= POLLIN | POLLRDNORM;
+
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+ set_bit(SOCK_ASYNC_NOSPACE,
+ &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* Race breaker. If space is freed after
+ * wspace test but before the flags are set,
+ * IO signal will be lost.
+ */
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+ }
+ return mask;
+}
+
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+ dccp_pr_debug("entry\n");
+ return -ENOIOCTLCMD;
+}
+
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int optlen)
+{
+ struct dccp_sock *dp;
+ int err;
+ int val;
+
+ if (level != SOL_DCCP)
+ return ip_setsockopt(sk, level, optname, optval, optlen);
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ lock_sock(sk);
+
+ dp = dccp_sk(sk);
+ err = 0;
+
+ switch (optname) {
+ case DCCP_SOCKOPT_PACKET_SIZE:
+ dp->dccps_packet_size = val;
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ release_sock(sk);
+ return err;
+}
+
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct dccp_sock *dp;
+ int val, len;
+
+ if (level != SOL_DCCP)
+ return ip_getsockopt(sk, level, optname, optval, optlen);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+ if (len < 0)
+ return -EINVAL;
+
+ dp = dccp_sk(sk);
+
+ switch (optname) {
+ case DCCP_SOCKOPT_PACKET_SIZE:
+ val = dp->dccps_packet_size;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+ const int flags = msg->msg_flags;
+ const int noblock = flags & MSG_DONTWAIT;
+ struct sk_buff *skb;
+ int rc, size;
+ long timeo;
+
+ if (len > dp->dccps_mss_cache)
+ return -EMSGSIZE;
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, noblock);
+
+ /*
+ * We have to use sk_stream_wait_connect here to set sk_write_pending,
+ * so that the trick in dccp_rcv_request_sent_state_process.
+ */
+ /* Wait for a connection to finish. */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
+ if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
+ goto out_release;
+
+ size = sk->sk_prot->max_header + len;
+ release_sock(sk);
+ skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ lock_sock(sk);
+ if (skb == NULL)
+ goto out_release;
+
+ skb_reserve(skb, sk->sk_prot->max_header);
+ rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+ if (rc != 0)
+ goto out_discard;
+
+ rc = dccp_write_xmit(sk, skb, &timeo);
+ /*
+ * XXX we don't use sk_write_queue, so just discard the packet.
+ * Current plan however is to _use_ sk_write_queue with
+ * an algorith similar to tcp_sendmsg, where the main difference
+ * is that in DCCP we have to respect packet boundaries, so
+ * no coalescing of skbs.
+ *
+ * This bug was _quickly_ found & fixed by just looking at an OSTRA
+ * generated callgraph 8) -acme
+ */
+ if (rc != 0)
+ goto out_discard;
+out_release:
+ release_sock(sk);
+ return rc ? : len;
+out_discard:
+ kfree_skb(skb);
+ goto out_release;
+}
+
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t len, int nonblock, int flags, int *addr_len)
+{
+ const struct dccp_hdr *dh;
+ long timeo;
+
+ lock_sock(sk);
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ len = -ENOTCONN;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ do {
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+ if (skb == NULL)
+ goto verify_sock_status;
+
+ dh = dccp_hdr(skb);
+
+ if (dh->dccph_type == DCCP_PKT_DATA ||
+ dh->dccph_type == DCCP_PKT_DATAACK)
+ goto found_ok_skb;
+
+ if (dh->dccph_type == DCCP_PKT_RESET ||
+ dh->dccph_type == DCCP_PKT_CLOSE) {
+ dccp_pr_debug("found fin ok!\n");
+ len = 0;
+ goto found_fin_ok;
+ }
+ dccp_pr_debug("packet_type=%s\n",
+ dccp_packet_name(dh->dccph_type));
+ sk_eat_skb(sk, skb);
+verify_sock_status:
+ if (sock_flag(sk, SOCK_DONE)) {
+ len = 0;
+ break;
+ }
+
+ if (sk->sk_err) {
+ len = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ len = 0;
+ break;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ len = -ENOTCONN;
+ break;
+ }
+ len = 0;
+ break;
+ }
+
+ if (!timeo) {
+ len = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ len = sock_intr_errno(timeo);
+ break;
+ }
+
+ sk_wait_data(sk, &timeo);
+ continue;
+ found_ok_skb:
+ if (len > skb->len)
+ len = skb->len;
+ else if (len < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+
+ if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+ /* Exception. Bailout! */
+ len = -EFAULT;
+ break;
+ }
+ found_fin_ok:
+ if (!(flags & MSG_PEEK))
+ sk_eat_skb(sk, skb);
+ break;
+ } while (1);
+out:
+ release_sock(sk);
+ return len;
+}
+
+static int inet_dccp_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != DCCP_LISTEN) {
+ /*
+ * FIXME: here it probably should be sk->sk_prot->listen_start
+ * see tcp_listen_start
+ */
+ err = dccp_listen_start(sk);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+
+static const unsigned char dccp_new_state[] = {
+ /* current state: new state: action: */
+ [0] = DCCP_CLOSED,
+ [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
+ [DCCP_REQUESTING] = DCCP_CLOSED,
+ [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
+ [DCCP_LISTEN] = DCCP_CLOSED,
+ [DCCP_RESPOND] = DCCP_CLOSED,
+ [DCCP_CLOSING] = DCCP_CLOSED,
+ [DCCP_TIME_WAIT] = DCCP_CLOSED,
+ [DCCP_CLOSED] = DCCP_CLOSED,
+};
+
+static int dccp_close_state(struct sock *sk)
+{
+ const int next = dccp_new_state[sk->sk_state];
+ const int ns = next & DCCP_STATE_MASK;
+
+ if (ns != sk->sk_state)
+ dccp_set_state(sk, ns);
+
+ return next & DCCP_ACTION_FIN;
+}
+
+void dccp_close(struct sock *sk, long timeout)
+{
+ struct sk_buff *skb;
+
+ lock_sock(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ dccp_set_state(sk, DCCP_CLOSED);
+
+ /* Special case. */
+ inet_csk_listen_stop(sk);
+
+ goto adjudge_to_death;
+ }
+
+ /*
+ * We need to flush the recv. buffs. We do this only on the
+ * descriptor close, not protocol-sourced closes, because the
+ *reader process may not have drained the data yet!
+ */
+ /* FIXME: check for unread data */
+ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ __kfree_skb(skb);
+ }
+
+ if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ /* Check zero linger _after_ checking for unread data. */
+ sk->sk_prot->disconnect(sk, 0);
+ } else if (dccp_close_state(sk)) {
+ dccp_send_close(sk, 1);
+ }
+
+ sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+ /*
+ * It is the last release_sock in its life. It will remove backlog.
+ */
+ release_sock(sk);
+ /*
+ * Now socket is owned by kernel and we acquire BH lock
+ * to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ BUG_TRAP(!sock_owned_by_user(sk));
+
+ sock_hold(sk);
+ sock_orphan(sk);
+
+ /*
+ * The last release_sock may have processed the CLOSE or RESET
+ * packet moving sock to CLOSED state, if not we have to fire
+ * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
+ * in draft-ietf-dccp-spec-11. -acme
+ */
+ if (sk->sk_state == DCCP_CLOSING) {
+ /* FIXME: should start at 2 * RTT */
+ /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+#if 0
+ /* Yeah, we should use sk->sk_prot->orphan_count, etc */
+ dccp_set_state(sk, DCCP_CLOSED);
+#endif
+ }
+
+ atomic_inc(sk->sk_prot->orphan_count);
+ if (sk->sk_state == DCCP_CLOSED)
+ inet_csk_destroy_sock(sk);
+
+ /* Otherwise, socket is reprieved until protocol close. */
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
+
+void dccp_shutdown(struct sock *sk, int how)
+{
+ dccp_pr_debug("entry\n");
+}
+
+static struct proto_ops inet_dccp_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
+ .poll = dccp_poll,
+ .ioctl = inet_ioctl,
+ /* FIXME: work on inet_listen to rename it to sock_common_listen */
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+extern struct net_proto_family inet_family_ops;
+
+static struct inet_protosw dccp_v4_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v4_prot,
+ .ops = &inet_dccp_ops,
+ .capability = -1,
+ .no_check = 0,
+ .flags = 0,
+};
+
+/*
+ * This is the global socket data structure used for responding to
+ * the Out-of-the-blue (OOTB) packets. A control sock will be created
+ * for this socket at the initialization time.
+ */
+struct socket *dccp_ctl_socket;
+
+static char dccp_ctl_socket_err_msg[] __initdata =
+ KERN_ERR "DCCP: Failed to create the control socket.\n";
+
+static int __init dccp_ctl_sock_init(void)
+{
+ int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
+ &dccp_ctl_socket);
+ if (rc < 0)
+ printk(dccp_ctl_socket_err_msg);
+ else {
+ dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
+ inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
+ }
+
+ return rc;
+}
+
+#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
+void dccp_ctl_sock_exit(void)
+{
+ if (dccp_ctl_socket != NULL) {
+ sock_release(dccp_ctl_socket);
+ dccp_ctl_socket = NULL;
+ }
+}
+
+EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
+#endif
+
+static int __init init_dccp_v4_mibs(void)
+{
+ int rc = -ENOMEM;
+
+ dccp_statistics[0] = alloc_percpu(struct dccp_mib);
+ if (dccp_statistics[0] == NULL)
+ goto out;
+
+ dccp_statistics[1] = alloc_percpu(struct dccp_mib);
+ if (dccp_statistics[1] == NULL)
+ goto out_free_one;
+
+ rc = 0;
+out:
+ return rc;
+out_free_one:
+ free_percpu(dccp_statistics[0]);
+ dccp_statistics[0] = NULL;
+ goto out;
+
+}
+
+static int thash_entries;
+module_param(thash_entries, int, 0444);
+MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+int dccp_debug;
+module_param(dccp_debug, int, 0444);
+MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+#endif
+
+static int __init dccp_init(void)
+{
+ unsigned long goal;
+ int ehash_order, bhash_order, i;
+ int rc = proto_register(&dccp_v4_prot, 1);
+
+ if (rc)
+ goto out;
+
+ dccp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("dccp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!dccp_hashinfo.bind_bucket_cachep)
+ goto out_proto_unregister;
+
+ /*
+ * Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ if (num_physpages >= (128 * 1024))
+ goal = num_physpages >> (21 - PAGE_SHIFT);
+ else
+ goal = num_physpages >> (23 - PAGE_SHIFT);
+
+ if (thash_entries)
+ goal = (thash_entries *
+ sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
+ for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
+ ;
+ do {
+ dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
+ sizeof(struct inet_ehash_bucket);
+ dccp_hashinfo.ehash_size >>= 1;
+ while (dccp_hashinfo.ehash_size &
+ (dccp_hashinfo.ehash_size - 1))
+ dccp_hashinfo.ehash_size--;
+ dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
+ __get_free_pages(GFP_ATOMIC, ehash_order);
+ } while (!dccp_hashinfo.ehash && --ehash_order > 0);
+
+ if (!dccp_hashinfo.ehash) {
+ printk(KERN_CRIT "Failed to allocate DCCP "
+ "established hash table\n");
+ goto out_free_bind_bucket_cachep;
+ }
+
+ for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
+ rwlock_init(&dccp_hashinfo.ehash[i].lock);
+ INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
+ }
+
+ bhash_order = ehash_order;
+
+ do {
+ dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
+ sizeof(struct inet_bind_hashbucket);
+ if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
+ bhash_order > 0)
+ continue;
+ dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
+ __get_free_pages(GFP_ATOMIC, bhash_order);
+ } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
+
+ if (!dccp_hashinfo.bhash) {
+ printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
+ goto out_free_dccp_ehash;
+ }
+
+ for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&dccp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+ }
+
+ if (init_dccp_v4_mibs())
+ goto out_free_dccp_bhash;
+
+ rc = -EAGAIN;
+ if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
+ goto out_free_dccp_v4_mibs;
+
+ inet_register_protosw(&dccp_v4_protosw);
+
+ rc = dccp_ctl_sock_init();
+ if (rc)
+ goto out_unregister_protosw;
+out:
+ return rc;
+out_unregister_protosw:
+ inet_unregister_protosw(&dccp_v4_protosw);
+ inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
+out_free_dccp_v4_mibs:
+ free_percpu(dccp_statistics[0]);
+ free_percpu(dccp_statistics[1]);
+ dccp_statistics[0] = dccp_statistics[1] = NULL;
+out_free_dccp_bhash:
+ free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+ dccp_hashinfo.bhash = NULL;
+out_free_dccp_ehash:
+ free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+ dccp_hashinfo.ehash = NULL;
+out_free_bind_bucket_cachep:
+ kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+ dccp_hashinfo.bind_bucket_cachep = NULL;
+out_proto_unregister:
+ proto_unregister(&dccp_v4_prot);
+ goto out;
+}
+
+static const char dccp_del_proto_err_msg[] __exitdata =
+ KERN_ERR "can't remove dccp net_protocol\n";
+
+static void __exit dccp_fini(void)
+{
+ inet_unregister_protosw(&dccp_v4_protosw);
+
+ if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
+ printk(dccp_del_proto_err_msg);
+
+ free_percpu(dccp_statistics[0]);
+ free_percpu(dccp_statistics[1]);
+ free_pages((unsigned long)dccp_hashinfo.bhash,
+ get_order(dccp_hashinfo.bhash_size *
+ sizeof(struct inet_bind_hashbucket)));
+ free_pages((unsigned long)dccp_hashinfo.ehash,
+ get_order(dccp_hashinfo.ehash_size *
+ sizeof(struct inet_ehash_bucket)));
+ kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+ proto_unregister(&dccp_v4_prot);
+}
+
+module_init(dccp_init);
+module_exit(dccp_fini);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
new file mode 100644
index 0000000..aa34b57
--- /dev/null
+++ b/net/dccp/timer.c
@@ -0,0 +1,255 @@
+/*
+ * net/dccp/timer.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include "dccp.h"
+
+static void dccp_write_timer(unsigned long data);
+static void dccp_keepalive_timer(unsigned long data);
+static void dccp_delack_timer(unsigned long data);
+
+void dccp_init_xmit_timers(struct sock *sk)
+{
+ inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
+ &dccp_keepalive_timer);
+}
+
+static void dccp_write_err(struct sock *sk)
+{
+ sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+ sk->sk_error_report(sk);
+
+ dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ dccp_done(sk);
+ DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int dccp_write_timeout(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ int retry_until;
+
+ if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
+ if (icsk->icsk_retransmits != 0)
+ dst_negative_advice(&sk->sk_dst_cache);
+ retry_until = icsk->icsk_syn_retries ? :
+ /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */;
+ } else {
+ if (icsk->icsk_retransmits >=
+ /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
+ black hole detection. :-(
+
+ It is place to make it. It is not made. I do not want
+ to make it. It is disguisting. It does not work in any
+ case. Let me to cite the same draft, which requires for
+ us to implement this:
+
+ "The one security concern raised by this memo is that ICMP black holes
+ are often caused by over-zealous security administrators who block
+ all ICMP messages. It is vitally important that those who design and
+ deploy security systems understand the impact of strict filtering on
+ upper-layer protocols. The safest web site in the world is worthless
+ if most TCP implementations cannot transfer data from it. It would
+ be far nicer to have all of the black holes fixed rather than fixing
+ all of the TCP implementations."
+
+ Golden words :-).
+ */
+
+ dst_negative_advice(&sk->sk_dst_cache);
+ }
+
+ retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */;
+ /*
+ * FIXME: see tcp_write_timout and tcp_out_of_resources
+ */
+ }
+
+ if (icsk->icsk_retransmits >= retry_until) {
+ /* Has it gone just too far? */
+ dccp_write_err(sk);
+ return 1;
+ }
+ return 0;
+}
+
+/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
+static void dccp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ icsk->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ jiffies + TCP_DELACK_MIN);
+ goto out;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED ||
+ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ goto out;
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk->icsk_ack.timeout);
+ goto out;
+ }
+
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
+ icsk->icsk_rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
+ dccp_send_ack(sk);
+ NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * The DCCP retransmit timer.
+ */
+static void dccp_retransmit_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /*
+ * sk->sk_send_head has to have one skb with
+ * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
+ * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake
+ * (PARTOPEN timer), etc).
+ */
+ BUG_TRAP(sk->sk_send_head != NULL);
+
+ /*
+ * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
+ * sent, no need to retransmit, this sock is dead.
+ */
+ if (dccp_write_timeout(sk))
+ goto out;
+
+ /*
+ * We want to know the number of packets retransmitted, not the
+ * total number of retransmissions of clones of original packets.
+ */
+ if (icsk->icsk_retransmits == 0)
+ DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
+
+ if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
+ /*
+ * Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (icsk->icsk_retransmits == 0)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto,
+ TCP_RESOURCE_PROBE_INTERVAL),
+ DCCP_RTO_MAX);
+ goto out;
+ }
+
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
+
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
+ DCCP_RTO_MAX);
+ if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */)
+ __sk_dst_reset(sk);
+out:;
+}
+
+static void dccp_write_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int event = 0;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later */
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+ jiffies + (HZ / 20));
+ goto out;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
+ goto out;
+
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+ icsk->icsk_timeout);
+ goto out;
+ }
+
+ event = icsk->icsk_pending;
+ icsk->icsk_pending = 0;
+
+ switch (event) {
+ case ICSK_TIME_RETRANS:
+ dccp_retransmit_timer(sk);
+ break;
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/*
+ * Timer for listening sockets
+ */
+static void dccp_response_timer(struct sock *sk)
+{
+ inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
+ DCCP_RTO_MAX);
+}
+
+static void dccp_keepalive_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ inet_csk_reset_keepalive_timer(sk, HZ / 20);
+ goto out;
+ }
+
+ if (sk->sk_state == DCCP_LISTEN) {
+ dccp_response_timer(sk);
+ goto out;
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 2101da5..92f2ec4 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -1,6 +1,29 @@
#
# DECnet configuration
#
+config DECNET
+ tristate "DECnet Support"
+ ---help---
+ The DECnet networking protocol was used in many products made by
+ Digital (now Compaq). It provides reliable stream and sequenced
+ packet communications over which run a variety of services similar
+ to those which run over TCP/IP.
+
+ To find some tools to use with the kernel layer support, please
+ look at Patrick Caulfield's web site:
+ <http://linux-decnet.sourceforge.net/>.
+
+ More detailed documentation is available in
+ <file:Documentation/networking/decnet.txt>.
+
+ Be sure to say Y to "/proc file system support" and "Sysctl support"
+ below when using DECnet, since you will need sysctl support to aid
+ in configuration at run time.
+
+ The DECnet code is also available as a module ( = code which can be
+ inserted in and removed from the running kernel whenever you want).
+ The module is called decnet.
+
config DECNET_ROUTER
bool "DECnet: router support (EXPERIMENTAL)"
depends on DECNET && EXPERIMENTAL
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 29bb3cd..348f36b 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
#include <linux/netfilter.h>
#include <linux/seq_file.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/flow.h>
#include <asm/system.h>
#include <asm/ioctls.h>
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk)
* we are double checking that we are not sending too
* many of these keepalive frames.
*/
- if (skb_queue_len(&scp->other_xmit_queue) == 0)
+ if (skb_queue_empty(&scp->other_xmit_queue))
dn_nsp_send_link(sk, DN_NOCHANGE, 0);
}
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table
struct dn_scp *scp = DN_SK(sk);
int mask = datagram_poll(file, sock, wait);
- if (skb_queue_len(&scp->other_receive_queue))
+ if (!skb_queue_empty(&scp->other_receive_queue))
mask |= POLLRDBAND;
return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCATMARK:
lock_sock(sk);
- val = (skb_queue_len(&scp->other_receive_queue) != 0);
+ val = !skb_queue_empty(&scp->other_receive_queue);
if (scp->state != DN_RUN)
val = -ENOTCONN;
release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int
int len = 0;
if (flags & MSG_OOB)
- return skb_queue_len(q) ? 1 : 0;
+ return !skb_queue_empty(q) ? 1 : 0;
while(skb != (struct sk_buff *)q) {
struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
if (sk->sk_err)
goto out;
- if (skb_queue_len(&scp->other_receive_queue)) {
+ if (!skb_queue_empty(&scp->other_receive_queue)) {
if (!(flags & MSG_OOB)) {
msg->msg_flags |= MSG_OOB;
if (!scp->other_report) {
@@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
nskb = skb->next;
if (skb->len == 0) {
- skb_unlink(skb);
+ skb_unlink(skb, queue);
kfree_skb(skb);
/*
* N.B. Don't refer to skb or cb after this point
@@ -1876,17 +1876,27 @@ static inline unsigned int dn_current_mss(struct sock *sk, int flags)
return mss_now;
}
-static int dn_error(struct sock *sk, int flags, int err)
+/*
+ * N.B. We get the timeout wrong here, but then we always did get it
+ * wrong before and this is another step along the road to correcting
+ * it. It ought to get updated each time we pass through the routine,
+ * but in practise it probably doesn't matter too much for now.
+ */
+static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
+ unsigned long datalen, int noblock,
+ int *errcode)
{
- if (err == -EPIPE)
- err = sock_error(sk) ? : -EPIPE;
- if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 0);
- return err;
+ struct sk_buff *skb = sock_alloc_send_skb(sk, datalen,
+ noblock, errcode);
+ if (skb) {
+ skb->protocol = __constant_htons(ETH_P_DNA_RT);
+ skb->pkt_type = PACKET_OUTGOING;
+ }
+ return skb;
}
static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
- struct msghdr *msg, size_t size)
+ struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
struct dn_scp *scp = DN_SK(sk);
@@ -1901,7 +1911,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
struct dn_skb_cb *cb;
size_t len;
unsigned char fctype;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ long timeo;
if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
return -EOPNOTSUPP;
@@ -1909,18 +1919,21 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
return -EINVAL;
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/*
* The only difference between stream sockets and sequenced packet
* sockets is that the stream sockets always behave as if MSG_EOR
* has been set.
*/
if (sock->type == SOCK_STREAM) {
- if (flags & MSG_EOR)
- return -EINVAL;
+ if (flags & MSG_EOR) {
+ err = -EINVAL;
+ goto out;
+ }
flags |= MSG_EOR;
}
- lock_sock(sk);
err = dn_check_state(sk, addr, addr_len, &timeo, flags);
if (err)
@@ -1989,8 +2002,12 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
/*
* Get a suitably sized skb.
+ * 64 is a bit of a hack really, but its larger than any
+ * link-layer headers and has served us well as a good
+ * guess as to their real length.
*/
- skb = dn_alloc_send_skb(sk, &len, flags & MSG_DONTWAIT, timeo, &err);
+ skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
+ flags & MSG_DONTWAIT, &err);
if (err)
break;
@@ -2000,7 +2017,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
cb = DN_SKB_CB(skb);
- skb_reserve(skb, DN_MAX_NSP_DATA_HEADER);
+ skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
err = -EFAULT;
@@ -2045,7 +2062,7 @@ out:
return sent ? sent : err;
out_err:
- err = dn_error(sk, flags, err);
+ err = sk_stream_error(sk, flags, err);
release_sock(sk);
return err;
}
@@ -2073,7 +2090,7 @@ static struct notifier_block dn_dev_notifier = {
.notifier_call = dn_device_event,
};
-extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *);
+extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
static struct packet_type dn_dix_packet_type = {
.type = __constant_htons(ETH_P_DNA_RT),
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index ee7bf46..5610bb1 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -716,13 +716,13 @@ static int dn_dev_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *a
}
static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
ifm = NLMSG_DATA(nlh);
ifm->ifa_family = AF_DECnet;
@@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
skb = alloc_skb(size, GFP_KERNEL);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS);
return;
}
- if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+ if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL);
+ NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL);
}
static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
@@ -790,7 +790,8 @@ static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (dn_dev_fill_ifaddr(skb, ifa,
NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
- RTM_NEWADDR) <= 0)
+ RTM_NEWADDR,
+ NLM_F_MULTI) <= 0)
goto done;
}
}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25..99bc061 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (t < s_t)
continue;
if (t > s_t)
- memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+ memset(&cb->args[1], 0,
+ sizeof(cb->args) - sizeof(cb->args[0]));
tb = dn_fib_get_table(t, 0);
if (tb == NULL)
continue;
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f6dfe96..8d0cc3c 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -101,7 +101,6 @@ struct neigh_table dn_neigh_table = {
.id = "dn_neigh_cache",
.parms ={
.tbl = &dn_neigh_table,
- .entries = 0,
.base_reachable_time = 30 * HZ,
.retrans_time = 1 * HZ,
.gc_staletime = 60 * HZ,
@@ -149,12 +148,12 @@ static int dn_neigh_construct(struct neighbour *neigh)
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
if (dn_db->use_long)
neigh->ops = &dn_long_ops;
else
neigh->ops = &dn_short_ops;
+ rcu_read_unlock();
if (dn->flags & DN_NDFLAG_P3)
neigh->ops = &dn_phase3_ops;
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 202dbde..369f25b 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -60,7 +60,7 @@
#include <linux/inet.h>
#include <linux/route.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 42abbf3..53633d3 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -137,69 +137,6 @@ struct sk_buff *dn_alloc_skb(struct sock *sk, int size, int pri)
}
/*
- * Wrapper for the above, for allocs of data skbs. We try and get the
- * whole size thats been asked for (plus 11 bytes of header). If this
- * fails, then we try for any size over 16 bytes for SOCK_STREAMS.
- */
-struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, long timeo, int *err)
-{
- int space;
- int len;
- struct sk_buff *skb = NULL;
-
- *err = 0;
-
- while(skb == NULL) {
- if (signal_pending(current)) {
- *err = sock_intr_errno(timeo);
- break;
- }
-
- if (sk->sk_shutdown & SEND_SHUTDOWN) {
- *err = EINVAL;
- break;
- }
-
- if (sk->sk_err)
- break;
-
- len = *size + 11;
- space = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
-
- if (space < len) {
- if ((sk->sk_socket->type == SOCK_STREAM) &&
- (space >= (16 + 11)))
- len = space;
- }
-
- if (space < len) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- if (noblock) {
- *err = EWOULDBLOCK;
- break;
- }
-
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
- SOCK_SLEEP_PRE(sk)
-
- if ((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) <
- len)
- schedule();
-
- SOCK_SLEEP_POST(sk)
- continue;
- }
-
- if ((skb = dn_alloc_skb(sk, len, sk->sk_allocation)) == NULL)
- continue;
-
- *size = len - 11;
- }
-
- return skb;
-}
-
-/*
* Calculate persist timer based upon the smoothed round
* trip time and the variance. Backoff according to the
* nsp_backoff[] array.
@@ -342,7 +279,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
dn_nsp_output(sk);
- if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue))
+ if (!skb_queue_empty(&scp->data_xmit_queue) ||
+ !skb_queue_empty(&scp->other_xmit_queue))
scp->persist = dn_nsp_persist(sk);
return 0;
@@ -478,7 +416,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff
xmit_count = cb2->xmit_count;
segnum = cb2->segnum;
/* Remove and drop ack'ed packet */
- skb_unlink(ack);
+ skb_unlink(ack, q);
kfree_skb(ack);
ack = NULL;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 1e7b5c3..2c915f3 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb)
return NET_RX_SUCCESS;
}
-int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct dn_skb_cb *cb;
unsigned char flags = 0;
@@ -1465,7 +1465,8 @@ int dn_route_input(struct sk_buff *skb)
return dn_route_input_slow(skb);
}
-static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
+static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event, int nowait, unsigned int flags)
{
struct dn_route *rt = (struct dn_route *)skb->dst;
struct rtmsg *r;
@@ -1473,9 +1474,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int
unsigned char *b = skb->tail;
struct rta_cacheinfo ci;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
r = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
r->rtm_family = AF_DECnet;
r->rtm_dst_len = 16;
r->rtm_src_len = 0;
@@ -1596,7 +1596,7 @@ int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
- err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
+ err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
if (err == 0)
goto out_free;
@@ -1644,7 +1644,8 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
skb->dst = dst_clone(&rt->u.dst);
if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ 1, NLM_F_MULTI) <= 0) {
dst_release(xchg(&skb->dst, NULL));
rcu_read_unlock_bh();
goto done;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 597587d..1060de7 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -342,14 +342,15 @@ static struct notifier_block dn_fib_rules_notifier = {
.notifier_call = dn_fib_rules_event,
};
-static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r, struct netlink_callback *cb)
+static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r,
+ struct netlink_callback *cb, unsigned int flags)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+ nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
rtm = NLMSG_DATA(nlh);
rtm->rtm_family = AF_DECnet;
rtm->rtm_dst_len = r->r_dst_len;
@@ -394,7 +395,7 @@ int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) {
if (idx < s_idx)
continue;
- if (dn_fib_fill_rule(skb, r, cb) < 0)
+ if (dn_fib_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
break;
}
read_unlock(&dn_fib_rules_lock);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index dad5603..eeba56f 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n
static DEFINE_RWLOCK(dn_fib_tables_lock);
struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1];
-static kmem_cache_t *dn_hash_kmem;
+static kmem_cache_t *dn_hash_kmem __read_mostly;
static int dn_fib_hash_zombies;
static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
@@ -270,13 +270,13 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
u8 tb_id, u8 type, u8 scope, void *dst, int dst_len,
- struct dn_fib_info *fi)
+ struct dn_fib_info *fi, unsigned int flags)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
rtm = NLMSG_DATA(nlh);
rtm->rtm_family = AF_DECnet;
rtm->rtm_dst_len = dst_len;
@@ -345,14 +345,14 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id,
f->fn_type, f->fn_scope, &f->fn_key, z,
- DN_FIB_INFO(f)) < 0) {
+ DN_FIB_INFO(f), 0) < 0) {
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE;
+ NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE;
if (nlh->nlmsg_flags & NLM_F_ECHO)
atomic_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL);
if (nlh->nlmsg_flags & NLM_F_ECHO)
netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
}
@@ -377,7 +377,7 @@ static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
tb->n,
(f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
f->fn_scope, &f->fn_key, dz->dz_order,
- f->fn_info) < 0) {
+ f->fn_info, NLM_F_MULTI) < 0) {
cb->args[3] = i;
return -1;
}
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 284a999..1ab94c6 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -19,6 +19,7 @@
#include <linux/netfilter.h>
#include <linux/spinlock.h>
#include <linux/netlink.h>
+#include <linux/netfilter_decnet.h>
#include <net/sock.h>
#include <net/flow.h>
@@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb)
switch(flags & DN_RT_CNTL_MSK) {
case DN_RT_PKT_L1RT:
- group = DNRMG_L1_GROUP;
+ group = DNRNG_NLGRP_L1;
break;
case DN_RT_PKT_L2RT:
- group = DNRMG_L2_GROUP;
+ group = DNRNG_NLGRP_L2;
break;
default:
return;
@@ -83,7 +84,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
skb2 = dnrmg_build_message(skb, &status);
if (skb2 == NULL)
return;
- NETLINK_CB(skb2).dst_groups = group;
+ NETLINK_CB(skb2).dst_group = group;
netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
}
@@ -138,7 +139,8 @@ static int __init init(void)
{
int rv = 0;
- dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk);
+ dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
+ dnrmg_receive_user_sk, THIS_MODULE);
if (dnrmg == NULL) {
printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
return -ENOMEM;
@@ -162,6 +164,7 @@ static void __exit fini(void)
MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
module_init(init);
module_exit(fini);
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 0000000..39a2d29
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
+#
+# Acorn Econet/AUN protocols
+#
+
+config ECONET
+ tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && INET
+ ---help---
+ Econet is a fairly old and slow networking protocol mainly used by
+ Acorn computers to access file and print servers. It uses native
+ Econet network cards. AUN is an implementation of the higher level
+ parts of Econet that runs over ordinary Ethernet connections, on
+ top of the UDP packet protocol, which in turn runs on top of the
+ Internet protocol IP.
+
+ If you say Y here, you can choose with the next two options whether
+ to send Econet/AUN traffic over a UDP Ethernet connection or over
+ a native Econet network card.
+
+ To compile this driver as a module, choose M here: the module
+ will be called econet.
+
+config ECONET_AUNUDP
+ bool "AUN over UDP"
+ depends on ECONET
+ help
+ Say Y here if you want to send Econet/AUN traffic over a UDP
+ connection (UDP is a packet based protocol that runs on top of the
+ Internet protocol IP) using an ordinary Ethernet network card.
+
+config ECONET_NATIVE
+ bool "Native Econet"
+ depends on ECONET
+ help
+ Say Y here if you have a native Econet network card installed in
+ your computer.
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index de691e1..4a62093 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock,
err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
if (err)
goto out_free;
- sk->sk_stamp = skb->stamp;
+ skb_get_timestamp(skb, &sk->sk_stamp);
if (msg->msg_name)
memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
@@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result)
foundit:
tx_result(skb->sk, eb->cookie, result);
- skb_unlink(skb);
+ skb_unlink(skb, &aun_queue);
spin_unlock_irqrestore(&aun_queue_lock, flags);
kfree_skb(skb);
}
@@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h)
{
tx_result(skb->sk, eb->cookie,
ECTYPE_TRANSMIT_NOT_PRESENT);
- skb_unlink(skb);
+ skb_unlink(skb, &aun_queue);
kfree_skb(skb);
}
skb = newskb;
@@ -1009,7 +1009,7 @@ release:
* Receive an Econet frame from a device.
*/
-static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct ec_framehdr *hdr;
struct sock *sk;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea4..87a052a 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,8 +62,6 @@
#include <asm/system.h>
#include <asm/checksum.h>
-extern int __init netdev_boot_setup(char *str);
-
__setup("ether=", netdev_boot_setup);
/*
@@ -92,10 +90,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
* Set the source hardware address.
*/
- if(saddr)
- memcpy(eth->h_source,saddr,dev->addr_len);
- else
- memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
+ if(!saddr)
+ saddr = dev->dev_addr;
+ memcpy(eth->h_source,saddr,dev->addr_len);
/*
* Anyway, the loopback-device should never use this function...
@@ -156,7 +153,7 @@ int eth_rebuild_header(struct sk_buff *skb)
* This is normal practice and works for any 'now in use' protocol.
*/
-unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
struct ethhdr *eth;
unsigned char *rawp;
@@ -164,7 +161,6 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev)
skb->mac.raw=skb->data;
skb_pull(skb,ETH_HLEN);
eth = eth_hdr(skb);
- skb->input_dev = dev;
if(*eth->h_dest&1)
{
diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c
index b81a6d5..66b39fc 100644
--- a/net/ethernet/sysctl_net_ether.c
+++ b/net/ethernet/sysctl_net_ether.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
+#include <linux/if_ether.h>
ctl_table ether_table[] = {
{0}
diff --git a/net/ieee80211/Kconfig b/net/ieee80211/Kconfig
new file mode 100644
index 0000000..91b16fb
--- /dev/null
+++ b/net/ieee80211/Kconfig
@@ -0,0 +1,68 @@
+config IEEE80211
+ tristate "Generic IEEE 802.11 Networking Stack"
+ ---help---
+ This option enables the hardware independent IEEE 802.11
+ networking stack.
+
+config IEEE80211_DEBUG
+ bool "Enable full debugging output"
+ depends on IEEE80211
+ ---help---
+ This option will enable debug tracing output for the
+ ieee80211 network stack.
+
+ This will result in the kernel module being ~70k larger. You
+ can control which debug output is sent to the kernel log by
+ setting the value in
+
+ /proc/net/ieee80211/debug_level
+
+ For example:
+
+ % echo 0x00000FFO > /proc/net/ieee80211/debug_level
+
+ For a list of values you can assign to debug_level, you
+ can look at the bit mask values in <net/ieee80211.h>
+
+ If you are not trying to debug or develop the ieee80211
+ subsystem, you most likely want to say N here.
+
+config IEEE80211_CRYPT_WEP
+ tristate "IEEE 802.11 WEP encryption (802.1x)"
+ depends on IEEE80211
+ select CRYPTO
+ select CRYPTO_ARC4
+ select CRC32
+ ---help---
+ Include software based cipher suites in support of IEEE
+ 802.11's WEP. This is needed for WEP as well as 802.1x.
+
+ This can be compiled as a modules and it will be called
+ "ieee80211_crypt_wep".
+
+config IEEE80211_CRYPT_CCMP
+ tristate "IEEE 802.11i CCMP support"
+ depends on IEEE80211
+ select CRYPTO
+ select CRYPTO_AES
+ ---help---
+ Include software based cipher suites in support of IEEE 802.11i
+ (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with CCMP enabled
+ networks.
+
+ This can be compiled as a modules and it will be called
+ "ieee80211_crypt_ccmp".
+
+config IEEE80211_CRYPT_TKIP
+ tristate "IEEE 802.11i TKIP encryption"
+ depends on IEEE80211
+ select CRYPTO
+ select CRYPTO_MICHAEL_MIC
+ ---help---
+ Include software based cipher suites in support of IEEE 802.11i
+ (aka TGi, WPA, WPA2, WPA-PSK, etc.) for use with TKIP enabled
+ networks.
+
+ This can be compiled as a modules and it will be called
+ "ieee80211_crypt_tkip".
+
diff --git a/net/ieee80211/Makefile b/net/ieee80211/Makefile
new file mode 100644
index 0000000..a6ccac5
--- /dev/null
+++ b/net/ieee80211/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_IEEE80211) += ieee80211.o
+obj-$(CONFIG_IEEE80211) += ieee80211_crypt.o
+obj-$(CONFIG_IEEE80211_CRYPT_WEP) += ieee80211_crypt_wep.o
+obj-$(CONFIG_IEEE80211_CRYPT_CCMP) += ieee80211_crypt_ccmp.o
+obj-$(CONFIG_IEEE80211_CRYPT_TKIP) += ieee80211_crypt_tkip.o
+ieee80211-objs := \
+ ieee80211_module.o \
+ ieee80211_tx.o \
+ ieee80211_rx.o \
+ ieee80211_wx.o
+
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
new file mode 100644
index 0000000..61a9d92
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt.c
@@ -0,0 +1,258 @@
+/*
+ * Host AP crypto routines
+ *
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Portions Copyright (C) 2004, Intel Corporation <jketreno@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/string.h>
+#include <asm/errno.h>
+
+#include <net/ieee80211.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("HostAP crypto");
+MODULE_LICENSE("GPL");
+
+struct ieee80211_crypto_alg {
+ struct list_head list;
+ struct ieee80211_crypto_ops *ops;
+};
+
+struct ieee80211_crypto {
+ struct list_head algs;
+ spinlock_t lock;
+};
+
+static struct ieee80211_crypto *hcrypt;
+
+void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
+{
+ struct list_head *ptr, *n;
+ struct ieee80211_crypt_data *entry;
+
+ for (ptr = ieee->crypt_deinit_list.next, n = ptr->next;
+ ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) {
+ entry = list_entry(ptr, struct ieee80211_crypt_data, list);
+
+ if (atomic_read(&entry->refcnt) != 0 && !force)
+ continue;
+
+ list_del(ptr);
+
+ if (entry->ops) {
+ entry->ops->deinit(entry->priv);
+ module_put(entry->ops->owner);
+ }
+ kfree(entry);
+ }
+}
+
+void ieee80211_crypt_deinit_handler(unsigned long data)
+{
+ struct ieee80211_device *ieee = (struct ieee80211_device *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ieee->lock, flags);
+ ieee80211_crypt_deinit_entries(ieee, 0);
+ if (!list_empty(&ieee->crypt_deinit_list)) {
+ printk(KERN_DEBUG "%s: entries remaining in delayed crypt "
+ "deletion list\n", ieee->dev->name);
+ ieee->crypt_deinit_timer.expires = jiffies + HZ;
+ add_timer(&ieee->crypt_deinit_timer);
+ }
+ spin_unlock_irqrestore(&ieee->lock, flags);
+
+}
+
+void ieee80211_crypt_delayed_deinit(struct ieee80211_device *ieee,
+ struct ieee80211_crypt_data **crypt)
+{
+ struct ieee80211_crypt_data *tmp;
+ unsigned long flags;
+
+ if (*crypt == NULL)
+ return;
+
+ tmp = *crypt;
+ *crypt = NULL;
+
+ /* must not run ops->deinit() while there may be pending encrypt or
+ * decrypt operations. Use a list of delayed deinits to avoid needing
+ * locking. */
+
+ spin_lock_irqsave(&ieee->lock, flags);
+ list_add(&tmp->list, &ieee->crypt_deinit_list);
+ if (!timer_pending(&ieee->crypt_deinit_timer)) {
+ ieee->crypt_deinit_timer.expires = jiffies + HZ;
+ add_timer(&ieee->crypt_deinit_timer);
+ }
+ spin_unlock_irqrestore(&ieee->lock, flags);
+}
+
+int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
+{
+ unsigned long flags;
+ struct ieee80211_crypto_alg *alg;
+
+ if (hcrypt == NULL)
+ return -1;
+
+ alg = kmalloc(sizeof(*alg), GFP_KERNEL);
+ if (alg == NULL)
+ return -ENOMEM;
+
+ memset(alg, 0, sizeof(*alg));
+ alg->ops = ops;
+
+ spin_lock_irqsave(&hcrypt->lock, flags);
+ list_add(&alg->list, &hcrypt->algs);
+ spin_unlock_irqrestore(&hcrypt->lock, flags);
+
+ printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n",
+ ops->name);
+
+ return 0;
+}
+
+int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
+{
+ unsigned long flags;
+ struct list_head *ptr;
+ struct ieee80211_crypto_alg *del_alg = NULL;
+
+ if (hcrypt == NULL)
+ return -1;
+
+ spin_lock_irqsave(&hcrypt->lock, flags);
+ for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
+ struct ieee80211_crypto_alg *alg =
+ (struct ieee80211_crypto_alg *)ptr;
+ if (alg->ops == ops) {
+ list_del(&alg->list);
+ del_alg = alg;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&hcrypt->lock, flags);
+
+ if (del_alg) {
+ printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
+ "'%s'\n", ops->name);
+ kfree(del_alg);
+ }
+
+ return del_alg ? 0 : -1;
+}
+
+struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
+{
+ unsigned long flags;
+ struct list_head *ptr;
+ struct ieee80211_crypto_alg *found_alg = NULL;
+
+ if (hcrypt == NULL)
+ return NULL;
+
+ spin_lock_irqsave(&hcrypt->lock, flags);
+ for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
+ struct ieee80211_crypto_alg *alg =
+ (struct ieee80211_crypto_alg *)ptr;
+ if (strcmp(alg->ops->name, name) == 0) {
+ found_alg = alg;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&hcrypt->lock, flags);
+
+ if (found_alg)
+ return found_alg->ops;
+ else
+ return NULL;
+}
+
+static void *ieee80211_crypt_null_init(int keyidx)
+{
+ return (void *)1;
+}
+static void ieee80211_crypt_null_deinit(void *priv)
+{
+}
+
+static struct ieee80211_crypto_ops ieee80211_crypt_null = {
+ .name = "NULL",
+ .init = ieee80211_crypt_null_init,
+ .deinit = ieee80211_crypt_null_deinit,
+ .encrypt_mpdu = NULL,
+ .decrypt_mpdu = NULL,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = NULL,
+ .get_key = NULL,
+ .extra_prefix_len = 0,
+ .extra_postfix_len = 0,
+ .owner = THIS_MODULE,
+};
+
+static int __init ieee80211_crypto_init(void)
+{
+ int ret = -ENOMEM;
+
+ hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL);
+ if (!hcrypt)
+ goto out;
+
+ memset(hcrypt, 0, sizeof(*hcrypt));
+ INIT_LIST_HEAD(&hcrypt->algs);
+ spin_lock_init(&hcrypt->lock);
+
+ ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null);
+ if (ret < 0) {
+ kfree(hcrypt);
+ hcrypt = NULL;
+ }
+ out:
+ return ret;
+}
+
+static void __exit ieee80211_crypto_deinit(void)
+{
+ struct list_head *ptr, *n;
+
+ if (hcrypt == NULL)
+ return;
+
+ for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs;
+ ptr = n, n = ptr->next) {
+ struct ieee80211_crypto_alg *alg =
+ (struct ieee80211_crypto_alg *)ptr;
+ list_del(ptr);
+ printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
+ "'%s' (deinit)\n", alg->ops->name);
+ kfree(alg);
+ }
+
+ kfree(hcrypt);
+}
+
+EXPORT_SYMBOL(ieee80211_crypt_deinit_entries);
+EXPORT_SYMBOL(ieee80211_crypt_deinit_handler);
+EXPORT_SYMBOL(ieee80211_crypt_delayed_deinit);
+
+EXPORT_SYMBOL(ieee80211_register_crypto_ops);
+EXPORT_SYMBOL(ieee80211_unregister_crypto_ops);
+EXPORT_SYMBOL(ieee80211_get_crypto_ops);
+
+module_init(ieee80211_crypto_init);
+module_exit(ieee80211_crypto_deinit);
diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c
new file mode 100644
index 0000000..8fc13f4
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_ccmp.c
@@ -0,0 +1,455 @@
+/*
+ * Host AP crypt: host-based CCMP encryption implementation for Host AP driver
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+#include <linux/wireless.h>
+
+#include <net/ieee80211.h>
+
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: CCMP");
+MODULE_LICENSE("GPL");
+
+#define AES_BLOCK_LEN 16
+#define CCMP_HDR_LEN 8
+#define CCMP_MIC_LEN 8
+#define CCMP_TK_LEN 16
+#define CCMP_PN_LEN 6
+
+struct ieee80211_ccmp_data {
+ u8 key[CCMP_TK_LEN];
+ int key_set;
+
+ u8 tx_pn[CCMP_PN_LEN];
+ u8 rx_pn[CCMP_PN_LEN];
+
+ u32 dot11RSNAStatsCCMPFormatErrors;
+ u32 dot11RSNAStatsCCMPReplays;
+ u32 dot11RSNAStatsCCMPDecryptErrors;
+
+ int key_idx;
+
+ struct crypto_tfm *tfm;
+
+ /* scratch buffers for virt_to_page() (crypto API) */
+ u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
+ tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
+ u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
+};
+
+static void ieee80211_ccmp_aes_encrypt(struct crypto_tfm *tfm,
+ const u8 pt[16], u8 ct[16])
+{
+ struct scatterlist src, dst;
+
+ src.page = virt_to_page(pt);
+ src.offset = offset_in_page(pt);
+ src.length = AES_BLOCK_LEN;
+
+ dst.page = virt_to_page(ct);
+ dst.offset = offset_in_page(ct);
+ dst.length = AES_BLOCK_LEN;
+
+ crypto_cipher_encrypt(tfm, &dst, &src, AES_BLOCK_LEN);
+}
+
+static void *ieee80211_ccmp_init(int key_idx)
+{
+ struct ieee80211_ccmp_data *priv;
+
+ priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+ memset(priv, 0, sizeof(*priv));
+ priv->key_idx = key_idx;
+
+ priv->tfm = crypto_alloc_tfm("aes", 0);
+ if (priv->tfm == NULL) {
+ printk(KERN_DEBUG "ieee80211_crypt_ccmp: could not allocate "
+ "crypto API aes\n");
+ goto fail;
+ }
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tfm)
+ crypto_free_tfm(priv->tfm);
+ kfree(priv);
+ }
+
+ return NULL;
+}
+
+static void ieee80211_ccmp_deinit(void *priv)
+{
+ struct ieee80211_ccmp_data *_priv = priv;
+ if (_priv && _priv->tfm)
+ crypto_free_tfm(_priv->tfm);
+ kfree(priv);
+}
+
+static inline void xor_block(u8 * b, u8 * a, size_t len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ b[i] ^= a[i];
+}
+
+static void ccmp_init_blocks(struct crypto_tfm *tfm,
+ struct ieee80211_hdr *hdr,
+ u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0)
+{
+ u8 *pos, qc = 0;
+ size_t aad_len;
+ u16 fc;
+ int a4_included, qc_included;
+ u8 aad[2 * AES_BLOCK_LEN];
+
+ fc = le16_to_cpu(hdr->frame_ctl);
+ a4_included = ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS));
+ qc_included = ((WLAN_FC_GET_TYPE(fc) == IEEE80211_FTYPE_DATA) &&
+ (WLAN_FC_GET_STYPE(fc) & 0x08));
+ aad_len = 22;
+ if (a4_included)
+ aad_len += 6;
+ if (qc_included) {
+ pos = (u8 *) & hdr->addr4;
+ if (a4_included)
+ pos += 6;
+ qc = *pos & 0x0f;
+ aad_len += 2;
+ }
+
+ /* CCM Initial Block:
+ * Flag (Include authentication header, M=3 (8-octet MIC),
+ * L=1 (2-octet Dlen))
+ * Nonce: 0x00 | A2 | PN
+ * Dlen */
+ b0[0] = 0x59;
+ b0[1] = qc;
+ memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
+ memcpy(b0 + 8, pn, CCMP_PN_LEN);
+ b0[14] = (dlen >> 8) & 0xff;
+ b0[15] = dlen & 0xff;
+
+ /* AAD:
+ * FC with bits 4..6 and 11..13 masked to zero; 14 is always one
+ * A1 | A2 | A3
+ * SC with bits 4..15 (seq#) masked to zero
+ * A4 (if present)
+ * QC (if present)
+ */
+ pos = (u8 *) hdr;
+ aad[0] = 0; /* aad_len >> 8 */
+ aad[1] = aad_len & 0xff;
+ aad[2] = pos[0] & 0x8f;
+ aad[3] = pos[1] & 0xc7;
+ memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
+ pos = (u8 *) & hdr->seq_ctl;
+ aad[22] = pos[0] & 0x0f;
+ aad[23] = 0; /* all bits masked */
+ memset(aad + 24, 0, 8);
+ if (a4_included)
+ memcpy(aad + 24, hdr->addr4, ETH_ALEN);
+ if (qc_included) {
+ aad[a4_included ? 30 : 24] = qc;
+ /* rest of QC masked */
+ }
+
+ /* Start with the first block and AAD */
+ ieee80211_ccmp_aes_encrypt(tfm, b0, auth);
+ xor_block(auth, aad, AES_BLOCK_LEN);
+ ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
+ xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
+ ieee80211_ccmp_aes_encrypt(tfm, auth, auth);
+ b0[0] &= 0x07;
+ b0[14] = b0[15] = 0;
+ ieee80211_ccmp_aes_encrypt(tfm, b0, s0);
+}
+
+static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_ccmp_data *key = priv;
+ int data_len, i, blocks, last, len;
+ u8 *pos, *mic;
+ struct ieee80211_hdr *hdr;
+ u8 *b0 = key->tx_b0;
+ u8 *b = key->tx_b;
+ u8 *e = key->tx_e;
+ u8 *s0 = key->tx_s0;
+
+ if (skb_headroom(skb) < CCMP_HDR_LEN ||
+ skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len)
+ return -1;
+
+ data_len = skb->len - hdr_len;
+ pos = skb_push(skb, CCMP_HDR_LEN);
+ memmove(pos, pos + CCMP_HDR_LEN, hdr_len);
+ pos += hdr_len;
+ mic = skb_put(skb, CCMP_MIC_LEN);
+
+ i = CCMP_PN_LEN - 1;
+ while (i >= 0) {
+ key->tx_pn[i]++;
+ if (key->tx_pn[i] != 0)
+ break;
+ i--;
+ }
+
+ *pos++ = key->tx_pn[5];
+ *pos++ = key->tx_pn[4];
+ *pos++ = 0;
+ *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+ *pos++ = key->tx_pn[3];
+ *pos++ = key->tx_pn[2];
+ *pos++ = key->tx_pn[1];
+ *pos++ = key->tx_pn[0];
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
+
+ blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
+ last = data_len % AES_BLOCK_LEN;
+
+ for (i = 1; i <= blocks; i++) {
+ len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+ /* Authentication */
+ xor_block(b, pos, len);
+ ieee80211_ccmp_aes_encrypt(key->tfm, b, b);
+ /* Encryption, with counter */
+ b0[14] = (i >> 8) & 0xff;
+ b0[15] = i & 0xff;
+ ieee80211_ccmp_aes_encrypt(key->tfm, b0, e);
+ xor_block(pos, e, len);
+ pos += len;
+ }
+
+ for (i = 0; i < CCMP_MIC_LEN; i++)
+ mic[i] = b[i] ^ s0[i];
+
+ return 0;
+}
+
+static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_ccmp_data *key = priv;
+ u8 keyidx, *pos;
+ struct ieee80211_hdr *hdr;
+ u8 *b0 = key->rx_b0;
+ u8 *b = key->rx_b;
+ u8 *a = key->rx_a;
+ u8 pn[6];
+ int i, blocks, last, len;
+ size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
+ u8 *mic = skb->data + skb->len - CCMP_MIC_LEN;
+
+ if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
+ key->dot11RSNAStatsCCMPFormatErrors++;
+ return -1;
+ }
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ pos = skb->data + hdr_len;
+ keyidx = pos[3];
+ if (!(keyidx & (1 << 5))) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: received packet without ExtIV"
+ " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
+ }
+ key->dot11RSNAStatsCCMPFormatErrors++;
+ return -2;
+ }
+ keyidx >>= 6;
+ if (key->key_idx != keyidx) {
+ printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
+ "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv);
+ return -6;
+ }
+ if (!key->key_set) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: received packet from " MAC_FMT
+ " with keyid=%d that does not have a configured"
+ " key\n", MAC_ARG(hdr->addr2), keyidx);
+ }
+ return -3;
+ }
+
+ pn[0] = pos[7];
+ pn[1] = pos[6];
+ pn[2] = pos[5];
+ pn[3] = pos[4];
+ pn[4] = pos[1];
+ pn[5] = pos[0];
+ pos += 8;
+
+ if (memcmp(pn, key->rx_pn, CCMP_PN_LEN) <= 0) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: replay detected: STA=" MAC_FMT
+ " previous PN %02x%02x%02x%02x%02x%02x "
+ "received PN %02x%02x%02x%02x%02x%02x\n",
+ MAC_ARG(hdr->addr2), MAC_ARG(key->rx_pn),
+ MAC_ARG(pn));
+ }
+ key->dot11RSNAStatsCCMPReplays++;
+ return -4;
+ }
+
+ ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
+ xor_block(mic, b, CCMP_MIC_LEN);
+
+ blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN;
+ last = data_len % AES_BLOCK_LEN;
+
+ for (i = 1; i <= blocks; i++) {
+ len = (i == blocks && last) ? last : AES_BLOCK_LEN;
+ /* Decrypt, with counter */
+ b0[14] = (i >> 8) & 0xff;
+ b0[15] = i & 0xff;
+ ieee80211_ccmp_aes_encrypt(key->tfm, b0, b);
+ xor_block(pos, b, len);
+ /* Authentication */
+ xor_block(a, pos, len);
+ ieee80211_ccmp_aes_encrypt(key->tfm, a, a);
+ pos += len;
+ }
+
+ if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "CCMP: decrypt failed: STA="
+ MAC_FMT "\n", MAC_ARG(hdr->addr2));
+ }
+ key->dot11RSNAStatsCCMPDecryptErrors++;
+ return -5;
+ }
+
+ memcpy(key->rx_pn, pn, CCMP_PN_LEN);
+
+ /* Remove hdr and MIC */
+ memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len);
+ skb_pull(skb, CCMP_HDR_LEN);
+ skb_trim(skb, skb->len - CCMP_MIC_LEN);
+
+ return keyidx;
+}
+
+static int ieee80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct ieee80211_ccmp_data *data = priv;
+ int keyidx;
+ struct crypto_tfm *tfm = data->tfm;
+
+ keyidx = data->key_idx;
+ memset(data, 0, sizeof(*data));
+ data->key_idx = keyidx;
+ data->tfm = tfm;
+ if (len == CCMP_TK_LEN) {
+ memcpy(data->key, key, CCMP_TK_LEN);
+ data->key_set = 1;
+ if (seq) {
+ data->rx_pn[0] = seq[5];
+ data->rx_pn[1] = seq[4];
+ data->rx_pn[2] = seq[3];
+ data->rx_pn[3] = seq[2];
+ data->rx_pn[4] = seq[1];
+ data->rx_pn[5] = seq[0];
+ }
+ crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN);
+ } else if (len == 0)
+ data->key_set = 0;
+ else
+ return -1;
+
+ return 0;
+}
+
+static int ieee80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct ieee80211_ccmp_data *data = priv;
+
+ if (len < CCMP_TK_LEN)
+ return -1;
+
+ if (!data->key_set)
+ return 0;
+ memcpy(key, data->key, CCMP_TK_LEN);
+
+ if (seq) {
+ seq[0] = data->tx_pn[5];
+ seq[1] = data->tx_pn[4];
+ seq[2] = data->tx_pn[3];
+ seq[3] = data->tx_pn[2];
+ seq[4] = data->tx_pn[1];
+ seq[5] = data->tx_pn[0];
+ }
+
+ return CCMP_TK_LEN;
+}
+
+static char *ieee80211_ccmp_print_stats(char *p, void *priv)
+{
+ struct ieee80211_ccmp_data *ccmp = priv;
+ p += sprintf(p, "key[%d] alg=CCMP key_set=%d "
+ "tx_pn=%02x%02x%02x%02x%02x%02x "
+ "rx_pn=%02x%02x%02x%02x%02x%02x "
+ "format_errors=%d replays=%d decrypt_errors=%d\n",
+ ccmp->key_idx, ccmp->key_set,
+ MAC_ARG(ccmp->tx_pn), MAC_ARG(ccmp->rx_pn),
+ ccmp->dot11RSNAStatsCCMPFormatErrors,
+ ccmp->dot11RSNAStatsCCMPReplays,
+ ccmp->dot11RSNAStatsCCMPDecryptErrors);
+
+ return p;
+}
+
+static struct ieee80211_crypto_ops ieee80211_crypt_ccmp = {
+ .name = "CCMP",
+ .init = ieee80211_ccmp_init,
+ .deinit = ieee80211_ccmp_deinit,
+ .encrypt_mpdu = ieee80211_ccmp_encrypt,
+ .decrypt_mpdu = ieee80211_ccmp_decrypt,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = ieee80211_ccmp_set_key,
+ .get_key = ieee80211_ccmp_get_key,
+ .print_stats = ieee80211_ccmp_print_stats,
+ .extra_prefix_len = CCMP_HDR_LEN,
+ .extra_postfix_len = CCMP_MIC_LEN,
+ .owner = THIS_MODULE,
+};
+
+static int __init ieee80211_crypto_ccmp_init(void)
+{
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_ccmp);
+}
+
+static void __exit ieee80211_crypto_ccmp_exit(void)
+{
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_ccmp);
+}
+
+module_init(ieee80211_crypto_ccmp_init);
+module_exit(ieee80211_crypto_ccmp_exit);
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
new file mode 100644
index 0000000..d4f9164
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_tkip.c
@@ -0,0 +1,683 @@
+/*
+ * Host AP crypt: host-based TKIP encryption implementation for Host AP driver
+ *
+ * Copyright (c) 2003-2004, Jouni Malinen <jkmaline@cc.hut.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_arp.h>
+#include <asm/string.h>
+
+#include <net/ieee80211.h>
+
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+#include <linux/crc32.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: TKIP");
+MODULE_LICENSE("GPL");
+
+struct ieee80211_tkip_data {
+#define TKIP_KEY_LEN 32
+ u8 key[TKIP_KEY_LEN];
+ int key_set;
+
+ u32 tx_iv32;
+ u16 tx_iv16;
+ u16 tx_ttak[5];
+ int tx_phase1_done;
+
+ u32 rx_iv32;
+ u16 rx_iv16;
+ u16 rx_ttak[5];
+ int rx_phase1_done;
+ u32 rx_iv32_new;
+ u16 rx_iv16_new;
+
+ u32 dot11RSNAStatsTKIPReplays;
+ u32 dot11RSNAStatsTKIPICVErrors;
+ u32 dot11RSNAStatsTKIPLocalMICFailures;
+
+ int key_idx;
+
+ struct crypto_tfm *tfm_arc4;
+ struct crypto_tfm *tfm_michael;
+
+ /* scratch buffers for virt_to_page() (crypto API) */
+ u8 rx_hdr[16], tx_hdr[16];
+};
+
+static void *ieee80211_tkip_init(int key_idx)
+{
+ struct ieee80211_tkip_data *priv;
+
+ priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+ memset(priv, 0, sizeof(*priv));
+ priv->key_idx = key_idx;
+
+ priv->tfm_arc4 = crypto_alloc_tfm("arc4", 0);
+ if (priv->tfm_arc4 == NULL) {
+ printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
+ "crypto API arc4\n");
+ goto fail;
+ }
+
+ priv->tfm_michael = crypto_alloc_tfm("michael_mic", 0);
+ if (priv->tfm_michael == NULL) {
+ printk(KERN_DEBUG "ieee80211_crypt_tkip: could not allocate "
+ "crypto API michael_mic\n");
+ goto fail;
+ }
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tfm_michael)
+ crypto_free_tfm(priv->tfm_michael);
+ if (priv->tfm_arc4)
+ crypto_free_tfm(priv->tfm_arc4);
+ kfree(priv);
+ }
+
+ return NULL;
+}
+
+static void ieee80211_tkip_deinit(void *priv)
+{
+ struct ieee80211_tkip_data *_priv = priv;
+ if (_priv && _priv->tfm_michael)
+ crypto_free_tfm(_priv->tfm_michael);
+ if (_priv && _priv->tfm_arc4)
+ crypto_free_tfm(_priv->tfm_arc4);
+ kfree(priv);
+}
+
+static inline u16 RotR1(u16 val)
+{
+ return (val >> 1) | (val << 15);
+}
+
+static inline u8 Lo8(u16 val)
+{
+ return val & 0xff;
+}
+
+static inline u8 Hi8(u16 val)
+{
+ return val >> 8;
+}
+
+static inline u16 Lo16(u32 val)
+{
+ return val & 0xffff;
+}
+
+static inline u16 Hi16(u32 val)
+{
+ return val >> 16;
+}
+
+static inline u16 Mk16(u8 hi, u8 lo)
+{
+ return lo | (((u16) hi) << 8);
+}
+
+static inline u16 Mk16_le(u16 * v)
+{
+ return le16_to_cpu(*v);
+}
+
+static const u16 Sbox[256] = {
+ 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154,
+ 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A,
+ 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B,
+ 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B,
+ 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F,
+ 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F,
+ 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5,
+ 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F,
+ 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB,
+ 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397,
+ 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED,
+ 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A,
+ 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194,
+ 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3,
+ 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104,
+ 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D,
+ 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39,
+ 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695,
+ 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83,
+ 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76,
+ 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4,
+ 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B,
+ 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0,
+ 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018,
+ 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751,
+ 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85,
+ 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12,
+ 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9,
+ 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7,
+ 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A,
+ 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8,
+ 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A,
+};
+
+static inline u16 _S_(u16 v)
+{
+ u16 t = Sbox[Hi8(v)];
+ return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8));
+}
+
+#define PHASE1_LOOP_COUNT 8
+
+static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA,
+ u32 IV32)
+{
+ int i, j;
+
+ /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */
+ TTAK[0] = Lo16(IV32);
+ TTAK[1] = Hi16(IV32);
+ TTAK[2] = Mk16(TA[1], TA[0]);
+ TTAK[3] = Mk16(TA[3], TA[2]);
+ TTAK[4] = Mk16(TA[5], TA[4]);
+
+ for (i = 0; i < PHASE1_LOOP_COUNT; i++) {
+ j = 2 * (i & 1);
+ TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j]));
+ TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j]));
+ TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j]));
+ TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j]));
+ TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i;
+ }
+}
+
+static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK,
+ u16 IV16)
+{
+ /* Make temporary area overlap WEP seed so that the final copy can be
+ * avoided on little endian hosts. */
+ u16 *PPK = (u16 *) & WEPSeed[4];
+
+ /* Step 1 - make copy of TTAK and bring in TSC */
+ PPK[0] = TTAK[0];
+ PPK[1] = TTAK[1];
+ PPK[2] = TTAK[2];
+ PPK[3] = TTAK[3];
+ PPK[4] = TTAK[4];
+ PPK[5] = TTAK[4] + IV16;
+
+ /* Step 2 - 96-bit bijective mixing using S-box */
+ PPK[0] += _S_(PPK[5] ^ Mk16_le((u16 *) & TK[0]));
+ PPK[1] += _S_(PPK[0] ^ Mk16_le((u16 *) & TK[2]));
+ PPK[2] += _S_(PPK[1] ^ Mk16_le((u16 *) & TK[4]));
+ PPK[3] += _S_(PPK[2] ^ Mk16_le((u16 *) & TK[6]));
+ PPK[4] += _S_(PPK[3] ^ Mk16_le((u16 *) & TK[8]));
+ PPK[5] += _S_(PPK[4] ^ Mk16_le((u16 *) & TK[10]));
+
+ PPK[0] += RotR1(PPK[5] ^ Mk16_le((u16 *) & TK[12]));
+ PPK[1] += RotR1(PPK[0] ^ Mk16_le((u16 *) & TK[14]));
+ PPK[2] += RotR1(PPK[1]);
+ PPK[3] += RotR1(PPK[2]);
+ PPK[4] += RotR1(PPK[3]);
+ PPK[5] += RotR1(PPK[4]);
+
+ /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value
+ * WEPSeed[0..2] is transmitted as WEP IV */
+ WEPSeed[0] = Hi8(IV16);
+ WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F;
+ WEPSeed[2] = Lo8(IV16);
+ WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((u16 *) & TK[0])) >> 1);
+
+#ifdef __BIG_ENDIAN
+ {
+ int i;
+ for (i = 0; i < 6; i++)
+ PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8);
+ }
+#endif
+}
+
+static int ieee80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_tkip_data *tkey = priv;
+ int len;
+ u8 rc4key[16], *pos, *icv;
+ struct ieee80211_hdr *hdr;
+ u32 crc;
+ struct scatterlist sg;
+
+ if (skb_headroom(skb) < 8 || skb_tailroom(skb) < 4 ||
+ skb->len < hdr_len)
+ return -1;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ if (!tkey->tx_phase1_done) {
+ tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2,
+ tkey->tx_iv32);
+ tkey->tx_phase1_done = 1;
+ }
+ tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16);
+
+ len = skb->len - hdr_len;
+ pos = skb_push(skb, 8);
+ memmove(pos, pos + 8, hdr_len);
+ pos += hdr_len;
+ icv = skb_put(skb, 4);
+
+ *pos++ = rc4key[0];
+ *pos++ = rc4key[1];
+ *pos++ = rc4key[2];
+ *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ;
+ *pos++ = tkey->tx_iv32 & 0xff;
+ *pos++ = (tkey->tx_iv32 >> 8) & 0xff;
+ *pos++ = (tkey->tx_iv32 >> 16) & 0xff;
+ *pos++ = (tkey->tx_iv32 >> 24) & 0xff;
+
+ crc = ~crc32_le(~0, pos, len);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+
+ crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16);
+ sg.page = virt_to_page(pos);
+ sg.offset = offset_in_page(pos);
+ sg.length = len + 4;
+ crypto_cipher_encrypt(tkey->tfm_arc4, &sg, &sg, len + 4);
+
+ tkey->tx_iv16++;
+ if (tkey->tx_iv16 == 0) {
+ tkey->tx_phase1_done = 0;
+ tkey->tx_iv32++;
+ }
+
+ return 0;
+}
+
+static int ieee80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct ieee80211_tkip_data *tkey = priv;
+ u8 rc4key[16];
+ u8 keyidx, *pos;
+ u32 iv32;
+ u16 iv16;
+ struct ieee80211_hdr *hdr;
+ u8 icv[4];
+ u32 crc;
+ struct scatterlist sg;
+ int plen;
+
+ if (skb->len < hdr_len + 8 + 4)
+ return -1;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ pos = skb->data + hdr_len;
+ keyidx = pos[3];
+ if (!(keyidx & (1 << 5))) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "TKIP: received packet without ExtIV"
+ " flag from " MAC_FMT "\n", MAC_ARG(hdr->addr2));
+ }
+ return -2;
+ }
+ keyidx >>= 6;
+ if (tkey->key_idx != keyidx) {
+ printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
+ "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv);
+ return -6;
+ }
+ if (!tkey->key_set) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "TKIP: received packet from " MAC_FMT
+ " with keyid=%d that does not have a configured"
+ " key\n", MAC_ARG(hdr->addr2), keyidx);
+ }
+ return -3;
+ }
+ iv16 = (pos[0] << 8) | pos[2];
+ iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24);
+ pos += 8;
+
+ if (iv32 < tkey->rx_iv32 ||
+ (iv32 == tkey->rx_iv32 && iv16 <= tkey->rx_iv16)) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "TKIP: replay detected: STA=" MAC_FMT
+ " previous TSC %08x%04x received TSC "
+ "%08x%04x\n", MAC_ARG(hdr->addr2),
+ tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
+ }
+ tkey->dot11RSNAStatsTKIPReplays++;
+ return -4;
+ }
+
+ if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) {
+ tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32);
+ tkey->rx_phase1_done = 1;
+ }
+ tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16);
+
+ plen = skb->len - hdr_len - 12;
+
+ crypto_cipher_setkey(tkey->tfm_arc4, rc4key, 16);
+ sg.page = virt_to_page(pos);
+ sg.offset = offset_in_page(pos);
+ sg.length = plen + 4;
+ crypto_cipher_decrypt(tkey->tfm_arc4, &sg, &sg, plen + 4);
+
+ crc = ~crc32_le(~0, pos, plen);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+ if (memcmp(icv, pos + plen, 4) != 0) {
+ if (iv32 != tkey->rx_iv32) {
+ /* Previously cached Phase1 result was already lost, so
+ * it needs to be recalculated for the next packet. */
+ tkey->rx_phase1_done = 0;
+ }
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "TKIP: ICV error detected: STA="
+ MAC_FMT "\n", MAC_ARG(hdr->addr2));
+ }
+ tkey->dot11RSNAStatsTKIPICVErrors++;
+ return -5;
+ }
+
+ /* Update real counters only after Michael MIC verification has
+ * completed */
+ tkey->rx_iv32_new = iv32;
+ tkey->rx_iv16_new = iv16;
+
+ /* Remove IV and ICV */
+ memmove(skb->data + 8, skb->data, hdr_len);
+ skb_pull(skb, 8);
+ skb_trim(skb, skb->len - 4);
+
+ return keyidx;
+}
+
+static int michael_mic(struct ieee80211_tkip_data *tkey, u8 * key, u8 * hdr,
+ u8 * data, size_t data_len, u8 * mic)
+{
+ struct scatterlist sg[2];
+
+ if (tkey->tfm_michael == NULL) {
+ printk(KERN_WARNING "michael_mic: tfm_michael == NULL\n");
+ return -1;
+ }
+ sg[0].page = virt_to_page(hdr);
+ sg[0].offset = offset_in_page(hdr);
+ sg[0].length = 16;
+
+ sg[1].page = virt_to_page(data);
+ sg[1].offset = offset_in_page(data);
+ sg[1].length = data_len;
+
+ crypto_digest_init(tkey->tfm_michael);
+ crypto_digest_setkey(tkey->tfm_michael, key, 8);
+ crypto_digest_update(tkey->tfm_michael, sg, 2);
+ crypto_digest_final(tkey->tfm_michael, mic);
+
+ return 0;
+}
+
+static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
+{
+ struct ieee80211_hdr *hdr11;
+
+ hdr11 = (struct ieee80211_hdr *)skb->data;
+ switch (le16_to_cpu(hdr11->frame_ctl) &
+ (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
+ case IEEE80211_FCTL_TODS:
+ memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
+ break;
+ case IEEE80211_FCTL_FROMDS:
+ memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */
+ break;
+ case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
+ memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */
+ break;
+ case 0:
+ memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */
+ memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */
+ break;
+ }
+
+ hdr[12] = 0; /* priority */
+ hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */
+}
+
+static int ieee80211_michael_mic_add(struct sk_buff *skb, int hdr_len,
+ void *priv)
+{
+ struct ieee80211_tkip_data *tkey = priv;
+ u8 *pos;
+
+ if (skb_tailroom(skb) < 8 || skb->len < hdr_len) {
+ printk(KERN_DEBUG "Invalid packet for Michael MIC add "
+ "(tailroom=%d hdr_len=%d skb->len=%d)\n",
+ skb_tailroom(skb), hdr_len, skb->len);
+ return -1;
+ }
+
+ michael_mic_hdr(skb, tkey->tx_hdr);
+ pos = skb_put(skb, 8);
+ if (michael_mic(tkey, &tkey->key[16], tkey->tx_hdr,
+ skb->data + hdr_len, skb->len - 8 - hdr_len, pos))
+ return -1;
+
+ return 0;
+}
+
+#if WIRELESS_EXT >= 18
+static void ieee80211_michael_mic_failure(struct net_device *dev,
+ struct ieee80211_hdr *hdr, int keyidx)
+{
+ union iwreq_data wrqu;
+ struct iw_michaelmicfailure ev;
+
+ /* TODO: needed parameters: count, keyid, key type, TSC */
+ memset(&ev, 0, sizeof(ev));
+ ev.flags = keyidx & IW_MICFAILURE_KEY_ID;
+ if (hdr->addr1[0] & 0x01)
+ ev.flags |= IW_MICFAILURE_GROUP;
+ else
+ ev.flags |= IW_MICFAILURE_PAIRWISE;
+ ev.src_addr.sa_family = ARPHRD_ETHER;
+ memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN);
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = sizeof(ev);
+ wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev);
+}
+#elif WIRELESS_EXT >= 15
+static void ieee80211_michael_mic_failure(struct net_device *dev,
+ struct ieee80211_hdr *hdr, int keyidx)
+{
+ union iwreq_data wrqu;
+ char buf[128];
+
+ /* TODO: needed parameters: count, keyid, key type, TSC */
+ sprintf(buf, "MLME-MICHAELMICFAILURE.indication(keyid=%d %scast addr="
+ MAC_FMT ")", keyidx, hdr->addr1[0] & 0x01 ? "broad" : "uni",
+ MAC_ARG(hdr->addr2));
+ memset(&wrqu, 0, sizeof(wrqu));
+ wrqu.data.length = strlen(buf);
+ wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
+}
+#else /* WIRELESS_EXT >= 15 */
+static inline void ieee80211_michael_mic_failure(struct net_device *dev,
+ struct ieee80211_hdr *hdr,
+ int keyidx)
+{
+}
+#endif /* WIRELESS_EXT >= 15 */
+
+static int ieee80211_michael_mic_verify(struct sk_buff *skb, int keyidx,
+ int hdr_len, void *priv)
+{
+ struct ieee80211_tkip_data *tkey = priv;
+ u8 mic[8];
+
+ if (!tkey->key_set)
+ return -1;
+
+ michael_mic_hdr(skb, tkey->rx_hdr);
+ if (michael_mic(tkey, &tkey->key[24], tkey->rx_hdr,
+ skb->data + hdr_len, skb->len - 8 - hdr_len, mic))
+ return -1;
+ if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) {
+ struct ieee80211_hdr *hdr;
+ hdr = (struct ieee80211_hdr *)skb->data;
+ printk(KERN_DEBUG "%s: Michael MIC verification failed for "
+ "MSDU from " MAC_FMT " keyidx=%d\n",
+ skb->dev ? skb->dev->name : "N/A", MAC_ARG(hdr->addr2),
+ keyidx);
+ if (skb->dev)
+ ieee80211_michael_mic_failure(skb->dev, hdr, keyidx);
+ tkey->dot11RSNAStatsTKIPLocalMICFailures++;
+ return -1;
+ }
+
+ /* Update TSC counters for RX now that the packet verification has
+ * completed. */
+ tkey->rx_iv32 = tkey->rx_iv32_new;
+ tkey->rx_iv16 = tkey->rx_iv16_new;
+
+ skb_trim(skb, skb->len - 8);
+
+ return 0;
+}
+
+static int ieee80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct ieee80211_tkip_data *tkey = priv;
+ int keyidx;
+ struct crypto_tfm *tfm = tkey->tfm_michael;
+ struct crypto_tfm *tfm2 = tkey->tfm_arc4;
+
+ keyidx = tkey->key_idx;
+ memset(tkey, 0, sizeof(*tkey));
+ tkey->key_idx = keyidx;
+ tkey->tfm_michael = tfm;
+ tkey->tfm_arc4 = tfm2;
+ if (len == TKIP_KEY_LEN) {
+ memcpy(tkey->key, key, TKIP_KEY_LEN);
+ tkey->key_set = 1;
+ tkey->tx_iv16 = 1; /* TSC is initialized to 1 */
+ if (seq) {
+ tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) |
+ (seq[3] << 8) | seq[2];
+ tkey->rx_iv16 = (seq[1] << 8) | seq[0];
+ }
+ } else if (len == 0)
+ tkey->key_set = 0;
+ else
+ return -1;
+
+ return 0;
+}
+
+static int ieee80211_tkip_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct ieee80211_tkip_data *tkey = priv;
+
+ if (len < TKIP_KEY_LEN)
+ return -1;
+
+ if (!tkey->key_set)
+ return 0;
+ memcpy(key, tkey->key, TKIP_KEY_LEN);
+
+ if (seq) {
+ /* Return the sequence number of the last transmitted frame. */
+ u16 iv16 = tkey->tx_iv16;
+ u32 iv32 = tkey->tx_iv32;
+ if (iv16 == 0)
+ iv32--;
+ iv16--;
+ seq[0] = tkey->tx_iv16;
+ seq[1] = tkey->tx_iv16 >> 8;
+ seq[2] = tkey->tx_iv32;
+ seq[3] = tkey->tx_iv32 >> 8;
+ seq[4] = tkey->tx_iv32 >> 16;
+ seq[5] = tkey->tx_iv32 >> 24;
+ }
+
+ return TKIP_KEY_LEN;
+}
+
+static char *ieee80211_tkip_print_stats(char *p, void *priv)
+{
+ struct ieee80211_tkip_data *tkip = priv;
+ p += sprintf(p, "key[%d] alg=TKIP key_set=%d "
+ "tx_pn=%02x%02x%02x%02x%02x%02x "
+ "rx_pn=%02x%02x%02x%02x%02x%02x "
+ "replays=%d icv_errors=%d local_mic_failures=%d\n",
+ tkip->key_idx, tkip->key_set,
+ (tkip->tx_iv32 >> 24) & 0xff,
+ (tkip->tx_iv32 >> 16) & 0xff,
+ (tkip->tx_iv32 >> 8) & 0xff,
+ tkip->tx_iv32 & 0xff,
+ (tkip->tx_iv16 >> 8) & 0xff,
+ tkip->tx_iv16 & 0xff,
+ (tkip->rx_iv32 >> 24) & 0xff,
+ (tkip->rx_iv32 >> 16) & 0xff,
+ (tkip->rx_iv32 >> 8) & 0xff,
+ tkip->rx_iv32 & 0xff,
+ (tkip->rx_iv16 >> 8) & 0xff,
+ tkip->rx_iv16 & 0xff,
+ tkip->dot11RSNAStatsTKIPReplays,
+ tkip->dot11RSNAStatsTKIPICVErrors,
+ tkip->dot11RSNAStatsTKIPLocalMICFailures);
+ return p;
+}
+
+static struct ieee80211_crypto_ops ieee80211_crypt_tkip = {
+ .name = "TKIP",
+ .init = ieee80211_tkip_init,
+ .deinit = ieee80211_tkip_deinit,
+ .encrypt_mpdu = ieee80211_tkip_encrypt,
+ .decrypt_mpdu = ieee80211_tkip_decrypt,
+ .encrypt_msdu = ieee80211_michael_mic_add,
+ .decrypt_msdu = ieee80211_michael_mic_verify,
+ .set_key = ieee80211_tkip_set_key,
+ .get_key = ieee80211_tkip_get_key,
+ .print_stats = ieee80211_tkip_print_stats,
+ .extra_prefix_len = 4 + 4, /* IV + ExtIV */
+ .extra_postfix_len = 8 + 4, /* MIC + ICV */
+ .owner = THIS_MODULE,
+};
+
+static int __init ieee80211_crypto_tkip_init(void)
+{
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_tkip);
+}
+
+static void __exit ieee80211_crypto_tkip_exit(void)
+{
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_tkip);
+}
+
+module_init(ieee80211_crypto_tkip_init);
+module_exit(ieee80211_crypto_tkip_exit);
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
new file mode 100644
index 0000000..b4d2514
--- /dev/null
+++ b/net/ieee80211/ieee80211_crypt_wep.c
@@ -0,0 +1,258 @@
+/*
+ * Host AP crypt: host-based WEP encryption implementation for Host AP driver
+ *
+ * Copyright (c) 2002-2004, Jouni Malinen <jkmaline@cc.hut.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/skbuff.h>
+#include <asm/string.h>
+
+#include <net/ieee80211.h>
+
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+#include <linux/crc32.h>
+
+MODULE_AUTHOR("Jouni Malinen");
+MODULE_DESCRIPTION("Host AP crypt: WEP");
+MODULE_LICENSE("GPL");
+
+struct prism2_wep_data {
+ u32 iv;
+#define WEP_KEY_LEN 13
+ u8 key[WEP_KEY_LEN + 1];
+ u8 key_len;
+ u8 key_idx;
+ struct crypto_tfm *tfm;
+};
+
+static void *prism2_wep_init(int keyidx)
+{
+ struct prism2_wep_data *priv;
+
+ priv = kmalloc(sizeof(*priv), GFP_ATOMIC);
+ if (priv == NULL)
+ goto fail;
+ memset(priv, 0, sizeof(*priv));
+ priv->key_idx = keyidx;
+
+ priv->tfm = crypto_alloc_tfm("arc4", 0);
+ if (priv->tfm == NULL) {
+ printk(KERN_DEBUG "ieee80211_crypt_wep: could not allocate "
+ "crypto API arc4\n");
+ goto fail;
+ }
+
+ /* start WEP IV from a random value */
+ get_random_bytes(&priv->iv, 4);
+
+ return priv;
+
+ fail:
+ if (priv) {
+ if (priv->tfm)
+ crypto_free_tfm(priv->tfm);
+ kfree(priv);
+ }
+ return NULL;
+}
+
+static void prism2_wep_deinit(void *priv)
+{
+ struct prism2_wep_data *_priv = priv;
+ if (_priv && _priv->tfm)
+ crypto_free_tfm(_priv->tfm);
+ kfree(priv);
+}
+
+/* Perform WEP encryption on given skb that has at least 4 bytes of headroom
+ * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted,
+ * so the payload length increases with 8 bytes.
+ *
+ * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data))
+ */
+static int prism2_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct prism2_wep_data *wep = priv;
+ u32 crc, klen, len;
+ u8 key[WEP_KEY_LEN + 3];
+ u8 *pos, *icv;
+ struct scatterlist sg;
+
+ if (skb_headroom(skb) < 4 || skb_tailroom(skb) < 4 ||
+ skb->len < hdr_len)
+ return -1;
+
+ len = skb->len - hdr_len;
+ pos = skb_push(skb, 4);
+ memmove(pos, pos + 4, hdr_len);
+ pos += hdr_len;
+
+ klen = 3 + wep->key_len;
+
+ wep->iv++;
+
+ /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key
+ * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N)
+ * can be used to speedup attacks, so avoid using them. */
+ if ((wep->iv & 0xff00) == 0xff00) {
+ u8 B = (wep->iv >> 16) & 0xff;
+ if (B >= 3 && B < klen)
+ wep->iv += 0x0100;
+ }
+
+ /* Prepend 24-bit IV to RC4 key and TX frame */
+ *pos++ = key[0] = (wep->iv >> 16) & 0xff;
+ *pos++ = key[1] = (wep->iv >> 8) & 0xff;
+ *pos++ = key[2] = wep->iv & 0xff;
+ *pos++ = wep->key_idx << 6;
+
+ /* Copy rest of the WEP key (the secret part) */
+ memcpy(key + 3, wep->key, wep->key_len);
+
+ /* Append little-endian CRC32 and encrypt it to produce ICV */
+ crc = ~crc32_le(~0, pos, len);
+ icv = skb_put(skb, 4);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+
+ crypto_cipher_setkey(wep->tfm, key, klen);
+ sg.page = virt_to_page(pos);
+ sg.offset = offset_in_page(pos);
+ sg.length = len + 4;
+ crypto_cipher_encrypt(wep->tfm, &sg, &sg, len + 4);
+
+ return 0;
+}
+
+/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
+ * the frame: IV (4 bytes), encrypted payload (including SNAP header),
+ * ICV (4 bytes). len includes both IV and ICV.
+ *
+ * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on
+ * failure. If frame is OK, IV and ICV will be removed.
+ */
+static int prism2_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
+{
+ struct prism2_wep_data *wep = priv;
+ u32 crc, klen, plen;
+ u8 key[WEP_KEY_LEN + 3];
+ u8 keyidx, *pos, icv[4];
+ struct scatterlist sg;
+
+ if (skb->len < hdr_len + 8)
+ return -1;
+
+ pos = skb->data + hdr_len;
+ key[0] = *pos++;
+ key[1] = *pos++;
+ key[2] = *pos++;
+ keyidx = *pos++ >> 6;
+ if (keyidx != wep->key_idx)
+ return -1;
+
+ klen = 3 + wep->key_len;
+
+ /* Copy rest of the WEP key (the secret part) */
+ memcpy(key + 3, wep->key, wep->key_len);
+
+ /* Apply RC4 to data and compute CRC32 over decrypted data */
+ plen = skb->len - hdr_len - 8;
+
+ crypto_cipher_setkey(wep->tfm, key, klen);
+ sg.page = virt_to_page(pos);
+ sg.offset = offset_in_page(pos);
+ sg.length = plen + 4;
+ crypto_cipher_decrypt(wep->tfm, &sg, &sg, plen + 4);
+
+ crc = ~crc32_le(~0, pos, plen);
+ icv[0] = crc;
+ icv[1] = crc >> 8;
+ icv[2] = crc >> 16;
+ icv[3] = crc >> 24;
+ if (memcmp(icv, pos + plen, 4) != 0) {
+ /* ICV mismatch - drop frame */
+ return -2;
+ }
+
+ /* Remove IV and ICV */
+ memmove(skb->data + 4, skb->data, hdr_len);
+ skb_pull(skb, 4);
+ skb_trim(skb, skb->len - 4);
+
+ return 0;
+}
+
+static int prism2_wep_set_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct prism2_wep_data *wep = priv;
+
+ if (len < 0 || len > WEP_KEY_LEN)
+ return -1;
+
+ memcpy(wep->key, key, len);
+ wep->key_len = len;
+
+ return 0;
+}
+
+static int prism2_wep_get_key(void *key, int len, u8 * seq, void *priv)
+{
+ struct prism2_wep_data *wep = priv;
+
+ if (len < wep->key_len)
+ return -1;
+
+ memcpy(key, wep->key, wep->key_len);
+
+ return wep->key_len;
+}
+
+static char *prism2_wep_print_stats(char *p, void *priv)
+{
+ struct prism2_wep_data *wep = priv;
+ p += sprintf(p, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len);
+ return p;
+}
+
+static struct ieee80211_crypto_ops ieee80211_crypt_wep = {
+ .name = "WEP",
+ .init = prism2_wep_init,
+ .deinit = prism2_wep_deinit,
+ .encrypt_mpdu = prism2_wep_encrypt,
+ .decrypt_mpdu = prism2_wep_decrypt,
+ .encrypt_msdu = NULL,
+ .decrypt_msdu = NULL,
+ .set_key = prism2_wep_set_key,
+ .get_key = prism2_wep_get_key,
+ .print_stats = prism2_wep_print_stats,
+ .extra_prefix_len = 4, /* IV */
+ .extra_postfix_len = 4, /* ICV */
+ .owner = THIS_MODULE,
+};
+
+static int __init ieee80211_crypto_wep_init(void)
+{
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_wep);
+}
+
+static void __exit ieee80211_crypto_wep_exit(void)
+{
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_wep);
+}
+
+module_init(ieee80211_crypto_wep_init);
+module_exit(ieee80211_crypto_wep_exit);
diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
new file mode 100644
index 0000000..03a4734
--- /dev/null
+++ b/net/ieee80211/ieee80211_module.c
@@ -0,0 +1,297 @@
+/*******************************************************************************
+
+ Copyright(c) 2004 Intel Corporation. All rights reserved.
+
+ Portions of this file are based on the WEP enablement code provided by the
+ Host AP project hostap-drivers v0.1.3
+ Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ <jkmaline@cc.hut.fi>
+ Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+#include <net/arp.h>
+
+#include <net/ieee80211.h>
+
+MODULE_DESCRIPTION("802.11 data/management/control stack");
+MODULE_AUTHOR
+ ("Copyright (C) 2004 Intel Corporation <jketreno@linux.intel.com>");
+MODULE_LICENSE("GPL");
+
+#define DRV_NAME "ieee80211"
+
+static inline int ieee80211_networks_allocate(struct ieee80211_device *ieee)
+{
+ if (ieee->networks)
+ return 0;
+
+ ieee->networks =
+ kmalloc(MAX_NETWORK_COUNT * sizeof(struct ieee80211_network),
+ GFP_KERNEL);
+ if (!ieee->networks) {
+ printk(KERN_WARNING "%s: Out of memory allocating beacons\n",
+ ieee->dev->name);
+ return -ENOMEM;
+ }
+
+ memset(ieee->networks, 0,
+ MAX_NETWORK_COUNT * sizeof(struct ieee80211_network));
+
+ return 0;
+}
+
+static inline void ieee80211_networks_free(struct ieee80211_device *ieee)
+{
+ if (!ieee->networks)
+ return;
+ kfree(ieee->networks);
+ ieee->networks = NULL;
+}
+
+static inline void ieee80211_networks_initialize(struct ieee80211_device *ieee)
+{
+ int i;
+
+ INIT_LIST_HEAD(&ieee->network_free_list);
+ INIT_LIST_HEAD(&ieee->network_list);
+ for (i = 0; i < MAX_NETWORK_COUNT; i++)
+ list_add_tail(&ieee->networks[i].list,
+ &ieee->network_free_list);
+}
+
+struct net_device *alloc_ieee80211(int sizeof_priv)
+{
+ struct ieee80211_device *ieee;
+ struct net_device *dev;
+ int err;
+
+ IEEE80211_DEBUG_INFO("Initializing...\n");
+
+ dev = alloc_etherdev(sizeof(struct ieee80211_device) + sizeof_priv);
+ if (!dev) {
+ IEEE80211_ERROR("Unable to network device.\n");
+ goto failed;
+ }
+ ieee = netdev_priv(dev);
+ dev->hard_start_xmit = ieee80211_xmit;
+
+ ieee->dev = dev;
+
+ err = ieee80211_networks_allocate(ieee);
+ if (err) {
+ IEEE80211_ERROR("Unable to allocate beacon storage: %d\n", err);
+ goto failed;
+ }
+ ieee80211_networks_initialize(ieee);
+
+ /* Default fragmentation threshold is maximum payload size */
+ ieee->fts = DEFAULT_FTS;
+ ieee->scan_age = DEFAULT_MAX_SCAN_AGE;
+ ieee->open_wep = 1;
+
+ /* Default to enabling full open WEP with host based encrypt/decrypt */
+ ieee->host_encrypt = 1;
+ ieee->host_decrypt = 1;
+ ieee->ieee802_1x = 1; /* Default to supporting 802.1x */
+
+ INIT_LIST_HEAD(&ieee->crypt_deinit_list);
+ init_timer(&ieee->crypt_deinit_timer);
+ ieee->crypt_deinit_timer.data = (unsigned long)ieee;
+ ieee->crypt_deinit_timer.function = ieee80211_crypt_deinit_handler;
+
+ spin_lock_init(&ieee->lock);
+
+ ieee->wpa_enabled = 0;
+ ieee->tkip_countermeasures = 0;
+ ieee->drop_unencrypted = 0;
+ ieee->privacy_invoked = 0;
+ ieee->ieee802_1x = 1;
+
+ return dev;
+
+ failed:
+ if (dev)
+ free_netdev(dev);
+ return NULL;
+}
+
+void free_ieee80211(struct net_device *dev)
+{
+ struct ieee80211_device *ieee = netdev_priv(dev);
+
+ int i;
+
+ del_timer_sync(&ieee->crypt_deinit_timer);
+ ieee80211_crypt_deinit_entries(ieee, 1);
+
+ for (i = 0; i < WEP_KEYS; i++) {
+ struct ieee80211_crypt_data *crypt = ieee->crypt[i];
+ if (crypt) {
+ if (crypt->ops) {
+ crypt->ops->deinit(crypt->priv);
+ module_put(crypt->ops->owner);
+ }
+ kfree(crypt);
+ ieee->crypt[i] = NULL;
+ }
+ }
+
+ ieee80211_networks_free(ieee);
+ free_netdev(dev);
+}
+
+#ifdef CONFIG_IEEE80211_DEBUG
+
+static int debug = 0;
+u32 ieee80211_debug_level = 0;
+struct proc_dir_entry *ieee80211_proc = NULL;
+
+static int show_debug_level(char *page, char **start, off_t offset,
+ int count, int *eof, void *data)
+{
+ return snprintf(page, count, "0x%08X\n", ieee80211_debug_level);
+}
+
+static int store_debug_level(struct file *file, const char __user * buffer,
+ unsigned long count, void *data)
+{
+ char buf[] = "0x00000000";
+ char *p = (char *)buf;
+ unsigned long val;
+
+ if (count > sizeof(buf) - 1)
+ count = sizeof(buf) - 1;
+
+ if (copy_from_user(buf, buffer, count))
+ return count;
+ buf[count] = 0;
+ /*
+ * what a FPOS... What, sscanf(buf, "%i", &val) would be too
+ * scary?
+ */
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ p++;
+ if (p[0] == 'x' || p[0] == 'X')
+ p++;
+ val = simple_strtoul(p, &p, 16);
+ } else
+ val = simple_strtoul(p, &p, 10);
+ if (p == buf)
+ printk(KERN_INFO DRV_NAME
+ ": %s is not in hex or decimal form.\n", buf);
+ else
+ ieee80211_debug_level = val;
+
+ return strlen(buf);
+}
+
+static int __init ieee80211_init(void)
+{
+ struct proc_dir_entry *e;
+
+ ieee80211_debug_level = debug;
+ ieee80211_proc = create_proc_entry(DRV_NAME, S_IFDIR, proc_net);
+ if (ieee80211_proc == NULL) {
+ IEEE80211_ERROR("Unable to create " DRV_NAME
+ " proc directory\n");
+ return -EIO;
+ }
+ e = create_proc_entry("debug_level", S_IFREG | S_IRUGO | S_IWUSR,
+ ieee80211_proc);
+ if (!e) {
+ remove_proc_entry(DRV_NAME, proc_net);
+ ieee80211_proc = NULL;
+ return -EIO;
+ }
+ e->read_proc = show_debug_level;
+ e->write_proc = store_debug_level;
+ e->data = NULL;
+
+ return 0;
+}
+
+static void __exit ieee80211_exit(void)
+{
+ if (ieee80211_proc) {
+ remove_proc_entry("debug_level", ieee80211_proc);
+ remove_proc_entry(DRV_NAME, proc_net);
+ ieee80211_proc = NULL;
+ }
+}
+
+#include <linux/moduleparam.h>
+module_param(debug, int, 0444);
+MODULE_PARM_DESC(debug, "debug output mask");
+
+module_exit(ieee80211_exit);
+module_init(ieee80211_init);
+#endif
+
+const char *escape_essid(const char *essid, u8 essid_len)
+{
+ static char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
+ const char *s = essid;
+ char *d = escaped;
+
+ if (ieee80211_is_empty_essid(essid, essid_len)) {
+ memcpy(escaped, "<hidden>", sizeof("<hidden>"));
+ return escaped;
+ }
+
+ essid_len = min(essid_len, (u8) IW_ESSID_MAX_SIZE);
+ while (essid_len--) {
+ if (*s == '\0') {
+ *d++ = '\\';
+ *d++ = '0';
+ s++;
+ } else {
+ *d++ = *s++;
+ }
+ }
+ *d = '\0';
+ return escaped;
+}
+
+EXPORT_SYMBOL(alloc_ieee80211);
+EXPORT_SYMBOL(free_ieee80211);
+EXPORT_SYMBOL(escape_essid);
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
new file mode 100644
index 0000000..f7dcd85
--- /dev/null
+++ b/net/ieee80211/ieee80211_rx.c
@@ -0,0 +1,1193 @@
+/*
+ * Original code based Host AP (software wireless LAN access point) driver
+ * for Intersil Prism2/2.5/3 - hostap.o module, common routines
+ *
+ * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ * <jkmaline@cc.hut.fi>
+ * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+ * Copyright (c) 2004, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. See README and COPYING for
+ * more details.
+ */
+
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+
+#include <net/ieee80211.h>
+
+static inline void ieee80211_monitor_rx(struct ieee80211_device *ieee,
+ struct sk_buff *skb,
+ struct ieee80211_rx_stats *rx_stats)
+{
+ struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+ u16 fc = le16_to_cpu(hdr->frame_ctl);
+
+ skb->dev = ieee->dev;
+ skb->mac.raw = skb->data;
+ skb_pull(skb, ieee80211_get_hdrlen(fc));
+ skb->pkt_type = PACKET_OTHERHOST;
+ skb->protocol = __constant_htons(ETH_P_80211_RAW);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ netif_rx(skb);
+}
+
+/* Called only as a tasklet (software IRQ) */
+static struct ieee80211_frag_entry *ieee80211_frag_cache_find(struct
+ ieee80211_device
+ *ieee,
+ unsigned int seq,
+ unsigned int frag,
+ u8 * src,
+ u8 * dst)
+{
+ struct ieee80211_frag_entry *entry;
+ int i;
+
+ for (i = 0; i < IEEE80211_FRAG_CACHE_LEN; i++) {
+ entry = &ieee->frag_cache[i];
+ if (entry->skb != NULL &&
+ time_after(jiffies, entry->first_frag_time + 2 * HZ)) {
+ IEEE80211_DEBUG_FRAG("expiring fragment cache entry "
+ "seq=%u last_frag=%u\n",
+ entry->seq, entry->last_frag);
+ dev_kfree_skb_any(entry->skb);
+ entry->skb = NULL;
+ }
+
+ if (entry->skb != NULL && entry->seq == seq &&
+ (entry->last_frag + 1 == frag || frag == -1) &&
+ memcmp(entry->src_addr, src, ETH_ALEN) == 0 &&
+ memcmp(entry->dst_addr, dst, ETH_ALEN) == 0)
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* Called only as a tasklet (software IRQ) */
+static struct sk_buff *ieee80211_frag_cache_get(struct ieee80211_device *ieee,
+ struct ieee80211_hdr *hdr)
+{
+ struct sk_buff *skb = NULL;
+ u16 sc;
+ unsigned int frag, seq;
+ struct ieee80211_frag_entry *entry;
+
+ sc = le16_to_cpu(hdr->seq_ctl);
+ frag = WLAN_GET_SEQ_FRAG(sc);
+ seq = WLAN_GET_SEQ_SEQ(sc);
+
+ if (frag == 0) {
+ /* Reserve enough space to fit maximum frame length */
+ skb = dev_alloc_skb(ieee->dev->mtu +
+ sizeof(struct ieee80211_hdr) +
+ 8 /* LLC */ +
+ 2 /* alignment */ +
+ 8 /* WEP */ + ETH_ALEN /* WDS */ );
+ if (skb == NULL)
+ return NULL;
+
+ entry = &ieee->frag_cache[ieee->frag_next_idx];
+ ieee->frag_next_idx++;
+ if (ieee->frag_next_idx >= IEEE80211_FRAG_CACHE_LEN)
+ ieee->frag_next_idx = 0;
+
+ if (entry->skb != NULL)
+ dev_kfree_skb_any(entry->skb);
+
+ entry->first_frag_time = jiffies;
+ entry->seq = seq;
+ entry->last_frag = frag;
+ entry->skb = skb;
+ memcpy(entry->src_addr, hdr->addr2, ETH_ALEN);
+ memcpy(entry->dst_addr, hdr->addr1, ETH_ALEN);
+ } else {
+ /* received a fragment of a frame for which the head fragment
+ * should have already been received */
+ entry = ieee80211_frag_cache_find(ieee, seq, frag, hdr->addr2,
+ hdr->addr1);
+ if (entry != NULL) {
+ entry->last_frag = frag;
+ skb = entry->skb;
+ }
+ }
+
+ return skb;
+}
+
+/* Called only as a tasklet (software IRQ) */
+static int ieee80211_frag_cache_invalidate(struct ieee80211_device *ieee,
+ struct ieee80211_hdr *hdr)
+{
+ u16 sc;
+ unsigned int seq;
+ struct ieee80211_frag_entry *entry;
+
+ sc = le16_to_cpu(hdr->seq_ctl);
+ seq = WLAN_GET_SEQ_SEQ(sc);
+
+ entry = ieee80211_frag_cache_find(ieee, seq, -1, hdr->addr2,
+ hdr->addr1);
+
+ if (entry == NULL) {
+ IEEE80211_DEBUG_FRAG("could not invalidate fragment cache "
+ "entry (seq=%u)\n", seq);
+ return -1;
+ }
+
+ entry->skb = NULL;
+ return 0;
+}
+
+#ifdef NOT_YET
+/* ieee80211_rx_frame_mgtmt
+ *
+ * Responsible for handling management control frames
+ *
+ * Called by ieee80211_rx */
+static inline int
+ieee80211_rx_frame_mgmt(struct ieee80211_device *ieee, struct sk_buff *skb,
+ struct ieee80211_rx_stats *rx_stats, u16 type,
+ u16 stype)
+{
+ if (ieee->iw_mode == IW_MODE_MASTER) {
+ printk(KERN_DEBUG "%s: Master mode not yet suppported.\n",
+ ieee->dev->name);
+ return 0;
+/*
+ hostap_update_sta_ps(ieee, (struct hostap_ieee80211_hdr *)
+ skb->data);*/
+ }
+
+ if (ieee->hostapd && type == WLAN_FC_TYPE_MGMT) {
+ if (stype == WLAN_FC_STYPE_BEACON &&
+ ieee->iw_mode == IW_MODE_MASTER) {
+ struct sk_buff *skb2;
+ /* Process beacon frames also in kernel driver to
+ * update STA(AP) table statistics */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ hostap_rx(skb2->dev, skb2, rx_stats);
+ }
+
+ /* send management frames to the user space daemon for
+ * processing */
+ ieee->apdevstats.rx_packets++;
+ ieee->apdevstats.rx_bytes += skb->len;
+ prism2_rx_80211(ieee->apdev, skb, rx_stats, PRISM2_RX_MGMT);
+ return 0;
+ }
+
+ if (ieee->iw_mode == IW_MODE_MASTER) {
+ if (type != WLAN_FC_TYPE_MGMT && type != WLAN_FC_TYPE_CTRL) {
+ printk(KERN_DEBUG "%s: unknown management frame "
+ "(type=0x%02x, stype=0x%02x) dropped\n",
+ skb->dev->name, type, stype);
+ return -1;
+ }
+
+ hostap_rx(skb->dev, skb, rx_stats);
+ return 0;
+ }
+
+ printk(KERN_DEBUG "%s: hostap_rx_frame_mgmt: management frame "
+ "received in non-Host AP mode\n", skb->dev->name);
+ return -1;
+}
+#endif
+
+/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
+/* Ethernet-II snap header (RFC1042 for most EtherTypes) */
+static unsigned char rfc1042_header[] = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
+
+/* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */
+static unsigned char bridge_tunnel_header[] =
+ { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
+/* No encapsulation header if EtherType < 0x600 (=length) */
+
+/* Called by ieee80211_rx_frame_decrypt */
+static int ieee80211_is_eapol_frame(struct ieee80211_device *ieee,
+ struct sk_buff *skb)
+{
+ struct net_device *dev = ieee->dev;
+ u16 fc, ethertype;
+ struct ieee80211_hdr *hdr;
+ u8 *pos;
+
+ if (skb->len < 24)
+ return 0;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ fc = le16_to_cpu(hdr->frame_ctl);
+
+ /* check that the frame is unicast frame to us */
+ if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ IEEE80211_FCTL_TODS &&
+ memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) == 0 &&
+ memcmp(hdr->addr3, dev->dev_addr, ETH_ALEN) == 0) {
+ /* ToDS frame with own addr BSSID and DA */
+ } else if ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ IEEE80211_FCTL_FROMDS &&
+ memcmp(hdr->addr1, dev->dev_addr, ETH_ALEN) == 0) {
+ /* FromDS frame with own addr as DA */
+ } else
+ return 0;
+
+ if (skb->len < 24 + 8)
+ return 0;
+
+ /* check for port access entity Ethernet type */
+ pos = skb->data + 24;
+ ethertype = (pos[6] << 8) | pos[7];
+ if (ethertype == ETH_P_PAE)
+ return 1;
+
+ return 0;
+}
+
+/* Called only as a tasklet (software IRQ), by ieee80211_rx */
+static inline int
+ieee80211_rx_frame_decrypt(struct ieee80211_device *ieee, struct sk_buff *skb,
+ struct ieee80211_crypt_data *crypt)
+{
+ struct ieee80211_hdr *hdr;
+ int res, hdrlen;
+
+ if (crypt == NULL || crypt->ops->decrypt_mpdu == NULL)
+ return 0;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
+
+#ifdef CONFIG_IEEE80211_CRYPT_TKIP
+ if (ieee->tkip_countermeasures && strcmp(crypt->ops->name, "TKIP") == 0) {
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "%s: TKIP countermeasures: dropped "
+ "received packet from " MAC_FMT "\n",
+ ieee->dev->name, MAC_ARG(hdr->addr2));
+ }
+ return -1;
+ }
+#endif
+
+ atomic_inc(&crypt->refcnt);
+ res = crypt->ops->decrypt_mpdu(skb, hdrlen, crypt->priv);
+ atomic_dec(&crypt->refcnt);
+ if (res < 0) {
+ IEEE80211_DEBUG_DROP("decryption failed (SA=" MAC_FMT
+ ") res=%d\n", MAC_ARG(hdr->addr2), res);
+ if (res == -2)
+ IEEE80211_DEBUG_DROP("Decryption failed ICV "
+ "mismatch (key %d)\n",
+ skb->data[hdrlen + 3] >> 6);
+ ieee->ieee_stats.rx_discards_undecryptable++;
+ return -1;
+ }
+
+ return res;
+}
+
+/* Called only as a tasklet (software IRQ), by ieee80211_rx */
+static inline int
+ieee80211_rx_frame_decrypt_msdu(struct ieee80211_device *ieee,
+ struct sk_buff *skb, int keyidx,
+ struct ieee80211_crypt_data *crypt)
+{
+ struct ieee80211_hdr *hdr;
+ int res, hdrlen;
+
+ if (crypt == NULL || crypt->ops->decrypt_msdu == NULL)
+ return 0;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ hdrlen = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
+
+ atomic_inc(&crypt->refcnt);
+ res = crypt->ops->decrypt_msdu(skb, keyidx, hdrlen, crypt->priv);
+ atomic_dec(&crypt->refcnt);
+ if (res < 0) {
+ printk(KERN_DEBUG "%s: MSDU decryption/MIC verification failed"
+ " (SA=" MAC_FMT " keyidx=%d)\n",
+ ieee->dev->name, MAC_ARG(hdr->addr2), keyidx);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* All received frames are sent to this function. @skb contains the frame in
+ * IEEE 802.11 format, i.e., in the format it was sent over air.
+ * This function is called only as a tasklet (software IRQ). */
+int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
+ struct ieee80211_rx_stats *rx_stats)
+{
+ struct net_device *dev = ieee->dev;
+ struct ieee80211_hdr *hdr;
+ size_t hdrlen;
+ u16 fc, type, stype, sc;
+ struct net_device_stats *stats;
+ unsigned int frag;
+ u8 *payload;
+ u16 ethertype;
+#ifdef NOT_YET
+ struct net_device *wds = NULL;
+ struct sk_buff *skb2 = NULL;
+ struct net_device *wds = NULL;
+ int frame_authorized = 0;
+ int from_assoc_ap = 0;
+ void *sta = NULL;
+#endif
+ u8 dst[ETH_ALEN];
+ u8 src[ETH_ALEN];
+ struct ieee80211_crypt_data *crypt = NULL;
+ int keyidx = 0;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ stats = &ieee->stats;
+
+ if (skb->len < 10) {
+ printk(KERN_INFO "%s: SKB length < 10\n", dev->name);
+ goto rx_dropped;
+ }
+
+ fc = le16_to_cpu(hdr->frame_ctl);
+ type = WLAN_FC_GET_TYPE(fc);
+ stype = WLAN_FC_GET_STYPE(fc);
+ sc = le16_to_cpu(hdr->seq_ctl);
+ frag = WLAN_GET_SEQ_FRAG(sc);
+ hdrlen = ieee80211_get_hdrlen(fc);
+
+#ifdef NOT_YET
+#if WIRELESS_EXT > 15
+ /* Put this code here so that we avoid duplicating it in all
+ * Rx paths. - Jean II */
+#ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */
+ /* If spy monitoring on */
+ if (iface->spy_data.spy_number > 0) {
+ struct iw_quality wstats;
+ wstats.level = rx_stats->signal;
+ wstats.noise = rx_stats->noise;
+ wstats.updated = 6; /* No qual value */
+ /* Update spy records */
+ wireless_spy_update(dev, hdr->addr2, &wstats);
+ }
+#endif /* IW_WIRELESS_SPY */
+#endif /* WIRELESS_EXT > 15 */
+ hostap_update_rx_stats(local->ap, hdr, rx_stats);
+#endif
+
+#if WIRELESS_EXT > 15
+ if (ieee->iw_mode == IW_MODE_MONITOR) {
+ ieee80211_monitor_rx(ieee, skb, rx_stats);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ return 1;
+ }
+#endif
+
+ if (ieee->host_decrypt) {
+ int idx = 0;
+ if (skb->len >= hdrlen + 3)
+ idx = skb->data[hdrlen + 3] >> 6;
+ crypt = ieee->crypt[idx];
+#ifdef NOT_YET
+ sta = NULL;
+
+ /* Use station specific key to override default keys if the
+ * receiver address is a unicast address ("individual RA"). If
+ * bcrx_sta_key parameter is set, station specific key is used
+ * even with broad/multicast targets (this is against IEEE
+ * 802.11, but makes it easier to use different keys with
+ * stations that do not support WEP key mapping). */
+
+ if (!(hdr->addr1[0] & 0x01) || local->bcrx_sta_key)
+ (void)hostap_handle_sta_crypto(local, hdr, &crypt,
+ &sta);
+#endif
+
+ /* allow NULL decrypt to indicate an station specific override
+ * for default encryption */
+ if (crypt && (crypt->ops == NULL ||
+ crypt->ops->decrypt_mpdu == NULL))
+ crypt = NULL;
+
+ if (!crypt && (fc & IEEE80211_FCTL_PROTECTED)) {
+ /* This seems to be triggered by some (multicast?)
+ * frames from other than current BSS, so just drop the
+ * frames silently instead of filling system log with
+ * these reports. */
+ IEEE80211_DEBUG_DROP("Decryption failed (not set)"
+ " (SA=" MAC_FMT ")\n",
+ MAC_ARG(hdr->addr2));
+ ieee->ieee_stats.rx_discards_undecryptable++;
+ goto rx_dropped;
+ }
+ }
+#ifdef NOT_YET
+ if (type != WLAN_FC_TYPE_DATA) {
+ if (type == WLAN_FC_TYPE_MGMT && stype == WLAN_FC_STYPE_AUTH &&
+ fc & IEEE80211_FCTL_PROTECTED && ieee->host_decrypt &&
+ (keyidx = hostap_rx_frame_decrypt(ieee, skb, crypt)) < 0) {
+ printk(KERN_DEBUG "%s: failed to decrypt mgmt::auth "
+ "from " MAC_FMT "\n", dev->name,
+ MAC_ARG(hdr->addr2));
+ /* TODO: could inform hostapd about this so that it
+ * could send auth failure report */
+ goto rx_dropped;
+ }
+
+ if (ieee80211_rx_frame_mgmt(ieee, skb, rx_stats, type, stype))
+ goto rx_dropped;
+ else
+ goto rx_exit;
+ }
+#endif
+
+ /* Data frame - extract src/dst addresses */
+ if (skb->len < IEEE80211_3ADDR_LEN)
+ goto rx_dropped;
+
+ switch (fc & (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) {
+ case IEEE80211_FCTL_FROMDS:
+ memcpy(dst, hdr->addr1, ETH_ALEN);
+ memcpy(src, hdr->addr3, ETH_ALEN);
+ break;
+ case IEEE80211_FCTL_TODS:
+ memcpy(dst, hdr->addr3, ETH_ALEN);
+ memcpy(src, hdr->addr2, ETH_ALEN);
+ break;
+ case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS:
+ if (skb->len < IEEE80211_4ADDR_LEN)
+ goto rx_dropped;
+ memcpy(dst, hdr->addr3, ETH_ALEN);
+ memcpy(src, hdr->addr4, ETH_ALEN);
+ break;
+ case 0:
+ memcpy(dst, hdr->addr1, ETH_ALEN);
+ memcpy(src, hdr->addr2, ETH_ALEN);
+ break;
+ }
+
+#ifdef NOT_YET
+ if (hostap_rx_frame_wds(ieee, hdr, fc, &wds))
+ goto rx_dropped;
+ if (wds) {
+ skb->dev = dev = wds;
+ stats = hostap_get_stats(dev);
+ }
+
+ if (ieee->iw_mode == IW_MODE_MASTER && !wds &&
+ (fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ IEEE80211_FCTL_FROMDS && ieee->stadev
+ && memcmp(hdr->addr2, ieee->assoc_ap_addr, ETH_ALEN) == 0) {
+ /* Frame from BSSID of the AP for which we are a client */
+ skb->dev = dev = ieee->stadev;
+ stats = hostap_get_stats(dev);
+ from_assoc_ap = 1;
+ }
+#endif
+
+ dev->last_rx = jiffies;
+
+#ifdef NOT_YET
+ if ((ieee->iw_mode == IW_MODE_MASTER ||
+ ieee->iw_mode == IW_MODE_REPEAT) && !from_assoc_ap) {
+ switch (hostap_handle_sta_rx(ieee, dev, skb, rx_stats,
+ wds != NULL)) {
+ case AP_RX_CONTINUE_NOT_AUTHORIZED:
+ frame_authorized = 0;
+ break;
+ case AP_RX_CONTINUE:
+ frame_authorized = 1;
+ break;
+ case AP_RX_DROP:
+ goto rx_dropped;
+ case AP_RX_EXIT:
+ goto rx_exit;
+ }
+ }
+#endif
+
+ /* Nullfunc frames may have PS-bit set, so they must be passed to
+ * hostap_handle_sta_rx() before being dropped here. */
+ if (stype != IEEE80211_STYPE_DATA &&
+ stype != IEEE80211_STYPE_DATA_CFACK &&
+ stype != IEEE80211_STYPE_DATA_CFPOLL &&
+ stype != IEEE80211_STYPE_DATA_CFACKPOLL) {
+ if (stype != IEEE80211_STYPE_NULLFUNC)
+ IEEE80211_DEBUG_DROP("RX: dropped data frame "
+ "with no data (type=0x%02x, "
+ "subtype=0x%02x, len=%d)\n",
+ type, stype, skb->len);
+ goto rx_dropped;
+ }
+
+ /* skb: hdr + (possibly fragmented, possibly encrypted) payload */
+
+ if (ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
+ (keyidx = ieee80211_rx_frame_decrypt(ieee, skb, crypt)) < 0)
+ goto rx_dropped;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+
+ /* skb: hdr + (possibly fragmented) plaintext payload */
+ // PR: FIXME: hostap has additional conditions in the "if" below:
+ // ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
+ if ((frag != 0 || (fc & IEEE80211_FCTL_MOREFRAGS))) {
+ int flen;
+ struct sk_buff *frag_skb = ieee80211_frag_cache_get(ieee, hdr);
+ IEEE80211_DEBUG_FRAG("Rx Fragment received (%u)\n", frag);
+
+ if (!frag_skb) {
+ IEEE80211_DEBUG(IEEE80211_DL_RX | IEEE80211_DL_FRAG,
+ "Rx cannot get skb from fragment "
+ "cache (morefrag=%d seq=%u frag=%u)\n",
+ (fc & IEEE80211_FCTL_MOREFRAGS) != 0,
+ WLAN_GET_SEQ_SEQ(sc), frag);
+ goto rx_dropped;
+ }
+
+ flen = skb->len;
+ if (frag != 0)
+ flen -= hdrlen;
+
+ if (frag_skb->tail + flen > frag_skb->end) {
+ printk(KERN_WARNING "%s: host decrypted and "
+ "reassembled frame did not fit skb\n",
+ dev->name);
+ ieee80211_frag_cache_invalidate(ieee, hdr);
+ goto rx_dropped;
+ }
+
+ if (frag == 0) {
+ /* copy first fragment (including full headers) into
+ * beginning of the fragment cache skb */
+ memcpy(skb_put(frag_skb, flen), skb->data, flen);
+ } else {
+ /* append frame payload to the end of the fragment
+ * cache skb */
+ memcpy(skb_put(frag_skb, flen), skb->data + hdrlen,
+ flen);
+ }
+ dev_kfree_skb_any(skb);
+ skb = NULL;
+
+ if (fc & IEEE80211_FCTL_MOREFRAGS) {
+ /* more fragments expected - leave the skb in fragment
+ * cache for now; it will be delivered to upper layers
+ * after all fragments have been received */
+ goto rx_exit;
+ }
+
+ /* this was the last fragment and the frame will be
+ * delivered, so remove skb from fragment cache */
+ skb = frag_skb;
+ hdr = (struct ieee80211_hdr *)skb->data;
+ ieee80211_frag_cache_invalidate(ieee, hdr);
+ }
+
+ /* skb: hdr + (possible reassembled) full MSDU payload; possibly still
+ * encrypted/authenticated */
+ if (ieee->host_decrypt && (fc & IEEE80211_FCTL_PROTECTED) &&
+ ieee80211_rx_frame_decrypt_msdu(ieee, skb, keyidx, crypt))
+ goto rx_dropped;
+
+ hdr = (struct ieee80211_hdr *)skb->data;
+ if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep) {
+ if ( /*ieee->ieee802_1x && */
+ ieee80211_is_eapol_frame(ieee, skb)) {
+ /* pass unencrypted EAPOL frames even if encryption is
+ * configured */
+ } else {
+ IEEE80211_DEBUG_DROP("encryption configured, but RX "
+ "frame not encrypted (SA=" MAC_FMT
+ ")\n", MAC_ARG(hdr->addr2));
+ goto rx_dropped;
+ }
+ }
+
+ if (crypt && !(fc & IEEE80211_FCTL_PROTECTED) && !ieee->open_wep &&
+ !ieee80211_is_eapol_frame(ieee, skb)) {
+ IEEE80211_DEBUG_DROP("dropped unencrypted RX data "
+ "frame from " MAC_FMT
+ " (drop_unencrypted=1)\n",
+ MAC_ARG(hdr->addr2));
+ goto rx_dropped;
+ }
+
+ /* skb: hdr + (possible reassembled) full plaintext payload */
+
+ payload = skb->data + hdrlen;
+ ethertype = (payload[6] << 8) | payload[7];
+
+#ifdef NOT_YET
+ /* If IEEE 802.1X is used, check whether the port is authorized to send
+ * the received frame. */
+ if (ieee->ieee802_1x && ieee->iw_mode == IW_MODE_MASTER) {
+ if (ethertype == ETH_P_PAE) {
+ printk(KERN_DEBUG "%s: RX: IEEE 802.1X frame\n",
+ dev->name);
+ if (ieee->hostapd && ieee->apdev) {
+ /* Send IEEE 802.1X frames to the user
+ * space daemon for processing */
+ prism2_rx_80211(ieee->apdev, skb, rx_stats,
+ PRISM2_RX_MGMT);
+ ieee->apdevstats.rx_packets++;
+ ieee->apdevstats.rx_bytes += skb->len;
+ goto rx_exit;
+ }
+ } else if (!frame_authorized) {
+ printk(KERN_DEBUG "%s: dropped frame from "
+ "unauthorized port (IEEE 802.1X): "
+ "ethertype=0x%04x\n", dev->name, ethertype);
+ goto rx_dropped;
+ }
+ }
+#endif
+
+ /* convert hdr + possible LLC headers into Ethernet header */
+ if (skb->len - hdrlen >= 8 &&
+ ((memcmp(payload, rfc1042_header, SNAP_SIZE) == 0 &&
+ ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
+ memcmp(payload, bridge_tunnel_header, SNAP_SIZE) == 0)) {
+ /* remove RFC1042 or Bridge-Tunnel encapsulation and
+ * replace EtherType */
+ skb_pull(skb, hdrlen + SNAP_SIZE);
+ memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
+ } else {
+ u16 len;
+ /* Leave Ethernet header part of hdr and full payload */
+ skb_pull(skb, hdrlen);
+ len = htons(skb->len);
+ memcpy(skb_push(skb, 2), &len, 2);
+ memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
+ memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
+ }
+
+#ifdef NOT_YET
+ if (wds && ((fc & (IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) ==
+ IEEE80211_FCTL_TODS) && skb->len >= ETH_HLEN + ETH_ALEN) {
+ /* Non-standard frame: get addr4 from its bogus location after
+ * the payload */
+ memcpy(skb->data + ETH_ALEN,
+ skb->data + skb->len - ETH_ALEN, ETH_ALEN);
+ skb_trim(skb, skb->len - ETH_ALEN);
+ }
+#endif
+
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+
+#ifdef NOT_YET
+ if (ieee->iw_mode == IW_MODE_MASTER && !wds && ieee->ap->bridge_packets) {
+ if (dst[0] & 0x01) {
+ /* copy multicast frame both to the higher layers and
+ * to the wireless media */
+ ieee->ap->bridged_multicast++;
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2 == NULL)
+ printk(KERN_DEBUG "%s: skb_clone failed for "
+ "multicast frame\n", dev->name);
+ } else if (hostap_is_sta_assoc(ieee->ap, dst)) {
+ /* send frame directly to the associated STA using
+ * wireless media and not passing to higher layers */
+ ieee->ap->bridged_unicast++;
+ skb2 = skb;
+ skb = NULL;
+ }
+ }
+
+ if (skb2 != NULL) {
+ /* send to wireless media */
+ skb2->protocol = __constant_htons(ETH_P_802_3);
+ skb2->mac.raw = skb2->nh.raw = skb2->data;
+ /* skb2->nh.raw = skb2->data + ETH_HLEN; */
+ skb2->dev = dev;
+ dev_queue_xmit(skb2);
+ }
+#endif
+
+ if (skb) {
+ skb->protocol = eth_type_trans(skb, dev);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ skb->dev = dev;
+ skb->ip_summed = CHECKSUM_NONE; /* 802.11 crc not sufficient */
+ netif_rx(skb);
+ }
+
+ rx_exit:
+#ifdef NOT_YET
+ if (sta)
+ hostap_handle_sta_release(sta);
+#endif
+ return 1;
+
+ rx_dropped:
+ stats->rx_dropped++;
+
+ /* Returning 0 indicates to caller that we have not handled the SKB--
+ * so it is still allocated and can be used again by underlying
+ * hardware as a DMA target */
+ return 0;
+}
+
+#define MGMT_FRAME_FIXED_PART_LENGTH 0x24
+
+static inline int ieee80211_is_ofdm_rate(u8 rate)
+{
+ switch (rate & ~IEEE80211_BASIC_RATE_MASK) {
+ case IEEE80211_OFDM_RATE_6MB:
+ case IEEE80211_OFDM_RATE_9MB:
+ case IEEE80211_OFDM_RATE_12MB:
+ case IEEE80211_OFDM_RATE_18MB:
+ case IEEE80211_OFDM_RATE_24MB:
+ case IEEE80211_OFDM_RATE_36MB:
+ case IEEE80211_OFDM_RATE_48MB:
+ case IEEE80211_OFDM_RATE_54MB:
+ return 1;
+ }
+ return 0;
+}
+
+static inline int ieee80211_network_init(struct ieee80211_device *ieee,
+ struct ieee80211_probe_response
+ *beacon,
+ struct ieee80211_network *network,
+ struct ieee80211_rx_stats *stats)
+{
+#ifdef CONFIG_IEEE80211_DEBUG
+ char rates_str[64];
+ char *p;
+#endif
+ struct ieee80211_info_element *info_element;
+ u16 left;
+ u8 i;
+
+ /* Pull out fixed field data */
+ memcpy(network->bssid, beacon->header.addr3, ETH_ALEN);
+ network->capability = beacon->capability;
+ network->last_scanned = jiffies;
+ network->time_stamp[0] = beacon->time_stamp[0];
+ network->time_stamp[1] = beacon->time_stamp[1];
+ network->beacon_interval = beacon->beacon_interval;
+ /* Where to pull this? beacon->listen_interval; */
+ network->listen_interval = 0x0A;
+ network->rates_len = network->rates_ex_len = 0;
+ network->last_associate = 0;
+ network->ssid_len = 0;
+ network->flags = 0;
+ network->atim_window = 0;
+
+ if (stats->freq == IEEE80211_52GHZ_BAND) {
+ /* for A band (No DS info) */
+ network->channel = stats->received_channel;
+ } else
+ network->flags |= NETWORK_HAS_CCK;
+
+ network->wpa_ie_len = 0;
+ network->rsn_ie_len = 0;
+
+ info_element = &beacon->info_element;
+ left = stats->len - ((void *)info_element - (void *)beacon);
+ while (left >= sizeof(struct ieee80211_info_element_hdr)) {
+ if (sizeof(struct ieee80211_info_element_hdr) +
+ info_element->len > left) {
+ IEEE80211_DEBUG_SCAN
+ ("SCAN: parse failed: info_element->len + 2 > left : info_element->len+2=%Zd left=%d.\n",
+ info_element->len +
+ sizeof(struct ieee80211_info_element), left);
+ return 1;
+ }
+
+ switch (info_element->id) {
+ case MFIE_TYPE_SSID:
+ if (ieee80211_is_empty_essid(info_element->data,
+ info_element->len)) {
+ network->flags |= NETWORK_EMPTY_ESSID;
+ break;
+ }
+
+ network->ssid_len = min(info_element->len,
+ (u8) IW_ESSID_MAX_SIZE);
+ memcpy(network->ssid, info_element->data,
+ network->ssid_len);
+ if (network->ssid_len < IW_ESSID_MAX_SIZE)
+ memset(network->ssid + network->ssid_len, 0,
+ IW_ESSID_MAX_SIZE - network->ssid_len);
+
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_SSID: '%s' len=%d.\n",
+ network->ssid, network->ssid_len);
+ break;
+
+ case MFIE_TYPE_RATES:
+#ifdef CONFIG_IEEE80211_DEBUG
+ p = rates_str;
+#endif
+ network->rates_len =
+ min(info_element->len, MAX_RATES_LENGTH);
+ for (i = 0; i < network->rates_len; i++) {
+ network->rates[i] = info_element->data[i];
+#ifdef CONFIG_IEEE80211_DEBUG
+ p += snprintf(p,
+ sizeof(rates_str) - (p -
+ rates_str),
+ "%02X ", network->rates[i]);
+#endif
+ if (ieee80211_is_ofdm_rate
+ (info_element->data[i])) {
+ network->flags |= NETWORK_HAS_OFDM;
+ if (info_element->data[i] &
+ IEEE80211_BASIC_RATE_MASK)
+ network->flags &=
+ ~NETWORK_HAS_CCK;
+ }
+ }
+
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_RATES: '%s' (%d)\n",
+ rates_str, network->rates_len);
+ break;
+
+ case MFIE_TYPE_RATES_EX:
+#ifdef CONFIG_IEEE80211_DEBUG
+ p = rates_str;
+#endif
+ network->rates_ex_len =
+ min(info_element->len, MAX_RATES_EX_LENGTH);
+ for (i = 0; i < network->rates_ex_len; i++) {
+ network->rates_ex[i] = info_element->data[i];
+#ifdef CONFIG_IEEE80211_DEBUG
+ p += snprintf(p,
+ sizeof(rates_str) - (p -
+ rates_str),
+ "%02X ", network->rates[i]);
+#endif
+ if (ieee80211_is_ofdm_rate
+ (info_element->data[i])) {
+ network->flags |= NETWORK_HAS_OFDM;
+ if (info_element->data[i] &
+ IEEE80211_BASIC_RATE_MASK)
+ network->flags &=
+ ~NETWORK_HAS_CCK;
+ }
+ }
+
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_RATES_EX: '%s' (%d)\n",
+ rates_str, network->rates_ex_len);
+ break;
+
+ case MFIE_TYPE_DS_SET:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_DS_SET: %d\n",
+ info_element->data[0]);
+ if (stats->freq == IEEE80211_24GHZ_BAND)
+ network->channel = info_element->data[0];
+ break;
+
+ case MFIE_TYPE_FH_SET:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_FH_SET: ignored\n");
+ break;
+
+ case MFIE_TYPE_CF_SET:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_CF_SET: ignored\n");
+ break;
+
+ case MFIE_TYPE_TIM:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_TIM: ignored\n");
+ break;
+
+ case MFIE_TYPE_IBSS_SET:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_IBSS_SET: ignored\n");
+ break;
+
+ case MFIE_TYPE_CHALLENGE:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_CHALLENGE: ignored\n");
+ break;
+
+ case MFIE_TYPE_GENERIC:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_GENERIC: %d bytes\n",
+ info_element->len);
+ if (info_element->len >= 4 &&
+ info_element->data[0] == 0x00 &&
+ info_element->data[1] == 0x50 &&
+ info_element->data[2] == 0xf2 &&
+ info_element->data[3] == 0x01) {
+ network->wpa_ie_len = min(info_element->len + 2,
+ MAX_WPA_IE_LEN);
+ memcpy(network->wpa_ie, info_element,
+ network->wpa_ie_len);
+ }
+ break;
+
+ case MFIE_TYPE_RSN:
+ IEEE80211_DEBUG_SCAN("MFIE_TYPE_RSN: %d bytes\n",
+ info_element->len);
+ network->rsn_ie_len = min(info_element->len + 2,
+ MAX_WPA_IE_LEN);
+ memcpy(network->rsn_ie, info_element,
+ network->rsn_ie_len);
+ break;
+
+ default:
+ IEEE80211_DEBUG_SCAN("unsupported IE %d\n",
+ info_element->id);
+ break;
+ }
+
+ left -= sizeof(struct ieee80211_info_element_hdr) +
+ info_element->len;
+ info_element = (struct ieee80211_info_element *)
+ &info_element->data[info_element->len];
+ }
+
+ network->mode = 0;
+ if (stats->freq == IEEE80211_52GHZ_BAND)
+ network->mode = IEEE_A;
+ else {
+ if (network->flags & NETWORK_HAS_OFDM)
+ network->mode |= IEEE_G;
+ if (network->flags & NETWORK_HAS_CCK)
+ network->mode |= IEEE_B;
+ }
+
+ if (network->mode == 0) {
+ IEEE80211_DEBUG_SCAN("Filtered out '%s (" MAC_FMT ")' "
+ "network.\n",
+ escape_essid(network->ssid,
+ network->ssid_len),
+ MAC_ARG(network->bssid));
+ return 1;
+ }
+
+ if (ieee80211_is_empty_essid(network->ssid, network->ssid_len))
+ network->flags |= NETWORK_EMPTY_ESSID;
+
+ memcpy(&network->stats, stats, sizeof(network->stats));
+
+ return 0;
+}
+
+static inline int is_same_network(struct ieee80211_network *src,
+ struct ieee80211_network *dst)
+{
+ /* A network is only a duplicate if the channel, BSSID, and ESSID
+ * all match. We treat all <hidden> with the same BSSID and channel
+ * as one network */
+ return ((src->ssid_len == dst->ssid_len) &&
+ (src->channel == dst->channel) &&
+ !memcmp(src->bssid, dst->bssid, ETH_ALEN) &&
+ !memcmp(src->ssid, dst->ssid, src->ssid_len));
+}
+
+static inline void update_network(struct ieee80211_network *dst,
+ struct ieee80211_network *src)
+{
+ memcpy(&dst->stats, &src->stats, sizeof(struct ieee80211_rx_stats));
+ dst->capability = src->capability;
+ memcpy(dst->rates, src->rates, src->rates_len);
+ dst->rates_len = src->rates_len;
+ memcpy(dst->rates_ex, src->rates_ex, src->rates_ex_len);
+ dst->rates_ex_len = src->rates_ex_len;
+
+ dst->mode = src->mode;
+ dst->flags = src->flags;
+ dst->time_stamp[0] = src->time_stamp[0];
+ dst->time_stamp[1] = src->time_stamp[1];
+
+ dst->beacon_interval = src->beacon_interval;
+ dst->listen_interval = src->listen_interval;
+ dst->atim_window = src->atim_window;
+
+ memcpy(dst->wpa_ie, src->wpa_ie, src->wpa_ie_len);
+ dst->wpa_ie_len = src->wpa_ie_len;
+ memcpy(dst->rsn_ie, src->rsn_ie, src->rsn_ie_len);
+ dst->rsn_ie_len = src->rsn_ie_len;
+
+ dst->last_scanned = jiffies;
+ /* dst->last_associate is not overwritten */
+}
+
+static inline void ieee80211_process_probe_response(struct ieee80211_device
+ *ieee,
+ struct
+ ieee80211_probe_response
+ *beacon,
+ struct ieee80211_rx_stats
+ *stats)
+{
+ struct ieee80211_network network;
+ struct ieee80211_network *target;
+ struct ieee80211_network *oldest = NULL;
+#ifdef CONFIG_IEEE80211_DEBUG
+ struct ieee80211_info_element *info_element = &beacon->info_element;
+#endif
+ unsigned long flags;
+
+ IEEE80211_DEBUG_SCAN("'%s' (" MAC_FMT
+ "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n",
+ escape_essid(info_element->data,
+ info_element->len),
+ MAC_ARG(beacon->header.addr3),
+ (beacon->capability & (1 << 0xf)) ? '1' : '0',
+ (beacon->capability & (1 << 0xe)) ? '1' : '0',
+ (beacon->capability & (1 << 0xd)) ? '1' : '0',
+ (beacon->capability & (1 << 0xc)) ? '1' : '0',
+ (beacon->capability & (1 << 0xb)) ? '1' : '0',
+ (beacon->capability & (1 << 0xa)) ? '1' : '0',
+ (beacon->capability & (1 << 0x9)) ? '1' : '0',
+ (beacon->capability & (1 << 0x8)) ? '1' : '0',
+ (beacon->capability & (1 << 0x7)) ? '1' : '0',
+ (beacon->capability & (1 << 0x6)) ? '1' : '0',
+ (beacon->capability & (1 << 0x5)) ? '1' : '0',
+ (beacon->capability & (1 << 0x4)) ? '1' : '0',
+ (beacon->capability & (1 << 0x3)) ? '1' : '0',
+ (beacon->capability & (1 << 0x2)) ? '1' : '0',
+ (beacon->capability & (1 << 0x1)) ? '1' : '0',
+ (beacon->capability & (1 << 0x0)) ? '1' : '0');
+
+ if (ieee80211_network_init(ieee, beacon, &network, stats)) {
+ IEEE80211_DEBUG_SCAN("Dropped '%s' (" MAC_FMT ") via %s.\n",
+ escape_essid(info_element->data,
+ info_element->len),
+ MAC_ARG(beacon->header.addr3),
+ WLAN_FC_GET_STYPE(beacon->header.
+ frame_ctl) ==
+ IEEE80211_STYPE_PROBE_RESP ?
+ "PROBE RESPONSE" : "BEACON");
+ return;
+ }
+
+ /* The network parsed correctly -- so now we scan our known networks
+ * to see if we can find it in our list.
+ *
+ * NOTE: This search is definitely not optimized. Once its doing
+ * the "right thing" we'll optimize it for efficiency if
+ * necessary */
+
+ /* Search for this entry in the list and update it if it is
+ * already there. */
+
+ spin_lock_irqsave(&ieee->lock, flags);
+
+ list_for_each_entry(target, &ieee->network_list, list) {
+ if (is_same_network(target, &network))
+ break;
+
+ if ((oldest == NULL) ||
+ (target->last_scanned < oldest->last_scanned))
+ oldest = target;
+ }
+
+ /* If we didn't find a match, then get a new network slot to initialize
+ * with this beacon's information */
+ if (&target->list == &ieee->network_list) {
+ if (list_empty(&ieee->network_free_list)) {
+ /* If there are no more slots, expire the oldest */
+ list_del(&oldest->list);
+ target = oldest;
+ IEEE80211_DEBUG_SCAN("Expired '%s' (" MAC_FMT ") from "
+ "network list.\n",
+ escape_essid(target->ssid,
+ target->ssid_len),
+ MAC_ARG(target->bssid));
+ } else {
+ /* Otherwise just pull from the free list */
+ target = list_entry(ieee->network_free_list.next,
+ struct ieee80211_network, list);
+ list_del(ieee->network_free_list.next);
+ }
+
+#ifdef CONFIG_IEEE80211_DEBUG
+ IEEE80211_DEBUG_SCAN("Adding '%s' (" MAC_FMT ") via %s.\n",
+ escape_essid(network.ssid,
+ network.ssid_len),
+ MAC_ARG(network.bssid),
+ WLAN_FC_GET_STYPE(beacon->header.
+ frame_ctl) ==
+ IEEE80211_STYPE_PROBE_RESP ?
+ "PROBE RESPONSE" : "BEACON");
+#endif
+ memcpy(target, &network, sizeof(*target));
+ list_add_tail(&target->list, &ieee->network_list);
+ } else {
+ IEEE80211_DEBUG_SCAN("Updating '%s' (" MAC_FMT ") via %s.\n",
+ escape_essid(target->ssid,
+ target->ssid_len),
+ MAC_ARG(target->bssid),
+ WLAN_FC_GET_STYPE(beacon->header.
+ frame_ctl) ==
+ IEEE80211_STYPE_PROBE_RESP ?
+ "PROBE RESPONSE" : "BEACON");
+ update_network(target, &network);
+ }
+
+ spin_unlock_irqrestore(&ieee->lock, flags);
+}
+
+void ieee80211_rx_mgt(struct ieee80211_device *ieee,
+ struct ieee80211_hdr *header,
+ struct ieee80211_rx_stats *stats)
+{
+ switch (WLAN_FC_GET_STYPE(header->frame_ctl)) {
+ case IEEE80211_STYPE_ASSOC_RESP:
+ IEEE80211_DEBUG_MGMT("received ASSOCIATION RESPONSE (%d)\n",
+ WLAN_FC_GET_STYPE(header->frame_ctl));
+ break;
+
+ case IEEE80211_STYPE_REASSOC_RESP:
+ IEEE80211_DEBUG_MGMT("received REASSOCIATION RESPONSE (%d)\n",
+ WLAN_FC_GET_STYPE(header->frame_ctl));
+ break;
+
+ case IEEE80211_STYPE_PROBE_RESP:
+ IEEE80211_DEBUG_MGMT("received PROBE RESPONSE (%d)\n",
+ WLAN_FC_GET_STYPE(header->frame_ctl));
+ IEEE80211_DEBUG_SCAN("Probe response\n");
+ ieee80211_process_probe_response(ieee,
+ (struct
+ ieee80211_probe_response *)
+ header, stats);
+ break;
+
+ case IEEE80211_STYPE_BEACON:
+ IEEE80211_DEBUG_MGMT("received BEACON (%d)\n",
+ WLAN_FC_GET_STYPE(header->frame_ctl));
+ IEEE80211_DEBUG_SCAN("Beacon\n");
+ ieee80211_process_probe_response(ieee,
+ (struct
+ ieee80211_probe_response *)
+ header, stats);
+ break;
+
+ default:
+ IEEE80211_DEBUG_MGMT("received UNKNOWN (%d)\n",
+ WLAN_FC_GET_STYPE(header->frame_ctl));
+ IEEE80211_WARNING("%s: Unknown management packet: %d\n",
+ ieee->dev->name,
+ WLAN_FC_GET_STYPE(header->frame_ctl));
+ break;
+ }
+}
+
+EXPORT_SYMBOL(ieee80211_rx_mgt);
+EXPORT_SYMBOL(ieee80211_rx);
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
new file mode 100644
index 0000000..c9aaff3
--- /dev/null
+++ b/net/ieee80211/ieee80211_tx.c
@@ -0,0 +1,428 @@
+/******************************************************************************
+
+ Copyright(c) 2003 - 2004 Intel Corporation. All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+******************************************************************************/
+#include <linux/compiler.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/wireless.h>
+#include <linux/etherdevice.h>
+#include <asm/uaccess.h>
+
+#include <net/ieee80211.h>
+
+/*
+
+802.11 Data Frame
+
+ ,-------------------------------------------------------------------.
+Bytes | 2 | 2 | 6 | 6 | 6 | 2 | 0..2312 | 4 |
+ |------|------|---------|---------|---------|------|---------|------|
+Desc. | ctrl | dura | DA/RA | TA | SA | Sequ | Frame | fcs |
+ | | tion | (BSSID) | | | ence | data | |
+ `--------------------------------------------------| |------'
+Total: 28 non-data bytes `----.----'
+ |
+ .- 'Frame data' expands to <---------------------------'
+ |
+ V
+ ,---------------------------------------------------.
+Bytes | 1 | 1 | 1 | 3 | 2 | 0-2304 |
+ |------|------|---------|----------|------|---------|
+Desc. | SNAP | SNAP | Control |Eth Tunnel| Type | IP |
+ | DSAP | SSAP | | | | Packet |
+ | 0xAA | 0xAA |0x03 (UI)|0x00-00-F8| | |
+ `-----------------------------------------| |
+Total: 8 non-data bytes `----.----'
+ |
+ .- 'IP Packet' expands, if WEP enabled, to <--'
+ |
+ V
+ ,-----------------------.
+Bytes | 4 | 0-2296 | 4 |
+ |-----|-----------|-----|
+Desc. | IV | Encrypted | ICV |
+ | | IP Packet | |
+ `-----------------------'
+Total: 8 non-data bytes
+
+802.3 Ethernet Data Frame
+
+ ,-----------------------------------------.
+Bytes | 6 | 6 | 2 | Variable | 4 |
+ |-------|-------|------|-----------|------|
+Desc. | Dest. | Source| Type | IP Packet | fcs |
+ | MAC | MAC | | | |
+ `-----------------------------------------'
+Total: 18 non-data bytes
+
+In the event that fragmentation is required, the incoming payload is split into
+N parts of size ieee->fts. The first fragment contains the SNAP header and the
+remaining packets are just data.
+
+If encryption is enabled, each fragment payload size is reduced by enough space
+to add the prefix and postfix (IV and ICV totalling 8 bytes in the case of WEP)
+So if you have 1500 bytes of payload with ieee->fts set to 500 without
+encryption it will take 3 frames. With WEP it will take 4 frames as the
+payload of each frame is reduced to 492 bytes.
+
+* SKB visualization
+*
+* ,- skb->data
+* |
+* | ETHERNET HEADER ,-<-- PAYLOAD
+* | | 14 bytes from skb->data
+* | 2 bytes for Type --> ,T. | (sizeof ethhdr)
+* | | | |
+* |,-Dest.--. ,--Src.---. | | |
+* | 6 bytes| | 6 bytes | | | |
+* v | | | | | |
+* 0 | v 1 | v | v 2
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+* ^ | ^ | ^ |
+* | | | | | |
+* | | | | `T' <---- 2 bytes for Type
+* | | | |
+* | | '---SNAP--' <-------- 6 bytes for SNAP
+* | |
+* `-IV--' <-------------------- 4 bytes for IV (WEP)
+*
+* SNAP HEADER
+*
+*/
+
+static u8 P802_1H_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0xf8 };
+static u8 RFC1042_OUI[P80211_OUI_LEN] = { 0x00, 0x00, 0x00 };
+
+static inline int ieee80211_put_snap(u8 * data, u16 h_proto)
+{
+ struct ieee80211_snap_hdr *snap;
+ u8 *oui;
+
+ snap = (struct ieee80211_snap_hdr *)data;
+ snap->dsap = 0xaa;
+ snap->ssap = 0xaa;
+ snap->ctrl = 0x03;
+
+ if (h_proto == 0x8137 || h_proto == 0x80f3)
+ oui = P802_1H_OUI;
+ else
+ oui = RFC1042_OUI;
+ snap->oui[0] = oui[0];
+ snap->oui[1] = oui[1];
+ snap->oui[2] = oui[2];
+
+ *(u16 *) (data + SNAP_SIZE) = htons(h_proto);
+
+ return SNAP_SIZE + sizeof(u16);
+}
+
+static inline int ieee80211_encrypt_fragment(struct ieee80211_device *ieee,
+ struct sk_buff *frag, int hdr_len)
+{
+ struct ieee80211_crypt_data *crypt = ieee->crypt[ieee->tx_keyidx];
+ int res;
+
+#ifdef CONFIG_IEEE80211_CRYPT_TKIP
+ struct ieee80211_hdr *header;
+
+ if (ieee->tkip_countermeasures &&
+ crypt && crypt->ops && strcmp(crypt->ops->name, "TKIP") == 0) {
+ header = (struct ieee80211_hdr *)frag->data;
+ if (net_ratelimit()) {
+ printk(KERN_DEBUG "%s: TKIP countermeasures: dropped "
+ "TX packet to " MAC_FMT "\n",
+ ieee->dev->name, MAC_ARG(header->addr1));
+ }
+ return -1;
+ }
+#endif
+ /* To encrypt, frame format is:
+ * IV (4 bytes), clear payload (including SNAP), ICV (4 bytes) */
+
+ // PR: FIXME: Copied from hostap. Check fragmentation/MSDU/MPDU encryption.
+ /* Host-based IEEE 802.11 fragmentation for TX is not yet supported, so
+ * call both MSDU and MPDU encryption functions from here. */
+ atomic_inc(&crypt->refcnt);
+ res = 0;
+ if (crypt->ops->encrypt_msdu)
+ res = crypt->ops->encrypt_msdu(frag, hdr_len, crypt->priv);
+ if (res == 0 && crypt->ops->encrypt_mpdu)
+ res = crypt->ops->encrypt_mpdu(frag, hdr_len, crypt->priv);
+
+ atomic_dec(&crypt->refcnt);
+ if (res < 0) {
+ printk(KERN_INFO "%s: Encryption failed: len=%d.\n",
+ ieee->dev->name, frag->len);
+ ieee->ieee_stats.tx_discards++;
+ return -1;
+ }
+
+ return 0;
+}
+
+void ieee80211_txb_free(struct ieee80211_txb *txb)
+{
+ int i;
+ if (unlikely(!txb))
+ return;
+ for (i = 0; i < txb->nr_frags; i++)
+ if (txb->fragments[i])
+ dev_kfree_skb_any(txb->fragments[i]);
+ kfree(txb);
+}
+
+static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size,
+ int gfp_mask)
+{
+ struct ieee80211_txb *txb;
+ int i;
+ txb = kmalloc(sizeof(struct ieee80211_txb) + (sizeof(u8 *) * nr_frags),
+ gfp_mask);
+ if (!txb)
+ return NULL;
+
+ memset(txb, 0, sizeof(struct ieee80211_txb));
+ txb->nr_frags = nr_frags;
+ txb->frag_size = txb_size;
+
+ for (i = 0; i < nr_frags; i++) {
+ txb->fragments[i] = dev_alloc_skb(txb_size);
+ if (unlikely(!txb->fragments[i])) {
+ i--;
+ break;
+ }
+ }
+ if (unlikely(i != nr_frags)) {
+ while (i >= 0)
+ dev_kfree_skb_any(txb->fragments[i--]);
+ kfree(txb);
+ return NULL;
+ }
+ return txb;
+}
+
+/* SKBs are added to the ieee->tx_queue. */
+int ieee80211_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ieee80211_device *ieee = netdev_priv(dev);
+ struct ieee80211_txb *txb = NULL;
+ struct ieee80211_hdr *frag_hdr;
+ int i, bytes_per_frag, nr_frags, bytes_last_frag, frag_size;
+ unsigned long flags;
+ struct net_device_stats *stats = &ieee->stats;
+ int ether_type, encrypt;
+ int bytes, fc, hdr_len;
+ struct sk_buff *skb_frag;
+ struct ieee80211_hdr header = { /* Ensure zero initialized */
+ .duration_id = 0,
+ .seq_ctl = 0
+ };
+ u8 dest[ETH_ALEN], src[ETH_ALEN];
+
+ struct ieee80211_crypt_data *crypt;
+
+ spin_lock_irqsave(&ieee->lock, flags);
+
+ /* If there is no driver handler to take the TXB, dont' bother
+ * creating it... */
+ if (!ieee->hard_start_xmit) {
+ printk(KERN_WARNING "%s: No xmit handler.\n", ieee->dev->name);
+ goto success;
+ }
+
+ if (unlikely(skb->len < SNAP_SIZE + sizeof(u16))) {
+ printk(KERN_WARNING "%s: skb too small (%d).\n",
+ ieee->dev->name, skb->len);
+ goto success;
+ }
+
+ ether_type = ntohs(((struct ethhdr *)skb->data)->h_proto);
+
+ crypt = ieee->crypt[ieee->tx_keyidx];
+
+ encrypt = !(ether_type == ETH_P_PAE && ieee->ieee802_1x) &&
+ ieee->host_encrypt && crypt && crypt->ops;
+
+ if (!encrypt && ieee->ieee802_1x &&
+ ieee->drop_unencrypted && ether_type != ETH_P_PAE) {
+ stats->tx_dropped++;
+ goto success;
+ }
+
+ /* Save source and destination addresses */
+ memcpy(&dest, skb->data, ETH_ALEN);
+ memcpy(&src, skb->data + ETH_ALEN, ETH_ALEN);
+
+ /* Advance the SKB to the start of the payload */
+ skb_pull(skb, sizeof(struct ethhdr));
+
+ /* Determine total amount of storage required for TXB packets */
+ bytes = skb->len + SNAP_SIZE + sizeof(u16);
+
+ if (encrypt)
+ fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA |
+ IEEE80211_FCTL_PROTECTED;
+ else
+ fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA;
+
+ if (ieee->iw_mode == IW_MODE_INFRA) {
+ fc |= IEEE80211_FCTL_TODS;
+ /* To DS: Addr1 = BSSID, Addr2 = SA,
+ Addr3 = DA */
+ memcpy(&header.addr1, ieee->bssid, ETH_ALEN);
+ memcpy(&header.addr2, &src, ETH_ALEN);
+ memcpy(&header.addr3, &dest, ETH_ALEN);
+ } else if (ieee->iw_mode == IW_MODE_ADHOC) {
+ /* not From/To DS: Addr1 = DA, Addr2 = SA,
+ Addr3 = BSSID */
+ memcpy(&header.addr1, dest, ETH_ALEN);
+ memcpy(&header.addr2, src, ETH_ALEN);
+ memcpy(&header.addr3, ieee->bssid, ETH_ALEN);
+ }
+ header.frame_ctl = cpu_to_le16(fc);
+ hdr_len = IEEE80211_3ADDR_LEN;
+
+ /* Determine fragmentation size based on destination (multicast
+ * and broadcast are not fragmented) */
+ if (is_multicast_ether_addr(dest) || is_broadcast_ether_addr(dest))
+ frag_size = MAX_FRAG_THRESHOLD;
+ else
+ frag_size = ieee->fts;
+
+ /* Determine amount of payload per fragment. Regardless of if
+ * this stack is providing the full 802.11 header, one will
+ * eventually be affixed to this fragment -- so we must account for
+ * it when determining the amount of payload space. */
+ bytes_per_frag = frag_size - IEEE80211_3ADDR_LEN;
+ if (ieee->config &
+ (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
+ bytes_per_frag -= IEEE80211_FCS_LEN;
+
+ /* Each fragment may need to have room for encryptiong pre/postfix */
+ if (encrypt)
+ bytes_per_frag -= crypt->ops->extra_prefix_len +
+ crypt->ops->extra_postfix_len;
+
+ /* Number of fragments is the total bytes_per_frag /
+ * payload_per_fragment */
+ nr_frags = bytes / bytes_per_frag;
+ bytes_last_frag = bytes % bytes_per_frag;
+ if (bytes_last_frag)
+ nr_frags++;
+ else
+ bytes_last_frag = bytes_per_frag;
+
+ /* When we allocate the TXB we allocate enough space for the reserve
+ * and full fragment bytes (bytes_per_frag doesn't include prefix,
+ * postfix, header, FCS, etc.) */
+ txb = ieee80211_alloc_txb(nr_frags, frag_size, GFP_ATOMIC);
+ if (unlikely(!txb)) {
+ printk(KERN_WARNING "%s: Could not allocate TXB\n",
+ ieee->dev->name);
+ goto failed;
+ }
+ txb->encrypted = encrypt;
+ txb->payload_size = bytes;
+
+ for (i = 0; i < nr_frags; i++) {
+ skb_frag = txb->fragments[i];
+
+ if (encrypt)
+ skb_reserve(skb_frag, crypt->ops->extra_prefix_len);
+
+ frag_hdr = (struct ieee80211_hdr *)skb_put(skb_frag, hdr_len);
+ memcpy(frag_hdr, &header, hdr_len);
+
+ /* If this is not the last fragment, then add the MOREFRAGS
+ * bit to the frame control */
+ if (i != nr_frags - 1) {
+ frag_hdr->frame_ctl =
+ cpu_to_le16(fc | IEEE80211_FCTL_MOREFRAGS);
+ bytes = bytes_per_frag;
+ } else {
+ /* The last fragment takes the remaining length */
+ bytes = bytes_last_frag;
+ }
+
+ /* Put a SNAP header on the first fragment */
+ if (i == 0) {
+ ieee80211_put_snap(skb_put
+ (skb_frag, SNAP_SIZE + sizeof(u16)),
+ ether_type);
+ bytes -= SNAP_SIZE + sizeof(u16);
+ }
+
+ memcpy(skb_put(skb_frag, bytes), skb->data, bytes);
+
+ /* Advance the SKB... */
+ skb_pull(skb, bytes);
+
+ /* Encryption routine will move the header forward in order
+ * to insert the IV between the header and the payload */
+ if (encrypt)
+ ieee80211_encrypt_fragment(ieee, skb_frag, hdr_len);
+ if (ieee->config &
+ (CFG_IEEE80211_COMPUTE_FCS | CFG_IEEE80211_RESERVE_FCS))
+ skb_put(skb_frag, 4);
+ }
+
+ success:
+ spin_unlock_irqrestore(&ieee->lock, flags);
+
+ dev_kfree_skb_any(skb);
+
+ if (txb) {
+ if ((*ieee->hard_start_xmit) (txb, dev) == 0) {
+ stats->tx_packets++;
+ stats->tx_bytes += txb->payload_size;
+ return 0;
+ }
+ ieee80211_txb_free(txb);
+ }
+
+ return 0;
+
+ failed:
+ spin_unlock_irqrestore(&ieee->lock, flags);
+ netif_stop_queue(dev);
+ stats->tx_errors++;
+ return 1;
+
+}
+
+EXPORT_SYMBOL(ieee80211_txb_free);
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
new file mode 100644
index 0000000..94882f3
--- /dev/null
+++ b/net/ieee80211/ieee80211_wx.c
@@ -0,0 +1,468 @@
+/******************************************************************************
+
+ Copyright(c) 2004 Intel Corporation. All rights reserved.
+
+ Portions of this file are based on the WEP enablement code provided by the
+ Host AP project hostap-drivers v0.1.3
+ Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
+ <jkmaline@cc.hut.fi>
+ Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc., 59
+ Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+ The full GNU General Public License is included in this distribution in the
+ file called LICENSE.
+
+ Contact Information:
+ James P. Ketrenos <ipw2100-admin@linux.intel.com>
+ Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+******************************************************************************/
+
+#include <linux/kmod.h>
+#include <linux/module.h>
+
+#include <net/ieee80211.h>
+#include <linux/wireless.h>
+
+static const char *ieee80211_modes[] = {
+ "?", "a", "b", "ab", "g", "ag", "bg", "abg"
+};
+
+#define MAX_CUSTOM_LEN 64
+static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee,
+ char *start, char *stop,
+ struct ieee80211_network *network)
+{
+ char custom[MAX_CUSTOM_LEN];
+ char *p;
+ struct iw_event iwe;
+ int i, j;
+ u8 max_rate, rate;
+
+ /* First entry *MUST* be the AP MAC address */
+ iwe.cmd = SIOCGIWAP;
+ iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
+ memcpy(iwe.u.ap_addr.sa_data, network->bssid, ETH_ALEN);
+ start = iwe_stream_add_event(start, stop, &iwe, IW_EV_ADDR_LEN);
+
+ /* Remaining entries will be displayed in the order we provide them */
+
+ /* Add the ESSID */
+ iwe.cmd = SIOCGIWESSID;
+ iwe.u.data.flags = 1;
+ if (network->flags & NETWORK_EMPTY_ESSID) {
+ iwe.u.data.length = sizeof("<hidden>");
+ start = iwe_stream_add_point(start, stop, &iwe, "<hidden>");
+ } else {
+ iwe.u.data.length = min(network->ssid_len, (u8) 32);
+ start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
+ }
+
+ /* Add the protocol name */
+ iwe.cmd = SIOCGIWNAME;
+ snprintf(iwe.u.name, IFNAMSIZ, "IEEE 802.11%s",
+ ieee80211_modes[network->mode]);
+ start = iwe_stream_add_event(start, stop, &iwe, IW_EV_CHAR_LEN);
+
+ /* Add mode */
+ iwe.cmd = SIOCGIWMODE;
+ if (network->capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)) {
+ if (network->capability & WLAN_CAPABILITY_ESS)
+ iwe.u.mode = IW_MODE_MASTER;
+ else
+ iwe.u.mode = IW_MODE_ADHOC;
+
+ start = iwe_stream_add_event(start, stop, &iwe, IW_EV_UINT_LEN);
+ }
+
+ /* Add frequency/channel */
+ iwe.cmd = SIOCGIWFREQ;
+/* iwe.u.freq.m = ieee80211_frequency(network->channel, network->mode);
+ iwe.u.freq.e = 3; */
+ iwe.u.freq.m = network->channel;
+ iwe.u.freq.e = 0;
+ iwe.u.freq.i = 0;
+ start = iwe_stream_add_event(start, stop, &iwe, IW_EV_FREQ_LEN);
+
+ /* Add encryption capability */
+ iwe.cmd = SIOCGIWENCODE;
+ if (network->capability & WLAN_CAPABILITY_PRIVACY)
+ iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
+ else
+ iwe.u.data.flags = IW_ENCODE_DISABLED;
+ iwe.u.data.length = 0;
+ start = iwe_stream_add_point(start, stop, &iwe, network->ssid);
+
+ /* Add basic and extended rates */
+ max_rate = 0;
+ p = custom;
+ p += snprintf(p, MAX_CUSTOM_LEN - (p - custom), " Rates (Mb/s): ");
+ for (i = 0, j = 0; i < network->rates_len;) {
+ if (j < network->rates_ex_len &&
+ ((network->rates_ex[j] & 0x7F) <
+ (network->rates[i] & 0x7F)))
+ rate = network->rates_ex[j++] & 0x7F;
+ else
+ rate = network->rates[i++] & 0x7F;
+ if (rate > max_rate)
+ max_rate = rate;
+ p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
+ "%d%s ", rate >> 1, (rate & 1) ? ".5" : "");
+ }
+ for (; j < network->rates_ex_len; j++) {
+ rate = network->rates_ex[j] & 0x7F;
+ p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
+ "%d%s ", rate >> 1, (rate & 1) ? ".5" : "");
+ if (rate > max_rate)
+ max_rate = rate;
+ }
+
+ iwe.cmd = SIOCGIWRATE;
+ iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;
+ iwe.u.bitrate.value = max_rate * 500000;
+ start = iwe_stream_add_event(start, stop, &iwe, IW_EV_PARAM_LEN);
+
+ iwe.cmd = IWEVCUSTOM;
+ iwe.u.data.length = p - custom;
+ if (iwe.u.data.length)
+ start = iwe_stream_add_point(start, stop, &iwe, custom);
+
+ /* Add quality statistics */
+ /* TODO: Fix these values... */
+ iwe.cmd = IWEVQUAL;
+ iwe.u.qual.qual = network->stats.signal;
+ iwe.u.qual.level = network->stats.rssi;
+ iwe.u.qual.noise = network->stats.noise;
+ iwe.u.qual.updated = network->stats.mask & IEEE80211_STATMASK_WEMASK;
+ if (!(network->stats.mask & IEEE80211_STATMASK_RSSI))
+ iwe.u.qual.updated |= IW_QUAL_LEVEL_INVALID;
+ if (!(network->stats.mask & IEEE80211_STATMASK_NOISE))
+ iwe.u.qual.updated |= IW_QUAL_NOISE_INVALID;
+ if (!(network->stats.mask & IEEE80211_STATMASK_SIGNAL))
+ iwe.u.qual.updated |= IW_QUAL_QUAL_INVALID;
+
+ start = iwe_stream_add_event(start, stop, &iwe, IW_EV_QUAL_LEN);
+
+ iwe.cmd = IWEVCUSTOM;
+ p = custom;
+
+ iwe.u.data.length = p - custom;
+ if (iwe.u.data.length)
+ start = iwe_stream_add_point(start, stop, &iwe, custom);
+
+ if (ieee->wpa_enabled && network->wpa_ie_len) {
+ char buf[MAX_WPA_IE_LEN * 2 + 30];
+
+ u8 *p = buf;
+ p += sprintf(p, "wpa_ie=");
+ for (i = 0; i < network->wpa_ie_len; i++) {
+ p += sprintf(p, "%02x", network->wpa_ie[i]);
+ }
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVCUSTOM;
+ iwe.u.data.length = strlen(buf);
+ start = iwe_stream_add_point(start, stop, &iwe, buf);
+ }
+
+ if (ieee->wpa_enabled && network->rsn_ie_len) {
+ char buf[MAX_WPA_IE_LEN * 2 + 30];
+
+ u8 *p = buf;
+ p += sprintf(p, "rsn_ie=");
+ for (i = 0; i < network->rsn_ie_len; i++) {
+ p += sprintf(p, "%02x", network->rsn_ie[i]);
+ }
+
+ memset(&iwe, 0, sizeof(iwe));
+ iwe.cmd = IWEVCUSTOM;
+ iwe.u.data.length = strlen(buf);
+ start = iwe_stream_add_point(start, stop, &iwe, buf);
+ }
+
+ /* Add EXTRA: Age to display seconds since last beacon/probe response
+ * for given network. */
+ iwe.cmd = IWEVCUSTOM;
+ p = custom;
+ p += snprintf(p, MAX_CUSTOM_LEN - (p - custom),
+ " Last beacon: %lums ago",
+ (jiffies - network->last_scanned) / (HZ / 100));
+ iwe.u.data.length = p - custom;
+ if (iwe.u.data.length)
+ start = iwe_stream_add_point(start, stop, &iwe, custom);
+
+ return start;
+}
+
+int ieee80211_wx_get_scan(struct ieee80211_device *ieee,
+ struct iw_request_info *info,
+ union iwreq_data *wrqu, char *extra)
+{
+ struct ieee80211_network *network;
+ unsigned long flags;
+
+ char *ev = extra;
+ char *stop = ev + IW_SCAN_MAX_DATA;
+ int i = 0;
+
+ IEEE80211_DEBUG_WX("Getting scan\n");
+
+ spin_lock_irqsave(&ieee->lock, flags);
+
+ list_for_each_entry(network, &ieee->network_list, list) {
+ i++;
+ if (ieee->scan_age == 0 ||
+ time_after(network->last_scanned + ieee->scan_age, jiffies))
+ ev = ipw2100_translate_scan(ieee, ev, stop, network);
+ else
+ IEEE80211_DEBUG_SCAN("Not showing network '%s ("
+ MAC_FMT ")' due to age (%lums).\n",
+ escape_essid(network->ssid,
+ network->ssid_len),
+ MAC_ARG(network->bssid),
+ (jiffies -
+ network->last_scanned) / (HZ /
+ 100));
+ }
+
+ spin_unlock_irqrestore(&ieee->lock, flags);
+
+ wrqu->data.length = ev - extra;
+ wrqu->data.flags = 0;
+
+ IEEE80211_DEBUG_WX("exit: %d networks returned.\n", i);
+
+ return 0;
+}
+
+int ieee80211_wx_set_encode(struct ieee80211_device *ieee,
+ struct iw_request_info *info,
+ union iwreq_data *wrqu, char *keybuf)
+{
+ struct iw_point *erq = &(wrqu->encoding);
+ struct net_device *dev = ieee->dev;
+ struct ieee80211_security sec = {
+ .flags = 0
+ };
+ int i, key, key_provided, len;
+ struct ieee80211_crypt_data **crypt;
+
+ IEEE80211_DEBUG_WX("SET_ENCODE\n");
+
+ key = erq->flags & IW_ENCODE_INDEX;
+ if (key) {
+ if (key > WEP_KEYS)
+ return -EINVAL;
+ key--;
+ key_provided = 1;
+ } else {
+ key_provided = 0;
+ key = ieee->tx_keyidx;
+ }
+
+ IEEE80211_DEBUG_WX("Key: %d [%s]\n", key, key_provided ?
+ "provided" : "default");
+
+ crypt = &ieee->crypt[key];
+
+ if (erq->flags & IW_ENCODE_DISABLED) {
+ if (key_provided && *crypt) {
+ IEEE80211_DEBUG_WX("Disabling encryption on key %d.\n",
+ key);
+ ieee80211_crypt_delayed_deinit(ieee, crypt);
+ } else
+ IEEE80211_DEBUG_WX("Disabling encryption.\n");
+
+ /* Check all the keys to see if any are still configured,
+ * and if no key index was provided, de-init them all */
+ for (i = 0; i < WEP_KEYS; i++) {
+ if (ieee->crypt[i] != NULL) {
+ if (key_provided)
+ break;
+ ieee80211_crypt_delayed_deinit(ieee,
+ &ieee->crypt[i]);
+ }
+ }
+
+ if (i == WEP_KEYS) {
+ sec.enabled = 0;
+ sec.level = SEC_LEVEL_0;
+ sec.flags |= SEC_ENABLED | SEC_LEVEL;
+ }
+
+ goto done;
+ }
+
+ sec.enabled = 1;
+ sec.flags |= SEC_ENABLED;
+
+ if (*crypt != NULL && (*crypt)->ops != NULL &&
+ strcmp((*crypt)->ops->name, "WEP") != 0) {
+ /* changing to use WEP; deinit previously used algorithm
+ * on this key */
+ ieee80211_crypt_delayed_deinit(ieee, crypt);
+ }
+
+ if (*crypt == NULL) {
+ struct ieee80211_crypt_data *new_crypt;
+
+ /* take WEP into use */
+ new_crypt = kmalloc(sizeof(struct ieee80211_crypt_data),
+ GFP_KERNEL);
+ if (new_crypt == NULL)
+ return -ENOMEM;
+ memset(new_crypt, 0, sizeof(struct ieee80211_crypt_data));
+ new_crypt->ops = ieee80211_get_crypto_ops("WEP");
+ if (!new_crypt->ops) {
+ request_module("ieee80211_crypt_wep");
+ new_crypt->ops = ieee80211_get_crypto_ops("WEP");
+ }
+
+ if (new_crypt->ops && try_module_get(new_crypt->ops->owner))
+ new_crypt->priv = new_crypt->ops->init(key);
+
+ if (!new_crypt->ops || !new_crypt->priv) {
+ kfree(new_crypt);
+ new_crypt = NULL;
+
+ printk(KERN_WARNING "%s: could not initialize WEP: "
+ "load module ieee80211_crypt_wep\n", dev->name);
+ return -EOPNOTSUPP;
+ }
+ *crypt = new_crypt;
+ }
+
+ /* If a new key was provided, set it up */
+ if (erq->length > 0) {
+ len = erq->length <= 5 ? 5 : 13;
+ memcpy(sec.keys[key], keybuf, erq->length);
+ if (len > erq->length)
+ memset(sec.keys[key] + erq->length, 0,
+ len - erq->length);
+ IEEE80211_DEBUG_WX("Setting key %d to '%s' (%d:%d bytes)\n",
+ key, escape_essid(sec.keys[key], len),
+ erq->length, len);
+ sec.key_sizes[key] = len;
+ (*crypt)->ops->set_key(sec.keys[key], len, NULL,
+ (*crypt)->priv);
+ sec.flags |= (1 << key);
+ /* This ensures a key will be activated if no key is
+ * explicitely set */
+ if (key == sec.active_key)
+ sec.flags |= SEC_ACTIVE_KEY;
+ } else {
+ len = (*crypt)->ops->get_key(sec.keys[key], WEP_KEY_LEN,
+ NULL, (*crypt)->priv);
+ if (len == 0) {
+ /* Set a default key of all 0 */
+ IEEE80211_DEBUG_WX("Setting key %d to all zero.\n",
+ key);
+ memset(sec.keys[key], 0, 13);
+ (*crypt)->ops->set_key(sec.keys[key], 13, NULL,
+ (*crypt)->priv);
+ sec.key_sizes[key] = 13;
+ sec.flags |= (1 << key);
+ }
+
+ /* No key data - just set the default TX key index */
+ if (key_provided) {
+ IEEE80211_DEBUG_WX
+ ("Setting key %d to default Tx key.\n", key);
+ ieee->tx_keyidx = key;
+ sec.active_key = key;
+ sec.flags |= SEC_ACTIVE_KEY;
+ }
+ }
+
+ done:
+ ieee->open_wep = !(erq->flags & IW_ENCODE_RESTRICTED);
+ sec.auth_mode = ieee->open_wep ? WLAN_AUTH_OPEN : WLAN_AUTH_SHARED_KEY;
+ sec.flags |= SEC_AUTH_MODE;
+ IEEE80211_DEBUG_WX("Auth: %s\n", sec.auth_mode == WLAN_AUTH_OPEN ?
+ "OPEN" : "SHARED KEY");
+
+ /* For now we just support WEP, so only set that security level...
+ * TODO: When WPA is added this is one place that needs to change */
+ sec.flags |= SEC_LEVEL;
+ sec.level = SEC_LEVEL_1; /* 40 and 104 bit WEP */
+
+ if (ieee->set_security)
+ ieee->set_security(dev, &sec);
+
+ /* Do not reset port if card is in Managed mode since resetting will
+ * generate new IEEE 802.11 authentication which may end up in looping
+ * with IEEE 802.1X. If your hardware requires a reset after WEP
+ * configuration (for example... Prism2), implement the reset_port in
+ * the callbacks structures used to initialize the 802.11 stack. */
+ if (ieee->reset_on_keychange &&
+ ieee->iw_mode != IW_MODE_INFRA &&
+ ieee->reset_port && ieee->reset_port(dev)) {
+ printk(KERN_DEBUG "%s: reset_port failed\n", dev->name);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int ieee80211_wx_get_encode(struct ieee80211_device *ieee,
+ struct iw_request_info *info,
+ union iwreq_data *wrqu, char *keybuf)
+{
+ struct iw_point *erq = &(wrqu->encoding);
+ int len, key;
+ struct ieee80211_crypt_data *crypt;
+
+ IEEE80211_DEBUG_WX("GET_ENCODE\n");
+
+ key = erq->flags & IW_ENCODE_INDEX;
+ if (key) {
+ if (key > WEP_KEYS)
+ return -EINVAL;
+ key--;
+ } else
+ key = ieee->tx_keyidx;
+
+ crypt = ieee->crypt[key];
+ erq->flags = key + 1;
+
+ if (crypt == NULL || crypt->ops == NULL) {
+ erq->length = 0;
+ erq->flags |= IW_ENCODE_DISABLED;
+ return 0;
+ }
+
+ if (strcmp(crypt->ops->name, "WEP") != 0) {
+ /* only WEP is supported with wireless extensions, so just
+ * report that encryption is used */
+ erq->length = 0;
+ erq->flags |= IW_ENCODE_ENABLED;
+ return 0;
+ }
+
+ len = crypt->ops->get_key(keybuf, WEP_KEY_LEN, NULL, crypt->priv);
+ erq->length = (len >= 0 ? len : 0);
+
+ erq->flags |= IW_ENCODE_ENABLED;
+
+ if (ieee->open_wep)
+ erq->flags |= IW_ENCODE_OPEN;
+ else
+ erq->flags |= IW_ENCODE_RESTRICTED;
+
+ return 0;
+}
+
+EXPORT_SYMBOL(ieee80211_wx_get_scan);
+EXPORT_SYMBOL(ieee80211_wx_set_encode);
+EXPORT_SYMBOL(ieee80211_wx_get_encode);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1..e55136a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -3,7 +3,6 @@
#
config IP_MULTICAST
bool "IP: multicasting"
- depends on INET
help
This is code for addressing several networked computers at once,
enlarging your kernel by about 2 KB. You need multicasting if you
@@ -17,7 +16,6 @@ config IP_MULTICAST
config IP_ADVANCED_ROUTER
bool "IP: advanced router"
- depends on INET
---help---
If you intend to run your Linux box mostly as a router, i.e. as a
computer that forwards and redistributes network packets, say Y; you
@@ -53,6 +51,40 @@ config IP_ADVANCED_ROUTER
If unsure, say N here.
+choice
+ prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
+ depends on IP_ADVANCED_ROUTER
+ default ASK_IP_FIB_HASH
+
+config ASK_IP_FIB_HASH
+ bool "FIB_HASH"
+ ---help---
+ Current FIB is very proven and good enough for most users.
+
+config IP_FIB_TRIE
+ bool "FIB_TRIE"
+ ---help---
+ Use new experimental LC-trie as FIB lookup algoritm.
+ This improves lookup performance if you have a large
+ number of routes.
+
+ LC-trie is a longest matching prefix lookup algorithm which
+ performs better than FIB_HASH for large routing tables.
+ But, it consumes more memory and is more complex.
+
+ LC-trie is described in:
+
+ IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ An experimental study of compression methods for dynamic tries
+ Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+
+endchoice
+
+config IP_FIB_HASH
+ def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
+
config IP_MULTIPLE_TABLES
bool "IP: policy routing"
depends on IP_ADVANCED_ROUTER
@@ -92,7 +124,7 @@ config IP_ROUTE_MULTIPATH
config IP_ROUTE_MULTIPATH_CACHED
bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
- depends on: IP_ROUTE_MULTIPATH
+ depends on IP_ROUTE_MULTIPATH
help
Normally, equal cost multipath routing is not supported by the
routing cache. If you say Y here, alternative routes are cached
@@ -145,7 +177,6 @@ config IP_ROUTE_VERBOSE
config IP_PNP
bool "IP: kernel level autoconfiguration"
- depends on INET
help
This enables automatic configuration of IP addresses of devices and
of the routing table during kernel boot, based on either information
@@ -204,8 +235,6 @@ config IP_PNP_RARP
# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
config NET_IPIP
tristate "IP: tunneling"
- depends on INET
- select INET_TUNNEL
---help---
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -222,8 +251,6 @@ config NET_IPIP
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
- depends on INET
- select XFRM
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -281,7 +308,7 @@ config IP_PIMSM_V2
config ARPD
bool "IP: ARP daemon support (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
+ depends on EXPERIMENTAL
---help---
Normally, the kernel maintains an internal cache which maps IP
addresses to hardware addresses on the local network, so that
@@ -306,7 +333,6 @@ config ARPD
config SYN_COOKIES
bool "IP: TCP syncookie support (disabled per default)"
- depends on INET
---help---
Normal TCP/IP networking is open to an attack known as "SYN
flooding". This denial-of-service attack prevents legitimate remote
@@ -343,7 +369,6 @@ config SYN_COOKIES
config INET_AH
tristate "IP: AH transformation"
- depends on INET
select XFRM
select CRYPTO
select CRYPTO_HMAC
@@ -356,7 +381,6 @@ config INET_AH
config INET_ESP
tristate "IP: ESP transformation"
- depends on INET
select XFRM
select CRYPTO
select CRYPTO_HMAC
@@ -370,7 +394,6 @@ config INET_ESP
config INET_IPCOMP
tristate "IP: IPComp transformation"
- depends on INET
select XFRM
select INET_TUNNEL
select CRYPTO
@@ -383,7 +406,6 @@ config INET_IPCOMP
config INET_TUNNEL
tristate "IP: tunnel transformation"
- depends on INET
select XFRM
---help---
Support for generic IP tunnel transformation, which is required by
@@ -391,21 +413,122 @@ config INET_TUNNEL
If unsure, say Y.
-config IP_TCPDIAG
- tristate "IP: TCP socket monitoring interface"
- depends on INET
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
default y
---help---
- Support for TCP socket monitoring interface used by native Linux
- tools such as ss. ss is included in iproute2, currently downloadable
- at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
- and have selected IPv6 as a module, you need to build this as a
- module too.
+ Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at <http://developer.osdl.org/dev/iproute2>.
If unsure, say Y.
-config IP_TCPDIAG_IPV6
- def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
+config TCP_CONG_ADVANCED
+ bool "TCP: advanced congestion control"
+ ---help---
+ Support for selection of various TCP congestion control
+ modules.
+
+ Nearly all users can safely say no here, and a safe default
+ selection will be made (BIC-TCP with new Reno as a fallback).
+
+ If unsure, say N.
+
+# TCP Reno is builtin (required as fallback)
+menu "TCP congestion control"
+ depends on TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+ tristate "Binary Increase Congestion (BIC) control"
+ default y
+ ---help---
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_WESTWOOD
+ tristate "TCP Westwood+"
+ default m
+ ---help---
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+ tristate "H-TCP"
+ default m
+ ---help---
+ H-TCP is a send-side only modifications of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+ tristate "High Speed TCP"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+ tristate "TCP-Hybla congestion control algorithm"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, expecially when sharing a common bottleneck with normal
+ terrestrial connections.
+
+config TCP_CONG_VEGAS
+ tristate "TCP Vegas"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
+config TCP_CONG_SCALABLE
+ tristate "Scalable TCP"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+
+endmenu
+
+config TCP_CONG_BIC
+ tristate
+ depends on !TCP_CONG_ADVANCED
+ default y
source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627..f0435d0 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -2,13 +2,17 @@
# Makefile for the Linux TCP/IP (INET) layer.
#
-obj-y := utils.o route.o inetpeer.o protocol.o \
+obj-y := route.o inetpeer.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
- ip_output.o ip_sockglue.o \
- tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+ ip_output.o ip_sockglue.o inet_hashtables.o \
+ inet_timewait_sock.o inet_connection_sock.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+ tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
- sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
+obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
+obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
@@ -26,8 +30,16 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b3cb49c..bf147f8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <linux/skbuff.h>
@@ -112,11 +113,7 @@
#include <linux/mroute.h>
#endif
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet_sock_nr;
-#endif
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
extern void ip_mc_drop_socket(struct sock *sk);
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
if (inet->opt)
kfree(inet->opt);
dst_release(sk->sk_dst_cache);
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet_sock_nr);
- printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
- sk, atomic_read(&inet_sock_nr));
-#endif
+ sk_refcnt_debug_dec(sk);
}
/*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- err = tcp_listen_start(sk);
+ err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
if (err)
goto out;
}
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
- int err;
+ int try_loading_module = 0;
+ int err = -ESOCKTNOSUPPORT;
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
answer = NULL;
+lookup_protocol:
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
answer = NULL;
}
- err = -ESOCKTNOSUPPORT;
- if (!answer)
- goto out_rcu_unlock;
+ if (unlikely(answer == NULL)) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-2-proto-132-type-1
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-2-proto-132
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
inet->mc_index = 0;
inet->mc_list = NULL;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(sk);
if (inet->num) {
/* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
.owner = THIS_MODULE,
};
-
-extern void tcp_init(void);
-extern void tcp_v4_init(struct net_proto_family *);
-
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
}
}
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ int err;
+ struct rtable *rt;
+ __u32 old_saddr = inet->saddr;
+ __u32 new_saddr;
+ __u32 daddr = inet->daddr;
+
+ if (inet->opt && inet->opt->srr)
+ daddr = inet->opt->faddr;
+
+ /* Query new route. */
+ err = ip_route_connect(&rt, daddr, 0,
+ RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if,
+ sk->sk_protocol,
+ inet->sport, inet->dport, sk);
+ if (err)
+ return err;
+
+ sk_setup_caps(sk, &rt->u.dst);
+
+ new_saddr = rt->rt_src;
+
+ if (new_saddr == old_saddr)
+ return 0;
+
+ if (sysctl_ip_dynaddr > 1) {
+ printk(KERN_INFO "%s(): shifting inet->"
+ "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+ __FUNCTION__,
+ NIPQUAD(old_saddr),
+ NIPQUAD(new_saddr));
+ }
+
+ inet->saddr = inet->rcv_saddr = new_saddr;
+
+ /*
+ * XXX The only one ugly spot where we need to
+ * XXX really change the sockets identity after
+ * XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for troubles.
+ */
+ __sk_prot_rehash(sk);
+ return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+ u32 daddr;
+ int err;
+
+ /* Route is OK, nothing to do. */
+ if (rt)
+ return 0;
+
+ /* Reroute. */
+ daddr = inet->daddr;
+ if (inet->opt && inet->opt->srr)
+ daddr = inet->opt->faddr;
+{
+ struct flowi fl = {
+ .oif = sk->sk_bound_dev_if,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = daddr,
+ .saddr = inet->saddr,
+ .tos = RT_CONN_FLAGS(sk),
+ },
+ },
+ .proto = sk->sk_protocol,
+ .uli_u = {
+ .ports = {
+ .sport = inet->sport,
+ .dport = inet->dport,
+ },
+ },
+ };
+
+ err = ip_route_output_flow(&rt, &fl, sk, 0);
+}
+ if (!err)
+ sk_setup_caps(sk, &rt->u.dst);
+ else {
+ /* Routing failed... */
+ sk->sk_route_caps = 0;
+ /*
+ * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+ */
+ if (!sysctl_ip_dynaddr ||
+ sk->sk_state != TCP_SYN_SENT ||
+ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+ (err = inet_sk_reselect_saddr(sk)) != 0)
+ sk->sk_err_soft = -err;
+ }
+
+ return err;
+}
+
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
#ifdef CONFIG_IP_MULTICAST
static struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1007,7 +1128,15 @@ static int __init init_ipv4_mibs(void)
}
static int ipv4_proc_init(void);
-extern void ipfrag_init(void);
+
+/*
+ * IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+ .type = __constant_htons(ETH_P_IP),
+ .func = ip_rcv,
+};
static int __init inet_init(void)
{
@@ -1102,6 +1231,8 @@ static int __init inet_init(void)
ipfrag_init();
+ dev_add_pack(&ip_packet_type);
+
rc = 0;
out:
return rc;
@@ -1117,15 +1248,10 @@ module_init(inet_init);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
-extern int fib_proc_init(void);
-extern void fib_proc_exit(void);
-extern int ip_misc_proc_init(void);
-extern int raw_proc_init(void);
-extern void raw_proc_exit(void);
-extern int tcp4_proc_init(void);
-extern void tcp4_proc_exit(void);
-extern int udp4_proc_init(void);
-extern void udp4_proc_exit(void);
+#ifdef CONFIG_IP_FIB_TRIE
+extern int fib_stat_proc_init(void);
+extern void fib_stat_proc_exit(void);
+#endif
static int __init ipv4_proc_init(void)
{
@@ -1139,11 +1265,19 @@ static int __init ipv4_proc_init(void)
goto out_udp;
if (fib_proc_init())
goto out_fib;
+#ifdef CONFIG_IP_FIB_TRIE
+ if (fib_stat_proc_init())
+ goto out_fib_stat;
+#endif
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
+#ifdef CONFIG_IP_FIB_TRIE
+ fib_stat_proc_exit();
+out_fib_stat:
+#endif
fib_proc_exit();
out_fib:
udp4_proc_exit();
@@ -1181,7 +1315,4 @@ EXPORT_SYMBOL(inet_stream_connect);
EXPORT_SYMBOL(inet_stream_ops);
EXPORT_SYMBOL(inet_unregister_protosw);
EXPORT_SYMBOL(net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-EXPORT_SYMBOL(inet_sock_nr);
-#endif
+EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 0e98f22..035ad2c 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -200,7 +200,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
xfrm_state_put(x);
}
-static int ah_init_state(struct xfrm_state *x, void *args)
+static int ah_init_state(struct xfrm_state *x)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
@@ -263,10 +263,8 @@ static int ah_init_state(struct xfrm_state *x, void *args)
error:
if (ahp) {
- if (ahp->work_icv)
- kfree(ahp->work_icv);
- if (ahp->tfm)
- crypto_free_tfm(ahp->tfm);
+ kfree(ahp->work_icv);
+ crypto_free_tfm(ahp->tfm);
kfree(ahp);
}
return -EINVAL;
@@ -279,14 +277,10 @@ static void ah_destroy(struct xfrm_state *x)
if (!ahp)
return;
- if (ahp->work_icv) {
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- }
- if (ahp->tfm) {
- crypto_free_tfm(ahp->tfm);
- ahp->tfm = NULL;
- }
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
kfree(ahp);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd6..8bf312b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip,
static void parp_redo(struct sk_buff *skb)
{
nf_reset(skb);
- arp_rcv(skb, skb->dev, NULL);
+ arp_rcv(skb, skb->dev, NULL, skb->dev);
}
/*
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb)
if (n)
neigh_release(n);
- if (skb->stamp.tv_sec == LOCALLY_ENQUEUED ||
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
in_dev->arp_parms->proxy_delay == 0) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -927,7 +927,7 @@ out:
* Receive an arp request from the device layer.
*/
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct arphdr *arp;
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
goto out_of_mem;
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
freeskb:
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561..c1b42b5 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/in.h>
+#include <net/ip.h>
#include <net/sock.h>
-#include <net/tcp.h>
#include <net/route.h>
+#include <net/tcp_states.h>
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 478a301..ba2895a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1030,14 +1030,13 @@ static struct notifier_block ip_netdev_notifier = {
};
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
ifm = NLMSG_DATA(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
@@ -1090,7 +1089,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
- RTM_NEWADDR) <= 0) {
+ RTM_NEWADDR, NLM_F_MULTI) <= 0) {
rcu_read_unlock();
goto done;
}
@@ -1112,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
if (!skb)
- netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
- else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
+ else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
} else {
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
}
}
@@ -1472,7 +1470,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
* by sysctl and we wouldn't want anyone to change it under our feet
* (see SIOCSIFNAME).
*/
- dev_name = net_sysctl_strdup(dev_name);
+ dev_name = kstrdup(dev_name, GFP_KERNEL);
if (!dev_name)
goto free;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index eae84cc..1b5a09d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
- ntohl(esph->spi), ntohl(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+ ntohl(esph->spi), ntohl(iph->daddr));
xfrm_state_put(x);
}
@@ -343,26 +343,18 @@ static void esp_destroy(struct xfrm_state *x)
if (!esp)
return;
- if (esp->conf.tfm) {
- crypto_free_tfm(esp->conf.tfm);
- esp->conf.tfm = NULL;
- }
- if (esp->conf.ivec) {
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- }
- if (esp->auth.tfm) {
- crypto_free_tfm(esp->auth.tfm);
- esp->auth.tfm = NULL;
- }
- if (esp->auth.work_icv) {
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- }
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
kfree(esp);
}
-static int esp_init_state(struct xfrm_state *x, void *args)
+static int esp_init_state(struct xfrm_state *x)
{
struct esp_data *esp = NULL;
@@ -395,10 +387,10 @@ static int esp_init_state(struct xfrm_state *x, void *args)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_tfm_alg_digestsize(esp->auth.tfm)) {
- NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_tfm_alg_digestsize(esp->auth.tfm),
- aalg_desc->uinfo.auth.icv_fullbits/8));
+ NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_tfm_alg_digestsize(esp->auth.tfm),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
goto error;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 563e7d6..4e1379f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -516,6 +516,59 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+{
+
+ struct fib_result res;
+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
+ .fwmark = frn->fl_fwmark,
+ .tos = frn->fl_tos,
+ .scope = frn->fl_scope } } };
+ if (tb) {
+ local_bh_disable();
+
+ frn->tb_id = tb->tb_id;
+ frn->err = tb->tb_lookup(tb, &fl, &res);
+
+ if (!frn->err) {
+ frn->prefixlen = res.prefixlen;
+ frn->nh_sel = res.nh_sel;
+ frn->type = res.type;
+ frn->scope = res.scope;
+ }
+ local_bh_enable();
+ }
+}
+
+static void nl_fib_input(struct sock *sk, int len)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh = NULL;
+ struct fib_result_nl *frn;
+ int err;
+ u32 pid;
+ struct fib_table *tb;
+
+ skb = skb_recv_datagram(sk, 0, 0, &err);
+ nlh = (struct nlmsghdr *)skb->data;
+
+ frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
+ tb = fib_get_table(frn->tb_id_in);
+
+ nl_fib_lookup(frn, tb);
+
+ pid = nlh->nlmsg_pid; /*pid of sending process */
+ NETLINK_CB(skb).pid = 0; /* from kernel */
+ NETLINK_CB(skb).dst_pid = pid;
+ NETLINK_CB(skb).dst_group = 0; /* unicast */
+ netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
+}
+
+static void nl_fib_lookup_init(void)
+{
+ netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
+}
+
static void fib_disable_ip(struct net_device *dev, int force)
{
if (fib_sync_down(0, dev, force))
@@ -604,8 +657,8 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
+ nl_fib_lookup_init();
}
EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 6506dcc..2a8c9af 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
#include "fib_lookup.h"
-static kmem_cache_t *fn_hash_kmem;
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_hash_kmem __read_mostly;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
struct fib_node {
struct hlist_node fn_hash;
@@ -703,7 +703,8 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
&f->fn_key,
fz->fz_order,
fa->fa_tos,
- fa->fa_info) < 0) {
+ fa->fa_info,
+ NLM_F_MULTI) < 0) {
cb->args[3] = i;
return -1;
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ac4485f..ef6609e 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
struct fib_alias {
struct list_head fa_list;
+ struct rcu_head rcu;
struct fib_info *fa_info;
u8 fa_tos;
u8 fa_type;
@@ -30,7 +31,8 @@ extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
struct kern_rta *rta, struct fib_info *fi);
extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
u8 tb_id, u8 type, u8 scope, void *dst,
- int dst_len, u8 tos, struct fib_info *fi);
+ int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
int z, int tb_id,
struct nlmsghdr *n, struct netlink_skb_parms *req);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 39d0aad..0b298bb 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -367,13 +367,14 @@ static struct notifier_block fib_rules_notifier = {
static __inline__ int inet_fill_rule(struct sk_buff *skb,
struct fib_rule *r,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ unsigned int flags)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+ nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
rtm = NLMSG_DATA(nlh);
rtm->rtm_family = AF_INET;
rtm->rtm_dst_len = r->r_dst_len;
@@ -422,7 +423,7 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
if (idx < s_idx)
continue;
- if (inet_fill_rule(skb, r, cb) < 0)
+ if (inet_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
break;
}
read_unlock(&fib_rules_lock);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 029362d..d41219e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -276,7 +276,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
struct nlmsghdr *n, struct netlink_skb_parms *req)
{
struct sk_buff *skb;
- u32 pid = req ? req->pid : 0;
+ u32 pid = req ? req->pid : n->nlmsg_pid;
int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
skb = alloc_skb(size, GFP_KERNEL);
@@ -286,14 +286,14 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
fa->fa_type, fa->fa_scope, &key, z,
fa->fa_tos,
- fa->fa_info) < 0) {
+ fa->fa_info, 0) < 0) {
kfree_skb(skb);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
if (n->nlmsg_flags&NLM_F_ECHO)
atomic_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+ netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
if (n->nlmsg_flags&NLM_F_ECHO)
netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
}
@@ -593,10 +593,13 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *new_laddrhash,
unsigned int new_size)
{
+ struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_hash_size;
- unsigned int i;
+ unsigned int i, bytes;
write_lock(&fib_info_lock);
+ old_info_hash = fib_info_hash;
+ old_laddrhash = fib_info_laddrhash;
fib_hash_size = new_size;
for (i = 0; i < old_size; i++) {
@@ -636,6 +639,10 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
fib_info_laddrhash = new_laddrhash;
write_unlock(&fib_info_lock);
+
+ bytes = old_size * sizeof(struct hlist_head *);
+ fib_hash_free(old_info_hash, bytes);
+ fib_hash_free(old_laddrhash, bytes);
}
struct fib_info *
@@ -847,6 +854,7 @@ failure:
return NULL;
}
+/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_result *res, __u32 zone, __u32 mask,
int prefixlen)
@@ -854,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
struct fib_alias *fa;
int nh_sel = 0;
- list_for_each_entry(fa, head, fa_list) {
+ list_for_each_entry_rcu(fa, head, fa_list) {
int err;
if (fa->fa_tos &&
@@ -932,13 +940,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
int
fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
- struct fib_info *fi)
+ struct fib_info *fi, unsigned int flags)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
rtm = NLMSG_DATA(nlh);
rtm->rtm_family = AF_INET;
rtm->rtm_dst_len = dst_len;
@@ -1035,7 +1043,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
}
nl->nlmsg_flags = NLM_F_REQUEST;
- nl->nlmsg_pid = 0;
+ nl->nlmsg_pid = current->pid;
nl->nlmsg_seq = 0;
nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
if (cmd == SIOCDELRT) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 0000000..b2dea4e
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2528 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
+ * & Swedish University of Agricultural Sciences.
+ *
+ * Jens Laas <jens.laas@data.slu.se> Swedish University of
+ * Agricultural Sciences.
+ *
+ * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
+ *
+ * This work is based on the LPC-trie which is originally descibed in:
+ *
+ * An experimental study of compression methods for dynamic tries
+ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ *
+ *
+ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ *
+ * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
+ *
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ *
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define VERSION "0.402"
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include "fib_lookup.h"
+
+#undef CONFIG_IP_FIB_TRIE_STATS
+#define MAX_CHILDS 16384
+
+#define KEYLENGTH (8*sizeof(t_key))
+#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
+#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
+
+typedef unsigned int t_key;
+
+#define T_TNODE 0
+#define T_LEAF 1
+#define NODE_TYPE_MASK 0x1UL
+#define NODE_PARENT(node) \
+ ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
+
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+#define NODE_SET_PARENT(node, ptr) \
+ rcu_assign_pointer((node)->parent, \
+ ((unsigned long)(ptr)) | NODE_TYPE(node))
+
+#define IS_TNODE(n) (!(n->parent & T_LEAF))
+#define IS_LEAF(n) (n->parent & T_LEAF)
+
+struct node {
+ t_key key;
+ unsigned long parent;
+};
+
+struct leaf {
+ t_key key;
+ unsigned long parent;
+ struct hlist_head list;
+ struct rcu_head rcu;
+};
+
+struct leaf_info {
+ struct hlist_node hlist;
+ struct rcu_head rcu;
+ int plen;
+ struct list_head falh;
+};
+
+struct tnode {
+ t_key key;
+ unsigned long parent;
+ unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
+ unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
+ unsigned short full_children; /* KEYLENGTH bits needed */
+ unsigned short empty_children; /* KEYLENGTH bits needed */
+ struct rcu_head rcu;
+ struct node *child[0];
+};
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+struct trie_use_stats {
+ unsigned int gets;
+ unsigned int backtrack;
+ unsigned int semantic_match_passed;
+ unsigned int semantic_match_miss;
+ unsigned int null_node_hit;
+ unsigned int resize_node_skipped;
+};
+#endif
+
+struct trie_stat {
+ unsigned int totdepth;
+ unsigned int maxdepth;
+ unsigned int tnodes;
+ unsigned int leaves;
+ unsigned int nullpointers;
+ unsigned int nodesizes[MAX_CHILDS];
+};
+
+struct trie {
+ struct node *trie;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats stats;
+#endif
+ int size;
+ unsigned int revision;
+};
+
+static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
+static struct node *resize(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
+static void tnode_free(struct tnode *tn);
+static void trie_dump_seq(struct seq_file *seq, struct trie *t);
+
+static kmem_cache_t *fn_alias_kmem __read_mostly;
+static struct trie *trie_local = NULL, *trie_main = NULL;
+
+
+/* rcu_read_lock needs to be hold by caller from readside */
+
+static inline struct node *tnode_get_child(struct tnode *tn, int i)
+{
+ BUG_ON(i >= 1 << tn->bits);
+
+ return rcu_dereference(tn->child[i]);
+}
+
+static inline int tnode_child_length(const struct tnode *tn)
+{
+ return 1 << tn->bits;
+}
+
+static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+{
+ if (offset < KEYLENGTH)
+ return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
+ else
+ return 0;
+}
+
+static inline int tkey_equals(t_key a, t_key b)
+{
+ return a == b;
+}
+
+static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
+{
+ if (bits == 0 || offset >= KEYLENGTH)
+ return 1;
+ bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+ return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+}
+
+static inline int tkey_mismatch(t_key a, int offset, t_key b)
+{
+ t_key diff = a ^ b;
+ int i = offset;
+
+ if (!diff)
+ return 0;
+ while ((diff << i) >> (KEYLENGTH-1) == 0)
+ i++;
+ return i;
+}
+
+/*
+ To understand this stuff, an understanding of keys and all their bits is
+ necessary. Every node in the trie has a key associated with it, but not
+ all of the bits in that key are significant.
+
+ Consider a node 'n' and its parent 'tp'.
+
+ If n is a leaf, every bit in its key is significant. Its presence is
+ necessitaded by path compression, since during a tree traversal (when
+ searching for a leaf - unless we are doing an insertion) we will completely
+ ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ a potentially successful search, that we have indeed been walking the
+ correct key path.
+
+ Note that we can never "miss" the correct key in the tree if present by
+ following the wrong path. Path compression ensures that segments of the key
+ that are the same for all keys with a given prefix are skipped, but the
+ skipped part *is* identical for each node in the subtrie below the skipped
+ bit! trie_insert() in this implementation takes care of that - note the
+ call to tkey_sub_equals() in trie_insert().
+
+ if n is an internal node - a 'tnode' here, the various parts of its key
+ have many different meanings.
+
+ Example:
+ _________________________________________________________________
+ | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ -----------------------------------------------------------------
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+ _________________________________________________________________
+ | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ -----------------------------------------------------------------
+ 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+
+ tp->pos = 7
+ tp->bits = 3
+ n->pos = 15
+ n->bits = 4
+
+ First, let's just ignore the bits that come before the parent tp, that is
+ the bits from 0 to (tp->pos-1). They are *known* but at this point we do
+ not use them for anything.
+
+ The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ index into the parent's child array. That is, they will be used to find
+ 'n' among tp's children.
+
+ The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
+ for the node n.
+
+ All the bits we have seen so far are significant to the node n. The rest
+ of the bits are really not needed or indeed known in n->key.
+
+ The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ n's child array, and will of course be different for each child.
+
+
+ The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
+ at this point.
+
+*/
+
+static inline void check_tnode(const struct tnode *tn)
+{
+ WARN_ON(tn && tn->pos+tn->bits > 32);
+}
+
+static int halve_threshold = 25;
+static int inflate_threshold = 50;
+
+
+static void __alias_free_mem(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+ call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct leaf, rcu));
+}
+
+static inline void free_leaf(struct leaf *leaf)
+{
+ call_rcu(&leaf->rcu, __leaf_free_rcu);
+}
+
+static void __leaf_info_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct leaf_info, rcu));
+}
+
+static inline void free_leaf_info(struct leaf_info *leaf)
+{
+ call_rcu(&leaf->rcu, __leaf_info_free_rcu);
+}
+
+static struct tnode *tnode_alloc(unsigned int size)
+{
+ struct page *pages;
+
+ if (size <= PAGE_SIZE)
+ return kcalloc(size, 1, GFP_KERNEL);
+
+ pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+ if (!pages)
+ return NULL;
+
+ return page_address(pages);
+}
+
+static void __tnode_free_rcu(struct rcu_head *head)
+{
+ struct tnode *tn = container_of(head, struct tnode, rcu);
+ unsigned int size = sizeof(struct tnode) +
+ (1 << tn->bits) * sizeof(struct node *);
+
+ if (size <= PAGE_SIZE)
+ kfree(tn);
+ else
+ free_pages((unsigned long)tn, get_order(size));
+}
+
+static inline void tnode_free(struct tnode *tn)
+{
+ call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+ struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
+ if (l) {
+ l->parent = T_LEAF;
+ INIT_HLIST_HEAD(&l->list);
+ }
+ return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+ struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
+ if (li) {
+ li->plen = plen;
+ INIT_LIST_HEAD(&li->falh);
+ }
+ return li;
+}
+
+static struct tnode* tnode_new(t_key key, int pos, int bits)
+{
+ int nchildren = 1<<bits;
+ int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
+ struct tnode *tn = tnode_alloc(sz);
+
+ if (tn) {
+ memset(tn, 0, sz);
+ tn->parent = T_TNODE;
+ tn->pos = pos;
+ tn->bits = bits;
+ tn->key = key;
+ tn->full_children = 0;
+ tn->empty_children = 1<<bits;
+ }
+
+ pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+ (unsigned int) (sizeof(struct node) * 1<<bits));
+ return tn;
+}
+
+/*
+ * Check whether a tnode 'n' is "full", i.e. it is an internal node
+ * and no bits are skipped. See discussion in dyntree paper p. 6
+ */
+
+static inline int tnode_full(const struct tnode *tn, const struct node *n)
+{
+ if (n == NULL || IS_LEAF(n))
+ return 0;
+
+ return ((struct tnode *) n)->pos == tn->pos + tn->bits;
+}
+
+static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
+{
+ tnode_put_child_reorg(tn, i, n, -1);
+}
+
+ /*
+ * Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
+{
+ struct node *chi = tn->child[i];
+ int isfull;
+
+ BUG_ON(i >= 1<<tn->bits);
+
+
+ /* update emptyChildren */
+ if (n == NULL && chi != NULL)
+ tn->empty_children++;
+ else if (n != NULL && chi == NULL)
+ tn->empty_children--;
+
+ /* update fullChildren */
+ if (wasfull == -1)
+ wasfull = tnode_full(tn, chi);
+
+ isfull = tnode_full(tn, n);
+ if (wasfull && !isfull)
+ tn->full_children--;
+ else if (!wasfull && isfull)
+ tn->full_children++;
+
+ if (n)
+ NODE_SET_PARENT(n, tn);
+
+ rcu_assign_pointer(tn->child[i], n);
+}
+
+static struct node *resize(struct trie *t, struct tnode *tn)
+{
+ int i;
+ int err = 0;
+ struct tnode *old_tn;
+
+ if (!tn)
+ return NULL;
+
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
+
+ /* No children */
+ if (tn->empty_children == tnode_child_length(tn)) {
+ tnode_free(tn);
+ return NULL;
+ }
+ /* One child */
+ if (tn->empty_children == tnode_child_length(tn) - 1)
+ for (i = 0; i < tnode_child_length(tn); i++) {
+ struct node *n;
+
+ n = tn->child[i];
+ if (!n)
+ continue;
+
+ /* compress one level */
+ NODE_SET_PARENT(n, NULL);
+ tnode_free(tn);
+ return n;
+ }
+ /*
+ * Double as long as the resulting node has a number of
+ * nonempty nodes that are above the threshold.
+ */
+
+ /*
+ * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * tnode_child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * tn->full_children;
+ *
+ * new_child_length = tnode_child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ * new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ * inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ * inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * tn->full_children) >=
+ * inflate_threshold * tnode_child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + tnode_child_length(tn) -
+ * tn->empty_children) >= inflate_threshold *
+ * tnode_child_length(tn)
+ *
+ */
+
+ check_tnode(tn);
+
+ err = 0;
+ while ((tn->full_children > 0 &&
+ 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
+ inflate_threshold * tnode_child_length(tn))) {
+
+ old_tn = tn;
+ tn = inflate(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.resize_node_skipped++;
+#endif
+ break;
+ }
+ }
+
+ check_tnode(tn);
+
+ /*
+ * Halve as long as the number of empty children in this
+ * node is above threshold.
+ */
+
+ err = 0;
+ while (tn->bits > 1 &&
+ 100 * (tnode_child_length(tn) - tn->empty_children) <
+ halve_threshold * tnode_child_length(tn)) {
+
+ old_tn = tn;
+ tn = halve(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.resize_node_skipped++;
+#endif
+ break;
+ }
+ }
+
+
+ /* Only one child remains */
+ if (tn->empty_children == tnode_child_length(tn) - 1)
+ for (i = 0; i < tnode_child_length(tn); i++) {
+ struct node *n;
+
+ n = tn->child[i];
+ if (!n)
+ continue;
+
+ /* compress one level */
+
+ NODE_SET_PARENT(n, NULL);
+ tnode_free(tn);
+ return n;
+ }
+
+ return (struct node *) tn;
+}
+
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
+{
+ struct tnode *inode;
+ struct tnode *oldtnode = tn;
+ int olen = tnode_child_length(tn);
+ int i;
+
+ pr_debug("In inflate\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
+
+ if (!tn)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Preallocate and store tnodes before the actual work so we
+ * don't get into an inconsistent state if memory allocation
+ * fails. In case of failure we return the oldnode and inflate
+ * of tnode is ignored.
+ */
+
+ for (i = 0; i < olen; i++) {
+ struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
+
+ if (inode &&
+ IS_TNODE(inode) &&
+ inode->pos == oldtnode->pos + oldtnode->bits &&
+ inode->bits > 1) {
+ struct tnode *left, *right;
+ t_key m = TKEY_GET_MASK(inode->pos, 1);
+
+ left = tnode_new(inode->key&(~m), inode->pos + 1,
+ inode->bits - 1);
+ if (!left)
+ goto nomem;
+
+ right = tnode_new(inode->key|m, inode->pos + 1,
+ inode->bits - 1);
+
+ if (!right) {
+ tnode_free(left);
+ goto nomem;
+ }
+
+ put_child(t, tn, 2*i, (struct node *) left);
+ put_child(t, tn, 2*i+1, (struct node *) right);
+ }
+ }
+
+ for (i = 0; i < olen; i++) {
+ struct node *node = tnode_get_child(oldtnode, i);
+ struct tnode *left, *right;
+ int size, j;
+
+ /* An empty child */
+ if (node == NULL)
+ continue;
+
+ /* A leaf or an internal node with skipped bits */
+
+ if (IS_LEAF(node) || ((struct tnode *) node)->pos >
+ tn->pos + tn->bits - 1) {
+ if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
+ 1) == 0)
+ put_child(t, tn, 2*i, node);
+ else
+ put_child(t, tn, 2*i+1, node);
+ continue;
+ }
+
+ /* An internal node with two children */
+ inode = (struct tnode *) node;
+
+ if (inode->bits == 1) {
+ put_child(t, tn, 2*i, inode->child[0]);
+ put_child(t, tn, 2*i+1, inode->child[1]);
+
+ tnode_free(inode);
+ continue;
+ }
+
+ /* An internal node with more than two children */
+
+ /* We will replace this node 'inode' with two new
+ * ones, 'left' and 'right', each with half of the
+ * original children. The two new nodes will have
+ * a position one bit further down the key and this
+ * means that the "significant" part of their keys
+ * (see the discussion near the top of this file)
+ * will differ by one bit, which will be "0" in
+ * left's key and "1" in right's key. Since we are
+ * moving the key position by one step, the bit that
+ * we are moving away from - the bit at position
+ * (inode->pos) - is the one that will differ between
+ * left and right. So... we synthesize that bit in the
+ * two new keys.
+ * The mask 'm' below will be a single "one" bit at
+ * the position (inode->pos)
+ */
+
+ /* Use the old key, but set the new significant
+ * bit to zero.
+ */
+
+ left = (struct tnode *) tnode_get_child(tn, 2*i);
+ put_child(t, tn, 2*i, NULL);
+
+ BUG_ON(!left);
+
+ right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+ put_child(t, tn, 2*i+1, NULL);
+
+ BUG_ON(!right);
+
+ size = tnode_child_length(left);
+ for (j = 0; j < size; j++) {
+ put_child(t, left, j, inode->child[j]);
+ put_child(t, right, j, inode->child[j + size]);
+ }
+ put_child(t, tn, 2*i, resize(t, left));
+ put_child(t, tn, 2*i+1, resize(t, right));
+
+ tnode_free(inode);
+ }
+ tnode_free(oldtnode);
+ return tn;
+nomem:
+ {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for (j = 0; j < size; j++)
+ if (tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ return ERR_PTR(-ENOMEM);
+ }
+}
+
+static struct tnode *halve(struct trie *t, struct tnode *tn)
+{
+ struct tnode *oldtnode = tn;
+ struct node *left, *right;
+ int i;
+ int olen = tnode_child_length(tn);
+
+ pr_debug("In halve\n");
+
+ tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+
+ if (!tn)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Preallocate and store tnodes before the actual work so we
+ * don't get into an inconsistent state if memory allocation
+ * fails. In case of failure we return the oldnode and halve
+ * of tnode is ignored.
+ */
+
+ for (i = 0; i < olen; i += 2) {
+ left = tnode_get_child(oldtnode, i);
+ right = tnode_get_child(oldtnode, i+1);
+
+ /* Two nonempty children */
+ if (left && right) {
+ struct tnode *newn;
+
+ newn = tnode_new(left->key, tn->pos + tn->bits, 1);
+
+ if (!newn)
+ goto nomem;
+
+ put_child(t, tn, i/2, (struct node *)newn);
+ }
+
+ }
+
+ for (i = 0; i < olen; i += 2) {
+ struct tnode *newBinNode;
+
+ left = tnode_get_child(oldtnode, i);
+ right = tnode_get_child(oldtnode, i+1);
+
+ /* At least one of the children is empty */
+ if (left == NULL) {
+ if (right == NULL) /* Both are empty */
+ continue;
+ put_child(t, tn, i/2, right);
+ continue;
+ }
+
+ if (right == NULL) {
+ put_child(t, tn, i/2, left);
+ continue;
+ }
+
+ /* Two nonempty children */
+ newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+ put_child(t, tn, i/2, NULL);
+ put_child(t, newBinNode, 0, left);
+ put_child(t, newBinNode, 1, right);
+ put_child(t, tn, i/2, resize(t, newBinNode));
+ }
+ tnode_free(oldtnode);
+ return tn;
+nomem:
+ {
+ int size = tnode_child_length(tn);
+ int j;
+
+ for (j = 0; j < size; j++)
+ if (tn->child[j])
+ tnode_free((struct tnode *)tn->child[j]);
+
+ tnode_free(tn);
+
+ return ERR_PTR(-ENOMEM);
+ }
+}
+
+static void trie_init(struct trie *t)
+{
+ if (!t)
+ return;
+
+ t->size = 0;
+ rcu_assign_pointer(t->trie, NULL);
+ t->revision = 0;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ memset(&t->stats, 0, sizeof(struct trie_use_stats));
+#endif
+}
+
+/* readside most use rcu_read_lock currently dump routines
+ via get_fa_head and dump */
+
+static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
+{
+ struct hlist_node *node;
+ struct leaf_info *li;
+
+ hlist_for_each_entry_rcu(li, node, head, hlist)
+ if (li->plen == plen)
+ return li;
+
+ return NULL;
+}
+
+static inline struct list_head * get_fa_head(struct leaf *l, int plen)
+{
+ struct leaf_info *li = find_leaf_info(&l->list, plen);
+
+ if (!li)
+ return NULL;
+
+ return &li->falh;
+}
+
+static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+{
+ struct leaf_info *li = NULL, *last = NULL;
+ struct hlist_node *node;
+
+ if (hlist_empty(head)) {
+ hlist_add_head_rcu(&new->hlist, head);
+ } else {
+ hlist_for_each_entry(li, node, head, hlist) {
+ if (new->plen > li->plen)
+ break;
+
+ last = li;
+ }
+ if (last)
+ hlist_add_after_rcu(&last->hlist, &new->hlist);
+ else
+ hlist_add_before_rcu(&new->hlist, &li->hlist);
+ }
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+
+static struct leaf *
+fib_find_node(struct trie *t, u32 key)
+{
+ int pos;
+ struct tnode *tn;
+ struct node *n;
+
+ pos = 0;
+ n = rcu_dereference(t->trie);
+
+ while (n != NULL && NODE_TYPE(n) == T_TNODE) {
+ tn = (struct tnode *) n;
+
+ check_tnode(tn);
+
+ if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
+ pos = tn->pos + tn->bits;
+ n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+ } else
+ break;
+ }
+ /* Case we have found a leaf. Compare prefixes */
+
+ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
+ return (struct leaf *)n;
+
+ return NULL;
+}
+
+static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
+{
+ int wasfull;
+ t_key cindex, key;
+ struct tnode *tp = NULL;
+
+ key = tn->key;
+
+ while (tn != NULL && NODE_PARENT(tn) != NULL) {
+
+ tp = NODE_PARENT(tn);
+ cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
+ tn = (struct tnode *) resize (t, (struct tnode *)tn);
+ tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+
+ if (!NODE_PARENT(tn))
+ break;
+
+ tn = NODE_PARENT(tn);
+ }
+ /* Handle last (top) tnode */
+ if (IS_TNODE(tn))
+ tn = (struct tnode*) resize(t, (struct tnode *)tn);
+
+ return (struct node*) tn;
+}
+
+/* only used from updater-side */
+
+static struct list_head *
+fib_insert_node(struct trie *t, int *err, u32 key, int plen)
+{
+ int pos, newpos;
+ struct tnode *tp = NULL, *tn = NULL;
+ struct node *n;
+ struct leaf *l;
+ int missbit;
+ struct list_head *fa_head = NULL;
+ struct leaf_info *li;
+ t_key cindex;
+
+ pos = 0;
+ n = t->trie;
+
+ /* If we point to NULL, stop. Either the tree is empty and we should
+ * just put a new leaf in if, or we have reached an empty child slot,
+ * and we should just put our new leaf in that.
+ * If we point to a T_TNODE, check if it matches our key. Note that
+ * a T_TNODE might be skipping any number of bits - its 'pos' need
+ * not be the parent's 'pos'+'bits'!
+ *
+ * If it does match the current key, get pos/bits from it, extract
+ * the index from our key, push the T_TNODE and walk the tree.
+ *
+ * If it doesn't, we have to replace it with a new T_TNODE.
+ *
+ * If we point to a T_LEAF, it might or might not have the same key
+ * as we do. If it does, just change the value, update the T_LEAF's
+ * value, and return it.
+ * If it doesn't, we need to replace it with a T_TNODE.
+ */
+
+ while (n != NULL && NODE_TYPE(n) == T_TNODE) {
+ tn = (struct tnode *) n;
+
+ check_tnode(tn);
+
+ if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
+ tp = tn;
+ pos = tn->pos + tn->bits;
+ n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+
+ BUG_ON(n && NODE_PARENT(n) != tn);
+ } else
+ break;
+ }
+
+ /*
+ * n ----> NULL, LEAF or TNODE
+ *
+ * tp is n's (parent) ----> NULL or TNODE
+ */
+
+ BUG_ON(tp && IS_LEAF(tp));
+
+ /* Case 1: n is a leaf. Compare prefixes */
+
+ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
+ struct leaf *l = (struct leaf *) n;
+
+ li = leaf_info_new(plen);
+
+ if (!li) {
+ *err = -ENOMEM;
+ goto err;
+ }
+
+ fa_head = &li->falh;
+ insert_leaf_info(&l->list, li);
+ goto done;
+ }
+ t->size++;
+ l = leaf_new();
+
+ if (!l) {
+ *err = -ENOMEM;
+ goto err;
+ }
+
+ l->key = key;
+ li = leaf_info_new(plen);
+
+ if (!li) {
+ tnode_free((struct tnode *) l);
+ *err = -ENOMEM;
+ goto err;
+ }
+
+ fa_head = &li->falh;
+ insert_leaf_info(&l->list, li);
+
+ if (t->trie && n == NULL) {
+ /* Case 2: n is NULL, and will just insert a new leaf */
+
+ NODE_SET_PARENT(l, tp);
+
+ cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ } else {
+ /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
+ /*
+ * Add a new tnode here
+ * first tnode need some special handling
+ */
+
+ if (tp)
+ pos = tp->pos+tp->bits;
+ else
+ pos = 0;
+
+ if (n) {
+ newpos = tkey_mismatch(key, pos, n->key);
+ tn = tnode_new(n->key, newpos, 1);
+ } else {
+ newpos = 0;
+ tn = tnode_new(key, newpos, 1); /* First tnode */
+ }
+
+ if (!tn) {
+ free_leaf_info(li);
+ tnode_free((struct tnode *) l);
+ *err = -ENOMEM;
+ goto err;
+ }
+
+ NODE_SET_PARENT(tn, tp);
+
+ missbit = tkey_extract_bits(key, newpos, 1);
+ put_child(t, tn, missbit, (struct node *)l);
+ put_child(t, tn, 1-missbit, n);
+
+ if (tp) {
+ cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
+ } else {
+ rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
+ tp = tn;
+ }
+ }
+
+ if (tp && tp->pos + tp->bits > 32)
+ printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
+ tp, tp->pos, tp->bits, key, plen);
+
+ /* Rebalance the trie */
+
+ rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+done:
+ t->revision++;
+err:
+ return fa_head;
+}
+
+static int
+fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct fib_alias *fa, *new_fa;
+ struct list_head *fa_head = NULL;
+ struct fib_info *fi;
+ int plen = r->rtm_dst_len;
+ int type = r->rtm_type;
+ u8 tos = r->rtm_tos;
+ u32 key, mask;
+ int err;
+ struct leaf *l;
+
+ if (plen > 32)
+ return -EINVAL;
+
+ key = 0;
+ if (rta->rta_dst)
+ memcpy(&key, rta->rta_dst, 4);
+
+ key = ntohl(key);
+
+ pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+
+ mask = ntohl(inet_make_mask(plen));
+
+ if (key & ~mask)
+ return -EINVAL;
+
+ key = key & mask;
+
+ fi = fib_create_info(r, rta, nlhdr, &err);
+
+ if (!fi)
+ goto err;
+
+ l = fib_find_node(t, key);
+ fa = NULL;
+
+ if (l) {
+ fa_head = get_fa_head(l, plen);
+ fa = fib_find_alias(fa_head, tos, fi->fib_priority);
+ }
+
+ /* Now fa, if non-NULL, points to the first fib alias
+ * with the same keys [prefix,tos,priority], if such key already
+ * exists or to the node before which we will insert new one.
+ *
+ * If fa is NULL, we will need to allocate a new one and
+ * insert to the head of f.
+ *
+ * If f is NULL, no fib node matched the destination key
+ * and we need to allocate a new one of those as well.
+ */
+
+ if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
+ struct fib_alias *fa_orig;
+
+ err = -EEXIST;
+ if (nlhdr->nlmsg_flags & NLM_F_EXCL)
+ goto out;
+
+ if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
+ struct fib_info *fi_drop;
+ u8 state;
+
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+ if (new_fa == NULL)
+ goto out;
+
+ fi_drop = fa->fa_info;
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = type;
+ new_fa->fa_scope = r->rtm_scope;
+ state = fa->fa_state;
+ new_fa->fa_state &= ~FA_S_ACCESSED;
+
+ list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+ alias_free_mem_rcu(fa);
+
+ fib_release_info(fi_drop);
+ if (state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
+
+ goto succeeded;
+ }
+ /* Error if we find a perfect match which
+ * uses the same scope, type, and nexthop
+ * information.
+ */
+ fa_orig = fa;
+ list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
+ if (fa->fa_tos != tos)
+ break;
+ if (fa->fa_info->fib_priority != fi->fib_priority)
+ break;
+ if (fa->fa_type == type &&
+ fa->fa_scope == r->rtm_scope &&
+ fa->fa_info == fi) {
+ goto out;
+ }
+ }
+ if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
+ fa = fa_orig;
+ }
+ err = -ENOENT;
+ if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
+ goto out;
+
+ err = -ENOBUFS;
+ new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+ if (new_fa == NULL)
+ goto out;
+
+ new_fa->fa_info = fi;
+ new_fa->fa_tos = tos;
+ new_fa->fa_type = type;
+ new_fa->fa_scope = r->rtm_scope;
+ new_fa->fa_state = 0;
+ /*
+ * Insert new entry to the list.
+ */
+
+ if (!fa_head) {
+ fa_head = fib_insert_node(t, &err, key, plen);
+ err = 0;
+ if (err)
+ goto out_free_new_fa;
+ }
+
+ list_add_tail_rcu(&new_fa->fa_list,
+ (fa ? &fa->fa_list : fa_head));
+
+ rt_cache_flush(-1);
+ rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
+succeeded:
+ return 0;
+
+out_free_new_fa:
+ kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+ fib_release_info(fi);
+err:
+ return err;
+}
+
+
+/* should be clalled with rcu_read_lock */
+static inline int check_leaf(struct trie *t, struct leaf *l,
+ t_key key, int *plen, const struct flowi *flp,
+ struct fib_result *res)
+{
+ int err, i;
+ t_key mask;
+ struct leaf_info *li;
+ struct hlist_head *hhead = &l->list;
+ struct hlist_node *node;
+
+ hlist_for_each_entry_rcu(li, node, hhead, hlist) {
+ i = li->plen;
+ mask = ntohl(inet_make_mask(i));
+ if (l->key != (key & mask))
+ continue;
+
+ if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
+ *plen = i;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_passed++;
+#endif
+ return err;
+ }
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.semantic_match_miss++;
+#endif
+ }
+ return 1;
+}
+
+static int
+fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ int plen, ret = 0;
+ struct node *n;
+ struct tnode *pn;
+ int pos, bits;
+ t_key key = ntohl(flp->fl4_dst);
+ int chopped_off;
+ t_key cindex = 0;
+ int current_prefix_length = KEYLENGTH;
+ struct tnode *cn;
+ t_key node_prefix, key_prefix, pref_mismatch;
+ int mp;
+
+ rcu_read_lock();
+
+ n = rcu_dereference(t->trie);
+ if (!n)
+ goto failed;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.gets++;
+#endif
+
+ /* Just a leaf? */
+ if (IS_LEAF(n)) {
+ if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
+ goto found;
+ goto failed;
+ }
+ pn = (struct tnode *) n;
+ chopped_off = 0;
+
+ while (pn) {
+ pos = pn->pos;
+ bits = pn->bits;
+
+ if (!chopped_off)
+ cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
+
+ n = tnode_get_child(pn, cindex);
+
+ if (n == NULL) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.null_node_hit++;
+#endif
+ goto backtrace;
+ }
+
+ if (IS_LEAF(n)) {
+ if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
+ goto found;
+ else
+ goto backtrace;
+ }
+
+#define HL_OPTIMIZE
+#ifdef HL_OPTIMIZE
+ cn = (struct tnode *)n;
+
+ /*
+ * It's a tnode, and we can do some extra checks here if we
+ * like, to avoid descending into a dead-end branch.
+ * This tnode is in the parent's child array at index
+ * key[p_pos..p_pos+p_bits] but potentially with some bits
+ * chopped off, so in reality the index may be just a
+ * subprefix, padded with zero at the end.
+ * We can also take a look at any skipped bits in this
+ * tnode - everything up to p_pos is supposed to be ok,
+ * and the non-chopped bits of the index (se previous
+ * paragraph) are also guaranteed ok, but the rest is
+ * considered unknown.
+ *
+ * The skipped bits are key[pos+bits..cn->pos].
+ */
+
+ /* If current_prefix_length < pos+bits, we are already doing
+ * actual prefix matching, which means everything from
+ * pos+(bits-chopped_off) onward must be zero along some
+ * branch of this subtree - otherwise there is *no* valid
+ * prefix present. Here we can only check the skipped
+ * bits. Remember, since we have already indexed into the
+ * parent's child array, we know that the bits we chopped of
+ * *are* zero.
+ */
+
+ /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+
+ if (current_prefix_length < pos+bits) {
+ if (tkey_extract_bits(cn->key, current_prefix_length,
+ cn->pos - current_prefix_length) != 0 ||
+ !(cn->child[0]))
+ goto backtrace;
+ }
+
+ /*
+ * If chopped_off=0, the index is fully validated and we
+ * only need to look at the skipped bits for this, the new,
+ * tnode. What we actually want to do is to find out if
+ * these skipped bits match our key perfectly, or if we will
+ * have to count on finding a matching prefix further down,
+ * because if we do, we would like to have some way of
+ * verifying the existence of such a prefix at this point.
+ */
+
+ /* The only thing we can do at this point is to verify that
+ * any such matching prefix can indeed be a prefix to our
+ * key, and if the bits in the node we are inspecting that
+ * do not match our key are not ZERO, this cannot be true.
+ * Thus, find out where there is a mismatch (before cn->pos)
+ * and verify that all the mismatching bits are zero in the
+ * new tnode's key.
+ */
+
+ /* Note: We aren't very concerned about the piece of the key
+ * that precede pn->pos+pn->bits, since these have already been
+ * checked. The bits after cn->pos aren't checked since these are
+ * by definition "unknown" at this point. Thus, what we want to
+ * see is if we are about to enter the "prefix matching" state,
+ * and in that case verify that the skipped bits that will prevail
+ * throughout this subtree are zero, as they have to be if we are
+ * to find a matching prefix.
+ */
+
+ node_prefix = MASK_PFX(cn->key, cn->pos);
+ key_prefix = MASK_PFX(key, cn->pos);
+ pref_mismatch = key_prefix^node_prefix;
+ mp = 0;
+
+ /* In short: If skipped bits in this node do not match the search
+ * key, enter the "prefix matching" state.directly.
+ */
+ if (pref_mismatch) {
+ while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
+ mp++;
+ pref_mismatch = pref_mismatch <<1;
+ }
+ key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+
+ if (key_prefix != 0)
+ goto backtrace;
+
+ if (current_prefix_length >= cn->pos)
+ current_prefix_length = mp;
+ }
+#endif
+ pn = (struct tnode *)n; /* Descend */
+ chopped_off = 0;
+ continue;
+
+backtrace:
+ chopped_off++;
+
+ /* As zero don't change the child key (cindex) */
+ while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
+ chopped_off++;
+
+ /* Decrease current_... with bits chopped off */
+ if (current_prefix_length > pn->pos + pn->bits - chopped_off)
+ current_prefix_length = pn->pos + pn->bits - chopped_off;
+
+ /*
+ * Either we do the actual chop off according or if we have
+ * chopped off all bits in this tnode walk up to our parent.
+ */
+
+ if (chopped_off <= pn->bits) {
+ cindex &= ~(1 << (chopped_off-1));
+ } else {
+ if (NODE_PARENT(pn) == NULL)
+ goto failed;
+
+ /* Get Child's index */
+ cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
+ pn = NODE_PARENT(pn);
+ chopped_off = 0;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats.backtrack++;
+#endif
+ goto backtrace;
+ }
+ }
+failed:
+ ret = 1;
+found:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* only called from updater side */
+static int trie_leaf_remove(struct trie *t, t_key key)
+{
+ t_key cindex;
+ struct tnode *tp = NULL;
+ struct node *n = t->trie;
+ struct leaf *l;
+
+ pr_debug("entering trie_leaf_remove(%p)\n", n);
+
+ /* Note that in the case skipped bits, those bits are *not* checked!
+ * When we finish this, we will have NULL or a T_LEAF, and the
+ * T_LEAF may or may not match our key.
+ */
+
+ while (n != NULL && IS_TNODE(n)) {
+ struct tnode *tn = (struct tnode *) n;
+ check_tnode(tn);
+ n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
+
+ BUG_ON(n && NODE_PARENT(n) != tn);
+ }
+ l = (struct leaf *) n;
+
+ if (!n || !tkey_equals(l->key, key))
+ return 0;
+
+ /*
+ * Key found.
+ * Remove the leaf and rebalance the tree
+ */
+
+ t->revision++;
+ t->size--;
+
+ preempt_disable();
+ tp = NODE_PARENT(n);
+ tnode_free((struct tnode *) n);
+
+ if (tp) {
+ cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ put_child(t, (struct tnode *)tp, cindex, NULL);
+ rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+ } else
+ rcu_assign_pointer(t->trie, NULL);
+ preempt_enable();
+
+ return 1;
+}
+
+static int
+fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+ struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ u32 key, mask;
+ int plen = r->rtm_dst_len;
+ u8 tos = r->rtm_tos;
+ struct fib_alias *fa, *fa_to_delete;
+ struct list_head *fa_head;
+ struct leaf *l;
+ struct leaf_info *li;
+
+
+ if (plen > 32)
+ return -EINVAL;
+
+ key = 0;
+ if (rta->rta_dst)
+ memcpy(&key, rta->rta_dst, 4);
+
+ key = ntohl(key);
+ mask = ntohl(inet_make_mask(plen));
+
+ if (key & ~mask)
+ return -EINVAL;
+
+ key = key & mask;
+ l = fib_find_node(t, key);
+
+ if (!l)
+ return -ESRCH;
+
+ fa_head = get_fa_head(l, plen);
+ fa = fib_find_alias(fa_head, tos, 0);
+
+ if (!fa)
+ return -ESRCH;
+
+ pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+
+ fa_to_delete = NULL;
+ fa_head = fa->fa_list.prev;
+
+ list_for_each_entry(fa, fa_head, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fa->fa_tos != tos)
+ break;
+
+ if ((!r->rtm_type ||
+ fa->fa_type == r->rtm_type) &&
+ (r->rtm_scope == RT_SCOPE_NOWHERE ||
+ fa->fa_scope == r->rtm_scope) &&
+ (!r->rtm_protocol ||
+ fi->fib_protocol == r->rtm_protocol) &&
+ fib_nh_match(r, nlhdr, rta, fi) == 0) {
+ fa_to_delete = fa;
+ break;
+ }
+ }
+
+ if (!fa_to_delete)
+ return -ESRCH;
+
+ fa = fa_to_delete;
+ rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+
+ l = fib_find_node(t, key);
+ li = find_leaf_info(&l->list, plen);
+
+ list_del_rcu(&fa->fa_list);
+
+ if (list_empty(fa_head)) {
+ hlist_del_rcu(&li->hlist);
+ free_leaf_info(li);
+ }
+
+ if (hlist_empty(&l->list))
+ trie_leaf_remove(t, key);
+
+ if (fa->fa_state & FA_S_ACCESSED)
+ rt_cache_flush(-1);
+
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ return 0;
+}
+
+static int trie_flush_list(struct trie *t, struct list_head *head)
+{
+ struct fib_alias *fa, *fa_node;
+ int found = 0;
+
+ list_for_each_entry_safe(fa, fa_node, head, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+ list_del_rcu(&fa->fa_list);
+ fib_release_info(fa->fa_info);
+ alias_free_mem_rcu(fa);
+ found++;
+ }
+ }
+ return found;
+}
+
+static int trie_flush_leaf(struct trie *t, struct leaf *l)
+{
+ int found = 0;
+ struct hlist_head *lih = &l->list;
+ struct hlist_node *node, *tmp;
+ struct leaf_info *li = NULL;
+
+ hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
+ found += trie_flush_list(t, &li->falh);
+
+ if (list_empty(&li->falh)) {
+ hlist_del_rcu(&li->hlist);
+ free_leaf_info(li);
+ }
+ }
+ return found;
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+
+static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+{
+ struct node *c = (struct node *) thisleaf;
+ struct tnode *p;
+ int idx;
+ struct node *trie = rcu_dereference(t->trie);
+
+ if (c == NULL) {
+ if (trie == NULL)
+ return NULL;
+
+ if (IS_LEAF(trie)) /* trie w. just a leaf */
+ return (struct leaf *) trie;
+
+ p = (struct tnode*) trie; /* Start */
+ } else
+ p = (struct tnode *) NODE_PARENT(c);
+
+ while (p) {
+ int pos, last;
+
+ /* Find the next child of the parent */
+ if (c)
+ pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+ else
+ pos = 0;
+
+ last = 1 << p->bits;
+ for (idx = pos; idx < last ; idx++) {
+ c = rcu_dereference(p->child[idx]);
+
+ if (!c)
+ continue;
+
+ /* Decend if tnode */
+ while (IS_TNODE(c)) {
+ p = (struct tnode *) c;
+ idx = 0;
+
+ /* Rightmost non-NULL branch */
+ if (p && IS_TNODE(p))
+ while (!(c = rcu_dereference(p->child[idx]))
+ && idx < (1<<p->bits)) idx++;
+
+ /* Done with this tnode? */
+ if (idx >= (1 << p->bits) || !c)
+ goto up;
+ }
+ return (struct leaf *) c;
+ }
+up:
+ /* No more children go up one step */
+ c = (struct node *) p;
+ p = (struct tnode *) NODE_PARENT(p);
+ }
+ return NULL; /* Ready. Root of trie */
+}
+
+static int fn_trie_flush(struct fib_table *tb)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ struct leaf *ll = NULL, *l = NULL;
+ int found = 0, h;
+
+ t->revision++;
+
+ rcu_read_lock();
+ for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+ found += trie_flush_leaf(t, l);
+
+ if (ll && hlist_empty(&ll->list))
+ trie_leaf_remove(t, ll->key);
+ ll = l;
+ }
+ rcu_read_unlock();
+
+ if (ll && hlist_empty(&ll->list))
+ trie_leaf_remove(t, ll->key);
+
+ pr_debug("trie_flush found=%d\n", found);
+ return found;
+}
+
+static int trie_last_dflt = -1;
+
+static void
+fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+ struct trie *t = (struct trie *) tb->tb_data;
+ int order, last_idx;
+ struct fib_info *fi = NULL;
+ struct fib_info *last_resort;
+ struct fib_alias *fa = NULL;
+ struct list_head *fa_head;
+ struct leaf *l;
+
+ last_idx = -1;
+ last_resort = NULL;
+ order = -1;
+
+ rcu_read_lock();
+
+ l = fib_find_node(t, 0);
+ if (!l)
+ goto out;
+
+ fa_head = get_fa_head(l, 0);
+ if (!fa_head)
+ goto out;
+
+ if (list_empty(fa_head))
+ goto out;
+
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (fa->fa_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+ fa->fa_state |= FA_S_ACCESSED;
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, &trie_last_dflt)) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ trie_last_dflt = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+ if (order <= 0 || fi == NULL) {
+ trie_last_dflt = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ trie_last_dflt = order;
+ goto out;
+ }
+ if (last_idx >= 0) {
+ if (res->fi)
+ fib_info_put(res->fi);
+ res->fi = last_resort;
+ if (last_resort)
+ atomic_inc(&last_resort->fib_clntref);
+ }
+ trie_last_dflt = last_idx;
+ out:;
+ rcu_read_unlock();
+}
+
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int i, s_i;
+ struct fib_alias *fa;
+
+ u32 xkey = htonl(key);
+
+ s_i = cb->args[3];
+ i = 0;
+
+ /* rcu_read_lock is hold by caller */
+
+ list_for_each_entry_rcu(fa, fah, fa_list) {
+ if (i < s_i) {
+ i++;
+ continue;
+ }
+ if (fa->fa_info->fib_nh == NULL) {
+ printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
+ i++;
+ continue;
+ }
+ if (fa->fa_info == NULL) {
+ printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
+ i++;
+ continue;
+ }
+
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id,
+ fa->fa_type,
+ fa->fa_scope,
+ &xkey,
+ plen,
+ fa->fa_tos,
+ fa->fa_info, 0) < 0) {
+ cb->args[3] = i;
+ return -1;
+ }
+ i++;
+ }
+ cb->args[3] = i;
+ return skb->len;
+}
+
+static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int h, s_h;
+ struct list_head *fa_head;
+ struct leaf *l = NULL;
+
+ s_h = cb->args[2];
+
+ for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
+ if (h < s_h)
+ continue;
+ if (h > s_h)
+ memset(&cb->args[3], 0,
+ sizeof(cb->args) - 3*sizeof(cb->args[0]));
+
+ fa_head = get_fa_head(l, plen);
+
+ if (!fa_head)
+ continue;
+
+ if (list_empty(fa_head))
+ continue;
+
+ if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
+ cb->args[2] = h;
+ return -1;
+ }
+ }
+ cb->args[2] = h;
+ return skb->len;
+}
+
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int m, s_m;
+ struct trie *t = (struct trie *) tb->tb_data;
+
+ s_m = cb->args[1];
+
+ rcu_read_lock();
+ for (m = 0; m <= 32; m++) {
+ if (m < s_m)
+ continue;
+ if (m > s_m)
+ memset(&cb->args[2], 0,
+ sizeof(cb->args) - 2*sizeof(cb->args[0]));
+
+ if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
+ cb->args[1] = m;
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ cb->args[1] = m;
+ return skb->len;
+out:
+ rcu_read_unlock();
+ return -1;
+}
+
+/* Fix more generic FIB names for init later */
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+struct fib_table * fib_hash_init(int id)
+#else
+struct fib_table * __init fib_hash_init(int id)
+#endif
+{
+ struct fib_table *tb;
+ struct trie *t;
+
+ if (fn_alias_kmem == NULL)
+ fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+ sizeof(struct fib_alias),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
+ tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
+ GFP_KERNEL);
+ if (tb == NULL)
+ return NULL;
+
+ tb->tb_id = id;
+ tb->tb_lookup = fn_trie_lookup;
+ tb->tb_insert = fn_trie_insert;
+ tb->tb_delete = fn_trie_delete;
+ tb->tb_flush = fn_trie_flush;
+ tb->tb_select_default = fn_trie_select_default;
+ tb->tb_dump = fn_trie_dump;
+ memset(tb->tb_data, 0, sizeof(struct trie));
+
+ t = (struct trie *) tb->tb_data;
+
+ trie_init(t);
+
+ if (id == RT_TABLE_LOCAL)
+ trie_local = t;
+ else if (id == RT_TABLE_MAIN)
+ trie_main = t;
+
+ if (id == RT_TABLE_LOCAL)
+ printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
+
+ return tb;
+}
+
+/* Trie dump functions */
+
+static void putspace_seq(struct seq_file *seq, int n)
+{
+ while (n--)
+ seq_printf(seq, " ");
+}
+
+static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
+{
+ while (bits--)
+ seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
+}
+
+static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
+ int pend, int cindex, int bits)
+{
+ putspace_seq(seq, indent);
+ if (IS_LEAF(n))
+ seq_printf(seq, "|");
+ else
+ seq_printf(seq, "+");
+ if (bits) {
+ seq_printf(seq, "%d/", cindex);
+ printbin_seq(seq, cindex, bits);
+ seq_printf(seq, ": ");
+ } else
+ seq_printf(seq, "<root>: ");
+ seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
+
+ if (IS_LEAF(n)) {
+ struct leaf *l = (struct leaf *)n;
+ struct fib_alias *fa;
+ int i;
+
+ seq_printf(seq, "key=%d.%d.%d.%d\n",
+ n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
+
+ for (i = 32; i >= 0; i--)
+ if (find_leaf_info(&l->list, i)) {
+ struct list_head *fa_head = get_fa_head(l, i);
+
+ if (!fa_head)
+ continue;
+
+ if (list_empty(fa_head))
+ continue;
+
+ putspace_seq(seq, indent+2);
+ seq_printf(seq, "{/%d...dumping}\n", i);
+
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ putspace_seq(seq, indent+2);
+ if (fa->fa_info == NULL) {
+ seq_printf(seq, "Error fa_info=NULL\n");
+ continue;
+ }
+ if (fa->fa_info->fib_nh == NULL) {
+ seq_printf(seq, "Error _fib_nh=NULL\n");
+ continue;
+ }
+
+ seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
+ fa->fa_type,
+ fa->fa_scope,
+ fa->fa_tos);
+ }
+ }
+ } else {
+ struct tnode *tn = (struct tnode *)n;
+ int plen = ((struct tnode *)n)->pos;
+ t_key prf = MASK_PFX(n->key, plen);
+
+ seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
+ prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
+
+ putspace_seq(seq, indent); seq_printf(seq, "| ");
+ seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
+ printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
+ seq_printf(seq, "}\n");
+ putspace_seq(seq, indent); seq_printf(seq, "| ");
+ seq_printf(seq, "{pos=%d", tn->pos);
+ seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
+ seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
+ putspace_seq(seq, indent); seq_printf(seq, "| ");
+ seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
+ }
+}
+
+static void trie_dump_seq(struct seq_file *seq, struct trie *t)
+{
+ struct node *n;
+ int cindex = 0;
+ int indent = 1;
+ int pend = 0;
+ int depth = 0;
+ struct tnode *tn;
+
+ rcu_read_lock();
+ n = rcu_dereference(t->trie);
+ seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
+
+ if (!n) {
+ seq_printf(seq, "------ trie is empty\n");
+
+ rcu_read_unlock();
+ return;
+ }
+
+ printnode_seq(seq, indent, n, pend, cindex, 0);
+
+ if (!IS_TNODE(n)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ tn = (struct tnode *)n;
+ pend = tn->pos+tn->bits;
+ putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
+ indent += 3;
+ depth++;
+
+ while (tn && cindex < (1 << tn->bits)) {
+ struct node *child = rcu_dereference(tn->child[cindex]);
+ if (!child)
+ cindex++;
+ else {
+ /* Got a child */
+ printnode_seq(seq, indent, child, pend,
+ cindex, tn->bits);
+
+ if (IS_LEAF(child))
+ cindex++;
+
+ else {
+ /*
+ * New tnode. Decend one level
+ */
+
+ depth++;
+ n = child;
+ tn = (struct tnode *)n;
+ pend = tn->pos+tn->bits;
+ putspace_seq(seq, indent);
+ seq_printf(seq, "\\--\n");
+ indent += 3;
+ cindex = 0;
+ }
+ }
+
+ /*
+ * Test if we are done
+ */
+
+ while (cindex >= (1 << tn->bits)) {
+ /*
+ * Move upwards and test for root
+ * pop off all traversed nodes
+ */
+
+ if (NODE_PARENT(tn) == NULL) {
+ tn = NULL;
+ break;
+ }
+
+ cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+ cindex++;
+ tn = NODE_PARENT(tn);
+ pend = tn->pos + tn->bits;
+ indent -= 3;
+ depth--;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static struct trie_stat *trie_stat_new(void)
+{
+ struct trie_stat *s;
+ int i;
+
+ s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
+ if (!s)
+ return NULL;
+
+ s->totdepth = 0;
+ s->maxdepth = 0;
+ s->tnodes = 0;
+ s->leaves = 0;
+ s->nullpointers = 0;
+
+ for (i = 0; i < MAX_CHILDS; i++)
+ s->nodesizes[i] = 0;
+
+ return s;
+}
+
+static struct trie_stat *trie_collect_stats(struct trie *t)
+{
+ struct node *n;
+ struct trie_stat *s = trie_stat_new();
+ int cindex = 0;
+ int pend = 0;
+ int depth = 0;
+
+ if (!s)
+ return NULL;
+
+ rcu_read_lock();
+ n = rcu_dereference(t->trie);
+
+ if (!n)
+ return s;
+
+ if (IS_TNODE(n)) {
+ struct tnode *tn = (struct tnode *)n;
+ pend = tn->pos+tn->bits;
+ s->nodesizes[tn->bits]++;
+ depth++;
+
+ while (tn && cindex < (1 << tn->bits)) {
+ struct node *ch = rcu_dereference(tn->child[cindex]);
+ if (ch) {
+
+ /* Got a child */
+
+ if (IS_LEAF(tn->child[cindex])) {
+ cindex++;
+
+ /* stats */
+ if (depth > s->maxdepth)
+ s->maxdepth = depth;
+ s->totdepth += depth;
+ s->leaves++;
+ } else {
+ /*
+ * New tnode. Decend one level
+ */
+
+ s->tnodes++;
+ s->nodesizes[tn->bits]++;
+ depth++;
+
+ n = ch;
+ tn = (struct tnode *)n;
+ pend = tn->pos+tn->bits;
+
+ cindex = 0;
+ }
+ } else {
+ cindex++;
+ s->nullpointers++;
+ }
+
+ /*
+ * Test if we are done
+ */
+
+ while (cindex >= (1 << tn->bits)) {
+ /*
+ * Move upwards and test for root
+ * pop off all traversed nodes
+ */
+
+ if (NODE_PARENT(tn) == NULL) {
+ tn = NULL;
+ n = NULL;
+ break;
+ }
+
+ cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+ tn = NODE_PARENT(tn);
+ cindex++;
+ n = (struct node *)tn;
+ pend = tn->pos+tn->bits;
+ depth--;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+ return s;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
+{
+ return NULL;
+}
+
+static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
+{
+ return NULL;
+}
+
+static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (!ip_fib_main_table)
+ return NULL;
+
+ if (*pos)
+ return fib_triestat_get_next(seq);
+ else
+ return SEQ_START_TOKEN;
+}
+
+static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return fib_triestat_get_first(seq);
+ else
+ return fib_triestat_get_next(seq);
+}
+
+static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+/*
+ * This outputs /proc/net/fib_triestats
+ *
+ * It always works in backward compatibility mode.
+ * The format of the file is not supposed to be changed.
+ */
+
+static void collect_and_show(struct trie *t, struct seq_file *seq)
+{
+ int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
+ int i, max, pointers;
+ struct trie_stat *stat;
+ int avdepth;
+
+ stat = trie_collect_stats(t);
+
+ bytes = 0;
+ seq_printf(seq, "trie=%p\n", t);
+
+ if (stat) {
+ if (stat->leaves)
+ avdepth = stat->totdepth*100 / stat->leaves;
+ else
+ avdepth = 0;
+ seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
+ seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
+
+ seq_printf(seq, "Leaves: %d\n", stat->leaves);
+ bytes += sizeof(struct leaf) * stat->leaves;
+ seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
+ bytes += sizeof(struct tnode) * stat->tnodes;
+
+ max = MAX_CHILDS-1;
+
+ while (max >= 0 && stat->nodesizes[max] == 0)
+ max--;
+ pointers = 0;
+
+ for (i = 1; i <= max; i++)
+ if (stat->nodesizes[i] != 0) {
+ seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
+ pointers += (1<<i) * stat->nodesizes[i];
+ }
+ seq_printf(seq, "\n");
+ seq_printf(seq, "Pointers: %d\n", pointers);
+ bytes += sizeof(struct node *) * pointers;
+ seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
+ seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
+
+ kfree(stat);
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ seq_printf(seq, "Counters:\n---------\n");
+ seq_printf(seq,"gets = %d\n", t->stats.gets);
+ seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
+ seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
+ seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
+ seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
+ seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
+#ifdef CLEAR_STATS
+ memset(&(t->stats), 0, sizeof(t->stats));
+#endif
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
+}
+
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+ char bf[128];
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
+ sizeof(struct leaf), sizeof(struct tnode));
+ if (trie_local)
+ collect_and_show(trie_local, seq);
+
+ if (trie_main)
+ collect_and_show(trie_main, seq);
+ } else {
+ snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
+
+ seq_printf(seq, "%-127s\n", bf);
+ }
+ return 0;
+}
+
+static struct seq_operations fib_triestat_seq_ops = {
+ .start = fib_triestat_seq_start,
+ .next = fib_triestat_seq_next,
+ .stop = fib_triestat_seq_stop,
+ .show = fib_triestat_seq_show,
+};
+
+static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+
+ rc = seq_open(file, &fib_triestat_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+out:
+ return rc;
+out_kfree:
+ goto out;
+}
+
+static struct file_operations fib_triestat_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_triestat_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+int __init fib_stat_proc_init(void)
+{
+ if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+void __init fib_stat_proc_exit(void)
+{
+ proc_net_remove("fib_triestat");
+}
+
+static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
+{
+ return NULL;
+}
+
+static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
+{
+ return NULL;
+}
+
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (!ip_fib_main_table)
+ return NULL;
+
+ if (*pos)
+ return fib_trie_get_next(seq);
+ else
+ return SEQ_START_TOKEN;
+}
+
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return fib_trie_get_first(seq);
+ else
+ return fib_trie_get_next(seq);
+
+}
+
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * This outputs /proc/net/fib_trie.
+ *
+ * It always works in backward compatibility mode.
+ * The format of the file is not supposed to be changed.
+ */
+
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+ char bf[128];
+
+ if (v == SEQ_START_TOKEN) {
+ if (trie_local)
+ trie_dump_seq(seq, trie_local);
+
+ if (trie_main)
+ trie_dump_seq(seq, trie_main);
+ } else {
+ snprintf(bf, sizeof(bf),
+ "*\t%08X\t%08X", 200, 400);
+ seq_printf(seq, "%-127s\n", bf);
+ }
+
+ return 0;
+}
+
+static struct seq_operations fib_trie_seq_ops = {
+ .start = fib_trie_seq_start,
+ .next = fib_trie_seq_next,
+ .stop = fib_trie_seq_stop,
+ .show = fib_trie_seq_show,
+};
+
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+
+ rc = seq_open(file, &fib_trie_seq_ops);
+ if (rc)
+ goto out_kfree;
+
+ seq = file->private_data;
+out:
+ return rc;
+out_kfree:
+ goto out;
+}
+
+static struct file_operations fib_trie_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_trie_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release= seq_release_private,
+};
+
+int __init fib_proc_init(void)
+{
+ if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
+ return -ENOMEM;
+ return 0;
+}
+
+void __init fib_proc_exit(void)
+{
+ proc_net_remove("fib_trie");
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 85bf0d3..24eb56a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
/*
* Statistics
*/
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -207,6 +207,7 @@ int sysctl_icmp_ignore_bogus_error_responses;
int sysctl_icmp_ratelimit = 1 * HZ;
int sysctl_icmp_ratemask = 0x1818;
+int sysctl_icmp_errors_use_inbound_ifaddr;
/*
* ICMP control array. This specifies what to do with each ICMP.
@@ -348,12 +349,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
{
struct sk_buff *skb;
- ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
- icmp_param->data_len+icmp_param->head_len,
- icmp_param->head_len,
- ipc, rt, MSG_DONTWAIT);
-
- if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+ if (ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+ icmp_param->data_len+icmp_param->head_len,
+ icmp_param->head_len,
+ ipc, rt, MSG_DONTWAIT) < 0)
+ ip_flush_pending_frames(icmp_socket->sk);
+ else if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = skb->h.icmph;
unsigned int csum = 0;
struct sk_buff *skb1;
@@ -511,8 +512,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
*/
saddr = iph->daddr;
- if (!(rt->rt_flags & RTCF_LOCAL))
- saddr = 0;
+ if (!(rt->rt_flags & RTCF_LOCAL)) {
+ if (sysctl_icmp_errors_use_inbound_ifaddr)
+ saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK);
+ else
+ saddr = 0;
+ }
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
@@ -622,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
break;
case ICMP_FRAG_NEEDED:
if (ipv4_config.no_pmtu_disc) {
- LIMIT_NETDEBUG(
- printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+ LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
"fragmentation needed "
"and DF set.\n",
- NIPQUAD(iph->daddr)));
+ NIPQUAD(iph->daddr));
} else {
info = ip_rt_frag_needed(iph,
ntohs(icmph->un.frag.mtu));
@@ -635,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
}
break;
case ICMP_SR_FAILED:
- LIMIT_NETDEBUG(
- printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+ LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
"Route Failed.\n",
- NIPQUAD(iph->daddr)));
+ NIPQUAD(iph->daddr));
break;
default:
break;
@@ -931,8 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
case CHECKSUM_NONE:
if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
goto error;
@@ -965,7 +967,8 @@ int icmp_rcv(struct sk_buff *skb)
* RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
* discarded if to broadcast/multicast.
*/
- if (icmph->type == ICMP_ECHO &&
+ if ((icmph->type == ICMP_ECHO ||
+ icmph->type == ICMP_TIMESTAMP) &&
sysctl_icmp_echo_ignore_broadcasts) {
goto error;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f31831..44607f4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
case IGMP_MTRACE_RESP:
break;
default:
- NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+ NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
}
in_dev_put(in_dev);
kfree_skb(skb);
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
{
int err;
u32 addr = imr->imr_multiaddr.s_addr;
- struct ip_mc_socklist *iml, *i;
+ struct ip_mc_socklist *iml=NULL, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
+ int ifindex;
int count = 0;
if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
goto done;
}
- iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
-
err = -EADDRINUSE;
+ ifindex = imr->imr_ifindex;
for (i = inet->mc_list; i; i = i->next) {
- if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
- /* New style additions are reference counted */
- if (imr->imr_address.s_addr == 0) {
- i->count++;
- err = 0;
- }
+ if (i->multi.imr_multiaddr.s_addr == addr &&
+ i->multi.imr_ifindex == ifindex)
goto done;
- }
count++;
}
err = -ENOBUFS;
- if (iml == NULL || count >= sysctl_igmp_max_memberships)
+ if (count >= sysctl_igmp_max_memberships)
+ goto done;
+ iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
+ if (iml == NULL)
goto done;
+
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next = inet->mc_list;
- iml->count = 1;
iml->sflist = NULL;
iml->sfmode = MCAST_EXCLUDE;
inet->mc_list = iml;
ip_mc_inc_group(in_dev, addr);
- iml = NULL;
err = 0;
-
done:
rtnl_shunlock();
- if (iml)
- sock_kfree_s(sk, iml, sizeof(*iml));
return err;
}
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml, **imlp;
+ struct in_device *in_dev;
+ u32 group = imr->imr_multiaddr.s_addr;
+ u32 ifindex;
rtnl_lock();
+ in_dev = ip_mc_find_dev(imr);
+ if (!in_dev) {
+ rtnl_unlock();
+ return -ENODEV;
+ }
+ ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
- if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
- iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
- (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
- struct in_device *in_dev;
-
- in_dev = inetdev_by_index(iml->multi.imr_ifindex);
- if (in_dev)
- (void) ip_mc_leave_src(sk, iml, in_dev);
- if (--iml->count) {
- rtnl_unlock();
- if (in_dev)
- in_dev_put(in_dev);
- return 0;
- }
+ if (iml->multi.imr_multiaddr.s_addr == group &&
+ iml->multi.imr_ifindex == ifindex) {
+ (void) ip_mc_leave_src(sk, iml, in_dev);
*imlp = iml->next;
- if (in_dev) {
- ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
- in_dev_put(in_dev);
- }
+ ip_mc_dec_group(in_dev, group);
rtnl_unlock();
sock_kfree_s(sk, iml, sizeof(*iml));
return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
struct in_device *in_dev = NULL;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ int leavegroup = 0;
int i, j, rv;
if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
err = -EADDRNOTAVAIL;
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
- if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+ if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
+ && pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
/* if a source filter was set, must be the same mode as before */
if (pmc->sflist) {
- if (pmc->sfmode != omode)
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
goto done;
+ }
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
psl = pmc->sflist;
if (!add) {
if (!psl)
- goto done;
+ goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
for (i=0; i<psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
break;
}
if (rv) /* source not found */
+ goto done; /* err = -EADDRNOTAVAIL */
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+ leavegroup = 1;
goto done;
+ }
/* update the interface filter */
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
&mreqs->imr_sourceaddr, 1);
done:
rtnl_shunlock();
+ if (leavegroup)
+ return ip_mc_leave_group(sk, &imr);
return err;
}
int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
{
- int err;
+ int err = 0;
struct ip_mreqn imr;
u32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *newpsl, *psl;
+ int leavegroup = 0;
if (!MULTICAST(addr))
return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
err = -ENODEV;
goto done;
}
- err = -EADDRNOTAVAIL;
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
if (msf->imsf_numsrc) {
newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
0, NULL, 0);
pmc->sflist = newpsl;
pmc->sfmode = msf->imsf_fmode;
+ err = 0;
done:
rtnl_shunlock();
+ if (leavegroup)
+ err = ip_mc_leave_group(sk, &imr);
return err;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 0000000..fe3c6d3
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Support for INET connection oriented protocols.
+ *
+ * Authors: See the TCP sources
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+
+static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+{
+ const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
+ struct sock *sk2;
+ struct hlist_node *node;
+ int reuse = sk->sk_reuse;
+
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
+ !inet_v6_ipv6only(sk2) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if (!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) {
+ const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
+ if (!sk2_rcv_saddr || !sk_rcv_saddr ||
+ sk2_rcv_saddr == sk_rcv_saddr)
+ break;
+ }
+ }
+ }
+ return node != NULL;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+ struct sock *sk, unsigned short snum)
+{
+ struct inet_bind_hashbucket *head;
+ struct hlist_node *node;
+ struct inet_bind_bucket *tb;
+ int ret;
+
+ local_bh_disable();
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = (high - low) + 1;
+ int rover;
+
+ spin_lock(&hashinfo->portalloc_lock);
+ if (hashinfo->port_rover < low)
+ rover = low;
+ else
+ rover = hashinfo->port_rover;
+ do {
+ rover++;
+ if (rover > high)
+ rover = low;
+ head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == rover)
+ goto next;
+ break;
+ next:
+ spin_unlock(&head->lock);
+ } while (--remaining > 0);
+ hashinfo->port_rover = rover;
+ spin_unlock(&hashinfo->portalloc_lock);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
+ ret = 1;
+ if (remaining <= 0)
+ goto fail;
+
+ /* OK, here is the one we will use. HEAD is
+ * non-NULL and we hold it's mutex.
+ */
+ snum = rover;
+ } else {
+ head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == snum)
+ goto tb_found;
+ }
+ tb = NULL;
+ goto tb_not_found;
+tb_found:
+ if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse > 1)
+ goto success;
+ if (tb->fastreuse > 0 &&
+ sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+ goto success;
+ } else {
+ ret = 1;
+ if (inet_csk_bind_conflict(sk, tb))
+ goto fail_unlock;
+ }
+ }
+tb_not_found:
+ ret = 1;
+ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+ goto fail_unlock;
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+ tb->fastreuse = 1;
+ else
+ tb->fastreuse = 0;
+ } else if (tb->fastreuse &&
+ (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+ tb->fastreuse = 0;
+success:
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+ ret = 0;
+
+fail_unlock:
+ spin_unlock(&head->lock);
+fail:
+ local_bh_enable();
+ return ret;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ DEFINE_WAIT(wait);
+ int err;
+
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ for (;;) {
+ prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ TASK_INTERRUPTIBLE);
+ release_sock(sk);
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ err = 0;
+ if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+ break;
+ err = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ break;
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ break;
+ err = -EAGAIN;
+ if (!timeo)
+ break;
+ }
+ finish_wait(sk->sk_sleep, &wait);
+ return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sock *newsk;
+ int error;
+
+ lock_sock(sk);
+
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
+ error = -EINVAL;
+ if (sk->sk_state != TCP_LISTEN)
+ goto out_err;
+
+ /* Find already established connection */
+ if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+ long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+ /* If this is a non blocking socket don't sleep */
+ error = -EAGAIN;
+ if (!timeo)
+ goto out_err;
+
+ error = inet_csk_wait_for_connect(sk, timeo);
+ if (error)
+ goto out_err;
+ }
+
+ newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+ BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
+out:
+ release_sock(sk);
+ return newsk;
+out_err:
+ newsk = NULL;
+ *err = error;
+ goto out;
+}
+
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish use just one timer maintaining a list of expire jiffies
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+ void (*retransmit_handler)(unsigned long),
+ void (*delack_handler)(unsigned long),
+ void (*keepalive_handler)(unsigned long))
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ init_timer(&icsk->icsk_retransmit_timer);
+ init_timer(&icsk->icsk_delack_timer);
+ init_timer(&sk->sk_timer);
+
+ icsk->icsk_retransmit_timer.function = retransmit_handler;
+ icsk->icsk_delack_timer.function = delack_handler;
+ sk->sk_timer.function = keepalive_handler;
+
+ icsk->icsk_retransmit_timer.data =
+ icsk->icsk_delack_timer.data =
+ sk->sk_timer.data = (unsigned long)sk;
+
+ icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry* inet_csk_route_req(struct sock *sk,
+ const struct request_sock *req)
+{
+ struct rtable *rt;
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct ip_options *opt = inet_rsk(req)->opt;
+ struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .nl_u = { .ip4_u =
+ { .daddr = ((opt && opt->srr) ?
+ opt->faddr :
+ ireq->rmt_addr),
+ .saddr = ireq->loc_addr,
+ .tos = RT_CONN_FLAGS(sk) } },
+ .proto = sk->sk_protocol,
+ .uli_u = { .ports =
+ { .sport = inet_sk(sk)->sport,
+ .dport = ireq->rmt_port } } };
+
+ if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+ ip_rt_put(rt);
+ IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
+ }
+ return &rt->u.dst;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
+ const u32 rnd, const u16 synq_hsize)
+{
+ return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) 1
+#endif
+
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
+ const __u16 rport, const __u32 raddr,
+ const __u32 laddr)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req, **prev;
+
+ for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
+ lopt->nr_table_entries)];
+ (req = *prev) != NULL;
+ prev = &req->dl_next) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (ireq->rmt_port == rport &&
+ ireq->rmt_addr == raddr &&
+ ireq->loc_addr == laddr &&
+ AF_INET_FAMILY(req->rsk_ops->family)) {
+ BUG_TRAP(!req->sk);
+ *prevp = prev;
+ break;
+ }
+ }
+
+ return req;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+ const unsigned timeout)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+ lopt->hash_rnd, lopt->nr_table_entries);
+
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+ inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+ const unsigned long interval,
+ const unsigned long timeout,
+ const unsigned long max_rto)
+{
+ struct inet_connection_sock *icsk = inet_csk(parent);
+ struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+ struct listen_sock *lopt = queue->listen_opt;
+ int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
+ unsigned long now = jiffies;
+ struct request_sock **reqp, *req;
+ int i, budget;
+
+ if (lopt == NULL || lopt->qlen == 0)
+ return;
+
+ /* Normally all the openreqs are young and become mature
+ * (i.e. converted to established socket) for first timeout.
+ * If synack was not acknowledged for 3 seconds, it means
+ * one of the following things: synack was lost, ack was lost,
+ * rtt is high or nobody planned to ack (i.e. synflood).
+ * When server is a bit loaded, queue is populated with old
+ * open requests, reducing effective size of queue.
+ * When server is well loaded, queue size reduces to zero
+ * after several minutes of work. It is not synflood,
+ * it is normal operation. The solution is pruning
+ * too old entries overriding normal timeout, when
+ * situation becomes dangerous.
+ *
+ * Essentially, we reserve half of room for young
+ * embrions; and abort old ones without pity, if old
+ * ones are about to clog our table.
+ */
+ if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+ int young = (lopt->qlen_young<<1);
+
+ while (thresh > 2) {
+ if (lopt->qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
+ }
+ }
+
+ if (queue->rskq_defer_accept)
+ max_retries = queue->rskq_defer_accept;
+
+ budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+ i = lopt->clock_hand;
+
+ do {
+ reqp=&lopt->syn_table[i];
+ while ((req = *reqp) != NULL) {
+ if (time_after_eq(now, req->expires)) {
+ if ((req->retrans < thresh ||
+ (inet_rsk(req)->acked && req->retrans < max_retries))
+ && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+ unsigned long timeo;
+
+ if (req->retrans++ == 0)
+ lopt->qlen_young--;
+ timeo = min((timeout << req->retrans), max_rto);
+ req->expires = now + timeo;
+ reqp = &req->dl_next;
+ continue;
+ }
+
+ /* Drop this request */
+ inet_csk_reqsk_queue_unlink(parent, req, reqp);
+ reqsk_queue_removed(queue, req);
+ reqsk_free(req);
+ continue;
+ }
+ reqp = &req->dl_next;
+ }
+
+ i = (i + 1) & (lopt->nr_table_entries - 1);
+
+ } while (--budget > 0);
+
+ lopt->clock_hand = i;
+
+ if (lopt->qlen)
+ inet_csk_reset_keepalive_timer(parent, interval);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+ const unsigned int __nocast priority)
+{
+ struct sock *newsk = sk_clone(sk, priority);
+
+ if (newsk != NULL) {
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+ newsk->sk_state = TCP_SYN_RECV;
+ newicsk->icsk_bind_hash = NULL;
+
+ inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+ newsk->sk_write_space = sk_stream_write_space;
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ }
+ return newsk;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+ BUG_TRAP(sk->sk_state == TCP_CLOSE);
+ BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+ /* It cannot be in hash table! */
+ BUG_TRAP(sk_unhashed(sk));
+
+ /* If it has not 0 inet_sk(sk)->num, it must be bound */
+ BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+
+ sk->sk_prot->destroy(sk);
+
+ sk_stream_kill_queues(sk);
+
+ xfrm_sk_free_policy(sk);
+
+ sk_refcnt_debug_release(sk);
+
+ atomic_dec(sk->sk_prot->orphan_count);
+ sock_put(sk);
+}
+
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+
+ if (rc != 0)
+ return rc;
+
+ sk->sk_max_ack_backlog = 0;
+ sk->sk_ack_backlog = 0;
+ inet_csk_delack_init(sk);
+
+ /* There is race window here: we announce ourselves listening,
+ * but this transition is still not validated by get_port().
+ * It is OK, because this socket enters to hash table only
+ * after validation is complete.
+ */
+ sk->sk_state = TCP_LISTEN;
+ if (!sk->sk_prot->get_port(sk, inet->num)) {
+ inet->sport = htons(inet->num);
+
+ sk_dst_reset(sk);
+ sk->sk_prot->hash(sk);
+
+ return 0;
+ }
+
+ sk->sk_state = TCP_CLOSE;
+ __reqsk_queue_destroy(&icsk->icsk_accept_queue);
+ return -EADDRINUSE;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ * This routine closes sockets which have been at least partially
+ * opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *acc_req;
+ struct request_sock *req;
+
+ inet_csk_delete_keepalive_timer(sk);
+
+ /* make all the listen_opt local to us */
+ acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+
+ /* Following specs, it would be better either to send FIN
+ * (and enter FIN-WAIT-1, it is normal close)
+ * or to send active reset (abort).
+ * Certainly, it is pretty dangerous while synflood, but it is
+ * bad justification for our negligence 8)
+ * To be honest, we are not able to make either
+ * of the variants now. --ANK
+ */
+ reqsk_queue_destroy(&icsk->icsk_accept_queue);
+
+ while ((req = acc_req) != NULL) {
+ struct sock *child = req->sk;
+
+ acc_req = req->dl_next;
+
+ local_bh_disable();
+ bh_lock_sock(child);
+ BUG_TRAP(!sock_owned_by_user(child));
+ sock_hold(child);
+
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ atomic_inc(sk->sk_prot->orphan_count);
+
+ inet_csk_destroy_sock(child);
+
+ bh_unlock_sock(child);
+ local_bh_enable();
+ sock_put(child);
+
+ sk_acceptq_removed(sk);
+ __reqsk_free(req);
+ }
+ BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 0000000..71f3c73
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
+/*
+ * inet_diag.c Module for monitoring INET transport protocols sockets.
+ *
+ * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+struct inet_diag_entry {
+ u32 *saddr;
+ u32 *daddr;
+ u16 sport;
+ u16 dport;
+ u16 family;
+ u16 userlocks;
+};
+
+static struct sock *idiagnl;
+
+#define INET_DIAG_PUT(skb, attrtype, attrlen) \
+ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+
+static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
+ int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ void *info = NULL;
+ struct inet_diag_meminfo *minfo = NULL;
+ unsigned char *b = skb->tail;
+ const struct inet_diag_handler *handler;
+
+ handler = inet_diag_table[unlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+
+ nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+ nlh->nlmsg_flags = nlmsg_flags;
+
+ r = NLMSG_DATA(nlh);
+ if (sk->sk_state != TCP_TIME_WAIT) {
+ if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
+ minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
+ sizeof(*minfo));
+ if (ext & (1 << (INET_DIAG_INFO - 1)))
+ info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
+ handler->idiag_info_size);
+
+ if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+ size_t len = strlen(icsk->icsk_ca_ops->name);
+ strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+ icsk->icsk_ca_ops->name);
+ }
+ }
+ r->idiag_family = sk->sk_family;
+ r->idiag_state = sk->sk_state;
+ r->idiag_timer = 0;
+ r->idiag_retrans = 0;
+
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
+ r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+ if (r->idiag_state == TCP_TIME_WAIT) {
+ const struct inet_timewait_sock *tw = inet_twsk(sk);
+ long tmo = tw->tw_ttd - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.idiag_sport = tw->tw_sport;
+ r->id.idiag_dport = tw->tw_dport;
+ r->id.idiag_src[0] = tw->tw_rcv_saddr;
+ r->id.idiag_dst[0] = tw->tw_daddr;
+ r->idiag_state = tw->tw_substate;
+ r->idiag_timer = 3;
+ r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &tcp6tw->tw_v6_rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &tcp6tw->tw_v6_daddr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+ }
+
+ r->id.idiag_sport = inet->sport;
+ r->id.idiag_dport = inet->dport;
+ r->id.idiag_src[0] = inet->rcv_saddr;
+ r->id.idiag_dst[0] = inet->daddr;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &np->rcv_saddr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &np->daddr);
+ }
+#endif
+
+#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+ r->idiag_timer = 1;
+ r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ r->idiag_timer = 4;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ } else if (timer_pending(&sk->sk_timer)) {
+ r->idiag_timer = 2;
+ r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ } else {
+ r->idiag_timer = 0;
+ r->idiag_expires = 0;
+ }
+#undef EXPIRES_IN_MS
+
+ r->idiag_uid = sock_i_uid(sk);
+ r->idiag_inode = sock_i_ino(sk);
+
+ if (minfo) {
+ minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+ minfo->idiag_wmem = sk->sk_wmem_queued;
+ minfo->idiag_fmem = sk->sk_forward_alloc;
+ minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+ }
+
+ handler->idiag_get_info(sk, r, info);
+
+ if (sk->sk_state < TCP_TIME_WAIT &&
+ icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
+ icsk->icsk_ca_ops->get_info(sk, ext, skb);
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+rtattr_failure:
+nlmsg_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock *sk;
+ struct inet_diag_req *req = NLMSG_DATA(nlh);
+ struct sk_buff *rep;
+ struct inet_hashinfo *hashinfo;
+ const struct inet_diag_handler *handler;
+
+ handler = inet_diag_table[nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+
+ if (req->idiag_family == AF_INET) {
+ sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+ }
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ else if (req->idiag_family == AF_INET6) {
+ sk = inet6_lookup(hashinfo,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+ }
+#endif
+ else {
+ return -EINVAL;
+ }
+
+ if (sk == NULL)
+ return -ENOENT;
+
+ err = -ESTALE;
+ if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
+ req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+ goto out;
+
+ err = -ENOMEM;
+ rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
+ sizeof(struct inet_diag_meminfo) +
+ handler->idiag_info_size + 64)),
+ GFP_KERNEL);
+ if (!rep)
+ goto out;
+
+ if (inet_diag_fill(rep, sk, req->idiag_ext,
+ NETLINK_CB(in_skb).pid,
+ nlh->nlmsg_seq, 0, nlh) <= 0)
+ BUG();
+
+ err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+ MSG_DONTWAIT);
+ if (err > 0)
+ err = 0;
+
+out:
+ if (sk) {
+ if (sk->sk_state == TCP_TIME_WAIT)
+ inet_twsk_put((struct inet_timewait_sock *)sk);
+ else
+ sock_put(sk);
+ }
+ return err;
+}
+
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+ int words = bits >> 5;
+
+ bits &= 0x1f;
+
+ if (words) {
+ if (memcmp(a1, a2, words << 2))
+ return 0;
+ }
+ if (bits) {
+ __u32 w1, w2;
+ __u32 mask;
+
+ w1 = a1[words];
+ w2 = a2[words];
+
+ mask = htonl((0xffffffff) << (32 - bits));
+
+ if ((w1 ^ w2) & mask)
+ return 0;
+ }
+
+ return 1;
+}
+
+
+static int inet_diag_bc_run(const void *bc, int len,
+ const struct inet_diag_entry *entry)
+{
+ while (len > 0) {
+ int yes = 1;
+ const struct inet_diag_bc_op *op = bc;
+
+ switch (op->code) {
+ case INET_DIAG_BC_NOP:
+ break;
+ case INET_DIAG_BC_JMP:
+ yes = 0;
+ break;
+ case INET_DIAG_BC_S_GE:
+ yes = entry->sport >= op[1].no;
+ break;
+ case INET_DIAG_BC_S_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case INET_DIAG_BC_D_GE:
+ yes = entry->dport >= op[1].no;
+ break;
+ case INET_DIAG_BC_D_LE:
+ yes = entry->dport <= op[1].no;
+ break;
+ case INET_DIAG_BC_AUTO:
+ yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+ break;
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND: {
+ struct inet_diag_hostcond *cond;
+ u32 *addr;
+
+ cond = (struct inet_diag_hostcond *)(op + 1);
+ if (cond->port != -1 &&
+ cond->port != (op->code == INET_DIAG_BC_S_COND ?
+ entry->sport : entry->dport)) {
+ yes = 0;
+ break;
+ }
+
+ if (cond->prefix_len == 0)
+ break;
+
+ if (op->code == INET_DIAG_BC_S_COND)
+ addr = entry->saddr;
+ else
+ addr = entry->daddr;
+
+ if (bitstring_match(addr, cond->addr, cond->prefix_len))
+ break;
+ if (entry->family == AF_INET6 &&
+ cond->family == AF_INET) {
+ if (addr[0] == 0 && addr[1] == 0 &&
+ addr[2] == htonl(0xffff) &&
+ bitstring_match(addr + 3, cond->addr,
+ cond->prefix_len))
+ break;
+ }
+ yes = 0;
+ break;
+ }
+ }
+
+ if (yes) {
+ len -= op->yes;
+ bc += op->yes;
+ } else {
+ len -= op->no;
+ bc += op->no;
+ }
+ }
+ return (len == 0);
+}
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+ while (len >= 0) {
+ const struct inet_diag_bc_op *op = bc;
+
+ if (cc > len)
+ return 0;
+ if (cc == len)
+ return 1;
+ if (op->yes < 4)
+ return 0;
+ len -= op->yes;
+ bc += op->yes;
+ }
+ return 0;
+}
+
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+ const unsigned char *bc = bytecode;
+ int len = bytecode_len;
+
+ while (len > 0) {
+ struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+ switch (op->code) {
+ case INET_DIAG_BC_AUTO:
+ case INET_DIAG_BC_S_COND:
+ case INET_DIAG_BC_D_COND:
+ case INET_DIAG_BC_S_GE:
+ case INET_DIAG_BC_S_LE:
+ case INET_DIAG_BC_D_GE:
+ case INET_DIAG_BC_D_LE:
+ if (op->yes < 4 || op->yes > len + 4)
+ return -EINVAL;
+ case INET_DIAG_BC_JMP:
+ if (op->no < 4 || op->no > len + 4)
+ return -EINVAL;
+ if (op->no < len &&
+ !valid_cc(bytecode, bytecode_len, len - op->no))
+ return -EINVAL;
+ break;
+ case INET_DIAG_BC_NOP:
+ if (op->yes < 4 || op->yes > len + 4)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ bc += op->yes;
+ len -= op->yes;
+ }
+ return len == 0 ? 0 : -EINVAL;
+}
+
+static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ struct inet_diag_entry entry;
+ struct rtattr *bc = (struct rtattr *)(r + 1);
+ struct inet_sock *inet = inet_sk(sk);
+
+ entry.family = sk->sk_family;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (entry.family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ entry.saddr = np->rcv_saddr.s6_addr32;
+ entry.daddr = np->daddr.s6_addr32;
+ } else
+#endif
+ {
+ entry.saddr = &inet->rcv_saddr;
+ entry.daddr = &inet->daddr;
+ }
+ entry.sport = inet->num;
+ entry.dport = ntohs(inet->dport);
+ entry.userlocks = sk->sk_userlocks;
+
+ if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+ return 0;
+ }
+
+ return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
+ struct request_sock *req,
+ u32 pid, u32 seq,
+ const struct nlmsghdr *unlh)
+{
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned char *b = skb->tail;
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+ nlh->nlmsg_flags = NLM_F_MULTI;
+ r = NLMSG_DATA(nlh);
+
+ r->idiag_family = sk->sk_family;
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = req->retrans;
+
+ r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_cookie[0] = (u32)(unsigned long)req;
+ r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+ tmo = req->expires - jiffies;
+ if (tmo < 0)
+ tmo = 0;
+
+ r->id.idiag_sport = inet->sport;
+ r->id.idiag_dport = ireq->rmt_port;
+ r->id.idiag_src[0] = ireq->loc_addr;
+ r->id.idiag_dst[0] = ireq->rmt_addr;
+ r->idiag_expires = jiffies_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = sock_i_uid(sk);
+ r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (r->idiag_family == AF_INET6) {
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+ &tcp6_rsk(req)->loc_addr);
+ ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+ &tcp6_rsk(req)->rmt_addr);
+ }
+#endif
+ nlh->nlmsg_len = skb->tail - b;
+
+ return skb->len;
+
+nlmsg_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct inet_diag_entry entry;
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt;
+ struct rtattr *bc = NULL;
+ struct inet_sock *inet = inet_sk(sk);
+ int j, s_j;
+ int reqnum, s_reqnum;
+ int err = 0;
+
+ s_j = cb->args[3];
+ s_reqnum = cb->args[4];
+
+ if (s_j > 0)
+ s_j--;
+
+ entry.family = sk->sk_family;
+
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ lopt = icsk->icsk_accept_queue.listen_opt;
+ if (!lopt || !lopt->qlen)
+ goto out;
+
+ if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+ bc = (struct rtattr *)(r + 1);
+ entry.sport = inet->num;
+ entry.userlocks = sk->sk_userlocks;
+ }
+
+ for (j = s_j; j < lopt->nr_table_entries; j++) {
+ struct request_sock *req, *head = lopt->syn_table[j];
+
+ reqnum = 0;
+ for (req = head; req; reqnum++, req = req->dl_next) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (reqnum < s_reqnum)
+ continue;
+ if (r->id.idiag_dport != ireq->rmt_port &&
+ r->id.idiag_dport)
+ continue;
+
+ if (bc) {
+ entry.saddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ (entry.family == AF_INET6) ?
+ tcp6_rsk(req)->loc_addr.s6_addr32 :
+#endif
+ &ireq->loc_addr;
+ entry.daddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ (entry.family == AF_INET6) ?
+ tcp6_rsk(req)->rmt_addr.s6_addr32 :
+#endif
+ &ireq->rmt_addr;
+ entry.dport = ntohs(ireq->rmt_port);
+
+ if (!inet_diag_bc_run(RTA_DATA(bc),
+ RTA_PAYLOAD(bc), &entry))
+ continue;
+ }
+
+ err = inet_diag_fill_req(skb, sk, req,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, cb->nlh);
+ if (err < 0) {
+ cb->args[3] = j + 1;
+ cb->args[4] = reqnum;
+ goto out;
+ }
+ }
+
+ s_reqnum = 0;
+ }
+
+out:
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+ return err;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int i, num;
+ int s_i, s_num;
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ const struct inet_diag_handler *handler;
+ struct inet_hashinfo *hashinfo;
+
+ handler = inet_diag_table[cb->nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+ goto skip_listen_ht;
+
+ inet_listen_lock(hashinfo);
+ for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+ struct sock *sk;
+ struct hlist_node *node;
+
+ num = 0;
+ sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!(r->idiag_states & TCPF_LISTEN) ||
+ r->id.idiag_dport ||
+ cb->args[3] > 0)
+ goto syn_recv;
+
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ inet_listen_unlock(hashinfo);
+ goto done;
+ }
+
+syn_recv:
+ if (!(r->idiag_states & TCPF_SYN_RECV))
+ goto next_listen;
+
+ if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+ inet_listen_unlock(hashinfo);
+ goto done;
+ }
+
+next_listen:
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ ++num;
+ }
+
+ s_num = 0;
+ cb->args[3] = 0;
+ cb->args[4] = 0;
+ }
+ inet_listen_unlock(hashinfo);
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+ return skb->len;
+
+ for (i = s_i; i < hashinfo->ehash_size; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ struct sock *sk;
+ struct hlist_node *node;
+
+ if (i > s_i)
+ s_num = 0;
+
+ read_lock_bh(&head->lock);
+
+ num = 0;
+ sk_for_each(sk, node, &head->chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_normal;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+ goto next_normal;
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
+ goto next_normal;
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_normal:
+ ++num;
+ }
+
+ if (r->idiag_states & TCPF_TIME_WAIT) {
+ sk_for_each(sk, node,
+ &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_dying;
+ if (r->id.idiag_sport != inet->sport &&
+ r->id.idiag_sport)
+ goto next_dying;
+ if (r->id.idiag_dport != inet->dport &&
+ r->id.idiag_dport)
+ goto next_dying;
+ if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+ read_unlock_bh(&head->lock);
+ goto done;
+ }
+next_dying:
+ ++num;
+ }
+ }
+ read_unlock_bh(&head->lock);
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+ return skb->len;
+}
+
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+ return 0;
+}
+
+
+static __inline__ int
+inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+ return 0;
+
+ if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
+ goto err_inval;
+
+ if (inet_diag_table[nlh->nlmsg_type] == NULL)
+ return -ENOENT;
+
+ if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
+ goto err_inval;
+
+ if (nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (nlh->nlmsg_len >
+ (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
+ struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
+ sizeof(struct inet_diag_req));
+ if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
+ rta->rta_len < 8 ||
+ rta->rta_len >
+ (nlh->nlmsg_len -
+ NLMSG_SPACE(sizeof(struct inet_diag_req))))
+ goto err_inval;
+ if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+ goto err_inval;
+ }
+ return netlink_dump_start(idiagnl, skb, nlh,
+ inet_diag_dump,
+ inet_diag_dump_done);
+ } else {
+ return inet_diag_get_exact(skb, nlh);
+ }
+
+err_inval:
+ return -EINVAL;
+}
+
+
+static inline void inet_diag_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr * nlh;
+
+ if (skb->len >= NLMSG_SPACE(0)) {
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+ return;
+ err = inet_diag_rcv_msg(skb, nlh);
+ if (err || nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, err);
+ }
+}
+
+static void inet_diag_rcv(struct sock *sk, int len)
+{
+ struct sk_buff *skb;
+ unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+
+ while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
+ inet_diag_rcv_skb(skb);
+ kfree_skb(skb);
+ }
+}
+
+static DEFINE_SPINLOCK(inet_diag_register_lock);
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+ int err = -EINVAL;
+
+ if (type >= INET_DIAG_GETSOCK_MAX)
+ goto out;
+
+ spin_lock(&inet_diag_register_lock);
+ err = -EEXIST;
+ if (inet_diag_table[type] == NULL) {
+ inet_diag_table[type] = h;
+ err = 0;
+ }
+ spin_unlock(&inet_diag_register_lock);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+ const __u16 type = h->idiag_type;
+
+ if (type >= INET_DIAG_GETSOCK_MAX)
+ return;
+
+ spin_lock(&inet_diag_register_lock);
+ inet_diag_table[type] = NULL;
+ spin_unlock(&inet_diag_register_lock);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+ const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+ sizeof(struct inet_diag_handler *));
+ int err = -ENOMEM;
+
+ inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL);
+ if (!inet_diag_table)
+ goto out;
+
+ memset(inet_diag_table, 0, inet_diag_table_size);
+ idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
+ THIS_MODULE);
+ if (idiagnl == NULL)
+ goto out_free_table;
+ err = 0;
+out:
+ return err;
+out_free_table:
+ kfree(inet_diag_table);
+ goto out;
+}
+
+static void __exit inet_diag_exit(void)
+{
+ sock_release(idiagnl->sk_socket);
+ kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 0000000..e8d29fe
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+ struct inet_bind_hashbucket *head,
+ const unsigned short snum)
+{
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+
+ if (tb != NULL) {
+ tb->port = snum;
+ tb->fastreuse = 0;
+ INIT_HLIST_HEAD(&tb->owners);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+}
+
+EXPORT_SYMBOL(inet_bind_bucket_create);
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+{
+ if (hlist_empty(&tb->owners)) {
+ __hlist_del(&tb->node);
+ kmem_cache_free(cachep, tb);
+ }
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ const unsigned short snum)
+{
+ inet_sk(sk)->num = snum;
+ sk_add_bind_node(sk, &tb->owners);
+ inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+EXPORT_SYMBOL(inet_bind_hash);
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ */
+static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+ struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_bind_bucket *tb;
+
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ __sk_del_bind_node(sk);
+ inet_csk(sk)->icsk_bind_hash = NULL;
+ inet_sk(sk)->num = 0;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+ local_bh_disable();
+ __inet_put_port(hashinfo, sk);
+ local_bh_enable();
+}
+
+EXPORT_SYMBOL(inet_put_port);
+
+/*
+ * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (wake up each
+ * exclusive lock release). It should be ifdefed really.
+ */
+void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+{
+ write_lock(&hashinfo->lhash_lock);
+
+ if (atomic_read(&hashinfo->lhash_users)) {
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait_exclusive(&hashinfo->lhash_wait,
+ &wait, TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&hashinfo->lhash_users))
+ break;
+ write_unlock_bh(&hashinfo->lhash_lock);
+ schedule();
+ write_lock_bh(&hashinfo->lhash_lock);
+ }
+
+ finish_wait(&hashinfo->lhash_wait, &wait);
+ }
+}
+
+EXPORT_SYMBOL(inet_listen_wlock);
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
+ const unsigned short hnum, const int dif)
+{
+ struct sock *result = NULL, *sk;
+ const struct hlist_node *node;
+ int hiscore = -1;
+
+ sk_for_each(sk, node, head) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (inet->num == hnum && !ipv6_only_sock(sk)) {
+ const __u32 rcv_saddr = inet->rcv_saddr;
+ int score = sk->sk_family == PF_INET ? 1 : 0;
+
+ if (rcv_saddr) {
+ if (rcv_saddr != daddr)
+ continue;
+ score += 2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score += 2;
+ }
+ if (score == 5)
+ return sk;
+ if (score > hiscore) {
+ hiscore = score;
+ result = sk;
+ }
+ }
+ }
+ return result;
+}
+
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 0000000..4d1502a
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,384 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic TIME_WAIT sockets functions
+ *
+ * From code orinally in TCP
+ */
+
+#include <linux/config.h>
+
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+/* Must be called with locally disabled BHs. */
+void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_hashbucket *bhead;
+ struct inet_bind_bucket *tb;
+ /* Unlink from established hashes. */
+ struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];
+
+ write_lock(&ehead->lock);
+ if (hlist_unhashed(&tw->tw_node)) {
+ write_unlock(&ehead->lock);
+ return;
+ }
+ __hlist_del(&tw->tw_node);
+ sk_node_init(&tw->tw_node);
+ write_unlock(&ehead->lock);
+
+ /* Disassociate with bind bucket. */
+ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tb = tw->tw_tb;
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ spin_unlock(&bhead->lock);
+#ifdef SOCK_REFCNT_DEBUG
+ if (atomic_read(&tw->tw_refcnt) != 1) {
+ printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
+ tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+ }
+#endif
+ inet_twsk_put(tw);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_kill);
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+ struct inet_hashinfo *hashinfo)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
+ struct inet_bind_hashbucket *bhead;
+ /* Step 1: Put TW into bind hash. Original socket stays there too.
+ Note, that any socket with inet->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = icsk->icsk_bind_hash;
+ BUG_TRAP(icsk->icsk_bind_hash);
+ inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+ spin_unlock(&bhead->lock);
+
+ write_lock(&ehead->lock);
+
+ /* Step 2: Remove SK from established hash. */
+ if (__sk_del_node_init(sk))
+ sock_prot_dec_use(sk->sk_prot);
+
+ /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+ inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
+ atomic_inc(&tw->tw_refcnt);
+
+ write_unlock(&ehead->lock);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+{
+ struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
+ SLAB_ATOMIC);
+ if (tw != NULL) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+ /* Give us an identity. */
+ tw->tw_daddr = inet->daddr;
+ tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+ tw->tw_num = inet->num;
+ tw->tw_state = TCP_TIME_WAIT;
+ tw->tw_substate = state;
+ tw->tw_sport = inet->sport;
+ tw->tw_dport = inet->dport;
+ tw->tw_family = sk->sk_family;
+ tw->tw_reuse = sk->sk_reuse;
+ tw->tw_hashent = sk->sk_hashent;
+ tw->tw_ipv6only = 0;
+ tw->tw_prot = sk->sk_prot_creator;
+ atomic_set(&tw->tw_refcnt, 1);
+ inet_twsk_dead_node_init(tw);
+ }
+
+ return tw;
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* Returns non-zero if quota exceeded. */
+static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+ const int slot)
+{
+ struct inet_timewait_sock *tw;
+ struct hlist_node *node;
+ unsigned int killed;
+ int ret;
+
+ /* NOTE: compare this to previous version where lock
+ * was released after detaching chain. It was racy,
+ * because tw buckets are scheduled in not serialized context
+ * in 2.3 (with netfilter), and with softnet it is common, because
+ * soft irqs are not sequenced.
+ */
+ killed = 0;
+ ret = 0;
+rescan:
+ inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+ __inet_twsk_del_dead_node(tw);
+ spin_unlock(&twdr->death_lock);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+ inet_twsk_put(tw);
+ killed++;
+ spin_lock(&twdr->death_lock);
+ if (killed > INET_TWDR_TWKILL_QUOTA) {
+ ret = 1;
+ break;
+ }
+
+ /* While we dropped twdr->death_lock, another cpu may have
+ * killed off the next TW bucket in the list, therefore
+ * do a fresh re-read of the hlist head node with the
+ * lock reacquired. We still use the hlist traversal
+ * macro in order to get the prefetches.
+ */
+ goto rescan;
+ }
+
+ twdr->tw_count -= killed;
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+ return ret;
+}
+
+void inet_twdr_hangman(unsigned long data)
+{
+ struct inet_timewait_death_row *twdr;
+ int unsigned need_timer;
+
+ twdr = (struct inet_timewait_death_row *)data;
+ spin_lock(&twdr->death_lock);
+
+ if (twdr->tw_count == 0)
+ goto out;
+
+ need_timer = 0;
+ if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
+ twdr->thread_slots |= (1 << twdr->slot);
+ mb();
+ schedule_work(&twdr->twkill_work);
+ need_timer = 1;
+ } else {
+ /* We purged the entire slot, anything left? */
+ if (twdr->tw_count)
+ need_timer = 1;
+ }
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
+ if (need_timer)
+ mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+out:
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_hangman);
+
+extern void twkill_slots_invalid(void);
+
+void inet_twdr_twkill_work(void *data)
+{
+ struct inet_timewait_death_row *twdr = data;
+ int i;
+
+ if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
+ twkill_slots_invalid();
+
+ while (twdr->thread_slots) {
+ spin_lock_bh(&twdr->death_lock);
+ for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
+ if (!(twdr->thread_slots & (1 << i)))
+ continue;
+
+ while (inet_twdr_do_twkill_work(twdr, i) != 0) {
+ if (need_resched()) {
+ spin_unlock_bh(&twdr->death_lock);
+ schedule();
+ spin_lock_bh(&twdr->death_lock);
+ }
+ }
+
+ twdr->thread_slots &= ~(1 << i);
+ }
+ spin_unlock_bh(&twdr->death_lock);
+ }
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
+
+/* These are always called from BH context. See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+ struct inet_timewait_death_row *twdr)
+{
+ spin_lock(&twdr->death_lock);
+ if (inet_twsk_del_dead_node(tw)) {
+ inet_twsk_put(tw);
+ if (--twdr->tw_count == 0)
+ del_timer(&twdr->tw_timer);
+ }
+ spin_unlock(&twdr->death_lock);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+}
+
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw,
+ struct inet_timewait_death_row *twdr,
+ const int timeo, const int timewait_len)
+{
+ struct hlist_head *list;
+ int slot;
+
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous seqments) are lost (probability of such event
+ * is p^(N+1), where p is probability to lose single packet and
+ * time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). Normal timewait length is calculated so, that we
+ * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux. following BSD, violates this requirement waiting
+ * only for 60sec, we should wait at least for 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced to catch old duplicate and
+ * responces to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to bounds required
+ * by RTO, rather than MSL. So, if peer understands PAWS, we
+ * kill tw bucket after 3.5*RTO (it is important that this number
+ * is greater than TS tick!) and detect old duplicates with help
+ * of PAWS.
+ */
+ slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
+
+ spin_lock(&twdr->death_lock);
+
+ /* Unlink it, if it was scheduled */
+ if (inet_twsk_del_dead_node(tw))
+ twdr->tw_count--;
+ else
+ atomic_inc(&tw->tw_refcnt);
+
+ if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+ /* Schedule to slow timer */
+ if (timeo >= timewait_len) {
+ slot = INET_TWDR_TWKILL_SLOTS - 1;
+ } else {
+ slot = (timeo + twdr->period - 1) / twdr->period;
+ if (slot >= INET_TWDR_TWKILL_SLOTS)
+ slot = INET_TWDR_TWKILL_SLOTS - 1;
+ }
+ tw->tw_ttd = jiffies + timeo;
+ slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
+ list = &twdr->cells[slot];
+ } else {
+ tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+
+ if (twdr->twcal_hand < 0) {
+ twdr->twcal_hand = 0;
+ twdr->twcal_jiffie = jiffies;
+ twdr->twcal_timer.expires = twdr->twcal_jiffie +
+ (slot << INET_TWDR_RECYCLE_TICK);
+ add_timer(&twdr->twcal_timer);
+ } else {
+ if (time_after(twdr->twcal_timer.expires,
+ jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
+ mod_timer(&twdr->twcal_timer,
+ jiffies + (slot << INET_TWDR_RECYCLE_TICK));
+ slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ }
+ list = &twdr->twcal_row[slot];
+ }
+
+ hlist_add_head(&tw->tw_death_node, list);
+
+ if (twdr->tw_count++ == 0)
+ mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twdr_twcal_tick(unsigned long data)
+{
+ struct inet_timewait_death_row *twdr;
+ int n, slot;
+ unsigned long j;
+ unsigned long now = jiffies;
+ int killed = 0;
+ int adv = 0;
+
+ twdr = (struct inet_timewait_death_row *)data;
+
+ spin_lock(&twdr->death_lock);
+ if (twdr->twcal_hand < 0)
+ goto out;
+
+ slot = twdr->twcal_hand;
+ j = twdr->twcal_jiffie;
+
+ for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
+ if (time_before_eq(j, now)) {
+ struct hlist_node *node, *safe;
+ struct inet_timewait_sock *tw;
+
+ inet_twsk_for_each_inmate_safe(tw, node, safe,
+ &twdr->twcal_row[slot]) {
+ __inet_twsk_del_dead_node(tw);
+ __inet_twsk_kill(tw, twdr->hashinfo);
+ inet_twsk_put(tw);
+ killed++;
+ }
+ } else {
+ if (!adv) {
+ adv = 1;
+ twdr->twcal_jiffie = j;
+ twdr->twcal_hand = slot;
+ }
+
+ if (!hlist_empty(&twdr->twcal_row[slot])) {
+ mod_timer(&twdr->twcal_timer, j);
+ goto out;
+ }
+ }
+ j += 1 << INET_TWDR_RECYCLE_TICK;
+ slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+ }
+ twdr->twcal_hand = -1;
+
+out:
+ if ((twdr->tw_count -= killed) == 0)
+ del_timer(&twdr->tw_timer);
+ NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+ spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9547395..f84ba9c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
+#include <net/ip.h>
#include <net/inetpeer.h>
/*
@@ -72,7 +73,7 @@
/* Exported for inet_getid inline function. */
DEFINE_SPINLOCK(inet_peer_idlock);
-static kmem_cache_t *peer_cachep;
+static kmem_cache_t *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
static struct inet_peer peer_fake_node = {
@@ -450,11 +451,12 @@ static void peer_check_expire(unsigned long dummy)
/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
* interval depending on the total number of entries (more entries,
* less interval). */
- peer_periodic_timer.expires = jiffies
- + inet_peer_gc_maxtime
- - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ if (peer_total >= inet_peer_threshold)
+ peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
+ else
+ peer_periodic_timer.expires = jiffies
+ + inet_peer_gc_maxtime
+ - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+ peer_total / inet_peer_threshold * HZ;
add_timer(&peer_periodic_timer);
}
-
-EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aa..0923add1 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
-
- iph = skb->nh.iph;
-
- if (iph->ttl <= 1)
+ if (skb->nh.iph->ttl <= 1)
goto too_many_hops;
if (!xfrm4_route_forward(skb))
goto drop;
- iph = skb->nh.iph;
rt = (struct rtable*)skb->dst;
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7f68e27..e7d26d9 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
return ip_frag_intern(hash, qp);
out_nomem:
- NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+ LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
return NULL;
}
@@ -457,7 +457,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (pskb_pull(skb, ihl) == NULL)
goto err;
- if (pskb_trim(skb, end-offset))
+ if (pskb_trim_rcsum(skb, end-offset))
goto err;
/* Find out which fragments are in front and at the back of us
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (skb->dev)
qp->iif = skb->dev->ifindex;
skb->dev = NULL;
- qp->stamp = skb->stamp;
+ skb_get_timestamp(skb, &qp->stamp);
qp->meat += skb->len;
atomic_add(skb->truesize, &ip_frag_mem);
if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
head->next = NULL;
head->dev = dev;
- head->stamp = qp->stamp;
+ skb_set_timestamp(head, &qp->stamp);
iph = head->nh.iph;
iph->frag_off = 0;
@@ -625,10 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
return head;
out_nomem:
- NETDEBUG(if (net_ratelimit())
- printk(KERN_ERR
- "IP: queue_glue: no memory for gluing queue %p\n",
- qp));
+ LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
+ "queue %p\n", qp);
goto out_fail;
out_oversize:
if (net_ratelimit())
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8848355..f0d5740 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -290,7 +290,6 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
dev_hold(dev);
ipgre_tunnel_link(nt);
- /* Do not decrement MOD_USE_COUNT here. */
return nt;
failed:
@@ -1277,12 +1276,28 @@ err1:
goto out;
}
-static void ipgre_fini(void)
+static void __exit ipgre_destroy_tunnels(void)
+{
+ int prio;
+
+ for (prio = 0; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ while ((t = tunnels[prio][h]) != NULL)
+ unregister_netdevice(t->dev);
+ }
+ }
+}
+
+static void __exit ipgre_fini(void)
{
if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
printk(KERN_INFO "ipgre close: can't remove protocol\n");
- unregister_netdev(ipgre_fb_tunnel_dev);
+ rtnl_lock();
+ ipgre_destroy_tunnels();
+ rtnl_unlock();
}
module_init(ipgre_init);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a26..473d0f2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
* SNMP management statistics
*/
-DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
/*
* Process Router Attention IP option
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
raw_rcv(last, skb2);
}
last = sk;
+ nf_reset(skb);
}
}
@@ -200,10 +201,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
{
int ihl = skb->nh.iph->ihl*4;
-#ifdef CONFIG_NETFILTER_DEBUG
- nf_debug_ip_local_deliver(skb);
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
__skb_pull(skb, ihl);
/* Free reference early: we don't need it any more, and it may
@@ -228,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
/* If there maybe a raw socket we must check - if not we
* don't care less
*/
- if (raw_sk)
- raw_v4_input(skb, skb->nh.iph, hash);
+ if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
+ raw_sk = NULL;
if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
int ret;
@@ -282,22 +279,78 @@ int ip_local_deliver(struct sk_buff *skb)
ip_local_deliver_finish);
}
-static inline int ip_rcv_finish(struct sk_buff *skb)
+static inline int ip_rcv_options(struct sk_buff *skb)
{
+ struct ip_options *opt;
+ struct iphdr *iph;
struct net_device *dev = skb->dev;
+
+ /* It looks as overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest for now, especially taking
+ into account that combination of IP options
+ and running sniffer is extremely rare condition.
+ --ANK (980813)
+ */
+ if (skb_cow(skb, skb_headroom(skb))) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
+ }
+
+ iph = skb->nh.iph;
+
+ if (ip_options_compile(NULL, skb)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+ goto drop;
+ }
+
+ opt = &(IPCB(skb)->opt);
+ if (unlikely(opt->srr)) {
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
+ net_ratelimit())
+ printk(KERN_INFO "source route option "
+ "%u.%u.%u.%u -> %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ in_dev_put(in_dev);
+ goto drop;
+ }
+
+ in_dev_put(in_dev);
+ }
+
+ if (ip_options_rcv_srr(skb))
+ goto drop;
+ }
+
+ return 0;
+drop:
+ return -1;
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
struct iphdr *iph = skb->nh.iph;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
- if (skb->dst == NULL) {
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+ if (likely(skb->dst == NULL)) {
+ int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+ skb->dev);
+ if (unlikely(err)) {
+ if (err == -EHOSTUNREACH)
+ IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
goto drop;
+ }
}
#ifdef CONFIG_NET_CLS_ROUTE
- if (skb->dst->tclassid) {
+ if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
@@ -307,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
}
#endif
- if (iph->ihl > 5) {
- struct ip_options *opt;
-
- /* It looks as overkill, because not all
- IP options require packet mangling.
- But it is the easiest for now, especially taking
- into account that combination of IP options
- and running sniffer is extremely rare condition.
- --ANK (980813)
- */
-
- if (skb_cow(skb, skb_headroom(skb))) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
- iph = skb->nh.iph;
-
- if (ip_options_compile(NULL, skb))
- goto inhdr_error;
-
- opt = &(IPCB(skb)->opt);
- if (opt->srr) {
- struct in_device *in_dev = in_dev_get(dev);
- if (in_dev) {
- if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
- in_dev_put(in_dev);
- goto drop;
- }
- in_dev_put(in_dev);
- }
- if (ip_options_rcv_srr(skb))
- goto drop;
- }
- }
+ if (iph->ihl > 5 && ip_rcv_options(skb))
+ goto drop;
return dst_input(skb);
-inhdr_error:
- IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
return NET_RX_DROP;
@@ -357,9 +373,10 @@ drop:
/*
* Main IP Receive routine.
*/
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
+ u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
@@ -391,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
*/
if (iph->ihl < 5 || iph->version != 4)
- goto inhdr_error;
+ goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = skb->nh.iph;
- if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto inhdr_error;
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto inhdr_error;
- {
- __u32 len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl<<2))
- goto inhdr_error;
+ len = ntohs(iph->tot_len);
+ if (skb->len < len || len < (iph->ihl*4))
+ goto inhdr_error;
- /* Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
- if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ if (pskb_trim_rcsum(skb, len)) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+ goto drop;
}
return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -427,5 +442,4 @@ out:
return NET_RX_DROP;
}
-EXPORT_SYMBOL(ip_rcv);
EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f..bce4e87 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
}
}
-int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+static struct ip_options *ip_options_get_alloc(const int optlen)
{
- struct ip_options *opt;
+ struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
+ GFP_KERNEL);
+ if (opt)
+ memset(opt, 0, sizeof(*opt));
+ return opt;
+}
- opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
- if (!opt)
- return -ENOMEM;
- memset(opt, 0, sizeof(struct ip_options));
- if (optlen) {
- if (user) {
- if (copy_from_user(opt->__data, data, optlen)) {
- kfree(opt);
- return -EFAULT;
- }
- } else
- memcpy(opt->__data, data, optlen);
- }
+static int ip_options_get_finish(struct ip_options **optp,
+ struct ip_options *opt, int optlen)
+{
while (optlen & 3)
opt->__data[optlen++] = IPOPT_END;
opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
return 0;
}
+int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
+{
+ struct ip_options *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen && copy_from_user(opt->__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
+ return ip_options_get_finish(optp, opt, optlen);
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
+{
+ struct ip_options *opt = ip_options_get_alloc(optlen);
+
+ if (!opt)
+ return -ENOMEM;
+ if (optlen)
+ memcpy(opt->__data, data, optlen);
+ return ip_options_get_finish(optp, opt, optlen);
+}
+
void ip_forward_options(struct sk_buff *skb)
{
struct ip_options * opt = &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
return 0;
}
-
-EXPORT_SYMBOL(ip_options_compile);
-EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc82..3f1a263 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
-#include <net/raw.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/checksum.h>
@@ -84,12 +81,8 @@
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
+#include <linux/tcp.h>
-/*
- * Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;
/* Generate a checksum for an outgoing IP datagram. */
@@ -107,11 +100,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
newskb->pkt_type = PACKET_LOOPBACK;
newskb->ip_summed = CHECKSUM_UNNECESSARY;
BUG_TRAP(newskb->dst);
-
-#ifdef CONFIG_NETFILTER_DEBUG
- nf_debug_ip_loopback_xmit(newskb);
-#endif
- nf_reset(newskb);
netif_rx(newskb);
return 0;
}
@@ -170,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
dst_output);
}
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
@@ -192,12 +182,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
skb = skb2;
}
-#ifdef CONFIG_NETFILTER_DEBUG
- nf_debug_ip_finish_output2(skb);
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
- nf_reset(skb);
-
if (hh) {
int hh_alen;
@@ -216,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
-int ip_finish_output(struct sk_buff *skb)
+static inline int ip_finish_output(struct sk_buff *skb)
{
struct net_device *dev = skb->dst->dev;
@@ -340,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
if (ip_route_output_flow(&rt, &fl, sk, 0))
goto no_route;
}
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->u.dst);
}
skb->dst = dst_clone(&rt->u.dst);
@@ -391,7 +374,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
- to->security = from->security;
dst_release(to->dst);
to->dst = dst_clone(from->dst);
to->dev = from->dev;
@@ -404,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
#endif
#ifdef CONFIG_NETFILTER
to->nfmark = from->nfmark;
- to->nfcache = from->nfcache;
/* Connection association is same as pre-frag packet */
nf_conntrack_put(to->nfct);
to->nfct = from->nfct;
@@ -415,9 +396,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->nf_bridge = from->nf_bridge;
nf_bridge_get(to->nf_bridge);
#endif
-#ifdef CONFIG_NETFILTER_DEBUG
- to->nf_debug = from->nf_debug;
-#endif
#endif
}
@@ -595,7 +573,7 @@ slow_path:
*/
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
- NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
err = -ENOMEM;
goto fail;
}
@@ -1334,23 +1312,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
ip_rt_put(rt);
}
-/*
- * IP protocol layer initialiser
- */
-
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
- .func = ip_rcv,
-};
-
-/*
- * IP registers the packet type and then calls the subprotocol initialisers
- */
-
void __init ip_init(void)
{
- dev_add_pack(&ip_packet_type);
-
ip_rt_init();
inet_initpeers();
@@ -1359,12 +1322,7 @@ void __init ip_init(void)
#endif
}
-EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);
-
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 47012b9..2f0b47d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
- err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
if (err)
return err;
break;
@@ -360,14 +360,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
err = copied;
/* Reset and regenerate socket error */
- spin_lock_irq(&sk->sk_error_queue.lock);
+ spin_lock_bh(&sk->sk_error_queue.lock);
sk->sk_err = 0;
if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_irq(&sk->sk_error_queue.lock);
+ spin_unlock_bh(&sk->sk_error_queue.lock);
sk->sk_error_report(sk);
} else
- spin_unlock_irq(&sk->sk_error_queue.lock);
+ spin_unlock_bh(&sk->sk_error_queue.lock);
out_free_skb:
kfree_skb(skb);
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
struct ip_options * opt = NULL;
if (optlen > 40 || optlen < 0)
goto e_inval;
- err = ip_options_get(&opt, optval, optlen, 1);
+ err = ip_options_get_from_user(&opt, optval, optlen);
if (err)
break;
if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
}
case IP_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_igmp_max_msf;
struct ip_msfilter *msf;
@@ -677,11 +676,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
mreq.imr_address.s_addr = mreqs.imr_interface;
mreq.imr_ifindex = 0;
err = ip_mc_join_group(sk, &mreq);
- if (err)
+ if (err && err != -EADDRINUSE)
break;
omode = MCAST_INCLUDE;
add = 1;
- } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+ } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
omode = MCAST_INCLUDE;
add = 0;
}
@@ -754,7 +753,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
mreq.imr_address.s_addr = 0;
mreq.imr_ifindex = greqs.gsr_interface;
err = ip_mc_join_group(sk, &mreq);
- if (err)
+ if (err && err != -EADDRINUSE)
break;
greqs.gsr_interface = mreq.imr_ifindex;
omode = MCAST_INCLUDE;
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
}
case MCAST_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_igmp_max_msf;
struct sockaddr_in *psin;
struct ip_msfilter *msf = NULL;
@@ -848,6 +846,9 @@ mc_msf_out:
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
@@ -1087,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
EXPORT_SYMBOL(ip_cmsg_recv);
-#ifdef CONFIG_IP_SCTP_MODULE
EXPORT_SYMBOL(ip_getsockopt);
EXPORT_SYMBOL(ip_setsockopt);
-#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 1a23c52..fc718df 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
spi, IPPROTO_COMP, AF_INET);
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
- spi, NIPQUAD(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+ spi, NIPQUAD(iph->daddr));
xfrm_state_put(x);
}
@@ -236,15 +236,10 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t->props.mode = 1;
t->props.saddr.a4 = x->props.saddr.a4;
t->props.flags = x->props.flags;
-
- t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
- if (t->type == NULL)
- goto error;
-
- if (t->type->init_state(t, NULL))
+
+ if (xfrm_init_state(t))
goto error;
- t->km.state = XFRM_STATE_VALID;
atomic_set(&t->tunnel_users, 1);
out:
return t;
@@ -350,8 +345,7 @@ static void ipcomp_free_tfms(struct crypto_tfm **tfms)
for_each_cpu(cpu) {
struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
}
free_percpu(tfms);
}
@@ -363,7 +357,7 @@ static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
int cpu;
/* This can be any valid CPU ID so we don't need locking. */
- cpu = smp_processor_id();
+ cpu = raw_smp_processor_id();
list_for_each_entry(pos, &ipcomp_tfms_list, list) {
struct crypto_tfm *tfm;
@@ -422,7 +416,7 @@ static void ipcomp_destroy(struct xfrm_state *x)
kfree(ipcd);
}
-static int ipcomp_init_state(struct xfrm_state *x, void *args)
+static int ipcomp_init_state(struct xfrm_state *x)
{
int err;
struct ipcomp_data *ipcd;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f250903..953129d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,6 +54,7 @@
#include <linux/major.h>
#include <linux/root_dev.h>
#include <linux/delay.h>
+#include <linux/nfs_fs.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
@@ -393,7 +394,7 @@ static int __init ic_defaults(void)
#ifdef IPCONFIG_RARP
-static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type rarp_packet_type __initdata = {
.type = __constant_htons(ETH_P_RARP),
@@ -414,7 +415,7 @@ static inline void ic_rarp_cleanup(void)
* Process received RARP packet.
*/
static int __init
-ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct arphdr *rarp;
unsigned char *rarp_ptr;
@@ -555,7 +556,7 @@ struct bootp_pkt { /* BOOTP packet format */
#define DHCPRELEASE 7
#define DHCPINFORM 8
-static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
static struct packet_type bootp_packet_type __initdata = {
.type = __constant_htons(ETH_P_IP),
@@ -823,7 +824,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
/*
* Receive BOOTP reply.
*/
-static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct bootp_pkt *b;
struct iphdr *h;
@@ -1149,8 +1150,10 @@ static int __init ic_dynamic(void)
ic_rarp_cleanup();
#endif
- if (!ic_got_reply)
+ if (!ic_got_reply) {
+ ic_myaddr = INADDR_NONE;
return -1;
+ }
printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 68a7873..c05c1df 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -255,7 +255,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
dev_hold(dev);
ipip_tunnel_link(nt);
- /* Do not decrement MOD_USE_COUNT here. */
return nt;
failed:
@@ -273,7 +272,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
dev_put(dev);
}
-static void ipip_err(struct sk_buff *skb, void *__unused)
+static void ipip_err(struct sk_buff *skb, u32 info)
{
#ifndef I_WISH_WORLD_WERE_PERFECT
@@ -852,11 +851,39 @@ static int __init ipip_fb_tunnel_init(struct net_device *dev)
return 0;
}
+#ifdef CONFIG_INET_TUNNEL
static struct xfrm_tunnel ipip_handler = {
.handler = ipip_rcv,
.err_handler = ipip_err,
};
+static inline int ipip_register(void)
+{
+ return xfrm4_tunnel_register(&ipip_handler);
+}
+
+static inline int ipip_unregister(void)
+{
+ return xfrm4_tunnel_deregister(&ipip_handler);
+}
+#else
+static struct net_protocol ipip_protocol = {
+ .handler = ipip_rcv,
+ .err_handler = ipip_err,
+ .no_policy = 1,
+};
+
+static inline int ipip_register(void)
+{
+ return inet_add_protocol(&ipip_protocol, IPPROTO_IPIP);
+}
+
+static inline int ipip_unregister(void)
+{
+ return inet_del_protocol(&ipip_protocol, IPPROTO_IPIP);
+}
+#endif
+
static char banner[] __initdata =
KERN_INFO "IPv4 over IPv4 tunneling driver\n";
@@ -866,7 +893,7 @@ static int __init ipip_init(void)
printk(banner);
- if (xfrm4_tunnel_register(&ipip_handler) < 0) {
+ if (ipip_register() < 0) {
printk(KERN_INFO "ipip init: can't register tunnel\n");
return -EAGAIN;
}
@@ -888,16 +915,33 @@ static int __init ipip_init(void)
err2:
free_netdev(ipip_fb_tunnel_dev);
err1:
- xfrm4_tunnel_deregister(&ipip_handler);
+ ipip_unregister();
goto out;
}
+static void __exit ipip_destroy_tunnels(void)
+{
+ int prio;
+
+ for (prio = 1; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ while ((t = tunnels[prio][h]) != NULL)
+ unregister_netdevice(t->dev);
+ }
+ }
+}
+
static void __exit ipip_fini(void)
{
- if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
+ if (ipip_unregister() < 0)
printk(KERN_INFO "ipip close: can't deregister tunnel\n");
- unregister_netdev(ipip_fb_tunnel_dev);
+ rtnl_lock();
+ ipip_destroy_tunnels();
+ unregister_netdevice(ipip_fb_tunnel_dev);
+ rtnl_unlock();
}
module_init(ipip_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049..9dbf590 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
In this case data path is free of exclusive locks at all.
*/
-static kmem_cache_t *mrt_cachep;
+static kmem_cache_t *mrt_cachep __read_mostly;
static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
static void ipmr_destroy_unres(struct mfc_cache *c)
{
struct sk_buff *skb;
+ struct nlmsgerr *e;
atomic_dec(&cache_resolve_queue_len);
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
nlh->nlmsg_type = NLMSG_ERROR;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+ e = NLMSG_DATA(nlh);
+ e->error = -ETIMEDOUT;
+ memset(&e->msg, 0, sizeof(e->msg));
netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
} else
kfree_skb(skb);
@@ -359,7 +362,7 @@ out:
/* Fill oifs list. It is called under write locked mrt_lock. */
-static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
+static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
int vifi;
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
struct sk_buff *skb;
+ struct nlmsgerr *e;
/*
* Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
nlh->nlmsg_type = NLMSG_ERROR;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
- ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
+ e = NLMSG_DATA(nlh);
+ e->error = -EMSGSIZE;
+ memset(&e->msg, 0, sizeof(e->msg));
}
err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
} else
@@ -721,7 +727,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
if (c != NULL) {
write_lock_bh(&mrt_lock);
c->mfc_parent = mfc->mfcc_parent;
- ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+ ipmr_update_thresholds(c, mfc->mfcc_ttls);
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
@@ -738,7 +744,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
c->mfc_origin=mfc->mfcc_origin.s_addr;
c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
c->mfc_parent=mfc->mfcc_parent;
- ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+ ipmr_update_thresholds(c, mfc->mfcc_ttls);
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
@@ -1350,6 +1356,7 @@ int ip_mr_input(struct sk_buff *skb)
*/
read_lock(&mrt_lock);
if (mroute_socket) {
+ nf_reset(skb);
raw_rcv(mroute_socket, skb);
read_unlock(&mrt_lock);
return 0;
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4..c9820bf 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
# IP Virtual Server configuration
#
menu "IP: Virtual Server Configuration"
- depends on INET && NETFILTER
+ depends on NETFILTER
config IP_VS
tristate "IP virtual server support (EXPERIMENTAL)"
- depends on INET && NETFILTER
+ depends on NETFILTER
---help---
IP Virtual Server support will let you build a high-performance
virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212ad..6e092da 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <net/protocol.h>
+#include <net/tcp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5..e11952e 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
static struct list_head *ip_vs_conn_tab;
/* SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep;
+static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
if (del_timer(&cp->timer))
mod_timer(&cp->timer, jiffies);
- __ip_vs_conn_put(cp);
}
@@ -759,12 +758,11 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
return 1;
}
-
+/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(void)
{
int idx;
struct ip_vs_conn *cp;
- struct ip_vs_conn *ct;
/*
* Randomly scan 1/32 of the whole table every second
@@ -775,7 +773,7 @@ void ip_vs_random_dropentry(void)
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -801,23 +799,14 @@ void ip_vs_random_dropentry(void)
continue;
}
- /*
- * Drop the entry, and drop its ct if not referenced
- */
- atomic_inc(&cp->refcnt);
- ct_write_unlock(hash);
-
- if ((ct = cp->control))
- atomic_inc(&ct->refcnt);
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (ct) {
+ if (cp->control) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(ct);
+ ip_vs_conn_expire_now(cp->control);
}
- ct_write_lock(hash);
}
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
}
}
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
{
int idx;
struct ip_vs_conn *cp;
- struct ip_vs_conn *ct;
flush_again:
for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
ct_write_lock_bh(idx);
list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
- atomic_inc(&cp->refcnt);
- ct_write_unlock(idx);
- if ((ct = cp->control))
- atomic_inc(&ct->refcnt);
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (ct) {
+ if (cp->control) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(ct);
+ ip_vs_conn_expire_now(cp->control);
}
- ct_write_lock(idx);
}
ct_write_unlock_bh(idx);
}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257d..3ac7eec 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
*
* Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
+ * Harald Welte don't use nfcache
*
*/
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+ if (!((*pskb)->ipvs_property))
return NF_ACCEPT;
/* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
- skb->nfcache |= NFC_IPVS_PROPERTY;
+ skb->ipvs_property = 1;
verdict = NF_ACCEPT;
out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
EnterFunction(11);
- if (skb->nfcache & NFC_IPVS_PROPERTY)
+ if (skb->ipvs_property)
return NF_ACCEPT;
iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
ip_vs_conn_put(cp);
- skb->nfcache |= NFC_IPVS_PROPERTY;
+ skb->ipvs_property = 1;
LeaveFunction(11);
return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d970..2d66848 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
#endif
/*
- * update_defense_level is called from keventd and from sysctl.
+ * update_defense_level is called from keventd and from sysctl,
+ * so it needs to protect itself from softirqs
*/
static void update_defense_level(void)
{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
nomem = (availmem < sysctl_ip_vs_amemthresh);
+ local_bh_disable();
+
/* drop_entry */
spin_lock(&__ip_vs_dropentry_lock);
switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
if (to_change >= 0)
ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
write_unlock(&__ip_vs_securetcp_lock);
+
+ local_bh_enable();
}
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
/* Restore the correct value */
*valp = val;
} else {
- local_bh_disable();
update_defense_level();
- local_bh_enable();
}
}
return rc;
@@ -1595,7 +1598,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -1610,7 +1613,7 @@ static ctl_table vs_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table,
+ .child = ipvs_ipv4_table,
},
{ .ctl_name = 0 }
};
@@ -2059,7 +2062,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
dst->addr = src->addr;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strcpy(dst->sched_name, src->scheduler->name);
+ strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2080,6 +2083,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (count >= get->num_services)
goto out;
+ memset(&entry, 0, sizeof(entry));
ip_vs_copy_service(&entry, svc);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
@@ -2094,6 +2098,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (count >= get->num_services)
goto out;
+ memset(&entry, 0, sizeof(entry));
ip_vs_copy_service(&entry, svc);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
@@ -2304,12 +2309,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
memset(&d, 0, sizeof(d));
if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
+ strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
d[0].syncid = ip_vs_master_syncid;
}
if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
+ strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
d[1].syncid = ip_vs_backup_syncid;
}
if (copy_to_user(user, &d, sizeof(d)) != 0)
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838..561cda3 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table
+ .child = ipvs_ipv4_table
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd5..ce456db 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
{
.ctl_name = NET_IPV4,
.procname = "ipv4",
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
.ctl_name = CTL_NET,
.procname = "net",
.mode = 0555,
- .child = ipv4_table
+ .child = ipvs_ipv4_table
},
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de67..c194089 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
}
-static void tcp_init(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(tcp_apps);
pp->timeout_table = tcp_timeouts;
}
-static void tcp_exit(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
}
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.protocol = IPPROTO_TCP,
.dont_defrag = 0,
.appcnt = ATOMIC_INIT(0),
- .init = tcp_init,
- .exit = tcp_exit,
+ .init = ip_vs_tcp_init,
+ .exit = ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c4795..574d1f5 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
ip_vs_sync_state |= state;
if (state == IP_VS_STATE_MASTER) {
- strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
+ strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
ip_vs_master_syncid = syncid;
} else {
- strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
+ strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
ip_vs_backup_syncid = syncid;
}
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da0..3b87482 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,8 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
#define IP_VS_XMIT(skb, rt) \
do { \
- nf_reset_debug(skb); \
- (skb)->nfcache |= NFC_IPVS_PROPERTY; \
+ (skb)->ipvs_property = 1; \
(skb)->ip_summed = CHECKSUM_NONE; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
(rt)->u.dst.dev, dst_output); \
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index cf2e6bc..db67373 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -31,6 +31,7 @@
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
@@ -106,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
return NOTIFY_DONE;
}
-struct notifier_block drr_dev_notifier = {
+static struct notifier_block drr_dev_notifier = {
.notifier_call = drr_dev_event,
};
@@ -247,3 +248,4 @@ static void __exit drr_exit(void)
module_init(drr_init);
module_exit(drr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
index 805a16e..5249dbe 100644
--- a/net/ipv4/multipath_random.c
+++ b/net/ipv4/multipath_random.c
@@ -31,6 +31,7 @@
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
@@ -126,3 +127,4 @@ static void __exit random_exit(void)
module_init(random_init);
module_exit(random_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
index 061b6b2..b6cd287 100644
--- a/net/ipv4/multipath_rr.c
+++ b/net/ipv4/multipath_rr.c
@@ -31,6 +31,7 @@
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
@@ -93,3 +94,4 @@ static void __exit rr_exit(void)
module_init(rr_init);
module_exit(rr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
index c3d2ca1..bd7d75b 100644
--- a/net/ipv4/multipath_wrandom.c
+++ b/net/ipv4/multipath_wrandom.c
@@ -31,6 +31,7 @@
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/module.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <net/ip.h>
@@ -342,3 +343,4 @@ static void __exit wrandom_exit(void)
module_init(wrandom_init);
module_exit(wrandom_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 0000000..ae0779d
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
+/* IPv4 specific functions of netfilter core */
+
+#include <linux/config.h>
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff **pskb)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct rtable *rt;
+ struct flowi fl = {};
+ struct dst_entry *odst;
+ unsigned int hh_len;
+
+ /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+ * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+ */
+ if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
+ fl.nl_u.ip4_u.daddr = iph->daddr;
+ fl.nl_u.ip4_u.saddr = iph->saddr;
+ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+ fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
+#endif
+ fl.proto = iph->protocol;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ /* Drop old route. */
+ dst_release((*pskb)->dst);
+ (*pskb)->dst = &rt->u.dst;
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+ fl.nl_u.ip4_u.daddr = iph->saddr;
+ if (ip_route_output_key(&rt, &fl) != 0)
+ return -1;
+
+ odst = (*pskb)->dst;
+ if (ip_route_input(*pskb, iph->daddr, iph->saddr,
+ RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+ dst_release(&rt->u.dst);
+ return -1;
+ }
+ dst_release(&rt->u.dst);
+ dst_release(odst);
+ }
+
+ if ((*pskb)->dst->error)
+ return -1;
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = (*pskb)->dst->dev->hard_header_len;
+ if (skb_headroom(*pskb) < hh_len) {
+ struct sk_buff *nskb;
+
+ nskb = skb_realloc_headroom(*pskb, hh_len);
+ if (!nskb)
+ return -1;
+ if ((*pskb)->sk)
+ skb_set_owner_w(nskb, (*pskb)->sk);
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {
+ u_int32_t daddr;
+ u_int32_t saddr;
+ u_int8_t tos;
+};
+
+static void queue_save(const struct sk_buff *skb, struct nf_info *info)
+{
+ struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP_LOCAL_OUT) {
+ const struct iphdr *iph = skb->nh.iph;
+
+ rt_info->tos = iph->tos;
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ }
+}
+
+static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+ const struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP_LOCAL_OUT) {
+ struct iphdr *iph = (*pskb)->nh.iph;
+
+ if (!(iph->tos == rt_info->tos
+ && iph->daddr == rt_info->daddr
+ && iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(pskb);
+ }
+ return 0;
+}
+
+static struct nf_queue_rerouter ip_reroute = {
+ .rer_size = sizeof(struct ip_rt_info),
+ .save = queue_save,
+ .reroute = queue_reroute,
+};
+
+static int init(void)
+{
+ return nf_register_queue_rerouter(PF_INET, &ip_reroute);
+}
+
+static void fini(void)
+{
+ nf_unregister_queue_rerouter(PF_INET);
+}
+
+module_init(init);
+module_exit(fini);
+
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1..30aa8e2 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -34,12 +34,23 @@ config IP_NF_CT_ACCT
config IP_NF_CONNTRACK_MARK
bool 'Connection mark tracking support'
+ depends on IP_NF_CONNTRACK
help
This option enables support for connection marks, used by the
`CONNMARK' target and `connmark' match. Similar to the mark value
of packets, but this mark value is kept in the conntrack session
instead of the individual packets.
+config IP_NF_CONNTRACK_EVENTS
+ bool "Connection tracking events"
+ depends on IP_NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ provide a notifier chain that can be used by other kernel code
+ to get notified about changes in the connection tracking state.
+
+ IF unsure, say `N'.
+
config IP_NF_CT_PROTO_SCTP
tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -75,6 +86,25 @@ config IP_NF_IRC
To compile it as a module, choose M here. If unsure, say Y.
+config IP_NF_NETBIOS_NS
+ tristate "NetBIOS name service protocol support (EXPERIMENTAL)"
+ depends on IP_NF_CONNTRACK && EXPERIMENTAL
+ help
+ NetBIOS name service requests are sent as broadcast messages from an
+ unprivileged port and responded to with unicast messages to the
+ same port. This make them hard to firewall properly because connection
+ tracking doesn't deal with broadcasts. This helper tracks locally
+ originating NetBIOS name service requests and the corresponding
+ responses. It relies on correct IP address configuration, specifically
+ netmask and broadcast address. When properly configured, the output
+ of "ip address show" should look similar to this:
+
+ $ ip -4 address show eth0
+ 4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+ inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_NF_TFTP
tristate "TFTP protocol support"
depends on IP_NF_CONNTRACK
@@ -100,11 +130,15 @@ config IP_NF_AMANDA
To compile it as a module, choose M here. If unsure, say Y.
config IP_NF_QUEUE
- tristate "Userspace queueing via NETLINK"
+ tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
help
Netfilter has the ability to queue packets to user space: the
netlink device can be used to access them using this driver.
+ This option enables the old IPv4-only "ip_queue" implementation
+ which has been obsoleted by the new "nfnetlink_queue" code (see
+ CONFIG_NETFILTER_NETLINK_QUEUE).
+
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_IPTABLES
@@ -340,6 +374,17 @@ config IP_NF_MATCH_SCTP
If you want to compile it as a module, say M here and read
<file:Documentation/modules.txt>. If unsure, say `N'.
+config IP_NF_MATCH_DCCP
+ tristate 'DCCP protocol match support'
+ depends on IP_NF_IPTABLES
+ help
+ With this option enabled, you will be able to use the iptables
+ `dccp' match in order to match on DCCP source/destination ports
+ and DCCP flags.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
config IP_NF_MATCH_COMMENT
tristate 'comment match support'
depends on IP_NF_IPTABLES
@@ -361,6 +406,16 @@ config IP_NF_MATCH_CONNMARK
<file:Documentation/modules.txt>. The module will be called
ipt_connmark.o. If unsure, say `N'.
+config IP_NF_MATCH_CONNBYTES
+ tristate 'Connection byte/packet counter match support'
+ depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
+ help
+ This option adds a `connbytes' match, which allows you to match the
+ number of bytes and/or packets for each direction within a connection.
+
+ If you want to compile it as a module, say M here and read
+ <file:Documentation/modules.txt>. If unsure, say `N'.
+
config IP_NF_MATCH_HASHLIMIT
tristate 'hashlimit match support'
depends on IP_NF_IPTABLES
@@ -375,6 +430,19 @@ config IP_NF_MATCH_HASHLIMIT
destination IP' or `500pps from any given source IP' with a single
IPtables rule.
+config IP_NF_MATCH_STRING
+ tristate 'string match support'
+ depends on IP_NF_IPTABLES
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ help
+ This option adds a `string' match, which allows you to look for
+ pattern matchings in packets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# `filter', generic and specific targets
config IP_NF_FILTER
tristate "Packet filtering"
@@ -616,6 +684,20 @@ config IP_NF_TARGET_CLASSIFY
To compile it as a module, choose M here. If unsure, say N.
+config IP_NF_TARGET_TTL
+ tristate 'TTL target support'
+ depends on IP_NF_MANGLE
+ help
+ This option adds a `TTL' target, which enables the user to modify
+ the TTL value of the IP header.
+
+ While it is safe to decrement/lower the TTL, this target also enables
+ functionality to increment and set the TTL value of the IP header to
+ arbitrary values. This is EXTREMELY DANGEROUS since you can easily
+ create immortal packets that loop forever on the network.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP_NF_TARGET_CONNMARK
tristate 'CONNMARK target support'
depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
@@ -692,5 +774,11 @@ config IP_NF_ARP_MANGLE
Allows altering the ARP packet payload: source and destination
hardware and network addresses.
+config IP_NF_CONNTRACK_NETLINK
+ tristate 'Connection tracking netlink interface'
+ depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+ help
+ This option enables support for a netlink-based userspace interface
+
endmenu
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5..1ba0db7 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe
# connection tracking
obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+# conntrack netlink interface
+obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+
+
# SCTP protocol connection tracking
obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
@@ -17,6 +21,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o
# NAT helpers
obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
@@ -38,6 +43,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +60,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
# targets
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +86,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
# generic ARP tables
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -87,3 +96,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5e..fa16342 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <linux/netfilter_ipv4/listhelp.h>
struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd0..dc20881 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
#include <net/checksum.h>
#include <net/udp.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
@@ -41,8 +40,8 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
static char *conns[] = { "DATA ", "MESG ", "INDEX " };
/* This is slow, but it's simple. --RR */
-static char amanda_buffer[65536];
-static DECLARE_LOCK(amanda_buffer_lock);
+static char *amanda_buffer;
+static DEFINE_SPINLOCK(amanda_buffer_lock);
unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
return NF_ACCEPT;
}
- LOCK_BH(&amanda_buffer_lock);
+ spin_lock_bh(&amanda_buffer_lock);
skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
data = amanda_buffer;
data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -102,14 +101,14 @@ static int help(struct sk_buff **pskb,
if (port == 0 || len > 5)
break;
- exp = ip_conntrack_expect_alloc();
+ exp = ip_conntrack_expect_alloc(ct);
if (exp == NULL) {
ret = NF_DROP;
goto out;
}
exp->expectfn = NULL;
- exp->master = ct;
+ exp->flags = 0;
exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
exp->tuple.src.u.tcp.port = 0;
@@ -127,14 +126,13 @@ static int help(struct sk_buff **pskb,
ret = ip_nat_amanda_hook(pskb, ctinfo,
tmp - amanda_buffer,
len, exp);
- else if (ip_conntrack_expect_related(exp) != 0) {
- ip_conntrack_expect_free(exp);
+ else if (ip_conntrack_expect_related(exp) != 0)
ret = NF_DROP;
- }
+ ip_conntrack_expect_put(exp);
}
out:
- UNLOCK_BH(&amanda_buffer_lock);
+ spin_unlock_bh(&amanda_buffer_lock);
return ret;
}
@@ -156,11 +154,25 @@ static struct ip_conntrack_helper amanda_helper = {
static void __exit fini(void)
{
ip_conntrack_helper_unregister(&amanda_helper);
+ kfree(amanda_buffer);
}
static int __init init(void)
{
- return ip_conntrack_helper_register(&amanda_helper);
+ int ret;
+
+ amanda_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!amanda_buffer)
+ return -ENOMEM;
+
+ ret = ip_conntrack_helper_register(&amanda_helper);
+ if (ret < 0) {
+ kfree(amanda_buffer);
+ return ret;
+ }
+ return 0;
+
+
}
module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e8246..19cba16 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,11 +37,12 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
+#include <linux/notifier.h>
-/* This rwlock protects the main hash table, protocol/helper/expected
+/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
registrations, conntrack timers*/
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -49,7 +50,7 @@
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>
-#define IP_CONNTRACK_VERSION "2.1"
+#define IP_CONNTRACK_VERSION "2.3"
#if 0
#define DEBUGP printk
@@ -57,7 +58,7 @@
#define DEBUGP(format, args...)
#endif
-DECLARE_RWLOCK(ip_conntrack_lock);
+DEFINE_RWLOCK(ip_conntrack_lock);
/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -69,22 +70,81 @@ static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
-void
-ip_conntrack_put(struct ip_conntrack *ct)
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
- IP_NF_ASSERT(ct);
- nf_conntrack_put(&ct->ct_general);
+ DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+ if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+ notifier_call_chain(&ip_conntrack_chain, ecache->events,
+ ecache->ct);
+ ecache->events = 0;
+ ip_conntrack_put(ecache->ct);
+ ecache->ct = NULL;
}
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
+{
+ struct ip_conntrack_ecache *ecache;
+
+ local_bh_disable();
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ if (ecache->ct == ct)
+ __ip_ct_deliver_cached_events(ecache);
+ local_bh_enable();
+}
+
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
+{
+ struct ip_conntrack_ecache *ecache;
+
+ /* take care of delivering potentially old events */
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ BUG_ON(ecache->ct == ct);
+ if (ecache->ct)
+ __ip_ct_deliver_cached_events(ecache);
+ /* initialize for this conntrack/packet */
+ ecache->ct = ct;
+ nf_conntrack_get(&ct->ct_general);
+}
+
+/* flush the event cache - touches other CPU's data and must not be called while
+ * packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
+{
+ struct ip_conntrack_ecache *ecache;
+ int cpu;
+
+ for_each_cpu(cpu) {
+ ecache = &per_cpu(ip_conntrack_ecache, cpu);
+ if (ecache->ct)
+ ip_conntrack_put(ecache->ct);
+ }
+}
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
@@ -137,30 +197,51 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
/* ip_conntrack_expect helper functions */
-static void destroy_expect(struct ip_conntrack_expect *exp)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
- ip_conntrack_put(exp->master);
+ ASSERT_WRITE_LOCK(&ip_conntrack_lock);
IP_NF_ASSERT(!timer_pending(&exp->timeout));
- kmem_cache_free(ip_conntrack_expect_cachep, exp);
- CONNTRACK_STAT_INC(expect_delete);
-}
-
-static void unlink_expect(struct ip_conntrack_expect *exp)
-{
- MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
list_del(&exp->list);
- /* Logically in destroy_expect, but we hold the lock here. */
+ CONNTRACK_STAT_INC(expect_delete);
exp->master->expecting--;
+ ip_conntrack_expect_put(exp);
}
static void expectation_timed_out(unsigned long ul_expect)
{
struct ip_conntrack_expect *exp = (void *)ul_expect;
- WRITE_LOCK(&ip_conntrack_lock);
- unlink_expect(exp);
- WRITE_UNLOCK(&ip_conntrack_lock);
- destroy_expect(exp);
+ write_lock_bh(&ip_conntrack_lock);
+ ip_ct_unlink_expect(exp);
+ write_unlock_bh(&ip_conntrack_lock);
+ ip_conntrack_expect_put(exp);
+}
+
+struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+ atomic_inc(&i->use);
+ return i;
+ }
+ }
+ return NULL;
+}
+
+/* Just find a expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_expect *i;
+
+ read_lock_bh(&ip_conntrack_lock);
+ i = __ip_conntrack_expect_find(tuple);
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return i;
}
/* If an expectation for this connection is found, it gets delete from
@@ -177,17 +258,21 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
master ct never got confirmed, we'd hold a reference to it
and weird things would happen to future packets). */
if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
- && is_confirmed(i->master)
- && del_timer(&i->timeout)) {
- unlink_expect(i);
- return i;
+ && is_confirmed(i->master)) {
+ if (i->flags & IP_CT_EXPECT_PERMANENT) {
+ atomic_inc(&i->use);
+ return i;
+ } else if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ return i;
+ }
}
}
return NULL;
}
/* delete all expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct)
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
struct ip_conntrack_expect *i, *tmp;
@@ -197,8 +282,8 @@ static void remove_expectations(struct ip_conntrack *ct)
list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
if (i->master == ct && del_timer(&i->timeout)) {
- unlink_expect(i);
- destroy_expect(i);
+ ip_ct_unlink_expect(i);
+ ip_conntrack_expect_put(i);
}
}
}
@@ -209,7 +294,7 @@ clean_from_lists(struct ip_conntrack *ct)
unsigned int ho, hr;
DEBUGP("clean_from_lists(%p)\n", ct);
- MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+ ASSERT_WRITE_LOCK(&ip_conntrack_lock);
ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -217,7 +302,7 @@ clean_from_lists(struct ip_conntrack *ct)
LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
/* Destroy all pending expectations */
- remove_expectations(ct);
+ ip_ct_remove_expectations(ct);
}
static void
@@ -230,22 +315,25 @@ destroy_conntrack(struct nf_conntrack *nfct)
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
+ ip_conntrack_event(IPCT_DESTROY, ct);
+ set_bit(IPS_DYING_BIT, &ct->status);
+
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to ip_conntrack_lock!!! -HW */
- proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
if (proto && proto->destroy)
proto->destroy(ct);
if (ip_conntrack_destroyed)
ip_conntrack_destroyed(ct);
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
* too. */
- remove_expectations(ct);
+ ip_ct_remove_expectations(ct);
/* We overload first tuple to link into unconfirmed list. */
if (!is_confirmed(ct)) {
@@ -254,26 +342,25 @@ destroy_conntrack(struct nf_conntrack *nfct)
}
CONNTRACK_STAT_INC(delete);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
if (ct->master)
ip_conntrack_put(ct->master);
DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
- kmem_cache_free(ip_conntrack_cachep, ct);
- atomic_dec(&ip_conntrack_count);
+ ip_conntrack_free(ct);
}
static void death_by_timeout(unsigned long ul_conntrack)
{
struct ip_conntrack *ct = (void *)ul_conntrack;
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
/* Inside lock so preempt is disabled on module removal path.
* Otherwise we can get spurious warnings. */
CONNTRACK_STAT_INC(delete_list);
clean_from_lists(ct);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_put(ct);
}
@@ -282,19 +369,19 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *ignored_conntrack)
{
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ ASSERT_READ_LOCK(&ip_conntrack_lock);
return tuplehash_to_ctrack(i) != ignored_conntrack
&& ip_ct_tuple_equal(tuple, &i->tuple);
}
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *ignored_conntrack)
{
struct ip_conntrack_tuple_hash *h;
unsigned int hash = hash_conntrack(tuple);
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ ASSERT_READ_LOCK(&ip_conntrack_lock);
list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
CONNTRACK_STAT_INC(found);
@@ -313,15 +400,38 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
{
struct ip_conntrack_tuple_hash *h;
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
h = __ip_conntrack_find(tuple, ignored_conntrack);
if (h)
atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
return h;
}
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+ unsigned int hash,
+ unsigned int repl_hash)
+{
+ ct->id = ++ip_conntrack_next_id;
+ list_prepend(&ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
+{
+ unsigned int hash, repl_hash;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ write_lock_bh(&ip_conntrack_lock);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
+ write_unlock_bh(&ip_conntrack_lock);
+}
+
/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
@@ -352,7 +462,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
IP_NF_ASSERT(!is_confirmed(ct));
DEBUGP("Confirming conntrack %p\n", ct);
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
@@ -368,10 +478,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
/* Remove from unconfirmed list */
list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
- list_prepend(&ip_conntrack_hash[hash],
- &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
- list_prepend(&ip_conntrack_hash[repl_hash],
- &ct->tuplehash[IP_CT_DIR_REPLY]);
+ __ip_conntrack_hash_insert(ct, hash, repl_hash);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
@@ -380,12 +487,22 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
atomic_inc(&ct->ct_general.use);
set_bit(IPS_CONFIRMED_BIT, &ct->status);
CONNTRACK_STAT_INC(insert);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
+ if (ct->helper)
+ ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+ if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+ test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+ ip_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, *pskb);
+
return NF_ACCEPT;
}
CONNTRACK_STAT_INC(insert_failed);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
return NF_DROP;
}
@@ -398,9 +515,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
{
struct ip_conntrack_tuple_hash *h;
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
h = __ip_conntrack_find(tuple, ignored_conntrack);
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
return h != NULL;
}
@@ -419,13 +536,13 @@ static int early_drop(struct list_head *chain)
struct ip_conntrack *ct = NULL;
int dropped = 0;
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
if (h) {
ct = tuplehash_to_ctrack(h);
atomic_inc(&ct->ct_general.use);
}
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
if (!ct)
return dropped;
@@ -445,34 +562,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}
-static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
{
return LIST_FIND(&helpers, helper_cmp,
struct ip_conntrack_helper *,
tuple);
}
-/* Allocate a new conntrack: we return -ENOMEM if classification
- failed due to stress. Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
- struct ip_conntrack_protocol *protocol,
- struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_helper *helper;
+
+ /* need ip_conntrack_lock to assure that helper exists until
+ * try_module_get() is called */
+ read_lock_bh(&ip_conntrack_lock);
+
+ helper = __ip_conntrack_helper_find(tuple);
+ if (helper) {
+ /* need to increase module usage count to assure helper will
+ * not go away while the caller is e.g. busy putting a
+ * conntrack in the hash that uses the helper */
+ if (!try_module_get(helper->me))
+ helper = NULL;
+ }
+
+ read_unlock_bh(&ip_conntrack_lock);
+
+ return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+ module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+ return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *p;
+
+ preempt_disable();
+ p = __ip_conntrack_proto_find(protocol);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_conntrack_generic_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+ module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+ struct ip_conntrack_tuple *repl)
{
struct ip_conntrack *conntrack;
- struct ip_conntrack_tuple repl_tuple;
- size_t hash;
- struct ip_conntrack_expect *exp;
if (!ip_conntrack_hash_rnd_initted) {
get_random_bytes(&ip_conntrack_hash_rnd, 4);
ip_conntrack_hash_rnd_initted = 1;
}
- hash = hash_conntrack(tuple);
-
if (ip_conntrack_max
&& atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+ unsigned int hash = hash_conntrack(orig);
/* Try dropping from this hash chain. */
if (!early_drop(&ip_conntrack_hash[hash])) {
if (net_ratelimit())
@@ -483,11 +650,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
}
}
- if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
- DEBUGP("Can't invert tuple.\n");
- return NULL;
- }
-
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
if (!conntrack) {
DEBUGP("Can't allocate conntrack.\n");
@@ -497,18 +659,51 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
memset(conntrack, 0, sizeof(*conntrack));
atomic_set(&conntrack->ct_general.use, 1);
conntrack->ct_general.destroy = destroy_conntrack;
- conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
- conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
- if (!protocol->new(conntrack, skb)) {
- kmem_cache_free(ip_conntrack_cachep, conntrack);
- return NULL;
- }
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
/* Don't set timer yet: wait for confirmation */
init_timer(&conntrack->timeout);
conntrack->timeout.data = (unsigned long)conntrack;
conntrack->timeout.function = death_by_timeout;
- WRITE_LOCK(&ip_conntrack_lock);
+ atomic_inc(&ip_conntrack_count);
+
+ return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+ atomic_dec(&ip_conntrack_count);
+ kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress. Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ struct ip_conntrack_expect *exp;
+
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+ if (conntrack == NULL || IS_ERR(conntrack))
+ return (struct ip_conntrack_tuple_hash *)conntrack;
+
+ if (!protocol->new(conntrack, skb)) {
+ ip_conntrack_free(conntrack);
+ return NULL;
+ }
+
+ write_lock_bh(&ip_conntrack_lock);
exp = find_expectation(tuple);
if (exp) {
@@ -517,13 +712,18 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
conntrack->master = exp->master;
-#if CONFIG_IP_NF_CONNTRACK_MARK
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
conntrack->mark = exp->master->mark;
#endif
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+ /* this is ugly, but there is no other place where to put it */
+ conntrack->nat.masq_index = exp->master->nat.masq_index;
+#endif
nf_conntrack_get(&conntrack->master->ct_general);
CONNTRACK_STAT_INC(expect_new);
} else {
- conntrack->helper = ip_ct_find_helper(&repl_tuple);
+ conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
CONNTRACK_STAT_INC(new);
}
@@ -531,13 +731,12 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
/* Overload tuple linked list to put us in unconfirmed list. */
list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
- atomic_inc(&ip_conntrack_count);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
if (exp) {
if (exp->expectfn)
exp->expectfn(conntrack, exp);
- destroy_expect(exp);
+ ip_conntrack_expect_put(exp);
}
return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
@@ -609,7 +808,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
struct ip_conntrack_protocol *proto;
- int set_reply;
+ int set_reply = 0;
int ret;
/* Previously seen (loopback or untracked)? Ignore. */
@@ -627,9 +826,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
return NF_DROP;
}
- /* FIXME: Do this right please. --RR */
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
/* Ignore broadcast: no `connection'. */
@@ -645,7 +841,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
}
#endif
- proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+ proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
/* It may be an special packet, error, unclean...
* inverse of the return code tells to the netfilter
@@ -681,8 +877,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
return -ret;
}
- if (set_reply)
- set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_STATUS, *pskb);
return ret;
}
@@ -691,7 +887,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
const struct ip_conntrack_tuple *orig)
{
return ip_ct_invert_tuple(inverse, orig,
- ip_ct_find_proto(orig->dst.protonum));
+ __ip_conntrack_proto_find(orig->dst.protonum));
}
/* Would two expected things clash? */
@@ -723,20 +919,23 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
struct ip_conntrack_expect *i;
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
/* choose the the oldest expectation to evict */
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (expect_matches(i, exp) && del_timer(&i->timeout)) {
- unlink_expect(i);
- WRITE_UNLOCK(&ip_conntrack_lock);
- destroy_expect(i);
+ ip_ct_unlink_expect(i);
+ write_unlock_bh(&ip_conntrack_lock);
+ ip_conntrack_expect_put(i);
return;
}
}
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
}
-struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
struct ip_conntrack_expect *new;
@@ -745,31 +944,31 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
DEBUGP("expect_related: OOM allocating expect\n");
return NULL;
}
- new->master = NULL;
+ new->master = me;
+ atomic_set(&new->use, 1);
return new;
}
-void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
- kmem_cache_free(ip_conntrack_expect_cachep, expect);
+ if (atomic_dec_and_test(&exp->use))
+ kmem_cache_free(ip_conntrack_expect_cachep, exp);
}
static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
- atomic_inc(&exp->master->ct_general.use);
+ atomic_inc(&exp->use);
exp->master->expecting++;
list_add(&exp->list, &ip_conntrack_expect_list);
- if (exp->master->helper->timeout) {
- init_timer(&exp->timeout);
- exp->timeout.data = (unsigned long)exp;
- exp->timeout.function = expectation_timed_out;
- exp->timeout.expires
- = jiffies + exp->master->helper->timeout * HZ;
- add_timer(&exp->timeout);
- } else
- exp->timeout.function = NULL;
+ init_timer(&exp->timeout);
+ exp->timeout.data = (unsigned long)exp;
+ exp->timeout.function = expectation_timed_out;
+ exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
+ add_timer(&exp->timeout);
+ exp->id = ++ip_conntrack_expect_next_id;
+ atomic_inc(&exp->use);
CONNTRACK_STAT_INC(expect_create);
}
@@ -781,8 +980,8 @@ static void evict_oldest_expect(struct ip_conntrack *master)
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (i->master == master) {
if (del_timer(&i->timeout)) {
- unlink_expect(i);
- destroy_expect(i);
+ ip_ct_unlink_expect(i);
+ ip_conntrack_expect_put(i);
}
break;
}
@@ -808,14 +1007,12 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
list_for_each_entry(i, &ip_conntrack_expect_list, list) {
if (expect_matches(i, expect)) {
/* Refresh timer: if it's dying, ignore.. */
if (refresh_timer(i)) {
ret = 0;
- /* We don't need the one they've given us. */
- ip_conntrack_expect_free(expect);
goto out;
}
} else if (expect_clash(i, expect)) {
@@ -830,9 +1027,10 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
evict_oldest_expect(expect->master);
ip_conntrack_expect_insert(expect);
+ ip_conntrack_expect_event(IPEXP_NEW, expect);
ret = 0;
out:
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
return ret;
}
@@ -841,7 +1039,7 @@ out:
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
const struct ip_conntrack_tuple *newreply)
{
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
/* Should be unconfirmed, so not in hash table yet */
IP_NF_ASSERT(!is_confirmed(conntrack));
@@ -850,25 +1048,40 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
if (!conntrack->master && conntrack->expecting == 0)
- conntrack->helper = ip_ct_find_helper(newreply);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ conntrack->helper = __ip_conntrack_helper_find(newreply);
+ write_unlock_bh(&ip_conntrack_lock);
}
int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
BUG_ON(me->timeout == 0);
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
list_prepend(&helpers, me);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
return 0;
}
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+ struct ip_conntrack_helper *h;
+
+ list_for_each_entry(h, &helpers, list) {
+ if (!strcmp(h->name, name))
+ return h;
+ }
+
+ return NULL;
+}
+
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_helper *me)
{
- if (tuplehash_to_ctrack(i)->helper == me)
+ if (tuplehash_to_ctrack(i)->helper == me) {
+ ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
tuplehash_to_ctrack(i)->helper = NULL;
+ }
return 0;
}
@@ -878,14 +1091,14 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
struct ip_conntrack_expect *exp, *tmp;
/* Need write lock here, to delete helper. */
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
LIST_DELETE(&helpers, me);
/* Get rid of expectations */
list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
if (exp->master->helper == me && del_timer(&exp->timeout)) {
- unlink_expect(exp);
- destroy_expect(exp);
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
}
}
/* Get rid of expecteds, set helpers to NULL. */
@@ -893,7 +1106,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
for (i = 0; i < ip_conntrack_htable_size; i++)
LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
struct ip_conntrack_tuple_hash *, me);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
/* Someone could be still looking at the helper in a bh. */
synchronize_net();
@@ -925,40 +1138,63 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
ct->timeout.expires = extra_jiffies;
ct_add_counters(ct, ctinfo, skb);
} else {
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
/* Need del_timer for race avoidance (may already be dying). */
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
+ ip_conntrack_event_cache(IPCT_REFRESH, skb);
}
ct_add_counters(ct, ctinfo, skb);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
}
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+ &tuple->src.u.tcp.port);
+ NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+ &tuple->dst.u.tcp.port);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+ struct ip_conntrack_tuple *t)
+{
+ if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+ return -EINVAL;
+
+ t->src.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+ t->dst.u.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+ return 0;
+}
+#endif
+
/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
-#ifdef CONFIG_NETFILTER_DEBUG
- unsigned int olddebug = skb->nf_debug;
-#endif
-
skb_orphan(skb);
local_bh_disable();
skb = ip_defrag(skb, user);
local_bh_enable();
- if (skb) {
+ if (skb)
ip_send_check(skb->nh.iph);
- skb->nfcache |= NFC_ALTERED;
-#ifdef CONFIG_NETFILTER_DEBUG
- /* Packet path as if nothing had happened. */
- skb->nf_debug = olddebug;
-#endif
- }
-
return skb;
}
@@ -997,7 +1233,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
{
struct ip_conntrack_tuple_hash *h = NULL;
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +1245,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
struct ip_conntrack_tuple_hash *, iter, data);
if (h)
atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
return h;
}
@@ -1108,23 +1344,31 @@ static void free_conntrack_hash(void)
* ip_conntrack_htable_size));
}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
+void ip_conntrack_flush()
{
- ip_ct_attach = NULL;
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
synchronize_net();
-
+
+ ip_ct_event_cache_flush();
i_see_dead_people:
ip_ct_iterate_cleanup(kill_all, NULL);
if (atomic_read(&ip_conntrack_count) != 0) {
schedule();
goto i_see_dead_people;
}
+ /* wait until all references to ip_conntrack_untracked are dropped */
+ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+ schedule();
+}
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+ ip_ct_attach = NULL;
+ ip_conntrack_flush();
kmem_cache_destroy(ip_conntrack_cachep);
kmem_cache_destroy(ip_conntrack_expect_cachep);
free_conntrack_hash();
@@ -1201,14 +1445,14 @@ int __init ip_conntrack_init(void)
}
/* Don't NEED lock here, but good form anyway. */
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
for (i = 0; i < MAX_IP_CT_PROTO; i++)
ip_ct_protos[i] = &ip_conntrack_generic_protocol;
/* Sew in builtin protocols. */
ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
for (i = 0; i < ip_conntrack_htable_size; i++)
INIT_LIST_HEAD(&ip_conntrack_hash[i]);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503..1b79ec3 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
#include <net/checksum.h>
#include <net/tcp.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
#include <linux/moduleparam.h>
@@ -26,9 +25,8 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
/* This is slow, but it's simple. --RR */
-static char ftp_buffer[65536];
-
-static DECLARE_LOCK(ip_ftp_lock);
+static char *ftp_buffer;
+static DEFINE_SPINLOCK(ip_ftp_lock);
#define MAX_PORTS 8
static int ports[MAX_PORTS];
@@ -263,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
}
/* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+ struct sk_buff *skb)
{
unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
@@ -277,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
oldest = i;
}
- if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
- else if (oldest != NUM_SEQ_TO_REMEMBER)
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ } else if (oldest != NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][oldest] = nl_seq;
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ }
}
static int help(struct sk_buff **pskb,
@@ -319,7 +321,7 @@ static int help(struct sk_buff **pskb,
}
datalen = (*pskb)->len - dataoff;
- LOCK_BH(&ip_ftp_lock);
+ spin_lock_bh(&ip_ftp_lock);
fb_ptr = skb_header_pointer(*pskb, dataoff,
(*pskb)->len - dataoff, ftp_buffer);
BUG_ON(fb_ptr == NULL);
@@ -377,7 +379,7 @@ static int help(struct sk_buff **pskb,
fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
/* Allocate expectation which will be inserted */
- exp = ip_conntrack_expect_alloc();
+ exp = ip_conntrack_expect_alloc(ct);
if (exp == NULL) {
ret = NF_DROP;
goto out;
@@ -404,8 +406,7 @@ static int help(struct sk_buff **pskb,
networks, or the packet filter itself). */
if (!loose) {
ret = NF_ACCEPT;
- ip_conntrack_expect_free(exp);
- goto out_update_nl;
+ goto out_put_expect;
}
exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
| (array[2] << 8) | array[3]);
@@ -420,7 +421,7 @@ static int help(struct sk_buff **pskb,
{ 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
exp->expectfn = NULL;
- exp->master = ct;
+ exp->flags = 0;
/* Now, NAT might want to mangle the packet, and register the
* (possibly changed) expectation itself. */
@@ -429,20 +430,22 @@ static int help(struct sk_buff **pskb,
matchoff, matchlen, exp, &seq);
else {
/* Can't expect this? Best to drop packet now. */
- if (ip_conntrack_expect_related(exp) != 0) {
- ip_conntrack_expect_free(exp);
+ if (ip_conntrack_expect_related(exp) != 0)
ret = NF_DROP;
- } else
+ else
ret = NF_ACCEPT;
}
+out_put_expect:
+ ip_conntrack_expect_put(exp);
+
out_update_nl:
/* Now if this ends in \n, update ftp info. Seq may have been
* adjusted by NAT code. */
if (ends_in_nl)
- update_nl_seq(seq, ct_ftp_info,dir);
+ update_nl_seq(seq, ct_ftp_info,dir, *pskb);
out:
- UNLOCK_BH(&ip_ftp_lock);
+ spin_unlock_bh(&ip_ftp_lock);
return ret;
}
@@ -458,6 +461,8 @@ static void fini(void)
ports[i]);
ip_conntrack_helper_unregister(&ftp[i]);
}
+
+ kfree(ftp_buffer);
}
static int __init init(void)
@@ -465,6 +470,10 @@ static int __init init(void)
int i, ret;
char *tmpname;
+ ftp_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!ftp_buffer)
+ return -ENOMEM;
+
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc734..d7a8a98 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
#include <net/checksum.h>
#include <net/tcp.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
#include <linux/moduleparam.h>
@@ -40,8 +39,8 @@ static int ports_c;
static int max_dcc_channels = 8;
static unsigned int dcc_timeout = 300;
/* This is slow, but it's simple. --RR */
-static char irc_buffer[65536];
-static DECLARE_LOCK(irc_buffer_lock);
+static char *irc_buffer;
+static DEFINE_SPINLOCK(irc_buffer_lock);
unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
if (dataoff >= (*pskb)->len)
return NF_ACCEPT;
- LOCK_BH(&irc_buffer_lock);
+ spin_lock_bh(&irc_buffer_lock);
ib_ptr = skb_header_pointer(*pskb, dataoff,
(*pskb)->len - dataoff, irc_buffer);
BUG_ON(ib_ptr == NULL);
@@ -198,7 +197,7 @@ static int help(struct sk_buff **pskb,
continue;
}
- exp = ip_conntrack_expect_alloc();
+ exp = ip_conntrack_expect_alloc(ct);
if (exp == NULL) {
ret = NF_DROP;
goto out;
@@ -222,22 +221,21 @@ static int help(struct sk_buff **pskb,
{ { 0, { 0 } },
{ 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
exp->expectfn = NULL;
- exp->master = ct;
+ exp->flags = 0;
if (ip_nat_irc_hook)
ret = ip_nat_irc_hook(pskb, ctinfo,
addr_beg_p - ib_ptr,
addr_end_p - addr_beg_p,
exp);
- else if (ip_conntrack_expect_related(exp) != 0) {
- ip_conntrack_expect_free(exp);
+ else if (ip_conntrack_expect_related(exp) != 0)
ret = NF_DROP;
- }
+ ip_conntrack_expect_put(exp);
goto out;
} /* for .. NUM_DCCPROTO */
} /* while data < ... */
out:
- UNLOCK_BH(&irc_buffer_lock);
+ spin_unlock_bh(&irc_buffer_lock);
return ret;
}
@@ -260,6 +258,10 @@ static int __init init(void)
printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
return -EBUSY;
}
+
+ irc_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!irc_buffer)
+ return -ENOMEM;
/* If no port given, default to standard irc port */
if (ports_c == 0)
@@ -307,6 +309,7 @@ static void fini(void)
ports[i]);
ip_conntrack_helper_unregister(&irc_helpers[i]);
}
+ kfree(irc_buffer);
}
module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
new file mode 100644
index 0000000..2b5cf9c
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
@@ -0,0 +1,131 @@
+/*
+ * NetBIOS name service broadcast connection tracking helper
+ *
+ * (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/*
+ * This helper tracks locally originating NetBIOS name service
+ * requests by issuing permanent expectations (valid until
+ * timing out) matching all reply connections from the
+ * destination network. The only NetBIOS specific thing is
+ * actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int timeout = 3;
+module_param(timeout, int, 0600);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+static int help(struct sk_buff **pskb,
+ struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+ struct ip_conntrack_expect *exp;
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct udphdr _uh, *uh;
+ struct rtable *rt = (struct rtable *)(*pskb)->dst;
+ struct in_device *in_dev;
+ u_int32_t mask = 0;
+
+ /* we're only interested in locally generated packets */
+ if ((*pskb)->sk == NULL)
+ goto out;
+ if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+ goto out;
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get(rt->u.dst.dev);
+ if (in_dev != NULL) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_broadcast == iph->daddr) {
+ mask = ifa->ifa_mask;
+ break;
+ }
+ } endfor_ifa(in_dev);
+ }
+ rcu_read_unlock();
+
+ if (mask == 0)
+ goto out;
+
+ uh = skb_header_pointer(*pskb, iph->ihl * 4, sizeof(_uh), &_uh);
+ BUG_ON(uh == NULL);
+
+ exp = ip_conntrack_expect_alloc(ct);
+ if (exp == NULL)
+ goto out;
+ memset(&exp->tuple, 0, sizeof(exp->tuple));
+ exp->tuple.src.ip = iph->daddr & mask;
+ exp->tuple.dst.ip = iph->saddr;
+ exp->tuple.dst.u.udp.port = uh->source;
+ exp->tuple.dst.protonum = IPPROTO_UDP;
+
+ memset(&exp->mask, 0, sizeof(exp->mask));
+ exp->mask.src.ip = mask;
+ exp->mask.dst.ip = 0xFFFFFFFF;
+ exp->mask.dst.u.udp.port = 0xFFFF;
+ exp->mask.dst.protonum = 0xFF;
+
+ exp->expectfn = NULL;
+ exp->flags = IP_CT_EXPECT_PERMANENT;
+
+ ip_conntrack_expect_related(exp);
+ ip_conntrack_expect_put(exp);
+
+ ip_ct_refresh_acct(ct, ctinfo, NULL, timeout * HZ);
+out:
+ return NF_ACCEPT;
+}
+
+static struct ip_conntrack_helper helper = {
+ .name = "netbios-ns",
+ .tuple = {
+ .src.u.udp.port = __constant_htons(137),
+ .dst.protonum = IPPROTO_UDP,
+ },
+ .mask = {
+ .src.u.udp.port = 0xFFFF,
+ .dst.protonum = 0xFF,
+ },
+ .max_expected = 1,
+ .me = THIS_MODULE,
+ .help = help,
+};
+
+static int __init init(void)
+{
+ helper.timeout = timeout;
+ return ip_conntrack_helper_register(&helper);
+}
+
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&helper);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 0000000..15aef35
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1584 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.90";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ struct ip_conntrack_protocol *proto;
+
+ NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+
+ proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+ if (proto && proto->tuple_to_nfattr)
+ return proto->tuple_to_nfattr(skb, tuple);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_tuples(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *nest_parms;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+ NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
+ NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+ ctnetlink_dump_tuples_proto(skb, tuple);
+ NFA_NEST_END(skb, nest_parms);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t status = htonl((u_int32_t) ct->status);
+ NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ long timeout_l = ct->timeout.expires - jiffies;
+ u_int32_t timeout;
+
+ if (timeout_l < 0)
+ timeout = 0;
+ else
+ timeout = htonl(timeout_l / HZ);
+
+ NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+
+ struct nfattr *nest_proto;
+ int ret;
+
+ if (!proto || !proto->to_nfattr)
+ return 0;
+
+ nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+
+ ret = proto->to_nfattr(skb, nest_proto, ct);
+
+ ip_conntrack_proto_put(proto);
+
+ NFA_NEST_END(skb, nest_proto);
+
+ return ret;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ struct nfattr *nest_helper;
+
+ if (!ct->helper)
+ return 0;
+
+ nest_helper = NFA_NEST(skb, CTA_HELP);
+ NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name);
+
+ if (ct->helper->to_nfattr)
+ ct->helper->to_nfattr(skb, ct);
+
+ NFA_NEST_END(skb, nest_helper);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
+ enum ip_conntrack_dir dir)
+{
+ enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+ struct nfattr *nest_count = NFA_NEST(skb, type);
+ u_int64_t tmp;
+
+ tmp = cpu_to_be64(ct->counters[dir].packets);
+ NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp);
+
+ tmp = cpu_to_be64(ct->counters[dir].bytes);
+ NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp);
+
+ NFA_NEST_END(skb, nest_count);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t mark = htonl(ct->mark);
+
+ NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ u_int32_t id = htonl(ct->id);
+ NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+ unsigned int use = htonl(atomic_read(&ct->ct_general.use));
+
+ NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event, int nowait,
+ const struct ip_conntrack *ct)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nfattr *nest_parms;
+ unsigned char *b;
+
+ b = skb->tail;
+
+ event |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ if (ctnetlink_dump_status(skb, ct) < 0 ||
+ ctnetlink_dump_timeout(skb, ct) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+ ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_id(skb, ct) < 0 ||
+ ctnetlink_dump_use(skb, ct) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+ unsigned long events, void *ptr)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct nfattr *nest_parms;
+ struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
+ struct sk_buff *skb;
+ unsigned int type;
+ unsigned char *b;
+ unsigned int flags = 0, group;
+
+ /* ignore our fake conntrack entry */
+ if (ct == &ip_conntrack_untracked)
+ return NOTIFY_DONE;
+
+ if (events & IPCT_DESTROY) {
+ type = IPCTNL_MSG_CT_DELETE;
+ group = NFNLGRP_CONNTRACK_DESTROY;
+ goto alloc_skb;
+ }
+ if (events & (IPCT_NEW | IPCT_RELATED)) {
+ type = IPCTNL_MSG_CT_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ /* dump everything */
+ events = ~0UL;
+ group = NFNLGRP_CONNTRACK_NEW;
+ goto alloc_skb;
+ }
+ if (events & (IPCT_STATUS |
+ IPCT_PROTOINFO |
+ IPCT_HELPER |
+ IPCT_HELPINFO |
+ IPCT_NATINFO)) {
+ type = IPCTNL_MSG_CT_NEW;
+ group = NFNLGRP_CONNTRACK_UPDATE;
+ goto alloc_skb;
+ }
+
+ return NOTIFY_DONE;
+
+alloc_skb:
+ /* FIXME: Check if there are any listeners before, don't hurt performance */
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ return NOTIFY_DONE;
+
+ b = skb->tail;
+
+ type |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = flags;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+ if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+ goto nfattr_failure;
+ NFA_NEST_END(skb, nest_parms);
+
+ /* NAT stuff is now a status flag */
+ if ((events & IPCT_STATUS || events & IPCT_NATINFO)
+ && ctnetlink_dump_status(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_REFRESH
+ && ctnetlink_dump_timeout(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_PROTOINFO
+ && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ goto nfattr_failure;
+ if (events & IPCT_HELPINFO
+ && ctnetlink_dump_helpinfo(skb, ct) < 0)
+ goto nfattr_failure;
+
+ if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+ ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ nfnetlink_send(skb, 0, group, 0);
+ return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+ kfree_skb(skb);
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+ DEBUGP("entered %s\n", __FUNCTION__);
+ return 0;
+}
+
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack *ct = NULL;
+ struct ip_conntrack_tuple_hash *h;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+ DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__,
+ cb->args[0], *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+ list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = tuplehash_to_ctrack(h);
+ if (ct->id <= *id)
+ continue;
+ if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW,
+ 1, ct) < 0)
+ goto out;
+ *id = ct->id;
+ }
+ }
+out:
+ read_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+ return skb->len;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static int
+ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack *ct = NULL;
+ struct ip_conntrack_tuple_hash *h;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+ DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__,
+ cb->args[0], *id);
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+ list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+ ct = tuplehash_to_ctrack(h);
+ if (ct->id <= *id)
+ continue;
+ if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW,
+ 1, ct) < 0)
+ goto out;
+ *id = ct->id;
+
+ memset(&ct->counters, 0, sizeof(ct->counters));
+ }
+ }
+out:
+ write_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+ return skb->len;
+}
+#endif
+
+static const int cta_min_ip[CTA_IP_MAX] = {
+ [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
+ [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *tb[CTA_IP_MAX];
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+
+ if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+ return -EINVAL;
+
+ if (!tb[CTA_IP_V4_SRC-1])
+ return -EINVAL;
+ tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
+
+ if (!tb[CTA_IP_V4_DST-1])
+ return -EINVAL;
+ tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+
+ DEBUGP("leaving\n");
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static const int cta_min_proto[CTA_PROTO_MAX] = {
+ [CTA_PROTO_NUM-1] = sizeof(u_int16_t),
+ [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
+ [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
+ [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
+ [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
+ [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_proto(struct nfattr *attr,
+ struct ip_conntrack_tuple *tuple)
+{
+ struct nfattr *tb[CTA_PROTO_MAX];
+ struct ip_conntrack_protocol *proto;
+ int ret = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+ return -EINVAL;
+
+ if (!tb[CTA_PROTO_NUM-1])
+ return -EINVAL;
+ tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+
+ proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+
+ if (likely(proto && proto->nfattr_to_tuple)) {
+ ret = proto->nfattr_to_tuple(tb, tuple);
+ ip_conntrack_proto_put(proto);
+ }
+
+ return ret;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
+ enum ctattr_tuple type)
+{
+ struct nfattr *tb[CTA_TUPLE_MAX];
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_TUPLE_IP-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+ if (err < 0)
+ return err;
+
+ if (!tb[CTA_TUPLE_PROTO-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+ if (err < 0)
+ return err;
+
+ /* orig and expect tuples get DIR_ORIGINAL */
+ if (type == CTA_TUPLE_REPLY)
+ tuple->dst.dir = IP_CT_DIR_REPLY;
+ else
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ DUMP_TUPLE(tuple);
+
+ DEBUGP("leaving\n");
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+ [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
+ [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
+};
+
+static int ctnetlink_parse_nat_proto(struct nfattr *attr,
+ const struct ip_conntrack *ct,
+ struct ip_nat_range *range)
+{
+ struct nfattr *tb[CTA_PROTONAT_MAX];
+ struct ip_nat_protocol *npt;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
+ goto nfattr_failure;
+
+ npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+ if (!npt)
+ return 0;
+
+ if (!npt->nfattr_to_range) {
+ ip_nat_proto_put(npt);
+ return 0;
+ }
+
+ /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
+ if (npt->nfattr_to_range(tb, range) > 0)
+ range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+
+ ip_nat_proto_put(npt);
+
+ DEBUGP("leaving\n");
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_parse_nat(struct nfattr *cda[],
+ const struct ip_conntrack *ct, struct ip_nat_range *range)
+{
+ struct nfattr *tb[CTA_NAT_MAX];
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ memset(range, 0, sizeof(*range));
+
+ if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
+ goto nfattr_failure;
+
+ if (tb[CTA_NAT_MINIP-1])
+ range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
+
+ if (!tb[CTA_NAT_MAXIP-1])
+ range->max_ip = range->min_ip;
+ else
+ range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
+
+ if (range->min_ip)
+ range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+ if (!tb[CTA_NAT_PROTO-1])
+ return 0;
+
+ err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+ if (err < 0)
+ return err;
+
+ DEBUGP("leaving\n");
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+#endif
+
+static inline int
+ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
+{
+ struct nfattr *tb[CTA_HELP_MAX];
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
+ goto nfattr_failure;
+
+ if (!tb[CTA_HELP_NAME-1])
+ return -EINVAL;
+
+ *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack *ct;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (cda[CTA_TUPLE_ORIG-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+ else if (cda[CTA_TUPLE_REPLY-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+ else {
+ /* Flush the whole table */
+ ip_conntrack_flush();
+ return 0;
+ }
+
+ if (err < 0)
+ return err;
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ DEBUGP("tuple not found in conntrack hash\n");
+ return -ENOENT;
+ }
+
+ ct = tuplehash_to_ctrack(h);
+
+ if (cda[CTA_ID-1]) {
+ u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
+ if (ct->id != id) {
+ ip_conntrack_put(ct);
+ return -ENOENT;
+ }
+ }
+ if (del_timer(&ct->timeout)) {
+ ip_conntrack_put(ct);
+ ct->timeout.function((unsigned long)ct);
+ return 0;
+ }
+ ip_conntrack_put(ct);
+ DEBUGP("leaving\n");
+
+ return 0;
+}
+
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack *ct;
+ struct sk_buff *skb2 = NULL;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct nfgenmsg *msg = NLMSG_DATA(nlh);
+ u32 rlen;
+
+ if (msg->nfgen_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+ IPCTNL_MSG_CT_GET_CTRZERO) {
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+ ctnetlink_dump_table_w,
+ ctnetlink_done)) != 0)
+ return -EINVAL;
+#else
+ return -ENOTSUPP;
+#endif
+ } else {
+ if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+ ctnetlink_dump_table,
+ ctnetlink_done)) != 0)
+ return -EINVAL;
+ }
+
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ skb_pull(skb, rlen);
+ return 0;
+ }
+
+ if (cda[CTA_TUPLE_ORIG-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+ else if (cda[CTA_TUPLE_REPLY-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+ else
+ return -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ DEBUGP("tuple not found in conntrack hash");
+ return -ENOENT;
+ }
+ DEBUGP("tuple found\n");
+ ct = tuplehash_to_ctrack(h);
+
+ err = -ENOMEM;
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb2) {
+ ip_conntrack_put(ct);
+ return -ENOMEM;
+ }
+ NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq,
+ IPCTNL_MSG_CT_NEW, 1, ct);
+ ip_conntrack_put(ct);
+ if (err <= 0)
+ goto out;
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (err < 0)
+ goto out;
+
+ DEBUGP("leaving\n");
+ return 0;
+
+out:
+ if (skb2)
+ kfree_skb(skb2);
+ return -1;
+}
+
+static inline int
+ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]);
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EINVAL;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EINVAL;
+
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EINVAL;
+
+ if (cda[CTA_NAT-1]) {
+#ifndef CONFIG_IP_NF_NAT_NEEDED
+ return -EINVAL;
+#else
+ unsigned int hooknum;
+ struct ip_nat_range range;
+
+ if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+ return -EINVAL;
+
+ DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n",
+ NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+ htons(range.min.all), htons(range.max.all));
+
+ /* This is tricky but it works. ip_nat_setup_info needs the
+ * hook number as parameter, so let's do the correct
+ * conversion and run away */
+ if (status & IPS_SRC_NAT_DONE)
+ hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+ else if (status & IPS_DST_NAT_DONE)
+ hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */
+ else
+ return -EINVAL; /* Missing NAT flags */
+
+ DEBUGP("NAT status: %lu\n",
+ status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+
+ if (ip_nat_initialized(ct, hooknum))
+ return -EEXIST;
+ ip_nat_setup_info(ct, &range, hooknum);
+
+ DEBUGP("NAT status after setup_info: %lu\n",
+ ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+#endif
+ }
+
+ /* Be careful here, modifying NAT bits can screw up things,
+ * so don't let users modify them directly if they don't pass
+ * ip_nat_range. */
+ ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+ return 0;
+}
+
+
+static inline int
+ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ struct ip_conntrack_helper *helper;
+ char *helpname;
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ /* don't change helper of sibling connections */
+ if (ct->master)
+ return -EINVAL;
+
+ err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
+ if (err < 0)
+ return err;
+
+ helper = __ip_conntrack_helper_find_byname(helpname);
+ if (!helper) {
+ if (!strcmp(helpname, ""))
+ helper = NULL;
+ else
+ return -EINVAL;
+ }
+
+ if (ct->helper) {
+ if (!helper) {
+ /* we had a helper before ... */
+ ip_ct_remove_expectations(ct);
+ ct->helper = NULL;
+ } else {
+ /* need to zero data of old helper */
+ memset(&ct->help, 0, sizeof(ct->help));
+ }
+ }
+
+ ct->helper = helper;
+
+ return 0;
+}
+
+static inline int
+ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+ if (!del_timer(&ct->timeout))
+ return -ETIME;
+
+ ct->timeout.expires = jiffies + timeout * HZ;
+ add_timer(&ct->timeout);
+
+ return 0;
+}
+
+static int
+ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+ int err;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (cda[CTA_HELP-1]) {
+ err = ctnetlink_change_helper(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TIMEOUT-1]) {
+ err = ctnetlink_change_timeout(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_STATUS-1]) {
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ return err;
+ }
+
+ DEBUGP("all done\n");
+ return 0;
+}
+
+static int
+ctnetlink_create_conntrack(struct nfattr *cda[],
+ struct ip_conntrack_tuple *otuple,
+ struct ip_conntrack_tuple *rtuple)
+{
+ struct ip_conntrack *ct;
+ int err = -EINVAL;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ ct = ip_conntrack_alloc(otuple, rtuple);
+ if (ct == NULL || IS_ERR(ct))
+ return -ENOMEM;
+
+ if (!cda[CTA_TIMEOUT-1])
+ goto err;
+ ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+ ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+ ct->status |= IPS_CONFIRMED;
+
+ err = ctnetlink_change_status(ct, cda);
+ if (err < 0)
+ goto err;
+
+ ct->helper = ip_conntrack_helper_find_get(rtuple);
+
+ add_timer(&ct->timeout);
+ ip_conntrack_hash_insert(ct);
+
+ if (ct->helper)
+ ip_conntrack_helper_put(ct->helper);
+
+ DEBUGP("conntrack with id %u inserted\n", ct->id);
+ return 0;
+
+err:
+ ip_conntrack_free(ct);
+ return err;
+}
+
+static int
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple otuple, rtuple;
+ struct ip_conntrack_tuple_hash *h = NULL;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (cda[CTA_TUPLE_ORIG-1]) {
+ err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
+ if (err < 0)
+ return err;
+ }
+
+ if (cda[CTA_TUPLE_REPLY-1]) {
+ err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
+ if (err < 0)
+ return err;
+ }
+
+ write_lock_bh(&ip_conntrack_lock);
+ if (cda[CTA_TUPLE_ORIG-1])
+ h = __ip_conntrack_find(&otuple, NULL);
+ else if (cda[CTA_TUPLE_REPLY-1])
+ h = __ip_conntrack_find(&rtuple, NULL);
+
+ if (h == NULL) {
+ write_unlock_bh(&ip_conntrack_lock);
+ DEBUGP("no such conntrack, create new\n");
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE)
+ err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+ return err;
+ }
+ /* implicit 'else' */
+
+ /* we only allow nat config for new conntracks */
+ if (cda[CTA_NAT-1]) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* We manipulate the conntrack inside the global conntrack table lock,
+ * so there's no need to increase the refcount */
+ DEBUGP("conntrack found\n");
+ err = -EEXIST;
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
+
+out_unlock:
+ write_unlock_bh(&ip_conntrack_lock);
+ return err;
+}
+
+/***********************************************************************
+ * EXPECT
+ ***********************************************************************/
+
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *tuple,
+ enum ctattr_expect type)
+{
+ struct nfattr *nest_parms = NFA_NEST(skb, type);
+
+ if (ctnetlink_dump_tuples(skb, tuple) < 0)
+ goto nfattr_failure;
+
+ NFA_NEST_END(skb, nest_parms);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static inline int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+ const struct ip_conntrack_expect *exp)
+{
+ struct ip_conntrack *master = exp->master;
+ u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
+ u_int32_t id = htonl(exp->id);
+
+ if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+ goto nfattr_failure;
+ if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
+ goto nfattr_failure;
+ if (ctnetlink_exp_dump_tuple(skb,
+ &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ CTA_EXPECT_MASTER) < 0)
+ goto nfattr_failure;
+
+ NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
+ NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event,
+ int nowait,
+ const struct ip_conntrack_expect *exp)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned char *b;
+
+ b = skb->tail;
+
+ event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static int ctnetlink_expect_event(struct notifier_block *this,
+ unsigned long events, void *ptr)
+{
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
+ struct sk_buff *skb;
+ unsigned int type;
+ unsigned char *b;
+ int flags = 0;
+ u16 proto;
+
+ if (events & IPEXP_NEW) {
+ type = IPCTNL_MSG_EXP_NEW;
+ flags = NLM_F_CREATE|NLM_F_EXCL;
+ } else
+ return NOTIFY_DONE;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ if (!skb)
+ return NOTIFY_DONE;
+
+ b = skb->tail;
+
+ type |= NFNL_SUBSYS_CTNETLINK << 8;
+ nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+
+ nlh->nlmsg_flags = flags;
+ nfmsg->nfgen_family = AF_INET;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+ goto nfattr_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+ proto = exp->tuple.dst.protonum;
+ nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
+ return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+ kfree_skb(skb);
+ return NOTIFY_DONE;
+}
+#endif
+
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ip_conntrack_expect *exp = NULL;
+ struct list_head *i;
+ u_int32_t *id = (u_int32_t *) &cb->args[0];
+
+ DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+ list_for_each_prev(i, &ip_conntrack_expect_list) {
+ exp = (struct ip_conntrack_expect *) i;
+ if (exp->id <= *id)
+ continue;
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ 1, exp) < 0)
+ goto out;
+ *id = exp->id;
+ }
+out:
+ read_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving, last id=%llu\n", *id);
+
+ return skb->len;
+}
+
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_expect *exp;
+ struct sk_buff *skb2;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct nfgenmsg *msg = NLMSG_DATA(nlh);
+ u32 rlen;
+
+ if (msg->nfgen_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+ ctnetlink_exp_dump_table,
+ ctnetlink_done)) != 0)
+ return -EINVAL;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ skb_pull(skb, rlen);
+ return 0;
+ }
+
+ if (cda[CTA_EXPECT_MASTER-1])
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
+ else
+ return -EINVAL;
+
+ if (err < 0)
+ return err;
+
+ exp = ip_conntrack_expect_find_get(&tuple);
+ if (!exp)
+ return -ENOENT;
+
+ err = -ENOMEM;
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb2)
+ goto out;
+ NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+ err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid,
+ nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+ 1, exp);
+ if (err <= 0)
+ goto out;
+
+ ip_conntrack_expect_put(exp);
+
+ err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ if (err < 0)
+ goto free;
+
+ return err;
+
+out:
+ ip_conntrack_expect_put(exp);
+free:
+ if (skb2)
+ kfree_skb(skb2);
+ return err;
+}
+
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_expect *exp, *tmp;
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_helper *h;
+ int err;
+
+ if (cda[CTA_EXPECT_TUPLE-1]) {
+ /* delete a single expect by tuple */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+
+ /* bump usage count to 2 */
+ exp = ip_conntrack_expect_find_get(&tuple);
+ if (!exp)
+ return -ENOENT;
+
+ if (cda[CTA_EXPECT_ID-1]) {
+ u_int32_t id =
+ *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+ if (exp->id != ntohl(id)) {
+ ip_conntrack_expect_put(exp);
+ return -ENOENT;
+ }
+ }
+
+ /* after list removal, usage count == 1 */
+ ip_conntrack_unexpect_related(exp);
+ /* have to put what we 'get' above.
+ * after this line usage count == 0 */
+ ip_conntrack_expect_put(exp);
+ } else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+ char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+
+ /* delete all expectations for this helper */
+ write_lock_bh(&ip_conntrack_lock);
+ h = __ip_conntrack_helper_find_byname(name);
+ if (!h) {
+ write_unlock_bh(&ip_conntrack_lock);
+ return -EINVAL;
+ }
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+ list) {
+ if (exp->master->helper == h
+ && del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
+ }
+ }
+ write_unlock(&ip_conntrack_lock);
+ } else {
+ /* This basically means we have to flush everything*/
+ write_lock_bh(&ip_conntrack_lock);
+ list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+ list) {
+ if (del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
+ }
+ }
+ write_unlock_bh(&ip_conntrack_lock);
+ }
+
+ return 0;
+}
+static int
+ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
+{
+ return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_create_expect(struct nfattr *cda[])
+{
+ struct ip_conntrack_tuple tuple, mask, master_tuple;
+ struct ip_conntrack_tuple_hash *h = NULL;
+ struct ip_conntrack_expect *exp;
+ struct ip_conntrack *ct;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ /* caller guarantees that those three CTA_EXPECT_* exist */
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
+ if (err < 0)
+ return err;
+ err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
+ if (err < 0)
+ return err;
+
+ /* Look for master conntrack of this expectation */
+ h = ip_conntrack_find_get(&master_tuple, NULL);
+ if (!h)
+ return -ENOENT;
+ ct = tuplehash_to_ctrack(h);
+
+ if (!ct->helper) {
+ /* such conntrack hasn't got any helper, abort */
+ err = -EINVAL;
+ goto out;
+ }
+
+ exp = ip_conntrack_expect_alloc(ct);
+ if (!exp) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ exp->expectfn = NULL;
+ exp->flags = 0;
+ exp->master = ct;
+ memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
+ memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
+
+ err = ip_conntrack_expect_related(exp);
+ ip_conntrack_expect_put(exp);
+
+out:
+ ip_conntrack_put(tuplehash_to_ctrack(h));
+ return err;
+}
+
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_expect *exp;
+ int err = 0;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+ if (!cda[CTA_EXPECT_TUPLE-1]
+ || !cda[CTA_EXPECT_MASK-1]
+ || !cda[CTA_EXPECT_MASTER-1])
+ return -EINVAL;
+
+ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+ if (err < 0)
+ return err;
+
+ write_lock_bh(&ip_conntrack_lock);
+ exp = __ip_conntrack_expect_find(&tuple);
+
+ if (!exp) {
+ write_unlock_bh(&ip_conntrack_lock);
+ err = -ENOENT;
+ if (nlh->nlmsg_flags & NLM_F_CREATE)
+ err = ctnetlink_create_expect(cda);
+ return err;
+ }
+
+ err = -EEXIST;
+ if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ err = ctnetlink_change_expect(exp, cda);
+ write_unlock_bh(&ip_conntrack_lock);
+
+ DEBUGP("leaving\n");
+
+ return err;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static struct notifier_block ctnl_notifier = {
+ .notifier_call = ctnetlink_conntrack_event,
+};
+
+static struct notifier_block ctnl_notifier_exp = {
+ .notifier_call = ctnetlink_expect_event,
+};
+#endif
+
+static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+ [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
+ .attr_count = CTA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+ [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
+ .attr_count = CTA_EXPECT_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem ctnl_subsys = {
+ .name = "conntrack",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK,
+ .cb_count = IPCTNL_MSG_MAX,
+ .cb = ctnl_cb,
+};
+
+static struct nfnetlink_subsystem ctnl_exp_subsys = {
+ .name = "conntrack_expect",
+ .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
+ .cb_count = IPCTNL_MSG_EXP_MAX,
+ .cb = ctnl_exp_cb,
+};
+
+static int __init ctnetlink_init(void)
+{
+ int ret;
+
+ printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+ ret = nfnetlink_subsys_register(&ctnl_subsys);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register with nfnetlink.\n");
+ goto err_out;
+ }
+
+ ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+ goto err_unreg_subsys;
+ }
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ ret = ip_conntrack_register_notifier(&ctnl_notifier);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot register notifier.\n");
+ goto err_unreg_exp_subsys;
+ }
+
+ ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
+ if (ret < 0) {
+ printk("ctnetlink_init: cannot expect register notifier.\n");
+ goto err_unreg_notifier;
+ }
+#endif
+
+ return 0;
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+ ip_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+ return ret;
+}
+
+static void __exit ctnetlink_exit(void)
+{
+ printk("ctnetlink: unregistering from nfnetlink.\n");
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ ip_conntrack_unregister_notifier(&ctnl_notifier_exp);
+ ip_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+ nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+ nfnetlink_subsys_unregister(&ctnl_subsys);
+ return;
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db..838d1d6 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
ct->timeout.function((unsigned long)ct);
} else {
atomic_inc(&ct->proto.icmp.count);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
}
return NF_ACCEPT;
}
+static u_int8_t valid_new[] = {
+ [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1
+};
+
/* Called when a new connection for this protocol found. */
static int icmp_new(struct ip_conntrack *conntrack,
const struct sk_buff *skb)
{
- static u_int8_t valid_new[]
- = { [ICMP_ECHO] = 1,
- [ICMP_TIMESTAMP] = 1,
- [ICMP_INFO_REQUEST] = 1,
- [ICMP_ADDRESS] = 1 };
-
if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
|| !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
/* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
return NF_ACCEPT;
}
- innerproto = ip_ct_find_proto(inside->ip.protocol);
+ innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
/* Are they talking about one of our connections? */
if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+ ip_conntrack_proto_put(innerproto);
return NF_ACCEPT;
}
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
been preserved inside the ICMP. */
if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
DEBUGP("icmp_error_track: Can't invert tuple\n");
+ ip_conntrack_proto_put(innerproto);
return NF_ACCEPT;
}
+ ip_conntrack_proto_put(innerproto);
*ctinfo = IP_CT_RELATED;
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
if (icmph == NULL) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: short packet ");
return -NF_ACCEPT;
}
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
if (!(u16)csum_fold(skb->csum))
break;
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad HW ICMP checksum ");
return -NF_ACCEPT;
case CHECKSUM_NONE:
if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad ICMP checksum ");
return -NF_ACCEPT;
}
@@ -249,7 +254,7 @@ checksum_skipped:
*/
if (icmph->type > NR_ICMP_TYPES) {
if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: invalid ICMP type ");
return -NF_ACCEPT;
}
@@ -265,6 +270,47 @@ checksum_skipped:
return icmp_error_message(skb, ctinfo, hooknum);
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+ const struct ip_conntrack_tuple *t)
+{
+ NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+ &t->src.u.icmp.id);
+ NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+ &t->dst.u.icmp.type);
+ NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+ &t->dst.u.icmp.code);
+
+ if (t->dst.u.icmp.type >= sizeof(valid_new)
+ || !valid_new[t->dst.u.icmp.type])
+ return -EINVAL;
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+ struct ip_conntrack_tuple *tuple)
+{
+ if (!tb[CTA_PROTO_ICMP_TYPE-1]
+ || !tb[CTA_PROTO_ICMP_CODE-1]
+ || !tb[CTA_PROTO_ICMP_ID-1])
+ return -1;
+
+ tuple->dst.u.icmp.type =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+ tuple->dst.u.icmp.code =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+ tuple->src.u.icmp.id =
+ *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+
+ return 0;
+}
+#endif
+
struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
{
.proto = IPPROTO_ICMP,
@@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
.packet = icmp_packet,
.new = icmp_new,
.error = icmp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = icmp_tuple_to_nfattr,
+ .nfattr_to_tuple = icmp_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a..a875f35 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
#if 0
#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
#endif
/* Protects conntrack->proto.sctp */
-static DECLARE_RWLOCK(sctp_lock);
+static DEFINE_RWLOCK(sctp_lock);
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
DEBUGP(__FUNCTION__);
DEBUGP("\n");
- READ_LOCK(&sctp_lock);
+ read_lock_bh(&sctp_lock);
state = conntrack->proto.sctp.state;
- READ_UNLOCK(&sctp_lock);
+ read_unlock_bh(&sctp_lock);
return seq_printf(s, "%s ", sctp_conntrack_names[state]);
}
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
for_each_sctp_chunk (skb, sch, _sch, offset, count) {
- WRITE_LOCK(&sctp_lock);
+ write_lock_bh(&sctp_lock);
/* Special cases of Verification tag check (Sec 8.5.1) */
if (sch->type == SCTP_CID_INIT) {
/* Sec 8.5.1 (A) */
if (sh->vtag != 0) {
- WRITE_UNLOCK(&sctp_lock);
+ write_unlock_bh(&sctp_lock);
return -1;
}
} else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
&& !(sh->vtag == conntrack->proto.sctp.vtag
[1 - CTINFO2DIR(ctinfo)])) {
- WRITE_UNLOCK(&sctp_lock);
+ write_unlock_bh(&sctp_lock);
return -1;
}
} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
&& !(sh->vtag == conntrack->proto.sctp.vtag
[1 - CTINFO2DIR(ctinfo)]
&& (sch->flags & 1))) {
- WRITE_UNLOCK(&sctp_lock);
+ write_unlock_bh(&sctp_lock);
return -1;
}
} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
/* Sec 8.5.1 (D) */
if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
- WRITE_UNLOCK(&sctp_lock);
+ write_unlock_bh(&sctp_lock);
return -1;
}
}
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
if (newconntrack == SCTP_CONNTRACK_MAX) {
DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
- WRITE_UNLOCK(&sctp_lock);
+ write_unlock_bh(&sctp_lock);
return -1;
}
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
sizeof(_inithdr), &_inithdr);
if (ih == NULL) {
- WRITE_UNLOCK(&sctp_lock);
+ write_unlock_bh(&sctp_lock);
return -1;
}
DEBUGP("Setting vtag %x for dir %d\n",
@@ -405,7 +404,9 @@ static int sctp_packet(struct ip_conntrack *conntrack,
}
conntrack->proto.sctp.state = newconntrack;
- WRITE_UNLOCK(&sctp_lock);
+ if (oldsctpstate != newconntrack)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+ write_unlock_bh(&sctp_lock);
}
ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
@@ -504,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
.packet = sctp_packet,
.new = sctp_new,
.destroy = NULL,
- .me = THIS_MODULE
+ .me = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf..1985abc 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
#if 0
#define DEBUGP printk
@@ -46,7 +45,7 @@
#endif
/* Protects conntrack->proto.tcp */
-static DECLARE_RWLOCK(tcp_lock);
+static DEFINE_RWLOCK(tcp_lock);
/* "Be conservative in what you do,
be liberal in what you accept from others."
@@ -330,13 +329,31 @@ static int tcp_print_conntrack(struct seq_file *s,
{
enum tcp_conntrack state;
- READ_LOCK(&tcp_lock);
+ read_lock_bh(&tcp_lock);
state = conntrack->proto.tcp.state;
- READ_UNLOCK(&tcp_lock);
+ read_unlock_bh(&tcp_lock);
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+ const struct ip_conntrack *ct)
+{
+ read_lock_bh(&tcp_lock);
+ NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+ &ct->proto.tcp.state);
+ read_unlock_bh(&tcp_lock);
+
+ return 0;
+
+nfattr_failure:
+ read_unlock_bh(&tcp_lock);
+ return -1;
+}
+#endif
+
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
{
if (tcph->rst) return TCP_RST_SET;
@@ -700,7 +717,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
res = 1;
} else {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -738,14 +755,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
- WRITE_LOCK(&tcp_lock);
+ write_lock_bh(&tcp_lock);
/*
* We have to worry for the ack in the reply packet only...
*/
if (after(end, conntrack->proto.tcp.seen[dir].td_end))
conntrack->proto.tcp.seen[dir].td_end = end;
conntrack->proto.tcp.last_end = end;
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -799,7 +816,7 @@ static int tcp_error(struct sk_buff *skb,
sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: short packet ");
return -NF_ACCEPT;
}
@@ -807,7 +824,7 @@ static int tcp_error(struct sk_buff *skb,
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -824,7 +841,7 @@ static int tcp_error(struct sk_buff *skb,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
@@ -833,7 +850,7 @@ static int tcp_error(struct sk_buff *skb,
tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -857,7 +874,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
- WRITE_LOCK(&tcp_lock);
+ write_lock_bh(&tcp_lock);
old_state = conntrack->proto.tcp.state;
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
@@ -879,10 +896,11 @@ static int tcp_packet(struct ip_conntrack *conntrack,
* that the client cannot but retransmit its SYN and
* thus initiate a clean new session.
*/
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- "ip_ct_tcp: killing out of sync session ");
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ NULL, "ip_ct_tcp: "
+ "killing out of sync session ");
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
@@ -894,9 +912,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
conntrack->proto.tcp.last_end =
segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid packet ignored ");
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
@@ -904,9 +922,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
dir, get_conntrack_index(th),
old_state);
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_SYN_SENT:
@@ -918,16 +936,16 @@ static int tcp_packet(struct ip_conntrack *conntrack,
conntrack->proto.tcp.seen[dir].td_end)) {
/* Attempt to reopen a closed connection.
* Delete this connection and look up again. */
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
return -NF_REPEAT;
} else {
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
- "ip_ct_tcp: invalid SYN");
+ NULL, "ip_ct_tcp: invalid SYN");
return -NF_ACCEPT;
}
case TCP_CONNTRACK_CLOSE:
@@ -949,7 +967,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
skb, iph, th)) {
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
return -NF_ACCEPT;
}
in_window:
@@ -972,7 +990,11 @@ static int tcp_packet(struct ip_conntrack *conntrack,
timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
&& *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
- WRITE_UNLOCK(&tcp_lock);
+ write_unlock_bh(&tcp_lock);
+
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ if (new_state != old_state)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
/* If only reply is a RST, we can consider ourselves not to
@@ -1097,4 +1119,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
.packet = tcp_packet,
.new = tcp_new,
.error = tcp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .to_nfattr = tcp_to_nfattr,
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a2..f2dcac7 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
ip_ct_refresh_acct(conntrack, ctinfo, skb,
ip_ct_udp_timeout_stream);
/* Also, more likely to be important, and not a probe */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
} else
ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: short packet ");
return -NF_ACCEPT;
}
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -120,11 +121,12 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
* and moreover root might send raw packets.
* FIXME: Source route IP option packets --RR */
if (hooknum == NF_IP_PRE_ROUTING
+ && skb->ip_summed != CHECKSUM_UNNECESSARY
&& csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, udplen, 0))) {
if (LOG_INVALID(IPPROTO_UDP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
@@ -143,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
.packet = udp_packet,
.new = udp_new,
.error = udp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+ .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 46ca45f..ae3e3e6 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
*/
/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -28,8 +28,8 @@
#include <net/checksum.h>
#include <net/ip.h>
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
{
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
return ct_get_idx(seq, *pos);
}
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void ct_seq_stop(struct seq_file *s, void *v)
{
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
}
static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,15 +140,14 @@ static int ct_seq_show(struct seq_file *s, void *v)
const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
struct ip_conntrack_protocol *proto;
- MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ ASSERT_READ_LOCK(&ip_conntrack_lock);
IP_NF_ASSERT(conntrack);
/* we only want to print DIR_ORIGINAL */
if (DIRECTION(hash))
return 0;
- proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.protonum);
+ proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
IP_NF_ASSERT(proto);
if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
return -ENOSPC;
#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
- if (seq_printf(s, "mark=%lu ", conntrack->mark))
+ if (seq_printf(s, "mark=%u ", conntrack->mark))
return -ENOSPC;
#endif
@@ -239,7 +238,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
/* strange seq_file api calls stop even if we fail,
* thus we need to grab lock since stop unlocks */
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
if (list_empty(e))
return NULL;
@@ -256,6 +255,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
struct list_head *e = v;
+ ++*pos;
e = e->next;
if (e == &ip_conntrack_expect_list)
@@ -266,7 +266,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void exp_seq_stop(struct seq_file *s, void *v)
{
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
}
static int exp_seq_show(struct seq_file *s, void *v)
@@ -282,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
print_tuple(s, &expect->tuple,
- ip_ct_find_proto(expect->tuple.dst.protonum));
+ __ip_conntrack_proto_find(expect->tuple.dst.protonum));
return seq_putc(s, '\n');
}
@@ -431,6 +431,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if ((*pskb)->nfct)
+ return NF_ACCEPT;
+#endif
+
/* Gather fragments. */
if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
*pskb = ip_ct_gather_frags(*pskb,
@@ -881,6 +888,7 @@ static int init_or_cleanup(int init)
return ret;
cleanup:
+ synchronize_net();
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(ip_ct_sysctl_header);
cleanup_localinops:
@@ -920,22 +928,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
{
int ret = 0;
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
ret = -EBUSY;
goto out;
}
ip_ct_protos[proto->proto] = proto;
out:
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
return ret;
}
void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
{
- WRITE_LOCK(&ip_conntrack_lock);
+ write_lock_bh(&ip_conntrack_lock);
ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
- WRITE_UNLOCK(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
/* Somebody could be still looking at the proto in bh. */
synchronize_net();
@@ -963,6 +971,14 @@ void need_ip_conntrack(void)
{
}
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
EXPORT_SYMBOL(ip_conntrack_protocol_register);
EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -974,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register);
EXPORT_SYMBOL(ip_conntrack_helper_unregister);
EXPORT_SYMBOL(ip_ct_iterate_cleanup);
EXPORT_SYMBOL(ip_ct_refresh_acct);
-EXPORT_SYMBOL(ip_ct_protos);
-EXPORT_SYMBOL(ip_ct_find_proto);
+
EXPORT_SYMBOL(ip_conntrack_expect_alloc);
-EXPORT_SYMBOL(ip_conntrack_expect_free);
+EXPORT_SYMBOL(ip_conntrack_expect_put);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
EXPORT_SYMBOL(ip_conntrack_expect_related);
EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
+EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
+EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
+
EXPORT_SYMBOL(ip_conntrack_tuple_taken);
EXPORT_SYMBOL(ip_ct_gather_frags);
EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -987,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
EXPORT_SYMBOL(ip_conntrack_hash);
EXPORT_SYMBOL(ip_conntrack_untracked);
EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_put);
#ifdef CONFIG_IP_NF_NAT_NEEDED
EXPORT_SYMBOL(ip_conntrack_tcp_update);
#endif
+
+EXPORT_SYMBOL_GPL(ip_conntrack_flush);
+EXPORT_SYMBOL_GPL(__ip_conntrack_find);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
+EXPORT_SYMBOL_GPL(ip_conntrack_free);
+EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
+
+EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index 992fac3..d2b5905 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb,
DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- exp = ip_conntrack_expect_alloc();
+ exp = ip_conntrack_expect_alloc(ct);
if (exp == NULL)
return NF_DROP;
@@ -75,17 +75,16 @@ static int tftp_help(struct sk_buff **pskb,
exp->mask.dst.u.udp.port = 0xffff;
exp->mask.dst.protonum = 0xff;
exp->expectfn = NULL;
- exp->master = ct;
+ exp->flags = 0;
DEBUGP("expect: ");
DUMP_TUPLE(&exp->tuple);
DUMP_TUPLE(&exp->mask);
if (ip_nat_tftp_hook)
ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
- else if (ip_conntrack_expect_related(exp) != 0) {
- ip_conntrack_expect_free(exp);
+ else if (ip_conntrack_expect_related(exp) != 0)
ret = NF_DROP;
- }
+ ip_conntrack_expect_put(exp);
break;
case TFTP_OPCODE_DATA:
case TFTP_OPCODE_ACK:
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
index da1f412..706c807 100644
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb,
break;
}
- if (port == 0) {
- ip_conntrack_expect_free(exp);
+ if (port == 0)
return NF_DROP;
- }
sprintf(buffer, "%u", port);
ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93..1adedb7 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
#include <linux/udp.h>
#include <linux/jhash.h>
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,14 +41,45 @@
#define DEBUGP(format, args...)
#endif
-DECLARE_RWLOCK(ip_nat_lock);
+DEFINE_RWLOCK(ip_nat_lock);
/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;
static struct list_head *bysource;
+
+#define MAX_IP_NAT_PROTO 256
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+static inline struct ip_nat_protocol *
+__ip_nat_proto_find(u_int8_t protonum)
+{
+ return ip_nat_protos[protonum];
+}
+
+struct ip_nat_protocol *
+ip_nat_proto_find_get(u_int8_t protonum)
+{
+ struct ip_nat_protocol *p;
+
+ /* we need to disable preemption to make sure 'p' doesn't get
+ * removed until we've grabbed the reference */
+ preempt_disable();
+ p = __ip_nat_proto_find(protonum);
+ if (p) {
+ if (!try_module_get(p->me))
+ p = &ip_nat_unknown_protocol;
+ }
+ preempt_enable();
+
+ return p;
+}
+
+void
+ip_nat_proto_put(struct ip_nat_protocol *p)
+{
+ module_put(p->me);
+}
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
@@ -65,9 +96,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
if (!(conn->status & IPS_NAT_DONE_MASK))
return;
- WRITE_LOCK(&ip_nat_lock);
+ write_lock_bh(&ip_nat_lock);
list_del(&conn->nat.info.bysource);
- WRITE_UNLOCK(&ip_nat_lock);
+ write_unlock_bh(&ip_nat_lock);
}
/* We do checksum mangling, so if they were wrong before they're still
@@ -103,7 +134,8 @@ static int
in_range(const struct ip_conntrack_tuple *tuple,
const struct ip_nat_range *range)
{
- struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+ struct ip_nat_protocol *proto =
+ __ip_nat_proto_find(tuple->dst.protonum);
/* If we are supposed to map IPs, then we must be in the
range specified, otherwise let this drag us onto a new src IP. */
@@ -142,7 +174,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
unsigned int h = hash_by_src(tuple);
struct ip_conntrack *ct;
- READ_LOCK(&ip_nat_lock);
+ read_lock_bh(&ip_nat_lock);
list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
if (same_src(ct, tuple)) {
/* Copy source part from reply tuple. */
@@ -151,12 +183,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
result->dst = tuple->dst;
if (in_range(result, range)) {
- READ_UNLOCK(&ip_nat_lock);
+ read_unlock_bh(&ip_nat_lock);
return 1;
}
}
}
- READ_UNLOCK(&ip_nat_lock);
+ read_unlock_bh(&ip_nat_lock);
return 0;
}
@@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
struct ip_conntrack *conntrack,
enum ip_nat_manip_type maniptype)
{
- struct ip_nat_protocol *proto
- = ip_nat_find_proto(orig_tuple->dst.protonum);
+ struct ip_nat_protocol *proto;
/* 1) If this srcip/proto/src-proto-part is currently mapped,
and that same mapping gives a unique tuple within the given
@@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
/* 3) The per-protocol part of the manip is made to map into
the range to make a unique tuple. */
+ proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
+
/* Only bother mapping if it's not already in range and unique */
if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
|| proto->in_range(tuple, maniptype, &range->min, &range->max))
- && !ip_nat_used_tuple(tuple, conntrack))
+ && !ip_nat_used_tuple(tuple, conntrack)) {
+ ip_nat_proto_put(proto);
return;
+ }
/* Last change: get protocol to try to obtain unique tuple. */
proto->unique_tuple(tuple, range, maniptype, conntrack);
+
+ ip_nat_proto_put(proto);
}
unsigned int
@@ -297,9 +334,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
unsigned int srchash
= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
.tuple);
- WRITE_LOCK(&ip_nat_lock);
+ write_lock_bh(&ip_nat_lock);
list_add(&info->bysource, &bysource[srchash]);
- WRITE_UNLOCK(&ip_nat_lock);
+ write_unlock_bh(&ip_nat_lock);
}
/* It's done. */
@@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto,
enum ip_nat_manip_type maniptype)
{
struct iphdr *iph;
+ struct ip_nat_protocol *p;
- (*pskb)->nfcache |= NFC_ALTERED;
- if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+ if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
return 0;
iph = (void *)(*pskb)->data + iphdroff;
/* Manipulate protcol part. */
- if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
- target, maniptype))
+ p = ip_nat_proto_find_get(proto);
+ if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
+ ip_nat_proto_put(p);
return 0;
+ }
+ ip_nat_proto_put(p);
iph = (void *)(*pskb)->data + iphdroff;
@@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
struct ip_conntrack_tuple inner, target;
int hdrlen = (*pskb)->nh.iph->ihl * 4;
- if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+ if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
return 0;
inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
sizeof(struct icmphdr) + inside->ip.ihl*4,
- &inner, ip_ct_find_proto(inside->ip.protocol)))
+ &inner,
+ __ip_conntrack_proto_find(inside->ip.protocol)))
return 0;
/* Change inner back to look like incoming packet. We do the
@@ -474,28 +515,71 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
{
int ret = 0;
- WRITE_LOCK(&ip_nat_lock);
+ write_lock_bh(&ip_nat_lock);
if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
ret = -EBUSY;
goto out;
}
ip_nat_protos[proto->protonum] = proto;
out:
- WRITE_UNLOCK(&ip_nat_lock);
+ write_unlock_bh(&ip_nat_lock);
return ret;
}
/* Noone stores the protocol anywhere; simply delete it. */
void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
{
- WRITE_LOCK(&ip_nat_lock);
+ write_lock_bh(&ip_nat_lock);
ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
- WRITE_UNLOCK(&ip_nat_lock);
+ write_unlock_bh(&ip_nat_lock);
/* Someone could be still looking at the proto in a bh. */
synchronize_net();
}
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+int
+ip_nat_port_range_to_nfattr(struct sk_buff *skb,
+ const struct ip_nat_range *range)
+{
+ NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
+ &range->min.tcp.port);
+ NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
+ &range->max.tcp.port);
+
+ return 0;
+
+nfattr_failure:
+ return -1;
+}
+
+int
+ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
+{
+ int ret = 0;
+
+ /* we have to return whether we actually parsed something or not */
+
+ if (tb[CTA_PROTONAT_PORT_MIN-1]) {
+ ret = 1;
+ range->min.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
+ }
+
+ if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
+ if (ret)
+ range->max.tcp.port = range->min.tcp.port;
+ } else {
+ ret = 1;
+ range->max.tcp.port =
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
+ }
+
+ return ret;
+}
+#endif
+
int __init ip_nat_init(void)
{
size_t i;
@@ -509,13 +593,13 @@ int __init ip_nat_init(void)
return -ENOMEM;
/* Sew in builtin protocols. */
- WRITE_LOCK(&ip_nat_lock);
+ write_lock_bh(&ip_nat_lock);
for (i = 0; i < MAX_IP_NAT_PROTO; i++)
ip_nat_protos[i] = &ip_nat_unknown_protocol;
ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
- WRITE_UNLOCK(&ip_nat_lock);
+ write_unlock_bh(&ip_nat_lock);
for (i = 0; i < ip_nat_htable_size; i++) {
INIT_LIST_HEAD(&bysource[i]);
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index c6000e7..d83757a 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb,
break;
}
- if (port == 0) {
- ip_conntrack_expect_free(exp);
+ if (port == 0)
return NF_DROP;
- }
if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
seq)) {
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96..d2dd5d3 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
#include <net/tcp.h>
#include <net/udp.h>
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
#define DUMP_OFFSET(x)
#endif
-static DECLARE_LOCK(ip_nat_seqofs_lock);
+static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
/* Setup TCP sequence correction given this change at this sequence */
static inline void
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
DUMP_OFFSET(this_way);
- LOCK_BH(&ip_nat_seqofs_lock);
+ spin_lock_bh(&ip_nat_seqofs_lock);
/* SYN adjust. If it's uninitialized, or this is after last
* correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
this_way->offset_before = this_way->offset_after;
this_way->offset_after += sizediff;
}
- UNLOCK_BH(&ip_nat_seqofs_lock);
+ spin_unlock_bh(&ip_nat_seqofs_lock);
DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
DUMP_OFFSET(this_way);
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
/* Transfer socket to new skb. */
if ((*pskb)->sk)
skb_set_owner_w(nskb, (*pskb)->sk);
-#ifdef CONFIG_NETFILTER_DEBUG
- nskb->nf_debug = (*pskb)->nf_debug;
-#endif
kfree_skb(*pskb);
*pskb = nskb;
return 1;
@@ -171,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
struct tcphdr *tcph;
int datalen;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return 0;
if (rep_len > match_len
@@ -231,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
match_offset + match_len)
return 0;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return 0;
if (rep_len > match_len
@@ -318,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
- if (!skb_ip_make_writable(pskb, optend))
+ if (!skb_make_writable(pskb, optend))
return 0;
dir = CTINFO2DIR(ctinfo);
@@ -366,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
this_way = &ct->nat.info.seq[dir];
other_way = &ct->nat.info.seq[!dir];
- if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
return 0;
tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index 9c1ca33..de31942 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb,
break;
}
- if (port == 0) {
- ip_conntrack_expect_free(exp);
+ if (port == 0)
return NF_DROP;
- }
/* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
* strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index a558cf0..9387190 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -35,16 +35,17 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
const struct ip_conntrack *conntrack)
{
static u_int16_t id;
- unsigned int range_size
- = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
+ unsigned int range_size;
unsigned int i;
+ range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
/* If no range specified... */
if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
range_size = 0xFFFF;
for (i = 0; i < range_size; i++, id++) {
- tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size);
+ tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
+ (id % range_size));
if (!ip_nat_used_tuple(tuple, conntrack))
return 1;
}
@@ -61,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
struct icmphdr *hdr;
unsigned int hdroff = iphdroff + iph->ihl*4;
- if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0;
hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -105,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_icmp
-= { "ICMP", IPPROTO_ICMP,
- icmp_manip_pkt,
- icmp_in_range,
- icmp_unique_tuple,
- icmp_print,
- icmp_print_range
+struct ip_nat_protocol ip_nat_protocol_icmp = {
+ .name = "ICMP",
+ .protonum = IPPROTO_ICMP,
+ .me = THIS_MODULE,
+ .manip_pkt = icmp_manip_pkt,
+ .in_range = icmp_in_range,
+ .unique_tuple = icmp_unique_tuple,
+ .print = icmp_print,
+ .print_range = icmp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a91cfce..1d381bf 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/if.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_rule.h>
#include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -40,7 +41,8 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype,
const struct ip_conntrack *conntrack)
{
- static u_int16_t port, *portptr;
+ static u_int16_t port;
+ u_int16_t *portptr;
unsigned int range_size, min, i;
if (maniptype == IP_NAT_MANIP_SRC)
@@ -101,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
hdrsize = sizeof(struct tcphdr);
- if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+ if (!skb_make_writable(pskb, hdroff + hdrsize))
return 0;
iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -168,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_tcp
-= { "TCP", IPPROTO_TCP,
- tcp_manip_pkt,
- tcp_in_range,
- tcp_unique_tuple,
- tcp_print,
- tcp_print_range
+struct ip_nat_protocol ip_nat_protocol_tcp = {
+ .name = "TCP",
+ .protonum = IPPROTO_TCP,
+ .me = THIS_MODULE,
+ .manip_pkt = tcp_manip_pkt,
+ .in_range = tcp_in_range,
+ .unique_tuple = tcp_unique_tuple,
+ .print = tcp_print,
+ .print_range = tcp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index c669e3b..c4906e1 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype,
const struct ip_conntrack *conntrack)
{
- static u_int16_t port, *portptr;
+ static u_int16_t port;
+ u_int16_t *portptr;
unsigned int range_size, min, i;
if (maniptype == IP_NAT_MANIP_SRC)
@@ -93,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
u32 oldip, newip;
u16 *portptr, newport;
- if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0;
iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -155,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
else return 0;
}
-struct ip_nat_protocol ip_nat_protocol_udp
-= { "UDP", IPPROTO_UDP,
- udp_manip_pkt,
- udp_in_range,
- udp_unique_tuple,
- udp_print,
- udp_print_range
+struct ip_nat_protocol ip_nat_protocol_udp = {
+ .name = "UDP",
+ .protonum = IPPROTO_UDP,
+ .me = THIS_MODULE,
+ .manip_pkt = udp_manip_pkt,
+ .in_range = udp_in_range,
+ .unique_tuple = udp_unique_tuple,
+ .print = udp_print,
+ .print_range = udp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+ .range_to_nfattr = ip_nat_port_range_to_nfattr,
+ .nfattr_to_range = ip_nat_port_nfattr_to_range,
+#endif
};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd..99bbef56 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
}
struct ip_nat_protocol ip_nat_unknown_protocol = {
- "unknown", 0,
- unknown_manip_pkt,
- unknown_in_range,
- unknown_unique_tuple,
- unknown_print,
- unknown_print_range
+ .name = "unknown",
+ .me = THIS_MODULE,
+ .manip_pkt = unknown_manip_pkt,
+ .in_range = unknown_in_range,
+ .unique_tuple = unknown_unique_tuple,
+ .print = unknown_print,
+ .print_range = unknown_print_range
};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097..cb66b8b 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
#include <net/route.h>
#include <linux/bitops.h>
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ip_nat.h>
@@ -255,6 +255,27 @@ alloc_null_binding(struct ip_conntrack *conntrack,
return ip_nat_setup_info(conntrack, &range, hooknum);
}
+unsigned int
+alloc_null_binding_confirmed(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info,
+ unsigned int hooknum)
+{
+ u_int32_t ip
+ = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+ ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
+ : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+ u_int16_t all
+ = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+ ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.all
+ : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.all);
+ struct ip_nat_range range
+ = { IP_NAT_RANGE_MAP_IPS, ip, ip, { all }, { all } };
+
+ DEBUGP("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n",
+ conntrack, NIPQUAD(ip));
+ return ip_nat_setup_info(conntrack, &range, hooknum);
+}
+
int ip_nat_rule_find(struct sk_buff **pskb,
unsigned int hooknum,
const struct net_device *in,
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e..93b2c51 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
return NF_DROP;
}
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
return NF_DROP;
spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f6..0ff368b 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
#include <net/checksum.h>
#include <linux/spinlock.h>
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_nat.h>
#include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
& htons(IP_MF|IP_OFFSET)));
- (*pskb)->nfcache |= NFC_UNKNOWN;
-
/* If we had a hardware checksum before, it's now invalid */
if ((*pskb)->ip_summed == CHECKSUM_HW)
if (skb_checksum_help(*pskb, (out == NULL)))
@@ -102,6 +100,10 @@ ip_nat_fn(unsigned int hooknum,
return NF_ACCEPT;
}
+ /* Don't try to NAT if this packet is not conntracked */
+ if (ct == &ip_conntrack_untracked)
+ return NF_ACCEPT;
+
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED+IP_CT_IS_REPLY:
@@ -121,8 +123,12 @@ ip_nat_fn(unsigned int hooknum,
if (!ip_nat_initialized(ct, maniptype)) {
unsigned int ret;
- /* LOCAL_IN hook doesn't have a chain! */
- if (hooknum == NF_IP_LOCAL_IN)
+ if (unlikely(is_confirmed(ct)))
+ /* NAT module was loaded late */
+ ret = alloc_null_binding_confirmed(ct, info,
+ hooknum);
+ else if (hooknum == NF_IP_LOCAL_IN)
+ /* LOCAL_IN hook doesn't have a chain! */
ret = alloc_null_binding(ct, info, hooknum);
else
ret = ip_nat_rule_find(pskb, hooknum,
@@ -373,7 +379,6 @@ static int init_or_cleanup(int init)
cleanup_rule_init:
ip_nat_rule_cleanup();
cleanup_nothing:
- MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
return ret;
}
@@ -393,6 +398,8 @@ module_exit(fini);
EXPORT_SYMBOL(ip_nat_setup_info);
EXPORT_SYMBOL(ip_nat_protocol_register);
EXPORT_SYMBOL(ip_nat_protocol_unregister);
+EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_nat_proto_put);
EXPORT_SYMBOL(ip_nat_cheat_check);
EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 0343e0d..2215317 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb,
exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = ip_nat_follow_master;
- if (ip_conntrack_expect_related(exp) != 0) {
- ip_conntrack_expect_free(exp);
+ if (ip_conntrack_expect_related(exp) != 0)
return NF_DROP;
- }
return NF_ACCEPT;
}
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index eda1fba..d54f14d 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
-struct ipq_rt_info {
- __u8 tos;
- __u32 daddr;
- __u32 saddr;
-};
-
struct ipq_queue_entry {
struct list_head list;
struct nf_info *info;
struct sk_buff *skb;
- struct ipq_rt_info rt_info;
};
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -214,6 +207,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
break;
case IPQ_COPY_PACKET:
+ if (entry->skb->ip_summed == CHECKSUM_HW &&
+ (*errp = skb_checksum_help(entry->skb,
+ entry->info->outdev == NULL))) {
+ read_unlock_bh(&queue_lock);
+ return NULL;
+ }
if (copy_range == 0 || copy_range > entry->skb->len)
data_len = entry->skb->len;
else
@@ -241,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
pmsg->packet_id = (unsigned long )entry;
pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
- pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
+ pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
+ pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
pmsg->mark = entry->skb->nfmark;
pmsg->hook = entry->info->hook;
pmsg->hw_protocol = entry->skb->protocol;
@@ -281,7 +280,8 @@ nlmsg_failure:
}
static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+ unsigned int queuenum, void *data)
{
int status = -EINVAL;
struct sk_buff *nskb;
@@ -299,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
entry->info = info;
entry->skb = skb;
- if (entry->info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = skb->nh.iph;
-
- entry->rt_info.tos = iph->tos;
- entry->rt_info.daddr = iph->daddr;
- entry->rt_info.saddr = iph->saddr;
- }
-
nskb = ipq_build_packet_message(entry, &status);
if (nskb == NULL)
goto err_out_free;
@@ -382,23 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
}
skb_put(e->skb, diff);
}
- if (!skb_ip_make_writable(&e->skb, v->data_len))
+ if (!skb_make_writable(&e->skb, v->data_len))
return -ENOMEM;
memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->nfcache |= NFC_ALTERED;
-
- /*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- */
- if (e->info->hook == NF_IP_LOCAL_OUT) {
- struct iphdr *iph = e->skb->nh.iph;
-
- if (!(iph->tos == e->rt_info.tos
- && iph->daddr == e->rt_info.daddr
- && iph->saddr == e->rt_info.saddr))
- return ip_route_me_harder(&e->skb);
- }
+ e->skb->ip_summed = CHECKSUM_NONE;
+
return 0;
}
@@ -676,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
}
#endif /* CONFIG_PROC_FS */
+static struct nf_queue_handler nfqh = {
+ .name = "ip_queue",
+ .outfn = &ipq_enqueue_packet,
+};
+
static int
init_or_cleanup(int init)
{
@@ -686,7 +671,8 @@ init_or_cleanup(int init)
goto cleanup;
netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+ ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
+ THIS_MODULE);
if (ipqnl == NULL) {
printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
@@ -703,7 +689,7 @@ init_or_cleanup(int init)
register_netdevice_notifier(&ipq_dev_notifier);
ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
- status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+ status = nf_register_queue_handler(PF_INET, &nfqh);
if (status < 0) {
printk(KERN_ERR "ip_queue: failed to register queue handler\n");
goto cleanup_sysctl;
@@ -711,7 +697,7 @@ init_or_cleanup(int init)
return status;
cleanup:
- nf_unregister_queue_handler(PF_INET);
+ nf_unregister_queue_handlers(&nfqh);
synchronize_net();
ipq_flush(NF_DROP);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92..eef99a1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
/* Must have mutex */
#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <linux/netfilter_ipv4/listhelp.h>
#if 0
@@ -313,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb,
do {
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
- (*pskb)->nfcache |= e->nfcache;
if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
struct ipt_entry_target *t;
@@ -342,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb,
back->comefrom);
continue;
}
- if (table_base + v
- != (void *)e + e->next_offset) {
+ if (table_base + v != (void *)e + e->next_offset
+ && !(e->ip.flags & IPT_F_GOTO)) {
/* Save old back ptr in next entry */
struct ipt_entry *next
= (void *)e + e->next_offset;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e..dab78d8 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
{
const struct ipt_classify_target_info *clinfo = targinfo;
- if((*pskb)->priority != clinfo->priority) {
+ if((*pskb)->priority != clinfo->priority)
(*pskb)->priority = clinfo->priority;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a..7d38913 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,9 +29,8 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
-#define CLUSTERIP_VERSION "0.6"
+#define CLUSTERIP_VERSION "0.7"
#define DEBUG_CLUSTERIP
@@ -41,6 +40,8 @@
#define DEBUGP
#endif
+#define ASSERT_READ_LOCK(x)
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
* data within all structurses (num_local_nodes, local_nodes[]) */
-static DECLARE_RWLOCK(clusterip_lock);
+static DEFINE_RWLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
static inline void
clusterip_config_put(struct clusterip_config *c) {
if (atomic_dec_and_test(&c->refcount)) {
- WRITE_LOCK(&clusterip_lock);
+ write_lock_bh(&clusterip_lock);
list_del(&c->list);
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
dev_put(c->dev);
kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
{
struct list_head *pos;
- MUST_BE_READ_LOCKED(&clusterip_lock);
+ ASSERT_READ_LOCK(&clusterip_lock);
list_for_each(pos, &clusterip_configs) {
struct clusterip_config *c = list_entry(pos,
struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
{
struct clusterip_config *c;
- READ_LOCK(&clusterip_lock);
+ read_lock_bh(&clusterip_lock);
c = __clusterip_config_find(clusterip);
if (!c) {
- READ_UNLOCK(&clusterip_lock);
+ read_unlock_bh(&clusterip_lock);
return NULL;
}
atomic_inc(&c->refcount);
- READ_UNLOCK(&clusterip_lock);
+ read_unlock_bh(&clusterip_lock);
return c;
}
@@ -143,7 +144,7 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
c->num_total_nodes = i->num_total_nodes;
c->num_local_nodes = i->num_local_nodes;
- memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes));
+ memcpy(&c->local_nodes, &i->local_nodes, sizeof(c->local_nodes));
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
atomic_set(&c->refcount, 1);
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
c->pde->data = c;
#endif
- WRITE_LOCK(&clusterip_lock);
+ write_lock_bh(&clusterip_lock);
list_add(&c->list, &clusterip_configs);
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return c;
}
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
{
int i;
- WRITE_LOCK(&clusterip_lock);
+ write_lock_bh(&clusterip_lock);
if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
|| nodenum > CLUSTERIP_MAX_NODES) {
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return 1;
}
/* check if we alrady have this number in our array */
for (i = 0; i < c->num_local_nodes; i++) {
if (c->local_nodes[i] == nodenum) {
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return 1;
}
}
c->local_nodes[c->num_local_nodes++] = nodenum;
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return 0;
}
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
{
int i;
- WRITE_LOCK(&clusterip_lock);
+ write_lock_bh(&clusterip_lock);
if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return 1;
}
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
c->num_local_nodes--;
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return 0;
}
}
- WRITE_UNLOCK(&clusterip_lock);
+ write_unlock_bh(&clusterip_lock);
return 1;
}
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
{
int i;
- READ_LOCK(&clusterip_lock);
+ read_lock_bh(&clusterip_lock);
if (config->num_local_nodes == 0) {
- READ_UNLOCK(&clusterip_lock);
+ read_unlock_bh(&clusterip_lock);
return 0;
}
for (i = 0; i < config->num_local_nodes; i++) {
if (config->local_nodes[i] == hash) {
- READ_UNLOCK(&clusterip_lock);
+ read_unlock_bh(&clusterip_lock);
return 1;
}
}
- READ_UNLOCK(&clusterip_lock);
+ read_unlock_bh(&clusterip_lock);
return 0;
}
@@ -338,7 +339,7 @@ target(struct sk_buff **pskb,
* error messages (RELATED) and information requests (see below) */
if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
&& (ctinfo == IP_CT_RELATED
- || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY))
+ || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
return IPT_CONTINUE;
/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -366,7 +367,7 @@ target(struct sk_buff **pskb,
#ifdef DEBUG_CLUSTERP
DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
#endif
- DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+ DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
if (!clusterip_responsible(cipinfo->config, hash)) {
DEBUGP("not responsible\n");
return NF_DROP;
@@ -523,8 +524,9 @@ arp_mangle(unsigned int hook,
|| arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
return NF_ACCEPT;
- /* we only want to mangle arp replies */
- if (arp->ar_op != htons(ARPOP_REPLY))
+ /* we only want to mangle arp requests and replies */
+ if (arp->ar_op != htons(ARPOP_REPLY)
+ && arp->ar_op != htons(ARPOP_REQUEST))
return NF_ACCEPT;
payload = (void *)(arp+1);
@@ -578,7 +580,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
struct clusterip_config *c = pde->data;
unsigned int *nodeidx;
- READ_LOCK(&clusterip_lock);
+ read_lock_bh(&clusterip_lock);
if (*pos >= c->num_local_nodes)
return NULL;
@@ -608,7 +610,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
{
kfree(v);
- READ_UNLOCK(&clusterip_lock);
+ read_unlock_bh(&clusterip_lock);
}
static int clusterip_seq_show(struct seq_file *s, void *v)
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e..1346380 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
void *userinfo)
{
const struct ipt_connmark_target_info *markinfo = targinfo;
- unsigned long diff;
- unsigned long nfmark;
- unsigned long newmark;
+ u_int32_t diff;
+ u_int32_t nfmark;
+ u_int32_t newmark;
enum ip_conntrack_info ctinfo;
struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
case IPT_CONNMARK_RESTORE:
nfmark = (*pskb)->nfmark;
diff = (ct->mark ^ nfmark) & markinfo->mask;
- if (diff != 0) {
+ if (diff != 0)
(*pskb)->nfmark = nfmark ^ diff;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
break;
}
}
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
}
}
+ if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509..6e31957 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return NF_DROP;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
sizeof(diffs),
(*pskb)->nh.iph->check
^ 0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ada9911..a131969 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
!= (einfo->ip_ect & IPT_ECN_IP_MASK)) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return 0;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
sizeof(diffs),
(*pskb)->nh.iph->check
^0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return 1;
}
@@ -61,16 +60,20 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
if (!tcph)
return 0;
- if (!(einfo->operation & IPT_ECN_OP_SET_ECE
- || tcph->ece == einfo->proto.tcp.ece)
- && (!(einfo->operation & IPT_ECN_OP_SET_CWR
- || tcph->cwr == einfo->proto.tcp.cwr)))
+ if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
+ tcph->ece == einfo->proto.tcp.ece) &&
+ ((!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
+ tcph->cwr == einfo->proto.tcp.cwr)))
return 1;
- if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+ if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
return 0;
tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
+ if ((*pskb)->ip_summed == CHECKSUM_HW &&
+ skb_checksum_help(*pskb, inward))
+ return 0;
+
diffs[0] = ((u_int16_t *)tcph)[6];
if (einfo->operation & IPT_ECN_OP_SET_ECE)
tcph->ece = einfo->proto.tcp.ece;
@@ -79,14 +82,10 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
diffs[1] = ((u_int16_t *)tcph)[6];
diffs[0] = diffs[0] ^ 0xFFFF;
- if ((*pskb)->ip_summed != CHECKSUM_HW)
+ if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
tcph->check = csum_fold(csum_partial((char *)diffs,
sizeof(diffs),
tcph->check^0xFFFF));
- else
- if (skb_checksum_help(*pskb, inward))
- return 0;
- (*pskb)->nfcache |= NFC_ALTERED;
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733..92ed050 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables syslog logging module");
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
#if 0
#define DEBUGP printk
#else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
static DEFINE_SPINLOCK(log_lock);
/* One level of recursion won't kill us */
-static void dump_packet(const struct ipt_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
const struct sk_buff *skb,
unsigned int iphoff)
{
struct iphdr _iph, *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
if (ntohs(ih->frag_off) & IP_OFFSET)
printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
- if ((info->logflags & IPT_LOG_IPOPT)
+ if ((logflags & IPT_LOG_IPOPT)
&& ih->ihl * 4 > sizeof(struct iphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
printk("SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (info->logflags & IPT_LOG_TCPSEQ)
+ if (logflags & IPT_LOG_TCPSEQ)
printk("SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
/* Max length: 11 "URGP=65535 " */
printk("URGP=%u ", ntohs(th->urg_ptr));
- if ((info->logflags & IPT_LOG_TCPOPT)
+ if ((logflags & IPT_LOG_TCPOPT)
&& th->doff * 4 > sizeof(struct tcphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
}
/* Max length: 15 "UID=4294967295 " */
- if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
+struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 0,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
static void
-ipt_log_packet(unsigned int hooknum,
+ipt_log_packet(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct ipt_log_info *loginfo,
- const char *level_string,
+ const struct nf_loginfo *loginfo,
const char *prefix)
{
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
spin_lock_bh(&log_lock);
- printk(level_string);
- printk("%sIN=%s OUT=%s ",
- prefix == NULL ? loginfo->prefix : prefix,
+ printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
in ? in->name : "",
out ? out->name : "");
#ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
void *userinfo)
{
const struct ipt_log_info *loginfo = targinfo;
- char level_string[4] = "< >";
+ struct nf_loginfo li;
- level_string[1] = '0' + (loginfo->level % 8);
- ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
- return IPT_CONTINUE;
-}
+ nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
-static void
-ipt_logfn(unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *prefix)
-{
- struct ipt_log_info loginfo = {
- .level = 0,
- .logflags = IPT_LOG_MASK,
- .prefix = ""
- };
-
- ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+ return IPT_CONTINUE;
}
static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ipt_log_logger ={
+ .name = "ipt_LOG",
+ .logfn = &ipt_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
if (ipt_register_target(&ipt_log_reg))
return -EINVAL;
- if (nflog)
- nf_log_register(PF_INET, &ipt_logfn);
+ if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
+ printk(KERN_WARNING "ipt_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET\n");
+ /* we cannot make module load fail here, since otherwise
+ * iptables userspace would abort */
+ }
return 0;
}
static void __exit fini(void)
{
- if (nflog)
- nf_log_unregister(PF_INET, &ipt_logfn);
+ nf_log_unregister_logger(&ipt_log_logger);
ipt_unregister_target(&ipt_log_reg);
}
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b..52b4f2c 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
{
const struct ipt_mark_target_info *markinfo = targinfo;
- if((*pskb)->nfmark != markinfo->mark) {
+ if((*pskb)->nfmark != markinfo->mark)
(*pskb)->nfmark = markinfo->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IPT_CONTINUE;
}
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
break;
}
- if((*pskb)->nfmark != mark) {
+ if((*pskb)->nfmark != mark)
(*pskb)->nfmark = mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IPT_CONTINUE;
}
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
unsigned int targinfosize,
unsigned int hook_mask)
{
+ struct ipt_mark_target_info *markinfo = targinfo;
+
if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
return 0;
}
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return 0;
+ }
+
return 1;
}
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
return 0;
}
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6c..2f3e181 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
#endif
/* Lock protects masq region inside conntrack */
-static DECLARE_RWLOCK(masq_lock);
+static DEFINE_RWLOCK(masq_lock);
/* FIXME: Multiple targets. --RR */
static int
@@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb,
IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
- /* FIXME: For the moment, don't do local packets, breaks
- testsuite for 2.3.49 --RR */
- if ((*pskb)->sk)
- return NF_ACCEPT;
-
ct = ip_conntrack_get(*pskb, &ctinfo);
IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
|| ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
@@ -103,9 +98,9 @@ masquerade_target(struct sk_buff **pskb,
return NF_DROP;
}
- WRITE_LOCK(&masq_lock);
+ write_lock_bh(&masq_lock);
ct->nat.masq_index = out->ifindex;
- WRITE_UNLOCK(&masq_lock);
+ write_unlock_bh(&masq_lock);
/* Transfer from original range. */
newrange = ((struct ip_nat_range)
@@ -122,9 +117,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
{
int ret;
- READ_LOCK(&masq_lock);
+ read_lock_bh(&masq_lock);
ret = (i->nat.masq_index == (int)(long)ifindex);
- READ_UNLOCK(&masq_lock);
+ read_unlock_bh(&masq_lock);
return ret;
}
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b2..e6e7b60 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
return 0;
}
- if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
+ (1 << NF_IP_LOCAL_OUT))) {
DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
return 0;
}
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
struct ip_nat_range newrange;
IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
- || hooknum == NF_IP_POST_ROUTING);
+ || hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
ct = ip_conntrack_get(*pskb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
- if (hooknum == NF_IP_PRE_ROUTING)
+ if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
new_ip = (*pskb)->nh.iph->daddr & ~netmask;
else
new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 0000000..3cedc9b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_NFQ_info *tinfo = targinfo;
+
+ return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
+ printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_NFQ_reg = {
+ .name = "NFQUEUE",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d649..f115a84 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
static void send_reset(struct sk_buff *oldskb, int hook)
{
struct sk_buff *nskb;
+ struct iphdr *iph = oldskb->nh.iph;
struct tcphdr _otcph, *oth, *tcph;
struct rtable *rt;
u_int16_t tmp_port;
u_int32_t tmp_addr;
+ unsigned int tcplen;
int needs_ack;
int hh_len;
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
if (oth->rst)
return;
- /* FIXME: Check checksum --RR */
+ /* Check checksum */
+ tcplen = oldskb->len - iph->ihl * 4;
+ if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
+ (hook == NF_IP_LOCAL_IN &&
+ oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
+ csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
+ oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
+ skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
+ return;
+
if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
return;
@@ -145,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
/* This packet will not be the same as the other: clear nf fields */
nf_reset(nskb);
- nskb->nfcache = 0;
nskb->nfmark = 0;
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 1049050..8db70d6 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,11 @@ ipt_tcpmss_target(struct sk_buff **pskb,
unsigned int i;
u_int8_t *opt;
- if (!skb_ip_make_writable(pskb, (*pskb)->len))
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ if ((*pskb)->ip_summed == CHECKSUM_HW &&
+ skb_checksum_help(*pskb, out == NULL))
return NF_DROP;
iph = (*pskb)->nh.iph;
@@ -186,10 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
newmss);
retmodified:
- /* We never hw checksum SYN packets. */
- BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
-
- (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d2..deadb36 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
u_int16_t diffs[2];
- if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+ if (!skb_make_writable(pskb, sizeof(struct iphdr)))
return NF_DROP;
diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
sizeof(diffs),
(*pskb)->nh.iph->check
^0xFFFF));
- (*pskb)->nfcache |= NFC_ALTERED;
}
return IPT_CONTINUE;
}
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 0000000..b9ae6a9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
+/* TTL modification target for IP tables
+ * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TTL.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in,
+ const struct net_device *out, unsigned int hooknum,
+ const void *targinfo, void *userinfo)
+{
+ struct iphdr *iph;
+ const struct ipt_TTL_info *info = targinfo;
+ u_int16_t diffs[2];
+ int new_ttl;
+
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ iph = (*pskb)->nh.iph;
+
+ switch (info->mode) {
+ case IPT_TTL_SET:
+ new_ttl = info->ttl;
+ break;
+ case IPT_TTL_INC:
+ new_ttl = iph->ttl + info->ttl;
+ if (new_ttl > 255)
+ new_ttl = 255;
+ break;
+ case IPT_TTL_DEC:
+ new_ttl = iph->ttl - info->ttl;
+ if (new_ttl < 0)
+ new_ttl = 0;
+ break;
+ default:
+ new_ttl = iph->ttl;
+ break;
+ }
+
+ if (new_ttl != iph->ttl) {
+ diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
+ iph->ttl = new_ttl;
+ diffs[1] = htons(((unsigned)iph->ttl) << 8);
+ iph->check = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ iph->check^0xFFFF));
+ }
+
+ return IPT_CONTINUE;
+}
+
+static int ipt_ttl_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ipt_TTL_info *info = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
+ printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_TTL_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle")) {
+ printk(KERN_WARNING "ipt_TTL: can only be called from "
+ "\"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (info->mode > IPT_TTL_MAXMODE) {
+ printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
+ info->mode);
+ return 0;
+ }
+
+ if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_target ipt_TTL = {
+ .name = "TTL",
+ .target = ipt_ttl_target,
+ .checkentry = ipt_ttl_checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ipt_register_target(&ipt_TTL);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_TTL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefb..e2c14f3 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,13 +56,13 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <net/sock.h>
#include <linux/bitops.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("iptables userspace logging module");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
#define ULOG_NL_EVENT 111 /* Harald's favorite number */
#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
@@ -99,8 +99,8 @@ typedef struct {
static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */
-static struct sock *nflognl; /* our socket */
-static DECLARE_LOCK(ulog_lock); /* spinlock */
+static struct sock *nflognl; /* our socket */
+static DEFINE_SPINLOCK(ulog_lock); /* spinlock */
/* send one ulog_buff_t to userspace */
static void ulog_send(unsigned int nlgroupnum)
@@ -116,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
if (ub->qlen > 1)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
- NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
- DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
- ub->qlen, nlgroupnum);
- netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+ NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
+ DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
+ ub->qlen, nlgroupnum + 1);
+ netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
ub->qlen = 0;
ub->skb = NULL;
@@ -135,9 +135,9 @@ static void ulog_timer(unsigned long data)
/* lock to protect against somebody modifying our structure
* from ipt_ulog_target at the same time */
- LOCK_BH(&ulog_lock);
+ spin_lock_bh(&ulog_lock);
ulog_send(data);
- UNLOCK_BH(&ulog_lock);
+ spin_unlock_bh(&ulog_lock);
}
static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +193,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
ub = &ulog_buffers[groupnum];
- LOCK_BH(&ulog_lock);
+ spin_lock_bh(&ulog_lock);
if (!ub->skb) {
if (!(ub->skb = ulog_alloc_skb(size)))
@@ -220,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
pm = NLMSG_DATA(nlh);
/* We might not have a timestamp, get one */
- if (skb->stamp.tv_sec == 0)
- do_gettimeofday((struct timeval *)&skb->stamp);
+ if (skb->tstamp.off_sec == 0)
+ __net_timestamp((struct sk_buff *)skb);
/* copy hook, prefix, timestamp, payload, etc. */
pm->data_len = copy_len;
- pm->timestamp_sec = skb->stamp.tv_sec;
- pm->timestamp_usec = skb->stamp.tv_usec;
+ pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
+ pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
pm->mark = skb->nfmark;
pm->hook = hooknum;
if (prefix != NULL)
@@ -278,7 +278,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
ulog_send(groupnum);
}
- UNLOCK_BH(&ulog_lock);
+ spin_unlock_bh(&ulog_lock);
return;
@@ -288,7 +288,7 @@ nlmsg_failure:
alloc_failure:
PRINTR("ipt_ULOG: Error building netlink message\n");
- UNLOCK_BH(&ulog_lock);
+ spin_unlock_bh(&ulog_lock);
}
static unsigned int ipt_ulog_target(struct sk_buff **pskb,
@@ -304,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
return IPT_CONTINUE;
}
-static void ipt_logfn(unsigned int hooknum,
+static void ipt_logfn(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
+ const struct nf_loginfo *li,
const char *prefix)
{
- struct ipt_ulog_info loginfo = {
- .nl_group = ULOG_DEFAULT_NLGROUP,
- .copy_range = 0,
- .qthreshold = ULOG_DEFAULT_QTHRESHOLD,
- .prefix = ""
- };
+ struct ipt_ulog_info loginfo;
+
+ if (!li || li->type != NF_LOG_TYPE_ULOG) {
+ loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
+ loginfo.copy_range = 0;
+ loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
+ loginfo.prefix[0] = '\0';
+ } else {
+ loginfo.nl_group = li->u.ulog.group;
+ loginfo.copy_range = li->u.ulog.copy_len;
+ loginfo.qthreshold = li->u.ulog.qthreshold;
+ strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+ }
ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
}
@@ -355,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ipt_ulog_logger = {
+ .name = "ipt_ULOG",
+ .logfn = &ipt_logfn,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
int i;
@@ -373,7 +388,8 @@ static int __init init(void)
ulog_buffers[i].timer.data = i;
}
- nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+ nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
+ THIS_MODULE);
if (!nflognl)
return -ENOMEM;
@@ -382,7 +398,7 @@ static int __init init(void)
return -EINVAL;
}
if (nflog)
- nf_log_register(PF_INET, &ipt_logfn);
+ nf_log_register(PF_INET, &ipt_ulog_logger);
return 0;
}
@@ -395,7 +411,7 @@ static void __exit fini(void)
DEBUGP("ipt_ULOG: cleanup_module\n");
if (nflog)
- nf_log_unregister(PF_INET, &ipt_logfn);
+ nf_log_unregister_logger(&ipt_ulog_logger);
ipt_unregister_target(&ipt_ulog_reg);
sock_release(nflognl->sk_socket);
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 0000000..df4a42c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ *
+ * 2004-07-20 Harald Welte <laforge@netfilter.org>
+ * - reimplemented to use per-connection accounting counters
+ * - add functionality to match number of packets
+ * - add functionality to match average packet size
+ * - add support to match directions seperately
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connbytes.h>
+
+#include <asm/div64.h>
+#include <asm/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+ u_int32_t d = divisor;
+
+ if (divisor > 0xffffffffULL) {
+ unsigned int shift = fls(divisor >> 32);
+
+ d = divisor >> shift;
+ dividend >>= shift;
+ }
+
+ do_div(dividend, d);
+ return dividend;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_connbytes_info *sinfo = matchinfo;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack *ct;
+ u_int64_t what = 0; /* initialize to make gcc happy */
+
+ if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+ return 0; /* no match */
+
+ switch (sinfo->what) {
+ case IPT_CONNBYTES_PKTS:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = ct->counters[IP_CT_DIR_REPLY].packets;
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ what += ct->counters[IP_CT_DIR_REPLY].packets;
+ break;
+ }
+ break;
+ case IPT_CONNBYTES_BYTES:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = ct->counters[IP_CT_DIR_REPLY].bytes;
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ what += ct->counters[IP_CT_DIR_REPLY].bytes;
+ break;
+ }
+ break;
+ case IPT_CONNBYTES_AVGPKT:
+ switch (sinfo->direction) {
+ case IPT_CONNBYTES_DIR_ORIGINAL:
+ what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
+ ct->counters[IP_CT_DIR_ORIGINAL].packets);
+ break;
+ case IPT_CONNBYTES_DIR_REPLY:
+ what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
+ ct->counters[IP_CT_DIR_REPLY].packets);
+ break;
+ case IPT_CONNBYTES_DIR_BOTH:
+ {
+ u_int64_t bytes;
+ u_int64_t pkts;
+ bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
+ ct->counters[IP_CT_DIR_REPLY].bytes;
+ pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
+ ct->counters[IP_CT_DIR_REPLY].packets;
+
+ /* FIXME_THEORETICAL: what to do if sum
+ * overflows ? */
+
+ what = div64_64(bytes, pkts);
+ }
+ break;
+ }
+ break;
+ }
+
+ if (sinfo->count.to)
+ return (what <= sinfo->count.to && what >= sinfo->count.from);
+ else
+ return (what >= sinfo->count.from);
+}
+
+static int check(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_connbytes_info *sinfo = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
+ return 0;
+
+ if (sinfo->what != IPT_CONNBYTES_PKTS &&
+ sinfo->what != IPT_CONNBYTES_BYTES &&
+ sinfo->what != IPT_CONNBYTES_AVGPKT)
+ return 0;
+
+ if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
+ sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
+ sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
+ return 0;
+
+ return 1;
+}
+
+static struct ipt_match state_match = {
+ .name = "connbytes",
+ .match = &match,
+ .checkentry = &check,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96..bf8de47 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ struct ipt_connmark_info *cm =
+ (struct ipt_connmark_info *)matchinfo;
if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
return 0;
+ if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+ printk(KERN_WARNING "connmark: only support 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 0000000..ad3278b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_dccp.h>
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+ || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+static inline int
+dccp_find_option(u_int8_t option,
+ const struct sk_buff *skb,
+ const struct dccp_hdr *dh,
+ int *hotdrop)
+{
+ /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+ unsigned char *op;
+ unsigned int optoff = __dccp_hdr_len(dh);
+ unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+ unsigned int i;
+
+ if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ if (!optlen)
+ return 0;
+
+ spin_lock_bh(&dccp_buflock);
+ op = skb_header_pointer(skb,
+ skb->nh.iph->ihl*4 + optoff,
+ optlen, dccp_optbuf);
+ if (op == NULL) {
+ /* If we don't have the whole header, drop packet. */
+ spin_unlock_bh(&dccp_buflock);
+ *hotdrop = 1;
+ return 0;
+ }
+
+ for (i = 0; i < optlen; ) {
+ if (op[i] == option) {
+ spin_unlock_bh(&dccp_buflock);
+ return 1;
+ }
+
+ if (op[i] < 2)
+ i++;
+ else
+ i += op[i+1]?:1;
+ }
+
+ spin_unlock_bh(&dccp_buflock);
+ return 0;
+}
+
+
+static inline int
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+ return (typemask & (1 << dh->dccph_type));
+}
+
+static inline int
+match_option(u_int8_t option, const struct sk_buff *skb,
+ const struct dccp_hdr *dh, int *hotdrop)
+{
+ return dccp_find_option(option, skb, dh, hotdrop);
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_dccp_info *info =
+ (const struct ipt_dccp_info *)matchinfo;
+ struct dccp_hdr _dh, *dh;
+
+ if (offset)
+ return 0;
+
+ dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
+ if (dh == NULL) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0])
+ && (ntohs(dh->dccph_sport) <= info->spts[1])),
+ IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
+ && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0])
+ && (ntohs(dh->dccph_dport) <= info->dpts[1])),
+ IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
+ && DCCHECK(match_types(dh, info->typemask),
+ IPT_DCCP_TYPE, info->flags, info->invflags)
+ && DCCHECK(match_option(info->option, skb, dh, hotdrop),
+ IPT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_dccp_info *info;
+
+ info = (const struct ipt_dccp_info *)matchinfo;
+
+ return ip->proto == IPPROTO_DCCP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
+ && !(info->flags & ~IPT_DCCP_VALID_FLAGS)
+ && !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
+ && !(info->invflags & ~info->flags);
+}
+
+static struct ipt_match dccp_match =
+{
+ .name = "dccp",
+ .match = &match,
+ .checkentry = &checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ /* doff is 8 bits, so the maximum option size is (4*256). Don't put
+ * this in BSS since DaveM is worried about locked TLB's for kernel
+ * BSS. */
+ dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+ if (!dccp_optbuf)
+ return -ENOMEM;
+ ret = ipt_register_match(&dccp_match);
+ if (ret)
+ kfree(dccp_optbuf);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&dccp_match);
+ kfree(dccp_optbuf);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Match for DCCP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190..2dd1ccc 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_hashlimit.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
/* FIXME: this is just for IP_NF_ASSERRT */
#include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,10 +91,10 @@ struct ipt_hashlimit_htable {
struct hlist_head hash[0]; /* hashtable itself */
};
-static DECLARE_LOCK(hashlimit_lock); /* protects htables list */
+static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */
static HLIST_HEAD(hashlimit_htables);
-static kmem_cache_t *hashlimit_cachep;
+static kmem_cache_t *hashlimit_cachep __read_mostly;
static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
{
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
hinfo->timer.function = htable_gc;
add_timer(&hinfo->timer);
- LOCK_BH(&hashlimit_lock);
+ spin_lock_bh(&hashlimit_lock);
hlist_add_head(&hinfo->node, &hashlimit_htables);
- UNLOCK_BH(&hashlimit_lock);
+ spin_unlock_bh(&hashlimit_lock);
return 0;
}
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
struct ipt_hashlimit_htable *hinfo;
struct hlist_node *pos;
- LOCK_BH(&hashlimit_lock);
+ spin_lock_bh(&hashlimit_lock);
hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
if (!strcmp(name, hinfo->pde->name)) {
atomic_inc(&hinfo->use);
- UNLOCK_BH(&hashlimit_lock);
+ spin_unlock_bh(&hashlimit_lock);
return hinfo;
}
}
- UNLOCK_BH(&hashlimit_lock);
+ spin_unlock_bh(&hashlimit_lock);
return NULL;
}
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
static void htable_put(struct ipt_hashlimit_htable *hinfo)
{
if (atomic_dec_and_test(&hinfo->use)) {
- LOCK_BH(&hashlimit_lock);
+ spin_lock_bh(&hashlimit_lock);
hlist_del(&hinfo->node);
- UNLOCK_BH(&hashlimit_lock);
+ spin_unlock_bh(&hashlimit_lock);
htable_destroy(hinfo);
}
}
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf36..3e7dd01 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
return ret;
}
- READ_LOCK(&ip_conntrack_lock);
+ read_lock_bh(&ip_conntrack_lock);
if (!ct->master->helper) {
DEBUGP("ipt_helper: master ct %p has no helper\n",
exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
ret ^= !strncmp(ct->master->helper->name, info->name,
strlen(ct->master->helper->name));
out_unlock:
- READ_UNLOCK(&ip_conntrack_lock);
+ read_unlock_bh(&ip_conntrack_lock);
return ret;
}
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728..00bef6c 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
+
if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
return 0;
+ if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "mark: only supports 32bit mark\n");
+ return 0;
+ }
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e..c1889f8 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
MODULE_DESCRIPTION("iptables owner match");
static int
-match_comm(const struct sk_buff *skb, const char *comm)
-{
- struct task_struct *g, *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- if(strncmp(p->comm, comm, sizeof(p->comm)))
- continue;
-
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) ==
- skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
- struct task_struct *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- if (!p)
- goto out;
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) ==
- skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
-out:
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
- struct task_struct *g, *p;
- struct file *file = skb->sk->sk_socket->file;
- int i, found=0;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- struct files_struct *files;
- if (p->signal->session != sid)
- continue;
-
- task_lock(p);
- files = p->files;
- if (files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == file) {
- found = 1;
- break;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- if (found)
- goto out;
- } while_each_thread(g, p);
-out:
- read_unlock(&tasklist_lock);
-
- return found;
-}
-
-static int
match(const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
@@ -145,24 +45,6 @@ match(const struct sk_buff *skb,
return 0;
}
- if(info->match & IPT_OWNER_PID) {
- if (!match_pid(skb, info->pid) ^
- !!(info->invert & IPT_OWNER_PID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_SID) {
- if (!match_sid(skb, info->sid) ^
- !!(info->invert & IPT_OWNER_SID))
- return 0;
- }
-
- if(info->match & IPT_OWNER_COMM) {
- if (!match_comm(skb, info->comm) ^
- !!(info->invert & IPT_OWNER_COMM))
- return 0;
- }
-
return 1;
}
@@ -173,6 +55,8 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ const struct ipt_owner_info *info = matchinfo;
+
if (hook_mask
& ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +68,13 @@ checkentry(const char *tablename,
IPT_ALIGN(sizeof(struct ipt_owner_info)));
return 0;
}
-#ifdef CONFIG_SMP
- /* files->file_lock can not be used in a BH */
- if (((struct ipt_owner_info *)matchinfo)->match
- & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
- printk("ipt_owner: pid, sid and command matching is broken "
- "on SMP.\n");
+
+ if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+ printk("ipt_owner: pid, sid and command matching "
+ "not supported anymore\n");
return 0;
}
-#endif
+
return 1;
}
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 25ab9fa..2d44b07 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -223,7 +223,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned
curr_table->table[count].last_seen = 0;
curr_table->table[count].addr = 0;
curr_table->table[count].ttl = 0;
- memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+ memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
curr_table->table[count].oldest_pkt = 0;
curr_table->table[count].time_pos = 0;
curr_table->time_info[count].position = count;
@@ -502,7 +502,7 @@ match(const struct sk_buff *skb,
location = time_info[curr_table->time_pos].position;
hash_table[r_list[location].hash_entry] = -1;
hash_table[hash_result] = location;
- memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+ memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
r_list[location].time_pos = curr_table->time_pos;
r_list[location].addr = addr;
r_list[location].ttl = ttl;
@@ -631,7 +631,7 @@ match(const struct sk_buff *skb,
r_list[location].last_seen = 0;
r_list[location].addr = 0;
r_list[location].ttl = 0;
- memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+ memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
r_list[location].oldest_pkt = 0;
ans = !info->invert;
}
@@ -734,10 +734,10 @@ checkentry(const char *tablename,
memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
#ifdef DEBUG
if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
- sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+ sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
#endif
- hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+ hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
#ifdef DEBUG
if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
#endif
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 0000000..b5def20
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
+/* String matching match for iptables
+ *
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("IP tables string match module");
+MODULE_LICENSE("GPL");
+
+static int match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ struct ts_state state;
+ struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
+
+ memset(&state, 0, sizeof(struct ts_state));
+
+ return (skb_find_text((struct sk_buff *)skb, conf->from_offset,
+ conf->to_offset, conf->config, &state)
+ != UINT_MAX) && !conf->invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
+
+static int checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_string_info *conf = matchinfo;
+ struct ts_config *ts_conf;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
+ return 0;
+
+ /* Damn, can't handle this case properly with iptables... */
+ if (conf->from_offset > conf->to_offset)
+ return 0;
+
+ ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+ GFP_KERNEL, TS_AUTOLOAD);
+ if (IS_ERR(ts_conf))
+ return 0;
+
+ conf->config = ts_conf;
+
+ return 1;
+}
+
+static void destroy(void *matchinfo, unsigned int matchsize)
+{
+ textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+}
+
+static struct ipt_match string_match = {
+ .name = "string",
+ .match = match,
+ .checkentry = checkentry,
+ .destroy = destroy,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ipt_register_match(&string_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&string_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc..f7943ba 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
*/
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
- /* From net/socket.c */
- extern void socket_seq_show(struct seq_file *seq);
-
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
- tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+ tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a..291831e 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
#include <linux/timer.h>
#include <net/ip.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5b1ec58..304bb0a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
#include <linux/netdevice.h>
#include <linux/in_route.h>
#include <linux/route.h>
-#include <linux/tcp.h>
#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -71,6 +70,7 @@
#include <net/udp.h>
#include <net/raw.h>
#include <net/snmp.h>
+#include <net/tcp_states.h>
#include <net/inet_common.h>
#include <net/checksum.h>
#include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
struct sock *sk;
struct hlist_head *head;
+ int delivered = 0;
read_lock(&raw_v4_lock);
head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
skb->dev->ifindex);
while (sk) {
+ delivered = 1;
if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
}
out:
read_unlock(&raw_v4_lock);
+ return delivered;
}
void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
@@ -259,7 +262,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static int raw_send_hdrinc(struct sock *sk, void *from, int length,
+static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
struct rtable *rt,
unsigned int flags)
{
@@ -298,7 +301,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, int length,
goto error_fault;
/* We don't modify invalid header */
- if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
+ if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
if (!iph->saddr)
iph->saddr = rt->rt_src;
iph->check = 0;
@@ -332,7 +335,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
u8 __user *type = NULL;
u8 __user *code = NULL;
int probed = 0;
- int i;
+ unsigned int i;
if (!msg->msg_iov)
return;
@@ -384,7 +387,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int err;
err = -EMSGSIZE;
- if (len < 0 || len > 0xFFFF)
+ if (len > 0xFFFF)
goto out;
/*
@@ -514,7 +517,10 @@ done:
kfree(ipc.opt);
ip_rt_put(rt);
-out: return err < 0 ? err : len;
+out:
+ if (err < 0)
+ return err;
+ return len;
do_confirm:
dst_confirm(&rt->u.dst);
@@ -610,7 +616,10 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
copied = skb->len;
done:
skb_free_datagram(sk, skb);
-out: return err ? err : copied;
+out:
+ if (err)
+ return err;
+ return copied;
}
static int raw_init(struct sock *sk)
@@ -691,11 +700,11 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
struct sk_buff *skb;
int amount = 0;
- spin_lock_irq(&sk->sk_receive_queue.lock);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb != NULL)
amount = skb->len;
- spin_unlock_irq(&sk->sk_receive_queue.lock);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a682d28..8c0b14e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
* Marc Boucher : routing by fwmark
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
+ * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -201,15 +203,46 @@ __u8 ip_tos2prio[16] = {
struct rt_hash_bucket {
struct rtable *chain;
- spinlock_t lock;
-} __attribute__((__aligned__(8)));
+};
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+/*
+ * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
+ * The size of this table is a power of two and depends on the number of CPUS.
+ */
+#if NR_CPUS >= 32
+#define RT_HASH_LOCK_SZ 4096
+#elif NR_CPUS >= 16
+#define RT_HASH_LOCK_SZ 2048
+#elif NR_CPUS >= 8
+#define RT_HASH_LOCK_SZ 1024
+#elif NR_CPUS >= 4
+#define RT_HASH_LOCK_SZ 512
+#else
+#define RT_HASH_LOCK_SZ 256
+#endif
+
+static spinlock_t *rt_hash_locks;
+# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
+# define rt_hash_lock_init() { \
+ int i; \
+ rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
+ if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
+ for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
+ spin_lock_init(&rt_hash_locks[i]); \
+ }
+#else
+# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_init()
+#endif
static struct rt_hash_bucket *rt_hash_table;
static unsigned rt_hash_mask;
static int rt_hash_log;
static unsigned int rt_hash_rnd;
-struct rt_cache_stat *rt_cache_stat;
+static struct rt_cache_stat *rt_cache_stat;
+#define RT_CACHE_STAT_INC(field) \
+ (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
static int rt_intern_hash(unsigned hash, struct rtable *rth,
struct rtable **res);
@@ -575,19 +608,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
- static int rover;
- int i = rover, t;
+ static unsigned int rover;
+ unsigned int i = rover, goal;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
-
- for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
- t -= ip_rt_gc_timeout) {
+ u64 mult;
+
+ mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+ if (ip_rt_gc_timeout > 1)
+ do_div(mult, ip_rt_gc_timeout);
+ goal = (unsigned int)mult;
+ if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+ for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
i = (i + 1) & rt_hash_mask;
rthp = &rt_hash_table[i].chain;
- spin_lock(&rt_hash_table[i].lock);
+ if (*rthp == 0)
+ continue;
+ spin_lock(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */
@@ -620,14 +660,14 @@ static void rt_check_expire(unsigned long dummy)
rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
- spin_unlock(&rt_hash_table[i].lock);
+ spin_unlock(rt_hash_lock_addr(i));
/* Fallback loop breaker. */
if (time_after(jiffies, now))
break;
}
rover = i;
- mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+ mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +683,11 @@ static void rt_run_flush(unsigned long dummy)
get_random_bytes(&rt_hash_rnd, 4);
for (i = rt_hash_mask; i >= 0; i--) {
- spin_lock_bh(&rt_hash_table[i].lock);
+ spin_lock_bh(rt_hash_lock_addr(i));
rth = rt_hash_table[i].chain;
if (rth)
rt_hash_table[i].chain = NULL;
- spin_unlock_bh(&rt_hash_table[i].lock);
+ spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth; rth = next) {
next = rth->u.rt_next;
@@ -780,7 +820,7 @@ static int rt_garbage_collect(void)
k = (k + 1) & rt_hash_mask;
rthp = &rt_hash_table[k].chain;
- spin_lock_bh(&rt_hash_table[k].lock);
+ spin_lock_bh(rt_hash_lock_addr(k));
while ((rth = *rthp) != NULL) {
if (!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
@@ -812,7 +852,7 @@ static int rt_garbage_collect(void)
goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
}
- spin_unlock_bh(&rt_hash_table[k].lock);
+ spin_unlock_bh(rt_hash_lock_addr(k));
if (goal <= 0)
break;
}
@@ -882,7 +922,7 @@ restart:
rthp = &rt_hash_table[hash].chain;
- spin_lock_bh(&rt_hash_table[hash].lock);
+ spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +948,7 @@ restart:
rth->u.dst.__use++;
dst_hold(&rth->u.dst);
rth->u.dst.lastuse = now;
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
rt_drop(rt);
*rp = rth;
@@ -949,7 +989,7 @@ restart:
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
int err = arp_bind_neighbour(&rt->u.dst);
if (err) {
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
if (err != -ENOBUFS) {
rt_drop(rt);
@@ -990,7 +1030,7 @@ restart:
}
#endif
rt_hash_table[hash].chain = rt;
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt;
return 0;
}
@@ -1058,7 +1098,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
{
struct rtable **rthp;
- spin_lock_bh(&rt_hash_table[hash].lock);
+ spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
for (rthp = &rt_hash_table[hash].chain; *rthp;
rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1107,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
rt_free(rt);
break;
}
- spin_unlock_bh(&rt_hash_table[hash].lock);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1687,7 @@ static void ip_handle_martian_source(struct net_device *dev,
printk(KERN_WARNING "martian source %u.%u.%u.%u from "
"%u.%u.%u.%u, on dev %s\n",
NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
- if (dev->hard_header_len) {
+ if (dev->hard_header_len && skb->mac.raw) {
int i;
unsigned char *p = skb->mac.raw;
printk(KERN_WARNING "ll header: ");
@@ -1767,7 +1807,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
struct in_device *in_dev,
u32 daddr, u32 saddr, u32 tos)
{
- struct rtable* rth;
+ struct rtable* rth = NULL;
int err;
unsigned hash;
@@ -1794,7 +1834,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
u32 daddr, u32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
- struct rtable* rth;
+ struct rtable* rth = NULL;
unsigned char hop, hopcount, lasthop;
int err = -EINVAL;
unsigned int hash;
@@ -1909,7 +1949,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
*/
if ((err = fib_lookup(&fl, &res)) != 0) {
if (!IN_DEV_FORWARD(in_dev))
- goto e_inval;
+ goto e_hostunreach;
goto no_route;
}
free_res = 1;
@@ -1933,7 +1973,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
}
if (!IN_DEV_FORWARD(in_dev))
- goto e_inval;
+ goto e_hostunreach;
if (res.type != RTN_UNICAST)
goto martian_destination;
@@ -2025,6 +2065,11 @@ martian_destination:
"%u.%u.%u.%u, dev %s\n",
NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif
+
+e_hostunreach:
+ err = -EHOSTUNREACH;
+ goto done;
+
e_inval:
err = -EINVAL;
goto done;
@@ -2239,7 +2284,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
struct net_device *dev_out,
unsigned flags)
{
- struct rtable *rth;
+ struct rtable *rth = NULL;
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
unsigned hash;
if (err == 0) {
@@ -2267,7 +2312,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
unsigned char hop;
unsigned hash;
int err = -EINVAL;
- struct rtable *rth;
+ struct rtable *rth = NULL;
if (res->fi && res->fi->fib_nhs > 1) {
unsigned char hopcount = res->fi->fib_nhs;
@@ -2557,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
return ip_route_output_slow(rp, flp);
}
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
int err;
@@ -2575,13 +2622,15 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
return 0;
}
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
return ip_route_output_flow(rp, flp, NULL, 0);
}
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
- int nowait)
+ int nowait, unsigned int flags)
{
struct rtable *rt = (struct rtable*)skb->dst;
struct rtmsg *r;
@@ -2591,9 +2640,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
#ifdef CONFIG_IP_MROUTE
struct rtattr *eptr;
#endif
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
r = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
@@ -2744,7 +2792,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
- RTM_NEWROUTE, 0);
+ RTM_NEWROUTE, 0, 0);
if (!err)
goto out_free;
if (err < 0) {
@@ -2781,8 +2829,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
skb->dst = dst_clone(&rt->u.dst);
if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE, 1) <= 0) {
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ 1, NLM_F_MULTI) <= 0) {
dst_release(xchg(&skb->dst, NULL));
rcu_read_unlock_bh();
goto done;
@@ -3069,12 +3117,14 @@ __setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
- int i, order, goal, rc = 0;
+ int rc = 0;
rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
(jiffies ^ (jiffies >> 7)));
#ifdef CONFIG_NET_CLS_ROUTE
+ {
+ int order;
for (order = 0;
(PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
/* NOTHING */;
@@ -3082,6 +3132,7 @@ int __init ip_rt_init(void)
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
memset(ip_rt_acct, 0, PAGE_SIZE << order);
+ }
#endif
ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3092,36 +3143,19 @@ int __init ip_rt_init(void)
if (!ipv4_dst_ops.kmem_cachep)
panic("IP: failed to allocate ip_dst_cache\n");
- goal = num_physpages >> (26 - PAGE_SHIFT);
- if (rhash_entries)
- goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
- for (order = 0; (1UL << order) < goal; order++)
- /* NOTHING */;
-
- do {
- rt_hash_mask = (1UL << order) * PAGE_SIZE /
- sizeof(struct rt_hash_bucket);
- while (rt_hash_mask & (rt_hash_mask - 1))
- rt_hash_mask--;
- rt_hash_table = (struct rt_hash_bucket *)
- __get_free_pages(GFP_ATOMIC, order);
- } while (rt_hash_table == NULL && --order > 0);
-
- if (!rt_hash_table)
- panic("Failed to allocate IP route cache hash table\n");
-
- printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
- rt_hash_mask,
- (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
-
- for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
- /* NOTHING */;
-
- rt_hash_mask--;
- for (i = 0; i <= rt_hash_mask; i++) {
- spin_lock_init(&rt_hash_table[i].lock);
- rt_hash_table[i].chain = NULL;
- }
+ rt_hash_table = (struct rt_hash_bucket *)
+ alloc_large_system_hash("IP route cache",
+ sizeof(struct rt_hash_bucket),
+ rhash_entries,
+ (num_physpages >= 128 * 1024) ?
+ (27 - PAGE_SHIFT) :
+ (29 - PAGE_SHIFT),
+ HASH_HIGHMEM,
+ &rt_hash_log,
+ &rt_hash_mask,
+ 0);
+ memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
+ rt_hash_lock_init();
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e923d2f..a34e60e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,10 +169,8 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
}
-extern struct or_calltable or_ipv4;
-
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
- struct open_request *req,
+ struct request_sock *req,
struct dst_entry *dst)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -180,9 +178,9 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
if (child)
- tcp_acceptq_queue(sk, req, child);
+ inet_csk_reqsk_queue_add(sk, req, child);
else
- tcp_openreq_free(req);
+ reqsk_free(req);
return child;
}
@@ -190,10 +188,12 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
__u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
struct sock *ret = sk;
- struct open_request *req;
+ struct request_sock *req;
int mss;
struct rtable *rt;
__u8 rcv_wscale;
@@ -209,19 +209,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
- req = tcp_openreq_alloc();
ret = NULL;
+ req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
if (!req)
goto out;
- req->rcv_isn = htonl(skb->h.th->seq) - 1;
- req->snt_isn = cookie;
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+ treq->rcv_isn = htonl(skb->h.th->seq) - 1;
+ treq->snt_isn = cookie;
req->mss = mss;
- req->rmt_port = skb->h.th->source;
- req->af.v4_req.loc_addr = skb->nh.iph->daddr;
- req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
- req->class = &or_ipv4; /* for savety */
- req->af.v4_req.opt = NULL;
+ ireq->rmt_port = skb->h.th->source;
+ ireq->loc_addr = skb->nh.iph->daddr;
+ ireq->rmt_addr = skb->nh.iph->saddr;
+ ireq->opt = NULL;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -229,17 +230,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
if (opt && opt->optlen) {
int opt_size = sizeof(struct ip_options) + opt->optlen;
- req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
- if (req->af.v4_req.opt) {
- if (ip_options_echo(req->af.v4_req.opt, skb)) {
- kfree(req->af.v4_req.opt);
- req->af.v4_req.opt = NULL;
- }
+ ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
+ if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
+ kfree(ireq->opt);
+ ireq->opt = NULL;
}
}
- req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
- req->wscale_ok = req->sack_ok = 0;
+ ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
+ ireq->wscale_ok = ireq->sack_ok = 0;
req->expires = 0UL;
req->retrans = 0;
@@ -253,15 +252,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = ((opt && opt->srr) ?
opt->faddr :
- req->af.v4_req.rmt_addr),
- .saddr = req->af.v4_req.loc_addr,
+ ireq->rmt_addr),
+ .saddr = ireq->loc_addr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = IPPROTO_TCP,
.uli_u = { .ports =
{ .sport = skb->h.th->dest,
.dport = skb->h.th->source } } };
if (ip_route_output_key(&rt, &fl)) {
- tcp_openreq_free(req);
+ reqsk_free(req);
goto out;
}
}
@@ -272,7 +271,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
&req->rcv_wnd, &req->window_clamp,
0, &rcv_wscale);
/* BTW win scale with syncookies is 0 by definition */
- req->rcv_wscale = rcv_wscale;
+ ireq->rcv_wscale = rcv_wscale;
ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
out: return ret;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3aafb29..6526856 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/config.h>
+#include <linux/igmp.h>
#include <net/snmp.h>
+#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
@@ -19,35 +21,6 @@
/* From af_inet.c */
extern int sysctl_ip_nonlocal_bind;
-/* From icmp.c */
-extern int sysctl_icmp_echo_ignore_all;
-extern int sysctl_icmp_echo_ignore_broadcasts;
-extern int sysctl_icmp_ignore_bogus_error_responses;
-
-/* From ip_fragment.c */
-extern int sysctl_ipfrag_low_thresh;
-extern int sysctl_ipfrag_high_thresh;
-extern int sysctl_ipfrag_time;
-extern int sysctl_ipfrag_secret_interval;
-
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
-/* From icmp.c */
-extern int sysctl_icmp_ratelimit;
-extern int sysctl_icmp_ratemask;
-
-/* From igmp.c */
-extern int sysctl_igmp_max_memberships;
-extern int sysctl_igmp_max_msf;
-
-/* From inetpeer.c */
-extern int inet_peer_threshold;
-extern int inet_peer_minttl;
-extern int inet_peer_maxttl;
-extern int inet_peer_gc_mintime;
-extern int inet_peer_gc_maxtime;
-
#ifdef CONFIG_SYSCTL
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
@@ -56,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
struct ipv4_config ipv4_config;
-extern ctl_table ipv4_route_table[];
-
#ifdef CONFIG_SYSCTL
static
@@ -117,6 +88,46 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
return 1;
}
+static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char val[TCP_CA_NAME_MAX];
+ ctl_table tbl = {
+ .data = val,
+ .maxlen = TCP_CA_NAME_MAX,
+ };
+ int ret;
+
+ tcp_get_default_congestion_control(val);
+
+ ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+ if (write && ret == 0)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+}
+
+static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
+ int nlen, void __user *oldval,
+ size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+ void **context)
+{
+ char val[TCP_CA_NAME_MAX];
+ ctl_table tbl = {
+ .data = val,
+ .maxlen = TCP_CA_NAME_MAX,
+ };
+ int ret;
+
+ tcp_get_default_congestion_control(val);
+ ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
+ context);
+ if (ret == 0 && newval && newlen)
+ ret = tcp_set_default_congestion_control(val);
+ return ret;
+}
+
+
ctl_table ipv4_table[] = {
{
.ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -219,7 +230,7 @@ ctl_table ipv4_table[] = {
{
.ctl_name = NET_TCP_MAX_TW_BUCKETS,
.procname = "tcp_max_tw_buckets",
- .data = &sysctl_tcp_max_tw_buckets,
+ .data = &tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
@@ -323,7 +334,7 @@ ctl_table ipv4_table[] = {
{
.ctl_name = NET_TCP_TW_RECYCLE,
.procname = "tcp_tw_recycle",
- .data = &sysctl_tcp_tw_recycle,
+ .data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec
@@ -396,6 +407,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec
},
{
+ .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
+ .procname = "icmp_errors_use_inbound_ifaddr",
+ .data = &sysctl_icmp_errors_use_inbound_ifaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
.ctl_name = NET_IPV4_ROUTE,
.procname = "route",
.maxlen = 0,
@@ -603,70 +622,6 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec,
},
{
- .ctl_name = NET_TCP_WESTWOOD,
- .procname = "tcp_westwood",
- .data = &sysctl_tcp_westwood,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS,
- .procname = "tcp_vegas_cong_avoid",
- .data = &sysctl_tcp_vegas_cong_avoid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS_ALPHA,
- .procname = "tcp_vegas_alpha",
- .data = &sysctl_tcp_vegas_alpha,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS_BETA,
- .procname = "tcp_vegas_beta",
- .data = &sysctl_tcp_vegas_beta,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_VEGAS_GAMMA,
- .procname = "tcp_vegas_gamma",
- .data = &sysctl_tcp_vegas_gamma,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_BIC,
- .procname = "tcp_bic",
- .data = &sysctl_tcp_bic,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
- .procname = "tcp_bic_fast_convergence",
- .data = &sysctl_tcp_bic_fast_convergence,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = NET_TCP_BIC_LOW_WINDOW,
- .procname = "tcp_bic_low_window",
- .data = &sysctl_tcp_bic_low_window,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
.ctl_name = NET_TCP_MODERATE_RCVBUF,
.procname = "tcp_moderate_rcvbuf",
.data = &sysctl_tcp_moderate_rcvbuf,
@@ -683,13 +638,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec,
},
{
- .ctl_name = NET_TCP_BIC_BETA,
- .procname = "tcp_bic_beta",
- .data = &sysctl_tcp_bic_beta,
- .maxlen = sizeof(int),
+ .ctl_name = NET_TCP_CONG_CONTROL,
+ .procname = "tcp_congestion_control",
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .maxlen = TCP_CA_NAME_MAX,
+ .proc_handler = &proc_tcp_congestion_control,
+ .strategy = &sysctl_tcp_congestion_control,
},
+
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a037baf..f3f0013 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,14 +269,12 @@
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
-
-kmem_cache_t *tcp_openreq_cachep;
-kmem_cache_t *tcp_bucket_cachep;
-kmem_cache_t *tcp_timewait_cachep;
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
atomic_t tcp_orphan_count = ATOMIC_INIT(0);
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -312,15 +310,6 @@ void tcp_enter_memory_pressure(void)
EXPORT_SYMBOL(tcp_enter_memory_pressure);
/*
- * LISTEN is a special case for poll..
- */
-static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
- poll_table *wait)
-{
- return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
-}
-
-/*
* Wait for a TCP event.
*
* Note that we don't need to lock the socket, as the upper poll layers
@@ -335,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
poll_wait(file, sk->sk_sleep, wait);
if (sk->sk_state == TCP_LISTEN)
- return tcp_listen_poll(sk, wait);
+ return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
by poll logic and correct handling of state changes
@@ -458,127 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
-
-int tcp_listen_start(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_listen_opt *lopt;
-
- sk->sk_max_ack_backlog = 0;
- sk->sk_ack_backlog = 0;
- tp->accept_queue = tp->accept_queue_tail = NULL;
- rwlock_init(&tp->syn_wait_lock);
- tcp_delack_init(tp);
-
- lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
- if (!lopt)
- return -ENOMEM;
-
- memset(lopt, 0, sizeof(struct tcp_listen_opt));
- for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
- if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
- break;
- get_random_bytes(&lopt->hash_rnd, 4);
-
- write_lock_bh(&tp->syn_wait_lock);
- tp->listen_opt = lopt;
- write_unlock_bh(&tp->syn_wait_lock);
-
- /* There is race window here: we announce ourselves listening,
- * but this transition is still not validated by get_port().
- * It is OK, because this socket enters to hash table only
- * after validation is complete.
- */
- sk->sk_state = TCP_LISTEN;
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- inet->sport = htons(inet->num);
-
- sk_dst_reset(sk);
- sk->sk_prot->hash(sk);
-
- return 0;
- }
-
- sk->sk_state = TCP_CLOSE;
- write_lock_bh(&tp->syn_wait_lock);
- tp->listen_opt = NULL;
- write_unlock_bh(&tp->syn_wait_lock);
- kfree(lopt);
- return -EADDRINUSE;
-}
-
-/*
- * This routine closes sockets which have been at least partially
- * opened, but not yet accepted.
- */
-
-static void tcp_listen_stop (struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_listen_opt *lopt = tp->listen_opt;
- struct open_request *acc_req = tp->accept_queue;
- struct open_request *req;
- int i;
-
- tcp_delete_keepalive_timer(sk);
-
- /* make all the listen_opt local to us */
- write_lock_bh(&tp->syn_wait_lock);
- tp->listen_opt = NULL;
- write_unlock_bh(&tp->syn_wait_lock);
- tp->accept_queue = tp->accept_queue_tail = NULL;
-
- if (lopt->qlen) {
- for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
- while ((req = lopt->syn_table[i]) != NULL) {
- lopt->syn_table[i] = req->dl_next;
- lopt->qlen--;
- tcp_openreq_free(req);
-
- /* Following specs, it would be better either to send FIN
- * (and enter FIN-WAIT-1, it is normal close)
- * or to send active reset (abort).
- * Certainly, it is pretty dangerous while synflood, but it is
- * bad justification for our negligence 8)
- * To be honest, we are not able to make either
- * of the variants now. --ANK
- */
- }
- }
- }
- BUG_TRAP(!lopt->qlen);
-
- kfree(lopt);
-
- while ((req = acc_req) != NULL) {
- struct sock *child = req->sk;
-
- acc_req = req->dl_next;
-
- local_bh_disable();
- bh_lock_sock(child);
- BUG_TRAP(!sock_owned_by_user(child));
- sock_hold(child);
-
- tcp_disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- atomic_inc(&tcp_orphan_count);
-
- tcp_destroy_sock(child);
-
- bh_unlock_sock(child);
- local_bh_enable();
- sock_put(child);
-
- sk_acceptq_removed(sk);
- tcp_openreq_fastfree(req);
- }
- BUG_TRAP(!sk->sk_ack_backlog);
-}
-
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -603,7 +471,7 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
sk_charge_skb(sk, skb);
if (!sk->sk_send_head)
sk->sk_send_head = skb;
- else if (tp->nonagle&TCP_NAGLE_PUSH)
+ if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
}
@@ -634,7 +502,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
size_t psize, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
- int mss_now;
+ int mss_now, size_goal;
int err;
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -647,6 +515,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
copied = 0;
err = -EPIPE;
@@ -660,7 +529,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
- if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+ if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -671,7 +540,7 @@ new_segment:
goto wait_for_memory;
skb_entail(sk, tp, skb);
- copy = mss_now;
+ copy = size_goal;
}
if (copy > size)
@@ -683,8 +552,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (sk->sk_forward_alloc < copy &&
- !sk_stream_mem_schedule(sk, copy, 0))
+ if (!sk_stream_wmem_schedule(sk, copy))
goto wait_for_memory;
if (can_coalesce) {
@@ -712,7 +580,7 @@ new_segment:
if (!(psize -= copy))
goto out;
- if (skb->len != mss_now || (flags & MSG_OOB))
+ if (skb->len < mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
@@ -732,6 +600,7 @@ wait_for_memory:
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
}
out:
@@ -773,15 +642,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
- int tmp = tp->mss_cache_std;
+ int tmp = tp->mss_cache;
if (sk->sk_route_caps & NETIF_F_SG) {
- int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+ if (sk->sk_route_caps & NETIF_F_TSO)
+ tmp = 0;
+ else {
+ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
- if (tmp >= pgbreak &&
- tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
- tmp = pgbreak;
+ if (tmp >= pgbreak &&
+ tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+ tmp = pgbreak;
+ }
}
+
return tmp;
}
@@ -792,7 +666,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
- int mss_now;
+ int mss_now, size_goal;
int err, copied;
long timeo;
@@ -811,6 +685,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
@@ -833,7 +708,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
skb = sk->sk_write_queue.prev;
if (!sk->sk_send_head ||
- (copy = mss_now - skb->len) <= 0) {
+ (copy = size_goal - skb->len) <= 0) {
new_segment:
/* Allocate new segment. If the interface is SG,
@@ -856,7 +731,7 @@ new_segment:
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
- copy = mss_now;
+ copy = size_goal;
}
/* Try to append data to the end of skb. */
@@ -891,27 +766,26 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
} else if (page) {
- /* If page is cached, align
- * offset to L1 cache boundary
- */
- off = (off + L1_CACHE_BYTES - 1) &
- ~(L1_CACHE_BYTES - 1);
if (off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
+ off = 0;
}
- }
+ } else
+ off = 0;
+
+ if (copy > PAGE_SIZE - off)
+ copy = PAGE_SIZE - off;
+
+ if (!sk_stream_wmem_schedule(sk, copy))
+ goto wait_for_memory;
if (!page) {
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
- off = 0;
}
- if (copy > PAGE_SIZE - off)
- copy = PAGE_SIZE - off;
-
/* Time to copy data. We are close to
* the end! */
err = skb_copy_to_page(sk, from, skb, page,
@@ -956,7 +830,7 @@ new_segment:
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;
- if (skb->len != mss_now || (flags & MSG_OOB))
+ if (skb->len < mss_now || (flags & MSG_OOB))
continue;
if (forced_push(tp)) {
@@ -976,6 +850,7 @@ wait_for_memory:
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+ size_goal = tp->xmit_size_goal;
}
}
@@ -990,7 +865,7 @@ do_fault:
if (!skb->len) {
if (sk->sk_send_head == skb)
sk->sk_send_head = NULL;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
}
@@ -1072,20 +947,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
#endif
- if (tcp_ack_scheduled(tp)) {
+ if (inet_csk_ack_scheduled(sk)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
/* Delayed ACKs frequently hit locked sockets during bulk
* receive. */
- if (tp->ack.blocked ||
+ if (icsk->icsk_ack.blocked ||
/* Once-per-two-segments ACK was not sent by tcp_input.c */
- tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
* connection is not bidirectional, user drained
* receive buffer and there was a small segment
* in queue.
*/
- (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
- !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+ (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+ !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = 1;
}
@@ -1120,7 +996,7 @@ static void tcp_prequeue_process(struct sock *sk)
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+ NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
/* RX process wants to run with disabled BHs, though it is not
* necessary */
@@ -1345,7 +1221,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
cleanup_rbuf(sk, copied);
- if (tp->ucopy.task == user_recv) {
+ if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
/* Install new reader */
if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
user_recv = current;
@@ -1384,7 +1260,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* is not empty. It is more elegant, but eats cycles,
* unfortunately.
*/
- if (skb_queue_len(&tp->ucopy.prequeue))
+ if (!skb_queue_empty(&tp->ucopy.prequeue))
goto do_prequeue;
/* __ Set realtime policy in scheduler __ */
@@ -1409,7 +1285,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
}
if (tp->rcv_nxt == tp->copied_seq &&
- skb_queue_len(&tp->ucopy.prequeue)) {
+ !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
tcp_prequeue_process(sk);
@@ -1491,7 +1367,7 @@ skip_copy:
} while (len > 0);
if (user_recv) {
- if (skb_queue_len(&tp->ucopy.prequeue)) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
int chunk;
tp->ucopy.len = copied > 0 ? len : 0;
@@ -1587,40 +1463,6 @@ void tcp_shutdown(struct sock *sk, int how)
}
}
-/*
- * At this point, there should be no process reference to this
- * socket, and thus no user references at all. Therefore we
- * can assume the socket waitqueue is inactive and nobody will
- * try to jump onto it.
- */
-void tcp_destroy_sock(struct sock *sk)
-{
- BUG_TRAP(sk->sk_state == TCP_CLOSE);
- BUG_TRAP(sock_flag(sk, SOCK_DEAD));
-
- /* It cannot be in hash table! */
- BUG_TRAP(sk_unhashed(sk));
-
- /* If it has not 0 inet_sk(sk)->num, it must be bound */
- BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
-
- sk->sk_prot->destroy(sk);
-
- sk_stream_kill_queues(sk);
-
- xfrm_sk_free_policy(sk);
-
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&sk->sk_refcnt) != 1) {
- printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
- sk, atomic_read(&sk->sk_refcnt));
- }
-#endif
-
- atomic_dec(&tcp_orphan_count);
- sock_put(sk);
-}
-
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
@@ -1633,7 +1475,7 @@ void tcp_close(struct sock *sk, long timeout)
tcp_set_state(sk, TCP_CLOSE);
/* Special case. */
- tcp_listen_stop(sk);
+ inet_csk_listen_stop(sk);
goto adjudge_to_death;
}
@@ -1736,12 +1578,12 @@ adjudge_to_death:
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
} else {
- int tmo = tcp_fin_time(tp);
+ const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+ inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
} else {
- atomic_inc(&tcp_orphan_count);
+ atomic_inc(sk->sk_prot->orphan_count);
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto out;
}
@@ -1749,7 +1591,7 @@ adjudge_to_death:
}
if (sk->sk_state != TCP_CLOSE) {
sk_stream_mem_reclaim(sk);
- if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+ if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
if (net_ratelimit())
@@ -1760,10 +1602,10 @@ adjudge_to_death:
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
}
}
- atomic_inc(&tcp_orphan_count);
+ atomic_inc(sk->sk_prot->orphan_count);
if (sk->sk_state == TCP_CLOSE)
- tcp_destroy_sock(sk);
+ inet_csk_destroy_sock(sk);
/* Otherwise, socket is reprieved until protocol close. */
out:
@@ -1784,6 +1626,7 @@ static inline int tcp_need_reset(int state)
int tcp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err = 0;
int old_state = sk->sk_state;
@@ -1793,7 +1636,7 @@ int tcp_disconnect(struct sock *sk, int flags)
/* ABORT function of RFC793 */
if (old_state == TCP_LISTEN) {
- tcp_listen_stop(sk);
+ inet_csk_listen_stop(sk);
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1820,132 +1663,34 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->srtt = 0;
if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1;
- tp->backoff = 0;
+ icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
- tcp_delack_init(tp);
+ inet_csk_delack_init(sk);
sk->sk_send_head = NULL;
tp->rx_opt.saw_tstamp = 0;
tcp_sack_reset(&tp->rx_opt);
__sk_dst_reset(sk);
- BUG_TRAP(!inet->num || tp->bind_hash);
+ BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
}
/*
- * Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
- */
-static int wait_for_connect(struct sock *sk, long timeo)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- DEFINE_WAIT(wait);
- int err;
-
- /*
- * True wake-one mechanism for incoming connections: only
- * one process gets woken up, not the 'whole herd'.
- * Since we do not 'race & poll' for established sockets
- * anymore, the common case will execute the loop only once.
- *
- * Subtle issue: "add_wait_queue_exclusive()" will be added
- * after any current non-exclusive waiters, and we know that
- * it will always _stay_ after any new non-exclusive waiters
- * because all non-exclusive waiters are added at the
- * beginning of the wait-queue. As such, it's ok to "drop"
- * our exclusiveness temporarily when we get woken up without
- * having to remove and re-insert us on the wait queue.
- */
- for (;;) {
- prepare_to_wait_exclusive(sk->sk_sleep, &wait,
- TASK_INTERRUPTIBLE);
- release_sock(sk);
- if (!tp->accept_queue)
- timeo = schedule_timeout(timeo);
- lock_sock(sk);
- err = 0;
- if (tp->accept_queue)
- break;
- err = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- break;
- err = sock_intr_errno(timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!timeo)
- break;
- }
- finish_wait(sk->sk_sleep, &wait);
- return err;
-}
-
-/*
- * This will accept the next outstanding connection.
- */
-
-struct sock *tcp_accept(struct sock *sk, int flags, int *err)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct open_request *req;
- struct sock *newsk;
- int error;
-
- lock_sock(sk);
-
- /* We need to make sure that this socket is listening,
- * and that it has something pending.
- */
- error = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- goto out;
-
- /* Find already established connection */
- if (!tp->accept_queue) {
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
- /* If this is a non blocking socket don't sleep */
- error = -EAGAIN;
- if (!timeo)
- goto out;
-
- error = wait_for_connect(sk, timeo);
- if (error)
- goto out;
- }
-
- req = tp->accept_queue;
- if ((tp->accept_queue = req->dl_next) == NULL)
- tp->accept_queue_tail = NULL;
-
- newsk = req->sk;
- sk_acceptq_removed(sk);
- tcp_openreq_fastfree(req);
- BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
- release_sock(sk);
- return newsk;
-
-out:
- release_sock(sk);
- *err = error;
- return NULL;
-}
-
-/*
* Socket option code for TCP.
*/
int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int val;
int err = 0;
@@ -1953,6 +1698,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
return tp->af_specific->setsockopt(sk, level, optname,
optval, optlen);
+ /* This is a string value all the others are int's */
+ if (optname == TCP_CONGESTION) {
+ char name[TCP_CA_NAME_MAX];
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ val = strncpy_from_user(name, optval,
+ min(TCP_CA_NAME_MAX-1, optlen));
+ if (val < 0)
+ return -EFAULT;
+ name[val] = 0;
+
+ lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name);
+ release_sock(sk);
+ return err;
+ }
+
if (optlen < sizeof(int))
return -EINVAL;
@@ -2025,7 +1789,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
elapsed = tp->keepalive_time - elapsed;
else
elapsed = 0;
- tcp_reset_keepalive_timer(sk, elapsed);
+ inet_csk_reset_keepalive_timer(sk, elapsed);
}
}
break;
@@ -2045,7 +1809,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
if (val < 1 || val > MAX_TCP_SYNCNT)
err = -EINVAL;
else
- tp->syn_retries = val;
+ icsk->icsk_syn_retries = val;
break;
case TCP_LINGER2:
@@ -2058,15 +1822,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
break;
case TCP_DEFER_ACCEPT:
- tp->defer_accept = 0;
+ icsk->icsk_accept_queue.rskq_defer_accept = 0;
if (val > 0) {
/* Translate value in seconds to number of
* retransmits */
- while (tp->defer_accept < 32 &&
+ while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
val > ((TCP_TIMEOUT_INIT / HZ) <<
- tp->defer_accept))
- tp->defer_accept++;
- tp->defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept))
+ icsk->icsk_accept_queue.rskq_defer_accept++;
+ icsk->icsk_accept_queue.rskq_defer_accept++;
}
break;
@@ -2084,16 +1848,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
case TCP_QUICKACK:
if (!val) {
- tp->ack.pingpong = 1;
+ icsk->icsk_ack.pingpong = 1;
} else {
- tp->ack.pingpong = 0;
+ icsk->icsk_ack.pingpong = 0;
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- tcp_ack_scheduled(tp)) {
- tp->ack.pending |= TCP_ACK_PUSHED;
+ inet_csk_ack_scheduled(sk)) {
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
cleanup_rbuf(sk, 1);
if (!(val & 1))
- tp->ack.pingpong = 1;
+ icsk->icsk_ack.pingpong = 1;
}
}
break;
@@ -2110,15 +1874,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
memset(info, 0, sizeof(*info));
info->tcpi_state = sk->sk_state;
- info->tcpi_ca_state = tp->ca_state;
- info->tcpi_retransmits = tp->retransmits;
- info->tcpi_probes = tp->probes_out;
- info->tcpi_backoff = tp->backoff;
+ info->tcpi_ca_state = icsk->icsk_ca_state;
+ info->tcpi_retransmits = icsk->icsk_retransmits;
+ info->tcpi_probes = icsk->icsk_probes_out;
+ info->tcpi_backoff = icsk->icsk_backoff;
if (tp->rx_opt.tstamp_ok)
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2133,10 +1898,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->ecn_flags&TCP_ECN_OK)
info->tcpi_options |= TCPI_OPT_ECN;
- info->tcpi_rto = jiffies_to_usecs(tp->rto);
- info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
- info->tcpi_snd_mss = tp->mss_cache_std;
- info->tcpi_rcv_mss = tp->ack.rcv_mss;
+ info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+ info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+ info->tcpi_snd_mss = tp->mss_cache;
+ info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
info->tcpi_sacked = tp->sacked_out;
@@ -2145,7 +1910,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_fackets = tp->fackets_out;
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
- info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+ info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
info->tcpi_pmtu = tp->pmtu_cookie;
@@ -2168,6 +1933,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int __user *optlen)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int val, len;
@@ -2185,7 +1951,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
switch (optname) {
case TCP_MAXSEG:
- val = tp->mss_cache_std;
+ val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
break;
@@ -2205,7 +1971,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
break;
case TCP_SYNCNT:
- val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
val = tp->linger2;
@@ -2213,8 +1979,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
val = (val ? : sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
- (tp->defer_accept - 1));
+ val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+ ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
break;
case TCP_WINDOW_CLAMP:
val = tp->window_clamp;
@@ -2235,8 +2001,18 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
return 0;
}
case TCP_QUICKACK:
- val = !tp->ack.pingpong;
+ val = !icsk->icsk_ack.pingpong;
break;
+
+ case TCP_CONGESTION:
+ if (get_user(len, optlen))
+ return -EFAULT;
+ len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+ return -EFAULT;
+ return 0;
default:
return -ENOPROTOOPT;
};
@@ -2250,7 +2026,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
extern void __skb_cb_too_small_for_tcp(int, int);
-extern void tcpdiag_init(void);
+extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
@@ -2271,86 +2047,72 @@ void __init tcp_init(void)
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
sizeof(skb->cb));
- tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
- sizeof(struct open_request),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_openreq_cachep)
- panic("tcp_init: Cannot alloc open_request cache.");
-
- tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
- sizeof(struct tcp_bind_bucket),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_bucket_cachep)
+ tcp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!tcp_hashinfo.bind_bucket_cachep)
panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
- tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
- sizeof(struct tcp_tw_bucket),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (!tcp_timewait_cachep)
- panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
-
/* Size and allocate the main established and bind bucket
* hash tables.
*
* The methodology is similar to that of the buffer cache.
*/
- tcp_ehash = (struct tcp_ehash_bucket *)
+ tcp_hashinfo.ehash =
alloc_large_system_hash("TCP established",
- sizeof(struct tcp_ehash_bucket),
+ sizeof(struct inet_ehash_bucket),
thash_entries,
(num_physpages >= 128 * 1024) ?
(25 - PAGE_SHIFT) :
(27 - PAGE_SHIFT),
HASH_HIGHMEM,
- &tcp_ehash_size,
+ &tcp_hashinfo.ehash_size,
NULL,
0);
- tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
- for (i = 0; i < (tcp_ehash_size << 1); i++) {
- rwlock_init(&tcp_ehash[i].lock);
- INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+ tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
+ for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
+ rwlock_init(&tcp_hashinfo.ehash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
}
- tcp_bhash = (struct tcp_bind_hashbucket *)
+ tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
- sizeof(struct tcp_bind_hashbucket),
- tcp_ehash_size,
+ sizeof(struct inet_bind_hashbucket),
+ tcp_hashinfo.ehash_size,
(num_physpages >= 128 * 1024) ?
(25 - PAGE_SHIFT) :
(27 - PAGE_SHIFT),
HASH_HIGHMEM,
- &tcp_bhash_size,
+ &tcp_hashinfo.bhash_size,
NULL,
64 * 1024);
- tcp_bhash_size = 1 << tcp_bhash_size;
- for (i = 0; i < tcp_bhash_size; i++) {
- spin_lock_init(&tcp_bhash[i].lock);
- INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+ tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+ for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+ spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
/* Try to be a bit smarter and adjust defaults depending
* on available memory.
*/
for (order = 0; ((1 << order) << PAGE_SHIFT) <
- (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+ (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
order++)
;
- if (order > 4) {
+ if (order >= 4) {
sysctl_local_port_range[0] = 32768;
sysctl_local_port_range[1] = 61000;
- sysctl_tcp_max_tw_buckets = 180000;
+ tcp_death_row.sysctl_max_tw_buckets = 180000;
sysctl_tcp_max_orphans = 4096 << (order - 4);
sysctl_max_syn_backlog = 1024;
} else if (order < 3) {
sysctl_local_port_range[0] = 1024 * (3 - order);
- sysctl_tcp_max_tw_buckets >>= (3 - order);
+ tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
sysctl_tcp_max_orphans >>= (3 - order);
sysctl_max_syn_backlog = 128;
}
- tcp_port_rover = sysctl_local_port_range[0] - 1;
+ tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
sysctl_tcp_mem[0] = 768 << order;
sysctl_tcp_mem[1] = 1024 << order;
@@ -2365,16 +2127,15 @@ void __init tcp_init(void)
printk(KERN_INFO "TCP: Hash tables configured "
"(established %d bind %d)\n",
- tcp_ehash_size << 1, tcp_bhash_size);
+ tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
+
+ tcp_register_congestion_control(&tcp_reno);
}
-EXPORT_SYMBOL(tcp_accept);
EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_destroy_sock);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_openreq_cachep);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
@@ -2383,4 +2144,3 @@ EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 0000000..b940346
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,337 @@
+/*
+ * Binary Increase Congestion control for TCP
+ *
+ * This is from the implementation of BICTCP in
+ * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
+ * "Binary Increase Congestion Control for Fast, Long Distance
+ * Networks" in InfoComm 2004
+ * Available from:
+ * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_B 4 /*
+ * In binary search,
+ * go to point (max+min)/N
+ */
+
+static int fast_convergence = 1;
+static int max_increment = 32;
+static int low_window = 14;
+static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
+static int low_utilization_threshold = 153;
+static int low_utilization_period = 2;
+static int initial_ssthresh = 100;
+static int smooth_part = 20;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(low_utilization_threshold, int, 0644);
+MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
+module_param(low_utilization_period, int, 0644);
+MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+
+/* BIC TCP Parameters */
+struct bictcp {
+ u32 cnt; /* increase cwnd by 1 after ACKs */
+ u32 last_max_cwnd; /* last maximum snd_cwnd */
+ u32 loss_cwnd; /* congestion window at last loss */
+ u32 last_cwnd; /* the last snd_cwnd */
+ u32 last_time; /* time when updated last_cwnd */
+ u32 delay_min; /* min delay */
+ u32 delay_max; /* max delay */
+ u32 last_delay;
+ u8 low_utilization;/* 0: high; 1: low */
+ u32 low_utilization_start; /* starting time of low utilization detection*/
+ u32 epoch_start; /* beginning of an epoch */
+#define ACK_RATIO_SHIFT 4
+ u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+ ca->cnt = 0;
+ ca->last_max_cwnd = 0;
+ ca->loss_cwnd = 0;
+ ca->last_cwnd = 0;
+ ca->last_time = 0;
+ ca->delay_min = 0;
+ ca->delay_max = 0;
+ ca->last_delay = 0;
+ ca->low_utilization = 0;
+ ca->low_utilization_start = 0;
+ ca->epoch_start = 0;
+ ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+ bictcp_reset(inet_csk_ca(sk));
+ if (initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+ if (ca->last_cwnd == cwnd &&
+ (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+ return;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_time_stamp;
+
+ if (ca->epoch_start == 0) /* record the beginning of an epoch */
+ ca->epoch_start = tcp_time_stamp;
+
+ /* start off normal */
+ if (cwnd <= low_window) {
+ ca->cnt = cwnd;
+ return;
+ }
+
+ /* binary increase */
+ if (cwnd < ca->last_max_cwnd) {
+ __u32 dist = (ca->last_max_cwnd - cwnd)
+ / BICTCP_B;
+
+ if (dist > max_increment)
+ /* linear increase */
+ ca->cnt = cwnd / max_increment;
+ else if (dist <= 1U)
+ /* binary search increase */
+ ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+ else
+ /* binary search increase */
+ ca->cnt = cwnd / dist;
+ } else {
+ /* slow start AMD linear increase */
+ if (cwnd < ca->last_max_cwnd + BICTCP_B)
+ /* slow start */
+ ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+ else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+ /* slow start */
+ ca->cnt = (cwnd * (BICTCP_B-1))
+ / cwnd-ca->last_max_cwnd;
+ else
+ /* linear increase */
+ ca->cnt = cwnd / max_increment;
+ }
+
+ /* if in slow start or link utilization is very low */
+ if ( ca->loss_cwnd == 0 ||
+ (cwnd > ca->loss_cwnd && ca->low_utilization)) {
+ if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+ ca->cnt = 20;
+ }
+
+ ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+ if (ca->cnt == 0) /* cannot be zero */
+ ca->cnt = 1;
+}
+
+
+/* Detect low utilization in congestion avoidance */
+static inline void bictcp_low_utilization(struct sock *sk, int flag)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ u32 dist, delay;
+
+ /* No time stamp */
+ if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
+ /* Discard delay samples right after fast recovery */
+ tcp_time_stamp < ca->epoch_start + HZ ||
+ /* this delay samples may not be accurate */
+ flag == 0) {
+ ca->last_delay = 0;
+ goto notlow;
+ }
+
+ delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
+ ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ if (delay == 0) /* no previous delay sample */
+ goto notlow;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay) {
+ ca->delay_min = ca->delay_max = delay;
+ goto notlow;
+ }
+
+ if (ca->delay_max < delay)
+ ca->delay_max = delay;
+
+ /* utilization is low, if avg delay < dist*threshold
+ for checking_period time */
+ dist = ca->delay_max - ca->delay_min;
+ if (dist <= ca->delay_min>>6 ||
+ tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
+ goto notlow;
+
+ if (ca->low_utilization_start == 0) {
+ ca->low_utilization = 0;
+ ca->low_utilization_start = tcp_time_stamp;
+ } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
+ > low_utilization_period*HZ) {
+ ca->low_utilization = 1;
+ }
+
+ return;
+
+ notlow:
+ ca->low_utilization = 0;
+ ca->low_utilization_start = 0;
+
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
+ u32 seq_rtt, u32 in_flight, int data_acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_low_utilization(sk, data_acked);
+
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ } else {
+ bictcp_update(ca, tp->snd_cwnd);
+
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+ */
+ if (tp->snd_cwnd_cnt >= ca->cnt) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+
+}
+
+/*
+ * behave like Reno until low_window is reached,
+ * then increase congestion window slowly
+ */
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->epoch_start = 0; /* end of epoch */
+
+ /* in case of wrong delay_max*/
+ if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
+ ca->delay_max = ca->delay_min
+ + ((ca->delay_max - ca->delay_min)* 90) / 100;
+
+ /* Wmax and fast convergence */
+ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ / (2 * BICTCP_BETA_SCALE);
+ else
+ ca->last_max_cwnd = tp->snd_cwnd;
+
+ ca->loss_cwnd = tp->snd_cwnd;
+
+
+ if (tp->snd_cwnd <= low_window)
+ return max(tp->snd_cwnd >> 1U, 2U);
+ else
+ return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct bictcp *ca = inet_csk_ca(sk);
+ return max(tp->snd_cwnd, ca->last_max_cwnd);
+}
+
+static u32 bictcp_min_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ return tp->snd_ssthresh;
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+ if (new_state == TCP_CA_Loss)
+ bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgement ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+ ca->delayed_ack += cnt;
+ }
+}
+
+
+static struct tcp_congestion_ops bictcp = {
+ .init = bictcp_init,
+ .ssthresh = bictcp_recalc_ssthresh,
+ .cong_avoid = bictcp_cong_avoid,
+ .set_state = bictcp_state,
+ .undo_cwnd = bictcp_undo_cwnd,
+ .min_cwnd = bictcp_min_cwnd,
+ .pkts_acked = bictcp_acked,
+ .owner = THIS_MODULE,
+ .name = "bic",
+};
+
+static int __init bictcp_register(void)
+{
+ BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&bictcp);
+}
+
+static void __exit bictcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&bictcp);
+}
+
+module_init(bictcp_register);
+module_exit(bictcp_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BIC TCP");
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 0000000..bbf2d66
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,245 @@
+/*
+ * Plugable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler suport and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+static LIST_HEAD(tcp_cong_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+ struct tcp_congestion_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_cong_list, list) {
+ if (strcmp(e->name, name) == 0)
+ return e;
+ }
+
+ return NULL;
+}
+
+/*
+ * Attach new congestion control algorthim to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret = 0;
+
+ /* all algorithms must implement ssthresh and cong_avoid ops */
+ if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+ printk(KERN_ERR "TCP %s does not implement required ops\n",
+ ca->name);
+ return -EINVAL;
+ }
+
+ spin_lock(&tcp_cong_list_lock);
+ if (tcp_ca_find(ca->name)) {
+ printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+ ret = -EEXIST;
+ } else {
+ list_add_rcu(&ca->list, &tcp_cong_list);
+ printk(KERN_INFO "TCP %s registered\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Remove congestion control algorithm, called from
+ * the module's remove function. Module ref counts are used
+ * to ensure that this can't be done till all sockets using
+ * that method are closed.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+ spin_lock(&tcp_cong_list_lock);
+ list_del_rcu(&ca->list);
+ spin_unlock(&tcp_cong_list_lock);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+/* Assign choice of congestion control. */
+void tcp_init_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_congestion_ops *ca;
+
+ if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+ if (try_module_get(ca->owner)) {
+ icsk->icsk_ca_ops = ca;
+ break;
+ }
+
+ }
+ rcu_read_unlock();
+
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+}
+
+/* Manage refcounts on socket close. */
+void tcp_cleanup_congestion_control(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ module_put(icsk->icsk_ca_ops->owner);
+}
+
+/* Used by sysctl to change default congestion control */
+int tcp_set_default_congestion_control(const char *name)
+{
+ struct tcp_congestion_ops *ca;
+ int ret = -ENOENT;
+
+ spin_lock(&tcp_cong_list_lock);
+ ca = tcp_ca_find(name);
+#ifdef CONFIG_KMOD
+ if (!ca) {
+ spin_unlock(&tcp_cong_list_lock);
+
+ request_module("tcp_%s", name);
+ spin_lock(&tcp_cong_list_lock);
+ ca = tcp_ca_find(name);
+ }
+#endif
+
+ if (ca) {
+ list_move(&ca->list, &tcp_cong_list);
+ ret = 0;
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ return ret;
+}
+
+/* Get current default congestion control */
+void tcp_get_default_congestion_control(char *name)
+{
+ struct tcp_congestion_ops *ca;
+ /* We will always have reno... */
+ BUG_ON(list_empty(&tcp_cong_list));
+
+ rcu_read_lock();
+ ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+ strncpy(name, ca->name, TCP_CA_NAME_MAX);
+ rcu_read_unlock();
+}
+
+/* Change congestion control for socket */
+int tcp_set_congestion_control(struct sock *sk, const char *name)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_congestion_ops *ca;
+ int err = 0;
+
+ rcu_read_lock();
+ ca = tcp_ca_find(name);
+ if (ca == icsk->icsk_ca_ops)
+ goto out;
+
+ if (!ca)
+ err = -ENOENT;
+
+ else if (!try_module_get(ca->owner))
+ err = -EBUSY;
+
+ else {
+ tcp_cleanup_congestion_control(sk);
+ icsk->icsk_ca_ops = ca;
+ if (icsk->icsk_ca_ops->init)
+ icsk->icsk_ca_ops->init(sk);
+ }
+ out:
+ rcu_read_unlock();
+ return err;
+}
+
+/*
+ * TCP Reno congestion control
+ * This is special case used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
+ int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ } else {
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* Slow start threshold is half the congestion window (min 2) */
+u32 tcp_reno_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ return max(tp->snd_cwnd >> 1U, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+/* Lower bound on congestion window. */
+u32 tcp_reno_min_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ return tp->snd_ssthresh/2;
+}
+EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
+
+struct tcp_congestion_ops tcp_reno = {
+ .name = "reno",
+ .owner = THIS_MODULE,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .min_cwnd = tcp_reno_min_cwnd,
+};
+
+/* Initial congestion control used (until SYN)
+ * really reno under another name so we can tell difference
+ * during tcp_set_default_congestion_control
+ */
+struct tcp_congestion_ops tcp_init_congestion_ops = {
+ .name = "",
+ .owner = THIS_MODULE,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .min_cwnd = tcp_reno_min_cwnd,
+};
+EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 8faa894..c148c10 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
/*
- * tcp_diag.c Module for monitoring TCP sockets.
+ * tcp_diag.c Module for monitoring TCP transport protocols sockets.
*
* Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
*
@@ -12,792 +12,43 @@
*/
#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/random.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/time.h>
-
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/ipv6.h>
-#include <net/inet_common.h>
-
-#include <linux/inet.h>
-#include <linux/stddef.h>
-
-#include <linux/tcp_diag.h>
-struct tcpdiag_entry
-{
- u32 *saddr;
- u32 *daddr;
- u16 sport;
- u16 dport;
- u16 family;
- u16 userlocks;
-};
-
-static struct sock *tcpnl;
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/tcp.h>
-#define TCPDIAG_PUT(skb, attrtype, attrlen) \
-({ int rtalen = RTA_LENGTH(attrlen); \
- struct rtattr *rta; \
- if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
- rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
- rta->rta_type = attrtype; \
- rta->rta_len = rtalen; \
- RTA_DATA(rta); })
+#include <net/tcp.h>
-static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
- int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+ void *_info)
{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcpdiagmsg *r;
- struct nlmsghdr *nlh;
- struct tcp_info *info = NULL;
- struct tcpdiag_meminfo *minfo = NULL;
- struct tcpvegas_info *vinfo = NULL;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
- nlh->nlmsg_flags = nlmsg_flags;
- r = NLMSG_DATA(nlh);
- if (sk->sk_state != TCP_TIME_WAIT) {
- if (ext & (1<<(TCPDIAG_MEMINFO-1)))
- minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
- if (ext & (1<<(TCPDIAG_INFO-1)))
- info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
-
- if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
- && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
- vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
- }
- r->tcpdiag_family = sk->sk_family;
- r->tcpdiag_state = sk->sk_state;
- r->tcpdiag_timer = 0;
- r->tcpdiag_retrans = 0;
-
- r->id.tcpdiag_if = sk->sk_bound_dev_if;
- r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
- r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
-
- if (r->tcpdiag_state == TCP_TIME_WAIT) {
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
- long tmo = tw->tw_ttd - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.tcpdiag_sport = tw->tw_sport;
- r->id.tcpdiag_dport = tw->tw_dport;
- r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
- r->id.tcpdiag_dst[0] = tw->tw_daddr;
- r->tcpdiag_state = tw->tw_substate;
- r->tcpdiag_timer = 3;
- r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
- r->tcpdiag_rqueue = 0;
- r->tcpdiag_wqueue = 0;
- r->tcpdiag_uid = 0;
- r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &tw->tw_v6_rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &tw->tw_v6_daddr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
- }
-
- r->id.tcpdiag_sport = inet->sport;
- r->id.tcpdiag_dport = inet->dport;
- r->id.tcpdiag_src[0] = inet->rcv_saddr;
- r->id.tcpdiag_dst[0] = inet->daddr;
-
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &np->rcv_saddr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &np->daddr);
- }
-#endif
-
-#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_info *info = _info;
- if (tp->pending == TCP_TIME_RETRANS) {
- r->tcpdiag_timer = 1;
- r->tcpdiag_retrans = tp->retransmits;
- r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
- } else if (tp->pending == TCP_TIME_PROBE0) {
- r->tcpdiag_timer = 4;
- r->tcpdiag_retrans = tp->probes_out;
- r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
- } else if (timer_pending(&sk->sk_timer)) {
- r->tcpdiag_timer = 2;
- r->tcpdiag_retrans = tp->probes_out;
- r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
- } else {
- r->tcpdiag_timer = 0;
- r->tcpdiag_expires = 0;
- }
-#undef EXPIRES_IN_MS
-
- r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
- r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
- r->tcpdiag_uid = sock_i_uid(sk);
- r->tcpdiag_inode = sock_i_ino(sk);
-
- if (minfo) {
- minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
- minfo->tcpdiag_wmem = sk->sk_wmem_queued;
- minfo->tcpdiag_fmem = sk->sk_forward_alloc;
- minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
- }
-
- if (info)
+ r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+ r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ if (info != NULL)
tcp_get_info(sk, info);
-
- if (vinfo) {
- if (tcp_is_vegas(tp)) {
- vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
- vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
- vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
- vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
- } else {
- vinfo->tcpv_enabled = 0;
- vinfo->tcpv_rttcnt = 0;
- vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
- vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
- }
- }
-
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
- int dif);
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif);
-#else
-static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif)
-{
- return NULL;
-}
-#endif
-
-static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
-{
- int err;
- struct sock *sk;
- struct tcpdiagreq *req = NLMSG_DATA(nlh);
- struct sk_buff *rep;
-
- if (req->tcpdiag_family == AF_INET) {
- sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
- req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
- req->id.tcpdiag_if);
- }
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- else if (req->tcpdiag_family == AF_INET6) {
- sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
- (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
- req->id.tcpdiag_if);
- }
-#endif
- else {
- return -EINVAL;
- }
-
- if (sk == NULL)
- return -ENOENT;
-
- err = -ESTALE;
- if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
- req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
- ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
- (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
- goto out;
-
- err = -ENOMEM;
- rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
- sizeof(struct tcpdiag_meminfo)+
- sizeof(struct tcp_info)+64), GFP_KERNEL);
- if (!rep)
- goto out;
-
- if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
- NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, 0) <= 0)
- BUG();
-
- err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
- if (err > 0)
- err = 0;
-
-out:
- if (sk) {
- if (sk->sk_state == TCP_TIME_WAIT)
- tcp_tw_put((struct tcp_tw_bucket*)sk);
- else
- sock_put(sk);
- }
- return err;
-}
-
-static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
-{
- int words = bits >> 5;
-
- bits &= 0x1f;
-
- if (words) {
- if (memcmp(a1, a2, words << 2))
- return 0;
- }
- if (bits) {
- __u32 w1, w2;
- __u32 mask;
-
- w1 = a1[words];
- w2 = a2[words];
-
- mask = htonl((0xffffffff) << (32 - bits));
-
- if ((w1 ^ w2) & mask)
- return 0;
- }
-
- return 1;
-}
-
-
-static int tcpdiag_bc_run(const void *bc, int len,
- const struct tcpdiag_entry *entry)
-{
- while (len > 0) {
- int yes = 1;
- const struct tcpdiag_bc_op *op = bc;
-
- switch (op->code) {
- case TCPDIAG_BC_NOP:
- break;
- case TCPDIAG_BC_JMP:
- yes = 0;
- break;
- case TCPDIAG_BC_S_GE:
- yes = entry->sport >= op[1].no;
- break;
- case TCPDIAG_BC_S_LE:
- yes = entry->dport <= op[1].no;
- break;
- case TCPDIAG_BC_D_GE:
- yes = entry->dport >= op[1].no;
- break;
- case TCPDIAG_BC_D_LE:
- yes = entry->dport <= op[1].no;
- break;
- case TCPDIAG_BC_AUTO:
- yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
- break;
- case TCPDIAG_BC_S_COND:
- case TCPDIAG_BC_D_COND:
- {
- struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
- u32 *addr;
-
- if (cond->port != -1 &&
- cond->port != (op->code == TCPDIAG_BC_S_COND ?
- entry->sport : entry->dport)) {
- yes = 0;
- break;
- }
-
- if (cond->prefix_len == 0)
- break;
-
- if (op->code == TCPDIAG_BC_S_COND)
- addr = entry->saddr;
- else
- addr = entry->daddr;
-
- if (bitstring_match(addr, cond->addr, cond->prefix_len))
- break;
- if (entry->family == AF_INET6 &&
- cond->family == AF_INET) {
- if (addr[0] == 0 && addr[1] == 0 &&
- addr[2] == htonl(0xffff) &&
- bitstring_match(addr+3, cond->addr, cond->prefix_len))
- break;
- }
- yes = 0;
- break;
- }
- }
-
- if (yes) {
- len -= op->yes;
- bc += op->yes;
- } else {
- len -= op->no;
- bc += op->no;
- }
- }
- return (len == 0);
-}
-
-static int valid_cc(const void *bc, int len, int cc)
-{
- while (len >= 0) {
- const struct tcpdiag_bc_op *op = bc;
-
- if (cc > len)
- return 0;
- if (cc == len)
- return 1;
- if (op->yes < 4)
- return 0;
- len -= op->yes;
- bc += op->yes;
- }
- return 0;
-}
-
-static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
-{
- const unsigned char *bc = bytecode;
- int len = bytecode_len;
-
- while (len > 0) {
- struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
-
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
- switch (op->code) {
- case TCPDIAG_BC_AUTO:
- case TCPDIAG_BC_S_COND:
- case TCPDIAG_BC_D_COND:
- case TCPDIAG_BC_S_GE:
- case TCPDIAG_BC_S_LE:
- case TCPDIAG_BC_D_GE:
- case TCPDIAG_BC_D_LE:
- if (op->yes < 4 || op->yes > len+4)
- return -EINVAL;
- case TCPDIAG_BC_JMP:
- if (op->no < 4 || op->no > len+4)
- return -EINVAL;
- if (op->no < len &&
- !valid_cc(bytecode, bytecode_len, len-op->no))
- return -EINVAL;
- break;
- case TCPDIAG_BC_NOP:
- if (op->yes < 4 || op->yes > len+4)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
- bc += op->yes;
- len -= op->yes;
- }
- return len == 0 ? 0 : -EINVAL;
-}
-
-static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
-{
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- struct tcpdiag_entry entry;
- struct rtattr *bc = (struct rtattr *)(r + 1);
- struct inet_sock *inet = inet_sk(sk);
-
- entry.family = sk->sk_family;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (entry.family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- entry.saddr = np->rcv_saddr.s6_addr32;
- entry.daddr = np->daddr.s6_addr32;
- } else
-#endif
- {
- entry.saddr = &inet->rcv_saddr;
- entry.daddr = &inet->daddr;
- }
- entry.sport = inet->num;
- entry.dport = ntohs(inet->dport);
- entry.userlocks = sk->sk_userlocks;
-
- if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
- return 0;
- }
-
- return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI);
-}
-
-static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
- struct open_request *req,
- u32 pid, u32 seq)
-{
- struct inet_sock *inet = inet_sk(sk);
- unsigned char *b = skb->tail;
- struct tcpdiagmsg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
- nlh->nlmsg_flags = NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
-
- r->tcpdiag_family = sk->sk_family;
- r->tcpdiag_state = TCP_SYN_RECV;
- r->tcpdiag_timer = 1;
- r->tcpdiag_retrans = req->retrans;
-
- r->id.tcpdiag_if = sk->sk_bound_dev_if;
- r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
- r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
-
- tmo = req->expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- r->id.tcpdiag_sport = inet->sport;
- r->id.tcpdiag_dport = req->rmt_port;
- r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
- r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
- r->tcpdiag_expires = jiffies_to_msecs(tmo),
- r->tcpdiag_rqueue = 0;
- r->tcpdiag_wqueue = 0;
- r->tcpdiag_uid = sock_i_uid(sk);
- r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- if (r->tcpdiag_family == AF_INET6) {
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
- &req->af.v6_req.loc_addr);
- ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
- &req->af.v6_req.rmt_addr);
- }
-#endif
- nlh->nlmsg_len = skb->tail - b;
-
- return skb->len;
-
-nlmsg_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-
-static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
- struct netlink_callback *cb)
-{
- struct tcpdiag_entry entry;
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_listen_opt *lopt;
- struct rtattr *bc = NULL;
- struct inet_sock *inet = inet_sk(sk);
- int j, s_j;
- int reqnum, s_reqnum;
- int err = 0;
-
- s_j = cb->args[3];
- s_reqnum = cb->args[4];
-
- if (s_j > 0)
- s_j--;
-
- entry.family = sk->sk_family;
-
- read_lock_bh(&tp->syn_wait_lock);
-
- lopt = tp->listen_opt;
- if (!lopt || !lopt->qlen)
- goto out;
-
- if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
- bc = (struct rtattr *)(r + 1);
- entry.sport = inet->num;
- entry.userlocks = sk->sk_userlocks;
- }
-
- for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
- struct open_request *req, *head = lopt->syn_table[j];
-
- reqnum = 0;
- for (req = head; req; reqnum++, req = req->dl_next) {
- if (reqnum < s_reqnum)
- continue;
- if (r->id.tcpdiag_dport != req->rmt_port &&
- r->id.tcpdiag_dport)
- continue;
-
- if (bc) {
- entry.saddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- (entry.family == AF_INET6) ?
- req->af.v6_req.loc_addr.s6_addr32 :
-#endif
- &req->af.v4_req.loc_addr;
- entry.daddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
- (entry.family == AF_INET6) ?
- req->af.v6_req.rmt_addr.s6_addr32 :
-#endif
- &req->af.v4_req.rmt_addr;
- entry.dport = ntohs(req->rmt_port);
-
- if (!tcpdiag_bc_run(RTA_DATA(bc),
- RTA_PAYLOAD(bc), &entry))
- continue;
- }
-
- err = tcpdiag_fill_req(skb, sk, req,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq);
- if (err < 0) {
- cb->args[3] = j + 1;
- cb->args[4] = reqnum;
- goto out;
- }
- }
-
- s_reqnum = 0;
- }
-
-out:
- read_unlock_bh(&tp->syn_wait_lock);
-
- return err;
}
-static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- int i, num;
- int s_i, s_num;
- struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
- goto skip_listen_ht;
- tcp_listen_lock();
- for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
- struct sock *sk;
- struct hlist_node *node;
-
- num = 0;
- sk_for_each(sk, node, &tcp_listening_hash[i]) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_listen;
-
- if (!(r->tcpdiag_states&TCPF_LISTEN) ||
- r->id.tcpdiag_dport ||
- cb->args[3] > 0)
- goto syn_recv;
-
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- tcp_listen_unlock();
- goto done;
- }
-
-syn_recv:
- if (!(r->tcpdiag_states&TCPF_SYN_RECV))
- goto next_listen;
-
- if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
- tcp_listen_unlock();
- goto done;
- }
-
-next_listen:
- cb->args[3] = 0;
- cb->args[4] = 0;
- ++num;
- }
-
- s_num = 0;
- cb->args[3] = 0;
- cb->args[4] = 0;
- }
- tcp_listen_unlock();
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
-
- if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
- return skb->len;
-
- for (i = s_i; i < tcp_ehash_size; i++) {
- struct tcp_ehash_bucket *head = &tcp_ehash[i];
- struct sock *sk;
- struct hlist_node *node;
-
- if (i > s_i)
- s_num = 0;
-
- read_lock_bh(&head->lock);
-
- num = 0;
- sk_for_each(sk, node, &head->chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_normal;
- if (!(r->tcpdiag_states & (1 << sk->sk_state)))
- goto next_normal;
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_normal;
- if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
- goto next_normal;
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_normal:
- ++num;
- }
-
- if (r->tcpdiag_states&TCPF_TIME_WAIT) {
- sk_for_each(sk, node,
- &tcp_ehash[i + tcp_ehash_size].chain) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_dying;
- if (r->id.tcpdiag_sport != inet->sport &&
- r->id.tcpdiag_sport)
- goto next_dying;
- if (r->id.tcpdiag_dport != inet->dport &&
- r->id.tcpdiag_dport)
- goto next_dying;
- if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
- read_unlock_bh(&head->lock);
- goto done;
- }
-next_dying:
- ++num;
- }
- }
- read_unlock_bh(&head->lock);
- }
-
-done:
- cb->args[1] = i;
- cb->args[2] = num;
- return skb->len;
-}
-
-static int tcpdiag_dump_done(struct netlink_callback *cb)
-{
- return 0;
-}
-
-
-static __inline__ int
-tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
- if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
- return 0;
-
- if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
- goto err_inval;
-
- if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
- goto err_inval;
-
- if (nlh->nlmsg_flags&NLM_F_DUMP) {
- if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
- struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
- if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
- rta->rta_len < 8 ||
- rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
- goto err_inval;
- if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
- goto err_inval;
- }
- return netlink_dump_start(tcpnl, skb, nlh,
- tcpdiag_dump,
- tcpdiag_dump_done);
- } else {
- return tcpdiag_get_exact(skb, nlh);
- }
-
-err_inval:
- return -EINVAL;
-}
-
-
-static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr * nlh;
-
- if (skb->len >= NLMSG_SPACE(0)) {
- nlh = (struct nlmsghdr *)skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return;
- err = tcpdiag_rcv_msg(skb, nlh);
- if (err || nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, err);
- }
-}
-
-static void tcpdiag_rcv(struct sock *sk, int len)
-{
- struct sk_buff *skb;
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
-
- while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
- tcpdiag_rcv_skb(skb);
- kfree_skb(skb);
- }
-}
+static struct inet_diag_handler tcp_diag_handler = {
+ .idiag_hashinfo = &tcp_hashinfo,
+ .idiag_get_info = tcp_diag_get_info,
+ .idiag_type = TCPDIAG_GETSOCK,
+ .idiag_info_size = sizeof(struct tcp_info),
+};
-static int __init tcpdiag_init(void)
+static int __init tcp_diag_init(void)
{
- tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
- if (tcpnl == NULL)
- return -ENOMEM;
- return 0;
+ return inet_diag_register(&tcp_diag_handler);
}
-static void __exit tcpdiag_exit(void)
+static void __exit tcp_diag_exit(void)
{
- sock_release(tcpnl->sk_socket);
+ inet_diag_unregister(&tcp_diag_handler);
}
-module_init(tcpdiag_init);
-module_exit(tcpdiag_exit);
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 0000000..6acc04b
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,184 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner <jheffner@psc.edu>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+/* From AIMD tables from RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ */
+static const struct hstcp_aimd_val {
+ unsigned int cwnd;
+ unsigned int md;
+} hstcp_aimd_vals[] = {
+ { 38, 128, /* 0.50 */ },
+ { 118, 112, /* 0.44 */ },
+ { 221, 104, /* 0.41 */ },
+ { 347, 98, /* 0.38 */ },
+ { 495, 93, /* 0.37 */ },
+ { 663, 89, /* 0.35 */ },
+ { 851, 86, /* 0.34 */ },
+ { 1058, 83, /* 0.33 */ },
+ { 1284, 81, /* 0.32 */ },
+ { 1529, 78, /* 0.31 */ },
+ { 1793, 76, /* 0.30 */ },
+ { 2076, 74, /* 0.29 */ },
+ { 2378, 72, /* 0.28 */ },
+ { 2699, 71, /* 0.28 */ },
+ { 3039, 69, /* 0.27 */ },
+ { 3399, 68, /* 0.27 */ },
+ { 3778, 66, /* 0.26 */ },
+ { 4177, 65, /* 0.26 */ },
+ { 4596, 64, /* 0.25 */ },
+ { 5036, 62, /* 0.25 */ },
+ { 5497, 61, /* 0.24 */ },
+ { 5979, 60, /* 0.24 */ },
+ { 6483, 59, /* 0.23 */ },
+ { 7009, 58, /* 0.23 */ },
+ { 7558, 57, /* 0.22 */ },
+ { 8130, 56, /* 0.22 */ },
+ { 8726, 55, /* 0.22 */ },
+ { 9346, 54, /* 0.21 */ },
+ { 9991, 53, /* 0.21 */ },
+ { 10661, 52, /* 0.21 */ },
+ { 11358, 52, /* 0.20 */ },
+ { 12082, 51, /* 0.20 */ },
+ { 12834, 50, /* 0.20 */ },
+ { 13614, 49, /* 0.19 */ },
+ { 14424, 48, /* 0.19 */ },
+ { 15265, 48, /* 0.19 */ },
+ { 16137, 47, /* 0.19 */ },
+ { 17042, 46, /* 0.18 */ },
+ { 17981, 45, /* 0.18 */ },
+ { 18955, 45, /* 0.18 */ },
+ { 19965, 44, /* 0.17 */ },
+ { 21013, 43, /* 0.17 */ },
+ { 22101, 43, /* 0.17 */ },
+ { 23230, 42, /* 0.17 */ },
+ { 24402, 41, /* 0.16 */ },
+ { 25618, 41, /* 0.16 */ },
+ { 26881, 40, /* 0.16 */ },
+ { 28193, 39, /* 0.16 */ },
+ { 29557, 39, /* 0.15 */ },
+ { 30975, 38, /* 0.15 */ },
+ { 32450, 38, /* 0.15 */ },
+ { 33986, 37, /* 0.15 */ },
+ { 35586, 36, /* 0.14 */ },
+ { 37253, 36, /* 0.14 */ },
+ { 38992, 35, /* 0.14 */ },
+ { 40808, 35, /* 0.14 */ },
+ { 42707, 34, /* 0.13 */ },
+ { 44694, 33, /* 0.13 */ },
+ { 46776, 33, /* 0.13 */ },
+ { 48961, 32, /* 0.13 */ },
+ { 51258, 32, /* 0.13 */ },
+ { 53677, 31, /* 0.12 */ },
+ { 56230, 30, /* 0.12 */ },
+ { 58932, 30, /* 0.12 */ },
+ { 61799, 29, /* 0.12 */ },
+ { 64851, 28, /* 0.11 */ },
+ { 68113, 28, /* 0.11 */ },
+ { 71617, 27, /* 0.11 */ },
+ { 75401, 26, /* 0.10 */ },
+ { 79517, 26, /* 0.10 */ },
+ { 84035, 25, /* 0.10 */ },
+ { 89053, 24, /* 0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+ u32 ai;
+};
+
+static void hstcp_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+
+ ca->ai = 0;
+
+ /* Ensure the MD arithmetic works. This is somewhat pedantic,
+ * since I don't think we will see a cwnd this large. :) */
+ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
+ u32 in_flight, int good)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hstcp *ca = inet_csk_ca(sk);
+
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ } else {
+ /* Update AIMD parameters */
+ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ ca->ai < HSTCP_AIMD_MAX)
+ ca->ai++;
+ } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ ca->ai > 0)
+ ca->ai--;
+ }
+
+ /* Do additive increase */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ tp->snd_cwnd_cnt += ca->ai;
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt -= tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+static u32 hstcp_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct hstcp *ca = inet_csk_ca(sk);
+
+ /* Do multiplicative decrease */
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_highspeed = {
+ .init = hstcp_init,
+ .ssthresh = hstcp_ssthresh,
+ .cong_avoid = hstcp_cong_avoid,
+ .min_cwnd = tcp_reno_min_cwnd,
+
+ .owner = THIS_MODULE,
+ .name = "highspeed"
+};
+
+static int __init hstcp_register(void)
+{
+ BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_highspeed);
+}
+
+static void __exit hstcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_highspeed);
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 0000000..e47b379
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,298 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ * "H-TCP: TCP for high-speed and long-distance networks"
+ * Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
+#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
+#define BETA_MAX 102 /* 0.8 with shift << 7 */
+
+static int use_rtt_scaling = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+ u16 alpha; /* Fixed point arith, << 7 */
+ u8 beta; /* Fixed point arith, << 7 */
+ u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */
+ u8 ccount; /* Number of RTTs since last congestion event */
+ u8 undo_ccount;
+ u16 packetcount;
+ u32 minRTT;
+ u32 maxRTT;
+ u32 snd_cwnd_cnt2;
+
+ u32 undo_maxRTT;
+ u32 undo_old_maxB;
+
+ /* Bandwidth estimation */
+ u32 minB;
+ u32 maxB;
+ u32 old_maxB;
+ u32 Bi;
+ u32 lasttime;
+};
+
+static inline void htcp_reset(struct htcp *ca)
+{
+ ca->undo_ccount = ca->ccount;
+ ca->undo_maxRTT = ca->maxRTT;
+ ca->undo_old_maxB = ca->old_maxB;
+
+ ca->ccount = 0;
+ ca->snd_cwnd_cnt2 = 0;
+}
+
+static u32 htcp_cwnd_undo(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+ ca->ccount = ca->undo_ccount;
+ ca->maxRTT = ca->undo_maxRTT;
+ ca->old_maxB = ca->undo_old_maxB;
+ return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
+}
+
+static inline void measure_rtt(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 srtt = tp->srtt>>3;
+
+ /* keep track of minimum RTT seen so far, minRTT is zero at first */
+ if (ca->minRTT > srtt || !ca->minRTT)
+ ca->minRTT = srtt;
+
+ /* max RTT */
+ if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+ if (ca->maxRTT < ca->minRTT)
+ ca->maxRTT = ca->minRTT;
+ if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
+ ca->maxRTT = srtt;
+ }
+}
+
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 now = tcp_time_stamp;
+
+ /* achieved throughput calculations */
+ if (icsk->icsk_ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Disorder) {
+ ca->packetcount = 0;
+ ca->lasttime = now;
+ return;
+ }
+
+ ca->packetcount += pkts_acked;
+
+ if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
+ && now - ca->lasttime >= ca->minRTT
+ && ca->minRTT > 0) {
+ __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
+ if (ca->ccount <= 3) {
+ /* just after backoff */
+ ca->minB = ca->maxB = ca->Bi = cur_Bi;
+ } else {
+ ca->Bi = (3*ca->Bi + cur_Bi)/4;
+ if (ca->Bi > ca->maxB)
+ ca->maxB = ca->Bi;
+ if (ca->minB > ca->maxB)
+ ca->minB = ca->maxB;
+ }
+ ca->packetcount = 0;
+ ca->lasttime = now;
+ }
+}
+
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+ if (use_bandwidth_switch) {
+ u32 maxB = ca->maxB;
+ u32 old_maxB = ca->old_maxB;
+ ca->old_maxB = ca->maxB;
+
+ if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 0;
+ return;
+ }
+ }
+
+ if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
+ ca->beta = (minRTT<<7)/maxRTT;
+ if (ca->beta < BETA_MIN)
+ ca->beta = BETA_MIN;
+ else if (ca->beta > BETA_MAX)
+ ca->beta = BETA_MAX;
+ } else {
+ ca->beta = BETA_MIN;
+ ca->modeswitch = 1;
+ }
+}
+
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+ u32 minRTT = ca->minRTT;
+ u32 factor = 1;
+ u32 diff = ca->ccount * minRTT; /* time since last backoff */
+
+ if (diff > HZ) {
+ diff -= HZ;
+ factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
+ }
+
+ if (use_rtt_scaling && minRTT) {
+ u32 scale = (HZ<<3)/(10*minRTT);
+ scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
+ factor = (factor<<3)/scale;
+ if (!factor)
+ factor = 1;
+ }
+
+ ca->alpha = 2*factor*((1<<7)-ca->beta);
+ if (!ca->alpha)
+ ca->alpha = ALPHA_BASE;
+}
+
+/* After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from a consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void htcp_param_update(struct sock *sk)
+{
+ struct htcp *ca = inet_csk_ca(sk);
+ u32 minRTT = ca->minRTT;
+ u32 maxRTT = ca->maxRTT;
+
+ htcp_beta_update(ca, minRTT, maxRTT);
+ htcp_alpha_update(ca);
+
+ /* add slowly fading memory for maxRTT to accommodate routing changes etc */
+ if (minRTT > 0 && maxRTT > minRTT)
+ ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
+}
+
+static u32 htcp_recalc_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct htcp *ca = inet_csk_ca(sk);
+ htcp_param_update(sk);
+ return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+}
+
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+ u32 in_flight, int data_acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct htcp *ca = inet_csk_ca(sk);
+
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ } else {
+ measure_rtt(sk);
+
+ /* keep track of number of round-trip times since last backoff event */
+ if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
+ ca->ccount++;
+ ca->snd_cwnd_cnt2 = 0;
+ htcp_alpha_update(ca);
+ }
+
+ /* In dangerous area, increase slowly.
+ * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
+ */
+ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ ca->ccount++;
+ }
+ }
+}
+
+/* Lower bound on congestion window. */
+static u32 htcp_min_cwnd(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ return tp->snd_ssthresh;
+}
+
+
+static void htcp_init(struct sock *sk)
+{
+ struct htcp *ca = inet_csk_ca(sk);
+
+ memset(ca, 0, sizeof(struct htcp));
+ ca->alpha = ALPHA_BASE;
+ ca->beta = BETA_MIN;
+}
+
+static void htcp_state(struct sock *sk, u8 new_state)
+{
+ switch (new_state) {
+ case TCP_CA_CWR:
+ case TCP_CA_Recovery:
+ case TCP_CA_Loss:
+ htcp_reset(inet_csk_ca(sk));
+ break;
+ }
+}
+
+static struct tcp_congestion_ops htcp = {
+ .init = htcp_init,
+ .ssthresh = htcp_recalc_ssthresh,
+ .min_cwnd = htcp_min_cwnd,
+ .cong_avoid = htcp_cong_avoid,
+ .set_state = htcp_state,
+ .undo_cwnd = htcp_cwnd_undo,
+ .pkts_acked = measure_achieved_throughput,
+ .owner = THIS_MODULE,
+ .name = "htcp",
+};
+
+static int __init htcp_register(void)
+{
+ BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
+ BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
+ if (!use_bandwidth_switch)
+ htcp.pkts_acked = NULL;
+ return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+ tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 0000000..77add636
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,188 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ * for Heterogeneous Networks",
+ * International Journal on satellite Communications,
+ * September 2004
+ * Daniele Lacamera
+ * root at danielinux.net
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+ u8 hybla_en;
+ u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
+ u32 rho; /* Rho parameter, integer part */
+ u32 rho2; /* Rho * Rho, integer part */
+ u32 rho_3ls; /* Rho parameter, <<3 */
+ u32 rho2_7ls; /* Rho^2, <<7 */
+ u32 minrtt; /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
+ expressed in jiffies */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
+
+
+/* This is called to refresh values for hybla parameters */
+static inline void hybla_recalc_param (struct sock *sk)
+{
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
+ ca->rho = ca->rho_3ls >> 3;
+ ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+ ca->rho2 = ca->rho2_7ls >>7;
+}
+
+static void hybla_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
+
+ ca->rho = 0;
+ ca->rho2 = 0;
+ ca->rho_3ls = 0;
+ ca->rho2_7ls = 0;
+ ca->snd_cwnd_cents = 0;
+ ca->hybla_en = 1;
+ tp->snd_cwnd = 2;
+ tp->snd_cwnd_clamp = 65535;
+
+ /* 1st Rho measurement based on initial srtt */
+ hybla_recalc_param(sk);
+
+ /* set minimum rtt as this is the 1st ever seen */
+ ca->minrtt = tp->srtt;
+ tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct sock *sk, u8 ca_state)
+{
+ struct hybla *ca = inet_csk_ca(sk);
+ ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+ static const u32 fractions[] = {
+ 128, 139, 152, 165, 181, 197, 215, 234,
+ };
+
+ return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ * o Recalc Hybla parameters if min_rtt has changed
+ * o Give cwnd a new value based on the model proposed
+ * o remember increments <1
+ */
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+ u32 in_flight, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct hybla *ca = inet_csk_ca(sk);
+ u32 increment, odd, rho_fractions;
+ int is_slowstart = 0;
+
+ /* Recalculate rho only if this srtt is the lowest */
+ if (tp->srtt < ca->minrtt){
+ hybla_recalc_param(sk);
+ ca->minrtt = tp->srtt;
+ }
+
+ if (!ca->hybla_en)
+ return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
+
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (ca->rho == 0)
+ hybla_recalc_param(sk);
+
+ rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /*
+ * slow start
+ * INC = 2^RHO - 1
+ * This is done by splitting the rho parameter
+ * into 2 parts: an integer part and a fraction part.
+ * Inrement<<7 is estimated by doing:
+ * [2^(int+fract)]<<7
+ * that is equal to:
+ * (2^int) * [(2^fract) <<7]
+ * 2^int is straightly computed as 1<<int,
+ * while we will use hybla_slowstart_fraction_increment() to
+ * calculate 2^fract in a <<7 value.
+ */
+ is_slowstart = 1;
+ increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
+ - 128;
+ } else {
+ /*
+ * congestion avoidance
+ * INC = RHO^2 / W
+ * as long as increment is estimated as (rho<<7)/window
+ * it already is <<7 and we can easily count its fractions.
+ */
+ increment = ca->rho2_7ls / tp->snd_cwnd;
+ if (increment < 128)
+ tp->snd_cwnd_cnt++;
+ }
+
+ odd = increment % 128;
+ tp->snd_cwnd += increment >> 7;
+ ca->snd_cwnd_cents += odd;
+
+ /* check when fractions goes >=128 and increase cwnd by 1. */
+ while(ca->snd_cwnd_cents >= 128) {
+ tp->snd_cwnd++;
+ ca->snd_cwnd_cents -= 128;
+ tp->snd_cwnd_cnt = 0;
+ }
+
+ /* clamp down slowstart cwnd to ssthresh value. */
+ if (is_slowstart)
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla = {
+ .init = hybla_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .min_cwnd = tcp_reno_min_cwnd,
+ .cong_avoid = hybla_cong_avoid,
+ .set_state = hybla_state,
+
+ .owner = THIS_MODULE,
+ .name = "hybla"
+};
+
+static int __init hybla_register(void)
+{
+ BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504..29222b96 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
* Panu Kuhlberg: Experimental audit of TCP (re)transmission
* engine. Lots of bugs are found.
* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
- * Angelo Dell'Aera: TCP Westwood+ support
*/
#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
int sysctl_tcp_max_orphans = NR_FILE;
int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save;
-int sysctl_tcp_westwood;
-int sysctl_tcp_vegas_cong_avoid;
int sysctl_tcp_moderate_rcvbuf = 1;
-/* Default values of the Vegas variables, in fixed-point representation
- * with V_PARAM_SHIFT bits to the right of the binary point.
- */
-#define V_PARAM_SHIFT 1
-int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_bic = 1;
-int sysctl_tcp_bic_fast_convergence = 1;
-int sysctl_tcp_bic_low_window = 14;
-int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
-
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -129,20 +114,21 @@ int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
-static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
- struct sk_buff *skb)
+static inline void tcp_measure_rcv_mss(struct sock *sk,
+ const struct sk_buff *skb)
{
- unsigned int len, lss;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const unsigned int lss = icsk->icsk_ack.last_seg_size;
+ unsigned int len;
- lss = tp->ack.last_seg_size;
- tp->ack.last_seg_size = 0;
+ icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
len = skb->len;
- if (len >= tp->ack.rcv_mss) {
- tp->ack.rcv_mss = len;
+ if (len >= icsk->icsk_ack.rcv_mss) {
+ icsk->icsk_ack.rcv_mss = len;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -162,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
* tcp header plus fixed timestamp option length.
* Resulting "len" is MSS free of SACK jitter.
*/
- len -= tp->tcp_header_len;
- tp->ack.last_seg_size = len;
+ len -= tcp_sk(sk)->tcp_header_len;
+ icsk->icsk_ack.last_seg_size = len;
if (len == lss) {
- tp->ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = len;
return;
}
}
- tp->ack.pending |= TCP_ACK_PUSHED;
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
}
}
-static void tcp_incr_quickack(struct tcp_sock *tp)
+static void tcp_incr_quickack(struct sock *sk)
{
- unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks==0)
quickacks=2;
- if (quickacks > tp->ack.quick)
- tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ if (quickacks > icsk->icsk_ack.quick)
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct tcp_sock *tp)
+void tcp_enter_quickack_mode(struct sock *sk)
{
- tcp_incr_quickack(tp);
- tp->ack.pingpong = 0;
- tp->ack.ato = TCP_ATO_MIN;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
}
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
*/
-static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+static inline int tcp_in_quickack_mode(const struct sock *sk)
{
- return (tp->ack.quick && !tp->ack.pingpong);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
/* Buffer size and advertised window tuning.
@@ -239,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
*/
/* Slow part of check#2. */
-static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
- struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
+ const struct sk_buff *skb)
{
/* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -248,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
- return 2*tp->ack.rcv_mss;
+ return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
truesize >>= 1;
window >>= 1;
@@ -275,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
if (incr) {
tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
- tp->ack.quick |= 1;
+ inet_csk(sk)->icsk_ack.quick |= 1;
}
}
}
@@ -333,23 +322,15 @@ static void tcp_init_buffer_space(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-static void init_bictcp(struct tcp_sock *tp)
-{
- tp->bictcp.cnt = 0;
-
- tp->bictcp.last_max_cwnd = 0;
- tp->bictcp.last_cwnd = 0;
- tp->bictcp.last_stamp = 0;
-}
-
/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
int ofo_win = 0;
- tp->ack.quick = 0;
+ icsk->icsk_ack.quick = 0;
skb_queue_walk(&tp->out_of_order_queue, skb) {
ofo_win += skb->len;
@@ -370,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
app_win += ofo_win;
if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
app_win >>= 1;
- if (app_win > tp->ack.rcv_mss)
- app_win -= tp->ack.rcv_mss;
+ if (app_win > icsk->icsk_ack.rcv_mss)
+ app_win -= icsk->icsk_ack.rcv_mss;
app_win = max(app_win, 2U*tp->advmss);
if (!ofo_win)
@@ -439,11 +420,12 @@ new_measure:
tp->rcv_rtt_est.time = tcp_time_stamp;
}
-static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.rcv_tsecr &&
(TCP_SKB_CB(skb)->end_seq -
- TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}
@@ -516,41 +498,42 @@ new_measure:
*/
static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
u32 now;
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
- tcp_measure_rcv_mss(tp, skb);
+ tcp_measure_rcv_mss(sk, skb);
tcp_rcv_rtt_measure(tp);
now = tcp_time_stamp;
- if (!tp->ack.ato) {
+ if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(tp);
- tp->ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
- int m = now - tp->ack.lrcvtime;
+ int m = now - icsk->icsk_ack.lrcvtime;
if (m <= TCP_ATO_MIN/2) {
/* The fastest case is the first. */
- tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
- } else if (m < tp->ack.ato) {
- tp->ack.ato = (tp->ack.ato>>1) + m;
- if (tp->ack.ato > tp->rto)
- tp->ack.ato = tp->rto;
- } else if (m > tp->rto) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+ } else if (m < icsk->icsk_ack.ato) {
+ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+ if (icsk->icsk_ack.ato > icsk->icsk_rto)
+ icsk->icsk_ack.ato = icsk->icsk_rto;
+ } else if (m > icsk->icsk_rto) {
/* Too long gap. Apparently sender falled to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(tp);
+ tcp_incr_quickack(sk);
sk_stream_mem_reclaim(sk);
}
}
- tp->ack.lrcvtime = now;
+ icsk->icsk_ack.lrcvtime = now;
TCP_ECN_check_ce(tp, skb);
@@ -558,45 +541,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
tcp_grow_window(sk, tp, skb);
}
-/* When starting a new connection, pin down the current choice of
- * congestion algorithm.
- */
-void tcp_ca_init(struct tcp_sock *tp)
-{
- if (sysctl_tcp_westwood)
- tp->adv_cong = TCP_WESTWOOD;
- else if (sysctl_tcp_bic)
- tp->adv_cong = TCP_BIC;
- else if (sysctl_tcp_vegas_cong_avoid) {
- tp->adv_cong = TCP_VEGAS;
- tp->vegas.baseRTT = 0x7fffffff;
- tcp_vegas_enable(tp);
- }
-}
-
-/* Do RTT sampling needed for Vegas.
- * Basically we:
- * o min-filter RTT samples from within an RTT to get the current
- * propagation delay + queuing delay (we are min-filtering to try to
- * avoid the effects of delayed ACKs)
- * o min-filter RTT samples from a much longer window (forever for now)
- * to find the propagation delay (baseRTT)
- */
-static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
-{
- __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
-
- /* Filter to find propagation delay: */
- if (vrtt < tp->vegas.baseRTT)
- tp->vegas.baseRTT = vrtt;
-
- /* Find the min RTT during the last RTT to find
- * the current prop. delay + queuing delay:
- */
- tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
- tp->vegas.cntRTT++;
-}
-
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +550,12 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */
- if (tcp_vegas_enabled(tp))
- vegas_rtt_calc(tp, mrtt);
-
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
@@ -670,14 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
tp->rtt_seq = tp->snd_nxt;
}
- tcp_westwood_update_rtt(tp, tp->srtt >> 3);
+ if (icsk->icsk_ca_ops->rtt_sample)
+ icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static inline void tcp_set_rto(struct tcp_sock *tp)
+static inline void tcp_set_rto(struct sock *sk)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
*
* More seriously:
@@ -688,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some curcumstances.
*/
- tp->rto = (tp->srtt >> 3) + tp->rttvar;
+ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
@@ -700,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
-static inline void tcp_bound_rto(struct tcp_sock *tp)
+static inline void tcp_bound_rto(struct sock *sk)
{
- if (tp->rto > TCP_RTO_MAX)
- tp->rto = TCP_RTO_MAX;
+ if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
+ inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}
/* Save metrics learned by this TCP session.
@@ -721,9 +666,10 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst);
if (dst && (dst->flags&DST_HOST)) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int m;
- if (tp->backoff || !tp->srtt) {
+ if (icsk->icsk_backoff || !tp->srtt) {
/* This session failed to estimate rtt. Why?
* Probably, no packets returned in time.
* Reset our results.
@@ -772,7 +718,7 @@ void tcp_update_metrics(struct sock *sk)
tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
} else if (tp->snd_cwnd > tp->snd_ssthresh &&
- tp->ca_state == TCP_CA_Open) {
+ icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
if (!dst_metric_locked(dst, RTAX_SSTHRESH))
dst->metrics[RTAX_SSTHRESH-1] =
@@ -805,10 +751,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd) {
- if (tp->mss_cache_std > 1460)
+ if (tp->mss_cache > 1460)
cwnd = 2;
else
- cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+ cwnd = (tp->mss_cache > 1095) ? 3 : 4;
}
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -866,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk)
tp->mdev = dst_metric(dst, RTAX_RTTVAR);
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
}
- tcp_set_rto(tp);
- tcp_bound_rto(tp);
- if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+ tcp_set_rto(sk);
+ tcp_bound_rto(sk);
+ if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
goto reset;
tp->snd_cwnd = tcp_init_cwnd(tp, dst);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -882,12 +828,14 @@ reset:
if (!tp->rx_opt.saw_tstamp && tp->srtt) {
tp->srtt = 0;
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
- tp->rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
}
}
-static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+static void tcp_update_reordering(struct sock *sk, const int metric,
+ const int ts)
{
+ struct tcp_sock *tp = tcp_sk(sk);
if (metric > tp->reordering) {
tp->reordering = min(TCP_MAX_REORDERING, metric);
@@ -902,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
#if FASTRETRANS_DEBUG > 1
printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
- tp->rx_opt.sack_ok, tp->ca_state,
+ tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
tp->fackets_out,
tp->sacked_out,
@@ -964,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -974,14 +923,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int flag = 0;
int i;
- /* So, SACKs for already sent large segments will be lost.
- * Not good, but alternative is to resegment the queue. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache_std;
- }
-
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
@@ -1029,20 +970,40 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
flag |= FLAG_DATA_LOST;
sk_stream_for_retrans_queue(skb, sk) {
- u8 sacked = TCP_SKB_CB(skb)->sacked;
- int in_sack;
+ int in_sack, pcount;
+ u8 sacked;
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
- if(!before(TCP_SKB_CB(skb)->seq, end_seq))
+ if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- fack_count += tcp_skb_pcount(skb);
+ pcount = tcp_skb_pcount(skb);
+
+ if (pcount > 1 &&
+ (after(start_seq, TCP_SKB_CB(skb)->seq) ||
+ before(end_seq, TCP_SKB_CB(skb)->end_seq))) {
+ unsigned int pkt_len;
+
+ if (after(start_seq, TCP_SKB_CB(skb)->seq))
+ pkt_len = (start_seq -
+ TCP_SKB_CB(skb)->seq);
+ else
+ pkt_len = (end_seq -
+ TCP_SKB_CB(skb)->seq);
+ if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+ break;
+ pcount = tcp_skb_pcount(skb);
+ }
+
+ fack_count += pcount;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
+ sacked = TCP_SKB_CB(skb)->sacked;
+
/* Account D-SACK for retransmitted packet. */
if ((dup_sack && in_sack) &&
(sacked & TCPCB_RETRANS) &&
@@ -1129,7 +1090,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
* we have to account for reordering! Ugly,
* but should help.
*/
- if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
@@ -1142,7 +1103,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
- tp->mss_cache_std))) {
+ tp->mss_cache))) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1158,8 +1119,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
tp->left_out = tp->sacked_out + tp->lost_out;
- if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
- tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+ if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
BUG_TRAP((int)tp->sacked_out >= 0);
@@ -1176,17 +1137,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
*/
void tcp_enter_frto(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->frto_counter = 1;
- if (tp->ca_state <= TCP_CA_Disorder ||
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
tp->snd_una == tp->high_seq ||
- (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- if (!tcp_westwood_ssthresh(tp))
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_FRTO);
}
/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1203,7 +1165,7 @@ void tcp_enter_frto(struct sock *sk)
}
tcp_sync_left_out(tp);
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
tp->frto_highmark = tp->snd_nxt;
}
@@ -1249,11 +1211,9 @@ static void tcp_enter_frto_loss(struct sock *sk)
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
-
- init_bictcp(tp);
}
void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1275,15 +1235,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
*/
void tcp_enter_loss(struct sock *sk, int how)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt = 0;
/* Reduce ssthresh if it has not yet been made inside this window. */
- if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
- (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+ (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tcp_ca_event(sk, CA_EVENT_LOSS);
}
tp->snd_cwnd = 1;
tp->snd_cwnd_cnt = 0;
@@ -1314,12 +1276,12 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
}
-static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+static int tcp_check_sack_reneging(struct sock *sk)
{
struct sk_buff *skb;
@@ -1331,12 +1293,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
*/
if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
tcp_enter_loss(sk, 1);
- tp->retransmits++;
+ icsk->icsk_retransmits++;
tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ icsk->icsk_rto, TCP_RTO_MAX);
return 1;
}
return 0;
@@ -1347,15 +1311,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
-static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+ return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
}
static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
{
return tp->packets_out &&
- tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+ tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
}
/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1489,8 +1453,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
-static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
+ struct tcp_sock *tp = tcp_sk(sk);
u32 holes;
holes = max(tp->lost_out, 1U);
@@ -1498,16 +1463,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(tp, tp->packets_out+addend, 0);
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
}
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct tcp_sock *tp)
+static void tcp_add_reno_sack(struct sock *sk)
{
+ struct tcp_sock *tp = tcp_sk(sk);
tp->sacked_out++;
- tcp_check_reno_reordering(tp, 0);
+ tcp_check_reno_reordering(sk, 0);
tcp_sync_left_out(tp);
}
@@ -1522,7 +1488,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
else
tp->sacked_out -= acked-1;
}
- tcp_check_reno_reordering(tp, acked);
+ tcp_check_reno_reordering(sk, acked);
tcp_sync_left_out(tp);
}
@@ -1575,7 +1541,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
struct sk_buff *skb;
sk_stream_for_retrans_queue(skb, sk) {
- if (tcp_skb_timedout(tp, skb) &&
+ if (tcp_skb_timedout(sk, skb) &&
!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -1596,28 +1562,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
}
/* Decrease cwnd each second ack. */
-
-static void tcp_cwnd_down(struct tcp_sock *tp)
+static void tcp_cwnd_down(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
int decr = tp->snd_cwnd_cnt + 1;
- __u32 limit;
-
- /*
- * TCP Westwood
- * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
- * in packets we use mss_cache). If sysctl_tcp_westwood is off
- * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
- * still used as usual. It prevents other strange cases in which
- * BWE*RTTmin could assume value 0. It should not happen but...
- */
-
- if (!(limit = tcp_westwood_bw_rttmin(tp)))
- limit = tp->snd_ssthresh/2;
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
- if (decr && tp->snd_cwnd > limit)
+ if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
tp->snd_cwnd -= decr;
tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1651,11 +1605,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
#define DBGUNDO(x...) do { } while (0)
#endif
-static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+static void tcp_undo_cwr(struct sock *sk, const int undo)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (tp->prior_ssthresh) {
- if (tcp_is_bic(tp))
- tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ca_ops->undo_cwnd)
+ tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
else
tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
@@ -1683,9 +1641,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
- DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
- tcp_undo_cwr(tp, 1);
- if (tp->ca_state == TCP_CA_Loss)
+ DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+ tcp_undo_cwr(sk, 1);
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
else
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1698,7 +1656,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
tcp_moderate_cwnd(tp);
return 1;
}
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
return 0;
}
@@ -1707,7 +1665,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
{
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, tp, "D-SACK");
- tcp_undo_cwr(tp, 1);
+ tcp_undo_cwr(sk, 1);
tp->undo_marker = 0;
NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
}
@@ -1728,10 +1686,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
- tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+ tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
DBGUNDO(sk, tp, "Hoe");
- tcp_undo_cwr(tp, 0);
+ tcp_undo_cwr(sk, 0);
NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
/* So... Do not make Hoe's retransmit yet.
@@ -1754,24 +1712,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
- tcp_undo_cwr(tp, 1);
+ tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
- tp->retransmits = 0;
+ inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0;
if (!IsReno(tp))
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
return 1;
}
return 0;
}
-static inline void tcp_complete_cwr(struct tcp_sock *tp)
+static inline void tcp_complete_cwr(struct sock *sk)
{
- if (tcp_westwood_cwnd(tp))
- tp->snd_ssthresh = tp->snd_cwnd;
- else
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1782,21 +1739,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
- tcp_enter_cwr(tp);
+ tcp_enter_cwr(sk);
- if (tp->ca_state != TCP_CA_CWR) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
if (tp->left_out || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
- if (tp->ca_state != state) {
- tcp_set_ca_state(tp, state);
+ if (inet_csk(sk)->icsk_ca_state != state) {
+ tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
tcp_moderate_cwnd(tp);
} else {
- tcp_cwnd_down(tp);
+ tcp_cwnd_down(sk);
}
}
@@ -1815,6 +1772,7 @@ static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
int prior_packets, int flag)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
@@ -1832,13 +1790,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+ if (tp->sacked_out && tcp_check_sack_reneging(sk))
return;
/* C. Process data loss notification, provided it is valid. */
if ((flag&FLAG_DATA_LOST) &&
before(tp->snd_una, tp->high_seq) &&
- tp->ca_state != TCP_CA_Open &&
+ icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1849,14 +1807,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
- if (tp->ca_state == TCP_CA_Open) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
if (!sysctl_tcp_frto)
BUG_TRAP(tp->retrans_out == 0);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
- switch (tp->ca_state) {
+ switch (icsk->icsk_ca_state) {
case TCP_CA_Loss:
- tp->retransmits = 0;
+ icsk->icsk_retransmits = 0;
if (tcp_try_undo_recovery(sk, tp))
return;
break;
@@ -1865,8 +1823,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
- tcp_complete_cwr(tp);
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_complete_cwr(sk);
+ tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -1877,7 +1835,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
* catching for all duplicate ACKs. */
IsReno(tp) || tp->snd_una != tp->high_seq) {
tp->undo_marker = 0;
- tcp_set_ca_state(tp, TCP_CA_Open);
+ tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
@@ -1886,17 +1844,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk, tp))
return;
- tcp_complete_cwr(tp);
+ tcp_complete_cwr(sk);
break;
}
}
/* F. Process state. */
- switch (tp->ca_state) {
+ switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (prior_snd_una == tp->snd_una) {
if (IsReno(tp) && is_dupack)
- tcp_add_reno_sack(tp);
+ tcp_add_reno_sack(sk);
} else {
int acked = prior_packets - tp->packets_out;
if (IsReno(tp))
@@ -1906,13 +1864,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
break;
case TCP_CA_Loss:
if (flag&FLAG_DATA_ACKED)
- tp->retransmits = 0;
+ icsk->icsk_retransmits = 0;
if (!tcp_try_undo_loss(sk, tp)) {
tcp_moderate_cwnd(tp);
tcp_xmit_retransmit_queue(sk);
return;
}
- if (tp->ca_state != TCP_CA_Open)
+ if (icsk->icsk_ca_state != TCP_CA_Open)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
@@ -1920,10 +1878,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
if (tp->snd_una != prior_snd_una)
tcp_reset_reno_sack(tp);
if (is_dupack)
- tcp_add_reno_sack(tp);
+ tcp_add_reno_sack(sk);
}
- if (tp->ca_state == TCP_CA_Disorder)
+ if (icsk->icsk_ca_state == TCP_CA_Disorder)
tcp_try_undo_dsack(sk, tp);
if (!tcp_time_to_recover(sk, tp)) {
@@ -1943,30 +1901,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
tp->undo_marker = tp->snd_una;
tp->undo_retrans = tp->retrans_out;
- if (tp->ca_state < TCP_CA_CWR) {
+ if (icsk->icsk_ca_state < TCP_CA_CWR) {
if (!(flag&FLAG_ECE))
- tp->prior_ssthresh = tcp_current_ssthresh(tp);
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
TCP_ECN_queue_cwr(tp);
}
tp->snd_cwnd_cnt = 0;
- tcp_set_ca_state(tp, TCP_CA_Recovery);
+ tcp_set_ca_state(sk, TCP_CA_Recovery);
}
if (is_dupack || tcp_head_timedout(sk, tp))
tcp_update_scoreboard(sk, tp);
- tcp_cwnd_down(tp);
+ tcp_cwnd_down(sk);
tcp_xmit_retransmit_queue(sk);
}
/* Read draft-ietf-tcplw-high-performance before mucking
* with this code. (Superceeds RFC1323)
*/
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
{
- __u32 seq_rtt;
-
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
@@ -1982,14 +1938,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
* answer arrives rto becomes 120 seconds! If at least one of segments
* in window is lost... Voila. --ANK (010210)
*/
- seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(tp, seq_rtt);
- tcp_set_rto(tp);
- tp->backoff = 0;
- tcp_bound_rto(tp);
+ struct tcp_sock *tp = tcp_sk(sk);
+ const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+ tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_set_rto(sk);
+ inet_csk(sk)->icsk_backoff = 0;
+ tcp_bound_rto(sk);
}
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -2003,336 +1960,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
- tcp_rtt_estimator(tp, seq_rtt);
- tcp_set_rto(tp);
- tp->backoff = 0;
- tcp_bound_rto(tp);
+ tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_set_rto(sk);
+ inet_csk(sk)->icsk_backoff = 0;
+ tcp_bound_rto(sk);
}
-static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
- int flag, s32 seq_rtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+ const s32 seq_rtt, u32 *usrtt)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(tp, flag);
+ tcp_ack_saw_tstamp(sk, usrtt, flag);
else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(tp, seq_rtt, flag);
-}
-
-/*
- * Compute congestion window to use.
- *
- * This is from the implementation of BICTCP in
- * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
- * "Binary Increase Congestion Control for Fast, Long Distance
- * Networks" in InfoComm 2004
- * Available from:
- * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
- *
- * Unless BIC is enabled and congestion window is large
- * this behaves the same as the original Reno.
- */
-static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
-{
- /* orignal Reno behaviour */
- if (!tcp_is_bic(tp))
- return tp->snd_cwnd;
-
- if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
- (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
- return tp->bictcp.cnt;
-
- tp->bictcp.last_cwnd = tp->snd_cwnd;
- tp->bictcp.last_stamp = tcp_time_stamp;
-
- /* start off normal */
- if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
- tp->bictcp.cnt = tp->snd_cwnd;
-
- /* binary increase */
- else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
- __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
- / BICTCP_B;
-
- if (dist > BICTCP_MAX_INCREMENT)
- /* linear increase */
- tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
- else if (dist <= 1U)
- /* binary search increase */
- tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
- / BICTCP_B;
- else
- /* binary search increase */
- tp->bictcp.cnt = tp->snd_cwnd / dist;
- } else {
- /* slow start amd linear increase */
- if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
- /* slow start */
- tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
- / BICTCP_B;
- else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
- + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
- /* slow start */
- tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
- / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
- else
- /* linear increase */
- tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
- }
- return tp->bictcp.cnt;
-}
-
-/* This is Jacobson's slow start and congestion avoidance.
- * SIGCOMM '88, p. 328.
- */
-static inline void reno_cong_avoid(struct tcp_sock *tp)
-{
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt=0;
- } else
- tp->snd_cwnd_cnt++;
- }
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
-/* This is based on the congestion detection/avoidance scheme described in
- * Lawrence S. Brakmo and Larry L. Peterson.
- * "TCP Vegas: End to end congestion avoidance on a global internet."
- * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
- * October 1995. Available from:
- * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
- *
- * See http://www.cs.arizona.edu/xkernel/ for their implementation.
- * The main aspects that distinguish this implementation from the
- * Arizona Vegas implementation are:
- * o We do not change the loss detection or recovery mechanisms of
- * Linux in any way. Linux already recovers from losses quite well,
- * using fine-grained timers, NewReno, and FACK.
- * o To avoid the performance penalty imposed by increasing cwnd
- * only every-other RTT during slow start, we increase during
- * every RTT during slow start, just like Reno.
- * o Largely to allow continuous cwnd growth during slow start,
- * we use the rate at which ACKs come back as the "actual"
- * rate, rather than the rate at which data is sent.
- * o To speed convergence to the right rate, we set the cwnd
- * to achieve the right ("actual") rate when we exit slow start.
- * o To filter out the noise caused by delayed ACKs, we use the
- * minimum RTT sample observed during the last RTT to calculate
- * the actual rate.
- * o When the sender re-starts from idle, it waits until it has
- * received ACKs for an entire flight of new data before making
- * a cwnd adjustment decision. The original Vegas implementation
- * assumed senders never went idle.
- */
-static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
-{
- /* The key players are v_beg_snd_una and v_beg_snd_nxt.
- *
- * These are so named because they represent the approximate values
- * of snd_una and snd_nxt at the beginning of the current RTT. More
- * precisely, they represent the amount of data sent during the RTT.
- * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
- * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
- * bytes of data have been ACKed during the course of the RTT, giving
- * an "actual" rate of:
- *
- * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
- *
- * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
- * because delayed ACKs can cover more than one segment, so they
- * don't line up nicely with the boundaries of RTTs.
- *
- * Another unfortunate fact of life is that delayed ACKs delay the
- * advance of the left edge of our send window, so that the number
- * of bytes we send in an RTT is often less than our cwnd will allow.
- * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
- */
-
- if (after(ack, tp->vegas.beg_snd_nxt)) {
- /* Do the Vegas once-per-RTT cwnd adjustment. */
- u32 old_wnd, old_snd_cwnd;
-
-
- /* Here old_wnd is essentially the window of data that was
- * sent during the previous RTT, and has all
- * been acknowledged in the course of the RTT that ended
- * with the ACK we just received. Likewise, old_snd_cwnd
- * is the cwnd during the previous RTT.
- */
- old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
- tp->mss_cache_std;
- old_snd_cwnd = tp->vegas.beg_snd_cwnd;
-
- /* Save the extent of the current window so we can use this
- * at the end of the next RTT.
- */
- tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
- tp->vegas.beg_snd_nxt = tp->snd_nxt;
- tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
-
- /* Take into account the current RTT sample too, to
- * decrease the impact of delayed acks. This double counts
- * this sample since we count it for the next window as well,
- * but that's not too awful, since we're taking the min,
- * rather than averaging.
- */
- vegas_rtt_calc(tp, seq_rtt);
-
- /* We do the Vegas calculations only if we got enough RTT
- * samples that we can be reasonably sure that we got
- * at least one RTT sample that wasn't from a delayed ACK.
- * If we only had 2 samples total,
- * then that means we're getting only 1 ACK per RTT, which
- * means they're almost certainly delayed ACKs.
- * If we have 3 samples, we should be OK.
- */
-
- if (tp->vegas.cntRTT <= 2) {
- /* We don't have enough RTT samples to do the Vegas
- * calculation, so we'll behave like Reno.
- */
- if (tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd++;
- } else {
- u32 rtt, target_cwnd, diff;
-
- /* We have enough RTT samples, so, using the Vegas
- * algorithm, we determine if we should increase or
- * decrease cwnd, and by how much.
- */
-
- /* Pluck out the RTT we are using for the Vegas
- * calculations. This is the min RTT seen during the
- * last RTT. Taking the min filters out the effects
- * of delayed ACKs, at the cost of noticing congestion
- * a bit later.
- */
- rtt = tp->vegas.minRTT;
-
- /* Calculate the cwnd we should have, if we weren't
- * going too fast.
- *
- * This is:
- * (actual rate in segments) * baseRTT
- * We keep it as a fixed point number with
- * V_PARAM_SHIFT bits to the right of the binary point.
- */
- target_cwnd = ((old_wnd * tp->vegas.baseRTT)
- << V_PARAM_SHIFT) / rtt;
-
- /* Calculate the difference between the window we had,
- * and the window we would like to have. This quantity
- * is the "Diff" from the Arizona Vegas papers.
- *
- * Again, this is a fixed point number with
- * V_PARAM_SHIFT bits to the right of the binary
- * point.
- */
- diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
-
- if (tp->snd_cwnd < tp->snd_ssthresh) {
- /* Slow start. */
- if (diff > sysctl_tcp_vegas_gamma) {
- /* Going too fast. Time to slow down
- * and switch to congestion avoidance.
- */
- tp->snd_ssthresh = 2;
-
- /* Set cwnd to match the actual rate
- * exactly:
- * cwnd = (actual rate) * baseRTT
- * Then we add 1 because the integer
- * truncation robs us of full link
- * utilization.
- */
- tp->snd_cwnd = min(tp->snd_cwnd,
- (target_cwnd >>
- V_PARAM_SHIFT)+1);
-
- }
- } else {
- /* Congestion avoidance. */
- u32 next_snd_cwnd;
-
- /* Figure out where we would like cwnd
- * to be.
- */
- if (diff > sysctl_tcp_vegas_beta) {
- /* The old window was too fast, so
- * we slow down.
- */
- next_snd_cwnd = old_snd_cwnd - 1;
- } else if (diff < sysctl_tcp_vegas_alpha) {
- /* We don't have enough extra packets
- * in the network, so speed up.
- */
- next_snd_cwnd = old_snd_cwnd + 1;
- } else {
- /* Sending just as fast as we
- * should be.
- */
- next_snd_cwnd = old_snd_cwnd;
- }
-
- /* Adjust cwnd upward or downward, toward the
- * desired value.
- */
- if (next_snd_cwnd > tp->snd_cwnd)
- tp->snd_cwnd++;
- else if (next_snd_cwnd < tp->snd_cwnd)
- tp->snd_cwnd--;
- }
- }
-
- /* Wipe the slate clean for the next RTT. */
- tp->vegas.cntRTT = 0;
- tp->vegas.minRTT = 0x7fffffff;
- }
-
- /* The following code is executed for every ack we receive,
- * except for conditions checked in should_advance_cwnd()
- * before the call to tcp_cong_avoid(). Mainly this means that
- * we only execute this code if the ack actually acked some
- * data.
- */
-
- /* If we are in slow start, increase our cwnd in response to this ACK.
- * (If we are not in slow start then we are in congestion avoidance,
- * and adjust our congestion window only once per RTT. See the code
- * above.)
- */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tp->snd_cwnd++;
-
- /* to keep cwnd from growing without bound */
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
- /* Make sure that we are never so timid as to reduce our cwnd below
- * 2 MSS.
- *
- * Going below 2 MSS would risk huge delayed ACKs from our receiver.
- */
- tp->snd_cwnd = max(tp->snd_cwnd, 2U);
-
- tp->snd_cwnd_stamp = tcp_time_stamp;
+ tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
}
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
+static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+ u32 in_flight, int good)
{
- if (tcp_vegas_enabled(tp))
- vegas_cong_avoid(tp, ack, seq_rtt);
- else
- reno_cong_avoid(tp);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
+ tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
}
/* Restart timer after forward progress on connection.
@@ -2342,21 +1992,12 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
{
if (!tp->packets_out) {
- tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
}
-/* There is one downside to this scheme. Although we keep the
- * ACK clock ticking, adjusting packet counters and advancing
- * congestion window, we do not liberate socket send buffer
- * space.
- *
- * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
- * then making a write space wakeup callback is a possible
- * future enhancement. WARNING: it is not trivial to make.
- */
static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
__u32 now, __s32 *seq_rtt)
{
@@ -2415,13 +2056,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
__u32 now = tcp_time_stamp;
int acked = 0;
__s32 seq_rtt = -1;
+ struct timeval usnow;
+ u32 pkts_acked = 0;
+
+ if (seq_usrtt)
+ do_gettimeofday(&usnow);
while ((skb = skb_peek(&sk->sk_write_queue)) &&
skb != sk->sk_send_head) {
@@ -2433,7 +2079,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
* the other end.
*/
if (after(scb->end_seq, tp->snd_una)) {
- if (tcp_skb_pcount(skb) > 1)
+ if (tcp_skb_pcount(skb) > 1 &&
+ after(tp->snd_una, scb->seq))
acked |= tcp_tso_acked(sk, skb,
now, &seq_rtt);
break;
@@ -2448,6 +2095,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
*/
if (!(scb->flags & TCPCB_FLAG_SYN)) {
acked |= FLAG_DATA_ACKED;
+ ++pkts_acked;
} else {
acked |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
@@ -2461,6 +2109,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
seq_rtt = -1;
} else if (seq_rtt < 0)
seq_rtt = now - scb->when;
+ if (seq_usrtt) {
+ struct timeval tv;
+
+ skb_get_timestamp(skb, &tv);
+ *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
+ + (usnow.tv_usec - tv.tv_usec);
+ }
+
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_LOST)
@@ -2474,13 +2130,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
seq_rtt = now - scb->when;
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
}
if (acked&FLAG_ACKED) {
- tcp_ack_update_rtt(tp, acked, seq_rtt);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
tcp_ack_packets_out(sk, tp);
+
+ if (icsk->icsk_ca_ops->pkts_acked)
+ icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
}
#if FASTRETRANS_DEBUG > 0
@@ -2488,19 +2148,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
if (!tp->packets_out && tp->rx_opt.sack_ok) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
- tp->lost_out, tp->ca_state);
+ tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
printk(KERN_DEBUG "Leak s=%u %d\n",
- tp->sacked_out, tp->ca_state);
+ tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
printk(KERN_DEBUG "Leak r=%u %d\n",
- tp->retrans_out, tp->ca_state);
+ tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
@@ -2511,40 +2172,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
static void tcp_ack_probe(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
/* Was it a usable window open? */
if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
tp->snd_una + tp->snd_wnd)) {
- tp->backoff = 0;
- tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+ icsk->icsk_backoff = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
- tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+ TCP_RTO_MAX);
}
}
-static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- tp->ca_state != TCP_CA_Open);
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
}
-static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
- !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+ !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
-static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
- u32 ack_seq, u32 nwin)
+static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+ const u32 ack_seq, const u32 nwin)
{
return (after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
@@ -2624,266 +2288,17 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
tp->frto_counter = (tp->frto_counter + 1) % 3;
}
-/*
- * TCP Westwood+
- */
-
-/*
- * @init_westwood
- * This function initializes fields used in TCP Westwood+. We can't
- * get no information about RTTmin at this time so we simply set it to
- * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
- * since in this way we're sure it will be updated in a consistent
- * way as soon as possible. It will reasonably happen within the first
- * RTT period of the connection lifetime.
- */
-
-static void init_westwood(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->westwood.bw_ns_est = 0;
- tp->westwood.bw_est = 0;
- tp->westwood.accounted = 0;
- tp->westwood.cumul_ack = 0;
- tp->westwood.rtt_win_sx = tcp_time_stamp;
- tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
- tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
- tp->westwood.snd_una = tp->snd_una;
-}
-
-/*
- * @westwood_do_filter
- * Low-pass filter. Implemented using constant coeffients.
- */
-
-static inline __u32 westwood_do_filter(__u32 a, __u32 b)
-{
- return (((7 * a) + b) >> 3);
-}
-
-static void westwood_filter(struct sock *sk, __u32 delta)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->westwood.bw_ns_est =
- westwood_do_filter(tp->westwood.bw_ns_est,
- tp->westwood.bk / delta);
- tp->westwood.bw_est =
- westwood_do_filter(tp->westwood.bw_est,
- tp->westwood.bw_ns_est);
-}
-
-/*
- * @westwood_update_rttmin
- * It is used to update RTTmin. In this case we MUST NOT use
- * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
- */
-
-static inline __u32 westwood_update_rttmin(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- __u32 rttmin = tp->westwood.rtt_min;
-
- if (tp->westwood.rtt != 0 &&
- (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
- rttmin = tp->westwood.rtt;
-
- return rttmin;
-}
-
-/*
- * @westwood_acked
- * Evaluate increases for dk.
- */
-
-static inline __u32 westwood_acked(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
-
- return tp->snd_una - tp->westwood.snd_una;
-}
-
-/*
- * @westwood_new_window
- * It evaluates if we are receiving data inside the same RTT window as
- * when we started.
- * Return value:
- * It returns 0 if we are still evaluating samples in the same RTT
- * window, 1 if the sample has to be considered in the next window.
- */
-
-static int westwood_new_window(const struct sock *sk)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- __u32 left_bound;
- __u32 rtt;
- int ret = 0;
-
- left_bound = tp->westwood.rtt_win_sx;
- rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
-
- /*
- * A RTT-window has passed. Be careful since if RTT is less than
- * 50ms we don't filter but we continue 'building the sample'.
- * This minimum limit was choosen since an estimation on small
- * time intervals is better to avoid...
- * Obvioulsy on a LAN we reasonably will always have
- * right_bound = left_bound + WESTWOOD_RTT_MIN
- */
-
- if ((left_bound + rtt) < tcp_time_stamp)
- ret = 1;
-
- return ret;
-}
-
-/*
- * @westwood_update_window
- * It updates RTT evaluation window if it is the right moment to do
- * it. If so it calls filter for evaluating bandwidth.
- */
-
-static void __westwood_update_window(struct sock *sk, __u32 now)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- __u32 delta = now - tp->westwood.rtt_win_sx;
-
- if (delta) {
- if (tp->westwood.rtt)
- westwood_filter(sk, delta);
-
- tp->westwood.bk = 0;
- tp->westwood.rtt_win_sx = tcp_time_stamp;
- }
-}
-
-
-static void westwood_update_window(struct sock *sk, __u32 now)
-{
- if (westwood_new_window(sk))
- __westwood_update_window(sk, now);
-}
-
-/*
- * @__tcp_westwood_fast_bw
- * It is called when we are in fast path. In particular it is called when
- * header prediction is successfull. In such case infact update is
- * straight forward and doesn't need any particular care.
- */
-
-static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- westwood_update_window(sk, tcp_time_stamp);
-
- tp->westwood.bk += westwood_acked(sk);
- tp->westwood.snd_una = tp->snd_una;
- tp->westwood.rtt_min = westwood_update_rttmin(sk);
-}
-
-static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
- if (tcp_is_westwood(tcp_sk(sk)))
- __tcp_westwood_fast_bw(sk, skb);
-}
-
-
-/*
- * @westwood_dupack_update
- * It updates accounted and cumul_ack when receiving a dupack.
- */
-
-static void westwood_dupack_update(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->westwood.accounted += tp->mss_cache_std;
- tp->westwood.cumul_ack = tp->mss_cache_std;
-}
-
-static inline int westwood_may_change_cumul(struct tcp_sock *tp)
-{
- return (tp->westwood.cumul_ack > tp->mss_cache_std);
-}
-
-static inline void westwood_partial_update(struct tcp_sock *tp)
-{
- tp->westwood.accounted -= tp->westwood.cumul_ack;
- tp->westwood.cumul_ack = tp->mss_cache_std;
-}
-
-static inline void westwood_complete_update(struct tcp_sock *tp)
-{
- tp->westwood.cumul_ack -= tp->westwood.accounted;
- tp->westwood.accounted = 0;
-}
-
-/*
- * @westwood_acked_count
- * This function evaluates cumul_ack for evaluating dk in case of
- * delayed or partial acks.
- */
-
-static inline __u32 westwood_acked_count(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->westwood.cumul_ack = westwood_acked(sk);
-
- /* If cumul_ack is 0 this is a dupack since it's not moving
- * tp->snd_una.
- */
- if (!(tp->westwood.cumul_ack))
- westwood_dupack_update(sk);
-
- if (westwood_may_change_cumul(tp)) {
- /* Partial or delayed ack */
- if (tp->westwood.accounted >= tp->westwood.cumul_ack)
- westwood_partial_update(tp);
- else
- westwood_complete_update(tp);
- }
-
- tp->westwood.snd_una = tp->snd_una;
-
- return tp->westwood.cumul_ack;
-}
-
-
-/*
- * @__tcp_westwood_slow_bw
- * It is called when something is going wrong..even if there could
- * be no problems! Infact a simple delayed packet may trigger a
- * dupack. But we need to be careful in such case.
- */
-
-static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- westwood_update_window(sk, tcp_time_stamp);
-
- tp->westwood.bk += westwood_acked_count(sk);
- tp->westwood.rtt_min = westwood_update_rttmin(sk);
-}
-
-static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
- if (tcp_is_westwood(tcp_sk(sk)))
- __tcp_westwood_slow_bw(sk, skb);
-}
-
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_snd_una = tp->snd_una;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
s32 seq_rtt;
+ s32 seq_usrtt = 0;
int prior_packets;
/* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2317,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
*/
tcp_update_wl(tp, ack, ack_seq);
tp->snd_una = ack;
- tcp_westwood_fast_bw(sk, skb);
flag |= FLAG_WIN_UPDATE;
+ tcp_ca_event(sk, CA_EVENT_FAST_ACK);
+
NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
} else {
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2336,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
flag |= FLAG_ECE;
- tcp_westwood_slow_bw(sk,skb);
+ tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
/* We passed data and got it acked, remove any soft error
@@ -2935,22 +2351,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+ flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
+ icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
- if (tcp_ack_is_dubious(tp, flag)) {
+ if (tcp_ack_is_dubious(sk, flag)) {
/* Advanve CWND, if state allows this. */
- if ((flag & FLAG_DATA_ACKED) &&
- (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
- tcp_may_raise_cwnd(tp, flag))
- tcp_cong_avoid(tp, ack, seq_rtt);
+ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
+ tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
} else {
- if ((flag & FLAG_DATA_ACKED) &&
- (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
- tcp_cong_avoid(tp, ack, seq_rtt);
+ if ((flag & FLAG_DATA_ACKED))
+ tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2959,7 +2373,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
return 1;
no_queue:
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3137,8 +2551,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
-static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th = skb->h.th;
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -3153,14 +2568,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+ (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
-static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
- !tcp_disordered_ack(tp, skb));
+ !tcp_disordered_ack(sk, skb));
}
/* Check segment sequence number for validity.
@@ -3223,7 +2639,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
@@ -3233,7 +2649,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- tp->ack.pingpong = 1;
+ inet_csk(sk)->icsk_ack.pingpong = 1;
break;
case TCP_CLOSE_WAIT:
@@ -3331,7 +2747,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(tp);
+ tcp_enter_quickack_mode(sk);
if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -3439,7 +2855,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+ if (skb_queue_empty(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
@@ -3490,7 +2906,7 @@ static void tcp_ofo_queue(struct sock *sk)
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received \n");
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
}
@@ -3498,7 +2914,7 @@ static void tcp_ofo_queue(struct sock *sk)
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &tp->out_of_order_queue);
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if(skb->h.th->fin)
@@ -3572,14 +2988,14 @@ queue_and_out:
if(th->fin)
tcp_fin(skb, sk, th);
- if (skb_queue_len(&tp->out_of_order_queue)) {
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (!skb_queue_len(&tp->out_of_order_queue))
- tp->ack.pingpong = 0;
+ if (skb_queue_empty(&tp->out_of_order_queue))
+ inet_csk(sk)->icsk_ack.pingpong = 0;
}
if (tp->rx_opt.num_sacks)
@@ -3600,8 +3016,8 @@ queue_and_out:
tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(tp);
- tcp_schedule_ack(tp);
+ tcp_enter_quickack_mode(sk);
+ inet_csk_schedule_ack(sk);
drop:
__kfree_skb(skb);
return;
@@ -3611,7 +3027,7 @@ drop:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
- tcp_enter_quickack_mode(tp);
+ tcp_enter_quickack_mode(sk);
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
@@ -3640,7 +3056,7 @@ drop:
/* Disable header prediction. */
tp->pred_flags = 0;
- tcp_schedule_ack(tp);
+ inet_csk_schedule_ack(sk);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3664,7 +3080,7 @@ drop:
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (seq == TCP_SKB_CB(skb1)->end_seq) {
- __skb_append(skb1, skb);
+ __skb_append(skb1, skb, &tp->out_of_order_queue);
if (!tp->rx_opt.num_sacks ||
tp->selective_acks[0].end_seq != seq)
@@ -3708,7 +3124,7 @@ drop:
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
break;
}
- __skb_unlink(skb1, skb1->list);
+ __skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
__kfree_skb(skb1);
}
@@ -3725,8 +3141,9 @@ add_sack:
* simplifies code)
*/
static void
-tcp_collapse(struct sock *sk, struct sk_buff *head,
- struct sk_buff *tail, u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+ struct sk_buff *head, struct sk_buff *tail,
+ u32 start, u32 end)
{
struct sk_buff *skb;
@@ -3736,7 +3153,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
skb = next;
@@ -3782,7 +3199,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
- __skb_insert(nskb, skb->prev, skb, skb->list);
+ __skb_insert(nskb, skb->prev, skb, list);
sk_stream_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
@@ -3801,7 +3218,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
struct sk_buff *next = skb->next;
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, list);
__kfree_skb(skb);
NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
skb = next;
@@ -3837,7 +3254,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, head, skb, start, end);
+ tcp_collapse(sk, &tp->out_of_order_queue,
+ head, skb, start, end);
head = skb;
if (skb == (struct sk_buff *)&tp->out_of_order_queue)
break;
@@ -3874,7 +3292,8 @@ static int tcp_prune_queue(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
- tcp_collapse(sk, sk->sk_receive_queue.next,
+ tcp_collapse(sk, &sk->sk_receive_queue,
+ sk->sk_receive_queue.next,
(struct sk_buff*)&sk->sk_receive_queue,
tp->copied_seq, tp->rcv_nxt);
sk_stream_mem_reclaim(sk);
@@ -3886,9 +3305,8 @@ static int tcp_prune_queue(struct sock *sk)
* This must not ever occur. */
/* First, purge the out_of_order queue. */
- if (skb_queue_len(&tp->out_of_order_queue)) {
- NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED,
- skb_queue_len(&tp->out_of_order_queue));
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
@@ -3924,12 +3342,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->ca_state == TCP_CA_Open &&
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
/* Limited by application or receiver window. */
u32 win_used = max(tp->snd_cwnd_used, 2U);
if (win_used < tp->snd_cwnd) {
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
}
tp->snd_cwnd_used = 0;
@@ -3937,6 +3355,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
+{
+ /* If the user specified a specific send buffer setting, do
+ * not modify it.
+ */
+ if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+ return 0;
+
+ /* If we are under global TCP memory pressure, do not expand. */
+ if (tcp_memory_pressure)
+ return 0;
+
+ /* If we are under soft global TCP memory pressure, do not expand. */
+ if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+ return 0;
+
+ /* If we filled the congestion window, do not expand. */
+ if (tp->packets_out >= tp->snd_cwnd)
+ return 0;
+
+ return 1;
+}
/* When incoming ACK allowed to free some skb from write_queue,
* we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3948,11 +3388,8 @@ static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->packets_out < tp->snd_cwnd &&
- !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
- int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+ if (tcp_should_expand_sndbuf(sk, tp)) {
+ int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
demanded = max_t(unsigned int, tp->snd_cwnd,
tp->reordering + 1);
@@ -3975,22 +3412,9 @@ static inline void tcp_check_space(struct sock *sk)
}
}
-static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
+static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
- tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
- tcp_write_xmit(sk, tp->nonagle))
- tcp_check_probe_timer(sk, tp);
-}
-
-static __inline__ void tcp_data_snd_check(struct sock *sk)
-{
- struct sk_buff *skb = sk->sk_send_head;
-
- if (skb != NULL)
- __tcp_data_snd_check(sk, skb);
+ tcp_push_pending_frames(sk, tp);
tcp_check_space(sk);
}
@@ -4002,13 +3426,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
struct tcp_sock *tp = tcp_sk(sk);
/* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise). Or...
*/
&& __tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(tp) ||
+ tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
(ofo_possible &&
skb_peek(&tp->out_of_order_queue))) {
@@ -4022,8 +3446,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
static __inline__ void tcp_ack_snd_check(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_ack_scheduled(tp)) {
+ if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
return;
}
@@ -4094,7 +3517,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
}
}
@@ -4277,14 +3700,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
return 0;
} else { /* Header too small */
TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4310,7 +3733,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
__skb_pull(skb, tcp_header_len);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -4331,7 +3754,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@@ -4350,8 +3773,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
- tcp_data_snd_check(sk);
- if (!tcp_ack_scheduled(tp))
+ tcp_data_snd_check(sk, tp);
+ if (!inet_csk_ack_scheduled(sk))
goto no_ack;
}
@@ -4373,7 +3796,7 @@ slow_path:
* RFC1323: H1. Apply PAWS check first.
*/
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(tp, skb)) {
+ tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
@@ -4420,7 +3843,7 @@ step5:
if(th->ack)
tcp_ack(sk, skb, FLAG_SLOWPATH);
- tcp_rcv_rtt_measure_ts(tp, skb);
+ tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
@@ -4428,7 +3851,7 @@ step5:
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
tcp_ack_snd_check(sk);
return 0;
@@ -4449,6 +3872,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_parse_options(skb, &tp->rx_opt, 0);
if (th->ack) {
+ struct inet_connection_sock *icsk;
/* rfc793:
* "If the state is SYN-SENT then
* first check the ACK bit
@@ -4552,6 +3976,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_metrics(sk);
+ tcp_init_congestion_control(sk);
+
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
*/
@@ -4560,7 +3986,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_buffer_space(sk);
if (sock_flag(sk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -4572,7 +3998,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
sk_wake_async(sk, 0, POLL_OUT);
}
- if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+ icsk = inet_csk(sk);
+
+ if (sk->sk_write_pending ||
+ icsk->icsk_accept_queue.rskq_defer_accept ||
+ icsk->icsk_ack.pingpong) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@@ -4580,12 +4010,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
- tcp_schedule_ack(tp);
- tp->ack.lrcvtime = tcp_time_stamp;
- tp->ack.ato = TCP_ATO_MIN;
- tcp_incr_quickack(tp);
- tcp_enter_quickack_mode(tp);
- tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ inet_csk_schedule_ack(sk);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ tcp_incr_quickack(sk);
+ tcp_enter_quickack_mode(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
__kfree_skb(skb);
@@ -4708,9 +4139,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if(tp->af_specific->conn_request(sk, skb) < 0)
return 1;
- init_westwood(sk);
- init_bictcp(tp);
-
/* Now we have several options: In theory there is
* nothing else in the frame. KA9Q has an option to
* send data with the syn, BSD accepts data with the
@@ -4732,9 +4160,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
case TCP_SYN_SENT:
- init_westwood(sk);
- init_bictcp(tp);
-
queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
if (queued >= 0)
return queued;
@@ -4742,12 +4167,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
return 0;
}
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(tp, skb)) {
+ tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
tcp_send_dupack(sk, skb);
@@ -4816,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt)
- tcp_ack_saw_tstamp(tp, 0);
+ tcp_ack_saw_tstamp(sk, NULL, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4253,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tcp_init_metrics(sk);
+ tcp_init_congestion_control(sk);
+
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
@@ -4861,9 +4288,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 1;
}
- tmo = tcp_fin_time(tp);
+ tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
@@ -4871,7 +4298,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
- tcp_reset_keepalive_timer(sk, tmo);
+ inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
@@ -4931,7 +4358,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
- tcp_data_snd_check(sk);
+ tcp_data_snd_check(sk, tp);
tcp_ack_snd_check(sk);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dad98e4..13dfb39 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -36,7 +36,7 @@
* ACK bit.
* Andi Kleen : Implemented fast path mtu discovery.
* Fixed many serious bugs in the
- * open_request handling and moved
+ * request_sock handling and moved
* most of it into the af independent code.
* Added tail drop and some other bugfixes.
* Added new listen sematics.
@@ -64,7 +64,9 @@
#include <linux/times.h>
#include <net/icmp.h>
+#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
@@ -75,7 +77,6 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;
@@ -88,458 +89,29 @@ static struct socket *tcp_socket;
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb);
-struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
- .__tcp_lhash_lock = RW_LOCK_UNLOCKED,
- .__tcp_lhash_users = ATOMIC_INIT(0),
- .__tcp_lhash_wait
- = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
- .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED
+struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
+ .lhash_lock = RW_LOCK_UNLOCKED,
+ .lhash_users = ATOMIC_INIT(0),
+ .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
+ .portalloc_lock = SPIN_LOCK_UNLOCKED,
+ .port_rover = 1024 - 1,
};
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
-
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
- __u32 faddr, __u16 fport)
-{
- int h = (laddr ^ lport) ^ (faddr ^ fport);
- h ^= h >> 16;
- h ^= h >> 8;
- return h & (tcp_ehash_size - 1);
-}
-
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- __u32 laddr = inet->rcv_saddr;
- __u16 lport = inet->num;
- __u32 faddr = inet->daddr;
- __u16 fport = inet->dport;
-
- return tcp_hashfn(laddr, lport, faddr, fport);
-}
-
-/* Allocate and initialize a new TCP local port bind bucket.
- * The bindhash mutex for snum's hash chain must be held here.
- */
-struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
- unsigned short snum)
-{
- struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
- SLAB_ATOMIC);
- if (tb) {
- tb->port = snum;
- tb->fastreuse = 0;
- INIT_HLIST_HEAD(&tb->owners);
- hlist_add_head(&tb->node, &head->chain);
- }
- return tb;
-}
-
-/* Caller must hold hashbucket lock for this tb with local BH disabled */
-void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
-{
- if (hlist_empty(&tb->owners)) {
- __hlist_del(&tb->node);
- kmem_cache_free(tcp_bucket_cachep, tb);
- }
-}
-
-/* Caller must disable local BH processing. */
-static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
-{
- struct tcp_bind_hashbucket *head =
- &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
- struct tcp_bind_bucket *tb;
-
- spin_lock(&head->lock);
- tb = tcp_sk(sk)->bind_hash;
- sk_add_bind_node(child, &tb->owners);
- tcp_sk(child)->bind_hash = tb;
- spin_unlock(&head->lock);
-}
-
-inline void tcp_inherit_port(struct sock *sk, struct sock *child)
-{
- local_bh_disable();
- __tcp_inherit_port(sk, child);
- local_bh_enable();
-}
-
-void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
- unsigned short snum)
-{
- inet_sk(sk)->num = snum;
- sk_add_bind_node(sk, &tb->owners);
- tcp_sk(sk)->bind_hash = tb;
-}
-
-static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
- const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
- struct sock *sk2;
- struct hlist_node *node;
- int reuse = sk->sk_reuse;
-
- sk_for_each_bound(sk2, node, &tb->owners) {
- if (sk != sk2 &&
- !tcp_v6_ipv6only(sk2) &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) {
- const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
- if (!sk2_rcv_saddr || !sk_rcv_saddr ||
- sk2_rcv_saddr == sk_rcv_saddr)
- break;
- }
- }
- }
- return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
- struct tcp_bind_hashbucket *head;
- struct hlist_node *node;
- struct tcp_bind_bucket *tb;
- int ret;
-
- local_bh_disable();
- if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover;
-
- spin_lock(&tcp_portalloc_lock);
- if (tcp_port_rover < low)
- rover = low;
- else
- rover = tcp_port_rover;
- do {
- rover++;
- if (rover > high)
- rover = low;
- head = &tcp_bhash[tcp_bhashfn(rover)];
- spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
- if (tb->port == rover)
- goto next;
- break;
- next:
- spin_unlock(&head->lock);
- } while (--remaining > 0);
- tcp_port_rover = rover;
- spin_unlock(&tcp_portalloc_lock);
-
- /* Exhausted local port range during search? */
- ret = 1;
- if (remaining <= 0)
- goto fail;
-
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold it's mutex.
- */
- snum = rover;
- } else {
- head = &tcp_bhash[tcp_bhashfn(snum)];
- spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
- if (tb->port == snum)
- goto tb_found;
- }
- tb = NULL;
- goto tb_not_found;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse > 1)
- goto success;
- if (tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
- goto success;
- } else {
- ret = 1;
- if (tcp_bind_conflict(sk, tb))
- goto fail_unlock;
- }
- }
-tb_not_found:
- ret = 1;
- if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
- tb->fastreuse = 0;
- } else if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
-success:
- if (!tcp_sk(sk)->bind_hash)
- tcp_bind_hash(sk, tb, snum);
- BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
- ret = 0;
-
-fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
- return ret;
-}
-
-/* Get rid of any references to a local port held by the
- * given sock.
- */
-static void __tcp_put_port(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
- struct tcp_bind_bucket *tb;
-
- spin_lock(&head->lock);
- tb = tcp_sk(sk)->bind_hash;
- __sk_del_bind_node(sk);
- tcp_sk(sk)->bind_hash = NULL;
- inet->num = 0;
- tcp_bucket_destroy(tb);
- spin_unlock(&head->lock);
-}
-
-void tcp_put_port(struct sock *sk)
-{
- local_bh_disable();
- __tcp_put_port(sk);
- local_bh_enable();
-}
-
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-
-void tcp_listen_wlock(void)
-{
- write_lock(&tcp_lhash_lock);
-
- if (atomic_read(&tcp_lhash_users)) {
- DEFINE_WAIT(wait);
-
- for (;;) {
- prepare_to_wait_exclusive(&tcp_lhash_wait,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&tcp_lhash_users))
- break;
- write_unlock_bh(&tcp_lhash_lock);
- schedule();
- write_lock_bh(&tcp_lhash_lock);
- }
-
- finish_wait(&tcp_lhash_wait, &wait);
- }
-}
-
-static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
-{
- struct hlist_head *list;
- rwlock_t *lock;
-
- BUG_TRAP(sk_unhashed(sk));
- if (listen_possible && sk->sk_state == TCP_LISTEN) {
- list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- lock = &tcp_lhash_lock;
- tcp_listen_wlock();
- } else {
- list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
- write_lock(lock);
- }
- __sk_add_node(sk, list);
- sock_prot_inc_use(sk->sk_prot);
- write_unlock(lock);
- if (listen_possible && sk->sk_state == TCP_LISTEN)
- wake_up(&tcp_lhash_wait);
+ return inet_csk_get_port(&tcp_hashinfo, sk, snum);
}
static void tcp_v4_hash(struct sock *sk)
{
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
- __tcp_v4_hash(sk, 1);
- local_bh_enable();
- }
+ inet_hash(&tcp_hashinfo, sk);
}
void tcp_unhash(struct sock *sk)
{
- rwlock_t *lock;
-
- if (sk_unhashed(sk))
- goto ende;
-
- if (sk->sk_state == TCP_LISTEN) {
- local_bh_disable();
- tcp_listen_wlock();
- lock = &tcp_lhash_lock;
- } else {
- struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
- lock = &head->lock;
- write_lock_bh(&head->lock);
- }
-
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
- write_unlock_bh(lock);
-
- ende:
- if (sk->sk_state == TCP_LISTEN)
- wake_up(&tcp_lhash_wait);
-}
-
-/* Don't inline this cruft. Here are some nice properties to
- * exploit here. The BSD API does not allow a listening TCP
- * to specify the remote port nor the remote address for the
- * connection. So always assume those are both wildcarded
- * during the search since they can never be otherwise.
- */
-static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
- unsigned short hnum, int dif)
-{
- struct sock *result = NULL, *sk;
- struct hlist_node *node;
- int score, hiscore;
-
- hiscore=-1;
- sk_for_each(sk, node, head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (inet->num == hnum && !ipv6_only_sock(sk)) {
- __u32 rcv_saddr = inet->rcv_saddr;
-
- score = (sk->sk_family == PF_INET ? 1 : 0);
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- continue;
- score+=2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score+=2;
- }
- if (score == 5)
- return sk;
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
- }
- }
- return result;
-}
-
-/* Optimize the common listener case. */
-static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
- unsigned short hnum, int dif)
-{
- struct sock *sk = NULL;
- struct hlist_head *head;
-
- read_lock(&tcp_lhash_lock);
- head = &tcp_listening_hash[tcp_lhashfn(hnum)];
- if (!hlist_empty(head)) {
- struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
- if (inet->num == hnum && !sk->sk_node.next &&
- (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
- (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
- !sk->sk_bound_dev_if)
- goto sherry_cache;
- sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
- }
- if (sk) {
-sherry_cache:
- sock_hold(sk);
- }
- read_unlock(&tcp_lhash_lock);
- return sk;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * Local BH must be disabled here.
- */
-
-static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
- u32 daddr, u16 hnum,
- int dif)
-{
- struct tcp_ehash_bucket *head;
- TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
- struct hlist_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
- head = &tcp_ehash[hash];
- read_lock(&head->lock);
- sk_for_each(sk, node, &head->chain) {
- if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit; /* You sunk my battleship! */
- }
-
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
- if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit;
- }
- sk = NULL;
-out:
- read_unlock(&head->lock);
- return sk;
-hit:
- sock_hold(sk);
- goto out;
+ inet_unhash(&tcp_hashinfo, sk);
}
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
- u32 daddr, u16 hnum, int dif)
-{
- struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
- daddr, hnum, dif);
-
- return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
- u16 dport, int dif)
-{
- struct sock *sk;
-
- local_bh_disable();
- sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
-
- return sk;
-}
-
-EXPORT_SYMBOL_GPL(tcp_v4_lookup);
-
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
return secure_tcp_sequence_number(skb->nh.iph->daddr,
@@ -550,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
- struct tcp_tw_bucket **twp)
+ struct inet_timewait_sock **twp)
{
struct inet_sock *inet = inet_sk(sk);
u32 daddr = inet->rcv_saddr;
u32 saddr = inet->daddr;
int dif = sk->sk_bound_dev_if;
- TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
- struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
+ struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
struct sock *sk2;
- struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
- tw = (struct tcp_tw_bucket *)sk2;
+ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+ tw = inet_twsk(sk2);
- if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+ if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
struct tcp_sock *tp = tcp_sk(sk);
/* With PAWS, it is safe from the viewpoint
@@ -587,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
fall back to VJ's scheme and use initial
timestamp retrieved from peer table.
*/
- if (tw->tw_ts_recent_stamp &&
+ if (tcptw->tw_ts_recent_stamp &&
(!twp || (sysctl_tcp_tw_reuse &&
xtime.tv_sec -
- tw->tw_ts_recent_stamp > 1))) {
- if ((tp->write_seq =
- tw->tw_snd_nxt + 65535 + 2) == 0)
+ tcptw->tw_ts_recent_stamp > 1))) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
- tp->rx_opt.ts_recent = tw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
sock_hold(sk2);
goto unique;
} else
@@ -606,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
/* And established part... */
sk_for_each(sk2, node, &head->chain) {
- if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
goto not_unique;
}
@@ -626,10 +199,10 @@ unique:
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
- tcp_tw_deschedule(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
return 0;
@@ -652,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
*/
static inline int tcp_v4_hash_connect(struct sock *sk)
{
- unsigned short snum = inet_sk(sk)->num;
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ const unsigned short snum = inet_sk(sk)->num;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
int ret;
if (!snum) {
@@ -666,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
static u32 hint;
u32 offset = hint + connect_port_offset(sk);
struct hlist_node *node;
- struct tcp_tw_bucket *tw = NULL;
+ struct inet_timewait_sock *tw = NULL;
local_bh_disable();
for (i = 1; i <= range; i++) {
port = low + (i + offset) % range;
- head = &tcp_bhash[tcp_bhashfn(port)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
- tb_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->port == port) {
BUG_TRAP(!hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
@@ -691,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
}
}
- tb = tcp_bucket_create(head, port);
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
@@ -710,27 +283,27 @@ ok:
hint += i;
/* Head lock still held and bh's disabled */
- tcp_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
- __tcp_v4_hash(sk, 0);
+ __inet_hash(&tcp_hashinfo, sk, 0);
}
spin_unlock(&head->lock);
if (tw) {
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);;
+ inet_twsk_put(tw);
}
ret = 0;
goto out;
}
- head = &tcp_bhash[tcp_bhashfn(snum)];
- tb = tcp_sk(sk)->bind_hash;
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- __tcp_v4_hash(sk, 0);
+ __inet_hash(&tcp_hashinfo, sk, 0);
spin_unlock_bh(&head->lock);
return 0;
} else {
@@ -793,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->write_seq = 0;
}
- if (sysctl_tcp_tw_recycle &&
+ if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
@@ -832,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto failure;
/* OK, now commit destination to socket. */
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->u.dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -859,59 +431,6 @@ failure:
return err;
}
-static __inline__ int tcp_v4_iif(struct sk_buff *skb)
-{
- return ((struct rtable *)skb->dst)->rt_iif;
-}
-
-static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-{
- return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-}
-
-static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
- struct open_request ***prevp,
- __u16 rport,
- __u32 raddr, __u32 laddr)
-{
- struct tcp_listen_opt *lopt = tp->listen_opt;
- struct open_request *req, **prev;
-
- for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
- (req = *prev) != NULL;
- prev = &req->dl_next) {
- if (req->rmt_port == rport &&
- req->af.v4_req.rmt_addr == raddr &&
- req->af.v4_req.loc_addr == laddr &&
- TCP_INET_FAMILY(req->class->family)) {
- BUG_TRAP(!req->sk);
- *prevp = prev;
- break;
- }
- }
-
- return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_listen_opt *lopt = tp->listen_opt;
- u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
-
- req->expires = jiffies + TCP_TIMEOUT_INIT;
- req->retrans = 0;
- req->sk = NULL;
- req->dl_next = lopt->syn_table[h];
-
- write_lock(&tp->syn_wait_lock);
- lopt->syn_table[h] = req;
- write_unlock(&tp->syn_wait_lock);
-
- tcp_synq_added(sk);
-}
-
-
/*
* This routine does path mtu discovery as defined in RFC1191.
*/
@@ -994,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
return;
}
- sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
- th->source, tcp_v4_iif(skb));
+ sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
+ th->source, inet_iif(skb));
if (!sk) {
ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
return;
}
if (sk->sk_state == TCP_TIME_WAIT) {
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
return;
}
@@ -1050,13 +569,13 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
}
switch (sk->sk_state) {
- struct open_request *req, **prev;
+ struct request_sock *req, **prev;
case TCP_LISTEN:
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v4_search_req(tp, &prev, th->dest,
- iph->daddr, iph->saddr);
+ req = inet_csk_search_req(sk, &prev, th->dest,
+ iph->daddr, iph->saddr);
if (!req)
goto out;
@@ -1065,7 +584,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
*/
BUG_TRAP(!req->sk);
- if (seq != req->snt_isn) {
+ if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -1076,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
case TCP_SYN_SENT:
@@ -1246,62 +765,35 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
- tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+ tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
-static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
+static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
- tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
+ tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
req->ts_recent);
}
-static struct dst_entry* tcp_v4_route_req(struct sock *sk,
- struct open_request *req)
-{
- struct rtable *rt;
- struct ip_options *opt = req->af.v4_req.opt;
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = ((opt && opt->srr) ?
- opt->faddr :
- req->af.v4_req.rmt_addr),
- .saddr = req->af.v4_req.loc_addr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = inet_sk(sk)->sport,
- .dport = req->rmt_port } } };
-
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
- ip_rt_put(rt);
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
- return &rt->u.dst;
-}
-
/*
* Send a SYN-ACK after having received an ACK.
- * This still operates on a open_request only, not on a big
+ * This still operates on a request_sock only, not on a big
* socket.
*/
-static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
+static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
struct dst_entry *dst)
{
+ const struct inet_request_sock *ireq = inet_rsk(req);
int err = -1;
struct sk_buff * skb;
/* First, grab a route. */
- if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto out;
skb = tcp_make_synack(sk, dst, req);
@@ -1310,14 +802,14 @@ static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
struct tcphdr *th = skb->h.th;
th->check = tcp_v4_check(th, skb->len,
- req->af.v4_req.loc_addr,
- req->af.v4_req.rmt_addr,
+ ireq->loc_addr,
+ ireq->rmt_addr,
csum_partial((char *)th, skb->len,
skb->csum));
- err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
- req->af.v4_req.rmt_addr,
- req->af.v4_req.opt);
+ err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+ ireq->rmt_addr,
+ ireq->opt);
if (err == NET_XMIT_CN)
err = 0;
}
@@ -1328,12 +820,12 @@ out:
}
/*
- * IPv4 open_request destructor.
+ * IPv4 request_sock destructor.
*/
-static void tcp_v4_or_free(struct open_request *req)
+static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
- if (req->af.v4_req.opt)
- kfree(req->af.v4_req.opt);
+ if (inet_rsk(req)->opt)
+ kfree(inet_rsk(req)->opt);
}
static inline void syn_flood_warning(struct sk_buff *skb)
@@ -1349,7 +841,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
}
/*
- * Save and compile IPv4 options into the open_request if needed.
+ * Save and compile IPv4 options into the request_sock if needed.
*/
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
struct sk_buff *skb)
@@ -1370,33 +862,20 @@ static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
return dopt;
}
-/*
- * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
- * One SYN_RECV socket costs about 80bytes on a 32bit machine.
- * It would be better to replace it with a global counter for all sockets
- * but then some measure against one socket starving all other sockets
- * would be needed.
- *
- * It was 128 by default. Experiments with real servers show, that
- * it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
- */
-int sysctl_max_syn_backlog = 256;
-
-struct or_calltable or_ipv4 = {
+struct request_sock_ops tcp_request_sock_ops = {
.family = PF_INET,
+ .obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_v4_send_synack,
- .send_ack = tcp_v4_or_send_ack,
- .destructor = tcp_v4_or_free,
+ .send_ack = tcp_v4_reqsk_send_ack,
+ .destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
+ struct inet_request_sock *ireq;
struct tcp_options_received tmp_opt;
- struct open_request *req;
+ struct request_sock *req;
__u32 saddr = skb->nh.iph->saddr;
__u32 daddr = skb->nh.iph->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
@@ -1416,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if (tcp_synq_is_full(sk) && !isn) {
+ if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
@@ -1430,10 +909,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
- if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = tcp_openreq_alloc();
+ req = reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
@@ -1461,10 +940,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_openreq_init(req, &tmp_opt, skb);
- req->af.v4_req.loc_addr = daddr;
- req->af.v4_req.rmt_addr = saddr;
- req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
- req->class = &or_ipv4;
+ ireq = inet_rsk(req);
+ ireq->loc_addr = daddr;
+ ireq->rmt_addr = saddr;
+ ireq->opt = tcp_v4_save_options(sk, skb);
if (!want_cookie)
TCP_ECN_create_request(req, skb->h.th);
@@ -1486,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* are made in the function processing timewait state.
*/
if (tmp_opt.saw_tstamp &&
- sysctl_tcp_tw_recycle &&
- (dst = tcp_v4_route_req(sk, req)) != NULL &&
+ tcp_death_row.sysctl_tw_recycle &&
+ (dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1500,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
}
/* Kill the following clause, if you dislike this way. */
else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
(!peer || !peer->tcp_ts_stamp) &&
(!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1511,32 +990,30 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* to destinations, already remembered
* to the moment of synflood.
*/
- NETDEBUG(if (net_ratelimit()) \
- printk(KERN_DEBUG "TCP: drop open "
- "request from %u.%u."
- "%u.%u/%u\n", \
- NIPQUAD(saddr),
- ntohs(skb->h.th->source)));
+ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
+ "request from %u.%u.%u.%u/%u\n",
+ NIPQUAD(saddr),
+ ntohs(skb->h.th->source));
dst_release(dst);
goto drop_and_free;
}
isn = tcp_v4_init_sequence(sk, skb);
}
- req->snt_isn = isn;
+ tcp_rsk(req)->snt_isn = isn;
if (tcp_v4_send_synack(sk, req, dst))
goto drop_and_free;
if (want_cookie) {
- tcp_openreq_free(req);
+ reqsk_free(req);
} else {
- tcp_v4_synq_add(sk, req);
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
return 0;
drop_and_free:
- tcp_openreq_free(req);
+ reqsk_free(req);
drop:
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
return 0;
@@ -1548,9 +1025,10 @@ drop:
* now create the new socket.
*/
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
- struct open_request *req,
+ struct request_sock *req,
struct dst_entry *dst)
{
+ struct inet_request_sock *ireq;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
@@ -1558,24 +1036,24 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+ if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
- newsk->sk_dst_cache = dst;
- tcp_v4_setup_caps(newsk, dst);
+ sk_setup_caps(newsk, dst);
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
- newinet->daddr = req->af.v4_req.rmt_addr;
- newinet->rcv_saddr = req->af.v4_req.loc_addr;
- newinet->saddr = req->af.v4_req.loc_addr;
- newinet->opt = req->af.v4_req.opt;
- req->af.v4_req.opt = NULL;
- newinet->mc_index = tcp_v4_iif(skb);
+ ireq = inet_rsk(req);
+ newinet->daddr = ireq->rmt_addr;
+ newinet->rcv_saddr = ireq->loc_addr;
+ newinet->saddr = ireq->loc_addr;
+ newinet->opt = ireq->opt;
+ ireq->opt = NULL;
+ newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = skb->nh.iph->ttl;
newtp->ext_header_len = 0;
if (newinet->opt)
@@ -1586,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);
- __tcp_v4_hash(newsk, 0);
- __tcp_inherit_port(sk, newsk);
+ __inet_hash(&tcp_hashinfo, newsk, 0);
+ __inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1603,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
struct tcphdr *th = skb->h.th;
struct iphdr *iph = skb->nh.iph;
- struct tcp_sock *tp = tcp_sk(sk);
struct sock *nsk;
- struct open_request **prev;
+ struct request_sock **prev;
/* Find possible connection requests. */
- struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
- iph->saddr, iph->daddr);
+ struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+ iph->saddr, iph->daddr);
if (req)
return tcp_check_req(sk, skb, req, prev);
- nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
- th->source,
- skb->nh.iph->daddr,
- ntohs(th->dest),
- tcp_v4_iif(skb));
+ nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+ th->source, skb->nh.iph->daddr,
+ ntohs(th->dest), inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- tcp_tw_put((struct tcp_tw_bucket *)nsk);
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
return NULL;
}
@@ -1642,8 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
skb->nh.iph->daddr, skb->csum))
return 0;
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
skb->ip_summed = CHECKSUM_NONE;
}
if (skb->len <= 76) {
@@ -1759,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
TCP_SKB_CB(skb)->sacked = 0;
- sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, ntohs(th->dest),
- tcp_v4_iif(skb));
+ sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, ntohs(th->dest),
+ inet_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1813,24 +1287,26 @@ discard_and_relse:
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
}
if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(TCP_MIB_INERRS);
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *) sk);
goto discard_it;
}
- switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, skb->len)) {
+ switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+ skb, th)) {
case TCP_TW_SYN: {
- struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
- ntohs(th->dest),
- tcp_v4_iif(skb));
+ struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
+ skb->nh.iph->daddr,
+ ntohs(th->dest),
+ inet_iif(skb));
if (sk2) {
- tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ inet_twsk_deschedule((struct inet_timewait_sock *)sk,
+ &tcp_death_row);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
sk = sk2;
goto process;
}
@@ -1846,112 +1322,6 @@ do_time_wait:
goto discard_it;
}
-/* With per-bucket locks this operation is not-atomic, so that
- * this version is not worse.
- */
-static void __tcp_v4_rehash(struct sock *sk)
-{
- sk->sk_prot->unhash(sk);
- sk->sk_prot->hash(sk);
-}
-
-static int tcp_v4_reselect_saddr(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- int err;
- struct rtable *rt;
- __u32 old_saddr = inet->saddr;
- __u32 new_saddr;
- __u32 daddr = inet->daddr;
-
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-
- /* Query new route. */
- err = ip_route_connect(&rt, daddr, 0,
- RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if,
- IPPROTO_TCP,
- inet->sport, inet->dport, sk);
- if (err)
- return err;
-
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
-
- new_saddr = rt->rt_src;
-
- if (new_saddr == old_saddr)
- return 0;
-
- if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(old_saddr),
- NIPQUAD(new_saddr));
- }
-
- inet->saddr = new_saddr;
- inet->rcv_saddr = new_saddr;
-
- /* XXX The only one ugly spot where we need to
- * XXX really change the sockets identity after
- * XXX it has entered the hashes. -DaveM
- *
- * Besides that, it does not check for connection
- * uniqueness. Wait for troubles.
- */
- __tcp_v4_rehash(sk);
- return 0;
-}
-
-int tcp_v4_rebuild_header(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
- u32 daddr;
- int err;
-
- /* Route is OK, nothing to do. */
- if (rt)
- return 0;
-
- /* Reroute. */
- daddr = inet->daddr;
- if (inet->opt && inet->opt->srr)
- daddr = inet->opt->faddr;
-
- {
- struct flowi fl = { .oif = sk->sk_bound_dev_if,
- .nl_u = { .ip4_u =
- { .daddr = daddr,
- .saddr = inet->saddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = IPPROTO_TCP,
- .uli_u = { .ports =
- { .sport = inet->sport,
- .dport = inet->dport } } };
-
- err = ip_route_output_flow(&rt, &fl, sk, 0);
- }
- if (!err) {
- __sk_dst_set(sk, &rt->u.dst);
- tcp_v4_setup_caps(sk, &rt->u.dst);
- return 0;
- }
-
- /* Routing failed... */
- sk->sk_route_caps = 0;
-
- if (!sysctl_ip_dynaddr ||
- sk->sk_state != TCP_SYN_SENT ||
- (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
- (err = tcp_v4_reselect_saddr(sk)) != 0)
- sk->sk_err_soft = -err;
-
- return err;
-}
-
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -2000,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
return 0;
}
-int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
- struct inet_peer *peer = NULL;
-
- peer = inet_getpeer(tw->tw_daddr, 1);
+ struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
if (peer) {
- if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+ const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
(peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
- peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
- peer->tcp_ts = tw->tw_ts_recent;
+ peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
+ peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+ peer->tcp_ts = tcptw->tw_ts_recent;
}
inet_putpeer(peer);
return 1;
@@ -2023,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
struct tcp_func ipv4_specific = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
- .rebuild_header = tcp_v4_rebuild_header,
+ .rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v4_conn_request,
.syn_recv_sock = tcp_v4_syn_recv_sock,
.remember_stamp = tcp_v4_remember_stamp,
@@ -2039,13 +1409,14 @@ struct tcp_func ipv4_specific = {
*/
static int tcp_v4_init_sock(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
- tp->rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
@@ -2060,9 +1431,10 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache_std = tp->mss_cache = 536;
+ tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
@@ -2085,6 +1457,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
tcp_clear_xmit_timers(sk);
+ tcp_cleanup_congestion_control(sk);
+
/* Cleanup up the write buffer. */
sk_stream_writequeue_purge(sk);
@@ -2095,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
__skb_queue_purge(&tp->ucopy.prequeue);
/* Clean up a referenced TCP bind bucket. */
- if (tp->bind_hash)
- tcp_put_port(sk);
+ if (inet_csk(sk)->icsk_bind_hash)
+ inet_put_port(&tcp_hashinfo, sk);
/*
* If sendmsg cached page exists, toss it.
@@ -2116,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
{
return hlist_empty(head) ? NULL :
- list_entry(head->first, struct tcp_tw_bucket, tw_node);
+ list_entry(head->first, struct inet_timewait_sock, tw_node);
}
-static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
return tw->tw_node.next ?
hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2130,27 +1504,27 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_sock *tp;
+ struct inet_connection_sock *icsk;
struct hlist_node *node;
struct sock *sk = cur;
struct tcp_iter_state* st = seq->private;
if (!sk) {
st->bucket = 0;
- sk = sk_head(&tcp_listening_hash[0]);
+ sk = sk_head(&tcp_hashinfo.listening_hash[0]);
goto get_sk;
}
++st->num;
if (st->state == TCP_SEQ_STATE_OPENREQ) {
- struct open_request *req = cur;
+ struct request_sock *req = cur;
- tp = tcp_sk(st->syn_wait_sk);
+ icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next;
while (1) {
while (req) {
- if (req->class->family == st->family) {
+ if (req->rsk_ops->family == st->family) {
cur = req;
goto out;
}
@@ -2159,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (++st->sbucket >= TCP_SYNQ_HSIZE)
break;
get_req:
- req = tp->listen_opt->syn_table[st->sbucket];
+ req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
}
sk = sk_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
- read_unlock_bh(&tp->syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} else {
- tp = tcp_sk(sk);
- read_lock_bh(&tp->syn_wait_lock);
- if (tp->listen_opt && tp->listen_opt->qlen)
+ icsk = inet_csk(sk);
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue))
goto start_req;
- read_unlock_bh(&tp->syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
sk = sk_next(sk);
}
get_sk:
@@ -2178,9 +1552,9 @@ get_sk:
cur = sk;
goto out;
}
- tp = tcp_sk(sk);
- read_lock_bh(&tp->syn_wait_lock);
- if (tp->listen_opt && tp->listen_opt->qlen) {
+ icsk = inet_csk(sk);
+ read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+ if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
st->uid = sock_i_uid(sk);
st->syn_wait_sk = sk;
@@ -2188,10 +1562,10 @@ start_req:
st->sbucket = 0;
goto get_req;
}
- read_unlock_bh(&tp->syn_wait_lock);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
- if (++st->bucket < TCP_LHTABLE_SIZE) {
- sk = sk_head(&tcp_listening_hash[st->bucket]);
+ if (++st->bucket < INET_LHTABLE_SIZE) {
+ sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
goto get_sk;
}
cur = NULL;
@@ -2215,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq)
struct tcp_iter_state* st = seq->private;
void *rc = NULL;
- for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+ for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
struct sock *sk;
struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ struct inet_timewait_sock *tw;
/* We can reschedule _before_ having picked the target: */
cond_resched_softirq();
- read_lock(&tcp_ehash[st->bucket].lock);
- sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
}
@@ -2232,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq)
goto out;
}
st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw_for_each(tw, node,
- &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+ inet_twsk_for_each(tw, node,
+ &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
if (tw->tw_family != st->family) {
continue;
}
rc = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2250,7 +1624,7 @@ out:
static void *established_get_next(struct seq_file *seq, void *cur)
{
struct sock *sk = cur;
- struct tcp_tw_bucket *tw;
+ struct inet_timewait_sock *tw;
struct hlist_node *node;
struct tcp_iter_state* st = seq->private;
@@ -2267,15 +1641,15 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* We can reschedule between buckets: */
cond_resched_softirq();
- if (++st->bucket < tcp_ehash_size) {
- read_lock(&tcp_ehash[st->bucket].lock);
- sk = sk_head(&tcp_ehash[st->bucket].chain);
+ if (++st->bucket < tcp_hashinfo.ehash_size) {
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
} else {
cur = NULL;
goto out;
@@ -2289,7 +1663,7 @@ get_tw:
}
st->state = TCP_SEQ_STATE_TIME_WAIT;
- tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+ tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
goto get_tw;
found:
cur = sk;
@@ -2313,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
void *rc;
struct tcp_iter_state* st = seq->private;
- tcp_listen_lock();
+ inet_listen_lock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_idx(seq, &pos);
if (!rc) {
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
@@ -2351,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
local_bh_disable();
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_first(seq);
@@ -2374,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
if (v) {
- struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
- read_unlock_bh(&tp->syn_wait_lock);
+ struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+ read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- tcp_listen_unlock();
+ inet_listen_unlock(&tcp_hashinfo);
break;
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock(&tcp_ehash[st->bucket].lock);
+ read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
local_bh_enable();
break;
}
@@ -2451,18 +1825,19 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
-static void get_openreq4(struct sock *sk, struct open_request *req,
+static void get_openreq4(struct sock *sk, struct request_sock *req,
char *tmpbuf, int i, int uid)
{
+ const struct inet_request_sock *ireq = inet_rsk(req);
int ttd = req->expires - jiffies;
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
i,
- req->af.v4_req.loc_addr,
+ ireq->loc_addr,
ntohs(inet_sk(sk)->sport),
- req->af.v4_req.rmt_addr,
- ntohs(req->rmt_port),
+ ireq->rmt_addr,
+ ntohs(ireq->rmt_port),
TCP_SYN_RECV,
0, 0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
@@ -2480,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
int timer_active;
unsigned long timer_expires;
struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_connection_sock *icsk = inet_csk(sp);
struct inet_sock *inet = inet_sk(sp);
unsigned int dest = inet->daddr;
unsigned int src = inet->rcv_saddr;
__u16 destp = ntohs(inet->dport);
__u16 srcp = ntohs(inet->sport);
- if (tp->pending == TCP_TIME_RETRANS) {
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
timer_active = 1;
- timer_expires = tp->timeout;
- } else if (tp->pending == TCP_TIME_PROBE0) {
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = tp->timeout;
+ timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sp->sk_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
@@ -2506,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
timer_active,
jiffies_to_clock_t(timer_expires - jiffies),
- tp->retransmits,
+ icsk->icsk_retransmits,
sock_i_uid(sp),
- tp->probes_out,
+ icsk->icsk_probes_out,
sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+ icsk->icsk_rto,
+ icsk->icsk_ack.ato,
+ (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
-static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
{
unsigned int dest, src;
__u16 destp, srcp;
@@ -2596,7 +1974,7 @@ struct proto tcp_prot = {
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
- .accept = tcp_accept,
+ .accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
@@ -2611,6 +1989,7 @@ struct proto tcp_prot = {
.get_port = tcp_v4_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
+ .orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
@@ -2618,6 +1997,8 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+ .rsk_prot = &tcp_request_sock_ops,
};
@@ -2638,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
}
EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_bind_hash);
-EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_inherit_port);
-EXPORT_SYMBOL(tcp_listen_wlock);
-EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -2660,7 +2035,6 @@ EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index eea1a17..a88db28 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
#define SYNC_INIT 1
#endif
-int sysctl_tcp_tw_recycle;
-int sysctl_tcp_max_tw_buckets = NR_FILE*2;
-
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+struct inet_timewait_death_row tcp_death_row = {
+ .sysctl_max_tw_buckets = NR_FILE * 2,
+ .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+ .death_lock = SPIN_LOCK_UNLOCKED,
+ .hashinfo = &tcp_hashinfo,
+ .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
+ (unsigned long)&tcp_death_row),
+ .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
+ inet_twdr_twkill_work,
+ &tcp_death_row),
+/* Short-time timewait calendar */
+
+ .twcal_hand = -1,
+ .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+ (unsigned long)&tcp_death_row),
+};
+
+EXPORT_SYMBOL_GPL(tcp_death_row);
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return (seq == e_win && seq == end_seq);
}
-/* New-style handling of TIME_WAIT sockets. */
-
-int tcp_tw_count;
-
-
-/* Must be called with locally disabled BHs. */
-static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
-{
- struct tcp_ehash_bucket *ehead;
- struct tcp_bind_hashbucket *bhead;
- struct tcp_bind_bucket *tb;
-
- /* Unlink from established hashes. */
- ehead = &tcp_ehash[tw->tw_hashent];
- write_lock(&ehead->lock);
- if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
- return;
- }
- __hlist_del(&tw->tw_node);
- sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
-
- /* Disassociate with bind bucket. */
- bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
- spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- tcp_bucket_destroy(tb);
- spin_unlock(&bhead->lock);
-
-#ifdef INET_REFCNT_DEBUG
- if (atomic_read(&tw->tw_refcnt) != 1) {
- printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
- atomic_read(&tw->tw_refcnt));
- }
-#endif
- tcp_tw_put(tw);
-}
-
/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* to avoid misread sequence numbers, states etc. --ANK
*/
enum tcp_tw_status
-tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+ const struct tcphdr *th)
{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
struct tcp_options_received tmp_opt;
int paws_reject = 0;
tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+ if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
tcp_parse_options(skb, &tmp_opt, 0);
if (tmp_opt.saw_tstamp) {
- tmp_opt.ts_recent = tw->tw_ts_recent;
- tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tmp_opt.ts_recent = tcptw->tw_ts_recent;
+ tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
paws_reject = tcp_paws_check(&tmp_opt, th->rst);
}
}
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tw->tw_rcv_nxt,
- tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+ tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
return TCP_TW_ACK;
if (th->rst)
goto kill;
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
goto kill_with_rst;
/* Dup ACK? */
- if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+ if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
return TCP_TW_RST;
}
/* FIN arrived, enter true time-wait state. */
- tw->tw_substate = TCP_TIME_WAIT;
- tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ tw->tw_substate = TCP_TIME_WAIT;
+ tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tmp_opt.saw_tstamp) {
- tw->tw_ts_recent_stamp = xtime.tv_sec;
- tw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = xtime.tv_sec;
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
/* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
* do not undertsnad recycling in any case, it not
* a big problem in practice. --ANK */
if (tw->tw_family == AF_INET &&
- sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+ tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
tcp_v4_tw_remember_stamp(tw))
- tcp_tw_schedule(tw, tw->tw_timeout);
+ inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
+ TCP_TIMEWAIT_LEN);
else
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
return TCP_TW_ACK;
}
@@ -213,7 +189,7 @@ kill_with_rst:
*/
if (!paws_reject &&
- (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
@@ -224,19 +200,20 @@ kill_with_rst:
*/
if (sysctl_tcp_rfc1337 == 0) {
kill:
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
}
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
- tw->tw_ts_recent = tmp_opt.rcv_tsval;
- tw->tw_ts_recent_stamp = xtime.tv_sec;
+ tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ tcptw->tw_ts_recent_stamp = xtime.tv_sec;
}
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -258,9 +235,10 @@ kill:
*/
if (th->syn && !th->rst && !th->ack && !paws_reject &&
- (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
- (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
- u32 isn = tw->tw_snd_nxt + 65535 + 2;
+ (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (tmp_opt.saw_tstamp &&
+ (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
* Do not reschedule in the last case.
*/
if (paws_reject || th->ack)
- tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+ inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+ TCP_TIMEWAIT_LEN);
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
*/
return TCP_TW_ACK;
}
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
-/* Enter the time wait state. This is called with locally disabled BH.
- * Essentially we whip up a timewait bucket, copy the
- * relevant info into it from the SK, and mess with hash chains
- * and list linkage.
- */
-static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
-{
- struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
- struct tcp_bind_hashbucket *bhead;
-
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
- binding cache, even if it is closed.
- */
- bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
- spin_lock(&bhead->lock);
- tw->tw_tb = tcp_sk(sk)->bind_hash;
- BUG_TRAP(tcp_sk(sk)->bind_hash);
- tw_add_bind_node(tw, &tw->tw_tb->owners);
- spin_unlock(&bhead->lock);
-
- write_lock(&ehead->lock);
-
- /* Step 2: Remove SK from established hash. */
- if (__sk_del_node_init(sk))
- sock_prot_dec_use(sk->sk_prot);
-
- /* Step 3: Hash TW into TIMEWAIT half of established hash table. */
- tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
- atomic_inc(&tw->tw_refcnt);
-
- write_unlock(&ehead->lock);
-}
-
/*
* Move a socket to time-wait or dead fin-wait-2 state.
*/
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
- struct tcp_tw_bucket *tw = NULL;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_timewait_sock *tw = NULL;
+ const struct tcp_sock *tp = tcp_sk(sk);
int recycle_ok = 0;
- if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+ if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = tp->af_specific->remember_stamp(sk);
- if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
- tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
-
- if(tw != NULL) {
- struct inet_sock *inet = inet_sk(sk);
- int rto = (tp->rto<<2) - (tp->rto>>1);
-
- /* Give us an identity. */
- tw->tw_daddr = inet->daddr;
- tw->tw_rcv_saddr = inet->rcv_saddr;
- tw->tw_bound_dev_if = sk->sk_bound_dev_if;
- tw->tw_num = inet->num;
- tw->tw_state = TCP_TIME_WAIT;
- tw->tw_substate = state;
- tw->tw_sport = inet->sport;
- tw->tw_dport = inet->dport;
- tw->tw_family = sk->sk_family;
- tw->tw_reuse = sk->sk_reuse;
- tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
- atomic_set(&tw->tw_refcnt, 1);
+ if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+ tw = inet_twsk_alloc(sk, state);
+
+ if (tw != NULL) {
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
- tw->tw_hashent = sk->sk_hashent;
- tw->tw_rcv_nxt = tp->rcv_nxt;
- tw->tw_snd_nxt = tp->snd_nxt;
- tw->tw_rcv_wnd = tcp_receive_window(tp);
- tw->tw_ts_recent = tp->rx_opt.ts_recent;
- tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
- tw_dead_node_init(tw);
+ tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ tcptw->tw_rcv_nxt = tp->rcv_nxt;
+ tcptw->tw_snd_nxt = tp->snd_nxt;
+ tcptw->tw_rcv_wnd = tcp_receive_window(tp);
+ tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
+ tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
- tw->tw_v6_ipv6only = np->ipv6only;
- } else {
- memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
- memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
- tw->tw_v6_ipv6only = 0;
+ ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
+ ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_ipv6only = np->ipv6only;
}
#endif
/* Linkage updates. */
- __tcp_tw_hashdance(sk, tw);
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
timeo = TCP_TIMEWAIT_LEN;
}
- tcp_tw_schedule(tw, timeo);
- tcp_tw_put(tw);
+ inet_twsk_schedule(tw, &tcp_death_row, timeo,
+ TCP_TIMEWAIT_LEN);
+ inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
@@ -407,352 +336,35 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcp_done(sk);
}
-/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-static int tcp_tw_death_row_slot;
-
-static void tcp_twkill(unsigned long);
-
-/* TIME_WAIT reaping mechanism. */
-#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
-#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
-
-#define TCP_TWKILL_QUOTA 100
-
-static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static DEFINE_SPINLOCK(tw_death_lock);
-static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
-static void twkill_work(void *);
-static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
-static u32 twkill_thread_slots;
-
-/* Returns non-zero if quota exceeded. */
-static int tcp_do_twkill_work(int slot, unsigned int quota)
-{
- struct tcp_tw_bucket *tw;
- struct hlist_node *node;
- unsigned int killed;
- int ret;
-
- /* NOTE: compare this to previous version where lock
- * was released after detaching chain. It was racy,
- * because tw buckets are scheduled in not serialized context
- * in 2.3 (with netfilter), and with softnet it is common, because
- * soft irqs are not sequenced.
- */
- killed = 0;
- ret = 0;
-rescan:
- tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
- __tw_del_dead_node(tw);
- spin_unlock(&tw_death_lock);
- tcp_timewait_kill(tw);
- tcp_tw_put(tw);
- killed++;
- spin_lock(&tw_death_lock);
- if (killed > quota) {
- ret = 1;
- break;
- }
-
- /* While we dropped tw_death_lock, another cpu may have
- * killed off the next TW bucket in the list, therefore
- * do a fresh re-read of the hlist head node with the
- * lock reacquired. We still use the hlist traversal
- * macro in order to get the prefetches.
- */
- goto rescan;
- }
-
- tcp_tw_count -= killed;
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
- return ret;
-}
-
-static void tcp_twkill(unsigned long dummy)
-{
- int need_timer, ret;
-
- spin_lock(&tw_death_lock);
-
- if (tcp_tw_count == 0)
- goto out;
-
- need_timer = 0;
- ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
- if (ret) {
- twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
- mb();
- schedule_work(&tcp_twkill_work);
- need_timer = 1;
- } else {
- /* We purged the entire slot, anything left? */
- if (tcp_tw_count)
- need_timer = 1;
- }
- tcp_tw_death_row_slot =
- ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
- if (need_timer)
- mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
-out:
- spin_unlock(&tw_death_lock);
-}
-
-extern void twkill_slots_invalid(void);
-
-static void twkill_work(void *dummy)
-{
- int i;
-
- if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
- twkill_slots_invalid();
-
- while (twkill_thread_slots) {
- spin_lock_bh(&tw_death_lock);
- for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
- if (!(twkill_thread_slots & (1 << i)))
- continue;
-
- while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
- if (need_resched()) {
- spin_unlock_bh(&tw_death_lock);
- schedule();
- spin_lock_bh(&tw_death_lock);
- }
- }
-
- twkill_thread_slots &= ~(1 << i);
- }
- spin_unlock_bh(&tw_death_lock);
- }
-}
-
-/* These are always called from BH context. See callers in
- * tcp_input.c to verify this.
- */
-
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
-{
- spin_lock(&tw_death_lock);
- if (tw_del_dead_node(tw)) {
- tcp_tw_put(tw);
- if (--tcp_tw_count == 0)
- del_timer(&tcp_tw_timer);
- }
- spin_unlock(&tw_death_lock);
- tcp_timewait_kill(tw);
-}
-
-/* Short-time timewait calendar */
-
-static int tcp_twcal_hand = -1;
-static int tcp_twcal_jiffie;
-static void tcp_twcal_tick(unsigned long);
-static struct timer_list tcp_twcal_timer =
- TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
-static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
-
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
-{
- struct hlist_head *list;
- int slot;
-
- /* timeout := RTO * 3.5
- *
- * 3.5 = 1+2+0.5 to wait for two retransmits.
- *
- * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
- * our ACK acking that FIN can be lost. If N subsequent retransmitted
- * FINs (or previous seqments) are lost (probability of such event
- * is p^(N+1), where p is probability to lose single packet and
- * time to detect the loss is about RTO*(2^N - 1) with exponential
- * backoff). Normal timewait length is calculated so, that we
- * waited at least for one retransmitted FIN (maximal RTO is 120sec).
- * [ BTW Linux. following BSD, violates this requirement waiting
- * only for 60sec, we should wait at least for 240 secs.
- * Well, 240 consumes too much of resources 8)
- * ]
- * This interval is not reduced to catch old duplicate and
- * responces to our wandering segments living for two MSLs.
- * However, if we use PAWS to detect
- * old duplicates, we can reduce the interval to bounds required
- * by RTO, rather than MSL. So, if peer understands PAWS, we
- * kill tw bucket after 3.5*RTO (it is important that this number
- * is greater than TS tick!) and detect old duplicates with help
- * of PAWS.
- */
- slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
-
- spin_lock(&tw_death_lock);
-
- /* Unlink it, if it was scheduled */
- if (tw_del_dead_node(tw))
- tcp_tw_count--;
- else
- atomic_inc(&tw->tw_refcnt);
-
- if (slot >= TCP_TW_RECYCLE_SLOTS) {
- /* Schedule to slow timer */
- if (timeo >= TCP_TIMEWAIT_LEN) {
- slot = TCP_TWKILL_SLOTS-1;
- } else {
- slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
- if (slot >= TCP_TWKILL_SLOTS)
- slot = TCP_TWKILL_SLOTS-1;
- }
- tw->tw_ttd = jiffies + timeo;
- slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
- list = &tcp_tw_death_row[slot];
- } else {
- tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
-
- if (tcp_twcal_hand < 0) {
- tcp_twcal_hand = 0;
- tcp_twcal_jiffie = jiffies;
- tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
- add_timer(&tcp_twcal_timer);
- } else {
- if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
- mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
- slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
- }
- list = &tcp_twcal_row[slot];
- }
-
- hlist_add_head(&tw->tw_death_node, list);
-
- if (tcp_tw_count++ == 0)
- mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
- spin_unlock(&tw_death_lock);
-}
-
-void tcp_twcal_tick(unsigned long dummy)
-{
- int n, slot;
- unsigned long j;
- unsigned long now = jiffies;
- int killed = 0;
- int adv = 0;
-
- spin_lock(&tw_death_lock);
- if (tcp_twcal_hand < 0)
- goto out;
-
- slot = tcp_twcal_hand;
- j = tcp_twcal_jiffie;
-
- for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
- if (time_before_eq(j, now)) {
- struct hlist_node *node, *safe;
- struct tcp_tw_bucket *tw;
-
- tw_for_each_inmate_safe(tw, node, safe,
- &tcp_twcal_row[slot]) {
- __tw_del_dead_node(tw);
- tcp_timewait_kill(tw);
- tcp_tw_put(tw);
- killed++;
- }
- } else {
- if (!adv) {
- adv = 1;
- tcp_twcal_jiffie = j;
- tcp_twcal_hand = slot;
- }
-
- if (!hlist_empty(&tcp_twcal_row[slot])) {
- mod_timer(&tcp_twcal_timer, j);
- goto out;
- }
- }
- j += (1<<TCP_TW_RECYCLE_TICK);
- slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
- }
- tcp_twcal_hand = -1;
-
-out:
- if ((tcp_tw_count -= killed) == 0)
- del_timer(&tcp_tw_timer);
- NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
- spin_unlock(&tw_death_lock);
-}
-
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
* Actually, we could lots of memory writes here. tp of listening
* socket contains all necessary default parameters.
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
- /* allocate the newsk from the same slab of the master sock,
- * if not, at sk_free time we'll try to free it from the wrong
- * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
- struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+ struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
- if(newsk != NULL) {
+ if (newsk != NULL) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(sk);
struct tcp_sock *newtp;
- struct sk_filter *filter;
-
- memcpy(newsk, sk, sizeof(struct tcp_sock));
- newsk->sk_state = TCP_SYN_RECV;
-
- /* SANITY */
- sk_node_init(&newsk->sk_node);
- tcp_sk(newsk)->bind_hash = NULL;
-
- /* Clone the TCP header template */
- inet_sk(newsk)->dport = req->rmt_port;
-
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
-
- rwlock_init(&newsk->sk_dst_lock);
- atomic_set(&newsk->sk_rmem_alloc, 0);
- skb_queue_head_init(&newsk->sk_receive_queue);
- atomic_set(&newsk->sk_wmem_alloc, 0);
- skb_queue_head_init(&newsk->sk_write_queue);
- atomic_set(&newsk->sk_omem_alloc, 0);
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
-
- sock_reset_flag(newsk, SOCK_DONE);
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_send_head = NULL;
- rwlock_init(&newsk->sk_callback_lock);
- skb_queue_head_init(&newsk->sk_error_queue);
- newsk->sk_write_space = sk_stream_write_space;
-
- if ((filter = newsk->sk_filter) != NULL)
- sk_filter_charge(newsk, filter);
-
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- return NULL;
- }
/* Now setup tcp_sock */
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
- newtp->rcv_nxt = req->rcv_isn + 1;
- newtp->snd_nxt = req->snt_isn + 1;
- newtp->snd_una = req->snt_isn + 1;
- newtp->snd_sml = req->snt_isn + 1;
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+ newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
tcp_prequeue_init(newtp);
- tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
+ tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
- newtp->retransmits = 0;
- newtp->backoff = 0;
newtp->srtt = 0;
newtp->mdev = TCP_TIMEOUT_INIT;
- newtp->rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
newtp->left_out = 0;
@@ -772,54 +384,40 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
- tcp_set_ca_state(newtp, TCP_CA_Open);
+ newicsk->icsk_ca_ops = &tcp_reno;
+
+ tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->rcv_wup = req->rcv_isn + 1;
- newtp->write_seq = req->snt_isn + 1;
+ newtp->rcv_wup = treq->rcv_isn + 1;
+ newtp->write_seq = treq->snt_isn + 1;
newtp->pushed_seq = newtp->write_seq;
- newtp->copied_seq = req->rcv_isn + 1;
+ newtp->copied_seq = treq->rcv_isn + 1;
newtp->rx_opt.saw_tstamp = 0;
newtp->rx_opt.dsack = 0;
newtp->rx_opt.eff_sacks = 0;
- newtp->probes_out = 0;
newtp->rx_opt.num_sacks = 0;
newtp->urg_data = 0;
- newtp->listen_opt = NULL;
- newtp->accept_queue = newtp->accept_queue_tail = NULL;
- /* Deinitialize syn_wait_lock to trap illegal accesses. */
- memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
-
- /* Back to base struct sock members. */
- newsk->sk_err = 0;
- newsk->sk_priority = 0;
- atomic_set(&newsk->sk_refcnt, 2);
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
- atomic_inc(&tcp_sockets_allocated);
if (sock_flag(newsk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
- newsk->sk_socket = NULL;
- newsk->sk_sleep = NULL;
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
- newtp->rx_opt.tstamp_ok = req->tstamp_ok;
- if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
+ newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+ if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
if (sysctl_tcp_fack)
newtp->rx_opt.sack_ok |= 2;
}
newtp->window_clamp = req->window_clamp;
newtp->rcv_ssthresh = req->rcv_wnd;
newtp->rcv_wnd = req->rcv_wnd;
- newtp->rx_opt.wscale_ok = req->wscale_ok;
+ newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
- newtp->rx_opt.snd_wscale = req->snd_wscale;
- newtp->rx_opt.rcv_wscale = req->rcv_wscale;
+ newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+ newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
} else {
newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
newtp->window_clamp = min(newtp->window_clamp, 65535U);
@@ -836,14 +434,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->tcp_header_len = sizeof(struct tcphdr);
}
if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
- newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
if (newtp->ecn_flags&TCP_ECN_OK)
sock_set_flag(newsk, SOCK_NO_LARGESEND);
- tcp_ca_init(newtp);
-
TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
}
return newsk;
@@ -851,12 +447,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
/*
* Process an incoming packet for SYN_RECV sockets represented
- * as an open_request.
+ * as a request_sock.
*/
struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
- struct open_request *req,
- struct open_request **prev)
+ struct request_sock *req,
+ struct request_sock **prev)
{
struct tcphdr *th = skb->h.th;
struct tcp_sock *tp = tcp_sk(sk);
@@ -881,7 +477,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
}
/* Check for pure retransmitted SYN. */
- if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
flg == TCP_FLAG_SYN &&
!paws_reject) {
/*
@@ -901,7 +497,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
*/
- req->class->rtx_syn_ack(sk, req, NULL);
+ req->rsk_ops->rtx_syn_ack(sk, req, NULL);
return NULL;
}
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
does sequence test, SYN is truncated, and thus we consider
it a bare ACK.
- If tp->defer_accept, we silently drop this bare ACK. Otherwise,
- we create an established connection. Both ends (listening sockets)
- accept the new incoming connection and try to talk to each other. 8-)
+ If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+ bare ACK. Otherwise, we create an established connection. Both
+ ends (listening sockets) accept the new incoming connection and try
+ to talk to each other. 8-)
Note: This case is both harmless, and rare. Possibility is about the
same as us discovering intelligent life on another plant tomorrow.
@@ -959,7 +556,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
* Invalid ACK: reset will be sent by listening socket
*/
if ((flg & TCP_FLAG_ACK) &&
- (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
+ (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -970,10 +567,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
- req->class->send_ack(skb, req);
+ req->rsk_ops->send_ack(skb, req);
if (paws_reject)
NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
return NULL;
@@ -981,12 +578,12 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* In sequence, PAWS is OK. */
- if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
req->ts_recent = tmp_opt.rcv_tsval;
- if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
/* Truncate SYN, it is out of window starting
- at req->rcv_isn+1. */
+ at tcp_rsk(req)->rcv_isn + 1. */
flg &= ~TCP_FLAG_SYN;
}
@@ -1003,8 +600,9 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
return NULL;
/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
- if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
- req->acked = 1;
+ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+ inet_rsk(req)->acked = 1;
return NULL;
}
@@ -1018,24 +616,24 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
if (child == NULL)
goto listen_overflow;
- tcp_synq_unlink(tp, req, prev);
- tcp_synq_removed(sk, req);
+ inet_csk_reqsk_queue_unlink(sk, req, prev);
+ inet_csk_reqsk_queue_removed(sk, req);
- tcp_acceptq_queue(sk, req, child);
+ inet_csk_reqsk_queue_add(sk, req, child);
return child;
listen_overflow:
if (!sysctl_tcp_abort_on_overflow) {
- req->acked = 1;
+ inet_rsk(req)->acked = 1;
return NULL;
}
embryonic_reset:
NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
if (!(flg & TCP_FLAG_RST))
- req->class->send_reset(skb);
+ req->rsk_ops->send_reset(skb);
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);
-EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa24e7a..6094db5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
-int sysctl_tcp_tso_win_divisor = 8;
+int sysctl_tcp_tso_win_divisor = 3;
static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
struct sk_buff *skb)
@@ -105,19 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
* This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
+ struct tcp_sock *tp = tcp_sk(sk);
s32 delta = tcp_time_stamp - tp->lsndtime;
u32 restart_cwnd = tcp_init_cwnd(tp, dst);
u32 cwnd = tp->snd_cwnd;
- if (tcp_is_vegas(tp))
- tcp_vegas_enable(tp);
+ tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
restart_cwnd = min(restart_cwnd, cwnd);
- while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+ while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -127,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
static inline void tcp_event_data_sent(struct tcp_sock *tp,
struct sk_buff *skb, struct sock *sk)
{
- u32 now = tcp_time_stamp;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const u32 now = tcp_time_stamp;
- if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
- tcp_cwnd_restart(tp, __sk_dst_get(sk));
+ if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+ tcp_cwnd_restart(sk, __sk_dst_get(sk));
tp->lsndtime = now;
/* If it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
- tp->ack.pingpong = 1;
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ icsk->icsk_ack.pingpong = 1;
}
-static __inline__ void tcp_event_ack_sent(struct sock *sk)
+static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_dec_quickack_mode(tp);
- tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+ tcp_dec_quickack_mode(sk, pkts);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/* Determine a window scaling and initial window to offer.
@@ -266,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (skb != NULL) {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,6 +280,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
#define SYSCTL_FLAG_WSCALE 0x2
#define SYSCTL_FLAG_SACK 0x4
+ /* If congestion control is doing timestamping */
+ if (icsk->icsk_ca_ops->rtt_sample)
+ __net_timestamp(skb);
+
sysctl_flags = 0;
if (tcb->flags & TCPCB_FLAG_SYN) {
tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +308,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
(tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
}
- /*
- * If the connection is idle and we are restarting,
- * then we don't want to do any Vegas calculations
- * until we get fresh RTT samples. So when we
- * restart, we reset our Vegas state to a clean
- * slate. After we get acks for this flight of
- * packets, _then_ we can make Vegas calculations
- * again.
- */
- if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
- tcp_vegas_enable(tp);
+ if (tcp_packets_in_flight(tp) == 0)
+ tcp_ca_event(sk, CA_EVENT_TX_START);
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
@@ -361,7 +356,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
tp->af_specific->send_check(sk, th, skb->len, skb);
if (tcb->flags & TCPCB_FLAG_ACK)
- tcp_event_ack_sent(sk);
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, skb, sk);
@@ -372,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
if (err <= 0)
return err;
- tcp_enter_cwr(tp);
+ tcp_enter_cwr(sk);
/* NET_XMIT_CN is special. It does not guarantee,
* that this packet is lost. It tells that device
@@ -409,42 +404,9 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
sk->sk_send_head = skb;
}
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
- /* Force push to be on for any TSO frames to workaround
- * problems with busted implementations like Mac OS-X that
- * hold off socket receive wakeups until push is seen.
- */
- if (tcp_skb_pcount(skb) > 1)
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
- */
-void tcp_push_one(struct sock *sk, unsigned cur_mss)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = sk->sk_send_head;
-
- if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
- /* Send it out now. */
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
- if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
- sk->sk_send_head = NULL;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
- tcp_packets_out_inc(sk, tp, skb);
- return;
- }
- }
-}
-
-void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
+static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (skb->len <= tp->mss_cache_std ||
+ if (skb->len <= mss_now ||
!(sk->sk_route_caps & NETIF_F_TSO)) {
/* Avoid the costly divide in the normal
* non-TSO case.
@@ -454,10 +416,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
} else {
unsigned int factor;
- factor = skb->len + (tp->mss_cache_std - 1);
- factor /= tp->mss_cache_std;
+ factor = skb->len + (mss_now - 1);
+ factor /= mss_now;
skb_shinfo(skb)->tso_segs = factor;
- skb_shinfo(skb)->tso_size = tp->mss_cache_std;
+ skb_shinfo(skb)->tso_size = mss_now;
}
}
@@ -466,11 +428,11 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int nsize;
+ int nsize, old_factor;
u16 flags;
nsize = skb_headlen(skb) - len;
@@ -521,28 +483,41 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
* skbs, which it never sent before. --ANK
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+ buff->tstamp = skb->tstamp;
if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
tp->lost_out -= tcp_skb_pcount(skb);
tp->left_out -= tcp_skb_pcount(skb);
}
- /* Fix up tso_factor for both original and new SKB. */
- tcp_set_skb_tso_segs(sk, skb);
- tcp_set_skb_tso_segs(sk, buff);
+ old_factor = tcp_skb_pcount(skb);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(skb);
- tp->left_out += tcp_skb_pcount(skb);
- }
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
- if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
- tp->lost_out += tcp_skb_pcount(buff);
- tp->left_out += tcp_skb_pcount(buff);
+ /* If this packet has been sent out already, we must
+ * adjust the various packet counters.
+ */
+ if (after(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+ int diff = old_factor - tcp_skb_pcount(skb) -
+ tcp_skb_pcount(buff);
+
+ tp->packets_out -= diff;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+ tp->lost_out -= diff;
+ tp->left_out -= diff;
+ }
+ if (diff > 0) {
+ tp->fackets_out -= diff;
+ if ((int)tp->fackets_out < 0)
+ tp->fackets_out = 0;
+ }
}
/* Link BUFF into the send queue. */
- __skb_append(skb, buff);
+ skb_header_release(buff);
+ __skb_append(skb, buff, &sk->sk_write_queue);
return 0;
}
@@ -604,7 +579,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
return 0;
}
@@ -662,7 +637,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* And store cached results */
tp->pmtu_cookie = pmtu;
- tp->mss_cache = tp->mss_cache_std = mss_now;
+ tp->mss_cache = mss_now;
return mss_now;
}
@@ -674,57 +649,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
* cannot be large. However, taking into account rare use of URG, this
* is not a big flaw.
*/
-
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
- unsigned int do_large, mss_now;
+ u32 mss_now;
+ u16 xmit_size_goal;
+ int doing_tso = 0;
+
+ mss_now = tp->mss_cache;
+
+ if (large_allowed &&
+ (sk->sk_route_caps & NETIF_F_TSO) &&
+ !tp->urg_mode)
+ doing_tso = 1;
- mss_now = tp->mss_cache_std;
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != tp->pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
- do_large = (large &&
- (sk->sk_route_caps & NETIF_F_TSO) &&
- !tp->urg_mode);
+ if (tp->rx_opt.eff_sacks)
+ mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+ (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
- if (do_large) {
- unsigned int large_mss, factor, limit;
+ xmit_size_goal = mss_now;
- large_mss = 65535 - tp->af_specific->net_header_len -
+ if (doing_tso) {
+ xmit_size_goal = 65535 -
+ tp->af_specific->net_header_len -
tp->ext_header_len - tp->tcp_header_len;
- if (tp->max_window && large_mss > (tp->max_window>>1))
- large_mss = max((tp->max_window>>1),
- 68U - tp->tcp_header_len);
+ if (tp->max_window &&
+ (xmit_size_goal > (tp->max_window >> 1)))
+ xmit_size_goal = max((tp->max_window >> 1),
+ 68U - tp->tcp_header_len);
- factor = large_mss / mss_now;
+ xmit_size_goal -= (xmit_size_goal % mss_now);
+ }
+ tp->xmit_size_goal = xmit_size_goal;
- /* Always keep large mss multiple of real mss, but
- * do not exceed 1/tso_win_divisor of the congestion window
- * so we can keep the ACK clock ticking and minimize
- * bursting.
- */
- limit = tp->snd_cwnd;
- if (sysctl_tcp_tso_win_divisor)
- limit /= sysctl_tcp_tso_win_divisor;
- limit = max(1U, limit);
- if (factor > limit)
- factor = limit;
+ return mss_now;
+}
- tp->mss_cache = mss_now * factor;
+/* Congestion window validation. (RFC2861) */
- mss_now = tp->mss_cache;
+static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+{
+ __u32 packets_out = tp->packets_out;
+
+ if (packets_out >= tp->snd_cwnd) {
+ /* Network is feed fully. */
+ tp->snd_cwnd_used = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ } else {
+ /* Network starves. */
+ if (tp->packets_out > tp->snd_cwnd_used)
+ tp->snd_cwnd_used = tp->packets_out;
+
+ if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
+ tcp_cwnd_application_limited(sk);
}
+}
- if (tp->rx_opt.eff_sacks)
- mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
- (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
- return mss_now;
+static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+{
+ u32 window, cwnd_len;
+
+ window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
+ cwnd_len = mss_now * cwnd;
+ return min(window, cwnd_len);
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules? If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ u32 in_flight, cwnd;
+
+ /* Don't be strict about the congestion window for the final FIN. */
+ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ return 1;
+
+ in_flight = tcp_packets_in_flight(tp);
+ cwnd = tp->snd_cwnd;
+ if (in_flight < cwnd)
+ return (cwnd - in_flight);
+
+ return 0;
+}
+
+/* This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+{
+ int tso_segs = tcp_skb_pcount(skb);
+
+ if (!tso_segs ||
+ (tso_segs > 1 &&
+ skb_shinfo(skb)->tso_size != mss_now)) {
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tso_segs = tcp_skb_pcount(skb);
+ }
+ return tso_segs;
+}
+
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+ return after(tp->snd_sml,tp->snd_una) &&
+ !after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0, if packet can be sent now without violation Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ * With Minshall's modification: all sent small packets are ACKed.
+ */
+
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ unsigned mss_now, int nonagle)
+{
+ return (skb->len < mss_now &&
+ ((nonagle&TCP_NAGLE_CORK) ||
+ (!nonagle &&
+ tp->packets_out &&
+ tcp_minshall_check(tp))));
+}
+
+/* Return non-zero if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ /* Nagle rule does not apply to frames, which sit in the middle of the
+ * write_queue (they have no chances to get new data).
+ *
+ * This is implemented in the callers, where they modify the 'nonagle'
+ * argument based upon the location of SKB in the send queue.
+ */
+ if (nonagle & TCP_NAGLE_PUSH)
+ return 1;
+
+ /* Don't use the nagle rule for urgent data (or for the final FIN). */
+ if (tp->urg_mode ||
+ (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+ return 1;
+
+ if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+ return 1;
+
+ return 0;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+{
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (skb->len > cur_mss)
+ end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+ return !after(end_seq, tp->snd_una + tp->snd_wnd);
+}
+
+/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
+ * should be put on the wire right now. If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+ unsigned int cur_mss, int nonagle)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int cwnd_quota;
+
+ tcp_init_tso_segs(sk, skb, cur_mss);
+
+ if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+ return 0;
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (cwnd_quota &&
+ !tcp_snd_wnd_test(tp, skb, cur_mss))
+ cwnd_quota = 0;
+
+ return cwnd_quota;
+}
+
+static inline int tcp_skb_is_last(const struct sock *sk,
+ const struct sk_buff *skb)
+{
+ return skb->next == (struct sk_buff *)&sk->sk_write_queue;
+}
+
+int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
+{
+ struct sk_buff *skb = sk->sk_send_head;
+
+ return (skb &&
+ tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ (tcp_skb_is_last(sk, skb) ?
+ TCP_NAGLE_PUSH :
+ tp->nonagle)));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list. It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation. In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now)
+{
+ struct sk_buff *buff;
+ int nlen = skb->len - len;
+ u16 flags;
+
+ /* All of a TSO frame must be composed of paged data. */
+ if (skb->len != skb->data_len)
+ return tcp_fragment(sk, skb, len, mss_now);
+
+ buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+ if (unlikely(buff == NULL))
+ return -ENOMEM;
+
+ buff->truesize = nlen;
+ skb->truesize -= nlen;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+ /* PSH and FIN should only be set in the second packet. */
+ flags = TCP_SKB_CB(skb)->flags;
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ TCP_SKB_CB(buff)->flags = flags;
+
+ /* This packet was never sent out yet, so no SACK bits. */
+ TCP_SKB_CB(buff)->sacked = 0;
+
+ buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+ skb_split(skb, buff, len);
+
+ /* Fix up tso_factor for both original and new SKB. */
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ tcp_set_skb_tso_segs(sk, buff, mss_now);
+
+ /* Link BUFF into the send queue. */
+ skb_header_release(buff);
+ __skb_append(skb, buff, &sk->sk_write_queue);
+
+ return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do. View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 send_win, cong_win, limit, in_flight;
+
+ if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ return 0;
+
+ if (icsk->icsk_ca_state != TCP_CA_Open)
+ return 0;
+
+ in_flight = tcp_packets_in_flight(tp);
+
+ BUG_ON(tcp_skb_pcount(skb) <= 1 ||
+ (tp->snd_cwnd <= in_flight));
+
+ send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+
+ /* From in_flight test above, we know that cwnd > in_flight. */
+ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+ limit = min(send_win, cong_win);
+
+ if (sysctl_tcp_tso_win_divisor) {
+ u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+ /* If at least some fraction of a window is available,
+ * just use it.
+ */
+ chunk /= sysctl_tcp_tso_win_divisor;
+ if (limit >= chunk)
+ return 0;
+ } else {
+ /* Different approach, try not to defer past a single
+ * ACK. Receiver should ACK every other full sized
+ * frame, so if we have space for more than 3 frames
+ * then send now.
+ */
+ if (limit > tcp_max_burst(tp) * tp->mss_cache)
+ return 0;
+ }
+
+ /* Ok, it looks like it is advisable to defer. */
+ return 1;
}
/* This routine writes packets to the network. It advances the
@@ -734,57 +968,142 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
* Returns 1, if no segments are in flight and we have queued segments, but
* cannot send anything now because of SWS or another problem.
*/
-int tcp_write_xmit(struct sock *sk, int nonagle)
+static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned int mss_now;
+ struct sk_buff *skb;
+ unsigned int tso_segs, sent_pkts;
+ int cwnd_quota;
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and all
* will be happy.
*/
- if (sk->sk_state != TCP_CLOSE) {
- struct sk_buff *skb;
- int sent_pkts = 0;
+ if (unlikely(sk->sk_state == TCP_CLOSE))
+ return 0;
- /* Account for SACKS, we may need to fragment due to this.
- * It is just like the real MSS changing on us midstream.
- * We also handle things correctly when the user adds some
- * IP options mid-stream. Silly to do, but cover it.
- */
- mss_now = tcp_current_mss(sk, 1);
-
- while ((skb = sk->sk_send_head) &&
- tcp_snd_test(sk, skb, mss_now,
- tcp_skb_is_last(sk, skb) ? nonagle :
- TCP_NAGLE_PUSH)) {
- if (skb->len > mss_now) {
- if (tcp_fragment(sk, skb, mss_now))
- break;
- }
+ sent_pkts = 0;
+ while ((skb = sk->sk_send_head)) {
+ unsigned int limit;
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
- if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ BUG_ON(!tso_segs);
+
+ cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (!cwnd_quota)
+ break;
+
+ if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+ break;
+
+ if (tso_segs == 1) {
+ if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+ (tcp_skb_is_last(sk, skb) ?
+ nonagle : TCP_NAGLE_PUSH))))
+ break;
+ } else {
+ if (tcp_tso_should_defer(sk, tp, skb))
break;
+ }
- /* Advance the send_head. This one is sent out.
- * This call will increment packets_out.
- */
- update_send_head(sk, tp, skb);
+ limit = mss_now;
+ if (tso_segs > 1) {
+ limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
+
+ if (skb->len < limit) {
+ unsigned int trim = skb->len % mss_now;
- tcp_minshall_update(tp, mss_now, skb);
- sent_pkts = 1;
+ if (trim)
+ limit = skb->len - trim;
+ }
}
- if (sent_pkts) {
- tcp_cwnd_validate(sk, tp);
- return 0;
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ break;
+
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+ if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+ break;
+
+ /* Advance the send_head. This one is sent out.
+ * This call will increment packets_out.
+ */
+ update_send_head(sk, tp, skb);
+
+ tcp_minshall_update(tp, mss_now, skb);
+ sent_pkts++;
+ }
+
+ if (likely(sent_pkts)) {
+ tcp_cwnd_validate(sk, tp);
+ return 0;
+ }
+ return !tp->packets_out && sk->sk_send_head;
+}
+
+/* Push out any pending frames which were held back due to
+ * TCP_CORK or attempt at coalescing tiny packets.
+ * The socket must be locked by the caller.
+ */
+void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
+ unsigned int cur_mss, int nonagle)
+{
+ struct sk_buff *skb = sk->sk_send_head;
+
+ if (skb) {
+ if (tcp_write_xmit(sk, cur_mss, nonagle))
+ tcp_check_probe_timer(sk, tp);
+ }
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = sk->sk_send_head;
+ unsigned int tso_segs, cwnd_quota;
+
+ BUG_ON(!skb || skb->len < mss_now);
+
+ tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
+ cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+
+ if (likely(cwnd_quota)) {
+ unsigned int limit;
+
+ BUG_ON(!tso_segs);
+
+ limit = mss_now;
+ if (tso_segs > 1) {
+ limit = tcp_window_allows(tp, skb,
+ mss_now, cwnd_quota);
+
+ if (skb->len < limit) {
+ unsigned int trim = skb->len % mss_now;
+
+ if (trim)
+ limit = skb->len - trim;
+ }
}
- return !tp->packets_out && sk->sk_send_head;
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ return;
+
+ /* Send it out now. */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+ if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+ update_send_head(sk, tp, skb);
+ tcp_cwnd_validate(sk, tp);
+ return;
+ }
}
- return 0;
}
/* This function returns the amount that we can raise the
@@ -841,6 +1160,7 @@ int tcp_write_xmit(struct sock *sk, int nonagle)
*/
u32 __tcp_select_window(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* MSS for the peer's data. Previous verions used mss_clamp
* here. I don't know if the value based on our guesses
@@ -848,7 +1168,7 @@ u32 __tcp_select_window(struct sock *sk)
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
- int mss = tp->ack.rcv_mss;
+ int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
int window;
@@ -857,7 +1177,7 @@ u32 __tcp_select_window(struct sock *sk)
mss = full_space;
if (free_space < full_space/2) {
- tp->ack.quick = 0;
+ icsk->icsk_ack.quick = 0;
if (tcp_memory_pressure)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -932,7 +1252,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
tcp_skb_pcount(next_skb) != 1);
/* Ok. We will be able to collapse the packet. */
- __skb_unlink(next_skb, next_skb->list);
+ __skb_unlink(next_skb, &sk->sk_write_queue);
memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -980,6 +1300,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
*/
void tcp_simple_retransmit(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int mss = tcp_current_mss(sk, 0);
@@ -1010,12 +1331,12 @@ void tcp_simple_retransmit(struct sock *sk)
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
- if (tp->ca_state != TCP_CA_Loss) {
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(tp);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->prior_ssthresh = 0;
tp->undo_marker = 0;
- tcp_set_ca_state(tp, TCP_CA_Loss);
+ tcp_set_ca_state(sk, TCP_CA_Loss);
}
tcp_xmit_retransmit_queue(sk);
}
@@ -1040,13 +1361,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
-
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sk->sk_route_caps &= ~NETIF_F_TSO;
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- tp->mss_cache = tp->mss_cache_std;
- }
-
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return -ENOMEM;
}
@@ -1061,16 +1375,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
return -EAGAIN;
if (skb->len > cur_mss) {
- int old_factor = tcp_skb_pcount(skb);
- int new_factor;
-
- if (tcp_fragment(sk, skb, cur_mss))
+ if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
-
- /* New SKB created, account for it. */
- new_factor = tcp_skb_pcount(skb);
- tp->packets_out -= old_factor - new_factor;
- tp->packets_out += tcp_skb_pcount(skb->next);
}
/* Collapse two adjacent packets if worthwhile and we can. */
@@ -1106,7 +1412,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* is still in somebody's hands, else make a clone.
*/
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
pskb_copy(skb, GFP_ATOMIC):
@@ -1151,6 +1456,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int packet_cnt = tp->lost_out;
@@ -1174,14 +1480,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
if (tcp_retransmit_skb(sk, skb))
return;
- if (tp->ca_state != TCP_CA_Loss)
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
if (skb ==
skb_peek(&sk->sk_write_queue))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
}
packet_cnt -= tcp_skb_pcount(skb);
@@ -1194,7 +1502,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
/* OK, demanded retransmission is finished. */
/* Forward retransmissions are possible only during Recovery. */
- if (tp->ca_state != TCP_CA_Recovery)
+ if (icsk->icsk_ca_state != TCP_CA_Recovery)
return;
/* No forward retransmissions in Reno are possible. */
@@ -1234,7 +1542,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
break;
if (skb == skb_peek(&sk->sk_write_queue))
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX);
NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
}
@@ -1263,7 +1573,7 @@ void tcp_send_fin(struct sock *sk)
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
- skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
if (skb)
break;
yield();
@@ -1290,7 +1600,7 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, int priority)
+void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1356,8 +1666,9 @@ int tcp_send_synack(struct sock *sk)
* Prepare a SYN-ACK.
*/
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct open_request *req)
+ struct request_sock *req)
{
+ struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
int tcp_header_size;
@@ -1373,47 +1684,47 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb->dst = dst_clone(dst);
tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
- (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
- (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
+ (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
+ (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
/* SACK_PERM is in the place of NOP NOP of TS */
- ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+ ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
if (dst->dev->features&NETIF_F_TSO)
- req->ecn_ok = 0;
+ ireq->ecn_ok = 0;
TCP_ECN_make_synack(req, th);
th->source = inet_sk(sk)->sport;
- th->dest = req->rmt_port;
- TCP_SKB_CB(skb)->seq = req->snt_isn;
+ th->dest = ireq->rmt_port;
+ TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
TCP_SKB_CB(skb)->sacked = 0;
skb_shinfo(skb)->tso_segs = 1;
skb_shinfo(skb)->tso_size = 0;
th->seq = htonl(TCP_SKB_CB(skb)->seq);
- th->ack_seq = htonl(req->rcv_isn + 1);
+ th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
__u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
- dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rcv_wnd,
&req->window_clamp,
- req->wscale_ok,
+ ireq->wscale_ok,
&rcv_wscale);
- req->rcv_wscale = rcv_wscale;
+ ireq->rcv_wscale = rcv_wscale;
}
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(req->rcv_wnd);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
- req->sack_ok, req->wscale_ok, req->rcv_wscale,
+ tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
+ ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
TCP_SKB_CB(skb)->when,
req->ts_recent);
@@ -1448,7 +1759,6 @@ static inline void tcp_connect_init(struct sock *sk)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(sk);
- tcp_ca_init(tp);
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1470,8 +1780,8 @@ static inline void tcp_connect_init(struct sock *sk)
tp->rcv_wup = 0;
tp->copied_seq = 0;
- tp->rto = TCP_TIMEOUT_INIT;
- tp->retransmits = 0;
+ inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
@@ -1485,7 +1795,7 @@ int tcp_connect(struct sock *sk)
tcp_connect_init(sk);
- buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+ buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
@@ -1502,7 +1812,6 @@ int tcp_connect(struct sock *sk)
TCP_SKB_CB(buff)->end_seq = tp->write_seq;
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
- tcp_ca_init(tp);
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1515,7 +1824,8 @@ int tcp_connect(struct sock *sk)
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
@@ -1525,20 +1835,21 @@ int tcp_connect(struct sock *sk)
*/
void tcp_send_delayed_ack(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int ato = tp->ack.ato;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int ato = icsk->icsk_ack.ato;
unsigned long timeout;
if (ato > TCP_DELACK_MIN) {
+ const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ/2;
- if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+ if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
- * Do not use tp->rto here, use results of rtt measurements
+ * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
if (tp->srtt) {
@@ -1555,21 +1866,22 @@ void tcp_send_delayed_ack(struct sock *sk)
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
- if (tp->ack.pending&TCP_ACK_TIMER) {
+ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer was blocked or is about to expire,
* send ACK now.
*/
- if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+ if (icsk->icsk_ack.blocked ||
+ time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, tp->ack.timeout))
- timeout = tp->ack.timeout;
+ if (!time_before(timeout, icsk->icsk_ack.timeout))
+ timeout = icsk->icsk_ack.timeout;
}
- tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
- tp->ack.timeout = timeout;
- sk_reset_timer(sk, &tp->delack_timer, timeout);
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
@@ -1586,9 +1898,10 @@ void tcp_send_ack(struct sock *sk)
*/
buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (buff == NULL) {
- tcp_schedule_ack(tp);
- tp->ack.ato = TCP_ATO_MIN;
- tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+ inet_csk_schedule_ack(sk);
+ inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, TCP_RTO_MAX);
return;
}
@@ -1669,21 +1982,13 @@ int tcp_write_wakeup(struct sock *sk)
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
- if (tcp_fragment(sk, skb, seg_size))
+ if (tcp_fragment(sk, skb, seg_size, mss))
return -1;
- /* SWS override triggered forced fragmentation.
- * Disable TSO, the connection is too sick. */
- if (sk->sk_route_caps & NETIF_F_TSO) {
- sock_set_flag(sk, SOCK_NO_LARGESEND);
- sk->sk_route_caps &= ~NETIF_F_TSO;
- tp->mss_cache = tp->mss_cache_std;
- }
} else if (!tcp_skb_pcount(skb))
- tcp_set_skb_tso_segs(sk, skb);
+ tcp_set_skb_tso_segs(sk, skb, mss);
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_tso_set_push(skb);
err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
if (!err) {
update_send_head(sk, tp, skb);
@@ -1704,6 +2009,7 @@ int tcp_write_wakeup(struct sock *sk)
*/
void tcp_send_probe0(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int err;
@@ -1711,28 +2017,31 @@ void tcp_send_probe0(struct sock *sk)
if (tp->packets_out || !sk->sk_send_head) {
/* Cancel probe timer, if it is not required. */
- tp->probes_out = 0;
- tp->backoff = 0;
+ icsk->icsk_probes_out = 0;
+ icsk->icsk_backoff = 0;
return;
}
if (err <= 0) {
- if (tp->backoff < sysctl_tcp_retries2)
- tp->backoff++;
- tp->probes_out++;
- tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RTO_MAX));
+ if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ icsk->icsk_backoff++;
+ icsk->icsk_probes_out++;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+ TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember probes_out.
+ * do not backoff and do not remember icsk_probes_out.
* Let local senders to fight for local resources.
*
* Use accumulated backoff yet.
*/
- if (!tp->probes_out)
- tp->probes_out=1;
- tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0,
- min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+ if (!icsk->icsk_probes_out)
+ icsk->icsk_probes_out = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ min(icsk->icsk_rto << icsk->icsk_backoff,
+ TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
}
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 0000000..327770b
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,70 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ *
+ * John Heffner <jheffner@sc.edu>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the aer:
+ * .01 and and 7/8. We use 50 instead of 100 to account for
+ * delayed ack.
+ */
+#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_MD_SCALE 3
+
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+ u32 in_flight, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ if (in_flight < tp->snd_cwnd)
+ return;
+
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ tp->snd_cwnd++;
+ } else {
+ tp->snd_cwnd_cnt++;
+ if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ }
+ }
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static u32 tcp_scalable_ssthresh(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_scalable = {
+ .ssthresh = tcp_scalable_ssthresh,
+ .cong_avoid = tcp_scalable_cong_avoid,
+ .min_cwnd = tcp_reno_min_cwnd,
+
+ .owner = THIS_MODULE,
+ .name = "scalable",
+};
+
+static int __init tcp_scalable_register(void)
+{
+ return tcp_register_congestion_control(&tcp_scalable);
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_scalable);
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 799ebe0..415ee47 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer (unsigned long data);
-#ifdef TCP_DEBUG
-const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
-EXPORT_SYMBOL(tcp_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies
- * to optimize.
- */
-
void tcp_init_xmit_timers(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- init_timer(&tp->retransmit_timer);
- tp->retransmit_timer.function=&tcp_write_timer;
- tp->retransmit_timer.data = (unsigned long) sk;
- tp->pending = 0;
-
- init_timer(&tp->delack_timer);
- tp->delack_timer.function=&tcp_delack_timer;
- tp->delack_timer.data = (unsigned long) sk;
- tp->ack.pending = 0;
-
- init_timer(&sk->sk_timer);
- sk->sk_timer.function = &tcp_keepalive_timer;
- sk->sk_timer.data = (unsigned long)sk;
+ inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+ &tcp_keepalive_timer);
}
-void tcp_clear_xmit_timers(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tp->pending = 0;
- sk_stop_timer(sk, &tp->retransmit_timer);
-
- tp->ack.pending = 0;
- tp->ack.blocked = 0;
- sk_stop_timer(sk, &tp->delack_timer);
-
- sk_stop_timer(sk, &sk->sk_timer);
-}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
static void tcp_write_err(struct sock *sk)
{
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (tp->retransmits)
+ if (icsk->icsk_retransmits)
dst_negative_advice(&sk->sk_dst_cache);
- retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else {
- if (tp->retransmits >= sysctl_tcp_retries1) {
+ if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
hole detection. :-(
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- int alive = (tp->rto < TCP_RTO_MAX);
+ const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
retry_until = tcp_orphan_retries(sk, alive);
- if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+ if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
return 1;
}
}
- if (tp->retransmits >= retry_until) {
+ if (icsk->icsk_retransmits >= retry_until) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -210,32 +174,32 @@ static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- tp->ack.blocked = 1;
+ icsk->icsk_ack.blocked = 1;
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
goto out_unlock;
}
sk_stream_mem_reclaim(sk);
- if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+ if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
- if (time_after(tp->ack.timeout, jiffies)) {
- sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
goto out;
}
- tp->ack.pending &= ~TCP_ACK_TIMER;
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
- if (skb_queue_len(&tp->ucopy.prequeue)) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
- NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
- skb_queue_len(&tp->ucopy.prequeue));
+ NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk->sk_backlog_rcv(sk, skb);
@@ -243,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
tp->ucopy.memory = 0;
}
- if (tcp_ack_scheduled(tp)) {
- if (!tp->ack.pingpong) {
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
- tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
- tp->ack.pingpong = 0;
- tp->ack.ato = TCP_ATO_MIN;
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_send_ack(sk);
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -269,11 +233,12 @@ out_unlock:
static void tcp_probe_timer(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
if (tp->packets_out || !sk->sk_send_head) {
- tp->probes_out = 0;
+ icsk->icsk_probes_out = 0;
return;
}
@@ -284,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
* FIXME: We ought not to do it, Solaris 2.5 actually has fixing
* this behaviour in Solaris down as a bug fix. [AC]
*
- * Let me to explain. probes_out is zeroed by incoming ACKs
+ * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
* even if they advertise zero window. Hence, connection is killed only
* if we received no ACKs for normal connection timeout. It is not killed
* only because window stays zero for some time, window may be zero
@@ -295,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+ const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
max_probes = tcp_orphan_retries(sk, alive);
- if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+ if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
return;
}
- if (tp->probes_out > max_probes) {
+ if (icsk->icsk_probes_out > max_probes) {
tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
@@ -318,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
static void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
if (!tp->packets_out)
goto out;
@@ -352,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
if (tcp_write_timeout(sk))
goto out;
- if (tp->retransmits == 0) {
- if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+ if (icsk->icsk_retransmits == 0) {
+ if (icsk->icsk_ca_state == TCP_CA_Disorder ||
+ icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tp->rx_opt.sack_ok) {
- if (tp->ca_state == TCP_CA_Recovery)
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
} else {
- if (tp->ca_state == TCP_CA_Recovery)
+ if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
else
NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
}
- } else if (tp->ca_state == TCP_CA_Loss) {
+ } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
} else {
NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -382,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
/* Retransmission failed because of local congestion,
* do not backoff.
*/
- if (!tp->retransmits)
- tp->retransmits=1;
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
- min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+ if (!icsk->icsk_retransmits)
+ icsk->icsk_retransmits = 1;
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+ TCP_RTO_MAX);
goto out;
}
@@ -404,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
* implemented ftp to mars will work nicely. We will have to fix
* the 120 second clamps though!
*/
- tp->backoff++;
- tp->retransmits++;
+ icsk->icsk_backoff++;
+ icsk->icsk_retransmits++;
out_reset_timer:
- tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
- tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
- if (tp->retransmits > sysctl_tcp_retries1)
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ if (icsk->icsk_retransmits > sysctl_tcp_retries1)
__sk_dst_reset(sk);
out:;
@@ -419,32 +387,32 @@ out:;
static void tcp_write_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
int event;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later */
- sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
goto out_unlock;
}
- if (sk->sk_state == TCP_CLOSE || !tp->pending)
+ if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
goto out;
- if (time_after(tp->timeout, jiffies)) {
- sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+ if (time_after(icsk->icsk_timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
goto out;
}
- event = tp->pending;
- tp->pending = 0;
+ event = icsk->icsk_pending;
+ icsk->icsk_pending = 0;
switch (event) {
- case TCP_TIME_RETRANS:
+ case ICSK_TIME_RETRANS:
tcp_retransmit_timer(sk);
break;
- case TCP_TIME_PROBE0:
+ case ICSK_TIME_PROBE0:
tcp_probe_timer(sk);
break;
}
@@ -463,100 +431,8 @@ out_unlock:
static void tcp_synack_timer(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_listen_opt *lopt = tp->listen_opt;
- int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
- int thresh = max_retries;
- unsigned long now = jiffies;
- struct open_request **reqp, *req;
- int i, budget;
-
- if (lopt == NULL || lopt->qlen == 0)
- return;
-
- /* Normally all the openreqs are young and become mature
- * (i.e. converted to established socket) for first timeout.
- * If synack was not acknowledged for 3 seconds, it means
- * one of the following things: synack was lost, ack was lost,
- * rtt is high or nobody planned to ack (i.e. synflood).
- * When server is a bit loaded, queue is populated with old
- * open requests, reducing effective size of queue.
- * When server is well loaded, queue size reduces to zero
- * after several minutes of work. It is not synflood,
- * it is normal operation. The solution is pruning
- * too old entries overriding normal timeout, when
- * situation becomes dangerous.
- *
- * Essentially, we reserve half of room for young
- * embrions; and abort old ones without pity, if old
- * ones are about to clog our table.
- */
- if (lopt->qlen>>(lopt->max_qlen_log-1)) {
- int young = (lopt->qlen_young<<1);
-
- while (thresh > 2) {
- if (lopt->qlen < young)
- break;
- thresh--;
- young <<= 1;
- }
- }
-
- if (tp->defer_accept)
- max_retries = tp->defer_accept;
-
- budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
- i = lopt->clock_hand;
-
- do {
- reqp=&lopt->syn_table[i];
- while ((req = *reqp) != NULL) {
- if (time_after_eq(now, req->expires)) {
- if ((req->retrans < thresh ||
- (req->acked && req->retrans < max_retries))
- && !req->class->rtx_syn_ack(sk, req, NULL)) {
- unsigned long timeo;
-
- if (req->retrans++ == 0)
- lopt->qlen_young--;
- timeo = min((TCP_TIMEOUT_INIT << req->retrans),
- TCP_RTO_MAX);
- req->expires = now + timeo;
- reqp = &req->dl_next;
- continue;
- }
-
- /* Drop this request */
- write_lock(&tp->syn_wait_lock);
- *reqp = req->dl_next;
- write_unlock(&tp->syn_wait_lock);
- lopt->qlen--;
- if (req->retrans == 0)
- lopt->qlen_young--;
- tcp_openreq_free(req);
- continue;
- }
- reqp = &req->dl_next;
- }
-
- i = (i+1)&(TCP_SYNQ_HSIZE-1);
-
- } while (--budget > 0);
-
- lopt->clock_hand = i;
-
- if (lopt->qlen)
- tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
-}
-
-void tcp_delete_keepalive_timer (struct sock *sk)
-{
- sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
-{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+ inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
void tcp_set_keepalive(struct sock *sk, int val)
@@ -565,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
- tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
- tcp_delete_keepalive_timer(sk);
+ inet_csk_delete_keepalive_timer(sk);
}
static void tcp_keepalive_timer (unsigned long data)
{
struct sock *sk = (struct sock *) data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u32 elapsed;
@@ -581,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- tcp_reset_keepalive_timer (sk, HZ/20);
+ inet_csk_reset_keepalive_timer (sk, HZ/20);
goto out;
}
@@ -592,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
- int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+ const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -615,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = tcp_time_stamp - tp->rcv_tstamp;
if (elapsed >= keepalive_time_when(tp)) {
- if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
- (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+ if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
+ (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
goto out;
}
if (tcp_write_wakeup(sk) <= 0) {
- tp->probes_out++;
+ icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
@@ -639,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
sk_stream_mem_reclaim(sk);
resched:
- tcp_reset_keepalive_timer (sk, elapsed);
+ inet_csk_reset_keepalive_timer (sk, elapsed);
goto out;
death:
@@ -649,8 +526,3 @@ out:
bh_unlock_sock(sk);
sock_put(sk);
}
-
-EXPORT_SYMBOL(tcp_clear_xmit_timers);
-EXPORT_SYMBOL(tcp_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 0000000..93c5f92
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,413 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ * Lawrence S. Brakmo and Larry L. Peterson.
+ * "TCP Vegas: End to end congestion avoidance on a global internet."
+ * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ * October 1995. Available from:
+ * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ * o We do not change the loss detection or recovery mechanisms of
+ * Linux in any way. Linux already recovers from losses quite well,
+ * using fine-grained timers, NewReno, and FACK.
+ * o To avoid the performance penalty imposed by increasing cwnd
+ * only every-other RTT during slow start, we increase during
+ * every RTT during slow start, just like Reno.
+ * o Largely to allow continuous cwnd growth during slow start,
+ * we use the rate at which ACKs come back as the "actual"
+ * rate, rather than the rate at which data is sent.
+ * o To speed convergence to the right rate, we set the cwnd
+ * to achieve the right ("actual") rate when we exit slow start.
+ * o To filter out the noise caused by delayed ACKs, we use the
+ * minimum RTT sample observed during the last RTT to calculate
+ * the actual rate.
+ * o When the sender re-starts from idle, it waits until it has
+ * received ACKs for an entire flight of new data before making
+ * a cwnd adjustment decision. The original Vegas implementation
+ * assumed senders never went idle.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static int alpha = 1<<V_PARAM_SHIFT;
+static int beta = 3<<V_PARAM_SHIFT;
+static int gamma = 1<<V_PARAM_SHIFT;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+
+/* Vegas variables */
+struct vegas {
+ u32 beg_snd_nxt; /* right edge during last RTT */
+ u32 beg_snd_una; /* left edge during last RTT */
+ u32 beg_snd_cwnd; /* saves the size of the cwnd */
+ u8 doing_vegas_now;/* if true, do vegas for this RTT */
+ u16 cntRTT; /* # of RTTs measured within last RTT */
+ u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
+ u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ * o when a connection is established
+ * o after an RTO
+ * o after fast recovery
+ * o when we send a packet and there is no outstanding
+ * unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ /* Begin taking Vegas samples next time we send something. */
+ vegas->doing_vegas_now = 1;
+
+ /* Set the beginning of the next send window. */
+ vegas->beg_snd_nxt = tp->snd_nxt;
+
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct sock *sk)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ vegas->doing_vegas_now = 0;
+}
+
+static void tcp_vegas_init(struct sock *sk)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ vegas->baseRTT = 0x7fffffff;
+ vegas_enable(sk);
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ * o min-filter RTT samples from within an RTT to get the current
+ * propagation delay + queuing delay (we are min-filtering to try to
+ * avoid the effects of delayed ACKs)
+ * o min-filter RTT samples from a much longer window (forever for now)
+ * to find the propagation delay (baseRTT)
+ */
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
+{
+ struct vegas *vegas = inet_csk_ca(sk);
+ u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
+
+ /* Filter to find propagation delay: */
+ if (vrtt < vegas->baseRTT)
+ vegas->baseRTT = vrtt;
+
+ /* Find the min RTT during the last RTT to find
+ * the current prop. delay + queuing delay:
+ */
+ vegas->minRTT = min(vegas->minRTT, vrtt);
+ vegas->cntRTT++;
+}
+
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
+{
+
+ if (ca_state == TCP_CA_Open)
+ vegas_enable(sk);
+ else
+ vegas_disable(sk);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples. So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_CWND_RESTART ||
+ event == CA_EVENT_TX_START)
+ tcp_vegas_init(sk);
+}
+
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
+ u32 seq_rtt, u32 in_flight, int flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct vegas *vegas = inet_csk_ca(sk);
+
+ if (!vegas->doing_vegas_now)
+ return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
+
+ /* The key players are v_beg_snd_una and v_beg_snd_nxt.
+ *
+ * These are so named because they represent the approximate values
+ * of snd_una and snd_nxt at the beginning of the current RTT. More
+ * precisely, they represent the amount of data sent during the RTT.
+ * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+ * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+ * bytes of data have been ACKed during the course of the RTT, giving
+ * an "actual" rate of:
+ *
+ * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+ *
+ * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+ * because delayed ACKs can cover more than one segment, so they
+ * don't line up nicely with the boundaries of RTTs.
+ *
+ * Another unfortunate fact of life is that delayed ACKs delay the
+ * advance of the left edge of our send window, so that the number
+ * of bytes we send in an RTT is often less than our cwnd will allow.
+ * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+ */
+
+ if (after(ack, vegas->beg_snd_nxt)) {
+ /* Do the Vegas once-per-RTT cwnd adjustment. */
+ u32 old_wnd, old_snd_cwnd;
+
+
+ /* Here old_wnd is essentially the window of data that was
+ * sent during the previous RTT, and has all
+ * been acknowledged in the course of the RTT that ended
+ * with the ACK we just received. Likewise, old_snd_cwnd
+ * is the cwnd during the previous RTT.
+ */
+ old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
+ tp->mss_cache;
+ old_snd_cwnd = vegas->beg_snd_cwnd;
+
+ /* Save the extent of the current window so we can use this
+ * at the end of the next RTT.
+ */
+ vegas->beg_snd_una = vegas->beg_snd_nxt;
+ vegas->beg_snd_nxt = tp->snd_nxt;
+ vegas->beg_snd_cwnd = tp->snd_cwnd;
+
+ /* Take into account the current RTT sample too, to
+ * decrease the impact of delayed acks. This double counts
+ * this sample since we count it for the next window as well,
+ * but that's not too awful, since we're taking the min,
+ * rather than averaging.
+ */
+ tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
+
+ /* We do the Vegas calculations only if we got enough RTT
+ * samples that we can be reasonably sure that we got
+ * at least one RTT sample that wasn't from a delayed ACK.
+ * If we only had 2 samples total,
+ * then that means we're getting only 1 ACK per RTT, which
+ * means they're almost certainly delayed ACKs.
+ * If we have 3 samples, we should be OK.
+ */
+
+ if (vegas->cntRTT <= 2) {
+ /* We don't have enough RTT samples to do the Vegas
+ * calculation, so we'll behave like Reno.
+ */
+ if (tp->snd_cwnd > tp->snd_ssthresh)
+ tp->snd_cwnd++;
+ } else {
+ u32 rtt, target_cwnd, diff;
+
+ /* We have enough RTT samples, so, using the Vegas
+ * algorithm, we determine if we should increase or
+ * decrease cwnd, and by how much.
+ */
+
+ /* Pluck out the RTT we are using for the Vegas
+ * calculations. This is the min RTT seen during the
+ * last RTT. Taking the min filters out the effects
+ * of delayed ACKs, at the cost of noticing congestion
+ * a bit later.
+ */
+ rtt = vegas->minRTT;
+
+ /* Calculate the cwnd we should have, if we weren't
+ * going too fast.
+ *
+ * This is:
+ * (actual rate in segments) * baseRTT
+ * We keep it as a fixed point number with
+ * V_PARAM_SHIFT bits to the right of the binary point.
+ */
+ target_cwnd = ((old_wnd * vegas->baseRTT)
+ << V_PARAM_SHIFT) / rtt;
+
+ /* Calculate the difference between the window we had,
+ * and the window we would like to have. This quantity
+ * is the "Diff" from the Arizona Vegas papers.
+ *
+ * Again, this is a fixed point number with
+ * V_PARAM_SHIFT bits to the right of the binary
+ * point.
+ */
+ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /* Slow start. */
+ if (diff > gamma) {
+ /* Going too fast. Time to slow down
+ * and switch to congestion avoidance.
+ */
+ tp->snd_ssthresh = 2;
+
+ /* Set cwnd to match the actual rate
+ * exactly:
+ * cwnd = (actual rate) * baseRTT
+ * Then we add 1 because the integer
+ * truncation robs us of full link
+ * utilization.
+ */
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ (target_cwnd >>
+ V_PARAM_SHIFT)+1);
+
+ }
+ } else {
+ /* Congestion avoidance. */
+ u32 next_snd_cwnd;
+
+ /* Figure out where we would like cwnd
+ * to be.
+ */
+ if (diff > beta) {
+ /* The old window was too fast, so
+ * we slow down.
+ */
+ next_snd_cwnd = old_snd_cwnd - 1;
+ } else if (diff < alpha) {
+ /* We don't have enough extra packets
+ * in the network, so speed up.
+ */
+ next_snd_cwnd = old_snd_cwnd + 1;
+ } else {
+ /* Sending just as fast as we
+ * should be.
+ */
+ next_snd_cwnd = old_snd_cwnd;
+ }
+
+ /* Adjust cwnd upward or downward, toward the
+ * desired value.
+ */
+ if (next_snd_cwnd > tp->snd_cwnd)
+ tp->snd_cwnd++;
+ else if (next_snd_cwnd < tp->snd_cwnd)
+ tp->snd_cwnd--;
+ }
+ }
+
+ /* Wipe the slate clean for the next RTT. */
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
+ }
+
+ /* The following code is executed for every ack we receive,
+ * except for conditions checked in should_advance_cwnd()
+ * before the call to tcp_cong_avoid(). Mainly this means that
+ * we only execute this code if the ack actually acked some
+ * data.
+ */
+
+ /* If we are in slow start, increase our cwnd in response to this ACK.
+ * (If we are not in slow start then we are in congestion avoidance,
+ * and adjust our congestion window only once per RTT. See the code
+ * above.)
+ */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tp->snd_cwnd++;
+
+ /* to keep cwnd from growing without bound */
+ tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+ /* Make sure that we are never so timid as to reduce our cwnd below
+ * 2 MSS.
+ *
+ * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+ */
+ tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_vegas_get_info(struct sock *sk, u32 ext,
+ struct sk_buff *skb)
+{
+ const struct vegas *ca = inet_csk_ca(sk);
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct tcpvegas_info *info;
+
+ info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
+ sizeof(*info)));
+
+ info->tcpv_enabled = ca->doing_vegas_now;
+ info->tcpv_rttcnt = ca->cntRTT;
+ info->tcpv_rtt = ca->baseRTT;
+ info->tcpv_minrtt = ca->minRTT;
+ rtattr_failure: ;
+ }
+}
+
+static struct tcp_congestion_ops tcp_vegas = {
+ .init = tcp_vegas_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_vegas_cong_avoid,
+ .min_cwnd = tcp_reno_min_cwnd,
+ .rtt_sample = tcp_vegas_rtt_calc,
+ .set_state = tcp_vegas_state,
+ .cwnd_event = tcp_vegas_cwnd_event,
+ .get_info = tcp_vegas_get_info,
+
+ .owner = THIS_MODULE,
+ .name = "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+ BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
+ tcp_register_congestion_control(&tcp_vegas);
+ return 0;
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 0000000..0c340c3
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,263 @@
+/*
+ * TCP Westwood+
+ *
+ * Angelo Dell'Aera: TCP Westwood+ support
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood structure */
+struct westwood {
+ u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
+ u32 bw_est; /* bandwidth estimate */
+ u32 rtt_win_sx; /* here starts a new evaluation... */
+ u32 bk;
+ u32 snd_una; /* used for evaluating the number of acked bytes */
+ u32 cumul_ack;
+ u32 accounted;
+ u32 rtt;
+ u32 rtt_min; /* minimum observed RTT */
+};
+
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
+#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_create
+ * This function initializes fields used in TCP Westwood+,
+ * it is called after the initial SYN, so the sequence numbers
+ * are correct but new passive connections we have no
+ * information about RTTmin at this time so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
+ * since in this way we're sure it will be updated in a consistent
+ * way as soon as possible. It will reasonably happen within the first
+ * RTT period of the connection lifetime.
+ */
+static void tcp_westwood_init(struct sock *sk)
+{
+ struct westwood *w = inet_csk_ca(sk);
+
+ w->bk = 0;
+ w->bw_ns_est = 0;
+ w->bw_est = 0;
+ w->accounted = 0;
+ w->cumul_ack = 0;
+ w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
+ w->rtt_win_sx = tcp_time_stamp;
+ w->snd_una = tcp_sk(sk)->snd_una;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+ return (((7 * a) + b) >> 3);
+}
+
+static inline void westwood_filter(struct westwood *w, u32 delta)
+{
+ w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+ w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+}
+
+/*
+ * @westwood_pkts_acked
+ * Called after processing group of packets.
+ * but all westwood needs is the last sample of srtt.
+ */
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
+{
+ struct westwood *w = inet_csk_ca(sk);
+ if (cnt > 0)
+ w->rtt = tcp_sk(sk)->srtt >> 3;
+}
+
+/*
+ * @westwood_update_window
+ * It updates RTT evaluation window if it is the right moment to do
+ * it. If so it calls filter for evaluating bandwidth.
+ */
+static void westwood_update_window(struct sock *sk)
+{
+ struct westwood *w = inet_csk_ca(sk);
+ s32 delta = tcp_time_stamp - w->rtt_win_sx;
+
+ /*
+ * See if a RTT-window has passed.
+ * Be careful since if RTT is less than
+ * 50ms we don't filter but we continue 'building the sample'.
+ * This minimum limit was chosen since an estimation on small
+ * time intervals is better to avoid...
+ * Obviously on a LAN we reasonably will always have
+ * right_bound = left_bound + WESTWOOD_RTT_MIN
+ */
+ if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
+ westwood_filter(w, delta);
+
+ w->bk = 0;
+ w->rtt_win_sx = tcp_time_stamp;
+ }
+}
+
+/*
+ * @westwood_fast_bw
+ * It is called when we are in fast path. In particular it is called when
+ * header prediction is successful. In such case in fact update is
+ * straight forward and doesn't need any particular care.
+ */
+static inline void westwood_fast_bw(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ westwood_update_window(sk);
+
+ w->bk += tp->snd_una - w->snd_una;
+ w->snd_una = tp->snd_una;
+ w->rtt_min = min(w->rtt, w->rtt_min);
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating bk in case of
+ * delayed or partial acks.
+ */
+static inline u32 westwood_acked_count(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ w->cumul_ack = tp->snd_una - w->snd_una;
+
+ /* If cumul_ack is 0 this is a dupack since it's not moving
+ * tp->snd_una.
+ */
+ if (!w->cumul_ack) {
+ w->accounted += tp->mss_cache;
+ w->cumul_ack = tp->mss_cache;
+ }
+
+ if (w->cumul_ack > tp->mss_cache) {
+ /* Partial or delayed ack */
+ if (w->accounted >= w->cumul_ack) {
+ w->accounted -= w->cumul_ack;
+ w->cumul_ack = tp->mss_cache;
+ } else {
+ w->cumul_ack -= w->accounted;
+ w->accounted = 0;
+ }
+ }
+
+ w->snd_una = tp->snd_una;
+
+ return w->cumul_ack;
+}
+
+static inline u32 westwood_bw_rttmin(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct westwood *w = inet_csk_ca(sk);
+ return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
+}
+
+/*
+ * TCP Westwood
+ * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
+ * so avoids ever returning 0.
+ */
+static u32 tcp_westwood_cwnd_min(struct sock *sk)
+{
+ return westwood_bw_rttmin(sk);
+}
+
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct westwood *w = inet_csk_ca(sk);
+
+ switch(event) {
+ case CA_EVENT_FAST_ACK:
+ westwood_fast_bw(sk);
+ break;
+
+ case CA_EVENT_COMPLETE_CWR:
+ tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
+ break;
+
+ case CA_EVENT_FRTO:
+ tp->snd_ssthresh = westwood_bw_rttmin(sk);
+ break;
+
+ case CA_EVENT_SLOW_ACK:
+ westwood_update_window(sk);
+ w->bk += westwood_acked_count(sk);
+ w->rtt_min = min(w->rtt, w->rtt_min);
+ break;
+
+ default:
+ /* don't care */
+ break;
+ }
+}
+
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_westwood_info(struct sock *sk, u32 ext,
+ struct sk_buff *skb)
+{
+ const struct westwood *ca = inet_csk_ca(sk);
+ if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+ struct rtattr *rta;
+ struct tcpvegas_info *info;
+
+ rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
+ info = RTA_DATA(rta);
+ info->tcpv_enabled = 1;
+ info->tcpv_rttcnt = 0;
+ info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
+ info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
+ rtattr_failure: ;
+ }
+}
+
+
+static struct tcp_congestion_ops tcp_westwood = {
+ .init = tcp_westwood_init,
+ .ssthresh = tcp_reno_ssthresh,
+ .cong_avoid = tcp_reno_cong_avoid,
+ .min_cwnd = tcp_westwood_cwnd_min,
+ .cwnd_event = tcp_westwood_event,
+ .get_info = tcp_westwood_info,
+ .pkts_acked = tcp_westwood_pkts_acked,
+
+ .owner = THIS_MODULE,
+ .name = "westwood"
+};
+
+static int __init tcp_westwood_register(void)
+{
+ BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
+ return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+ tcp_unregister_congestion_control(&tcp_westwood);
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7c24e64..e5beca7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/snmp.h>
-#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
* Snmp MIB for the UDP layer
*/
-DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
err = -EINVAL;
goto out;
}
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
if (unlikely(!up->pending)) {
release_sock(sk);
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
return -EINVAL;
}
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
return 0;
- NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,14 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
return(0);
short_packet:
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
- NIPQUAD(saddr),
- ntohs(uh->source),
- ulen,
- len,
- NIPQUAD(daddr),
- ntohs(uh->dest)));
+ LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ ulen,
+ len,
+ NIPQUAD(daddr),
+ ntohs(uh->dest));
no_header:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
kfree_skb(skb);
@@ -1199,13 +1199,12 @@ csum_error:
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
- NETDEBUG(if (net_ratelimit())
- printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
- NIPQUAD(saddr),
- ntohs(uh->source),
- NIPQUAD(daddr),
- ntohs(uh->dest),
- ulen));
+ LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+ NIPQUAD(saddr),
+ ntohs(uh->source),
+ NIPQUAD(daddr),
+ ntohs(uh->dest),
+ ulen);
drop:
UDP_INC_STATS_BH(UDP_MIB_INERRORS);
kfree_skb(skb);
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
deleted file mode 100644
index 6aecd7a..0000000
--- a/net/ipv4/utils.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * Various kernel-resident INET utility functions; mainly
- * for format conversion and debugging output.
- *
- * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
- *
- * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
- *
- * Fixes:
- * Alan Cox : verify_area check.
- * Alan Cox : removed old debugging.
- * Andi Kleen : add net_ratelimit()
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <asm/byteorder.h>
-
-/*
- * Convert an ASCII string to binary IP.
- */
-
-__u32 in_aton(const char *str)
-{
- unsigned long l;
- unsigned int val;
- int i;
-
- l = 0;
- for (i = 0; i < 4; i++)
- {
- l <<= 8;
- if (*str != '\0')
- {
- val = 0;
- while (*str != '\0' && *str != '.')
- {
- val *= 10;
- val += *str - '0';
- str++;
- }
- l |= val;
- if (*str != '\0')
- str++;
- }
- }
- return(htonl(l));
-}
-
-EXPORT_SYMBOL(in_aton);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index af2392a..66620a9 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -33,6 +33,7 @@ static void xfrm4_encap(struct sk_buff *skb)
struct dst_entry *dst = skb->dst;
struct xfrm_state *x = dst->xfrm;
struct iphdr *iph, *top_iph;
+ int flags;
iph = skb->nh.iph;
skb->h.ipiph = iph;
@@ -51,10 +52,13 @@ static void xfrm4_encap(struct sk_buff *skb)
/* DS disclosed */
top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
- if (x->props.flags & XFRM_STATE_NOECN)
+
+ flags = x->props.flags;
+ if (flags & XFRM_STATE_NOECN)
IP_ECN_clear(top_iph);
- top_iph->frag_off = iph->frag_off & htons(IP_DF);
+ top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+ 0 : (iph->frag_off & htons(IP_DF));
if (!top_iph->frag_off)
__ip_select_ident(top_iph, dst, 0);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 223a2e8..d23e07f 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -7,12 +7,20 @@
*
*/
+#include <net/ip.h>
#include <net/xfrm.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
static struct xfrm_state_afinfo xfrm4_state_afinfo;
+static int xfrm4_init_flags(struct xfrm_state *x)
+{
+ if (ipv4_config.no_pmtu_disc)
+ x->props.flags |= XFRM_STATE_NOPMTUDISC;
+ return 0;
+}
+
static void
__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
struct xfrm_tmpl *tmpl,
@@ -109,6 +117,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.family = AF_INET,
.lock = RW_LOCK_UNLOCKED,
+ .init_flags = xfrm4_init_flags,
.init_tempsel = __xfrm4_init_tempsel,
.state_lookup = __xfrm4_state_lookup,
.find_acq = __xfrm4_find_acq,
@@ -119,8 +128,10 @@ void __init xfrm4_state_init(void)
xfrm_state_register_afinfo(&xfrm4_state_afinfo);
}
+#if 0
void __exit xfrm4_state_fini(void)
{
xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
}
+#endif /* 0 */
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 413191f..afbb0d4 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -78,13 +78,12 @@ static int ipip_rcv(struct sk_buff *skb)
static void ipip_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler = ipip_handler;
- u32 arg = info;
if (handler)
- handler->err_handler(skb, &arg);
+ handler->err_handler(skb, info);
}
-static int ipip_init_state(struct xfrm_state *x, void *args)
+static int ipip_init_state(struct xfrm_state *x)
{
if (!x->props.mode)
return -EINVAL;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index e66ca93..ab7a912 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,6 +1,26 @@
#
# IPv6 configuration
-#
+#
+
+# IPv6 as module will cause a CRASH if you try to unload it
+config IPV6
+ tristate "The IPv6 protocol"
+ default m
+ select CRYPTO if IPV6_PRIVACY
+ select CRYPTO_MD5 if IPV6_PRIVACY
+ ---help---
+ This is complemental support for the IP version 6.
+ You will still be able to do traditional IPv4 networking as well.
+
+ For general information about IPv6, see
+ <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
+ For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
+ For specific information about IPv6 under Linux, read the HOWTO at
+ <http://www.bieringer.de/linux/IPv6/>.
+
+ To compile this protocol support as a module, choose M here: the
+ module will be called ipv6.
+
config IPV6_PRIVACY
bool "IPv6: Privacy Extensions (RFC 3041) support"
depends on IPV6
@@ -71,7 +91,6 @@ config INET6_TUNNEL
config IPV6_TUNNEL
tristate "IPv6: IPv6-in-IPv6 tunnel"
depends on IPV6
- select INET6_TUNNEL
---help---
Support for IPv6-in-IPv6 tunnels described in RFC 2473.
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index b39e049..6460eec 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
- ip6_flowlabel.o ipv6_syms.o
+ ip6_flowlabel.o ipv6_syms.o netfilter.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
xfrm6_output.o
@@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
obj-y += exthdrs_core.o
+
+obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7744a25..6d6fb74 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
#endif
#include <linux/delay.h>
#include <linux/notifier.h>
+#include <linux/string.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -131,7 +132,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
static int addrconf_ifdown(struct net_device *dev, int how);
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags);
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
static void addrconf_dad_timer(unsigned long data);
static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
static void addrconf_rs_timer(unsigned long data);
@@ -372,6 +373,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
ndev->regen_timer.data = (unsigned long) ndev;
if ((dev->flags&IFF_LOOPBACK) ||
dev->type == ARPHRD_TUNNEL ||
+ dev->type == ARPHRD_NONE ||
dev->type == ARPHRD_SIT) {
printk(KERN_INFO
"Disabled Privacy Extensions on device %p(%s)\n",
@@ -491,7 +493,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
static struct inet6_ifaddr *
ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
- int scope, unsigned flags)
+ int scope, u32 flags)
{
struct inet6_ifaddr *ifa = NULL;
struct rt6_info *rt;
@@ -694,7 +696,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
if (onlink == 0) {
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
rt = NULL;
} else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
rt->rt6i_expires = expires;
@@ -1039,9 +1041,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
- u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
+ u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
int sk_ipv6only = ipv6_only_sock(sk);
- int sk2_ipv6only = tcp_v6_ipv6only(sk2);
+ int sk2_ipv6only = inet_v6_ipv6only(sk2);
int addr_type = ipv6_addr_type(sk_rcv_saddr6);
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -1124,7 +1126,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr)
__ipv6_dev_mc_dec(idev, &maddr);
}
-void addrconf_join_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1133,7 +1135,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp)
ipv6_dev_ac_inc(ifp->idev->dev, &addr);
}
-void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1319,7 +1321,7 @@ static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
static void
addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
- unsigned long expires, unsigned flags)
+ unsigned long expires, u32 flags)
{
struct in6_rtmsg rtmsg;
@@ -1339,7 +1341,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
- ip6_route_add(&rtmsg, NULL, NULL);
+ ip6_route_add(&rtmsg, NULL, NULL, NULL);
}
/* Create "default" multicast route to the interface */
@@ -1356,7 +1358,7 @@ static void addrconf_add_mroute(struct net_device *dev)
rtmsg.rtmsg_ifindex = dev->ifindex;
rtmsg.rtmsg_flags = RTF_UP;
rtmsg.rtmsg_type = RTMSG_NEWROUTE;
- ip6_route_add(&rtmsg, NULL, NULL);
+ ip6_route_add(&rtmsg, NULL, NULL, NULL);
}
static void sit_route_add(struct net_device *dev)
@@ -1373,7 +1375,7 @@ static void sit_route_add(struct net_device *dev)
rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
rtmsg.rtmsg_ifindex = dev->ifindex;
- ip6_route_add(&rtmsg, NULL, NULL);
+ ip6_route_add(&rtmsg, NULL, NULL, NULL);
}
static void addrconf_add_lroute(struct net_device *dev)
@@ -1466,7 +1468,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
if (rt->rt6i_flags&RTF_EXPIRES) {
if (valid_lft == 0) {
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
rt = NULL;
} else {
rt->rt6i_expires = rt_expires;
@@ -2228,7 +2230,7 @@ out:
/*
* Duplicate Address Detection
*/
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags)
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
@@ -2621,15 +2623,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
}
static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
struct ifa_cacheinfo ci;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
ifm = NLMSG_DATA(nlh);
ifm->ifa_family = AF_INET6;
ifm->ifa_prefixlen = ifa->prefix_len;
@@ -2671,15 +2672,14 @@ rtattr_failure:
}
static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, u16 flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
struct ifa_cacheinfo ci;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
ifm = NLMSG_DATA(nlh);
ifm->ifa_family = AF_INET6;
ifm->ifa_prefixlen = 128;
@@ -2708,15 +2708,14 @@ rtattr_failure:
}
static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, unsigned int flags)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
struct ifa_cacheinfo ci;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
ifm = NLMSG_DATA(nlh);
ifm->ifa_family = AF_INET6;
ifm->ifa_prefixlen = 128;
@@ -2778,28 +2777,17 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
read_lock_bh(&idev->lock);
switch (type) {
case UNICAST_ADDR:
- /* unicast address */
+ /* unicast address incl. temp addr */
for (ifa = idev->addr_list; ifa;
ifa = ifa->if_next, ip_idx++) {
if (ip_idx < s_ip_idx)
continue;
if ((err = inet6_fill_ifaddr(skb, ifa,
NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0)
+ cb->nlh->nlmsg_seq, RTM_NEWADDR,
+ NLM_F_MULTI)) <= 0)
goto done;
}
- /* temp addr */
-#ifdef CONFIG_IPV6_PRIVACY
- for (ifa = idev->tempaddr_list; ifa;
- ifa = ifa->tmp_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if ((err = inet6_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0)
- goto done;
- }
-#endif
break;
case MULTICAST_ADDR:
/* multicast address */
@@ -2809,7 +2797,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
continue;
if ((err = inet6_fill_ifmcaddr(skb, ifmca,
NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_GETMULTICAST)) <= 0)
+ cb->nlh->nlmsg_seq, RTM_GETMULTICAST,
+ NLM_F_MULTI)) <= 0)
goto done;
}
break;
@@ -2821,7 +2810,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
continue;
if ((err = inet6_fill_ifacaddr(skb, ifaca,
NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_GETANYCAST)) <= 0)
+ cb->nlh->nlmsg_seq, RTM_GETANYCAST,
+ NLM_F_MULTI)) <= 0)
goto done;
}
break;
@@ -2868,16 +2858,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS);
return;
}
- if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+ if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC);
}
static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
@@ -2906,7 +2896,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
}
static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
- u32 pid, u32 seq, int event)
+ u32 pid, u32 seq, int event, unsigned int flags)
{
struct net_device *dev = idev->dev;
__s32 *array = NULL;
@@ -2917,10 +2907,10 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
__u32 mtu = dev->mtu;
struct ifla_cacheinfo ci;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
r = NLMSG_DATA(nlh);
r->ifi_family = AF_INET6;
+ r->__ifi_pad = 0;
r->ifi_type = dev->type;
r->ifi_index = dev->ifindex;
r->ifi_flags = dev_get_flags(dev);
@@ -2985,7 +2975,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if ((idev = in6_dev_get(dev)) == NULL)
continue;
err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWLINK);
+ cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
in6_dev_put(idev);
if (err <= 0)
break;
@@ -3004,36 +2994,36 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS);
return;
}
- if (inet6_fill_ifinfo(skb, idev, 0, 0, event) < 0) {
+ if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC);
}
static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
- struct prefix_info *pinfo, u32 pid, u32 seq, int event)
+ struct prefix_info *pinfo, u32 pid, u32 seq,
+ int event, unsigned int flags)
{
struct prefixmsg *pmsg;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
struct prefix_cacheinfo ci;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*pmsg));
-
- if (pid)
- nlh->nlmsg_flags |= NLM_F_MULTI;
-
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
pmsg = NLMSG_DATA(nlh);
pmsg->prefix_family = AF_INET6;
+ pmsg->prefix_pad1 = 0;
+ pmsg->prefix_pad2 = 0;
pmsg->prefix_ifindex = idev->dev->ifindex;
pmsg->prefix_len = pinfo->prefix_len;
pmsg->prefix_type = pinfo->type;
+ pmsg->prefix_pad3 = 0;
pmsg->prefix_flags = 0;
if (pinfo->onlink)
@@ -3064,16 +3054,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS);
return;
}
- if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event) < 0) {
+ if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC);
}
static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = {
@@ -3096,7 +3086,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
switch (event) {
case RTM_NEWADDR:
dst_hold(&ifp->rt->u.dst);
- if (ip6_ins_rt(ifp->rt, NULL, NULL))
+ if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
dst_release(&ifp->rt->u.dst);
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
@@ -3106,7 +3096,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
addrconf_leave_anycast(ifp);
addrconf_leave_solict(ifp->idev, &ifp->addr);
dst_hold(&ifp->rt->u.dst);
- if (ip6_del_rt(ifp->rt, NULL, NULL))
+ if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
dst_free(&ifp->rt->u.dst);
else
dst_release(&ifp->rt->u.dst);
@@ -3439,7 +3429,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
* by sysctl and we wouldn't want anyone to change it under our feet
* (see SIOCSIFNAME).
*/
- dev_name = net_sysctl_strdup(dev_name);
+ dev_name = kstrdup(dev_name, GFP_KERNEL);
if (!dev_name)
goto free;
@@ -3603,10 +3593,8 @@ void __exit addrconf_cleanup(void)
rtnl_unlock();
#ifdef CONFIG_IPV6_PRIVACY
- if (likely(md5_tfm != NULL)) {
- crypto_free_tfm(md5_tfm);
- md5_tfm = NULL;
- }
+ crypto_free_tfm(md5_tfm);
+ md5_tfm = NULL;
#endif
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3..4f8795a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -44,6 +44,7 @@
#include <linux/netdevice.h>
#include <linux/icmpv6.h>
#include <linux/smp_lock.h>
+#include <linux/netfilter_ipv6.h>
#include <net/ip.h>
#include <net/ipv6.h>
@@ -66,45 +67,14 @@ MODULE_AUTHOR("Cast of dozens");
MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
MODULE_LICENSE("GPL");
-/* IPv6 procfs goodies... */
-
-#ifdef CONFIG_PROC_FS
-extern int raw6_proc_init(void);
-extern void raw6_proc_exit(void);
-extern int tcp6_proc_init(void);
-extern void tcp6_proc_exit(void);
-extern int udp6_proc_init(void);
-extern void udp6_proc_exit(void);
-extern int ipv6_misc_proc_init(void);
-extern void ipv6_misc_proc_exit(void);
-extern int ac6_proc_init(void);
-extern void ac6_proc_exit(void);
-extern int if6_proc_init(void);
-extern void if6_proc_exit(void);
-#endif
-
int sysctl_ipv6_bindv6only;
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet6_sock_nr;
-EXPORT_SYMBOL(inet6_sock_nr);
-#endif
-
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
*/
static struct list_head inetsw6[SOCK_MAX];
static DEFINE_SPINLOCK(inetsw6_lock);
-static void inet6_sock_destruct(struct sock *sk)
-{
- inet_sock_destruct(sk);
-
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet6_sock_nr);
-#endif
-}
-
static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -185,7 +155,7 @@ static int inet6_create(struct socket *sock, int protocol)
inet->hdrincl = 1;
}
- sk->sk_destruct = inet6_sock_destruct;
+ sk->sk_destruct = inet_sock_destruct;
sk->sk_family = PF_INET6;
sk->sk_protocol = protocol;
@@ -212,12 +182,17 @@ static int inet6_create(struct socket *sock, int protocol)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
+ /*
+ * Increment only the relevant sk_prot->socks debug field, this changes
+ * the previous behaviour of incrementing both the equivalent to
+ * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
+ *
+ * This allows better debug granularity as we'll know exactly how many
+ * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
+ * transport protocol socks. -acme
+ */
+ sk_refcnt_debug_inc(sk);
-
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
- atomic_inc(&inet_sock_nr);
-#endif
if (inet->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
@@ -513,11 +488,6 @@ static struct net_proto_family inet6_family_ops = {
.owner = THIS_MODULE,
};
-#ifdef CONFIG_SYSCTL
-extern void ipv6_sysctl_register(void);
-extern void ipv6_sysctl_unregister(void);
-#endif
-
/* Same as inet6_dgram_ops, sans udp_poll. */
static struct proto_ops inet6_sockraw_ops = {
.family = PF_INET6,
@@ -684,8 +654,6 @@ static void cleanup_ipv6_mibs(void)
snmp6_mib_free((void **)udp_stats_in6);
}
-extern int ipv6_misc_proc_init(void);
-
static int __init inet6_init(void)
{
struct sk_buff *dummy_skb;
@@ -757,6 +725,9 @@ static int __init inet6_init(void)
err = igmp6_init(&inet6_family_ops);
if (err)
goto igmp_fail;
+ err = ipv6_netfilter_init();
+ if (err)
+ goto netfilter_fail;
/* Create /proc/foo6 entries. */
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
@@ -774,7 +745,6 @@ static int __init inet6_init(void)
if (if6_proc_init())
goto proc_if6_fail;
#endif
- ipv6_packet_init();
ip6_route_init();
ip6_flowlabel_init();
err = addrconf_init();
@@ -791,6 +761,8 @@ static int __init inet6_init(void)
/* Init v6 transport protocols. */
udpv6_init();
tcpv6_init();
+
+ ipv6_packet_init();
err = 0;
out:
return err;
@@ -798,7 +770,6 @@ out:
addrconf_fail:
ip6_flowlabel_cleanup();
ip6_route_cleanup();
- ipv6_packet_cleanup();
#ifdef CONFIG_PROC_FS
if6_proc_exit();
proc_if6_fail:
@@ -813,6 +784,8 @@ proc_tcp6_fail:
raw6_proc_exit();
proc_raw6_fail:
#endif
+ ipv6_netfilter_fini();
+netfilter_fail:
igmp6_cleanup();
igmp_fail:
ndisc_cleanup();
@@ -852,6 +825,7 @@ static void __exit inet6_exit(void)
ip6_route_cleanup();
ipv6_packet_cleanup();
igmp6_cleanup();
+ ipv6_netfilter_fini();
ndisc_cleanup();
icmpv6_cleanup();
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index e3ecf62..f362973 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len)
case NEXTHDR_HOP:
case NEXTHDR_DEST:
if (!zero_out_mutable_opts(exthdr.opth)) {
- LIMIT_NETDEBUG(printk(
+ LIMIT_NETDEBUG(
KERN_WARNING "overrun %sopts\n",
nexthdr == NEXTHDR_HOP ?
- "hop" : "dest"));
+ "hop" : "dest");
return -EINVAL;
}
break;
@@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
skb_push(skb, skb->data - skb->nh.raw);
ahp->icv(ahp, skb, ah->auth_data);
if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
- LIMIT_NETDEBUG(
- printk(KERN_WARNING "ipsec ah authentication error\n"));
+ LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
x->stats.integrity_failed++;
goto free_out;
}
@@ -332,14 +331,14 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!x)
return;
- NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
- "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
- ntohl(ah->spi), NIP6(iph->daddr)));
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
+ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ ntohl(ah->spi), NIP6(iph->daddr));
xfrm_state_put(x);
}
-static int ah6_init_state(struct xfrm_state *x, void *args)
+static int ah6_init_state(struct xfrm_state *x)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
@@ -402,10 +401,8 @@ static int ah6_init_state(struct xfrm_state *x, void *args)
error:
if (ahp) {
- if (ahp->work_icv)
- kfree(ahp->work_icv);
- if (ahp->tfm)
- crypto_free_tfm(ahp->tfm);
+ kfree(ahp->work_icv);
+ crypto_free_tfm(ahp->tfm);
kfree(ahp);
}
return -EINVAL;
@@ -418,14 +415,10 @@ static void ah6_destroy(struct xfrm_state *x)
if (!ahp)
return;
- if (ahp->work_icv) {
- kfree(ahp->work_icv);
- ahp->work_icv = NULL;
- }
- if (ahp->tfm) {
- crypto_free_tfm(ahp->tfm);
- ahp->tfm = NULL;
- }
+ kfree(ahp->work_icv);
+ ahp->work_icv = NULL;
+ crypto_free_tfm(ahp->tfm);
+ ahp->tfm = NULL;
kfree(ahp);
}
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5d22ca3..6b72940 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
write_unlock_bh(&idev->lock);
dst_hold(&rt->u.dst);
- if (ip6_ins_rt(rt, NULL, NULL))
+ if (ip6_ins_rt(rt, NULL, NULL, NULL))
dst_release(&rt->u.dst);
addrconf_join_solict(dev, &aca->aca_addr);
@@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
addrconf_leave_solict(idev, &aca->aca_addr);
dst_hold(&aca->aca_rt->u.dst);
- if (ip6_del_rt(aca->aca_rt, NULL, NULL))
+ if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL))
dst_free(&aca->aca_rt->u.dst);
else
dst_release(&aca->aca_rt->u.dst);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 65b9375..01468fa 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -29,6 +29,7 @@
#include <net/addrconf.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
+#include <net/tcp_states.h>
#include <linux/errqueue.h>
#include <asm/uaccess.h>
@@ -353,14 +354,14 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
err = copied;
/* Reset and regenerate socket error */
- spin_lock_irq(&sk->sk_error_queue.lock);
+ spin_lock_bh(&sk->sk_error_queue.lock);
sk->sk_err = 0;
if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
- spin_unlock_irq(&sk->sk_error_queue.lock);
+ spin_unlock_bh(&sk->sk_error_queue.lock);
sk->sk_error_report(sk);
} else {
- spin_unlock_irq(&sk->sk_error_queue.lock);
+ spin_unlock_bh(&sk->sk_error_queue.lock);
}
out_free_skb:
@@ -588,8 +589,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
break;
default:
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type));
+ LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n",
+ cmsg->cmsg_type);
err = -EINVAL;
break;
};
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index be7095d..9b27460 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -212,8 +212,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru
padlen = nexthdr[0];
if (padlen+2 >= elen) {
- LIMIT_NETDEBUG(
- printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen));
+ LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen);
ret = -EINVAL;
goto out;
}
@@ -277,26 +276,18 @@ static void esp6_destroy(struct xfrm_state *x)
if (!esp)
return;
- if (esp->conf.tfm) {
- crypto_free_tfm(esp->conf.tfm);
- esp->conf.tfm = NULL;
- }
- if (esp->conf.ivec) {
- kfree(esp->conf.ivec);
- esp->conf.ivec = NULL;
- }
- if (esp->auth.tfm) {
- crypto_free_tfm(esp->auth.tfm);
- esp->auth.tfm = NULL;
- }
- if (esp->auth.work_icv) {
- kfree(esp->auth.work_icv);
- esp->auth.work_icv = NULL;
- }
+ crypto_free_tfm(esp->conf.tfm);
+ esp->conf.tfm = NULL;
+ kfree(esp->conf.ivec);
+ esp->conf.ivec = NULL;
+ crypto_free_tfm(esp->auth.tfm);
+ esp->auth.tfm = NULL;
+ kfree(esp->auth.work_icv);
+ esp->auth.work_icv = NULL;
kfree(esp);
}
-static int esp6_init_state(struct xfrm_state *x, void *args)
+static int esp6_init_state(struct xfrm_state *x)
{
struct esp_data *esp = NULL;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index e0839ea..5be6da2 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -424,8 +424,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
IP6CB(skb)->ra = optoff;
return 1;
}
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1]));
+ LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n",
+ skb->nh.raw[optoff+1]);
kfree_skb(skb);
return 0;
}
@@ -437,8 +437,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
u32 pkt_len;
if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1]));
+ LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
+ skb->nh.raw[optoff+1]);
IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
goto drop;
}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8e0f569..fa8f1bb 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -67,7 +67,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
-DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
/*
* The ICMP socket(s). This is the most convenient way to flow control
@@ -277,8 +277,8 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
{
struct inet6_dev *idev = NULL;
struct ipv6hdr *hdr = skb->nh.ipv6h;
- struct sock *sk = icmpv6_socket->sk;
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct sock *sk;
+ struct ipv6_pinfo *np;
struct in6_addr *saddr = NULL;
struct dst_entry *dst;
struct icmp6hdr tmp_hdr;
@@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
* for now we don't know that.
*/
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
return;
}
@@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
* Never answer to a ICMP packet.
*/
if (is_ineligible(skb)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n");
return;
}
@@ -358,6 +356,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
if (icmpv6_xmit_lock())
return;
+ sk = icmpv6_socket->sk;
+ np = inet6_sk(sk);
+
if (!icmpv6_xrlim_allow(sk, type, &fl))
goto out;
@@ -390,8 +391,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
len = skb->len - msg.offset;
len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr));
if (len < 0) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmp: len problem\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n");
goto out_dst_release;
}
@@ -423,9 +423,9 @@ out:
static void icmpv6_echo_reply(struct sk_buff *skb)
{
- struct sock *sk = icmpv6_socket->sk;
+ struct sock *sk;
struct inet6_dev *idev;
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_pinfo *np;
struct in6_addr *saddr = NULL;
struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw;
struct icmp6hdr tmp_hdr;
@@ -454,6 +454,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (icmpv6_xmit_lock())
return;
+ sk = icmpv6_socket->sk;
+ np = inet6_sk(sk);
+
if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
fl.oif = np->mcast_oif;
@@ -545,7 +548,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
read_lock(&raw_v6_lock);
if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) {
- while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) {
+ while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr,
+ IP6CB(skb)->iif))) {
rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
sk = sk_next(sk);
}
@@ -577,17 +581,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
skb->csum)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ICMPv6 hw checksum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n");
skb->ip_summed = CHECKSUM_NONE;
}
}
if (skb->ip_summed == CHECKSUM_NONE) {
if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
skb_checksum(skb, 0, skb->len, 0))) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
- NIP6(*saddr), NIP6(*daddr)));
+ LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
+ NIP6(*saddr), NIP6(*daddr));
goto discard_it;
}
}
@@ -663,8 +665,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
break;
default:
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "icmpv6: msg of unknown type\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");
/* informational */
if (type & ICMPV6_INFOMSG_MASK)
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
new file mode 100644
index 0000000..01d5f46
--- /dev/null
+++ b/net/ipv6/inet6_hashtables.c
@@ -0,0 +1,81 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic INET6 transport hashtables
+ *
+ * Authors: Lotsa people, from code originally in tcp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+
+#include <linux/module.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+
+struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
+ const struct in6_addr *daddr,
+ const unsigned short hnum, const int dif)
+{
+ struct sock *sk;
+ const struct hlist_node *node;
+ struct sock *result = NULL;
+ int score, hiscore = 0;
+
+ read_lock(&hashinfo->lhash_lock);
+ sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
+ if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+
+ score = 1;
+ if (!ipv6_addr_any(&np->rcv_saddr)) {
+ if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+ continue;
+ score++;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if (score == 3) {
+ result = sk;
+ break;
+ }
+ if (score > hiscore) {
+ hiscore = score;
+ result = sk;
+ }
+ }
+ }
+ if (result)
+ sock_hold(result);
+ read_unlock(&hashinfo->lhash_lock);
+ return result;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+
+struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
+ const struct in6_addr *saddr, const u16 sport,
+ const struct in6_addr *daddr, const u16 dport,
+ const int dif)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+ sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+ local_bh_enable();
+
+ return sk;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 405740b..16af874 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -49,7 +49,7 @@
struct rt6_statistics rt6_stats;
-static kmem_cache_t * fib6_node_kmem;
+static kmem_cache_t * fib6_node_kmem __read_mostly;
enum fib_walk_state_t
{
@@ -394,7 +394,7 @@ insert_above:
*/
static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
- struct nlmsghdr *nlh)
+ struct nlmsghdr *nlh, struct netlink_skb_parms *req)
{
struct rt6_info *iter = NULL;
struct rt6_info **ins;
@@ -449,7 +449,7 @@ out:
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
- inet6_rt_notify(RTM_NEWROUTE, rt, nlh);
+ inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
rt6_stats.fib_rt_entries++;
if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -479,7 +479,8 @@ void fib6_force_start_gc(void)
* with source addr info in sub-trees
*/
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+ struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
{
struct fib6_node *fn;
int err = -ENOMEM;
@@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh,
}
#endif
- err = fib6_add_rt2node(fn, rt, nlh);
+ err = fib6_add_rt2node(fn, rt, nlh, req);
if (err == 0) {
fib6_start_gc(rt);
@@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
}
static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
- struct nlmsghdr *nlh, void *_rtattr)
+ struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
{
struct fib6_walker_t *w;
struct rt6_info *rt = *rtp;
@@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
if (atomic_read(&rt->rt6i_ref) != 1) BUG();
}
- inet6_rt_notify(RTM_DELROUTE, rt, nlh);
+ inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
rt6_release(rt);
}
-int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
{
struct fib6_node *fn = rt->rt6i_node;
struct rt6_info **rtp;
@@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
if (*rtp == rt) {
- fib6_del_route(fn, rtp, nlh, _rtattr);
+ fib6_del_route(fn, rtp, nlh, _rtattr, req);
return 0;
}
}
@@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
res = c->func(rt, c->arg);
if (res < 0) {
w->leaf = rt;
- res = fib6_del(rt, NULL, NULL);
+ res = fib6_del(rt, NULL, NULL, NULL);
if (res) {
#if RT6_DEBUG >= 2
printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499..b6c73da5 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
opt_space->opt_nflen = 0;
}
opt_space->dst1opt = fopt->dst1opt;
- opt_space->auth = fopt->auth;
opt_space->opt_flen = fopt->opt_flen;
return opt_space;
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 866f107..6e34804 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb)
return dst_input(skb);
}
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct ipv6hdr *hdr;
u32 pkt_len;
@@ -166,8 +166,8 @@ resubmit:
nexthdr = skb->nh.raw[nhoff];
raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
- if (raw_sk)
- ipv6_raw_deliver(skb, nexthdr);
+ if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
+ raw_sk = NULL;
hash = nexthdr & (MAX_INET_PROTOS - 1);
if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
@@ -198,12 +198,13 @@ resubmit:
if (!raw_sk) {
if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
- icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff);
+ icmpv6_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_UNK_NEXTHDR, nhoff,
+ skb->dev);
}
- } else {
+ } else
IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
- kfree_skb(skb);
- }
+ kfree_skb(skb);
}
rcu_read_unlock();
return 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b78a535..01ef94f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -153,51 +153,6 @@ int ip6_output(struct sk_buff *skb)
return ip6_output2(skb);
}
-#ifdef CONFIG_NETFILTER
-int ip6_route_me_harder(struct sk_buff *skb)
-{
- struct ipv6hdr *iph = skb->nh.ipv6h;
- struct dst_entry *dst;
- struct flowi fl = {
- .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
- .nl_u =
- { .ip6_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr, } },
- .proto = iph->nexthdr,
- };
-
- dst = ip6_route_output(skb->sk, &fl);
-
- if (dst->error) {
- IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
- dst_release(dst);
- return -EINVAL;
- }
-
- /* Drop old route. */
- dst_release(skb->dst);
-
- skb->dst = dst;
- return 0;
-}
-#endif
-
-static inline int ip6_maybe_reroute(struct sk_buff *skb)
-{
-#ifdef CONFIG_NETFILTER
- if (skb->nfcache & NFC_ALTERED){
- if (ip6_route_me_harder(skb) != 0){
- kfree_skb(skb);
- return -EINVAL;
- }
- }
-#endif /* CONFIG_NETFILTER */
- return dst_output(skb);
-}
-
/*
* xmit an sk_buff (used by TCP)
*/
@@ -266,7 +221,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || ipfragok) {
IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
- return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
+ return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
+ dst_output);
}
if (net_ratelimit())
@@ -321,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
read_lock(&ip6_ra_lock);
for (ra = ip6_ra_chain; ra; ra = ra->next) {
struct sock *sk = ra->sk;
- if (sk && ra->sel == sel) {
+ if (sk && ra->sel == sel &&
+ (!sk->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == skb->dev->ifindex)) {
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
@@ -465,7 +423,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
- to->security = from->security;
dst_release(to->dst);
to->dst = dst_clone(from->dst);
to->dev = from->dev;
@@ -484,9 +441,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->nf_bridge = from->nf_bridge;
nf_bridge_get(to->nf_bridge);
#endif
-#ifdef CONFIG_NETFILTER_DEBUG
- to->nf_debug = from->nf_debug;
-#endif
#endif
}
@@ -671,7 +625,7 @@ slow_path:
*/
if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
- NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
+ NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
err = -ENOMEM;
goto fail;
@@ -796,13 +750,8 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
if (ipv6_addr_any(&fl->fl6_src)) {
err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
- if (err) {
-#if IP6_DEBUG >= 2
- printk(KERN_DEBUG "ip6_dst_lookup: "
- "no available source address\n");
-#endif
+ if (err)
goto out_err_release;
- }
}
return 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ba3b0c2..0961372 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1110,11 +1110,39 @@ ip6ip6_fb_tnl_dev_init(struct net_device *dev)
return 0;
}
+#ifdef CONFIG_INET6_TUNNEL
static struct xfrm6_tunnel ip6ip6_handler = {
- .handler = ip6ip6_rcv,
- .err_handler = ip6ip6_err,
+ .handler = ip6ip6_rcv,
+ .err_handler = ip6ip6_err,
};
+static inline int ip6ip6_register(void)
+{
+ return xfrm6_tunnel_register(&ip6ip6_handler);
+}
+
+static inline int ip6ip6_unregister(void)
+{
+ return xfrm6_tunnel_deregister(&ip6ip6_handler);
+}
+#else
+static struct inet6_protocol xfrm6_tunnel_protocol = {
+ .handler = ip6ip6_rcv,
+ .err_handler = ip6ip6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
+static inline int ip6ip6_register(void)
+{
+ return inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
+}
+
+static inline int ip6ip6_unregister(void)
+{
+ return inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6);
+}
+#endif
+
/**
* ip6_tunnel_init - register protocol and reserve needed resources
*
@@ -1125,7 +1153,7 @@ static int __init ip6_tunnel_init(void)
{
int err;
- if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) {
+ if (ip6ip6_register() < 0) {
printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
return -EAGAIN;
}
@@ -1144,7 +1172,7 @@ static int __init ip6_tunnel_init(void)
}
return 0;
fail:
- xfrm6_tunnel_deregister(&ip6ip6_handler);
+ ip6ip6_unregister();
return err;
}
@@ -1154,7 +1182,7 @@ fail:
static void __exit ip6_tunnel_cleanup(void)
{
- if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0)
+ if (ip6ip6_unregister() < 0)
printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n");
unregister_netdev(ip6ip6_fb_tnl_dev);
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 6cde531..85bfbc6 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -234,14 +234,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
t->props.mode = 1;
memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
- t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family);
- if (t->type == NULL)
+ if (xfrm_init_state(t))
goto error;
- if (t->type->init_state(t, NULL))
- goto error;
-
- t->km.state = XFRM_STATE_VALID;
atomic_set(&t->tunnel_users, 1);
out:
@@ -346,8 +341,7 @@ static void ipcomp6_free_tfms(struct crypto_tfm **tfms)
for_each_cpu(cpu) {
struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
}
free_percpu(tfms);
}
@@ -359,7 +353,7 @@ static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name)
int cpu;
/* This can be any valid CPU ID so we don't need locking. */
- cpu = smp_processor_id();
+ cpu = raw_smp_processor_id();
list_for_each_entry(pos, &ipcomp6_tfms_list, list) {
struct crypto_tfm *tfm;
@@ -420,7 +414,7 @@ static void ipcomp6_destroy(struct xfrm_state *x)
xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
}
-static int ipcomp6_init_state(struct xfrm_state *x, void *args)
+static int ipcomp6_init_state(struct xfrm_state *x)
{
int err;
struct ipcomp_data *ipcd;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 279ab86..76466af 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,7 +55,7 @@
#include <asm/uaccess.h>
-DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
static struct packet_type ipv6_packet_type = {
.type = __constant_htons(ETH_P_IPV6),
@@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
return 0;
}
-extern int ip6_mc_source(int add, int omode, struct sock *sk,
- struct group_source_req *pgsr);
-extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
-extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen);
-
-
int ipv6_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, int optlen)
{
@@ -163,6 +156,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
fl6_free_socklist(sk);
ipv6_sock_mc_close(sk);
+ /*
+ * Sock is moving from IPv6 to IPv4 (sk_prot), so
+ * remove it from the refcnt debug socks count in the
+ * original family...
+ */
+ sk_refcnt_debug_dec(sk);
+
if (sk->sk_protocol == IPPROTO_TCP) {
struct tcp_sock *tp = tcp_sk(sk);
@@ -192,9 +192,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
kfree_skb(pktopt);
sk->sk_destruct = inet_sock_destruct;
-#ifdef INET_REFCNT_DEBUG
- atomic_dec(&inet6_sock_nr);
-#endif
+ /*
+ * ... and add it to the refcnt debug socks count
+ * in the new family. -acme
+ */
+ sk_refcnt_debug_inc(sk);
module_put(THIS_MODULE);
retv = 0;
break;
@@ -423,11 +425,12 @@ done:
psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
&psin6->sin6_addr);
- if (retv)
+ /* prior join w/ different source is ok */
+ if (retv && retv != -EADDRINUSE)
break;
omode = MCAST_INCLUDE;
add = 1;
- } else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
omode = MCAST_INCLUDE;
add = 0;
}
@@ -436,7 +439,6 @@ done:
}
case MCAST_MSFILTER:
{
- extern int sysctl_optmem_max;
extern int sysctl_mld_max_msf;
struct group_filter *gsf;
@@ -503,6 +505,9 @@ done:
break;
case IPV6_IPSEC_POLICY:
case IPV6_XFRM_POLICY:
+ retv = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ break;
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 5ade5a5..37a4a99 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map);
EXPORT_SYMBOL(register_inet6addr_notifier);
EXPORT_SYMBOL(unregister_inet6addr_notifier);
EXPORT_SYMBOL(ip6_route_output);
-#ifdef CONFIG_NETFILTER
-EXPORT_SYMBOL(ip6_route_me_harder);
-#endif
EXPORT_SYMBOL(addrconf_lock);
EXPORT_SYMBOL(ipv6_setsockopt);
EXPORT_SYMBOL(ipv6_getsockopt);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 393b6e6..29fed6e 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
if (!ipv6_addr_is_multicast(addr))
return -EINVAL;
+ read_lock_bh(&ipv6_sk_mc_lock);
+ for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
+ if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+ ipv6_addr_equal(&mc_lst->addr, addr)) {
+ read_unlock_bh(&ipv6_sk_mc_lock);
+ return -EADDRINUSE;
+ }
+ }
+ read_unlock_bh(&ipv6_sk_mc_lock);
+
mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
if (mc_lst == NULL)
@@ -271,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
}
write_unlock_bh(&ipv6_sk_mc_lock);
- return -ENOENT;
+ return -EADDRNOTAVAIL;
}
static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
@@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
int i, j, rv;
+ int leavegroup = 0;
int err;
if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -368,18 +379,23 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
err = -EADDRNOTAVAIL;
+ read_lock_bh(&ipv6_sk_mc_lock);
for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
/* if a source filter was set, must be the same mode as before */
if (pmc->sflist) {
- if (pmc->sfmode != omode)
+ if (pmc->sfmode != omode) {
+ err = -EINVAL;
goto done;
+ }
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
@@ -390,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
psl = pmc->sflist;
if (!add) {
if (!psl)
- goto done;
+ goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
for (i=0; i<psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], source,
@@ -399,7 +415,13 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
break;
}
if (rv) /* source not found */
+ goto done; /* err = -EADDRNOTAVAIL */
+
+ /* special case - (INCLUDE, empty) == LEAVE_GROUP */
+ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+ leavegroup = 1;
goto done;
+ }
/* update the interface filter */
ip6_mc_del_src(idev, group, omode, 1, source, 1);
@@ -453,9 +475,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
/* update the interface list */
ip6_mc_add_src(idev, group, omode, 1, source, 1);
done:
+ read_unlock_bh(&ipv6_sk_mc_lock);
read_unlock_bh(&idev->lock);
in6_dev_put(idev);
dev_put(dev);
+ if (leavegroup)
+ return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
return err;
}
@@ -467,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
struct inet6_dev *idev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *newpsl, *psl;
+ int leavegroup = 0;
int i, err;
group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
@@ -482,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
if (!idev)
return -ENODEV;
dev = idev->dev;
- err = -EADDRNOTAVAIL;
+
+ err = 0;
+ if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
+ leavegroup = 1;
+ goto done;
+ }
for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
if (pmc->ifindex != gsf->gf_interface)
@@ -490,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
if (ipv6_addr_equal(&pmc->addr, group))
break;
}
- if (!pmc) /* must have a prior join */
+ if (!pmc) { /* must have a prior join */
+ err = -EINVAL;
goto done;
+ }
if (gsf->gf_numsrc) {
newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk,
IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC);
@@ -523,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
(void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
pmc->sflist = newpsl;
pmc->sfmode = gsf->gf_fmode;
+ err = 0;
done:
read_unlock_bh(&idev->lock);
in6_dev_put(idev);
dev_put(dev);
+ if (leavegroup)
+ err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
return err;
}
@@ -1280,15 +1316,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
return NULL;
skb_reserve(skb, LL_RESERVED_SPACE(dev));
- if (dev->hard_header) {
- unsigned char ha[MAX_ADDR_LEN];
-
- ndisc_mc_map(&mld2_all_mcr, ha, dev, 1);
- if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) {
- kfree_skb(skb);
- return NULL;
- }
- }
if (ipv6_get_lladdr(dev, &addr_buf)) {
/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1312,6 +1339,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
return skb;
}
+static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+
+ if (dev->hard_header) {
+ unsigned char ha[MAX_ADDR_LEN];
+ int err;
+
+ ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
+ err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+ }
+ return dev_queue_xmit(skb);
+}
+
+static inline int mld_dev_queue_xmit(struct sk_buff *skb)
+{
+ return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
+ mld_dev_queue_xmit2);
+}
+
static void mld_sendpack(struct sk_buff *skb)
{
struct ipv6hdr *pip6 = skb->nh.ipv6h;
@@ -1329,7 +1380,7 @@ static void mld_sendpack(struct sk_buff *skb)
pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
- dev_queue_xmit);
+ mld_dev_queue_xmit);
if (!err) {
ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
@@ -1635,12 +1686,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
}
skb_reserve(skb, LL_RESERVED_SPACE(dev));
- if (dev->hard_header) {
- unsigned char ha[MAX_ADDR_LEN];
- ndisc_mc_map(snd_addr, ha, dev, 1);
- if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0)
- goto out;
- }
if (ipv6_get_lladdr(dev, &addr_buf)) {
/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1668,7 +1713,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
idev = in6_dev_get(skb->dev);
err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
- dev_queue_xmit);
+ mld_dev_queue_xmit);
if (!err) {
if (type == ICMPV6_MGM_REDUCTION)
ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
@@ -1682,10 +1727,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
if (likely(idev != NULL))
in6_dev_put(idev);
return;
-
-out:
- IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
}
static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7c291f4..a7eae30 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -812,7 +812,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
if (ipv6_chk_acast_addr(dev, &msg->target) ||
(idev->cnf.forwarding &&
pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) {
- if (skb->stamp.tv_sec != LOCALLY_ENQUEUED &&
+ if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
skb->pkt_type != PACKET_HOST &&
inc != 0 &&
idev->nd_parms->proxy_delay != 0) {
@@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
struct rt6_info *rt;
rt = rt6_get_dflt_router(saddr, dev);
if (rt)
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
}
out:
@@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (rt && lifetime == 0) {
neigh_clone(neigh);
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
rt = NULL;
}
@@ -1487,6 +1487,8 @@ int ndisc_rcv(struct sk_buff *skb)
return 0;
}
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
switch (msg->icmph.icmp6_type) {
case NDISC_NEIGHBOUR_SOLICITATION:
ndisc_recv_ns(skb);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
new file mode 100644
index 0000000..f8626eb
--- /dev/null
+++ b/net/ipv6/netfilter.c
@@ -0,0 +1,104 @@
+#include <linux/config.h>
+#include <linux/init.h>
+
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+
+int ip6_route_me_harder(struct sk_buff *skb)
+{
+ struct ipv6hdr *iph = skb->nh.ipv6h;
+ struct dst_entry *dst;
+ struct flowi fl = {
+ .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+ .nl_u =
+ { .ip6_u =
+ { .daddr = iph->daddr,
+ .saddr = iph->saddr, } },
+ .proto = iph->nexthdr,
+ };
+
+ dst = ip6_route_output(skb->sk, &fl);
+
+ if (dst->error) {
+ IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
+ LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
+ dst_release(dst);
+ return -EINVAL;
+ }
+
+ /* Drop old route. */
+ dst_release(skb->dst);
+
+ skb->dst = dst;
+ return 0;
+}
+EXPORT_SYMBOL(ip6_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip6_rt_info {
+ struct in6_addr daddr;
+ struct in6_addr saddr;
+};
+
+static void save(const struct sk_buff *skb, struct nf_info *info)
+{
+ struct ip6_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP6_LOCAL_OUT) {
+ struct ipv6hdr *iph = skb->nh.ipv6h;
+
+ rt_info->daddr = iph->daddr;
+ rt_info->saddr = iph->saddr;
+ }
+}
+
+static int reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+ struct ip6_rt_info *rt_info = nf_info_reroute(info);
+
+ if (info->hook == NF_IP6_LOCAL_OUT) {
+ struct ipv6hdr *iph = (*pskb)->nh.ipv6h;
+ if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+ !ipv6_addr_equal(&iph->saddr, &rt_info->saddr))
+ return ip6_route_me_harder(*pskb);
+ }
+ return 0;
+}
+
+static struct nf_queue_rerouter ip6_reroute = {
+ .rer_size = sizeof(struct ip6_rt_info),
+ .save = &save,
+ .reroute = &reroute,
+};
+
+int __init ipv6_netfilter_init(void)
+{
+ return nf_register_queue_rerouter(PF_INET6, &ip6_reroute);
+}
+
+void ipv6_netfilter_fini(void)
+{
+ nf_unregister_queue_rerouter(PF_INET6);
+}
+
+#else /* CONFIG_NETFILTER */
+int __init ipv6_netfilter_init(void)
+{
+ return 0;
+}
+
+void ipv6_netfilter_fini(void)
+{
+}
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 77ec704..216fbe1a 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK
#fi
config IP6_NF_QUEUE
- tristate "Userspace queueing via NETLINK"
+ tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
---help---
This option adds a queue handler to the kernel for IPv6
- packets which lets us to receive the filtered packets
- with QUEUE target using libiptc as we can do with
- the IPv4 now.
+ packets which enables users to receive the filtered packets
+ with QUEUE target using libipq.
+
+ THis option enables the old IPv6-only "ip6_queue" implementation
+ which has been obsoleted by the new "nfnetlink_queue" code (see
+ CONFIG_NETFILTER_NETLINK_QUEUE).
(C) Fernando Anton 2001
IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
@@ -196,6 +199,16 @@ config IP6_NF_TARGET_LOG
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_REJECT
+ tristate "REJECT target support"
+ depends on IP6_NF_FILTER
+ help
+ The REJECT target allows a filtering rule to specify that an ICMPv6
+ error should be issued in response to an incoming packet, rather
+ than silently being dropped.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then
# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER
# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
@@ -226,6 +239,22 @@ config IP6_NF_TARGET_MARK
To compile it as a module, choose M here. If unsure, say N.
+config IP6_NF_TARGET_HL
+ tristate 'HL (hoplimit) target support'
+ depends on IP6_NF_MANGLE
+ help
+ This option adds a `HL' target, which enables the user to decrement
+ the hoplimit value of the IPv6 header or set it to a given (lower)
+ value.
+
+ While it is safe to decrement the hoplimit value, this option also
+ enables functionality to increment and set the hoplimit value of the
+ IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since
+ you can easily create immortal packets that loop forever on the
+ network.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
config IP6_NF_RAW
tristate 'raw table support (required for TRACE)'
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2e51714..bd9a16a 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -20,7 +20,10 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o
obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o
+obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o
obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
+obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 750943e..aa11cf3 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -47,16 +47,10 @@
#define NET_IPQ_QMAX 2088
#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
-struct ipq_rt_info {
- struct in6_addr daddr;
- struct in6_addr saddr;
-};
-
struct ipq_queue_entry {
struct list_head list;
struct nf_info *info;
struct sk_buff *skb;
- struct ipq_rt_info rt_info;
};
typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -76,7 +70,9 @@ static DECLARE_MUTEX(ipqnl_sem);
static void
ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
{
+ local_bh_disable();
nf_reinject(entry->skb, entry->info, verdict);
+ local_bh_enable();
kfree(entry);
}
@@ -209,6 +205,12 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
break;
case IPQ_COPY_PACKET:
+ if (entry->skb->ip_summed == CHECKSUM_HW &&
+ (*errp = skb_checksum_help(entry->skb,
+ entry->info->outdev == NULL))) {
+ read_unlock_bh(&queue_lock);
+ return NULL;
+ }
if (copy_range == 0 || copy_range > entry->skb->len)
data_len = entry->skb->len;
else
@@ -236,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
pmsg->packet_id = (unsigned long )entry;
pmsg->data_len = data_len;
- pmsg->timestamp_sec = entry->skb->stamp.tv_sec;
- pmsg->timestamp_usec = entry->skb->stamp.tv_usec;
+ pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
+ pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
pmsg->mark = entry->skb->nfmark;
pmsg->hook = entry->info->hook;
pmsg->hw_protocol = entry->skb->protocol;
@@ -276,7 +278,8 @@ nlmsg_failure:
}
static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+ unsigned int queuenum, void *data)
{
int status = -EINVAL;
struct sk_buff *nskb;
@@ -294,13 +297,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
entry->info = info;
entry->skb = skb;
- if (entry->info->hook == NF_IP_LOCAL_OUT) {
- struct ipv6hdr *iph = skb->nh.ipv6h;
-
- entry->rt_info.daddr = iph->daddr;
- entry->rt_info.saddr = iph->saddr;
- }
-
nskb = ipq_build_packet_message(entry, &status);
if (nskb == NULL)
goto err_out_free;
@@ -376,22 +372,11 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
}
skb_put(e->skb, diff);
}
- if (!skb_ip_make_writable(&e->skb, v->data_len))
+ if (!skb_make_writable(&e->skb, v->data_len))
return -ENOMEM;
memcpy(e->skb->data, v->payload, v->data_len);
- e->skb->nfcache |= NFC_ALTERED;
-
- /*
- * Extra routing may needed on local out, as the QUEUE target never
- * returns control to the table.
- * Not a nice way to cmp, but works
- */
- if (e->info->hook == NF_IP_LOCAL_OUT) {
- struct ipv6hdr *iph = e->skb->nh.ipv6h;
- if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) ||
- !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr))
- return ip6_route_me_harder(e->skb);
- }
+ e->skb->ip_summed = CHECKSUM_NONE;
+
return 0;
}
@@ -667,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
return len;
}
+static struct nf_queue_handler nfqh = {
+ .name = "ip6_queue",
+ .outfn = &ipq_enqueue_packet,
+};
+
static int
init_or_cleanup(int init)
{
@@ -677,7 +667,8 @@ init_or_cleanup(int init)
goto cleanup;
netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk);
+ ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk,
+ THIS_MODULE);
if (ipqnl == NULL) {
printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
@@ -694,7 +685,7 @@ init_or_cleanup(int init)
register_netdevice_notifier(&ipq_dev_notifier);
ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
- status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL);
+ status = nf_register_queue_handler(PF_INET6, &nfqh);
if (status < 0) {
printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
goto cleanup_sysctl;
@@ -702,7 +693,7 @@ init_or_cleanup(int init)
return status;
cleanup:
- nf_unregister_queue_handler(PF_INET6);
+ nf_unregister_queue_handlers(&nfqh);
synchronize_net();
ipq_flush(NF_DROP);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c735276..1cb8adb 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex);
/* Must have mutex */
#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
#include <linux/netfilter_ipv4/listhelp.h>
#if 0
@@ -402,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb,
do {
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
- (*pskb)->nfcache |= e->nfcache;
if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6,
&protoff, &offset)) {
struct ip6t_entry_target *t;
@@ -435,8 +433,8 @@ ip6t_do_table(struct sk_buff **pskb,
back->comefrom);
continue;
}
- if (table_base + v
- != (void *)e + e->next_offset) {
+ if (table_base + v != (void *)e + e->next_offset
+ && !(e->ipv6.flags & IP6T_F_GOTO)) {
/* Save old back ptr in next entry */
struct ip6t_entry *next
= (void *)e + e->next_offset;
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
new file mode 100644
index 0000000..8f5549b
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -0,0 +1,118 @@
+/*
+ * Hop Limit modification target for ip6tables
+ * Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ * Based on HW's TTL module
+ *
+ * This software is distributed under the terms of GNU GPL
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_HL.h>
+
+MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
+MODULE_DESCRIPTION("IP tables Hop Limit modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int ip6t_hl_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo, void *userinfo)
+{
+ struct ipv6hdr *ip6h;
+ const struct ip6t_HL_info *info = targinfo;
+ u_int16_t diffs[2];
+ int new_hl;
+
+ if (!skb_make_writable(pskb, (*pskb)->len))
+ return NF_DROP;
+
+ ip6h = (*pskb)->nh.ipv6h;
+
+ switch (info->mode) {
+ case IP6T_HL_SET:
+ new_hl = info->hop_limit;
+ break;
+ case IP6T_HL_INC:
+ new_hl = ip6h->hop_limit + info->hop_limit;
+ if (new_hl > 255)
+ new_hl = 255;
+ break;
+ case IP6T_HL_DEC:
+ new_hl = ip6h->hop_limit - info->hop_limit;
+ if (new_hl < 0)
+ new_hl = 0;
+ break;
+ default:
+ new_hl = ip6h->hop_limit;
+ break;
+ }
+
+ if (new_hl != ip6h->hop_limit) {
+ diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF;
+ ip6h->hop_limit = new_hl;
+ diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8);
+ }
+
+ return IP6T_CONTINUE;
+}
+
+static int ip6t_hl_checkentry(const char *tablename,
+ const struct ip6t_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ip6t_HL_info *info = targinfo;
+
+ if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) {
+ printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n",
+ targinfosize,
+ IP6T_ALIGN(sizeof(struct ip6t_HL_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle")) {
+ printk(KERN_WARNING "ip6t_HL: can only be called from "
+ "\"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (info->mode > IP6T_HL_MAXMODE) {
+ printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n",
+ info->mode);
+ return 0;
+ }
+
+ if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) {
+ printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't "
+ "make sense with value 0\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ip6t_target ip6t_HL = {
+ .name = "HL",
+ .target = ip6t_hl_target,
+ .checkentry = ip6t_hl_checkentry,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ return ip6t_register_target(&ip6t_HL);
+}
+
+static void __exit fini(void)
+{
+ ip6t_unregister_target(&ip6t_HL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index bfc3d01..0cd1d1b 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("IP6 tables LOG target module");
MODULE_LICENSE("GPL");
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
struct in_device;
#include <net/route.h>
#include <linux/netfilter_ipv6/ip6t_LOG.h>
@@ -44,7 +40,7 @@ struct in_device;
static DEFINE_SPINLOCK(log_lock);
/* One level of recursion won't kill us */
-static void dump_packet(const struct ip6t_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int ip6hoff,
int recurse)
{
@@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info,
struct ipv6hdr _ip6h, *ih;
unsigned int ptr;
unsigned int hdrlen = 0;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
if (ih == NULL) {
@@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info,
}
/* Max length: 48 "OPT (...) " */
- if (info->logflags & IP6T_LOG_IPOPT)
+ if (logflags & IP6T_LOG_IPOPT)
printk("OPT ( ");
switch (currenthdr) {
@@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info,
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
if (fragment) {
- if (info->logflags & IP6T_LOG_IPOPT)
+ if (logflags & IP6T_LOG_IPOPT)
printk(")");
return;
}
@@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info,
break;
/* Max Length */
case IPPROTO_AH:
- if (info->logflags & IP6T_LOG_IPOPT) {
+ if (logflags & IP6T_LOG_IPOPT) {
struct ip_auth_hdr _ahdr, *ah;
/* Max length: 3 "AH " */
@@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info,
hdrlen = (hp->hdrlen+2)<<2;
break;
case IPPROTO_ESP:
- if (info->logflags & IP6T_LOG_IPOPT) {
+ if (logflags & IP6T_LOG_IPOPT) {
struct ip_esp_hdr _esph, *eh;
/* Max length: 4 "ESP " */
@@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info,
printk("Unknown Ext Hdr %u", currenthdr);
return;
}
- if (info->logflags & IP6T_LOG_IPOPT)
+ if (logflags & IP6T_LOG_IPOPT)
printk(") ");
currenthdr = hp->nexthdr;
@@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info,
printk("SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (info->logflags & IP6T_LOG_TCPSEQ)
+ if (logflags & IP6T_LOG_TCPSEQ)
printk("SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
@@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info,
/* Max length: 11 "URGP=65535 " */
printk("URGP=%u ", ntohs(th->urg_ptr));
- if ((info->logflags & IP6T_LOG_TCPOPT)
+ if ((logflags & IP6T_LOG_TCPOPT)
&& th->doff * 4 > sizeof(struct tcphdr)) {
u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
unsigned int i;
@@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info,
}
/* Max length: 15 "UID=4294967295 " */
- if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) {
+ if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -357,59 +359,58 @@ static void dump_packet(const struct ip6t_log_info *info,
}
}
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 0,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
static void
-ip6t_log_packet(unsigned int hooknum,
+ip6t_log_packet(unsigned int pf,
+ unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct ip6t_log_info *loginfo,
- const char *level_string,
+ const struct nf_loginfo *loginfo,
const char *prefix)
{
- struct ipv6hdr *ipv6h = skb->nh.ipv6h;
+ if (!loginfo)
+ loginfo = &default_loginfo;
spin_lock_bh(&log_lock);
- printk(level_string);
- printk("%sIN=%s OUT=%s ",
- prefix == NULL ? loginfo->prefix : prefix,
+ printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ prefix,
in ? in->name : "",
out ? out->name : "");
if (in && !out) {
+ unsigned int len;
/* MAC logging for input chain only. */
printk("MAC=");
- if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) {
- if (skb->dev->type != ARPHRD_SIT){
- int i;
- unsigned char *p = skb->mac.raw;
- for (i = 0; i < skb->dev->hard_header_len; i++,p++)
- printk("%02x%c", *p,
- i==skb->dev->hard_header_len - 1
- ? ' ':':');
- } else {
- int i;
- unsigned char *p = skb->mac.raw;
- if ( p - (ETH_ALEN*2+2) > skb->head ){
- p -= (ETH_ALEN+2);
- for (i = 0; i < (ETH_ALEN); i++,p++)
- printk("%02x%s", *p,
- i == ETH_ALEN-1 ? "->" : ":");
- p -= (ETH_ALEN*2);
- for (i = 0; i < (ETH_ALEN); i++,p++)
- printk("%02x%c", *p,
- i == ETH_ALEN-1 ? ' ' : ':');
- }
-
- if ((skb->dev->addr_len == 4) &&
- skb->dev->hard_header_len > 20){
- printk("TUNNEL=");
- p = skb->mac.raw + 12;
- for (i = 0; i < 4; i++,p++)
- printk("%3d%s", *p,
- i == 3 ? "->" : ".");
- for (i = 0; i < 4; i++,p++)
- printk("%3d%c", *p,
- i == 3 ? ' ' : '.');
- }
+ if (skb->dev && (len = skb->dev->hard_header_len) &&
+ skb->mac.raw != skb->nh.raw) {
+ unsigned char *p = skb->mac.raw;
+ int i;
+
+ if (skb->dev->type == ARPHRD_SIT &&
+ (p -= ETH_HLEN) < skb->head)
+ p = NULL;
+
+ if (p != NULL) {
+ for (i = 0; i < len; i++)
+ printk("%02x%s", p[i],
+ i == len - 1 ? "" : ":");
+ }
+ printk(" ");
+
+ if (skb->dev->type == ARPHRD_SIT) {
+ struct iphdr *iph = (struct iphdr *)skb->mac.raw;
+ printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
}
} else
printk(" ");
@@ -429,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb,
void *userinfo)
{
const struct ip6t_log_info *loginfo = targinfo;
- char level_string[4] = "< >";
+ struct nf_loginfo li;
+
+ li.type = NF_LOG_TYPE_LOG;
+ li.u.log.level = loginfo->level;
+ li.u.log.logflags = loginfo->logflags;
- level_string[1] = '0' + (loginfo->level % 8);
- ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+ nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix);
return IP6T_CONTINUE;
}
-static void
-ip6t_logfn(unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const char *prefix)
-{
- struct ip6t_log_info loginfo = {
- .level = 0,
- .logflags = IP6T_LOG_MASK,
- .prefix = ""
- };
-
- ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
-}
static int ip6t_log_checkentry(const char *tablename,
const struct ip6t_entry *e,
@@ -488,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = {
.me = THIS_MODULE,
};
+static struct nf_logger ip6t_logger = {
+ .name = "ip6t_LOG",
+ .logfn = &ip6t_log_packet,
+ .me = THIS_MODULE,
+};
+
static int __init init(void)
{
if (ip6t_register_target(&ip6t_log_reg))
return -EINVAL;
- if (nflog)
- nf_log_register(PF_INET6, &ip6t_logfn);
+ if (nf_log_register(PF_INET6, &ip6t_logger) < 0) {
+ printk(KERN_WARNING "ip6t_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET6\n");
+ /* we cannot make module load fail here, since otherwise
+ * ip6tables userspace would abort */
+ }
return 0;
}
static void __exit fini(void)
{
- if (nflog)
- nf_log_unregister(PF_INET6, &ip6t_logfn);
+ nf_log_unregister_logger(&ip6t_logger);
ip6t_unregister_target(&ip6t_log_reg);
}
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
index d09ceb0..81924fc 100644
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ b/net/ipv6/netfilter/ip6t_MARK.c
@@ -28,10 +28,9 @@ target(struct sk_buff **pskb,
{
const struct ip6t_mark_target_info *markinfo = targinfo;
- if((*pskb)->nfmark != markinfo->mark) {
+ if((*pskb)->nfmark != markinfo->mark)
(*pskb)->nfmark = markinfo->mark;
- (*pskb)->nfcache |= NFC_ALTERED;
- }
+
return IP6T_CONTINUE;
}
diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c
new file mode 100644
index 0000000..c6e3730
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* ip6tables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_NFQ_info *tinfo = targinfo;
+
+ return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ip6t_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) {
+ printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+ targinfosize,
+ IP6T_ALIGN(sizeof(struct ipt_NFQ_info)));
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ip6t_target ipt_NFQ_reg = {
+ .name = "NFQUEUE",
+ .target = target,
+ .checkentry = checkentry,
+ .me = THIS_MODULE,
+};
+
+static int __init init(void)
+{
+ return ip6t_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+ ip6t_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
new file mode 100644
index 0000000..14316c3
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -0,0 +1,284 @@
+/*
+ * IP6 tables REJECT target module
+ * Linux INET6 implementation
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Authors:
+ * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on net/ipv4/netfilter/ipt_REJECT.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netdevice.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/flow.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_REJECT.h>
+
+MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
+MODULE_DESCRIPTION("IP6 tables REJECT target module");
+MODULE_LICENSE("GPL");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Send RST reply */
+static void send_reset(struct sk_buff *oldskb)
+{
+ struct sk_buff *nskb;
+ struct tcphdr otcph, *tcph;
+ unsigned int otcplen, hh_len;
+ int tcphoff, needs_ack;
+ struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h;
+ struct dst_entry *dst = NULL;
+ u8 proto;
+ struct flowi fl;
+
+ if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+ (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+ DEBUGP("ip6t_REJECT: addr is not unicast.\n");
+ return;
+ }
+
+ proto = oip6h->nexthdr;
+ tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
+
+ if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
+ DEBUGP("ip6t_REJECT: Can't get TCP header.\n");
+ return;
+ }
+
+ otcplen = oldskb->len - tcphoff;
+
+ /* IP header checks: fragment, too short. */
+ if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) {
+ DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n",
+ proto, otcplen);
+ return;
+ }
+
+ if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
+ BUG();
+
+ /* No RST for RST. */
+ if (otcph.rst) {
+ DEBUGP("ip6t_REJECT: RST is set\n");
+ return;
+ }
+
+ /* Check checksum. */
+ if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
+ skb_checksum(oldskb, tcphoff, otcplen, 0))) {
+ DEBUGP("ip6t_REJECT: TCP checksum is invalid\n");
+ return;
+ }
+
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
+ ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
+ fl.fl_ip_sport = otcph.dest;
+ fl.fl_ip_dport = otcph.source;
+ dst = ip6_route_output(NULL, &fl);
+ if (dst == NULL)
+ return;
+ if (dst->error ||
+ xfrm_lookup(&dst, &fl, NULL, 0)) {
+ dst_release(dst);
+ return;
+ }
+
+ hh_len = (dst->dev->hard_header_len + 15)&~15;
+ nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+ + sizeof(struct tcphdr) + dst->trailer_len,
+ GFP_ATOMIC);
+
+ if (!nskb) {
+ if (net_ratelimit())
+ printk("ip6t_REJECT: Can't alloc skb\n");
+ dst_release(dst);
+ return;
+ }
+
+ nskb->dst = dst;
+
+ skb_reserve(nskb, hh_len + dst->header_len);
+
+ ip6h = nskb->nh.ipv6h = (struct ipv6hdr *)
+ skb_put(nskb, sizeof(struct ipv6hdr));
+ ip6h->version = 6;
+ ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT);
+ ip6h->nexthdr = IPPROTO_TCP;
+ ip6h->payload_len = htons(sizeof(struct tcphdr));
+ ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr);
+ ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr);
+
+ tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+ /* Truncate to length (no data) */
+ tcph->doff = sizeof(struct tcphdr)/4;
+ tcph->source = otcph.dest;
+ tcph->dest = otcph.source;
+
+ if (otcph.ack) {
+ needs_ack = 0;
+ tcph->seq = otcph.ack_seq;
+ tcph->ack_seq = 0;
+ } else {
+ needs_ack = 1;
+ tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
+ + otcplen - (otcph.doff<<2));
+ tcph->seq = 0;
+ }
+
+ /* Reset flags */
+ ((u_int8_t *)tcph)[13] = 0;
+ tcph->rst = 1;
+ tcph->ack = needs_ack;
+ tcph->window = 0;
+ tcph->urg_ptr = 0;
+ tcph->check = 0;
+
+ /* Adjust TCP checksum */
+ tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr,
+ &nskb->nh.ipv6h->daddr,
+ sizeof(struct tcphdr), IPPROTO_TCP,
+ csum_partial((char *)tcph,
+ sizeof(struct tcphdr), 0));
+
+ NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
+ dst_output);
+}
+
+static inline void
+send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum)
+{
+ if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL)
+ skb_in->dev = &loopback_dev;
+
+ icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL);
+}
+
+static unsigned int reject6_target(struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ unsigned int hooknum,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ip6t_reject_info *reject = targinfo;
+
+ DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__);
+ /* WARNING: This code causes reentry within ip6tables.
+ This means that the ip6tables jump stack is now crap. We
+ must return an absolute verdict. --RR */
+ switch (reject->with) {
+ case IP6T_ICMP6_NO_ROUTE:
+ send_unreach(*pskb, ICMPV6_NOROUTE, hooknum);
+ break;
+ case IP6T_ICMP6_ADM_PROHIBITED:
+ send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum);
+ break;
+ case IP6T_ICMP6_NOT_NEIGHBOUR:
+ send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum);
+ break;
+ case IP6T_ICMP6_ADDR_UNREACH:
+ send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum);
+ break;
+ case IP6T_ICMP6_PORT_UNREACH:
+ send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum);
+ break;
+ case IP6T_ICMP6_ECHOREPLY:
+ /* Do nothing */
+ break;
+ case IP6T_TCP_RESET:
+ send_reset(*pskb);
+ break;
+ default:
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with);
+ break;
+ }
+
+ return NF_DROP;
+}
+
+static int check(const char *tablename,
+ const struct ip6t_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ip6t_reject_info *rejinfo = targinfo;
+
+ if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) {
+ DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize);
+ return 0;
+ }
+
+ /* Only allow these for packet filtering. */
+ if (strcmp(tablename, "filter") != 0) {
+ DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename);
+ return 0;
+ }
+
+ if ((hook_mask & ~((1 << NF_IP6_LOCAL_IN)
+ | (1 << NF_IP6_FORWARD)
+ | (1 << NF_IP6_LOCAL_OUT))) != 0) {
+ DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask);
+ return 0;
+ }
+
+ if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
+ printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
+ return 0;
+ } else if (rejinfo->with == IP6T_TCP_RESET) {
+ /* Must specify that it's a TCP packet */
+ if (e->ipv6.proto != IPPROTO_TCP
+ || (e->ipv6.invflags & IP6T_INV_PROTO)) {
+ DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static struct ip6t_target ip6t_reject_reg = {
+ .name = "REJECT",
+ .target = reject6_target,
+ .checkentry = check,
+ .me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+ if (ip6t_register_target(&ip6t_reject_reg))
+ return -EINVAL;
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ ip6t_unregister_target(&ip6t_reject_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index ab0e32d..9b91dec 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -20,71 +20,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
MODULE_DESCRIPTION("IP6 tables owner matching module");
MODULE_LICENSE("GPL");
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
- struct task_struct *p;
- struct files_struct *files;
- int i;
-
- read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
- if (!p)
- goto out;
- task_lock(p);
- files = p->files;
- if(files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == skb->sk->sk_socket->file) {
- spin_unlock(&files->file_lock);
- task_unlock(p);
- read_unlock(&tasklist_lock);
- return 1;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
-out:
- read_unlock(&tasklist_lock);
- return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
- struct task_struct *g, *p;
- struct file *file = skb->sk->sk_socket->file;
- int i, found=0;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- struct files_struct *files;
- if (p->signal->session != sid)
- continue;
-
- task_lock(p);
- files = p->files;
- if (files) {
- spin_lock(&files->file_lock);
- for (i=0; i < files->max_fds; i++) {
- if (fcheck_files(files, i) == file) {
- found = 1;
- break;
- }
- }
- spin_unlock(&files->file_lock);
- }
- task_unlock(p);
- if (found)
- goto out;
- } while_each_thread(g, p);
-out:
- read_unlock(&tasklist_lock);
-
- return found;
-}
static int
match(const struct sk_buff *skb,
@@ -112,18 +47,6 @@ match(const struct sk_buff *skb,
return 0;
}
- if(info->match & IP6T_OWNER_PID) {
- if (!match_pid(skb, info->pid) ^
- !!(info->invert & IP6T_OWNER_PID))
- return 0;
- }
-
- if(info->match & IP6T_OWNER_SID) {
- if (!match_sid(skb, info->sid) ^
- !!(info->invert & IP6T_OWNER_SID))
- return 0;
- }
-
return 1;
}
@@ -134,6 +57,8 @@ checkentry(const char *tablename,
unsigned int matchsize,
unsigned int hook_mask)
{
+ const struct ip6t_owner_info *info = matchinfo;
+
if (hook_mask
& ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) {
printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -142,14 +67,13 @@ checkentry(const char *tablename,
if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info)))
return 0;
-#ifdef CONFIG_SMP
- /* files->file_lock can not be used in a BH */
- if (((struct ip6t_owner_info *)matchinfo)->match
- & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
- printk("ip6t_owner: pid and sid matching is broken on SMP.\n");
+
+ if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
+ printk("ipt_owner: pid and sid matching "
+ "not supported anymore\n");
return 0;
}
-#endif
+
return 1;
}
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 71407be..c2982ef 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = {
.hook = ip6t_hook,
.pf = PF_INET6,
.hooknum = NF_IP6_PRE_ROUTING,
- .priority = NF_IP6_PRI_FIRST
+ .priority = NF_IP6_PRI_FIRST,
+ .owner = THIS_MODULE,
},
{
.hook = ip6t_hook,
.pf = PF_INET6,
.hooknum = NF_IP6_LOCAL_OUT,
- .priority = NF_IP6_PRI_FIRST
+ .priority = NF_IP6_PRI_FIRST,
+ .owner = THIS_MODULE,
},
};
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 617645b..ed3a76b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -49,6 +49,7 @@
#include <net/transp_v6.h>
#include <net/udp.h>
#include <net/inet_common.h>
+#include <net/tcp_states.h>
#include <net/rawv6.h>
#include <net/xfrm.h>
@@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *sk)
/* Grumble... icmp and ip_input want to get at this... */
struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
- struct in6_addr *loc_addr, struct in6_addr *rmt_addr)
+ struct in6_addr *loc_addr, struct in6_addr *rmt_addr,
+ int dif)
{
struct hlist_node *node;
int is_multicast = ipv6_addr_is_multicast(loc_addr);
@@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
!ipv6_addr_equal(&np->daddr, rmt_addr))
continue;
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
if (!ipv6_addr_any(&np->rcv_saddr)) {
if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
goto found;
@@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
*
* Caller owns SKB so we must make clones.
*/
-void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
struct in6_addr *saddr;
struct in6_addr *daddr;
struct sock *sk;
+ int delivered = 0;
__u8 hash;
saddr = &skb->nh.ipv6h->saddr;
@@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
if (sk == NULL)
goto out;
- sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr);
+ sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
while (sk) {
+ delivered = 1;
if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
@@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
if (clone)
rawv6_rcv(sk, clone);
}
- sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr);
+ sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
+ IP6CB(skb)->iif);
}
out:
read_unlock(&raw_v6_lock);
+ return delivered;
}
/* This cleans up af_inet6 a bit. -DaveM */
@@ -328,12 +337,13 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
if (skb->ip_summed == CHECKSUM_HW) {
+ skb_postpull_rcsum(skb, skb->nh.raw,
+ skb->h.raw - skb->nh.raw);
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
&skb->nh.ipv6h->daddr,
skb->len, inet->num, skb->csum)) {
- LIMIT_NETDEBUG(
- printk(KERN_DEBUG "raw v6 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
}
@@ -434,12 +444,12 @@ csum_copy_err:
/* Clear queue. */
if (flags&MSG_PEEK) {
int clear = 0;
- spin_lock_irq(&sk->sk_receive_queue.lock);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
if (skb == skb_peek(&sk->sk_receive_queue)) {
__skb_unlink(skb, &sk->sk_receive_queue);
clear = 1;
}
- spin_unlock_irq(&sk->sk_receive_queue.lock);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
if (clear)
kfree_skb(skb);
}
@@ -971,11 +981,11 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
struct sk_buff *skb;
int amount = 0;
- spin_lock_irq(&sk->sk_receive_queue.lock);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb != NULL)
amount = skb->tail - skb->h.raw;
- spin_unlock_irq(&sk->sk_receive_queue.lock);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 59e7c63..9d9e043 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -562,7 +562,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
if (skb->dev)
fq->iif = skb->dev->ifindex;
skb->dev = NULL;
- fq->stamp = skb->stamp;
+ skb_get_timestamp(skb, &fq->stamp);
fq->meat += skb->len;
atomic_add(skb->truesize, &ip6_frag_mem);
@@ -664,7 +664,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
head->next = NULL;
head->dev = dev;
- head->stamp = fq->stamp;
+ skb_set_timestamp(head, &fq->stamp);
head->nh.ipv6h->payload_len = htons(payload_len);
*skb_in = head;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3bf8a02..5d5bbb4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
be destroyed.
*/
-int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
+ void *_rtattr, struct netlink_skb_parms *req)
{
int err;
write_lock_bh(&rt6_lock);
- err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr);
+ err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
write_unlock_bh(&rt6_lock);
return err;
@@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
*/
static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
- struct in6_addr *saddr)
+ struct in6_addr *saddr, struct netlink_skb_parms *req)
{
int err;
struct rt6_info *rt;
@@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
dst_hold(&rt->u.dst);
- err = ip6_ins_rt(rt, NULL, NULL);
+ err = ip6_ins_rt(rt, NULL, NULL, req);
if (err == 0)
return rt;
@@ -491,7 +492,8 @@ restart:
read_unlock_bh(&rt6_lock);
nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
- &skb->nh.ipv6h->saddr);
+ &skb->nh.ipv6h->saddr,
+ &NETLINK_CB(skb));
dst_release(&rt->u.dst);
rt = nrt;
@@ -551,7 +553,7 @@ restart:
dst_hold(&rt->u.dst);
read_unlock_bh(&rt6_lock);
- nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src);
+ nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL);
dst_release(&rt->u.dst);
rt = nrt;
@@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
if (rt) {
if (rt->rt6i_flags & RTF_CACHE)
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
else
dst_release(dst);
}
@@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
*
*/
-int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
+ void *_rtattr, struct netlink_skb_parms *req)
{
int err;
struct rtmsg *r;
@@ -974,7 +977,7 @@ install_route:
rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
rt->u.dst.dev = dev;
rt->rt6i_idev = idev;
- return ip6_ins_rt(rt, nlh, _rtattr);
+ return ip6_ins_rt(rt, nlh, _rtattr, req);
out:
if (dev)
@@ -986,7 +989,7 @@ out:
return err;
}
-int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
{
int err;
@@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
rt6_reset_dflt_pointer(NULL);
- err = fib6_del(rt, nlh, _rtattr);
+ err = fib6_del(rt, nlh, _rtattr, req);
dst_release(&rt->u.dst);
write_unlock_bh(&rt6_lock);
@@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
return err;
}
-static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
{
struct fib6_node *fn;
struct rt6_info *rt;
@@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
dst_hold(&rt->u.dst);
read_unlock_bh(&rt6_lock);
- return ip6_del_rt(rt, nlh, _rtattr);
+ return ip6_del_rt(rt, nlh, _rtattr, req);
}
}
read_unlock_bh(&rt6_lock);
@@ -1136,11 +1139,11 @@ source_ok:
nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
- if (ip6_ins_rt(nrt, NULL, NULL))
+ if (ip6_ins_rt(nrt, NULL, NULL, NULL))
goto out;
if (rt->rt6i_flags&RTF_CACHE) {
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
return;
}
@@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
2. It is gatewayed route or NONEXTHOP route. Action: clone it.
*/
if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
- nrt = rt6_cow(rt, daddr, saddr);
+ nrt = rt6_cow(rt, daddr, saddr, NULL);
if (!nrt->u.dst.error) {
nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
if (allfrag)
@@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
if (allfrag)
nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
- ip6_ins_rt(nrt, NULL, NULL);
+ ip6_ins_rt(nrt, NULL, NULL, NULL);
}
out:
@@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
rtmsg.rtmsg_ifindex = dev->ifindex;
- ip6_route_add(&rtmsg, NULL, NULL);
+ ip6_route_add(&rtmsg, NULL, NULL, NULL);
return rt6_get_dflt_router(gwaddr, dev);
}
@@ -1323,7 +1326,7 @@ restart:
read_unlock_bh(&rt6_lock);
- ip6_del_rt(rt, NULL, NULL);
+ ip6_del_rt(rt, NULL, NULL, NULL);
goto restart;
}
@@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
rtnl_lock();
switch (cmd) {
case SIOCADDRT:
- err = ip6_route_add(&rtmsg, NULL, NULL);
+ err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
break;
case SIOCDELRT:
- err = ip6_route_del(&rtmsg, NULL, NULL);
+ err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
break;
default:
err = -EINVAL;
@@ -1369,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
* Drop the packet on the floor
*/
-int ip6_pkt_discard(struct sk_buff *skb)
+static int ip6_pkt_discard(struct sk_buff *skb)
{
IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
@@ -1377,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb)
return 0;
}
-int ip6_pkt_discard_out(struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct sk_buff *skb)
{
skb->dev = skb->dst->dev;
return ip6_pkt_discard(skb);
@@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
return -EINVAL;
- return ip6_route_del(&rtmsg, nlh, arg);
+ return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
}
int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
return -EINVAL;
- return ip6_route_add(&rtmsg, nlh, arg);
+ return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
}
struct rt6_rtnl_dump_arg
@@ -1566,11 +1569,9 @@ struct rt6_rtnl_dump_arg
};
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
- struct in6_addr *dst,
- struct in6_addr *src,
- int iif,
- int type, u32 pid, u32 seq,
- struct nlmsghdr *in_nlh, int prefix)
+ struct in6_addr *dst, struct in6_addr *src,
+ int iif, int type, u32 pid, u32 seq,
+ int prefix, unsigned int flags)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
@@ -1584,11 +1585,7 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
}
}
- if (!pid && in_nlh) {
- pid = in_nlh->nlmsg_pid;
- }
-
- nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
+ nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
rtm = NLMSG_DATA(nlh);
rtm->rtm_family = AF_INET6;
rtm->rtm_dst_len = rt->rt6i_dst.plen;
@@ -1674,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
- NULL, prefix);
+ prefix, NLM_F_MULTI);
}
static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1822,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
&fl.fl6_dst, &fl.fl6_src,
iif,
RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
- nlh->nlmsg_seq, nlh, 0);
+ nlh->nlmsg_seq, 0, 0);
if (err < 0) {
err = -EMSGSIZE;
goto out_free;
@@ -1838,23 +1835,31 @@ out_free:
goto out;
}
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
+ struct netlink_skb_parms *req)
{
struct sk_buff *skb;
int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+ u32 pid = current->pid;
+ u32 seq = 0;
+ if (req)
+ pid = req->pid;
+ if (nlh)
+ seq = nlh->nlmsg_seq;
+
skb = alloc_skb(size, gfp_any());
if (!skb) {
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
return;
}
- if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) {
+ if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
kfree_skb(skb);
- netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
+ netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
return;
}
- NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
- netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
+ NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
+ netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
}
/*
@@ -1955,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
return arg.len;
}
-extern struct rt6_statistics rt6_stats;
-
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b788f55..c3123c9 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -195,7 +195,6 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int
dev_hold(dev);
ipip6_tunnel_link(nt);
- /* Do not decrement MOD_USE_COUNT here. */
return nt;
failed:
@@ -771,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
return 0;
}
-int __init ipip6_fb_tunnel_init(struct net_device *dev)
+static int __init ipip6_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = dev->priv;
struct iphdr *iph = &tunnel->parms.iph;
@@ -794,10 +793,28 @@ static struct net_protocol sit_protocol = {
.err_handler = ipip6_err,
};
+static void __exit sit_destroy_tunnels(void)
+{
+ int prio;
+
+ for (prio = 1; prio < 4; prio++) {
+ int h;
+ for (h = 0; h < HASH_SIZE; h++) {
+ struct ip_tunnel *t;
+ while ((t = tunnels[prio][h]) != NULL)
+ unregister_netdevice(t->dev);
+ }
+ }
+}
+
void __exit sit_cleanup(void)
{
inet_del_protocol(&sit_protocol, IPPROTO_IPV6);
- unregister_netdev(ipip6_fb_tunnel_dev);
+
+ rtnl_lock();
+ sit_destroy_tunnels();
+ unregister_netdevice(ipip6_fb_tunnel_dev);
+ rtnl_unlock();
}
int __init sit_init(void)
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 3a18e0e..8eff9fa 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -14,9 +14,6 @@
#include <net/ipv6.h>
#include <net/addrconf.h>
-extern ctl_table ipv6_route_table[];
-extern ctl_table ipv6_icmp_table[];
-
#ifdef CONFIG_SYSCTL
static ctl_table ipv6_table[] = {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0f69e80..794734f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -47,6 +47,7 @@
#include <net/tcp.h>
#include <net/ndisc.h>
+#include <net/inet6_hashtables.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -65,7 +66,7 @@
#include <linux/seq_file.h>
static void tcp_v6_send_reset(struct sk_buff *skb);
-static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req);
+static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb);
@@ -75,34 +76,11 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
static struct tcp_func ipv6_mapped;
static struct tcp_func ipv6_specific;
-/* I have no idea if this is a good hash for v6 or not. -DaveM */
-static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport,
- struct in6_addr *faddr, u16 fport)
+static inline int tcp_v6_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
{
- int hashent = (lport ^ fport);
-
- hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
- hashent ^= hashent>>16;
- hashent ^= hashent>>8;
- return (hashent & (tcp_ehash_size - 1));
-}
-
-static __inline__ int tcp_v6_sk_hashfn(struct sock *sk)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *laddr = &np->rcv_saddr;
- struct in6_addr *faddr = &np->daddr;
- __u16 lport = inet->num;
- __u16 fport = inet->dport;
- return tcp_v6_hashfn(laddr, lport, faddr, fport);
-}
-
-static inline int tcp_v6_bind_conflict(struct sock *sk,
- struct tcp_bind_bucket *tb)
-{
- struct sock *sk2;
- struct hlist_node *node;
+ const struct sock *sk2;
+ const struct hlist_node *node;
/* We must walk the whole port owner list in this case. -DaveM */
sk_for_each_bound(sk2, node, &tb->owners) {
@@ -126,8 +104,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk,
*/
static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
{
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
struct hlist_node *node;
int ret;
@@ -138,37 +116,42 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
int remaining = (high - low) + 1;
int rover;
- spin_lock(&tcp_portalloc_lock);
- if (tcp_port_rover < low)
+ spin_lock(&tcp_hashinfo.portalloc_lock);
+ if (tcp_hashinfo.port_rover < low)
rover = low;
else
- rover = tcp_port_rover;
+ rover = tcp_hashinfo.port_rover;
do { rover++;
if (rover > high)
rover = low;
- head = &tcp_bhash[tcp_bhashfn(rover)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->port == rover)
goto next;
break;
next:
spin_unlock(&head->lock);
} while (--remaining > 0);
- tcp_port_rover = rover;
- spin_unlock(&tcp_portalloc_lock);
-
- /* Exhausted local port range during search? */
+ tcp_hashinfo.port_rover = rover;
+ spin_unlock(&tcp_hashinfo.portalloc_lock);
+
+ /* Exhausted local port range during search? It is not
+ * possible for us to be holding one of the bind hash
+ * locks if this test triggers, because if 'remaining'
+ * drops to zero, we broke out of the do/while loop at
+ * the top level, not from the 'break;' statement.
+ */
ret = 1;
- if (remaining <= 0)
+ if (unlikely(remaining <= 0))
goto fail;
/* OK, here is the one we will use. */
snum = rover;
} else {
- head = &tcp_bhash[tcp_bhashfn(snum)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
- tb_for_each(tb, node, &head->chain)
+ inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->port == snum)
goto tb_found;
}
@@ -187,8 +170,11 @@ tb_found:
}
tb_not_found:
ret = 1;
- if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
- goto fail_unlock;
+ if (tb == NULL) {
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
+ if (tb == NULL)
+ goto fail_unlock;
+ }
if (hlist_empty(&tb->owners)) {
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
tb->fastreuse = 1;
@@ -199,9 +185,9 @@ tb_not_found:
tb->fastreuse = 0;
success:
- if (!tcp_sk(sk)->bind_hash)
- tcp_bind_hash(sk, tb, snum);
- BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
+ if (!inet_csk(sk)->icsk_bind_hash)
+ inet_bind_hash(sk, tb, snum);
+ BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
ret = 0;
fail_unlock:
@@ -219,13 +205,13 @@ static __inline__ void __tcp_v6_hash(struct sock *sk)
BUG_TRAP(sk_unhashed(sk));
if (sk->sk_state == TCP_LISTEN) {
- list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- lock = &tcp_lhash_lock;
- tcp_listen_wlock();
+ list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
+ lock = &tcp_hashinfo.lhash_lock;
+ inet_listen_wlock(&tcp_hashinfo);
} else {
- sk->sk_hashent = tcp_v6_sk_hashfn(sk);
- list = &tcp_ehash[sk->sk_hashent].chain;
- lock = &tcp_ehash[sk->sk_hashent].lock;
+ sk->sk_hashent = inet6_sk_ehashfn(sk, tcp_hashinfo.ehash_size);
+ list = &tcp_hashinfo.ehash[sk->sk_hashent].chain;
+ lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock;
write_lock(lock);
}
@@ -250,131 +236,11 @@ static void tcp_v6_hash(struct sock *sk)
}
}
-static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif)
-{
- struct sock *sk;
- struct hlist_node *node;
- struct sock *result = NULL;
- int score, hiscore;
-
- hiscore=0;
- read_lock(&tcp_lhash_lock);
- sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) {
- if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
- score = 1;
- if (!ipv6_addr_any(&np->rcv_saddr)) {
- if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
- continue;
- score++;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score++;
- }
- if (score == 3) {
- result = sk;
- break;
- }
- if (score > hiscore) {
- hiscore = score;
- result = sk;
- }
- }
- }
- if (result)
- sock_hold(result);
- read_unlock(&tcp_lhash_lock);
- return result;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * The sockhash lock must be held as a reader here.
- */
-
-static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 hnum,
- int dif)
-{
- struct tcp_ehash_bucket *head;
- struct sock *sk;
- struct hlist_node *node;
- __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
- int hash;
-
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
- head = &tcp_ehash[hash];
- read_lock(&head->lock);
- sk_for_each(sk, node, &head->chain) {
- /* For IPV6 do the cheaper port and family tests first. */
- if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif))
- goto hit; /* You sunk my battleship! */
- }
- /* Must check for a TIME_WAIT'er before going to listener hash. */
- sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
- /* FIXME: acme: check this... */
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
-
- if(*((__u32 *)&(tw->tw_dport)) == ports &&
- sk->sk_family == PF_INET6) {
- if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) &&
- (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
- goto hit;
- }
- }
- read_unlock(&head->lock);
- return NULL;
-
-hit:
- sock_hold(sk);
- read_unlock(&head->lock);
- return sk;
-}
-
-
-static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 hnum,
- int dif)
-{
- struct sock *sk;
-
- sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif);
-
- if (sk)
- return sk;
-
- return tcp_v6_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport,
- int dif)
-{
- struct sock *sk;
-
- local_bh_disable();
- sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif);
- local_bh_enable();
-
- return sk;
-}
-
-EXPORT_SYMBOL_GPL(tcp_v6_lookup);
-
-
/*
* Open request hash tables.
*/
-static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
+static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
{
u32 a, b, c;
@@ -394,24 +260,27 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
return c & (TCP_SYNQ_HSIZE - 1);
}
-static struct open_request *tcp_v6_search_req(struct tcp_sock *tp,
- struct open_request ***prevp,
+static struct request_sock *tcp_v6_search_req(const struct sock *sk,
+ struct request_sock ***prevp,
__u16 rport,
struct in6_addr *raddr,
struct in6_addr *laddr,
int iif)
{
- struct tcp_listen_opt *lopt = tp->listen_opt;
- struct open_request *req, **prev;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ struct request_sock *req, **prev;
for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
(req = *prev) != NULL;
prev = &req->dl_next) {
- if (req->rmt_port == rport &&
- req->class->family == AF_INET6 &&
- ipv6_addr_equal(&req->af.v6_req.rmt_addr, raddr) &&
- ipv6_addr_equal(&req->af.v6_req.loc_addr, laddr) &&
- (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) {
+ const struct tcp6_request_sock *treq = tcp6_rsk(req);
+
+ if (inet_rsk(req)->rmt_port == rport &&
+ req->rsk_ops->family == AF_INET6 &&
+ ipv6_addr_equal(&treq->rmt_addr, raddr) &&
+ ipv6_addr_equal(&treq->loc_addr, laddr) &&
+ (!treq->iif || treq->iif == iif)) {
BUG_TRAP(req->sk == NULL);
*prevp = prev;
return req;
@@ -444,44 +313,48 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
}
}
-static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
- struct tcp_tw_bucket **twp)
+static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
+ struct inet_timewait_sock **twp)
{
struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *daddr = &np->rcv_saddr;
- struct in6_addr *saddr = &np->daddr;
- int dif = sk->sk_bound_dev_if;
- u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
- int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport);
- struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct in6_addr *daddr = &np->rcv_saddr;
+ const struct in6_addr *saddr = &np->daddr;
+ const int dif = sk->sk_bound_dev_if;
+ const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport,
+ tcp_hashinfo.ehash_size);
+ struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
struct sock *sk2;
- struct hlist_node *node;
- struct tcp_tw_bucket *tw;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
- sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
- tw = (struct tcp_tw_bucket*)sk2;
+ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+ const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
+
+ tw = inet_twsk(sk2);
if(*((__u32 *)&(tw->tw_dport)) == ports &&
sk2->sk_family == PF_INET6 &&
- ipv6_addr_equal(&tw->tw_v6_daddr, saddr) &&
- ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) &&
+ ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) &&
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
struct tcp_sock *tp = tcp_sk(sk);
- if (tw->tw_ts_recent_stamp &&
- (!twp || (sysctl_tcp_tw_reuse &&
- xtime.tv_sec -
- tw->tw_ts_recent_stamp > 1))) {
+ if (tcptw->tw_ts_recent_stamp &&
+ (!twp ||
+ (sysctl_tcp_tw_reuse &&
+ xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
/* See comment in tcp_ipv4.c */
- tp->write_seq = tw->tw_snd_nxt + 65535 + 2;
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
if (!tp->write_seq)
tp->write_seq = 1;
- tp->rx_opt.ts_recent = tw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
sock_hold(sk2);
goto unique;
} else
@@ -492,7 +365,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
/* And established part... */
sk_for_each(sk2, node, &head->chain) {
- if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif))
+ if (INET6_MATCH(sk2, saddr, daddr, ports, dif))
goto not_unique;
}
@@ -508,10 +381,10 @@ unique:
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
- tcp_tw_deschedule(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
return 0;
@@ -533,8 +406,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk)
static int tcp_v6_hash_connect(struct sock *sk)
{
unsigned short snum = inet_sk(sk)->num;
- struct tcp_bind_hashbucket *head;
- struct tcp_bind_bucket *tb;
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
int ret;
if (!snum) {
@@ -546,19 +419,19 @@ static int tcp_v6_hash_connect(struct sock *sk)
static u32 hint;
u32 offset = hint + tcpv6_port_offset(sk);
struct hlist_node *node;
- struct tcp_tw_bucket *tw = NULL;
+ struct inet_timewait_sock *tw = NULL;
local_bh_disable();
for (i = 1; i <= range; i++) {
port = low + (i + offset) % range;
- head = &tcp_bhash[tcp_bhashfn(port)];
+ head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
- tb_for_each(tb, node, &head->chain) {
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->port == port) {
BUG_TRAP(!hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
@@ -571,7 +444,7 @@ static int tcp_v6_hash_connect(struct sock *sk)
}
}
- tb = tcp_bucket_create(head, port);
+ tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
@@ -590,7 +463,7 @@ ok:
hint += i;
/* Head lock still held and bh's disabled */
- tcp_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
__tcp_v6_hash(sk);
@@ -598,16 +471,16 @@ ok:
spin_unlock(&head->lock);
if (tw) {
- tcp_tw_deschedule(tw);
- tcp_tw_put(tw);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
}
ret = 0;
goto out;
}
- head = &tcp_bhash[tcp_bhashfn(snum)];
- tb = tcp_sk(sk)->bind_hash;
+ head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
@@ -624,11 +497,6 @@ out:
}
}
-static __inline__ int tcp_v6_iif(struct sk_buff *skb)
-{
- return IP6CB(skb)->iif;
-}
-
static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
@@ -820,14 +688,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
int type, int code, int offset, __u32 info)
{
struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
- struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+ const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
struct ipv6_pinfo *np;
struct sock *sk;
int err;
struct tcp_sock *tp;
__u32 seq;
- sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex);
+ sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr,
+ th->source, skb->dev->ifindex);
if (sk == NULL) {
ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
@@ -835,7 +704,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
if (sk->sk_state == TCP_TIME_WAIT) {
- tcp_tw_put((struct tcp_tw_bucket*)sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
return;
}
@@ -906,15 +775,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
icmpv6_err_convert(type, code, &err);
- /* Might be for an open_request */
+ /* Might be for an request_sock */
switch (sk->sk_state) {
- struct open_request *req, **prev;
+ struct request_sock *req, **prev;
case TCP_LISTEN:
if (sock_owned_by_user(sk))
goto out;
- req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr,
- &hdr->saddr, tcp_v6_iif(skb));
+ req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
+ &hdr->saddr, inet6_iif(skb));
if (!req)
goto out;
@@ -923,12 +792,12 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
*/
BUG_TRAP(req->sk == NULL);
- if (seq != req->snt_isn) {
+ if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
- tcp_synq_drop(sk, req, prev);
+ inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
case TCP_SYN_SENT:
@@ -957,9 +826,10 @@ out:
}
-static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
+static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
struct dst_entry *dst)
{
+ struct tcp6_request_sock *treq = tcp6_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff * skb;
struct ipv6_txoptions *opt = NULL;
@@ -969,19 +839,19 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
memset(&fl, 0, sizeof(fl));
fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+ ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
+ ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
fl.fl6_flowlabel = 0;
- fl.oif = req->af.v6_req.iif;
- fl.fl_ip_dport = req->rmt_port;
+ fl.oif = treq->iif;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
fl.fl_ip_sport = inet_sk(sk)->sport;
if (dst == NULL) {
opt = np->opt;
if (opt == NULL &&
np->rxopt.bits.srcrt == 2 &&
- req->af.v6_req.pktopts) {
- struct sk_buff *pktopts = req->af.v6_req.pktopts;
+ treq->pktopts) {
+ struct sk_buff *pktopts = treq->pktopts;
struct inet6_skb_parm *rxopt = IP6CB(pktopts);
if (rxopt->srcrt)
opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
@@ -1008,10 +878,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
struct tcphdr *th = skb->h.th;
th->check = tcp_v6_check(th, skb->len,
- &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
+ &treq->loc_addr, &treq->rmt_addr,
csum_partial((char *)th, skb->len, skb->csum));
- ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+ ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
err = ip6_xmit(sk, skb, &fl, opt, 0);
if (err == NET_XMIT_CN)
err = 0;
@@ -1024,17 +894,18 @@ done:
return err;
}
-static void tcp_v6_or_free(struct open_request *req)
+static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
- if (req->af.v6_req.pktopts)
- kfree_skb(req->af.v6_req.pktopts);
+ if (tcp6_rsk(req)->pktopts)
+ kfree_skb(tcp6_rsk(req)->pktopts);
}
-static struct or_calltable or_ipv6 = {
+static struct request_sock_ops tcp6_request_sock_ops = {
.family = AF_INET6,
+ .obj_size = sizeof(struct tcp6_request_sock),
.rtx_syn_ack = tcp_v6_send_synack,
- .send_ack = tcp_v6_or_send_ack,
- .destructor = tcp_v6_or_free,
+ .send_ack = tcp_v6_reqsk_send_ack,
+ .destructor = tcp_v6_reqsk_destructor,
.send_reset = tcp_v6_send_reset
};
@@ -1123,7 +994,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb)
buff->csum);
fl.proto = IPPROTO_TCP;
- fl.oif = tcp_v6_iif(skb);
+ fl.oif = inet6_iif(skb);
fl.fl_ip_dport = t1->dest;
fl.fl_ip_sport = t1->source;
@@ -1192,7 +1063,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
buff->csum);
fl.proto = IPPROTO_TCP;
- fl.oif = tcp_v6_iif(skb);
+ fl.oif = inet6_iif(skb);
fl.fl_ip_dport = t1->dest;
fl.fl_ip_sport = t1->source;
@@ -1211,45 +1082,44 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
- tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+ tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcptw->tw_ts_recent);
- tcp_tw_put(tw);
+ inet_twsk_put(tw);
}
-static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req)
+static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
- tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
+ tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent);
}
static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
{
- struct open_request *req, **prev;
- struct tcphdr *th = skb->h.th;
- struct tcp_sock *tp = tcp_sk(sk);
+ struct request_sock *req, **prev;
+ const struct tcphdr *th = skb->h.th;
struct sock *nsk;
/* Find possible connection requests. */
- req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, tcp_v6_iif(skb));
+ req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr, inet6_iif(skb));
if (req)
return tcp_check_req(sk, skb, req, prev);
- nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr,
- th->source,
- &skb->nh.ipv6h->daddr,
- ntohs(th->dest),
- tcp_v6_iif(skb));
+ nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr,
+ th->source, &skb->nh.ipv6h->daddr,
+ ntohs(th->dest), inet6_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- tcp_tw_put((struct tcp_tw_bucket*)nsk);
+ inet_twsk_put((struct inet_timewait_sock *)nsk);
return NULL;
}
@@ -1260,22 +1130,14 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
return sk;
}
-static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
+static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_listen_opt *lopt = tp->listen_opt;
- u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
-
- req->sk = NULL;
- req->expires = jiffies + TCP_TIMEOUT_INIT;
- req->retrans = 0;
- req->dl_next = lopt->syn_table[h];
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+ const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
- write_lock(&tp->syn_wait_lock);
- lopt->syn_table[h] = req;
- write_unlock(&tp->syn_wait_lock);
-
- tcp_synq_added(sk);
+ reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
+ inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
}
@@ -1284,10 +1146,11 @@ static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
*/
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
+ struct tcp6_request_sock *treq;
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
- struct open_request *req = NULL;
+ struct request_sock *req = NULL;
__u32 isn = TCP_SKB_CB(skb)->when;
if (skb->protocol == htons(ETH_P_IP))
@@ -1299,16 +1162,16 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
/*
* There are no SYN attacks on IPv6, yet...
*/
- if (tcp_synq_is_full(sk) && !isn) {
+ if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
if (net_ratelimit())
printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n");
goto drop;
}
- if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = tcp_openreq_alloc();
+ req = reqsk_alloc(&tcp6_request_sock_ops);
if (req == NULL)
goto drop;
@@ -1321,28 +1184,28 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
- req->class = &or_ipv6;
- ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
- ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
+ treq = tcp6_rsk(req);
+ ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
+ ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
TCP_ECN_create_request(req, skb->h.th);
- req->af.v6_req.pktopts = NULL;
+ treq->pktopts = NULL;
if (ipv6_opt_accepted(sk, skb) ||
np->rxopt.bits.rxinfo ||
np->rxopt.bits.rxhlim) {
atomic_inc(&skb->users);
- req->af.v6_req.pktopts = skb;
+ treq->pktopts = skb;
}
- req->af.v6_req.iif = sk->sk_bound_dev_if;
+ treq->iif = sk->sk_bound_dev_if;
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&req->af.v6_req.rmt_addr) & IPV6_ADDR_LINKLOCAL)
- req->af.v6_req.iif = tcp_v6_iif(skb);
+ ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ treq->iif = inet6_iif(skb);
if (isn == 0)
isn = tcp_v6_init_sequence(sk,skb);
- req->snt_isn = isn;
+ tcp_rsk(req)->snt_isn = isn;
if (tcp_v6_send_synack(sk, req, NULL))
goto drop;
@@ -1353,16 +1216,17 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
drop:
if (req)
- tcp_openreq_free(req);
+ reqsk_free(req);
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
return 0; /* don't send reset */
}
static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
- struct open_request *req,
+ struct request_sock *req,
struct dst_entry *dst)
{
+ struct tcp6_request_sock *treq = tcp6_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct tcp6_sock *newtcp6sk;
struct inet_sock *newinet;
@@ -1401,15 +1265,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
- newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
- /* Charge newly allocated IPv6 socket. Though it is mapped,
- * it is IPv6 yet.
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+ * here, tcp_create_openreq_child now does this for us, see the comment in
+ * that function for the gory details. -acme
*/
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
-#endif
/* It is tricky place. Until this moment IPv4 tcp
worked with IPv6 af_tcp.af_specific.
@@ -1426,10 +1289,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
goto out_overflow;
if (np->rxopt.bits.srcrt == 2 &&
- opt == NULL && req->af.v6_req.pktopts) {
- struct inet6_skb_parm *rxopt = IP6CB(req->af.v6_req.pktopts);
+ opt == NULL && treq->pktopts) {
+ struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt));
+ opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr *)(treq->pktopts->nh.raw + rxopt->srcrt));
}
if (dst == NULL) {
@@ -1438,16 +1301,16 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
memset(&fl, 0, sizeof(fl));
fl.proto = IPPROTO_TCP;
- ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+ ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
if (opt && opt->srcrt) {
struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
ipv6_addr_copy(&final, &fl.fl6_dst);
ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
final_p = &final;
}
- ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+ ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = req->rmt_port;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
fl.fl_ip_sport = inet_sk(sk)->sport;
if (ip6_dst_lookup(sk, &dst, &fl))
@@ -1464,10 +1327,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newsk == NULL)
goto out;
- /* Charge newly allocated IPv6 socket */
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
-#endif
+ /*
+ * No need to charge this sock to the relevant IPv6 refcnt debug socks
+ * count here, tcp_create_openreq_child now does this for us, see the
+ * comment in that function for the gory details. -acme
+ */
ip6_dst_store(newsk, dst, NULL);
newsk->sk_route_caps = dst->dev->features &
@@ -1482,10 +1346,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_copy(&newnp->daddr, &req->af.v6_req.rmt_addr);
- ipv6_addr_copy(&newnp->saddr, &req->af.v6_req.loc_addr);
- ipv6_addr_copy(&newnp->rcv_saddr, &req->af.v6_req.loc_addr);
- newsk->sk_bound_dev_if = req->af.v6_req.iif;
+ ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr);
+ ipv6_addr_copy(&newnp->saddr, &treq->loc_addr);
+ ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr);
+ newsk->sk_bound_dev_if = treq->iif;
/* Now IPv6 options...
@@ -1498,16 +1362,15 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
/* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (req->af.v6_req.pktopts) {
- newnp->pktoptions = skb_clone(req->af.v6_req.pktopts,
- GFP_ATOMIC);
- kfree_skb(req->af.v6_req.pktopts);
- req->af.v6_req.pktopts = NULL;
+ if (treq->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
+ kfree_skb(treq->pktopts);
+ treq->pktopts = NULL;
if (newnp->pktoptions)
skb_set_owner_r(newnp->pktoptions, newsk);
}
newnp->opt = NULL;
- newnp->mcast_oif = tcp_v6_iif(skb);
+ newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
/* Clone native IPv6 options from listening socket (if any)
@@ -1534,7 +1397,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
__tcp_v6_hash(newsk);
- tcp_inherit_port(sk, newsk);
+ inet_inherit_port(&tcp_hashinfo, sk, newsk);
return newsk;
@@ -1555,7 +1418,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb)
if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
&skb->nh.ipv6h->daddr,skb->csum))
return 0;
- LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n");
}
if (skb->len <= 76) {
if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
@@ -1682,7 +1545,7 @@ ipv6_pktoptions:
if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
if (np->rxopt.bits.rxinfo)
- np->mcast_oif = tcp_v6_iif(opt_skb);
+ np->mcast_oif = inet6_iif(opt_skb);
if (np->rxopt.bits.rxhlim)
np->mcast_hops = opt_skb->nh.ipv6h->hop_limit;
if (ipv6_opt_accepted(sk, opt_skb)) {
@@ -1737,8 +1600,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h);
TCP_SKB_CB(skb)->sacked = 0;
- sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source,
- &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
+ sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source,
+ &skb->nh.ipv6h->daddr, ntohs(th->dest),
+ inet6_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1793,26 +1657,29 @@ discard_and_relse:
do_time_wait:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
goto discard_it;
}
if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
TCP_INC_STATS_BH(TCP_MIB_INERRS);
- tcp_tw_put((struct tcp_tw_bucket *) sk);
+ inet_twsk_put((struct inet_timewait_sock *)sk);
goto discard_it;
}
- switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, skb->len)) {
+ switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+ skb, th)) {
case TCP_TW_SYN:
{
struct sock *sk2;
- sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
+ sk2 = inet6_lookup_listener(&tcp_hashinfo,
+ &skb->nh.ipv6h->daddr,
+ ntohs(th->dest), inet6_iif(skb));
if (sk2 != NULL) {
- tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
- tcp_tw_put((struct tcp_tw_bucket *)sk);
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ inet_twsk_deschedule(tw, &tcp_death_row);
+ inet_twsk_put(tw);
sk = sk2;
goto process;
}
@@ -1981,7 +1848,7 @@ static struct tcp_func ipv6_specific = {
static struct tcp_func ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
- .rebuild_header = tcp_v4_rebuild_header,
+ .rebuild_header = inet_sk_rebuild_header,
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.remember_stamp = tcp_v4_remember_stamp,
@@ -2000,13 +1867,14 @@ static struct tcp_func ipv6_mapped = {
*/
static int tcp_v6_init_sock(struct sock *sk)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
- tp->rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* So many TCP implementations out there (incorrectly) count the
@@ -2021,14 +1889,14 @@ static int tcp_v6_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache_std = tp->mss_cache = 536;
+ tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
sk->sk_state = TCP_CLOSE;
tp->af_specific = &ipv6_specific;
-
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
@@ -2042,15 +1910,13 @@ static int tcp_v6_init_sock(struct sock *sk)
static int tcp_v6_destroy_sock(struct sock *sk)
{
- extern int tcp_v4_destroy_sock(struct sock *sk);
-
tcp_v4_destroy_sock(sk);
return inet6_destroy_sock(sk);
}
/* Proc filesystem TCPv6 sock list dumping. */
static void get_openreq6(struct seq_file *seq,
- struct sock *sk, struct open_request *req, int i, int uid)
+ struct sock *sk, struct request_sock *req, int i, int uid)
{
struct in6_addr *dest, *src;
int ttd = req->expires - jiffies;
@@ -2058,8 +1924,8 @@ static void get_openreq6(struct seq_file *seq,
if (ttd < 0)
ttd = 0;
- src = &req->af.v6_req.loc_addr;
- dest = &req->af.v6_req.rmt_addr;
+ src = &tcp6_rsk(req)->loc_addr;
+ dest = &tcp6_rsk(req)->rmt_addr;
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -2069,7 +1935,7 @@ static void get_openreq6(struct seq_file *seq,
ntohs(inet_sk(sk)->sport),
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3],
- ntohs(req->rmt_port),
+ ntohs(inet_rsk(req)->rmt_port),
TCP_SYN_RECV,
0,0, /* could print option size, but that is af dependent. */
1, /* timers active (only the expire timer) */
@@ -2089,18 +1955,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
unsigned long timer_expires;
struct inet_sock *inet = inet_sk(sp);
struct tcp_sock *tp = tcp_sk(sp);
+ const struct inet_connection_sock *icsk = inet_csk(sp);
struct ipv6_pinfo *np = inet6_sk(sp);
dest = &np->daddr;
src = &np->rcv_saddr;
destp = ntohs(inet->dport);
srcp = ntohs(inet->sport);
- if (tp->pending == TCP_TIME_RETRANS) {
+
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
timer_active = 1;
- timer_expires = tp->timeout;
- } else if (tp->pending == TCP_TIME_PROBE0) {
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = tp->timeout;
+ timer_expires = icsk->icsk_timeout;
} else if (timer_pending(&sp->sk_timer)) {
timer_active = 2;
timer_expires = sp->sk_timer.expires;
@@ -2121,28 +1989,31 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
timer_active,
jiffies_to_clock_t(timer_expires - jiffies),
- tp->retransmits,
+ icsk->icsk_retransmits,
sock_i_uid(sp),
- tp->probes_out,
+ icsk->icsk_probes_out,
sock_i_ino(sp),
atomic_read(&sp->sk_refcnt), sp,
- tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
+ icsk->icsk_rto,
+ icsk->icsk_ack.ato,
+ (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong,
tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
);
}
static void get_timewait6_sock(struct seq_file *seq,
- struct tcp_tw_bucket *tw, int i)
+ struct inet_timewait_sock *tw, int i)
{
struct in6_addr *dest, *src;
__u16 destp, srcp;
+ struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
int ttd = tw->tw_ttd - jiffies;
if (ttd < 0)
ttd = 0;
- dest = &tw->tw_v6_daddr;
- src = &tw->tw_v6_rcv_saddr;
+ dest = &tcp6tw->tw_v6_daddr;
+ src = &tcp6tw->tw_v6_rcv_saddr;
destp = ntohs(tw->tw_dport);
srcp = ntohs(tw->tw_sport);
@@ -2217,7 +2088,7 @@ struct proto tcpv6_prot = {
.close = tcp_close,
.connect = tcp_v6_connect,
.disconnect = tcp_disconnect,
- .accept = tcp_accept,
+ .accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v6_init_sock,
.destroy = tcp_v6_destroy_sock,
@@ -2234,11 +2105,14 @@ struct proto tcpv6_prot = {
.sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
+ .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
+ .rsk_prot = &tcp6_request_sock_ops,
};
static struct inet6_protocol tcpv6_protocol = {
@@ -2247,8 +2121,6 @@ static struct inet6_protocol tcpv6_protocol = {
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
-extern struct proto_ops inet6_stream_ops;
-
static struct inet_protosw tcpv6_protosw = {
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e251d0b..390d750 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -51,6 +51,7 @@
#include <net/udp.h>
#include <net/raw.h>
#include <net/inet_common.h>
+#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
@@ -58,7 +59,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6);
+DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
/* Grrr, addr_type already calculated by caller, but I don't want
* to add some silly "cookie" argument to this method just for that.
@@ -300,12 +301,12 @@ csum_copy_err:
/* Clear queue. */
if (flags&MSG_PEEK) {
int clear = 0;
- spin_lock_irq(&sk->sk_receive_queue.lock);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
if (skb == skb_peek(&sk->sk_receive_queue)) {
__skb_unlink(skb, &sk->sk_receive_queue);
clear = 1;
}
- spin_unlock_irq(&sk->sk_receive_queue.lock);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
if (clear)
kfree_skb(skb);
}
@@ -477,8 +478,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
/* RFC 2460 section 8.1 says that we SHOULD log
this error. Well, it is reasonable.
*/
- LIMIT_NETDEBUG(
- printk(KERN_INFO "IPv6: udp checksum is 0\n"));
+ LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n");
goto discard;
}
@@ -493,7 +493,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
if (skb->ip_summed==CHECKSUM_HW) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
- LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
skb->ip_summed = CHECKSUM_NONE;
}
}
@@ -825,7 +825,7 @@ back_from_confirm:
/* ... which is an evident application bug. --ANK */
release_sock(sk);
- LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
+ LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
err = -EINVAL;
goto out;
}
@@ -1054,8 +1054,6 @@ struct proto udpv6_prot = {
.obj_size = sizeof(struct udp6_sock),
};
-extern struct proto_ops inet6_dgram_ops;
-
static struct inet_protosw udpv6_protosw = {
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index ffcadd6..fbef782 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi;
#define XFRM6_TUNNEL_SPI_MIN 1
#define XFRM6_TUNNEL_SPI_MAX 0xffffffff
-static kmem_cache_t *xfrm6_tunnel_spi_kmem;
+static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly;
#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
@@ -466,7 +466,7 @@ static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return;
}
-static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args)
+static int xfrm6_tunnel_init_state(struct xfrm_state *x)
{
if (!x->props.mode)
return -EINVAL;
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index a16237c..980a826 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -1,6 +1,39 @@
#
# IPX configuration
#
+config IPX
+ tristate "The IPX protocol"
+ select LLC
+ ---help---
+ This is support for the Novell networking protocol, IPX, commonly
+ used for local networks of Windows machines. You need it if you
+ want to access Novell NetWare file or print servers using the Linux
+ Novell client ncpfs (available from
+ <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
+ within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
+ available from <http://www.tldp.org/docs.html#howto>). In order
+ to do the former, you'll also have to say Y to "NCP file system
+ support", below.
+
+ IPX is similar in scope to IP, while SPX, which runs on top of IPX,
+ is similar to TCP. There is also experimental support for SPX in
+ Linux (see "SPX networking", below).
+
+ To turn your Linux box into a fully featured NetWare file server and
+ IPX router, say Y here and fetch either lwared from
+ <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
+ mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
+ information, read the IPX-HOWTO available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ General information about how to connect Linux, Windows machines and
+ Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+ The IPX driver would enlarge your kernel by about 16 KB. To compile
+ this driver as a module, choose M here: the module will be called ipx.
+ Unless you want to integrate your Linux box with a local Novell
+ network, say N.
+
config IPX_INTERN
bool "IPX: Full internal IPX network"
depends on IPX
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 5a27e5d..34b3bb8 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -44,7 +44,6 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/string.h>
-#include <linux/tcp.h>
#include <linux/types.h>
#include <linux/termios.h>
@@ -52,6 +51,7 @@
#include <net/p8022.h>
#include <net/psnap.h>
#include <net/sock.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
@@ -1627,7 +1627,7 @@ out:
return rc;
}
-static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
/* NULL here for pt means the packet was looped back */
struct ipx_interface *intrfc;
@@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
copied);
if (rc)
goto out_free;
- if (skb->stamp.tv_sec)
- sk->sk_stamp = skb->stamp;
+ if (skb->tstamp.off_sec)
+ skb_get_timestamp(skb, &sk->sk_stamp);
msg->msg_namelen = sizeof(*sipx);
@@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = {
};
extern struct datalink_proto *make_EII_client(void);
-extern struct datalink_proto *make_8023_client(void);
extern void destroy_EII_client(struct datalink_proto *);
-extern void destroy_8023_client(struct datalink_proto *);
static unsigned char ipx_8022_type = 0xE0;
static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
index b676191..1f73d9e 100644
--- a/net/ipx/ipx_proc.c
+++ b/net/ipx/ipx_proc.c
@@ -10,7 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/ipx.h>
static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92c6e8d..6f92f9c 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -56,7 +56,7 @@
#include <asm/uaccess.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/irda/af_irda.h>
diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c
index 343c5d4..ca7d358 100644
--- a/net/irda/irlan/irlan_filter.c
+++ b/net/irda/irlan/irlan_filter.c
@@ -27,6 +27,7 @@
#include <linux/seq_file.h>
#include <net/irda/irlan_common.h>
+#include <net/irda/irlan_filter.h>
/*
* Function irlan_filter_request (self, skb)
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 046ad07..7029618 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self)
IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
/* Don't disconnect until all data frames are successfully sent */
- if (skb_queue_len(&self->txq) > 0) {
+ if (!skb_queue_empty(&self->txq)) {
self->disconnect_pending = TRUE;
-
return;
}
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index 1cd89f5..a505b54 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
* Send out the RR frames faster if our own transmit queue is empty, or
* if the peer is busy. The effect is a much faster conversation
*/
- if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) {
+ if (skb_queue_empty(&self->txq) || self->remote_busy) {
if (self->fast_RR == TRUE) {
/*
* Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
skb_queue_len(&self->txq));
- if (skb_queue_len(&self->txq)) {
+ if (!skb_queue_empty(&self->txq)) {
/* Prevent race conditions with irlap_data_request() */
self->local_busy = TRUE;
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
/* Window has been adjusted for the max packet
* size, so much simpler... - Jean II */
- nextfit = (skb_queue_len(&self->txq) > 0);
+ nextfit = !skb_queue_empty(&self->txq);
#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
/*
* Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
/* Window has been adjusted for the max packet
* size, so much simpler... - Jean II */
- nextfit = (skb_queue_len(&self->txq) > 0);
+ nextfit = !skb_queue_empty(&self->txq);
#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
/*
* Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
irlap_data_indication(self, skb, FALSE);
/* Any pending data requests? */
- if ((skb_queue_len(&self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0))
{
self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
/*
* Any pending data requests?
*/
- if ((skb_queue_len(&self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0) && !self->remote_busy)
{
irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
*/
nr_status = irlap_validate_nr_received(self, info->nr);
if (nr_status == NR_EXPECTED) {
- if ((skb_queue_len( &self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0)) {
self->remote_busy = FALSE;
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 040abe7..3e9a06a 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
return;
}
- /* Unlink tx_skb from list */
- tx_skb->next = tx_skb->prev = NULL;
- tx_skb->list = NULL;
/* Clear old Nr field + poll bit */
tx_skb->data[1] &= 0x0f;
@@ -1018,11 +1015,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
/*
* We can now fill the window with additional data frames
*/
- while (skb_queue_len( &self->txq) > 0) {
+ while (!skb_queue_empty(&self->txq)) {
IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
- if ((skb_queue_len( &self->txq) > 0) &&
- (self->window > 0)) {
+ if (self->window > 0) {
skb = skb_dequeue( &self->txq);
IRDA_ASSERT(skb != NULL, return;);
@@ -1031,8 +1027,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
* bit cleared
*/
if ((self->window > 1) &&
- skb_queue_len(&self->txq) > 0)
- {
+ !skb_queue_empty(&self->txq)) {
irlap_send_data_primary(self, skb);
} else {
irlap_send_data_primary_poll(self, skb);
@@ -1065,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command)
IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
return;
}
- /* Unlink tx_skb from list */
- tx_skb->next = tx_skb->prev = NULL;
- tx_skb->list = NULL;
/* Clear old Nr field + poll bit */
tx_skb->data[1] &= 0x0f;
@@ -1311,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb,
* Jean II
*/
int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype)
+ struct packet_type *ptype, struct net_device *orig_dev)
{
struct irlap_info info;
struct irlap_cb *self;
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 7a4a4d7..c19e9ce 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL;
/* These can be altered by the sysctl interface */
int sysctl_discovery = 0;
int sysctl_discovery_timeout = 3; /* 3 seconds by default */
-EXPORT_SYMBOL(sysctl_discovery_timeout);
int sysctl_discovery_slots = 6; /* 6 slots by default */
int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ;
char sysctl_devname[65];
@@ -67,7 +66,6 @@ const char *irlmp_reasons[] = {
"LM_INIT_DISCONNECT",
"ERROR, NOT USED",
};
-EXPORT_SYMBOL(irlmp_reasons);
/*
* Function irlmp_init (void)
@@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance)
return new;
}
-EXPORT_SYMBOL(irlmp_dup);
/*
* Function irlmp_disconnect_request (handle, userdata)
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 6ffaed4..634901d 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -54,7 +54,7 @@ extern int irsock_init(void);
extern void irsock_cleanup(void);
/* irlap_frame.c */
extern int irlap_driver_rcv(struct sk_buff *, struct net_device *,
- struct packet_type *);
+ struct packet_type *, struct net_device *);
/*
* Module parameters
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 9004f73..b391cb3 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -517,9 +517,6 @@ extern int
irda_irnet_init(void); /* Initialise IrDA part of IrNET */
extern void
irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */
-/* ---------------------------- MODULE ---------------------------- */
-extern int
- irnet_init(void); /* Initialise IrNET module */
/**************************** VARIABLES ****************************/
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index f8f984b..e53bf9e 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void)
/*
* Module main entry point
*/
-int __init
+static int __init
irnet_init(void)
{
int err;
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index b0dd3ea..1ba8c71 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name,
return entry;
}
-EXPORT_SYMBOL(hashbin_find_next);
/*
* Function hashbin_get_first (hashbin)
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index d091ccf..6602d90 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
/*
* Check if there is still data segments in the transmit queue
*/
- if (skb_queue_len(&self->tx_queue) > 0) {
+ if (!skb_queue_empty(&self->tx_queue)) {
if (priority == P_HIGH) {
/*
* No need to send the queued data, if we are
diff --git a/net/irda/qos.c b/net/irda/qos.c
index df732d5..ddfb5c5 100644
--- a/net/irda/qos.c
+++ b/net/irda/qos.c
@@ -37,6 +37,7 @@
#include <net/irda/parameters.h>
#include <net/irda/qos.h>
#include <net/irda/irlap.h>
+#include <net/irda/irlap_frame.h>
/*
* Maximum values of the baud rate we negociate with the other end.
diff --git a/net/key/af_key.c b/net/key/af_key.c
index ce980aa..4879743 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -656,13 +656,18 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
sa->sadb_sa_exttype = SADB_EXT_SA;
sa->sadb_sa_spi = x->id.spi;
sa->sadb_sa_replay = x->props.replay_window;
- sa->sadb_sa_state = SADB_SASTATE_DYING;
- if (x->km.state == XFRM_STATE_VALID && !x->km.dying)
- sa->sadb_sa_state = SADB_SASTATE_MATURE;
- else if (x->km.state == XFRM_STATE_ACQ)
+ switch (x->km.state) {
+ case XFRM_STATE_VALID:
+ sa->sadb_sa_state = x->km.dying ?
+ SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
+ break;
+ case XFRM_STATE_ACQ:
sa->sadb_sa_state = SADB_SASTATE_LARVAL;
- else if (x->km.state == XFRM_STATE_EXPIRED)
+ break;
+ default:
sa->sadb_sa_state = SADB_SASTATE_DEAD;
+ break;
+ }
sa->sadb_sa_auth = 0;
if (x->aalg) {
struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
@@ -685,6 +690,8 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
if (x->props.flags & XFRM_STATE_DECAP_DSCP)
sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
+ if (x->props.flags & XFRM_STATE_NOPMTUDISC)
+ sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
/* hard time */
if (hsc & 2) {
@@ -969,6 +976,8 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
x->props.flags |= XFRM_STATE_NOECN;
if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
x->props.flags |= XFRM_STATE_DECAP_DSCP;
+ if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
+ x->props.flags |= XFRM_STATE_NOPMTUDISC;
lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
if (lifetime != NULL) {
@@ -1091,17 +1100,11 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
}
}
- x->type = xfrm_get_type(proto, x->props.family);
- if (x->type == NULL) {
- err = -ENOPROTOOPT;
- goto out;
- }
- if (x->type->init_state(x, NULL)) {
- err = -EINVAL;
+ err = xfrm_init_state(x);
+ if (err)
goto out;
- }
+
x->km.seq = hdr->sadb_msg_seq;
- x->km.state = XFRM_STATE_VALID;
return x;
out:
@@ -1240,13 +1243,78 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
return 0;
}
+static inline int event2poltype(int event)
+{
+ switch (event) {
+ case XFRM_MSG_DELPOLICY:
+ return SADB_X_SPDDELETE;
+ case XFRM_MSG_NEWPOLICY:
+ return SADB_X_SPDADD;
+ case XFRM_MSG_UPDPOLICY:
+ return SADB_X_SPDUPDATE;
+ case XFRM_MSG_POLEXPIRE:
+ // return SADB_X_SPDEXPIRE;
+ default:
+ printk("pfkey: Unknown policy event %d\n", event);
+ break;
+ }
+
+ return 0;
+}
+
+static inline int event2keytype(int event)
+{
+ switch (event) {
+ case XFRM_MSG_DELSA:
+ return SADB_DELETE;
+ case XFRM_MSG_NEWSA:
+ return SADB_ADD;
+ case XFRM_MSG_UPDSA:
+ return SADB_UPDATE;
+ case XFRM_MSG_EXPIRE:
+ return SADB_EXPIRE;
+ default:
+ printk("pfkey: Unknown SA event %d\n", event);
+ break;
+ }
+
+ return 0;
+}
+
+/* ADD/UPD/DEL */
+static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+ int hsc = 3;
+
+ if (c->event == XFRM_MSG_DELSA)
+ hsc = 0;
+
+ skb = pfkey_xfrm_state2msg(x, 0, hsc);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ hdr = (struct sadb_msg *) skb->data;
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_type = event2keytype(c->event);
+ hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+ hdr->sadb_msg_errno = 0;
+ hdr->sadb_msg_reserved = 0;
+ hdr->sadb_msg_seq = c->seq;
+ hdr->sadb_msg_pid = c->pid;
+
+ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
+
+ return 0;
+}
static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
- struct sk_buff *out_skb;
- struct sadb_msg *out_hdr;
struct xfrm_state *x;
int err;
+ struct km_event c;
xfrm_probe_algs();
@@ -1254,6 +1322,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
if (IS_ERR(x))
return PTR_ERR(x);
+ xfrm_state_hold(x);
if (hdr->sadb_msg_type == SADB_ADD)
err = xfrm_state_add(x);
else
@@ -1262,30 +1331,26 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
if (err < 0) {
x->km.state = XFRM_STATE_DEAD;
xfrm_state_put(x);
- return err;
+ goto out;
}
- out_skb = pfkey_xfrm_state2msg(x, 0, 3);
- if (IS_ERR(out_skb))
- return PTR_ERR(out_skb); /* XXX Should we return 0 here ? */
-
- out_hdr = (struct sadb_msg *) out_skb->data;
- out_hdr->sadb_msg_version = hdr->sadb_msg_version;
- out_hdr->sadb_msg_type = hdr->sadb_msg_type;
- out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
- out_hdr->sadb_msg_errno = 0;
- out_hdr->sadb_msg_reserved = 0;
- out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
- out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
-
- return 0;
+ if (hdr->sadb_msg_type == SADB_ADD)
+ c.event = XFRM_MSG_NEWSA;
+ else
+ c.event = XFRM_MSG_UPDSA;
+ c.seq = hdr->sadb_msg_seq;
+ c.pid = hdr->sadb_msg_pid;
+ km_state_notify(x, &c);
+out:
+ xfrm_state_put(x);
+ return err;
}
static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
struct xfrm_state *x;
+ struct km_event c;
+ int err;
if (!ext_hdrs[SADB_EXT_SA-1] ||
!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
@@ -1301,13 +1366,19 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
return -EPERM;
}
- xfrm_state_delete(x);
- xfrm_state_put(x);
+ err = xfrm_state_delete(x);
+ if (err < 0) {
+ xfrm_state_put(x);
+ return err;
+ }
- pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
- BROADCAST_ALL, sk);
+ c.seq = hdr->sadb_msg_seq;
+ c.pid = hdr->sadb_msg_pid;
+ c.event = XFRM_MSG_DELSA;
+ km_state_notify(x, &c);
+ xfrm_state_put(x);
- return 0;
+ return err;
}
static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
@@ -1445,28 +1516,42 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg
return 0;
}
+static int key_notify_sa_flush(struct km_event *c)
+{
+ struct sk_buff *skb;
+ struct sadb_msg *hdr;
+
+ skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+ if (!skb)
+ return -ENOBUFS;
+ hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+ hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto);
+ hdr->sadb_msg_seq = c->seq;
+ hdr->sadb_msg_pid = c->pid;
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_errno = (uint8_t) 0;
+ hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+
+ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
+
+ return 0;
+}
+
static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
unsigned proto;
- struct sk_buff *skb_out;
- struct sadb_msg *hdr_out;
+ struct km_event c;
proto = pfkey_satype2proto(hdr->sadb_msg_satype);
if (proto == 0)
return -EINVAL;
- skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
- if (!skb_out)
- return -ENOBUFS;
-
xfrm_state_flush(proto);
-
- hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
- pfkey_hdr_dup(hdr_out, hdr);
- hdr_out->sadb_msg_errno = (uint8_t) 0;
- hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
-
- pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
+ c.data.proto = proto;
+ c.seq = hdr->sadb_msg_seq;
+ c.pid = hdr->sadb_msg_pid;
+ c.event = XFRM_MSG_FLUSHSA;
+ km_state_notify(NULL, &c);
return 0;
}
@@ -1859,6 +1944,35 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
}
+static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ int err;
+
+ out_skb = pfkey_xfrm_policy2msg_prep(xp);
+ if (IS_ERR(out_skb)) {
+ err = PTR_ERR(out_skb);
+ goto out;
+ }
+ pfkey_xfrm_policy2msg(out_skb, xp, dir);
+
+ out_hdr = (struct sadb_msg *) out_skb->data;
+ out_hdr->sadb_msg_version = PF_KEY_V2;
+
+ if (c->data.byid && c->event == XFRM_MSG_DELPOLICY)
+ out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
+ else
+ out_hdr->sadb_msg_type = event2poltype(c->event);
+ out_hdr->sadb_msg_errno = 0;
+ out_hdr->sadb_msg_seq = c->seq;
+ out_hdr->sadb_msg_pid = c->pid;
+ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
+out:
+ return 0;
+
+}
+
static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
{
int err;
@@ -1866,8 +1980,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
struct sadb_address *sa;
struct sadb_x_policy *pol;
struct xfrm_policy *xp;
- struct sk_buff *out_skb;
- struct sadb_msg *out_hdr;
+ struct km_event c;
if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -1935,31 +2048,23 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
(err = parse_ipsecrequests(xp, pol)) < 0)
goto out;
- out_skb = pfkey_xfrm_policy2msg_prep(xp);
- if (IS_ERR(out_skb)) {
- err = PTR_ERR(out_skb);
- goto out;
- }
-
err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
hdr->sadb_msg_type != SADB_X_SPDUPDATE);
if (err) {
- kfree_skb(out_skb);
- goto out;
+ kfree(xp);
+ return err;
}
- pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+ if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
+ c.event = XFRM_MSG_UPDPOLICY;
+ else
+ c.event = XFRM_MSG_NEWPOLICY;
- xfrm_pol_put(xp);
+ c.seq = hdr->sadb_msg_seq;
+ c.pid = hdr->sadb_msg_pid;
- out_hdr = (struct sadb_msg *) out_skb->data;
- out_hdr->sadb_msg_version = hdr->sadb_msg_version;
- out_hdr->sadb_msg_type = hdr->sadb_msg_type;
- out_hdr->sadb_msg_satype = 0;
- out_hdr->sadb_msg_errno = 0;
- out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
- out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+ km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+ xfrm_pol_put(xp);
return 0;
out:
@@ -1973,9 +2078,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
struct sadb_address *sa;
struct sadb_x_policy *pol;
struct xfrm_policy *xp;
- struct sk_buff *out_skb;
- struct sadb_msg *out_hdr;
struct xfrm_selector sel;
+ struct km_event c;
if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2010,25 +2114,40 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
err = 0;
+ c.seq = hdr->sadb_msg_seq;
+ c.pid = hdr->sadb_msg_pid;
+ c.event = XFRM_MSG_DELPOLICY;
+ km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+
+ xfrm_pol_put(xp);
+ return err;
+}
+
+static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, struct sadb_msg *hdr, int dir)
+{
+ int err;
+ struct sk_buff *out_skb;
+ struct sadb_msg *out_hdr;
+ err = 0;
+
out_skb = pfkey_xfrm_policy2msg_prep(xp);
if (IS_ERR(out_skb)) {
err = PTR_ERR(out_skb);
goto out;
}
- pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+ pfkey_xfrm_policy2msg(out_skb, xp, dir);
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = hdr->sadb_msg_version;
- out_hdr->sadb_msg_type = SADB_X_SPDDELETE;
+ out_hdr->sadb_msg_type = hdr->sadb_msg_type;
out_hdr->sadb_msg_satype = 0;
out_hdr->sadb_msg_errno = 0;
out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+ pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk);
err = 0;
out:
- xfrm_pol_put(xp);
return err;
}
@@ -2037,8 +2156,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
int err;
struct sadb_x_policy *pol;
struct xfrm_policy *xp;
- struct sk_buff *out_skb;
- struct sadb_msg *out_hdr;
+ struct km_event c;
if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
return -EINVAL;
@@ -2050,24 +2168,16 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
err = 0;
- out_skb = pfkey_xfrm_policy2msg_prep(xp);
- if (IS_ERR(out_skb)) {
- err = PTR_ERR(out_skb);
- goto out;
+ c.seq = hdr->sadb_msg_seq;
+ c.pid = hdr->sadb_msg_pid;
+ if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
+ c.data.byid = 1;
+ c.event = XFRM_MSG_DELPOLICY;
+ km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+ } else {
+ err = key_pol_get_resp(sk, xp, hdr, pol->sadb_x_policy_dir-1);
}
- pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
- out_hdr = (struct sadb_msg *) out_skb->data;
- out_hdr->sadb_msg_version = hdr->sadb_msg_version;
- out_hdr->sadb_msg_type = hdr->sadb_msg_type;
- out_hdr->sadb_msg_satype = 0;
- out_hdr->sadb_msg_errno = 0;
- out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
- out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
- pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
- err = 0;
-
-out:
xfrm_pol_put(xp);
return err;
}
@@ -2102,22 +2212,34 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
return xfrm_policy_walk(dump_sp, &data);
}
-static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int key_notify_policy_flush(struct km_event *c)
{
struct sk_buff *skb_out;
- struct sadb_msg *hdr_out;
+ struct sadb_msg *hdr;
- skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+ skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
if (!skb_out)
return -ENOBUFS;
+ hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
+ hdr->sadb_msg_seq = c->seq;
+ hdr->sadb_msg_pid = c->pid;
+ hdr->sadb_msg_version = PF_KEY_V2;
+ hdr->sadb_msg_errno = (uint8_t) 0;
+ hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+ pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL);
+ return 0;
- xfrm_policy_flush();
+}
- hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
- pfkey_hdr_dup(hdr_out, hdr);
- hdr_out->sadb_msg_errno = (uint8_t) 0;
- hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
- pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
+static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+ struct km_event c;
+
+ xfrm_policy_flush();
+ c.event = XFRM_MSG_FLUSHPOLICY;
+ c.pid = hdr->sadb_msg_pid;
+ c.seq = hdr->sadb_msg_seq;
+ km_policy_notify(NULL, 0, &c);
return 0;
}
@@ -2317,11 +2439,23 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
}
}
-static int pfkey_send_notify(struct xfrm_state *x, int hard)
+static int key_notify_policy_expire(struct xfrm_policy *xp, struct km_event *c)
+{
+ return 0;
+}
+
+static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
{
struct sk_buff *out_skb;
struct sadb_msg *out_hdr;
- int hsc = (hard ? 2 : 1);
+ int hard;
+ int hsc;
+
+ hard = c->data.hard;
+ if (hard)
+ hsc = 2;
+ else
+ hsc = 1;
out_skb = pfkey_xfrm_state2msg(x, 0, hsc);
if (IS_ERR(out_skb))
@@ -2340,6 +2474,44 @@ static int pfkey_send_notify(struct xfrm_state *x, int hard)
return 0;
}
+static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
+{
+ switch (c->event) {
+ case XFRM_MSG_EXPIRE:
+ return key_notify_sa_expire(x, c);
+ case XFRM_MSG_DELSA:
+ case XFRM_MSG_NEWSA:
+ case XFRM_MSG_UPDSA:
+ return key_notify_sa(x, c);
+ case XFRM_MSG_FLUSHSA:
+ return key_notify_sa_flush(c);
+ default:
+ printk("pfkey: Unknown SA event %d\n", c->event);
+ break;
+ }
+
+ return 0;
+}
+
+static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+ switch (c->event) {
+ case XFRM_MSG_POLEXPIRE:
+ return key_notify_policy_expire(xp, c);
+ case XFRM_MSG_DELPOLICY:
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_UPDPOLICY:
+ return key_notify_policy(xp, dir, c);
+ case XFRM_MSG_FLUSHPOLICY:
+ return key_notify_policy_flush(c);
+ default:
+ printk("pfkey: Unknown policy event %d\n", c->event);
+ break;
+ }
+
+ return 0;
+}
+
static u32 get_acqseq(void)
{
u32 res;
@@ -2856,6 +3028,7 @@ static struct xfrm_mgr pfkeyv2_mgr =
.acquire = pfkey_send_acquire,
.compile_policy = pfkey_compile_policy,
.new_mapping = pfkey_send_new_mapping,
+ .notify_policy = pfkey_send_policy_notify,
};
static void __exit ipsec_pfkey_exit(void)
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 0000000..f0b5efb
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
+#
+# LAPB Data Link Drive
+#
+
+config LAPB
+ tristate "LAPB Data Link Driver (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ ---help---
+ Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
+ the lower) part of the X.25 protocol. It offers a reliable
+ connection service to exchange data frames with one other host, and
+ it is used to transport higher level protocols (mostly X.25 Packet
+ Layer, the higher part of X.25, but others are possible as well).
+ Usually, LAPB is used with specialized X.21 network cards, but Linux
+ currently supports LAPB only over Ethernet connections. If you want
+ to use LAPB connections over Ethernet, say Y here and to "LAPB over
+ Ethernet driver" below. Read
+ <file:Documentation/networking/lapb-module.txt> for technical
+ details.
+
+ To compile this driver as a module, choose M here: the
+ module will be called lapb. If unsure, say N.
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 5de05a0..8b5eefd 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb)
if (!skb_prev)
skb_queue_head(&lapb->write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &lapb->write_queue);
skb_prev = skb;
}
}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 20b4cfe..66f55e5 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -23,13 +23,13 @@
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/tcp.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/llc.h>
#include <net/llc_sap.h>
#include <net/llc_pdu.h>
#include <net/llc_conn.h>
+#include <net/tcp_states.h>
/* remember: uninitialized global data is zeroed because its in .bss */
static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
@@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
if (uaddr)
memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
msg->msg_namelen = sizeof(*uaddr);
- if (!skb->list) {
+ if (!skb->next) {
dgram_free:
kfree_skb(skb);
}
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index cd130c3..d5bdb53 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
if (llc->dev->flags & IFF_LOOPBACK)
goto out;
rc = 1;
- if (!skb_queue_len(&llc->pdu_unack_q))
+ if (skb_queue_empty(&llc->pdu_unack_q))
goto out;
skb = skb_peek(&llc->pdu_unack_q);
pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index eba812a..4c644bc 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -16,7 +16,7 @@
#include <net/llc_sap.h>
#include <net/llc_conn.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/llc_c_ev.h>
#include <net/llc_c_ac.h>
#include <net/llc_c_st.h>
@@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
if (!ev->ind_prim && !ev->cfm_prim) {
/* indicate or confirm not required */
- if (!skb->list)
+ /* XXX this is not very pretty, perhaps we should store
+ * XXX indicate/confirm-needed state in the llc_conn_state_ev
+ * XXX control block of the SKB instead? -DaveM
+ */
+ if (!skb->next)
goto out_kfree_skb;
goto out_skb_put;
}
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 5ff02c0..9727455 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -103,7 +103,8 @@ out:
struct llc_sap *llc_sap_open(unsigned char lsap,
int (*func)(struct sk_buff *skb,
struct net_device *dev,
- struct packet_type *pt))
+ struct packet_type *pt,
+ struct net_device *orig_dev))
{
struct llc_sap *sap = llc_sap_find(lsap);
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 0f9fc48..0f84f66 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -15,7 +15,6 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
-#include <linux/tcp.h>
#include <asm/errno.h>
#include <net/llc_if.h>
#include <net/llc_sap.h>
@@ -25,6 +24,7 @@
#include <net/llc_c_ev.h>
#include <net/llc_c_ac.h>
#include <net/llc_c_st.h>
+#include <net/tcp_states.h>
u8 llc_mac_null_var[IFHWADDRLEN];
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index 4da6976..13b4624 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
* data now), it queues this frame in the connection's backlog.
*/
int llc_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct llc_sap *sap;
struct llc_pdu_sn *pdu;
@@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
* LLC functionality
*/
if (sap->rcv_func) {
- sap->rcv_func(skb, dev, pt);
+ sap->rcv_func(skb, dev, pt, orig_dev);
goto out;
}
dest = llc_pdu_type(skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 965c94e..34228ef 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -21,7 +21,7 @@
#include <net/llc_s_ev.h>
#include <net/llc_s_st.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <linux/llc.h>
/**
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 0000000..8296b38
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,24 @@
+config NETFILTER_NETLINK
+ tristate "Netfilter netlink interface"
+ help
+ If this option is enabled, the kernel will include support
+ for the new netfilter netlink interface.
+
+config NETFILTER_NETLINK_QUEUE
+ tristate "Netfilter NFQUEUE over NFNETLINK interface"
+ depends on NETFILTER_NETLINK
+ help
+ If this option isenabled, the kernel will include support
+ for queueing packets via NFNETLINK.
+
+config NETFILTER_NETLINK_LOG
+ tristate "Netfilter LOG over NFNETLINK interface"
+ depends on NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for logging packets via NFNETLINK.
+
+ This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms,
+ and is also scheduled to replace the old syslog-based ipt_LOG
+ and ip6t_LOG modules.
+
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
new file mode 100644
index 0000000..b3b44f8
--- /dev/null
+++ b/net/netfilter/Makefile
@@ -0,0 +1,7 @@
+netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+
+obj-$(CONFIG_NETFILTER) = netfilter.o
+
+obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
new file mode 100644
index 0000000..1ceb1a6
--- /dev/null
+++ b/net/netfilter/core.c
@@ -0,0 +1,216 @@
+/* netfilter.c: look after the filters for various protocols.
+ * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
+ *
+ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
+ * way.
+ *
+ * Rusty Russell (C)2000 -- This code is GPL.
+ *
+ * February 2000: Modified by James Morris to have 1 queue per protocol.
+ * 15-Mar-2000: Added NF_REPEAT --RR.
+ * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik.
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* In this code, we can be waiting indefinitely for userspace to
+ * service a packet if a hook returns NF_QUEUE. We could keep a count
+ * of skbuffs queued for userspace, and not deregister a hook unless
+ * this is zero, but that sucks. Now, we simply check when the
+ * packets come back: if the hook is gone, the packet is discarded. */
+struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
+EXPORT_SYMBOL(nf_hooks);
+static DEFINE_SPINLOCK(nf_hook_lock);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+ struct list_head *i;
+
+ spin_lock_bh(&nf_hook_lock);
+ list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
+ if (reg->priority < ((struct nf_hook_ops *)i)->priority)
+ break;
+ }
+ list_add_rcu(&reg->list, i->prev);
+ spin_unlock_bh(&nf_hook_lock);
+
+ synchronize_net();
+ return 0;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+ spin_lock_bh(&nf_hook_lock);
+ list_del_rcu(&reg->list);
+ spin_unlock_bh(&nf_hook_lock);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(nf_unregister_hook);
+
+unsigned int nf_iterate(struct list_head *head,
+ struct sk_buff **skb,
+ int hook,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ struct list_head **i,
+ int (*okfn)(struct sk_buff *),
+ int hook_thresh)
+{
+ unsigned int verdict;
+
+ /*
+ * The caller must not block between calls to this
+ * function because of risk of continuing from deleted element.
+ */
+ list_for_each_continue_rcu(*i, head) {
+ struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
+
+ if (hook_thresh > elem->priority)
+ continue;
+
+ /* Optimization: we don't need to hold module
+ reference here, since function can't sleep. --RR */
+ verdict = elem->hook(hook, skb, indev, outdev, okfn);
+ if (verdict != NF_ACCEPT) {
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (unlikely((verdict & NF_VERDICT_MASK)
+ > NF_MAX_VERDICT)) {
+ NFDEBUG("Evil return from %p(%u).\n",
+ elem->hook, hook);
+ continue;
+ }
+#endif
+ if (verdict != NF_REPEAT)
+ return verdict;
+ *i = (*i)->prev;
+ }
+ }
+ return NF_ACCEPT;
+}
+
+
+/* Returns 1 if okfn() needs to be executed by the caller,
+ * -EPERM for NF_DROP, 0 otherwise. */
+int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct sk_buff *),
+ int hook_thresh)
+{
+ struct list_head *elem;
+ unsigned int verdict;
+ int ret = 0;
+
+ /* We may already have this, but read-locks nest anyway */
+ rcu_read_lock();
+
+ elem = &nf_hooks[pf][hook];
+next_hook:
+ verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
+ outdev, &elem, okfn, hook_thresh);
+ if (verdict == NF_ACCEPT || verdict == NF_STOP) {
+ ret = 1;
+ goto unlock;
+ } else if (verdict == NF_DROP) {
+ kfree_skb(*pskb);
+ ret = -EPERM;
+ } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+ NFDEBUG("nf_hook: Verdict = QUEUE.\n");
+ if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn,
+ verdict >> NF_VERDICT_BITS))
+ goto next_hook;
+ }
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_hook_slow);
+
+
+int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
+{
+ struct sk_buff *nskb;
+
+ if (writable_len > (*pskb)->len)
+ return 0;
+
+ /* Not exclusive use of packet? Must copy. */
+ if (skb_shared(*pskb) || skb_cloned(*pskb))
+ goto copy_skb;
+
+ return pskb_may_pull(*pskb, writable_len);
+
+copy_skb:
+ nskb = skb_copy(*pskb, GFP_ATOMIC);
+ if (!nskb)
+ return 0;
+ BUG_ON(skb_is_nonlinear(nskb));
+
+ /* Rest of kernel will get very unhappy if we pass it a
+ suddenly-orphaned skbuff */
+ if ((*pskb)->sk)
+ skb_set_owner_w(nskb, (*pskb)->sk);
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ return 1;
+}
+EXPORT_SYMBOL(skb_make_writable);
+
+
+/* This does not belong here, but locally generated errors need it if connection
+ tracking in use: without this, connection may not be in hash table, and hence
+ manufactured ICMP or RST packets will not be associated with it. */
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+EXPORT_SYMBOL(ip_ct_attach);
+
+void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+{
+ void (*attach)(struct sk_buff *, struct sk_buff *);
+
+ if (skb->nfct && (attach = ip_ct_attach) != NULL) {
+ mb(); /* Just to be sure: must be read before executing this */
+ attach(new, skb);
+ }
+}
+EXPORT_SYMBOL(nf_ct_attach);
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_net_netfilter;
+EXPORT_SYMBOL(proc_net_netfilter);
+#endif
+
+void __init netfilter_init(void)
+{
+ int i, h;
+ for (i = 0; i < NPROTO; i++) {
+ for (h = 0; h < NF_MAX_HOOKS; h++)
+ INIT_LIST_HEAD(&nf_hooks[i][h]);
+ }
+
+#ifdef CONFIG_PROC_FS
+ proc_net_netfilter = proc_mkdir("netfilter", proc_net);
+ if (!proc_net_netfilter)
+ panic("cannot create netfilter proc entry");
+#endif
+
+ if (netfilter_queue_init() < 0)
+ panic("cannot initialize nf_queue");
+ if (netfilter_log_init() < 0)
+ panic("cannot initialize nf_log");
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
new file mode 100644
index 0000000..6bdee29
--- /dev/null
+++ b/net/netfilter/nf_internals.h
@@ -0,0 +1,39 @@
+#ifndef _NF_INTERNALS_H
+#define _NF_INTERNALS_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define NFDEBUG(format, args...) printk(format , ## args)
+#else
+#define NFDEBUG(format, args...)
+#endif
+
+
+/* core.c */
+extern unsigned int nf_iterate(struct list_head *head,
+ struct sk_buff **skb,
+ int hook,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ struct list_head **i,
+ int (*okfn)(struct sk_buff *),
+ int hook_thresh);
+
+/* nf_queue.c */
+extern int nf_queue(struct sk_buff **skb,
+ struct list_head *elem,
+ int pf, unsigned int hook,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct sk_buff *),
+ unsigned int queuenum);
+extern int __init netfilter_queue_init(void);
+
+/* nf_log.c */
+extern int __init netfilter_log_init(void);
+
+#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
new file mode 100644
index 0000000..3e76bd0
--- /dev/null
+++ b/net/netfilter/nf_log.c
@@ -0,0 +1,178 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+
+#include "nf_internals.h"
+
+/* Internal logging interface, which relies on the real
+ LOG target modules */
+
+#define NF_LOG_PREFIXLEN 128
+
+static struct nf_logger *nf_logging[NPROTO]; /* = NULL */
+static DEFINE_SPINLOCK(nf_log_lock);
+
+/* return EBUSY if somebody else is registered, EEXIST if the same logger
+ * is registred, 0 on success. */
+int nf_log_register(int pf, struct nf_logger *logger)
+{
+ int ret = -EBUSY;
+
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ /* Any setup of logging members must be done before
+ * substituting pointer. */
+ spin_lock(&nf_log_lock);
+ if (!nf_logging[pf]) {
+ rcu_assign_pointer(nf_logging[pf], logger);
+ ret = 0;
+ } else if (nf_logging[pf] == logger)
+ ret = -EEXIST;
+
+ spin_unlock(&nf_log_lock);
+ return ret;
+}
+EXPORT_SYMBOL(nf_log_register);
+
+int nf_log_unregister_pf(int pf)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ spin_lock(&nf_log_lock);
+ nf_logging[pf] = NULL;
+ spin_unlock(&nf_log_lock);
+
+ /* Give time to concurrent readers. */
+ synchronize_net();
+
+ return 0;
+}
+EXPORT_SYMBOL(nf_log_unregister_pf);
+
+void nf_log_unregister_logger(struct nf_logger *logger)
+{
+ int i;
+
+ spin_lock(&nf_log_lock);
+ for (i = 0; i < NPROTO; i++) {
+ if (nf_logging[i] == logger)
+ nf_logging[i] = NULL;
+ }
+ spin_unlock(&nf_log_lock);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(nf_log_unregister_logger);
+
+void nf_log_packet(int pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct nf_loginfo *loginfo,
+ const char *fmt, ...)
+{
+ va_list args;
+ char prefix[NF_LOG_PREFIXLEN];
+ struct nf_logger *logger;
+
+ rcu_read_lock();
+ logger = rcu_dereference(nf_logging[pf]);
+ if (logger) {
+ va_start(args, fmt);
+ vsnprintf(prefix, sizeof(prefix), fmt, args);
+ va_end(args);
+ /* We must read logging before nf_logfn[pf] */
+ logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
+ } else if (net_ratelimit()) {
+ printk(KERN_WARNING "nf_log_packet: can\'t log since "
+ "no backend logging module loaded in! Please either "
+ "load one, or disable logging explicitly\n");
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_packet);
+
+#ifdef CONFIG_PROC_FS
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ rcu_read_lock();
+
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+ rcu_read_unlock();
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ loff_t *pos = v;
+ const struct nf_logger *logger;
+
+ logger = rcu_dereference(nf_logging[*pos]);
+
+ if (!logger)
+ return seq_printf(s, "%2lld NONE\n", *pos);
+
+ return seq_printf(s, "%2lld %s\n", *pos, logger->name);
+}
+
+static struct seq_operations nflog_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nflog_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nflog_seq_ops);
+}
+
+static struct file_operations nflog_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nflog_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#endif /* PROC_FS */
+
+
+int __init netfilter_log_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *pde;
+
+ pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter);
+ if (!pde)
+ return -1;
+
+ pde->proc_fops = &nflog_file_ops;
+#endif
+ return 0;
+}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
new file mode 100644
index 0000000..d10d552
--- /dev/null
+++ b/net/netfilter/nf_queue.c
@@ -0,0 +1,343 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+
+#include "nf_internals.h"
+
+/*
+ * A queue handler may be registered for each protocol. Each is protected by
+ * long term mutex. The handler must provide an an outfn() to accept packets
+ * for queueing and must reinject all packets it receives, no matter what.
+ */
+static struct nf_queue_handler *queue_handler[NPROTO];
+static struct nf_queue_rerouter *queue_rerouter;
+
+static DEFINE_RWLOCK(queue_handler_lock);
+
+/* return EBUSY when somebody else is registered, return EEXIST if the
+ * same handler is registered, return 0 in case of success. */
+int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
+{
+ int ret;
+
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ write_lock_bh(&queue_handler_lock);
+ if (queue_handler[pf] == qh)
+ ret = -EEXIST;
+ else if (queue_handler[pf])
+ ret = -EBUSY;
+ else {
+ queue_handler[pf] = qh;
+ ret = 0;
+ }
+ write_unlock_bh(&queue_handler_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(nf_register_queue_handler);
+
+/* The caller must flush their queue before this */
+int nf_unregister_queue_handler(int pf)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ write_lock_bh(&queue_handler_lock);
+ queue_handler[pf] = NULL;
+ write_unlock_bh(&queue_handler_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(nf_unregister_queue_handler);
+
+int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ write_lock_bh(&queue_handler_lock);
+ memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf]));
+ write_unlock_bh(&queue_handler_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_register_queue_rerouter);
+
+int nf_unregister_queue_rerouter(int pf)
+{
+ if (pf >= NPROTO)
+ return -EINVAL;
+
+ write_lock_bh(&queue_handler_lock);
+ memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf]));
+ write_unlock_bh(&queue_handler_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter);
+
+void nf_unregister_queue_handlers(struct nf_queue_handler *qh)
+{
+ int pf;
+
+ write_lock_bh(&queue_handler_lock);
+ for (pf = 0; pf < NPROTO; pf++) {
+ if (queue_handler[pf] == qh)
+ queue_handler[pf] = NULL;
+ }
+ write_unlock_bh(&queue_handler_lock);
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
+
+/*
+ * Any packet that leaves via this function must come back
+ * through nf_reinject().
+ */
+int nf_queue(struct sk_buff **skb,
+ struct list_head *elem,
+ int pf, unsigned int hook,
+ struct net_device *indev,
+ struct net_device *outdev,
+ int (*okfn)(struct sk_buff *),
+ unsigned int queuenum)
+{
+ int status;
+ struct nf_info *info;
+#ifdef CONFIG_BRIDGE_NETFILTER
+ struct net_device *physindev = NULL;
+ struct net_device *physoutdev = NULL;
+#endif
+
+ /* QUEUE == DROP if noone is waiting, to be safe. */
+ read_lock(&queue_handler_lock);
+ if (!queue_handler[pf]->outfn) {
+ read_unlock(&queue_handler_lock);
+ kfree_skb(*skb);
+ return 1;
+ }
+
+ info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC);
+ if (!info) {
+ if (net_ratelimit())
+ printk(KERN_ERR "OOM queueing packet %p\n",
+ *skb);
+ read_unlock(&queue_handler_lock);
+ kfree_skb(*skb);
+ return 1;
+ }
+
+ *info = (struct nf_info) {
+ (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
+
+ /* If it's going away, ignore hook. */
+ if (!try_module_get(info->elem->owner)) {
+ read_unlock(&queue_handler_lock);
+ kfree(info);
+ return 0;
+ }
+
+ /* Bump dev refs so they don't vanish while packet is out */
+ if (indev) dev_hold(indev);
+ if (outdev) dev_hold(outdev);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if ((*skb)->nf_bridge) {
+ physindev = (*skb)->nf_bridge->physindev;
+ if (physindev) dev_hold(physindev);
+ physoutdev = (*skb)->nf_bridge->physoutdev;
+ if (physoutdev) dev_hold(physoutdev);
+ }
+#endif
+ if (queue_rerouter[pf].save)
+ queue_rerouter[pf].save(*skb, info);
+
+ status = queue_handler[pf]->outfn(*skb, info, queuenum,
+ queue_handler[pf]->data);
+
+ if (status >= 0 && queue_rerouter[pf].reroute)
+ status = queue_rerouter[pf].reroute(skb, info);
+
+ read_unlock(&queue_handler_lock);
+
+ if (status < 0) {
+ /* James M doesn't say fuck enough. */
+ if (indev) dev_put(indev);
+ if (outdev) dev_put(outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (physindev) dev_put(physindev);
+ if (physoutdev) dev_put(physoutdev);
+#endif
+ module_put(info->elem->owner);
+ kfree(info);
+ kfree_skb(*skb);
+
+ return 1;
+ }
+
+ return 1;
+}
+
+void nf_reinject(struct sk_buff *skb, struct nf_info *info,
+ unsigned int verdict)
+{
+ struct list_head *elem = &info->elem->list;
+ struct list_head *i;
+
+ rcu_read_lock();
+
+ /* Release those devices we held, or Alexey will kill me. */
+ if (info->indev) dev_put(info->indev);
+ if (info->outdev) dev_put(info->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (skb->nf_bridge) {
+ if (skb->nf_bridge->physindev)
+ dev_put(skb->nf_bridge->physindev);
+ if (skb->nf_bridge->physoutdev)
+ dev_put(skb->nf_bridge->physoutdev);
+ }
+#endif
+
+ /* Drop reference to owner of hook which queued us. */
+ module_put(info->elem->owner);
+
+ list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
+ if (i == elem)
+ break;
+ }
+
+ if (elem == &nf_hooks[info->pf][info->hook]) {
+ /* The module which sent it to userspace is gone. */
+ NFDEBUG("%s: module disappeared, dropping packet.\n",
+ __FUNCTION__);
+ verdict = NF_DROP;
+ }
+
+ /* Continue traversal iff userspace said ok... */
+ if (verdict == NF_REPEAT) {
+ elem = elem->prev;
+ verdict = NF_ACCEPT;
+ }
+
+ if (verdict == NF_ACCEPT) {
+ next_hook:
+ verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
+ &skb, info->hook,
+ info->indev, info->outdev, &elem,
+ info->okfn, INT_MIN);
+ }
+
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ info->okfn(skb);
+ break;
+
+ case NF_QUEUE:
+ if (!nf_queue(&skb, elem, info->pf, info->hook,
+ info->indev, info->outdev, info->okfn,
+ verdict >> NF_VERDICT_BITS))
+ goto next_hook;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (verdict == NF_DROP)
+ kfree_skb(skb);
+
+ kfree(info);
+ return;
+}
+EXPORT_SYMBOL(nf_reinject);
+
+#ifdef CONFIG_PROC_FS
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ if (*pos >= NPROTO)
+ return NULL;
+
+ return pos;
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ int ret;
+ loff_t *pos = v;
+ struct nf_queue_handler *qh;
+
+ read_lock_bh(&queue_handler_lock);
+ qh = queue_handler[*pos];
+ if (!qh)
+ ret = seq_printf(s, "%2lld NONE\n", *pos);
+ else
+ ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
+ read_unlock_bh(&queue_handler_lock);
+
+ return ret;
+}
+
+static struct seq_operations nfqueue_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nfqueue_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &nfqueue_seq_ops);
+}
+
+static struct file_operations nfqueue_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nfqueue_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif /* PROC_FS */
+
+
+int __init netfilter_queue_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *pde;
+#endif
+ queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter),
+ GFP_KERNEL);
+ if (!queue_rerouter)
+ return -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+ pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter);
+ if (!pde) {
+ kfree(queue_rerouter);
+ return -1;
+ }
+ pde->proc_fops = &nfqueue_file_ops;
+#endif
+ memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter));
+
+ return 0;
+}
+
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
new file mode 100644
index 0000000..61a833a
--- /dev/null
+++ b/net/netfilter/nf_sockopt.c
@@ -0,0 +1,132 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* Sockopts only registered and called from user context, so
+ net locking would be overkill. Also, [gs]etsockopt calls may
+ sleep. */
+static DECLARE_MUTEX(nf_sockopt_mutex);
+static LIST_HEAD(nf_sockopts);
+
+/* Do exclusive ranges overlap? */
+static inline int overlap(int min1, int max1, int min2, int max2)
+{
+ return max1 > min2 && min1 < max2;
+}
+
+/* Functions to register sockopt ranges (exclusive). */
+int nf_register_sockopt(struct nf_sockopt_ops *reg)
+{
+ struct list_head *i;
+ int ret = 0;
+
+ if (down_interruptible(&nf_sockopt_mutex) != 0)
+ return -EINTR;
+
+ list_for_each(i, &nf_sockopts) {
+ struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
+ if (ops->pf == reg->pf
+ && (overlap(ops->set_optmin, ops->set_optmax,
+ reg->set_optmin, reg->set_optmax)
+ || overlap(ops->get_optmin, ops->get_optmax,
+ reg->get_optmin, reg->get_optmax))) {
+ NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
+ ops->set_optmin, ops->set_optmax,
+ ops->get_optmin, ops->get_optmax,
+ reg->set_optmin, reg->set_optmax,
+ reg->get_optmin, reg->get_optmax);
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ list_add(&reg->list, &nf_sockopts);
+out:
+ up(&nf_sockopt_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(nf_register_sockopt);
+
+void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
+{
+ /* No point being interruptible: we're probably in cleanup_module() */
+ restart:
+ down(&nf_sockopt_mutex);
+ if (reg->use != 0) {
+ /* To be woken by nf_sockopt call... */
+ /* FIXME: Stuart Young's name appears gratuitously. */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ reg->cleanup_task = current;
+ up(&nf_sockopt_mutex);
+ schedule();
+ goto restart;
+ }
+ list_del(&reg->list);
+ up(&nf_sockopt_mutex);
+}
+EXPORT_SYMBOL(nf_unregister_sockopt);
+
+/* Call get/setsockopt() */
+static int nf_sockopt(struct sock *sk, int pf, int val,
+ char __user *opt, int *len, int get)
+{
+ struct list_head *i;
+ struct nf_sockopt_ops *ops;
+ int ret;
+
+ if (down_interruptible(&nf_sockopt_mutex) != 0)
+ return -EINTR;
+
+ list_for_each(i, &nf_sockopts) {
+ ops = (struct nf_sockopt_ops *)i;
+ if (ops->pf == pf) {
+ if (get) {
+ if (val >= ops->get_optmin
+ && val < ops->get_optmax) {
+ ops->use++;
+ up(&nf_sockopt_mutex);
+ ret = ops->get(sk, val, opt, len);
+ goto out;
+ }
+ } else {
+ if (val >= ops->set_optmin
+ && val < ops->set_optmax) {
+ ops->use++;
+ up(&nf_sockopt_mutex);
+ ret = ops->set(sk, val, opt, *len);
+ goto out;
+ }
+ }
+ }
+ }
+ up(&nf_sockopt_mutex);
+ return -ENOPROTOOPT;
+
+ out:
+ down(&nf_sockopt_mutex);
+ ops->use--;
+ if (ops->cleanup_task)
+ wake_up_process(ops->cleanup_task);
+ up(&nf_sockopt_mutex);
+ return ret;
+}
+
+int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
+ int len)
+{
+ return nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(nf_setsockopt);
+
+int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
+{
+ return nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(nf_getsockopt);
+
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
new file mode 100644
index 0000000..49a3900
--- /dev/null
+++ b/net/netfilter/nfnetlink.c
@@ -0,0 +1,376 @@
+/* Netfilter messages via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>,
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * Initial netfilter messages via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <net/sock.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+
+static char __initdata nfversion[] = "0.30";
+
+#if 0
+#define DEBUGP(format, args...) \
+ printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \
+ __LINE__, __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static struct sock *nfnl = NULL;
+static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
+DECLARE_MUTEX(nfnl_sem);
+
+void nfnl_lock(void)
+{
+ nfnl_shlock();
+}
+
+void nfnl_unlock(void)
+{
+ nfnl_shunlock();
+}
+
+int nfnetlink_subsys_register(struct nfnetlink_subsystem *n)
+{
+ DEBUGP("registering subsystem ID %u\n", n->subsys_id);
+
+ nfnl_lock();
+ if (subsys_table[n->subsys_id]) {
+ nfnl_unlock();
+ return -EBUSY;
+ }
+ subsys_table[n->subsys_id] = n;
+ nfnl_unlock();
+
+ return 0;
+}
+
+int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n)
+{
+ DEBUGP("unregistering subsystem ID %u\n", n->subsys_id);
+
+ nfnl_lock();
+ subsys_table[n->subsys_id] = NULL;
+ nfnl_unlock();
+
+ return 0;
+}
+
+static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+{
+ u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+
+ if (subsys_id >= NFNL_SUBSYS_COUNT
+ || subsys_table[subsys_id] == NULL)
+ return NULL;
+
+ return subsys_table[subsys_id];
+}
+
+static inline struct nfnl_callback *
+nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss)
+{
+ u_int8_t cb_id = NFNL_MSG_TYPE(type);
+
+ if (cb_id >= ss->cb_count) {
+ DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count);
+ return NULL;
+ }
+
+ return &ss->cb[cb_id];
+}
+
+void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
+ const void *data)
+{
+ struct nfattr *nfa;
+ int size = NFA_LENGTH(attrlen);
+
+ nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
+ nfa->nfa_type = attrtype;
+ nfa->nfa_len = size;
+ memcpy(NFA_DATA(nfa), data, attrlen);
+ memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size);
+}
+
+int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
+{
+ memset(tb, 0, sizeof(struct nfattr *) * maxattr);
+
+ while (NFA_OK(nfa, len)) {
+ unsigned flavor = nfa->nfa_type;
+ if (flavor && flavor <= maxattr)
+ tb[flavor-1] = nfa;
+ nfa = NFA_NEXT(nfa, len);
+ }
+
+ return 0;
+}
+
+/**
+ * nfnetlink_check_attributes - check and parse nfnetlink attributes
+ *
+ * subsys: nfnl subsystem for which this message is to be parsed
+ * nlmsghdr: netlink message to be checked/parsed
+ * cda: array of pointers, needs to be at least subsys->attr_count big
+ *
+ */
+static int
+nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
+ struct nlmsghdr *nlh, struct nfattr *cda[])
+{
+ int min_len;
+ u_int16_t attr_count;
+ u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+
+ if (unlikely(cb_id >= subsys->cb_count)) {
+ DEBUGP("msgtype %u >= %u, returning\n",
+ cb_id, subsys->cb_count);
+ return -EINVAL;
+ }
+
+ min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
+ if (unlikely(nlh->nlmsg_len < min_len))
+ return -EINVAL;
+
+ attr_count = subsys->cb[cb_id].attr_count;
+ memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+
+ /* check attribute lengths. */
+ if (likely(nlh->nlmsg_len > min_len)) {
+ struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
+ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+
+ while (NFA_OK(attr, attrlen)) {
+ unsigned flavor = attr->nfa_type;
+ if (flavor) {
+ if (flavor > attr_count)
+ return -EINVAL;
+ cda[flavor - 1] = attr;
+ }
+ attr = NFA_NEXT(attr, attrlen);
+ }
+ }
+
+ /* implicit: if nlmsg_len == min_len, we return 0, and an empty
+ * (zeroed) cda[] array. The message is valid, but empty. */
+
+ return 0;
+}
+
+int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+{
+ int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+ int err = 0;
+
+ NETLINK_CB(skb).dst_group = group;
+ if (echo)
+ atomic_inc(&skb->users);
+ netlink_broadcast(nfnl, skb, pid, group, allocation);
+ if (echo)
+ err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT);
+
+ return err;
+}
+
+int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
+{
+ return netlink_unicast(nfnl, skb, pid, flags);
+}
+
+/* Process one complete nfnetlink message. */
+static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
+ struct nlmsghdr *nlh, int *errp)
+{
+ struct nfnl_callback *nc;
+ struct nfnetlink_subsystem *ss;
+ int type, err = 0;
+
+ DEBUGP("entered; subsys=%u, msgtype=%u\n",
+ NFNL_SUBSYS_ID(nlh->nlmsg_type),
+ NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+ /* Only requests are handled by kernel now. */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+ DEBUGP("received non-request message\n");
+ return 0;
+ }
+
+ /* All the messages must at least contain nfgenmsg */
+ if (nlh->nlmsg_len <
+ NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
+ DEBUGP("received message was too short\n");
+ return 0;
+ }
+
+ type = nlh->nlmsg_type;
+ ss = nfnetlink_get_subsys(type);
+ if (!ss) {
+#ifdef CONFIG_KMOD
+ /* don't call nfnl_shunlock, since it would reenter
+ * with further packet processing */
+ up(&nfnl_sem);
+ request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
+ nfnl_shlock();
+ ss = nfnetlink_get_subsys(type);
+ if (!ss)
+#endif
+ goto err_inval;
+ }
+
+ nc = nfnetlink_find_client(type, ss);
+ if (!nc) {
+ DEBUGP("unable to find client for type %d\n", type);
+ goto err_inval;
+ }
+
+ if (nc->cap_required &&
+ !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
+ DEBUGP("permission denied for type %d\n", type);
+ *errp = -EPERM;
+ return -1;
+ }
+
+ {
+ u_int16_t attr_count =
+ ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
+ struct nfattr *cda[attr_count];
+
+ memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+
+ err = nfnetlink_check_attributes(ss, nlh, cda);
+ if (err < 0)
+ goto err_inval;
+
+ DEBUGP("calling handler\n");
+ err = nc->call(nfnl, skb, nlh, cda, errp);
+ *errp = err;
+ return err;
+ }
+
+err_inval:
+ DEBUGP("returning -EINVAL\n");
+ *errp = -EINVAL;
+ return -1;
+}
+
+/* Process one packet of messages. */
+static inline int nfnetlink_rcv_skb(struct sk_buff *skb)
+{
+ int err;
+ struct nlmsghdr *nlh;
+
+ while (skb->len >= NLMSG_SPACE(0)) {
+ u32 rlen;
+
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
+ || skb->len < nlh->nlmsg_len)
+ return 0;
+ rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (rlen > skb->len)
+ rlen = skb->len;
+ if (nfnetlink_rcv_msg(skb, nlh, &err)) {
+ if (!err)
+ return -1;
+ netlink_ack(skb, nlh, err);
+ } else
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ skb_pull(skb, rlen);
+ }
+
+ return 0;
+}
+
+static void nfnetlink_rcv(struct sock *sk, int len)
+{
+ do {
+ struct sk_buff *skb;
+
+ if (nfnl_shlock_nowait())
+ return;
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ if (nfnetlink_rcv_skb(skb)) {
+ if (skb->len)
+ skb_queue_head(&sk->sk_receive_queue,
+ skb);
+ else
+ kfree_skb(skb);
+ break;
+ }
+ kfree_skb(skb);
+ }
+
+ /* don't call nfnl_shunlock, since it would reenter
+ * with further packet processing */
+ up(&nfnl_sem);
+ } while(nfnl && nfnl->sk_receive_queue.qlen);
+}
+
+static void __exit nfnetlink_exit(void)
+{
+ printk("Removing netfilter NETLINK layer.\n");
+ sock_release(nfnl->sk_socket);
+ return;
+}
+
+static int __init nfnetlink_init(void)
+{
+ printk("Netfilter messages via NETLINK v%s.\n", nfversion);
+
+ nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX,
+ nfnetlink_rcv, THIS_MODULE);
+ if (!nfnl) {
+ printk(KERN_ERR "cannot initialize nfnetlink!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+module_init(nfnetlink_init);
+module_exit(nfnetlink_exit);
+
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
+EXPORT_SYMBOL_GPL(nfnetlink_send);
+EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+EXPORT_SYMBOL_GPL(nfattr_parse);
+EXPORT_SYMBOL_GPL(__nfa_fill);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 0000000..ff5601c
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1055 @@
+/*
+ * This is a module which is used for logging packets to userspace via
+ * nfetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ipt_ULOG.c:
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <net/sock.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFULNL_NLBUFSIZ_DEFAULT 4096
+#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
+#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
+
+#define PRINTR(x, args...) do { if (net_ratelimit()) \
+ printk(x, ## args); } while (0);
+
+#if 0
+#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
+ __FILE__, __LINE__, __FUNCTION__, \
+ ## args)
+#else
+#define UDEBUG(x, ...)
+#endif
+
+struct nfulnl_instance {
+ struct hlist_node hlist; /* global list of instances */
+ spinlock_t lock;
+ atomic_t use; /* use count */
+
+ unsigned int qlen; /* number of nlmsgs in skb */
+ struct sk_buff *skb; /* pre-allocatd skb */
+ struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
+ struct timer_list timer;
+ int peer_pid; /* PID of the peer process */
+
+ /* configurable parameters */
+ unsigned int flushtimeout; /* timeout until queue flush */
+ unsigned int nlbufsiz; /* netlink buffer allocation size */
+ unsigned int qthreshold; /* threshold of the queue */
+ u_int32_t copy_range;
+ u_int16_t group_num; /* number of this queue */
+ u_int8_t copy_mode;
+};
+
+static DEFINE_RWLOCK(instances_lock);
+
+#define INSTANCE_BUCKETS 16
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+static unsigned int hash_init;
+
+static inline u_int8_t instance_hashfn(u_int16_t group_num)
+{
+ return ((group_num & 0xff) % INSTANCE_BUCKETS);
+}
+
+static struct nfulnl_instance *
+__instance_lookup(u_int16_t group_num)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct nfulnl_instance *inst;
+
+ UDEBUG("entering (group_num=%u)\n", group_num);
+
+ head = &instance_table[instance_hashfn(group_num)];
+ hlist_for_each_entry(inst, pos, head, hlist) {
+ if (inst->group_num == group_num)
+ return inst;
+ }
+ return NULL;
+}
+
+static inline void
+instance_get(struct nfulnl_instance *inst)
+{
+ atomic_inc(&inst->use);
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(u_int16_t group_num)
+{
+ struct nfulnl_instance *inst;
+
+ read_lock_bh(&instances_lock);
+ inst = __instance_lookup(group_num);
+ if (inst)
+ instance_get(inst);
+ read_unlock_bh(&instances_lock);
+
+ return inst;
+}
+
+static void
+instance_put(struct nfulnl_instance *inst)
+{
+ if (inst && atomic_dec_and_test(&inst->use)) {
+ UDEBUG("kfree(inst=%p)\n", inst);
+ kfree(inst);
+ }
+}
+
+static void nfulnl_timer(unsigned long data);
+
+static struct nfulnl_instance *
+instance_create(u_int16_t group_num, int pid)
+{
+ struct nfulnl_instance *inst;
+
+ UDEBUG("entering (group_num=%u, pid=%d)\n", group_num,
+ pid);
+
+ write_lock_bh(&instances_lock);
+ if (__instance_lookup(group_num)) {
+ inst = NULL;
+ UDEBUG("aborting, instance already exists\n");
+ goto out_unlock;
+ }
+
+ inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+ if (!inst)
+ goto out_unlock;
+
+ memset(inst, 0, sizeof(*inst));
+ INIT_HLIST_NODE(&inst->hlist);
+ inst->lock = SPIN_LOCK_UNLOCKED;
+ /* needs to be two, since we _put() after creation */
+ atomic_set(&inst->use, 2);
+
+ init_timer(&inst->timer);
+ inst->timer.function = nfulnl_timer;
+ inst->timer.data = (unsigned long)inst;
+ /* don't start timer yet. (re)start it with every packet */
+
+ inst->peer_pid = pid;
+ inst->group_num = group_num;
+
+ inst->qthreshold = NFULNL_QTHRESH_DEFAULT;
+ inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT;
+ inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT;
+ inst->copy_mode = NFULNL_COPY_PACKET;
+ inst->copy_range = 0xffff;
+
+ if (!try_module_get(THIS_MODULE))
+ goto out_free;
+
+ hlist_add_head(&inst->hlist,
+ &instance_table[instance_hashfn(group_num)]);
+
+ UDEBUG("newly added node: %p, next=%p\n", &inst->hlist,
+ inst->hlist.next);
+
+ write_unlock_bh(&instances_lock);
+
+ return inst;
+
+out_free:
+ instance_put(inst);
+out_unlock:
+ write_unlock_bh(&instances_lock);
+ return NULL;
+}
+
+static int __nfulnl_send(struct nfulnl_instance *inst);
+
+static void
+_instance_destroy2(struct nfulnl_instance *inst, int lock)
+{
+ /* first pull it out of the global list */
+ if (lock)
+ write_lock_bh(&instances_lock);
+
+ UDEBUG("removing instance %p (queuenum=%u) from hash\n",
+ inst, inst->group_num);
+
+ hlist_del(&inst->hlist);
+
+ if (lock)
+ write_unlock_bh(&instances_lock);
+
+ /* then flush all pending packets from skb */
+
+ spin_lock_bh(&inst->lock);
+ if (inst->skb) {
+ if (inst->qlen)
+ __nfulnl_send(inst);
+ if (inst->skb) {
+ kfree_skb(inst->skb);
+ inst->skb = NULL;
+ }
+ }
+ spin_unlock_bh(&inst->lock);
+
+ /* and finally put the refcount */
+ instance_put(inst);
+
+ module_put(THIS_MODULE);
+}
+
+static inline void
+__instance_destroy(struct nfulnl_instance *inst)
+{
+ _instance_destroy2(inst, 0);
+}
+
+static inline void
+instance_destroy(struct nfulnl_instance *inst)
+{
+ _instance_destroy2(inst, 1);
+}
+
+static int
+nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
+ unsigned int range)
+{
+ int status = 0;
+
+ spin_lock_bh(&inst->lock);
+
+ switch (mode) {
+ case NFULNL_COPY_NONE:
+ case NFULNL_COPY_META:
+ inst->copy_mode = mode;
+ inst->copy_range = 0;
+ break;
+
+ case NFULNL_COPY_PACKET:
+ inst->copy_mode = mode;
+ /* we're using struct nfattr which has 16bit nfa_len */
+ if (range > 0xffff)
+ inst->copy_range = 0xffff;
+ else
+ inst->copy_range = range;
+ break;
+
+ default:
+ status = -EINVAL;
+ break;
+ }
+
+ spin_unlock_bh(&inst->lock);
+
+ return status;
+}
+
+static int
+nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
+{
+ int status;
+
+ spin_lock_bh(&inst->lock);
+ if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT)
+ status = -ERANGE;
+ else if (nlbufsiz > 131072)
+ status = -ERANGE;
+ else {
+ inst->nlbufsiz = nlbufsiz;
+ status = 0;
+ }
+ spin_unlock_bh(&inst->lock);
+
+ return status;
+}
+
+static int
+nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
+{
+ spin_lock_bh(&inst->lock);
+ inst->flushtimeout = timeout;
+ spin_unlock_bh(&inst->lock);
+
+ return 0;
+}
+
+static int
+nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
+{
+ spin_lock_bh(&inst->lock);
+ inst->qthreshold = qthresh;
+ spin_unlock_bh(&inst->lock);
+
+ return 0;
+}
+
+static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size,
+ unsigned int pkt_size)
+{
+ struct sk_buff *skb;
+
+ UDEBUG("entered (%u, %u)\n", inst_size, pkt_size);
+
+ /* alloc skb which should be big enough for a whole multipart
+ * message. WARNING: has to be <= 128k due to slab restrictions */
+
+ skb = alloc_skb(inst_size, GFP_ATOMIC);
+ if (!skb) {
+ PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
+ inst_size);
+
+ /* try to allocate only as much as we need for current
+ * packet */
+
+ skb = alloc_skb(pkt_size, GFP_ATOMIC);
+ if (!skb)
+ PRINTR("nfnetlink_log: can't even alloc %u bytes\n",
+ pkt_size);
+ }
+
+ return skb;
+}
+
+static int
+__nfulnl_send(struct nfulnl_instance *inst)
+{
+ int status;
+
+ if (timer_pending(&inst->timer))
+ del_timer(&inst->timer);
+
+ if (inst->qlen > 1)
+ inst->lastnlh->nlmsg_type = NLMSG_DONE;
+
+ status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT);
+ if (status < 0) {
+ UDEBUG("netlink_unicast() failed\n");
+ /* FIXME: statistics */
+ }
+
+ inst->qlen = 0;
+ inst->skb = NULL;
+ inst->lastnlh = NULL;
+
+ return status;
+}
+
+static void nfulnl_timer(unsigned long data)
+{
+ struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
+
+ UDEBUG("timer function called, flushing buffer\n");
+
+ spin_lock_bh(&inst->lock);
+ __nfulnl_send(inst);
+ instance_put(inst);
+ spin_unlock_bh(&inst->lock);
+}
+
+static inline int
+__build_packet_message(struct nfulnl_instance *inst,
+ const struct sk_buff *skb,
+ unsigned int data_len,
+ unsigned int pf,
+ unsigned int hooknum,
+ const struct net_device *indev,
+ const struct net_device *outdev,
+ const struct nf_loginfo *li,
+ const char *prefix)
+{
+ unsigned char *old_tail;
+ struct nfulnl_msg_packet_hdr pmsg;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ u_int32_t tmp_uint;
+
+ UDEBUG("entered\n");
+
+ old_tail = inst->skb->tail;
+ nlh = NLMSG_PUT(inst->skb, 0, 0,
+ NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
+ sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+ nfmsg->nfgen_family = pf;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(inst->group_num);
+
+ pmsg.hw_protocol = htons(skb->protocol);
+ pmsg.hook = hooknum;
+
+ NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+ if (prefix) {
+ int slen = strlen(prefix);
+ if (slen > NFULNL_PREFIXLEN)
+ slen = NFULNL_PREFIXLEN;
+ NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix);
+ }
+
+ if (indev) {
+ tmp_uint = htonl(indev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+ NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint),
+ &tmp_uint);
+#else
+ if (pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical input device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ /* this is the bridge group "brX" */
+ tmp_uint = htonl(indev->br_port->br->dev->ifindex);
+ NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ } else {
+ /* Case 2: indev is bridge group, we need to look for
+ * physical device (when called from ipv4) */
+ NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ if (skb->nf_bridge && skb->nf_bridge->physindev) {
+ tmp_uint =
+ htonl(skb->nf_bridge->physindev->ifindex);
+ NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ }
+ }
+#endif
+ }
+
+ if (outdev) {
+ tmp_uint = htonl(outdev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+ NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+ &tmp_uint);
+#else
+ if (pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical output device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ /* this is the bridge group "brX" */
+ tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
+ NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ } else {
+ /* Case 2: indev is a bridge group, we need to look
+ * for physical device (when called from ipv4) */
+ NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ if (skb->nf_bridge) {
+ tmp_uint =
+ htonl(skb->nf_bridge->physoutdev->ifindex);
+ NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ }
+ }
+#endif
+ }
+
+ if (skb->nfmark) {
+ tmp_uint = htonl(skb->nfmark);
+ NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint);
+ }
+
+ if (indev && skb->dev && skb->dev->hard_header_parse) {
+ struct nfulnl_msg_packet_hw phw;
+
+ phw.hw_addrlen =
+ skb->dev->hard_header_parse((struct sk_buff *)skb,
+ phw.hw_addr);
+ phw.hw_addrlen = htons(phw.hw_addrlen);
+ NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
+ }
+
+ if (skb->tstamp.off_sec) {
+ struct nfulnl_msg_packet_timestamp ts;
+
+ ts.sec = cpu_to_be64(skb_tv_base.tv_sec + skb->tstamp.off_sec);
+ ts.usec = cpu_to_be64(skb_tv_base.tv_usec + skb->tstamp.off_usec);
+
+ NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
+ }
+
+ /* UID */
+ if (skb->sk) {
+ read_lock_bh(&skb->sk->sk_callback_lock);
+ if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
+ u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid);
+ /* need to unlock here since NFA_PUT may goto */
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid);
+ } else
+ read_unlock_bh(&skb->sk->sk_callback_lock);
+ }
+
+ if (data_len) {
+ struct nfattr *nfa;
+ int size = NFA_LENGTH(data_len);
+
+ if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) {
+ printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
+ goto nlmsg_failure;
+ }
+
+ nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size));
+ nfa->nfa_type = NFULA_PAYLOAD;
+ nfa->nfa_len = size;
+
+ if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len))
+ BUG();
+ }
+
+ nlh->nlmsg_len = inst->skb->tail - old_tail;
+ return 0;
+
+nlmsg_failure:
+ UDEBUG("nlmsg_failure\n");
+nfattr_failure:
+ PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
+ return -1;
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_ULOG,
+ .u = {
+ .ulog = {
+ .copy_len = 0xffff,
+ .group = 0,
+ .qthreshold = 1,
+ },
+ },
+};
+
+/* log handler for internal netfilter logging api */
+static void
+nfulnl_log_packet(unsigned int pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *li_user,
+ const char *prefix)
+{
+ unsigned int size, data_len;
+ struct nfulnl_instance *inst;
+ const struct nf_loginfo *li;
+ unsigned int qthreshold;
+ unsigned int nlbufsiz;
+
+ if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
+ li = li_user;
+ else
+ li = &default_loginfo;
+
+ inst = instance_lookup_get(li->u.ulog.group);
+ if (!inst)
+ inst = instance_lookup_get(0);
+ if (!inst) {
+ PRINTR("nfnetlink_log: trying to log packet, "
+ "but no instance for group %u\n", li->u.ulog.group);
+ return;
+ }
+
+ /* all macros expand to constant values at compile time */
+ /* FIXME: do we want to make the size calculation conditional based on
+ * what is actually present? way more branches and checks, but more
+ * memory efficient... */
+ size = NLMSG_SPACE(sizeof(struct nfgenmsg))
+ + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr))
+ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
+ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
+ + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */
+#endif
+ + NFA_SPACE(sizeof(u_int32_t)) /* mark */
+ + NFA_SPACE(sizeof(u_int32_t)) /* uid */
+ + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */
+ + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw))
+ + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp));
+
+ UDEBUG("initial size=%u\n", size);
+
+ spin_lock_bh(&inst->lock);
+
+ qthreshold = inst->qthreshold;
+ /* per-rule qthreshold overrides per-instance */
+ if (qthreshold > li->u.ulog.qthreshold)
+ qthreshold = li->u.ulog.qthreshold;
+
+ switch (inst->copy_mode) {
+ case NFULNL_COPY_META:
+ case NFULNL_COPY_NONE:
+ data_len = 0;
+ break;
+
+ case NFULNL_COPY_PACKET:
+ if (inst->copy_range == 0
+ || inst->copy_range > skb->len)
+ data_len = skb->len;
+ else
+ data_len = inst->copy_range;
+
+ size += NFA_SPACE(data_len);
+ UDEBUG("copy_packet, therefore size now %u\n", size);
+ break;
+
+ default:
+ spin_unlock_bh(&inst->lock);
+ instance_put(inst);
+ return;
+ }
+
+ if (size > inst->nlbufsiz)
+ nlbufsiz = size;
+ else
+ nlbufsiz = inst->nlbufsiz;
+
+ if (!inst->skb) {
+ if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
+ UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
+ inst->nlbufsiz, size);
+ goto alloc_failure;
+ }
+ } else if (inst->qlen >= qthreshold ||
+ size > skb_tailroom(inst->skb)) {
+ /* either the queue len is too high or we don't have
+ * enough room in the skb left. flush to userspace. */
+ UDEBUG("flushing old skb\n");
+
+ __nfulnl_send(inst);
+
+ if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
+ UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
+ inst->nlbufsiz, size);
+ goto alloc_failure;
+ }
+ }
+
+ UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold);
+ inst->qlen++;
+
+ __build_packet_message(inst, skb, data_len, pf,
+ hooknum, in, out, li, prefix);
+
+ /* timer_pending always called within inst->lock, so there
+ * is no chance of a race here */
+ if (!timer_pending(&inst->timer)) {
+ instance_get(inst);
+ inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
+ add_timer(&inst->timer);
+ }
+ spin_unlock_bh(&inst->lock);
+
+ return;
+
+alloc_failure:
+ spin_unlock_bh(&inst->lock);
+ instance_put(inst);
+ UDEBUG("error allocating skb\n");
+ /* FIXME: statistics */
+}
+
+static int
+nfulnl_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+
+ if (event == NETLINK_URELEASE &&
+ n->protocol == NETLINK_NETFILTER && n->pid) {
+ int i;
+
+ /* destroy all instances for this pid */
+ write_lock_bh(&instances_lock);
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct hlist_node *tmp, *t2;
+ struct nfulnl_instance *inst;
+ struct hlist_head *head = &instance_table[i];
+
+ hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
+ UDEBUG("node = %p\n", inst);
+ if (n->pid == inst->peer_pid)
+ __instance_destroy(inst);
+ }
+ }
+ write_unlock_bh(&instances_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfulnl_rtnl_notifier = {
+ .notifier_call = nfulnl_rcv_nl_event,
+};
+
+static int
+nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+ return -ENOTSUPP;
+}
+
+static struct nf_logger nfulnl_logger = {
+ .name = "nfnetlink_log",
+ .logfn = &nfulnl_log_packet,
+ .me = THIS_MODULE,
+};
+
+static const int nfula_min[NFULA_MAX] = {
+ [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr),
+ [NFULA_MARK-1] = sizeof(u_int32_t),
+ [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp),
+ [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t),
+ [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
+ [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw),
+ [NFULA_PAYLOAD-1] = 0,
+ [NFULA_PREFIX-1] = 0,
+ [NFULA_UID-1] = sizeof(u_int32_t),
+};
+
+static const int nfula_cfg_min[NFULA_CFG_MAX] = {
+ [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd),
+ [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode),
+ [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t),
+ [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t),
+ [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t),
+};
+
+static int
+nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp)
+{
+ struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ u_int16_t group_num = ntohs(nfmsg->res_id);
+ struct nfulnl_instance *inst;
+ int ret = 0;
+
+ UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+ if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) {
+ UDEBUG("bad attribute size\n");
+ return -EINVAL;
+ }
+
+ inst = instance_lookup_get(group_num);
+ if (nfula[NFULA_CFG_CMD-1]) {
+ u_int8_t pf = nfmsg->nfgen_family;
+ struct nfulnl_msg_config_cmd *cmd;
+ cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]);
+ UDEBUG("found CFG_CMD for\n");
+
+ switch (cmd->command) {
+ case NFULNL_CFG_CMD_BIND:
+ if (inst) {
+ ret = -EBUSY;
+ goto out_put;
+ }
+
+ inst = instance_create(group_num,
+ NETLINK_CB(skb).pid);
+ if (!inst) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+ break;
+ case NFULNL_CFG_CMD_UNBIND:
+ if (!inst) {
+ ret = -ENODEV;
+ goto out_put;
+ }
+
+ if (inst->peer_pid != NETLINK_CB(skb).pid) {
+ ret = -EPERM;
+ goto out_put;
+ }
+
+ instance_destroy(inst);
+ break;
+ case NFULNL_CFG_CMD_PF_BIND:
+ UDEBUG("registering log handler for pf=%u\n", pf);
+ ret = nf_log_register(pf, &nfulnl_logger);
+ break;
+ case NFULNL_CFG_CMD_PF_UNBIND:
+ UDEBUG("unregistering log handler for pf=%u\n", pf);
+ /* This is a bug and a feature. We cannot unregister
+ * other handlers, like nfnetlink_inst can */
+ nf_log_unregister_pf(pf);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ } else {
+ if (!inst) {
+ UDEBUG("no config command, and no instance for "
+ "group=%u pid=%u =>ENOENT\n",
+ group_num, NETLINK_CB(skb).pid);
+ ret = -ENOENT;
+ goto out_put;
+ }
+
+ if (inst->peer_pid != NETLINK_CB(skb).pid) {
+ UDEBUG("no config command, and wrong pid\n");
+ ret = -EPERM;
+ goto out_put;
+ }
+ }
+
+ if (nfula[NFULA_CFG_MODE-1]) {
+ struct nfulnl_msg_config_mode *params;
+ params = NFA_DATA(nfula[NFULA_CFG_MODE-1]);
+
+ nfulnl_set_mode(inst, params->copy_mode,
+ ntohs(params->copy_range));
+ }
+
+ if (nfula[NFULA_CFG_TIMEOUT-1]) {
+ u_int32_t timeout =
+ *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]);
+
+ nfulnl_set_timeout(inst, ntohl(timeout));
+ }
+
+ if (nfula[NFULA_CFG_NLBUFSIZ-1]) {
+ u_int32_t nlbufsiz =
+ *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]);
+
+ nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
+ }
+
+ if (nfula[NFULA_CFG_QTHRESH-1]) {
+ u_int32_t qthresh =
+ *(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]);
+
+ nfulnl_set_qthresh(inst, ntohl(qthresh));
+ }
+
+out_put:
+ instance_put(inst);
+ return ret;
+}
+
+static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
+ [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
+ .attr_count = NFULA_MAX,
+ .cap_required = CAP_NET_ADMIN, },
+ [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
+ .attr_count = NFULA_CFG_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem nfulnl_subsys = {
+ .name = "log",
+ .subsys_id = NFNL_SUBSYS_ULOG,
+ .cb_count = NFULNL_MSG_MAX,
+ .cb = nfulnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+ unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+ struct iter_state *st = seq->private;
+
+ if (!st)
+ return NULL;
+
+ for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+ if (!hlist_empty(&instance_table[st->bucket]))
+ return instance_table[st->bucket].first;
+ }
+ return NULL;
+}
+
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+ struct iter_state *st = seq->private;
+
+ h = h->next;
+ while (!h) {
+ if (++st->bucket >= INSTANCE_BUCKETS)
+ return NULL;
+
+ h = instance_table[st->bucket].first;
+ }
+ return h;
+}
+
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head;
+ head = get_first(seq);
+
+ if (head)
+ while (pos && (head = get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock_bh(&instances_lock);
+ return get_idx(seq, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return get_next(s, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+ read_unlock_bh(&instances_lock);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ const struct nfulnl_instance *inst = v;
+
+ return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n",
+ inst->group_num,
+ inst->peer_pid, inst->qlen,
+ inst->copy_mode, inst->copy_range,
+ inst->flushtimeout, atomic_read(&inst->use));
+}
+
+static struct seq_operations nful_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nful_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct iter_state *is;
+ int ret;
+
+ is = kmalloc(sizeof(*is), GFP_KERNEL);
+ if (!is)
+ return -ENOMEM;
+ memset(is, 0, sizeof(*is));
+ ret = seq_open(file, &nful_seq_ops);
+ if (ret < 0)
+ goto out_free;
+ seq = file->private_data;
+ seq->private = is;
+ return ret;
+out_free:
+ kfree(is);
+ return ret;
+}
+
+static struct file_operations nful_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nful_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+static int
+init_or_cleanup(int init)
+{
+ int i, status = -ENOMEM;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_nful;
+#endif
+
+ if (!init)
+ goto cleanup;
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ INIT_HLIST_HEAD(&instance_table[i]);
+
+ /* it's not really all that important to have a random value, so
+ * we can do this from the init function, even if there hasn't
+ * been that much entropy yet */
+ get_random_bytes(&hash_init, sizeof(hash_init));
+
+ netlink_register_notifier(&nfulnl_rtnl_notifier);
+ status = nfnetlink_subsys_register(&nfulnl_subsys);
+ if (status < 0) {
+ printk(KERN_ERR "log: failed to create netlink socket\n");
+ goto cleanup_netlink_notifier;
+ }
+
+#ifdef CONFIG_PROC_FS
+ proc_nful = create_proc_entry("nfnetlink_log", 0440,
+ proc_net_netfilter);
+ if (!proc_nful)
+ goto cleanup_subsys;
+ proc_nful->proc_fops = &nful_file_ops;
+#endif
+
+ return status;
+
+cleanup:
+ nf_log_unregister_logger(&nfulnl_logger);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_log", proc_net_netfilter);
+cleanup_subsys:
+#endif
+ nfnetlink_subsys_unregister(&nfulnl_subsys);
+cleanup_netlink_notifier:
+ netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+ return status;
+}
+
+static int __init init(void)
+{
+
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+MODULE_DESCRIPTION("netfilter userspace logging");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
new file mode 100644
index 0000000..f81fe8c
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue.c
@@ -0,0 +1,1127 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFQNL_QMAX_DEFAULT 1024
+
+#if 0
+#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \
+ __FILE__, __LINE__, __FUNCTION__, \
+ ## args)
+#else
+#define QDEBUG(x, ...)
+#endif
+
+struct nfqnl_queue_entry {
+ struct list_head list;
+ struct nf_info *info;
+ struct sk_buff *skb;
+ unsigned int id;
+};
+
+struct nfqnl_instance {
+ struct hlist_node hlist; /* global list of queues */
+ atomic_t use;
+
+ int peer_pid;
+ unsigned int queue_maxlen;
+ unsigned int copy_range;
+ unsigned int queue_total;
+ unsigned int queue_dropped;
+ unsigned int queue_user_dropped;
+
+ atomic_t id_sequence; /* 'sequence' of pkt ids */
+
+ u_int16_t queue_num; /* number of this queue */
+ u_int8_t copy_mode;
+
+ spinlock_t lock;
+
+ struct list_head queue_list; /* packets in queue */
+};
+
+typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long);
+
+static DEFINE_RWLOCK(instances_lock);
+
+#define INSTANCE_BUCKETS 16
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+ return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
+}
+
+static struct nfqnl_instance *
+__instance_lookup(u_int16_t queue_num)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct nfqnl_instance *inst;
+
+ head = &instance_table[instance_hashfn(queue_num)];
+ hlist_for_each_entry(inst, pos, head, hlist) {
+ if (inst->queue_num == queue_num)
+ return inst;
+ }
+ return NULL;
+}
+
+static struct nfqnl_instance *
+instance_lookup_get(u_int16_t queue_num)
+{
+ struct nfqnl_instance *inst;
+
+ read_lock_bh(&instances_lock);
+ inst = __instance_lookup(queue_num);
+ if (inst)
+ atomic_inc(&inst->use);
+ read_unlock_bh(&instances_lock);
+
+ return inst;
+}
+
+static void
+instance_put(struct nfqnl_instance *inst)
+{
+ if (inst && atomic_dec_and_test(&inst->use)) {
+ QDEBUG("kfree(inst=%p)\n", inst);
+ kfree(inst);
+ }
+}
+
+static struct nfqnl_instance *
+instance_create(u_int16_t queue_num, int pid)
+{
+ struct nfqnl_instance *inst;
+
+ QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid);
+
+ write_lock_bh(&instances_lock);
+ if (__instance_lookup(queue_num)) {
+ inst = NULL;
+ QDEBUG("aborting, instance already exists\n");
+ goto out_unlock;
+ }
+
+ inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+ if (!inst)
+ goto out_unlock;
+
+ memset(inst, 0, sizeof(*inst));
+ inst->queue_num = queue_num;
+ inst->peer_pid = pid;
+ inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+ inst->copy_range = 0xfffff;
+ inst->copy_mode = NFQNL_COPY_NONE;
+ atomic_set(&inst->id_sequence, 0);
+ /* needs to be two, since we _put() after creation */
+ atomic_set(&inst->use, 2);
+ inst->lock = SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&inst->queue_list);
+
+ if (!try_module_get(THIS_MODULE))
+ goto out_free;
+
+ hlist_add_head(&inst->hlist,
+ &instance_table[instance_hashfn(queue_num)]);
+
+ write_unlock_bh(&instances_lock);
+
+ QDEBUG("successfully created new instance\n");
+
+ return inst;
+
+out_free:
+ kfree(inst);
+out_unlock:
+ write_unlock_bh(&instances_lock);
+ return NULL;
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, int verdict);
+
+static void
+_instance_destroy2(struct nfqnl_instance *inst, int lock)
+{
+ /* first pull it out of the global list */
+ if (lock)
+ write_lock_bh(&instances_lock);
+
+ QDEBUG("removing instance %p (queuenum=%u) from hash\n",
+ inst, inst->queue_num);
+ hlist_del(&inst->hlist);
+
+ if (lock)
+ write_unlock_bh(&instances_lock);
+
+ /* then flush all pending skbs from the queue */
+ nfqnl_flush(inst, NF_DROP);
+
+ /* and finally put the refcount */
+ instance_put(inst);
+
+ module_put(THIS_MODULE);
+}
+
+static inline void
+__instance_destroy(struct nfqnl_instance *inst)
+{
+ _instance_destroy2(inst, 0);
+}
+
+static inline void
+instance_destroy(struct nfqnl_instance *inst)
+{
+ _instance_destroy2(inst, 1);
+}
+
+
+
+static void
+issue_verdict(struct nfqnl_queue_entry *entry, int verdict)
+{
+ QDEBUG("entering for entry %p, verdict %u\n", entry, verdict);
+
+ /* TCP input path (and probably other bits) assume to be called
+ * from softirq context, not from syscall, like issue_verdict is
+ * called. TCP input path deadlocks with locks taken from timer
+ * softirq, e.g. We therefore emulate this by local_bh_disable() */
+
+ local_bh_disable();
+ nf_reinject(entry->skb, entry->info, verdict);
+ local_bh_enable();
+
+ kfree(entry);
+}
+
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue,
+ struct nfqnl_queue_entry *entry)
+{
+ list_add(&entry->list, &queue->queue_list);
+ queue->queue_total++;
+}
+
+/*
+ * Find and return a queued entry matched by cmpfn, or return the last
+ * entry if cmpfn is NULL.
+ */
+static inline struct nfqnl_queue_entry *
+__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
+ unsigned long data)
+{
+ struct list_head *p;
+
+ list_for_each_prev(p, &queue->queue_list) {
+ struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p;
+
+ if (!cmpfn || cmpfn(entry, data))
+ return entry;
+ }
+ return NULL;
+}
+
+static inline void
+__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry)
+{
+ list_del(&entry->list);
+ q->queue_total--;
+}
+
+static inline struct nfqnl_queue_entry *
+__find_dequeue_entry(struct nfqnl_instance *queue,
+ nfqnl_cmpfn cmpfn, unsigned long data)
+{
+ struct nfqnl_queue_entry *entry;
+
+ entry = __find_entry(queue, cmpfn, data);
+ if (entry == NULL)
+ return NULL;
+
+ __dequeue_entry(queue, entry);
+ return entry;
+}
+
+
+static inline void
+__nfqnl_flush(struct nfqnl_instance *queue, int verdict)
+{
+ struct nfqnl_queue_entry *entry;
+
+ while ((entry = __find_dequeue_entry(queue, NULL, 0)))
+ issue_verdict(entry, verdict);
+}
+
+static inline int
+__nfqnl_set_mode(struct nfqnl_instance *queue,
+ unsigned char mode, unsigned int range)
+{
+ int status = 0;
+
+ switch (mode) {
+ case NFQNL_COPY_NONE:
+ case NFQNL_COPY_META:
+ queue->copy_mode = mode;
+ queue->copy_range = 0;
+ break;
+
+ case NFQNL_COPY_PACKET:
+ queue->copy_mode = mode;
+ /* we're using struct nfattr which has 16bit nfa_len */
+ if (range > 0xffff)
+ queue->copy_range = 0xffff;
+ else
+ queue->copy_range = range;
+ break;
+
+ default:
+ status = -EINVAL;
+
+ }
+ return status;
+}
+
+static struct nfqnl_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue,
+ nfqnl_cmpfn cmpfn, unsigned long data)
+{
+ struct nfqnl_queue_entry *entry;
+
+ spin_lock_bh(&queue->lock);
+ entry = __find_dequeue_entry(queue, cmpfn, data);
+ spin_unlock_bh(&queue->lock);
+
+ return entry;
+}
+
+static void
+nfqnl_flush(struct nfqnl_instance *queue, int verdict)
+{
+ spin_lock_bh(&queue->lock);
+ __nfqnl_flush(queue, verdict);
+ spin_unlock_bh(&queue->lock);
+}
+
+static struct sk_buff *
+nfqnl_build_packet_message(struct nfqnl_instance *queue,
+ struct nfqnl_queue_entry *entry, int *errp)
+{
+ unsigned char *old_tail;
+ size_t size;
+ size_t data_len = 0;
+ struct sk_buff *skb;
+ struct nfqnl_msg_packet_hdr pmsg;
+ struct nlmsghdr *nlh;
+ struct nfgenmsg *nfmsg;
+ unsigned int tmp_uint;
+
+ QDEBUG("entered\n");
+
+ /* all macros expand to constant values at compile time */
+ size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr))
+ + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
+ + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+ + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
+ + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */
+#endif
+ + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */
+ + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw))
+ + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp));
+
+ spin_lock_bh(&queue->lock);
+
+ switch (queue->copy_mode) {
+ case NFQNL_COPY_META:
+ case NFQNL_COPY_NONE:
+ data_len = 0;
+ break;
+
+ case NFQNL_COPY_PACKET:
+ if (entry->skb->ip_summed == CHECKSUM_HW &&
+ (*errp = skb_checksum_help(entry->skb,
+ entry->info->outdev == NULL))) {
+ spin_unlock_bh(&queue->lock);
+ return NULL;
+ }
+ if (queue->copy_range == 0
+ || queue->copy_range > entry->skb->len)
+ data_len = entry->skb->len;
+ else
+ data_len = queue->copy_range;
+
+ size += NLMSG_SPACE(data_len);
+ break;
+
+ default:
+ *errp = -EINVAL;
+ spin_unlock_bh(&queue->lock);
+ return NULL;
+ }
+
+ spin_unlock_bh(&queue->lock);
+
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ goto nlmsg_failure;
+
+ old_tail= skb->tail;
+ nlh = NLMSG_PUT(skb, 0, 0,
+ NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+ sizeof(struct nfgenmsg));
+ nfmsg = NLMSG_DATA(nlh);
+ nfmsg->nfgen_family = entry->info->pf;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = htons(queue->queue_num);
+
+ pmsg.packet_id = htonl(entry->id);
+ pmsg.hw_protocol = htons(entry->skb->protocol);
+ pmsg.hook = entry->info->hook;
+
+ NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+ if (entry->info->indev) {
+ tmp_uint = htonl(entry->info->indev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+ NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint);
+#else
+ if (entry->info->pf == PF_BRIDGE) {
+ /* Case 1: indev is physical input device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint),
+ &tmp_uint);
+ /* this is the bridge group "brX" */
+ tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex);
+ NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
+ &tmp_uint);
+ } else {
+ /* Case 2: indev is bridge group, we need to look for
+ * physical device (when called from ipv4) */
+ NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
+ &tmp_uint);
+ if (entry->skb->nf_bridge
+ && entry->skb->nf_bridge->physindev) {
+ tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex);
+ NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ }
+ }
+#endif
+ }
+
+ if (entry->info->outdev) {
+ tmp_uint = htonl(entry->info->outdev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+ NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint);
+#else
+ if (entry->info->pf == PF_BRIDGE) {
+ /* Case 1: outdev is physical output device, we need to
+ * look for bridge group (when called from
+ * netfilter_bridge) */
+ NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint),
+ &tmp_uint);
+ /* this is the bridge group "brX" */
+ tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex);
+ NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+ &tmp_uint);
+ } else {
+ /* Case 2: outdev is bridge group, we need to look for
+ * physical output device (when called from ipv4) */
+ NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+ &tmp_uint);
+ if (entry->skb->nf_bridge
+ && entry->skb->nf_bridge->physoutdev) {
+ tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex);
+ NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV,
+ sizeof(tmp_uint), &tmp_uint);
+ }
+ }
+#endif
+ }
+
+ if (entry->skb->nfmark) {
+ tmp_uint = htonl(entry->skb->nfmark);
+ NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint);
+ }
+
+ if (entry->info->indev && entry->skb->dev
+ && entry->skb->dev->hard_header_parse) {
+ struct nfqnl_msg_packet_hw phw;
+
+ phw.hw_addrlen =
+ entry->skb->dev->hard_header_parse(entry->skb,
+ phw.hw_addr);
+ phw.hw_addrlen = htons(phw.hw_addrlen);
+ NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
+ }
+
+ if (entry->skb->tstamp.off_sec) {
+ struct nfqnl_msg_packet_timestamp ts;
+
+ ts.sec = cpu_to_be64(skb_tv_base.tv_sec + entry->skb->tstamp.off_sec);
+ ts.usec = cpu_to_be64(skb_tv_base.tv_usec + entry->skb->tstamp.off_usec);
+
+ NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
+ }
+
+ if (data_len) {
+ struct nfattr *nfa;
+ int size = NFA_LENGTH(data_len);
+
+ if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) {
+ printk(KERN_WARNING "nf_queue: no tailroom!\n");
+ goto nlmsg_failure;
+ }
+
+ nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
+ nfa->nfa_type = NFQA_PAYLOAD;
+ nfa->nfa_len = size;
+
+ if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len))
+ BUG();
+ }
+
+ nlh->nlmsg_len = skb->tail - old_tail;
+ return skb;
+
+nlmsg_failure:
+nfattr_failure:
+ if (skb)
+ kfree_skb(skb);
+ *errp = -EINVAL;
+ if (net_ratelimit())
+ printk(KERN_ERR "nf_queue: error creating packet message\n");
+ return NULL;
+}
+
+static int
+nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+ unsigned int queuenum, void *data)
+{
+ int status = -EINVAL;
+ struct sk_buff *nskb;
+ struct nfqnl_instance *queue;
+ struct nfqnl_queue_entry *entry;
+
+ QDEBUG("entered\n");
+
+ queue = instance_lookup_get(queuenum);
+ if (!queue) {
+ QDEBUG("no queue instance matching\n");
+ return -EINVAL;
+ }
+
+ if (queue->copy_mode == NFQNL_COPY_NONE) {
+ QDEBUG("mode COPY_NONE, aborting\n");
+ status = -EAGAIN;
+ goto err_out_put;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL) {
+ if (net_ratelimit())
+ printk(KERN_ERR
+ "nf_queue: OOM in nfqnl_enqueue_packet()\n");
+ status = -ENOMEM;
+ goto err_out_put;
+ }
+
+ entry->info = info;
+ entry->skb = skb;
+ entry->id = atomic_inc_return(&queue->id_sequence);
+
+ nskb = nfqnl_build_packet_message(queue, entry, &status);
+ if (nskb == NULL)
+ goto err_out_free;
+
+ spin_lock_bh(&queue->lock);
+
+ if (!queue->peer_pid)
+ goto err_out_free_nskb;
+
+ if (queue->queue_total >= queue->queue_maxlen) {
+ queue->queue_dropped++;
+ status = -ENOSPC;
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip_queue: full at %d entries, "
+ "dropping packets(s). Dropped: %d\n",
+ queue->queue_total, queue->queue_dropped);
+ goto err_out_free_nskb;
+ }
+
+ /* nfnetlink_unicast will either free the nskb or add it to a socket */
+ status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT);
+ if (status < 0) {
+ queue->queue_user_dropped++;
+ goto err_out_unlock;
+ }
+
+ __enqueue_entry(queue, entry);
+
+ spin_unlock_bh(&queue->lock);
+ instance_put(queue);
+ return status;
+
+err_out_free_nskb:
+ kfree_skb(nskb);
+
+err_out_unlock:
+ spin_unlock_bh(&queue->lock);
+
+err_out_free:
+ kfree(entry);
+err_out_put:
+ instance_put(queue);
+ return status;
+}
+
+static int
+nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e)
+{
+ int diff;
+
+ diff = data_len - e->skb->len;
+ if (diff < 0)
+ skb_trim(e->skb, data_len);
+ else if (diff > 0) {
+ if (data_len > 0xFFFF)
+ return -EINVAL;
+ if (diff > skb_tailroom(e->skb)) {
+ struct sk_buff *newskb;
+
+ newskb = skb_copy_expand(e->skb,
+ skb_headroom(e->skb),
+ diff,
+ GFP_ATOMIC);
+ if (newskb == NULL) {
+ printk(KERN_WARNING "ip_queue: OOM "
+ "in mangle, dropping packet\n");
+ return -ENOMEM;
+ }
+ if (e->skb->sk)
+ skb_set_owner_w(newskb, e->skb->sk);
+ kfree_skb(e->skb);
+ e->skb = newskb;
+ }
+ skb_put(e->skb, diff);
+ }
+ if (!skb_make_writable(&e->skb, data_len))
+ return -ENOMEM;
+ memcpy(e->skb->data, data, data_len);
+ e->skb->ip_summed = CHECKSUM_NONE;
+ return 0;
+}
+
+static inline int
+id_cmp(struct nfqnl_queue_entry *e, unsigned long id)
+{
+ return (id == e->id);
+}
+
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+ unsigned char mode, unsigned int range)
+{
+ int status;
+
+ spin_lock_bh(&queue->lock);
+ status = __nfqnl_set_mode(queue, mode, range);
+ spin_unlock_bh(&queue->lock);
+
+ return status;
+}
+
+static int
+dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex)
+{
+ if (entry->info->indev)
+ if (entry->info->indev->ifindex == ifindex)
+ return 1;
+
+ if (entry->info->outdev)
+ if (entry->info->outdev->ifindex == ifindex)
+ return 1;
+
+ return 0;
+}
+
+/* drop all packets with either indev or outdev == ifindex from all queue
+ * instances */
+static void
+nfqnl_dev_drop(int ifindex)
+{
+ int i;
+
+ QDEBUG("entering for ifindex %u\n", ifindex);
+
+ /* this only looks like we have to hold the readlock for a way too long
+ * time, issue_verdict(), nf_reinject(), ... - but we always only
+ * issue NF_DROP, which is processed directly in nf_reinject() */
+ read_lock_bh(&instances_lock);
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct hlist_node *tmp;
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &instance_table[i];
+
+ hlist_for_each_entry(inst, tmp, head, hlist) {
+ struct nfqnl_queue_entry *entry;
+ while ((entry = find_dequeue_entry(inst, dev_cmp,
+ ifindex)) != NULL)
+ issue_verdict(entry, NF_DROP);
+ }
+ }
+
+ read_unlock_bh(&instances_lock);
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+
+ /* Drop any packets associated with the downed device */
+ if (event == NETDEV_DOWN)
+ nfqnl_dev_drop(dev->ifindex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_dev_notifier = {
+ .notifier_call = nfqnl_rcv_dev_event,
+};
+
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct netlink_notify *n = ptr;
+
+ if (event == NETLINK_URELEASE &&
+ n->protocol == NETLINK_NETFILTER && n->pid) {
+ int i;
+
+ /* destroy all instances for this pid */
+ write_lock_bh(&instances_lock);
+ for (i = 0; i < INSTANCE_BUCKETS; i++) {
+ struct hlist_node *tmp, *t2;
+ struct nfqnl_instance *inst;
+ struct hlist_head *head = &instance_table[i];
+
+ hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
+ if (n->pid == inst->peer_pid)
+ __instance_destroy(inst);
+ }
+ }
+ write_unlock_bh(&instances_lock);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_rtnl_notifier = {
+ .notifier_call = nfqnl_rcv_nl_event,
+};
+
+static const int nfqa_verdict_min[NFQA_MAX] = {
+ [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr),
+ [NFQA_MARK-1] = sizeof(u_int32_t),
+ [NFQA_PAYLOAD-1] = 0,
+};
+
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+ struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ struct nfqnl_instance *queue;
+ unsigned int verdict;
+ struct nfqnl_queue_entry *entry;
+ int err;
+
+ if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) {
+ QDEBUG("bad attribute size\n");
+ return -EINVAL;
+ }
+
+ queue = instance_lookup_get(queue_num);
+ if (!queue)
+ return -ENODEV;
+
+ if (queue->peer_pid != NETLINK_CB(skb).pid) {
+ err = -EPERM;
+ goto err_out_put;
+ }
+
+ if (!nfqa[NFQA_VERDICT_HDR-1]) {
+ err = -EINVAL;
+ goto err_out_put;
+ }
+
+ vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]);
+ verdict = ntohl(vhdr->verdict);
+
+ if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
+ err = -EINVAL;
+ goto err_out_put;
+ }
+
+ entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id));
+ if (entry == NULL) {
+ err = -ENOENT;
+ goto err_out_put;
+ }
+
+ if (nfqa[NFQA_PAYLOAD-1]) {
+ if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]),
+ NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0)
+ verdict = NF_DROP;
+ }
+
+ if (nfqa[NFQA_MARK-1])
+ skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1]));
+
+ issue_verdict(entry, verdict);
+ instance_put(queue);
+ return 0;
+
+err_out_put:
+ instance_put(queue);
+ return err;
+}
+
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+ return -ENOTSUPP;
+}
+
+static const int nfqa_cfg_min[NFQA_CFG_MAX] = {
+ [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd),
+ [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params),
+};
+
+static struct nf_queue_handler nfqh = {
+ .name = "nf_queue",
+ .outfn = &nfqnl_enqueue_packet,
+};
+
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+ struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+ struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+ u_int16_t queue_num = ntohs(nfmsg->res_id);
+ struct nfqnl_instance *queue;
+ int ret = 0;
+
+ QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+ if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) {
+ QDEBUG("bad attribute size\n");
+ return -EINVAL;
+ }
+
+ queue = instance_lookup_get(queue_num);
+ if (nfqa[NFQA_CFG_CMD-1]) {
+ struct nfqnl_msg_config_cmd *cmd;
+ cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]);
+ QDEBUG("found CFG_CMD\n");
+
+ switch (cmd->command) {
+ case NFQNL_CFG_CMD_BIND:
+ if (queue)
+ return -EBUSY;
+
+ queue = instance_create(queue_num, NETLINK_CB(skb).pid);
+ if (!queue)
+ return -EINVAL;
+ break;
+ case NFQNL_CFG_CMD_UNBIND:
+ if (!queue)
+ return -ENODEV;
+
+ if (queue->peer_pid != NETLINK_CB(skb).pid) {
+ ret = -EPERM;
+ goto out_put;
+ }
+
+ instance_destroy(queue);
+ break;
+ case NFQNL_CFG_CMD_PF_BIND:
+ QDEBUG("registering queue handler for pf=%u\n",
+ ntohs(cmd->pf));
+ ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh);
+ break;
+ case NFQNL_CFG_CMD_PF_UNBIND:
+ QDEBUG("unregistering queue handler for pf=%u\n",
+ ntohs(cmd->pf));
+ /* This is a bug and a feature. We can unregister
+ * other handlers(!) */
+ ret = nf_unregister_queue_handler(ntohs(cmd->pf));
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ } else {
+ if (!queue) {
+ QDEBUG("no config command, and no instance ENOENT\n");
+ ret = -ENOENT;
+ goto out_put;
+ }
+
+ if (queue->peer_pid != NETLINK_CB(skb).pid) {
+ QDEBUG("no config command, and wrong pid\n");
+ ret = -EPERM;
+ goto out_put;
+ }
+ }
+
+ if (nfqa[NFQA_CFG_PARAMS-1]) {
+ struct nfqnl_msg_config_params *params;
+ params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]);
+
+ nfqnl_set_mode(queue, params->copy_mode,
+ ntohl(params->copy_range));
+ }
+
+out_put:
+ instance_put(queue);
+ return ret;
+}
+
+static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+ [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp,
+ .attr_count = NFQA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict,
+ .attr_count = NFQA_MAX,
+ .cap_required = CAP_NET_ADMIN },
+ [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
+ .attr_count = NFQA_CFG_MAX,
+ .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem nfqnl_subsys = {
+ .name = "nf_queue",
+ .subsys_id = NFNL_SUBSYS_QUEUE,
+ .cb_count = NFQNL_MSG_MAX,
+ .cb = nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+ unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+ struct iter_state *st = seq->private;
+
+ if (!st)
+ return NULL;
+
+ for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+ if (!hlist_empty(&instance_table[st->bucket]))
+ return instance_table[st->bucket].first;
+ }
+ return NULL;
+}
+
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+ struct iter_state *st = seq->private;
+
+ h = h->next;
+ while (!h) {
+ if (++st->bucket >= INSTANCE_BUCKETS)
+ return NULL;
+
+ h = instance_table[st->bucket].first;
+ }
+ return h;
+}
+
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct hlist_node *head;
+ head = get_first(seq);
+
+ if (head)
+ while (pos && (head = get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock_bh(&instances_lock);
+ return get_idx(seq, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return get_next(s, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+ read_unlock_bh(&instances_lock);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+ const struct nfqnl_instance *inst = v;
+
+ return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
+ inst->queue_num,
+ inst->peer_pid, inst->queue_total,
+ inst->copy_mode, inst->copy_range,
+ inst->queue_dropped, inst->queue_user_dropped,
+ atomic_read(&inst->id_sequence),
+ atomic_read(&inst->use));
+}
+
+static struct seq_operations nfqnl_seq_ops = {
+ .start = seq_start,
+ .next = seq_next,
+ .stop = seq_stop,
+ .show = seq_show,
+};
+
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct iter_state *is;
+ int ret;
+
+ is = kmalloc(sizeof(*is), GFP_KERNEL);
+ if (!is)
+ return -ENOMEM;
+ memset(is, 0, sizeof(*is));
+ ret = seq_open(file, &nfqnl_seq_ops);
+ if (ret < 0)
+ goto out_free;
+ seq = file->private_data;
+ seq->private = is;
+ return ret;
+out_free:
+ kfree(is);
+ return ret;
+}
+
+static struct file_operations nfqnl_file_ops = {
+ .owner = THIS_MODULE,
+ .open = nfqnl_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+static int
+init_or_cleanup(int init)
+{
+ int i, status = -ENOMEM;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_nfqueue;
+#endif
+
+ if (!init)
+ goto cleanup;
+
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ INIT_HLIST_HEAD(&instance_table[i]);
+
+ netlink_register_notifier(&nfqnl_rtnl_notifier);
+ status = nfnetlink_subsys_register(&nfqnl_subsys);
+ if (status < 0) {
+ printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+ goto cleanup_netlink_notifier;
+ }
+
+#ifdef CONFIG_PROC_FS
+ proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440,
+ proc_net_netfilter);
+ if (!proc_nfqueue)
+ goto cleanup_subsys;
+ proc_nfqueue->proc_fops = &nfqnl_file_ops;
+#endif
+
+ register_netdevice_notifier(&nfqnl_dev_notifier);
+
+ return status;
+
+cleanup:
+ nf_unregister_queue_handlers(&nfqh);
+ unregister_netdevice_notifier(&nfqnl_dev_notifier);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
+cleanup_subsys:
+#endif
+ nfnetlink_subsys_unregister(&nfqnl_subsys);
+cleanup_netlink_notifier:
+ netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ return status;
+}
+
+static int __init init(void)
+{
+
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e41ce45..a64e1d5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -13,7 +13,12 @@
* added netlink_proto_exit
* Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
* use nlk_sk, as sk->protinfo is on a diet 8)
- *
+ * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
+ * - inc module use count of module that owns
+ * the kernel socket in case userspace opens
+ * socket of same protocol
+ * - remove all module support, since netlink is
+ * mandatory if CONFIG_NET=y these days
*/
#include <linux/config.h>
@@ -55,21 +60,29 @@
#include <net/scm.h>
#define Nprintk(a...)
+#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
struct netlink_sock {
/* struct sock has to be the first member of netlink_sock */
struct sock sk;
u32 pid;
- unsigned int groups;
u32 dst_pid;
- unsigned int dst_groups;
+ u32 dst_group;
+ u32 flags;
+ u32 subscriptions;
+ u32 ngroups;
+ unsigned long *groups;
unsigned long state;
wait_queue_head_t wait;
struct netlink_callback *cb;
spinlock_t cb_lock;
void (*data_ready)(struct sock *sk, int bytes);
+ struct module *module;
};
+#define NETLINK_KERNEL_SOCKET 0x1
+#define NETLINK_RECV_PKTINFO 0x2
+
static inline struct netlink_sock *nlk_sk(struct sock *sk)
{
return (struct netlink_sock *)sk;
@@ -92,6 +105,9 @@ struct netlink_table {
struct nl_pid_hash hash;
struct hlist_head mc_list;
unsigned int nl_nonroot;
+ unsigned int groups;
+ struct module *module;
+ int registered;
};
static struct netlink_table *nl_table;
@@ -106,6 +122,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
static struct notifier_block *netlink_chain;
+static u32 netlink_group_mask(u32 group)
+{
+ return group ? 1 << (group - 1) : 0;
+}
+
static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
{
return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
@@ -122,6 +143,7 @@ static void netlink_sock_destruct(struct sock *sk)
BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
BUG_TRAP(!nlk_sk(sk)->cb);
+ BUG_TRAP(!nlk_sk(sk)->groups);
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP.
@@ -315,9 +337,9 @@ err:
static void netlink_remove(struct sock *sk)
{
netlink_table_grab();
- nl_table[sk->sk_protocol].hash.entries--;
- sk_del_node_init(sk);
- if (nlk_sk(sk)->groups)
+ if (sk_del_node_init(sk))
+ nl_table[sk->sk_protocol].hash.entries--;
+ if (nlk_sk(sk)->subscriptions)
__sk_del_bind_node(sk);
netlink_table_ungrab();
}
@@ -328,19 +350,11 @@ static struct proto netlink_proto = {
.obj_size = sizeof(struct netlink_sock),
};
-static int netlink_create(struct socket *sock, int protocol)
+static int __netlink_create(struct socket *sock, int protocol)
{
struct sock *sk;
struct netlink_sock *nlk;
- sock->state = SS_UNCONNECTED;
-
- if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
- return -ESOCKTNOSUPPORT;
-
- if (protocol<0 || protocol >= MAX_LINKS)
- return -EPROTONOSUPPORT;
-
sock->ops = &netlink_ops;
sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
@@ -350,15 +364,56 @@ static int netlink_create(struct socket *sock, int protocol)
sock_init_data(sock, sk);
nlk = nlk_sk(sk);
-
spin_lock_init(&nlk->cb_lock);
init_waitqueue_head(&nlk->wait);
- sk->sk_destruct = netlink_sock_destruct;
+ sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
return 0;
}
+static int netlink_create(struct socket *sock, int protocol)
+{
+ struct module *module = NULL;
+ struct netlink_sock *nlk;
+ unsigned int groups;
+ int err = 0;
+
+ sock->state = SS_UNCONNECTED;
+
+ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+ return -ESOCKTNOSUPPORT;
+
+ if (protocol<0 || protocol >= MAX_LINKS)
+ return -EPROTONOSUPPORT;
+
+ netlink_lock_table();
+#ifdef CONFIG_KMOD
+ if (!nl_table[protocol].registered) {
+ netlink_unlock_table();
+ request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
+ netlink_lock_table();
+ }
+#endif
+ if (nl_table[protocol].registered &&
+ try_module_get(nl_table[protocol].module))
+ module = nl_table[protocol].module;
+ groups = nl_table[protocol].groups;
+ netlink_unlock_table();
+
+ if ((err = __netlink_create(sock, protocol) < 0))
+ goto out_module;
+
+ nlk = nlk_sk(sock->sk);
+ nlk->module = module;
+out:
+ return err;
+
+out_module:
+ module_put(module);
+ goto out;
+}
+
static int netlink_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -387,14 +442,27 @@ static int netlink_release(struct socket *sock)
skb_queue_purge(&sk->sk_write_queue);
- if (nlk->pid && !nlk->groups) {
+ if (nlk->pid && !nlk->subscriptions) {
struct netlink_notify n = {
.protocol = sk->sk_protocol,
.pid = nlk->pid,
};
notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n);
}
-
+
+ if (nlk->module)
+ module_put(nlk->module);
+
+ if (nlk->flags & NETLINK_KERNEL_SOCKET) {
+ netlink_table_grab();
+ nl_table[sk->sk_protocol].module = NULL;
+ nl_table[sk->sk_protocol].registered = 0;
+ netlink_table_ungrab();
+ }
+
+ kfree(nlk->groups);
+ nlk->groups = NULL;
+
sock_put(sk);
return 0;
}
@@ -429,7 +497,12 @@ retry:
err = netlink_insert(sk, pid);
if (err == -EADDRINUSE)
goto retry;
- return 0;
+
+ /* If 2 threads race to autobind, that is fine. */
+ if (err == -EBUSY)
+ err = 0;
+
+ return err;
}
static inline int netlink_capable(struct socket *sock, unsigned int flag)
@@ -438,6 +511,41 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag)
capable(CAP_NET_ADMIN);
}
+static void
+netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (nlk->subscriptions && !subscriptions)
+ __sk_del_bind_node(sk);
+ else if (!nlk->subscriptions && subscriptions)
+ sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+ nlk->subscriptions = subscriptions;
+}
+
+static int netlink_alloc_groups(struct sock *sk)
+{
+ struct netlink_sock *nlk = nlk_sk(sk);
+ unsigned int groups;
+ int err = 0;
+
+ netlink_lock_table();
+ groups = nl_table[sk->sk_protocol].groups;
+ if (!nl_table[sk->sk_protocol].registered)
+ err = -ENOENT;
+ netlink_unlock_table();
+
+ if (err)
+ return err;
+
+ nlk->groups = kmalloc(NLGRPSZ(groups), GFP_KERNEL);
+ if (nlk->groups == NULL)
+ return -ENOMEM;
+ memset(nlk->groups, 0, NLGRPSZ(groups));
+ nlk->ngroups = groups;
+ return 0;
+}
+
static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sock *sk = sock->sk;
@@ -449,8 +557,15 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
return -EINVAL;
/* Only superuser is allowed to listen multicasts */
- if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_RECV))
- return -EPERM;
+ if (nladdr->nl_groups) {
+ if (!netlink_capable(sock, NL_NONROOT_RECV))
+ return -EPERM;
+ if (nlk->groups == NULL) {
+ err = netlink_alloc_groups(sk);
+ if (err)
+ return err;
+ }
+ }
if (nlk->pid) {
if (nladdr->nl_pid != nlk->pid)
@@ -463,15 +578,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
return err;
}
- if (!nladdr->nl_groups && !nlk->groups)
+ if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
return 0;
netlink_table_grab();
- if (nlk->groups && !nladdr->nl_groups)
- __sk_del_bind_node(sk);
- else if (!nlk->groups && nladdr->nl_groups)
- sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
- nlk->groups = nladdr->nl_groups;
+ netlink_update_subscriptions(sk, nlk->subscriptions +
+ hweight32(nladdr->nl_groups) -
+ hweight32(nlk->groups[0]));
+ nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
netlink_table_ungrab();
return 0;
@@ -488,7 +602,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
if (addr->sa_family == AF_UNSPEC) {
sk->sk_state = NETLINK_UNCONNECTED;
nlk->dst_pid = 0;
- nlk->dst_groups = 0;
+ nlk->dst_group = 0;
return 0;
}
if (addr->sa_family != AF_NETLINK)
@@ -504,7 +618,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
if (err == 0) {
sk->sk_state = NETLINK_CONNECTED;
nlk->dst_pid = nladdr->nl_pid;
- nlk->dst_groups = nladdr->nl_groups;
+ nlk->dst_group = ffs(nladdr->nl_groups);
}
return err;
@@ -522,10 +636,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr
if (peer) {
nladdr->nl_pid = nlk->dst_pid;
- nladdr->nl_groups = nlk->dst_groups;
+ nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
} else {
nladdr->nl_pid = nlk->pid;
- nladdr->nl_groups = nlk->groups;
+ nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
}
return 0;
}
@@ -643,7 +757,8 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
sock_put(sk);
}
-static inline struct sk_buff *netlink_trim(struct sk_buff *skb, int allocation)
+static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
+ unsigned int __nocast allocation)
{
int delta;
@@ -712,7 +827,7 @@ struct netlink_broadcast_data {
int failure;
int congested;
int delivered;
- int allocation;
+ unsigned int allocation;
struct sk_buff *skb, *skb2;
};
@@ -725,7 +840,8 @@ static inline int do_one_broadcast(struct sock *sk,
if (p->exclude_sk == sk)
goto out;
- if (nlk->pid == p->pid || !(nlk->groups & p->group))
+ if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+ !test_bit(p->group - 1, nlk->groups))
goto out;
if (p->failure) {
@@ -764,7 +880,7 @@ out:
}
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
- u32 group, int allocation)
+ u32 group, unsigned int __nocast allocation)
{
struct netlink_broadcast_data info;
struct hlist_node *node;
@@ -821,7 +937,8 @@ static inline int do_one_set_err(struct sock *sk,
if (sk == p->exclude_sk)
goto out;
- if (nlk->pid == p->pid || !(nlk->groups & p->group))
+ if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+ !test_bit(p->group - 1, nlk->groups))
goto out;
sk->sk_err = p->code;
@@ -849,11 +966,104 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
read_unlock(&nl_table_lock);
}
+static int netlink_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int val = 0, err;
+
+ if (level != SOL_NETLINK)
+ return -ENOPROTOOPT;
+
+ if (optlen >= sizeof(int) &&
+ get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ switch (optname) {
+ case NETLINK_PKTINFO:
+ if (val)
+ nlk->flags |= NETLINK_RECV_PKTINFO;
+ else
+ nlk->flags &= ~NETLINK_RECV_PKTINFO;
+ err = 0;
+ break;
+ case NETLINK_ADD_MEMBERSHIP:
+ case NETLINK_DROP_MEMBERSHIP: {
+ unsigned int subscriptions;
+ int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;
+
+ if (!netlink_capable(sock, NL_NONROOT_RECV))
+ return -EPERM;
+ if (nlk->groups == NULL) {
+ err = netlink_alloc_groups(sk);
+ if (err)
+ return err;
+ }
+ if (!val || val - 1 >= nlk->ngroups)
+ return -EINVAL;
+ netlink_table_grab();
+ old = test_bit(val - 1, nlk->groups);
+ subscriptions = nlk->subscriptions - old + new;
+ if (new)
+ __set_bit(val - 1, nlk->groups);
+ else
+ __clear_bit(val - 1, nlk->groups);
+ netlink_update_subscriptions(sk, subscriptions);
+ netlink_table_ungrab();
+ err = 0;
+ break;
+ }
+ default:
+ err = -ENOPROTOOPT;
+ }
+ return err;
+}
+
+static int netlink_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+ int len, val, err;
+
+ if (level != SOL_NETLINK)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case NETLINK_PKTINFO:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
+ put_user(len, optlen);
+ put_user(val, optval);
+ err = 0;
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ }
+ return err;
+}
+
+static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct nl_pktinfo info;
+
+ info.group = NETLINK_CB(skb).dst_group;
+ put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
+}
+
static inline void netlink_rcv_wake(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
- if (!skb_queue_len(&sk->sk_receive_queue))
+ if (skb_queue_empty(&sk->sk_receive_queue))
clear_bit(0, &nlk->state);
if (!test_bit(0, &nlk->state))
wake_up_interruptible(&nlk->wait);
@@ -867,7 +1077,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *addr=msg->msg_name;
u32 dst_pid;
- u32 dst_groups;
+ u32 dst_group;
struct sk_buff *skb;
int err;
struct scm_cookie scm;
@@ -885,12 +1095,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
if (addr->nl_family != AF_NETLINK)
return -EINVAL;
dst_pid = addr->nl_pid;
- dst_groups = addr->nl_groups;
- if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND))
+ dst_group = ffs(addr->nl_groups);
+ if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
return -EPERM;
} else {
dst_pid = nlk->dst_pid;
- dst_groups = nlk->dst_groups;
+ dst_group = nlk->dst_group;
}
if (!nlk->pid) {
@@ -908,9 +1118,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
goto out;
NETLINK_CB(skb).pid = nlk->pid;
- NETLINK_CB(skb).groups = nlk->groups;
NETLINK_CB(skb).dst_pid = dst_pid;
- NETLINK_CB(skb).dst_groups = dst_groups;
+ NETLINK_CB(skb).dst_group = dst_group;
NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
@@ -932,9 +1141,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
goto out;
}
- if (dst_groups) {
+ if (dst_group) {
atomic_inc(&skb->users);
- netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL);
+ netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
}
err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
@@ -980,7 +1189,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
addr->nl_family = AF_NETLINK;
addr->nl_pad = 0;
addr->nl_pid = NETLINK_CB(skb).pid;
- addr->nl_groups = NETLINK_CB(skb).dst_groups;
+ addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
msg->msg_namelen = sizeof(*addr);
}
@@ -995,6 +1204,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
netlink_dump(sk);
scm_recv(sock, msg, siocb->scm, flags);
+ if (nlk->flags & NETLINK_RECV_PKTINFO)
+ netlink_cmsg_recv_pktinfo(msg, skb);
out:
netlink_rcv_wake(sk);
@@ -1017,10 +1228,13 @@ static void netlink_data_ready(struct sock *sk, int len)
*/
struct sock *
-netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
+netlink_kernel_create(int unit, unsigned int groups,
+ void (*input)(struct sock *sk, int len),
+ struct module *module)
{
struct socket *sock;
struct sock *sk;
+ struct netlink_sock *nlk;
if (!nl_table)
return NULL;
@@ -1031,20 +1245,31 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
return NULL;
- if (netlink_create(sock, unit) < 0) {
- sock_release(sock);
- return NULL;
- }
+ if (__netlink_create(sock, unit) < 0)
+ goto out_sock_release;
+
sk = sock->sk;
sk->sk_data_ready = netlink_data_ready;
if (input)
nlk_sk(sk)->data_ready = input;
- if (netlink_insert(sk, 0)) {
- sock_release(sock);
- return NULL;
- }
+ if (netlink_insert(sk, 0))
+ goto out_sock_release;
+
+ nlk = nlk_sk(sk);
+ nlk->flags |= NETLINK_KERNEL_SOCKET;
+
+ netlink_table_grab();
+ nl_table[unit].groups = groups < 32 ? 32 : groups;
+ nl_table[unit].module = module;
+ nl_table[unit].registered = 1;
+ netlink_table_ungrab();
+
return sk;
+
+out_sock_release:
+ sock_release(sock);
+ return NULL;
}
void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1095,8 +1320,7 @@ static int netlink_dump(struct sock *sk)
return 0;
}
- nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int));
- nlh->nlmsg_flags |= NLM_F_MULTI;
+ nlh = NLMSG_NEW_ANSWER(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk, skb->len);
@@ -1107,6 +1331,9 @@ static int netlink_dump(struct sock *sk)
netlink_destroy_callback(cb);
return 0;
+
+nlmsg_failure:
+ return -ENOBUFS;
}
int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
@@ -1178,7 +1405,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
}
rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
- NLMSG_ERROR, sizeof(struct nlmsgerr));
+ NLMSG_ERROR, sizeof(struct nlmsgerr), 0);
errmsg = NLMSG_DATA(rep);
errmsg->error = err;
memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr));
@@ -1280,7 +1507,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
s,
s->sk_protocol,
nlk->pid,
- nlk->groups,
+ nlk->groups ? (u32)nlk->groups[0] : 0,
atomic_read(&s->sk_rmem_alloc),
atomic_read(&s->sk_wmem_alloc),
nlk->cb,
@@ -1354,8 +1581,8 @@ static struct proto_ops netlink_ops = {
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
+ .setsockopt = netlink_setsockopt,
+ .getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap,
@@ -1430,21 +1657,7 @@ out:
return err;
}
-static void __exit netlink_proto_exit(void)
-{
- sock_unregister(PF_NETLINK);
- proc_net_remove("netlink");
- kfree(nl_table);
- nl_table = NULL;
- proto_unregister(&netlink_proto);
-}
-
core_initcall(netlink_proto_init);
-module_exit(netlink_proto_exit);
-
-MODULE_LICENSE("GPL");
-
-MODULE_ALIAS_NETPROTO(PF_NETLINK);
EXPORT_SYMBOL(netlink_ack);
EXPORT_SYMBOL(netlink_broadcast);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 31ed4a9..f4578c7 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -39,7 +39,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/arp.h>
#include <linux/init.h>
@@ -459,12 +459,7 @@ static struct sock *nr_make_new(struct sock *osk)
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
sk->sk_sleep = osk->sk_sleep;
-
- if (sock_flag(osk, SOCK_ZAPPED))
- sock_set_flag(sk, SOCK_ZAPPED);
-
- if (sock_flag(osk, SOCK_DBG))
- sock_set_flag(sk, SOCK_DBG);
+ sock_copy_flags(sk, osk);
skb_queue_head_init(&nr->ack_queue);
skb_queue_head_init(&nr->reseq_queue);
@@ -541,7 +536,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct nr_sock *nr = nr_sk(sk);
struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
struct net_device *dev;
- ax25_address *user, *source;
+ ax25_uid_assoc *user;
+ ax25_address *source;
lock_sock(sk);
if (!sock_flag(sk, SOCK_ZAPPED)) {
@@ -580,16 +576,19 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
} else {
source = &addr->fsa_ax25.sax25_call;
- if ((user = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ nr->user_addr = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
release_sock(sk);
dev_put(dev);
return -EPERM;
}
- user = source;
+ nr->user_addr = *source;
}
- nr->user_addr = *user;
nr->source_addr = *source;
}
@@ -609,7 +608,8 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
struct sock *sk = sock->sk;
struct nr_sock *nr = nr_sk(sk);
struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr;
- ax25_address *user, *source = NULL;
+ ax25_address *source = NULL;
+ ax25_uid_assoc *user;
struct net_device *dev;
lock_sock(sk);
@@ -650,16 +650,19 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr,
}
source = (ax25_address *)dev->dev_addr;
- if ((user = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ nr->user_addr = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) {
dev_put(dev);
release_sock(sk);
return -EPERM;
}
- user = source;
+ nr->user_addr = *source;
}
- nr->user_addr = *user;
nr->source_addr = *source;
nr->device = dev;
@@ -855,17 +858,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
frametype = skb->data[19] & 0x0F;
flags = skb->data[19] & 0xF0;
-#ifdef CONFIG_INET
/*
* Check for an incoming IP over NET/ROM frame.
*/
- if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
+ if (frametype == NR_PROTOEXT &&
+ circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
skb->h.raw = skb->data;
return nr_rx_ip(skb, dev);
}
-#endif
/*
* Find an existing socket connection, based on circuit ID, if it's
@@ -1259,6 +1261,7 @@ static int nr_info_show(struct seq_file *seq, void *v)
struct net_device *dev;
struct nr_sock *nr;
const char *devname;
+ char buf[11];
if (v == SEQ_START_TOKEN)
seq_puts(seq,
@@ -1274,11 +1277,11 @@ static int nr_info_show(struct seq_file *seq, void *v)
else
devname = dev->name;
- seq_printf(seq, "%-9s ", ax2asc(&nr->user_addr));
- seq_printf(seq, "%-9s ", ax2asc(&nr->dest_addr));
+ seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr));
+ seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr));
seq_printf(seq,
"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n",
- ax2asc(&nr->source_addr),
+ ax2asc(buf, &nr->source_addr),
devname,
nr->my_index,
nr->my_id,
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
index 220bf74..263da4c 100644
--- a/net/netrom/nr_dev.c
+++ b/net/netrom/nr_dev.c
@@ -38,8 +38,6 @@
#include <net/ax25.h>
#include <net/netrom.h>
-#ifdef CONFIG_INET
-
/*
* Only allow IP over NET/ROM frames through if the netrom device is up.
*/
@@ -64,11 +62,12 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
skb->nh.raw = skb->data;
skb->pkt_type = PACKET_HOST;
- ip_rcv(skb, skb->dev, NULL);
+ netif_rx(skb);
return 1;
}
+#ifdef CONFIG_INET
static int nr_rebuild_header(struct sk_buff *skb)
{
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
index 9c44b37..64b81a7 100644
--- a/net/netrom/nr_in.c
+++ b/net/netrom/nr_in.c
@@ -22,8 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
-#include <net/ip.h> /* For ip_rcv */
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index 7a86b36..b3b9097 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -881,6 +881,7 @@ static void nr_node_stop(struct seq_file *seq, void *v)
static int nr_node_show(struct seq_file *seq, void *v)
{
+ char buf[11];
int i;
if (v == SEQ_START_TOKEN)
@@ -890,7 +891,7 @@ static int nr_node_show(struct seq_file *seq, void *v)
struct nr_node *nr_node = v;
nr_node_lock(nr_node);
seq_printf(seq, "%-9s %-7s %d %d",
- ax2asc(&nr_node->callsign),
+ ax2asc(buf, &nr_node->callsign),
(nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic,
nr_node->which + 1,
nr_node->count);
@@ -964,6 +965,7 @@ static void nr_neigh_stop(struct seq_file *seq, void *v)
static int nr_neigh_show(struct seq_file *seq, void *v)
{
+ char buf[11];
int i;
if (v == SEQ_START_TOKEN)
@@ -973,7 +975,7 @@ static int nr_neigh_show(struct seq_file *seq, void *v)
seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d",
nr_neigh->number,
- ax2asc(&nr_neigh->callsign),
+ ax2asc(buf, &nr_neigh->callsign),
nr_neigh->dev ? nr_neigh->dev->name : "???",
nr_neigh->quality,
nr_neigh->locked,
@@ -983,7 +985,7 @@ static int nr_neigh_show(struct seq_file *seq, void *v)
if (nr_neigh->digipeat != NULL) {
for (i = 0; i < nr_neigh->digipeat->ndigi; i++)
seq_printf(seq, " %s",
- ax2asc(&nr_neigh->digipeat->calls[i]));
+ ax2asc(buf, &nr_neigh->digipeat->calls[i]));
}
seq_puts(seq, "\n");
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c
index 0627347..587bed2 100644
--- a/net/netrom/nr_subr.c
+++ b/net/netrom/nr_subr.c
@@ -21,7 +21,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
@@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk)
if (skb_prev == NULL)
skb_queue_head(&sk->sk_write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
skb_prev = skb;
}
}
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index faabda8..75b72d3 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -22,7 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 0000000..34ff93f
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,26 @@
+#
+# Packet configuration
+#
+
+config PACKET
+ tristate "Packet socket"
+ ---help---
+ The Packet protocol is used by applications which communicate
+ directly with network devices without an intermediate network
+ protocol implemented in the kernel, e.g. tcpdump. If you want them
+ to work, choose Y.
+
+ To compile this driver as a module, choose M here: the module will
+ be called af_packet.
+
+ If unsure, say Y.
+
+config PACKET_MMAP
+ bool "Packet socket: mmapped IO"
+ depends on PACKET
+ help
+ If you say Y here, the Packet protocol driver will use an IO
+ mechanism that results in faster communication.
+
+ If unsure, say N.
+
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0269616..8690f17 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -241,7 +241,7 @@ static struct proto_ops packet_ops;
#ifdef CONFIG_SOCK_PACKET
static struct proto_ops packet_ops_spkt;
-static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_pkt *spkt;
@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
dst_release(skb->dst);
skb->dst = NULL;
+ /* drop conntrack reference */
+ nf_reset(skb);
+
spkt = (struct sockaddr_pkt*)skb->cb;
skb_push(skb, skb->data-skb->mac.raw);
@@ -438,7 +441,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned
we will not harm anyone.
*/
-static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_ll *sll;
@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
dst_release(skb->dst);
skb->dst = NULL;
+ /* drop conntrack reference */
+ nf_reset(skb);
+
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_packets++;
__skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -540,7 +546,7 @@ drop:
}
#ifdef CONFIG_PACKET_MMAP
-static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct packet_sock *po;
@@ -629,12 +635,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
h->tp_snaplen = snaplen;
h->tp_mac = macoff;
h->tp_net = netoff;
- if (skb->stamp.tv_sec == 0) {
- do_gettimeofday(&skb->stamp);
+ if (skb->tstamp.off_sec == 0) {
+ __net_timestamp(skb);
sock_enable_timestamp(sk);
}
- h->tp_sec = skb->stamp.tv_sec;
- h->tp_usec = skb->stamp.tv_usec;
+ h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
+ h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
sll->sll_halen = 0;
@@ -1529,8 +1535,7 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
static void packet_mm_open(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- struct inode *inode = file->f_dentry->d_inode;
- struct socket * sock = SOCKET_I(inode);
+ struct socket * sock = file->private_data;
struct sock *sk = sock->sk;
if (sk)
@@ -1540,8 +1545,7 @@ static void packet_mm_open(struct vm_area_struct *vma)
static void packet_mm_close(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- struct inode *inode = file->f_dentry->d_inode;
- struct socket * sock = SOCKET_I(inode);
+ struct socket * sock = file->private_data;
struct sock *sk = sock->sk;
if (sk)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 7eb6a5b..3077878 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -41,7 +41,7 @@
#include <net/rose.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>
@@ -556,12 +556,7 @@ static struct sock *rose_make_new(struct sock *osk)
sk->sk_sndbuf = osk->sk_sndbuf;
sk->sk_state = TCP_ESTABLISHED;
sk->sk_sleep = osk->sk_sleep;
-
- if (sock_flag(osk, SOCK_ZAPPED))
- sock_set_flag(sk, SOCK_ZAPPED);
-
- if (sock_flag(osk, SOCK_DBG))
- sock_set_flag(sk, SOCK_DBG);
+ sock_copy_flags(sk, osk);
init_timer(&rose->timer);
init_timer(&rose->idletimer);
@@ -631,7 +626,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct rose_sock *rose = rose_sk(sk);
struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
struct net_device *dev;
- ax25_address *user, *source;
+ ax25_address *source;
+ ax25_uid_assoc *user;
int n;
if (!sock_flag(sk, SOCK_ZAPPED))
@@ -656,14 +652,17 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
source = &addr->srose_call;
- if ((user = ax25_findbyuid(current->euid)) == NULL) {
+ user = ax25_findbyuid(current->euid);
+ if (user) {
+ rose->source_call = user->call;
+ ax25_uid_put(user);
+ } else {
if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- user = source;
+ rose->source_call = *source;
}
rose->source_addr = addr->srose_addr;
- rose->source_call = *user;
rose->device = dev;
rose->source_ndigis = addr->srose_ndigis;
@@ -690,8 +689,8 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
struct rose_sock *rose = rose_sk(sk);
struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
unsigned char cause, diagnostic;
- ax25_address *user;
struct net_device *dev;
+ ax25_uid_assoc *user;
int n;
if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
@@ -741,12 +740,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
if ((dev = rose_dev_first()) == NULL)
return -ENETUNREACH;
- if ((user = ax25_findbyuid(current->euid)) == NULL)
+ user = ax25_findbyuid(current->euid);
+ if (!user)
return -EINVAL;
memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
- rose->source_call = *user;
+ rose->source_call = user->call;
rose->device = dev;
+ ax25_uid_put(user);
rose_insert_socket(sk); /* Finish the bind */
}
@@ -1362,6 +1363,8 @@ static void rose_info_stop(struct seq_file *seq, void *v)
static int rose_info_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_puts(seq,
"dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n");
@@ -1379,12 +1382,12 @@ static int rose_info_show(struct seq_file *seq, void *v)
seq_printf(seq, "%-10s %-9s ",
rose2asc(&rose->dest_addr),
- ax2asc(&rose->dest_call));
+ ax2asc(buf, &rose->dest_call));
if (ax25cmp(&rose->source_call, &null_ax25_address) == 0)
callsign = "??????-?";
else
- callsign = ax2asc(&rose->source_call);
+ callsign = ax2asc(buf, &rose->source_call);
seq_printf(seq,
"%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n",
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index ef475a1..8348d33 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -26,8 +26,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/ip.h> /* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index ff73ebb..e556d92 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -24,7 +24,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/fcntl.h>
@@ -851,6 +851,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
unsigned char cause, diagnostic;
struct net_device *dev;
int len, res = 0;
+ char buf[11];
#if 0
if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT)
@@ -876,7 +877,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
if (rose_neigh == NULL) {
printk("rose_route : unknown neighbour or device %s\n",
- ax2asc(&ax25->dest_addr));
+ ax2asc(buf, &ax25->dest_addr));
goto out;
}
@@ -994,8 +995,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25)
* 1. The frame isn't for us,
* 2. It isn't "owned" by any existing route.
*/
- if (frametype != ROSE_CALL_REQUEST) /* XXX */
- return 0;
+ if (frametype != ROSE_CALL_REQUEST) { /* XXX */
+ res = 0;
+ goto out;
+ }
len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2;
len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2;
@@ -1176,6 +1179,7 @@ static void rose_neigh_stop(struct seq_file *seq, void *v)
static int rose_neigh_show(struct seq_file *seq, void *v)
{
+ char buf[11];
int i;
if (v == SEQ_START_TOKEN)
@@ -1187,7 +1191,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v)
/* if (!rose_neigh->loopback) { */
seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu",
rose_neigh->number,
- (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(&rose_neigh->callsign),
+ (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign),
rose_neigh->dev ? rose_neigh->dev->name : "???",
rose_neigh->count,
rose_neigh->use,
@@ -1198,7 +1202,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v)
if (rose_neigh->digipeat != NULL) {
for (i = 0; i < rose_neigh->digipeat->ndigi; i++)
- seq_printf(seq, " %s", ax2asc(&rose_neigh->digipeat->calls[i]));
+ seq_printf(seq, " %s", ax2asc(buf, &rose_neigh->digipeat->calls[i]));
}
seq_puts(seq, "\n");
@@ -1258,6 +1262,8 @@ static void rose_route_stop(struct seq_file *seq, void *v)
static int rose_route_show(struct seq_file *seq, void *v)
{
+ char buf[11];
+
if (v == SEQ_START_TOKEN)
seq_puts(seq,
"lci address callsign neigh <-> lci address callsign neigh\n");
@@ -1269,7 +1275,7 @@ static int rose_route_show(struct seq_file *seq, void *v)
"%3.3X %-10s %-9s %05d ",
rose_route->lci1,
rose2asc(&rose_route->src_addr),
- ax2asc(&rose_route->src_call),
+ ax2asc(buf, &rose_route->src_call),
rose_route->neigh1->number);
else
seq_puts(seq,
@@ -1280,7 +1286,7 @@ static int rose_route_show(struct seq_file *seq, void *v)
"%3.3X %-10s %-9s %05d\n",
rose_route->lci2,
rose2asc(&rose_route->dest_addr),
- ax2asc(&rose_route->dest_call),
+ ax2asc(buf, &rose_route->dest_call),
rose_route->neigh2->number);
else
seq_puts(seq,
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c
index 7db7e1c..02891ce 100644
--- a/net/rose/rose_subr.c
+++ b/net/rose/rose_subr.c
@@ -21,7 +21,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
@@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk)
if (skb_prev == NULL)
skb_queue_head(&sk->sk_write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
skb_prev = skb;
}
}
@@ -400,6 +400,7 @@ static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
{
unsigned char *p = buffer + 1;
char *callsign;
+ char buf[11];
int len, nb;
/* National Facilities */
@@ -456,7 +457,7 @@ static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
*p++ = FAC_CCITT_DEST_NSAP;
- callsign = ax2asc(&rose->dest_call);
+ callsign = ax2asc(buf, &rose->dest_call);
*p++ = strlen(callsign) + 10;
*p++ = (strlen(callsign) + 9) * 2; /* ??? */
@@ -471,7 +472,7 @@ static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose)
*p++ = FAC_CCITT_SRC_NSAP;
- callsign = ax2asc(&rose->source_call);
+ callsign = ax2asc(buf, &rose->source_call);
*p++ = strlen(callsign) + 10;
*p++ = (strlen(callsign) + 9) * 2; /* ??? */
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index 84dd440..50ae037 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -22,7 +22,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f4..dada34a 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
_debug("### End Work");
- try_to_freeze(PF_FREEZE);
+ try_to_freeze();
/* discard pending signals */
rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89..1aadd02 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
_debug("### End Inbound Calls");
- try_to_freeze(PF_FREEZE);
+ try_to_freeze();
/* discard pending signals */
rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0..3ac81cd 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
complete_and_exit(&krxtimod_dead, 0);
}
- try_to_freeze(PF_FREEZE);
+ try_to_freeze();
/* discard pending signals */
rxrpc_discard_my_signals();
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 9bce779..122c086 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans,
msg->trans = trans;
msg->state = RXRPC_MSG_RECEIVED;
- msg->stamp = pkt->stamp;
+ skb_get_timestamp(pkt, &msg->stamp);
if (msg->stamp.tv_sec == 0) {
do_gettimeofday(&msg->stamp);
if (pkt->sk)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9be..45d3bc0 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,43 @@
#
# Traffic control configuration.
#
+
+menuconfig NET_SCHED
+ bool "QoS and/or fair queueing"
+ ---help---
+ When the kernel has several packets to send out over a network
+ device, it has to decide which ones to send first, which ones to
+ delay, and which ones to drop. This is the job of the packet
+ scheduler, and several different algorithms for how to do this
+ "fairly" have been proposed.
+
+ If you say N here, you will get the standard packet scheduler, which
+ is a FIFO (first come, first served). If you say Y here, you will be
+ able to choose from among several alternative algorithms which can
+ then be attached to different network devices. This is useful for
+ example if some of your network devices are real time devices that
+ need a certain minimum data flow rate, or if you need to limit the
+ maximum data flow rate for traffic which matches specified criteria.
+ This code is considered to be experimental.
+
+ To administer these schedulers, you'll need the user-level utilities
+ from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
+ That package also contains some documentation; for more, check out
+ <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
+
+ This Quality of Service (QoS) support will enable you to use
+ Differentiated Services (diffserv) and Resource Reservation Protocol
+ (RSVP) on your Linux router if you also say Y to "QoS support",
+ "Packet classifier API" and to some classifiers below. Documentation
+ and software is at <http://diffserv.sourceforge.net/>.
+
+ If you say Y here and to "/proc file system" below, you will be able
+ to read status information about packet schedulers from the file
+ /proc/net/psched.
+
+ The available schedulers are listed in the following questions; you
+ can say Y to as many as you like. If unsure, say N now.
+
choice
prompt "Packet scheduler clock source"
depends on NET_SCHED
@@ -449,6 +486,20 @@ config NET_EMATCH_META
To compile this code as a module, choose M here: the
module will be called em_meta.
+config NET_EMATCH_TEXT
+ tristate "Textsearch"
+ depends on NET_EMATCH
+ select TEXTSEARCH
+ select TEXTSEARCH_KMP
+ select TEXTSEARCH_BM
+ select TEXTSEARCH_FSM
+ ---help---
+ Say Y here if you want to be ablt to classify packets based on
+ textsearch comparisons.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_text.
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe58..e48d0d4 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
obj-y := sch_generic.o
-obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o
+obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
obj-$(CONFIG_NET_ACT_POLICE) += police.o
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index cafcb08..8aebe8f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
while ((a = act) != NULL) {
repeat:
if (a->ops && a->ops->act) {
- ret = a->ops->act(&skb, a);
+ ret = a->ops->act(&skb, a, res);
if (TC_MUNGED & skb->tc_verd) {
/* copied already, allow trampling */
skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -179,11 +179,6 @@ repeat:
act = a->next;
}
exec_done:
- if (skb->tc_classid > 0) {
- res->classid = skb->tc_classid;
- res->class = 0;
- skb->tc_classid = 0;
- }
return ret;
}
@@ -428,17 +423,19 @@ errout:
static int
tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
- unsigned flags, int event, int bind, int ref)
+ u16 flags, int event, int bind, int ref)
{
struct tcamsg *t;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
struct rtattr *x;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
- nlh->nlmsg_flags = flags;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
+
t = NLMSG_DATA(nlh);
t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
x = (struct rtattr*) skb->tail;
RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +577,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
t = NLMSG_DATA(nlh);
t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
x = (struct rtattr *) skb->tail;
RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -594,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
nlh->nlmsg_flags |= NLM_F_ROOT;
module_put(a->ops->owner);
kfree(a);
- err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
if (err > 0)
return 0;
@@ -657,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event)
/* now do the delete */
tcf_action_destroy(head, 0);
- ret = rtnetlink_send(skb, pid, RTMGRP_TC,
+ ret = rtnetlink_send(skb, pid, RTNLGRP_TC,
n->nlmsg_flags&NLM_F_ECHO);
if (ret > 0)
return 0;
@@ -669,7 +668,7 @@ err:
}
static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
- unsigned flags)
+ u16 flags)
{
struct tcamsg *t;
struct nlmsghdr *nlh;
@@ -684,11 +683,12 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
b = (unsigned char *)skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
- nlh->nlmsg_flags = flags;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
t = NLMSG_DATA(nlh);
t->tca_family = AF_UNSPEC;
-
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
+
x = (struct rtattr*) skb->tail;
RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -698,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
x->rta_len = skb->tail - (u8*)x;
nlh->nlmsg_len = skb->tail - b;
- NETLINK_CB(skb).dst_groups = RTMGRP_TC;
+ NETLINK_CB(skb).dst_group = RTNLGRP_TC;
- err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO);
+ err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
if (err > 0)
err = 0;
return err;
@@ -843,6 +843,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
cb->nlh->nlmsg_type, sizeof(*t));
t = NLMSG_DATA(nlh);
t->tca_family = AF_UNSPEC;
+ t->tca__pad1 = 0;
+ t->tca__pad2 = 0;
x = (struct rtattr *) skb->tail;
RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -881,7 +883,7 @@ static int __init tc_action_init(void)
link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action;
}
- printk("TC classifier action (bugs to netdev@oss.sgi.com cc "
+ printk("TC classifier action (bugs to netdev@vger.kernel.org cc "
"hadi@cyberus.ca)\n");
return 0;
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 56e66c3..b4d89fb 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -322,16 +322,17 @@ errout:
static int
tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
- u32 pid, u32 seq, unsigned flags, int event)
+ u32 pid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
- nlh->nlmsg_flags = flags;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad1 = 0;
tcm->tcm_ifindex = tp->q->dev->ifindex;
tcm->tcm_parent = tp->classid;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
@@ -366,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
return -EINVAL;
}
- return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct tcf_dump_args
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb91..006168d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
pinfo.protocol = s->protocol;
pinfo.tunnelid = s->tunnelid;
pinfo.tunnelhdr = f->tunnelhdr;
+ pinfo.pad = 0;
RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
if (f->res.classid)
RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c..00eae5f 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -27,17 +27,17 @@
* lvalue rvalue
* +-----------+ +-----------+
* | type: INT | | type: INT |
- * def | id: INDEV | | id: VALUE |
+ * def | id: DEV | | id: VALUE |
* | data: | | data: 3 |
* +-----------+ +-----------+
* | |
- * ---> meta_ops[INT][INDEV](...) |
+ * ---> meta_ops[INT][DEV](...) |
* | |
* ----------- |
* V V
* +-----------+ +-----------+
* | type: INT | | type: INT |
- * obj | id: INDEV | | id: VALUE |
+ * obj | id: DEV | | id: VALUE |
* | data: 2 |<--data got filled out | data: 3 |
* +-----------+ +-----------+
* | |
@@ -170,26 +170,6 @@ META_COLLECTOR(var_dev)
*err = var_dev(skb->dev, dst);
}
-META_COLLECTOR(int_indev)
-{
- *err = int_dev(skb->input_dev, dst);
-}
-
-META_COLLECTOR(var_indev)
-{
- *err = var_dev(skb->input_dev, dst);
-}
-
-META_COLLECTOR(int_realdev)
-{
- *err = int_dev(skb->real_dev, dst);
-}
-
-META_COLLECTOR(var_realdev)
-{
- *err = var_dev(skb->real_dev, dst);
-}
-
/**************************************************************************
* skb attributes
**************************************************************************/
@@ -205,11 +185,6 @@ META_COLLECTOR(int_protocol)
dst->value = skb->protocol;
}
-META_COLLECTOR(int_security)
-{
- dst->value = skb->security;
-}
-
META_COLLECTOR(int_pkttype)
{
dst->value = skb->pkt_type;
@@ -234,12 +209,14 @@ META_COLLECTOR(int_maclen)
* Netfilter
**************************************************************************/
-#ifdef CONFIG_NETFILTER
META_COLLECTOR(int_nfmark)
{
+#ifdef CONFIG_NETFILTER
dst->value = skb->nfmark;
-}
+#else
+ dst->value = 0;
#endif
+}
/**************************************************************************
* Traffic Control
@@ -250,31 +227,21 @@ META_COLLECTOR(int_tcindex)
dst->value = skb->tc_index;
}
-#ifdef CONFIG_NET_CLS_ACT
-META_COLLECTOR(int_tcverd)
-{
- dst->value = skb->tc_verd;
-}
-
-META_COLLECTOR(int_tcclassid)
-{
- dst->value = skb->tc_classid;
-}
-#endif
-
/**************************************************************************
* Routing
**************************************************************************/
-#ifdef CONFIG_NET_CLS_ROUTE
META_COLLECTOR(int_rtclassid)
{
if (unlikely(skb->dst == NULL))
*err = -1;
else
+#ifdef CONFIG_NET_CLS_ROUTE
dst->value = skb->dst->tclassid;
-}
+#else
+ dst->value = 0;
#endif
+}
META_COLLECTOR(int_rtiif)
{
@@ -510,8 +477,6 @@ struct meta_ops
static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[TCF_META_TYPE_VAR] = {
[META_ID(DEV)] = META_FUNC(var_dev),
- [META_ID(INDEV)] = META_FUNC(var_indev),
- [META_ID(REALDEV)] = META_FUNC(var_realdev),
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
},
[TCF_META_TYPE_INT] = {
@@ -520,26 +485,15 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
[META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
[META_ID(DEV)] = META_FUNC(int_dev),
- [META_ID(INDEV)] = META_FUNC(int_indev),
- [META_ID(REALDEV)] = META_FUNC(int_realdev),
[META_ID(PRIORITY)] = META_FUNC(int_priority),
[META_ID(PROTOCOL)] = META_FUNC(int_protocol),
- [META_ID(SECURITY)] = META_FUNC(int_security),
[META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
[META_ID(PKTLEN)] = META_FUNC(int_pktlen),
[META_ID(DATALEN)] = META_FUNC(int_datalen),
[META_ID(MACLEN)] = META_FUNC(int_maclen),
-#ifdef CONFIG_NETFILTER
[META_ID(NFMARK)] = META_FUNC(int_nfmark),
-#endif
[META_ID(TCINDEX)] = META_FUNC(int_tcindex),
-#ifdef CONFIG_NET_CLS_ACT
- [META_ID(TCVERDICT)] = META_FUNC(int_tcverd),
- [META_ID(TCCLASSID)] = META_FUNC(int_tcclassid),
-#endif
-#ifdef CONFIG_NET_CLS_ROUTE
[META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
-#endif
[META_ID(RTIIF)] = META_FUNC(int_rtiif),
[META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
[META_ID(SK_STATE)] = META_FUNC(int_sk_state),
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 0000000..77beabc
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,154 @@
+/*
+ * net/sched/em_text.c Textsearch ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <net/pkt_cls.h>
+
+struct text_match
+{
+ u16 from_offset;
+ u16 to_offset;
+ u8 from_layer;
+ u8 to_layer;
+ struct ts_config *config;
+};
+
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct text_match *tm = EM_TEXT_PRIV(m);
+ int from, to;
+ struct ts_state state;
+
+ from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+ from += tm->from_offset;
+
+ to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+ to += tm->to_offset;
+
+ return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
+}
+
+static int em_text_change(struct tcf_proto *tp, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct text_match *tm;
+ struct tcf_em_text *conf = data;
+ struct ts_config *ts_conf;
+ int flags = 0;
+
+ if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+ return -EINVAL;
+
+ if (conf->from_layer > conf->to_layer)
+ return -EINVAL;
+
+ if (conf->from_layer == conf->to_layer &&
+ conf->from_offset > conf->to_offset)
+ return -EINVAL;
+
+retry:
+ ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+ conf->pattern_len, GFP_KERNEL, flags);
+
+ if (flags & TS_AUTOLOAD)
+ rtnl_lock();
+
+ if (IS_ERR(ts_conf)) {
+ if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+ rtnl_unlock();
+ flags |= TS_AUTOLOAD;
+ goto retry;
+ } else
+ return PTR_ERR(ts_conf);
+ } else if (flags & TS_AUTOLOAD) {
+ textsearch_destroy(ts_conf);
+ return -EAGAIN;
+ }
+
+ tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+ if (tm == NULL) {
+ textsearch_destroy(ts_conf);
+ return -ENOBUFS;
+ }
+
+ tm->from_offset = conf->from_offset;
+ tm->to_offset = conf->to_offset;
+ tm->from_layer = conf->from_layer;
+ tm->to_layer = conf->to_layer;
+ tm->config = ts_conf;
+
+ m->datalen = sizeof(*tm);
+ m->data = (unsigned long) tm;
+
+ return 0;
+}
+
+static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+ textsearch_destroy(EM_TEXT_PRIV(m)->config);
+}
+
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct text_match *tm = EM_TEXT_PRIV(m);
+ struct tcf_em_text conf;
+
+ strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+ conf.from_offset = tm->from_offset;
+ conf.to_offset = tm->to_offset;
+ conf.from_layer = tm->from_layer;
+ conf.to_layer = tm->to_layer;
+ conf.pattern_len = textsearch_get_pattern_len(tm->config);
+ conf.pad = 0;
+
+ RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
+ RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
+ return 0;
+
+rtattr_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_text_ops = {
+ .kind = TCF_EM_TEXT,
+ .change = em_text_change,
+ .match = em_text_match,
+ .destroy = em_text_destroy,
+ .dump = em_text_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_text_ops.link)
+};
+
+static int __init init_em_text(void)
+{
+ return tcf_em_register(&em_text_ops);
+}
+
+static void __exit exit_em_text(void)
+{
+ tcf_em_unregister(&em_text_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_text);
+module_exit(exit_em_text);
diff --git a/net/sched/gact.c b/net/sched/gact.c
index a811c89..d1c6d54 100644
--- a/net/sched/gact.c
+++ b/net/sched/gact.c
@@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_gact(struct sk_buff **pskb, struct tc_action *a)
+tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct tcf_gact *p = PRIV(a, gact);
struct sk_buff *skb = *pskb;
diff --git a/net/sched/ipt.c b/net/sched/ipt.c
index b114d99..f50136e 100644
--- a/net/sched/ipt.c
+++ b/net/sched/ipt.c
@@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_ipt(struct sk_buff **pskb, struct tc_action *a)
+tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
int ret = 0, result = 0;
struct tcf_ipt *p = PRIV(a, ipt);
diff --git a/net/sched/mirred.c b/net/sched/mirred.c
index f309ce3..20d0691 100644
--- a/net/sched/mirred.c
+++ b/net/sched/mirred.c
@@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_mirred(struct sk_buff **pskb, struct tc_action *a)
+tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct tcf_mirred *p = PRIV(a, mirred);
struct net_device *dev;
diff --git a/net/sched/pedit.c b/net/sched/pedit.c
index 678be6a..767d24f 100644
--- a/net/sched/pedit.c
+++ b/net/sched/pedit.c
@@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind)
}
static int
-tcf_pedit(struct sk_buff **pskb, struct tc_action *a)
+tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct tcf_pedit *p = PRIV(a, pedit);
struct sk_buff *skb = *pskb;
diff --git a/net/sched/police.c b/net/sched/police.c
index c03545f..eb39fb2 100644
--- a/net/sched/police.c
+++ b/net/sched/police.c
@@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind)
return 0;
}
-static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a)
+static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a,
+ struct tcf_result *res)
{
psched_time_t now;
struct sk_buff *skb = *pskb;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 07977f8..737681c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
int err;
struct rtattr *kind = tca[TCA_KIND-1];
- void *p = NULL;
struct Qdisc *sch;
struct Qdisc_ops *ops;
- int size;
ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
if (ops == NULL)
goto err_out;
- /* ensure that the Qdisc and the private data are 32-byte aligned */
- size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
- size += ops->priv_size + QDISC_ALIGN_CONST;
-
- p = kmalloc(size, GFP_KERNEL);
- err = -ENOBUFS;
- if (!p)
+ sch = qdisc_alloc(dev, ops);
+ if (IS_ERR(sch)) {
+ err = PTR_ERR(sch);
goto err_out2;
- memset(p, 0, size);
- sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
- & ~QDISC_ALIGN_CONST);
- sch->padded = (char *)sch - (char *)p;
-
- INIT_LIST_HEAD(&sch->list);
- skb_queue_head_init(&sch->q);
+ }
- if (handle == TC_H_INGRESS)
+ if (handle == TC_H_INGRESS) {
sch->flags |= TCQ_F_INGRESS;
-
- sch->ops = ops;
- sch->enqueue = ops->enqueue;
- sch->dequeue = ops->dequeue;
- sch->dev = dev;
- dev_hold(dev);
- atomic_set(&sch->refcnt, 1);
- sch->stats_lock = &dev->queue_lock;
- if (handle == 0) {
+ handle = TC_H_MAKE(TC_H_INGRESS, 0);
+ } else if (handle == 0) {
handle = qdisc_alloc_handle(dev);
err = -ENOMEM;
if (handle == 0)
goto err_out3;
}
- if (handle == TC_H_INGRESS)
- sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
- else
- sch->handle = handle;
+ sch->handle = handle;
if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
+#ifdef CONFIG_NET_ESTIMATOR
+ if (tca[TCA_RATE-1]) {
+ err = gen_new_estimator(&sch->bstats, &sch->rate_est,
+ sch->stats_lock,
+ tca[TCA_RATE-1]);
+ if (err) {
+ /*
+ * Any broken qdiscs that would require
+ * a ops->reset() here? The qdisc was never
+ * in action so it shouldn't be necessary.
+ */
+ if (ops->destroy)
+ ops->destroy(sch);
+ goto err_out3;
+ }
+ }
+#endif
qdisc_lock_tree(dev);
list_add_tail(&sch->list, &dev->qdisc_list);
qdisc_unlock_tree(dev);
-#ifdef CONFIG_NET_ESTIMATOR
- if (tca[TCA_RATE-1])
- gen_new_estimator(&sch->bstats, &sch->rate_est,
- sch->stats_lock, tca[TCA_RATE-1]);
-#endif
return sch;
}
err_out3:
dev_put(dev);
+ kfree((char *) sch - sch->padded);
err_out2:
module_put(ops->owner);
err_out:
*errp = err;
- if (p)
- kfree(p);
return NULL;
}
@@ -760,17 +749,18 @@ graft:
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
- u32 pid, u32 seq, unsigned flags, int event)
+ u32 pid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
struct gnet_dump d;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
- nlh->nlmsg_flags = flags;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = q->dev->ifindex;
tcm->tcm_parent = clid;
tcm->tcm_handle = q->handle;
@@ -826,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
}
if (skb->len)
- return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
err_out:
kfree_skb(skb);
@@ -997,7 +987,7 @@ out:
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
unsigned long cl,
- u32 pid, u32 seq, unsigned flags, int event)
+ u32 pid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -1005,8 +995,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
struct gnet_dump d;
struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
- nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
- nlh->nlmsg_flags = flags;
+ nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm_ifindex = q->dev->ifindex;
@@ -1051,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
return -EINVAL;
}
- return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+ return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
struct qdisc_dump_args
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 0000000..81f0b83
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
+/*
+ * net/sched/sch_blackhole.c Black hole queue
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ *
+ * Note: Quantum tunneling is not supported.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ qdisc_drop(skb, sch);
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
+{
+ return NULL;
+}
+
+static struct Qdisc_ops blackhole_qdisc_ops = {
+ .id = "blackhole",
+ .priv_size = 0,
+ .enqueue = blackhole_enqueue,
+ .dequeue = blackhole_dequeue,
+ .owner = THIS_MODULE,
+};
+
+static int __init blackhole_module_init(void)
+{
+ return register_qdisc(&blackhole_qdisc_ops);
+}
+
+static void __exit blackhole_module_exit(void)
+{
+ unregister_qdisc(&blackhole_qdisc_ops);
+}
+
+module_init(blackhole_module_init)
+module_exit(blackhole_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8..09453f9 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
opt.strategy = cl->ovl_strategy;
opt.priority2 = cl->priority2+1;
+ opt.pad = 0;
opt.penalty = (cl->penalty*1000)/HZ;
RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
if (cl->police) {
opt.police = cl->police;
+ opt.__res1 = 0;
+ opt.__res2 = 0;
RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
}
return skb->len;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index d8bd2a5..13e0e7b 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -31,7 +31,7 @@
#endif
-#define PRIV(sch) qdisc_priv(sch)
+#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch))
/*
@@ -55,24 +55,38 @@
struct dsmark_qdisc_data {
struct Qdisc *q;
struct tcf_proto *filter_list;
- __u8 *mask; /* "owns" the array */
- __u8 *value;
- __u16 indices;
- __u32 default_index; /* index range is 0...0xffff */
+ u8 *mask; /* "owns" the array */
+ u8 *value;
+ u16 indices;
+ u32 default_index; /* index range is 0...0xffff */
int set_tc_index;
};
+static inline int dsmark_valid_indices(u16 indices)
+{
+ while (indices != 1) {
+ if (indices & 1)
+ return 0;
+ indices >>= 1;
+ }
+
+ return 1;
+}
-/* ------------------------- Class/flow operations ------------------------- */
+static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
+{
+ return (index <= p->indices && index > 0);
+}
+/* ------------------------- Class/flow operations ------------------------- */
-static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
- struct Qdisc *new,struct Qdisc **old)
+static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old)
{
struct dsmark_qdisc_data *p = PRIV(sch);
- DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new,
- old);
+ DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
+ sch, p, new, old);
if (new == NULL) {
new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
@@ -81,91 +95,95 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
}
sch_tree_lock(sch);
- *old = xchg(&p->q,new);
- if (*old)
- qdisc_reset(*old);
+ *old = xchg(&p->q, new);
+ qdisc_reset(*old);
sch->q.qlen = 0;
- sch_tree_unlock(sch); /* @@@ move up ? */
+ sch_tree_unlock(sch);
+
return 0;
}
-
static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
{
- struct dsmark_qdisc_data *p = PRIV(sch);
-
- return p->q;
+ return PRIV(sch)->q;
}
-
-static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
+static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
{
- struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch);
+ DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
+ sch, PRIV(sch), classid);
- DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
- return TC_H_MIN(classid)+1;
+ return TC_H_MIN(classid) + 1;
}
-
static unsigned long dsmark_bind_filter(struct Qdisc *sch,
- unsigned long parent, u32 classid)
+ unsigned long parent, u32 classid)
{
- return dsmark_get(sch,classid);
+ return dsmark_get(sch, classid);
}
-
static void dsmark_put(struct Qdisc *sch, unsigned long cl)
{
}
-
static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
- struct rtattr **tca, unsigned long *arg)
+ struct rtattr **tca, unsigned long *arg)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct rtattr *opt = tca[TCA_OPTIONS-1];
struct rtattr *tb[TCA_DSMARK_MAX];
+ int err = -EINVAL;
+ u8 mask = 0;
DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
- "arg 0x%lx\n",sch,p,classid,parent,*arg);
- if (*arg > p->indices)
- return -ENOENT;
- if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
- return -EINVAL;
- if (tb[TCA_DSMARK_MASK-1]) {
- if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
- return -EINVAL;
- p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
- }
- if (tb[TCA_DSMARK_VALUE-1]) {
- if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
- return -EINVAL;
- p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
+ "arg 0x%lx\n", sch, p, classid, parent, *arg);
+
+ if (!dsmark_valid_index(p, *arg)) {
+ err = -ENOENT;
+ goto rtattr_failure;
}
- return 0;
-}
+ if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
+ goto rtattr_failure;
+
+ if (tb[TCA_DSMARK_MASK-1])
+ mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]);
+
+ if (tb[TCA_DSMARK_VALUE-1])
+ p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]);
+
+ if (tb[TCA_DSMARK_MASK-1])
+ p->mask[*arg-1] = mask;
+
+ err = 0;
-static int dsmark_delete(struct Qdisc *sch,unsigned long arg)
+rtattr_failure:
+ return err;
+}
+
+static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
{
struct dsmark_qdisc_data *p = PRIV(sch);
- if (!arg || arg > p->indices)
+ if (!dsmark_valid_index(p, arg))
return -EINVAL;
+
p->mask[arg-1] = 0xff;
p->value[arg-1] = 0;
+
return 0;
}
-
static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
{
struct dsmark_qdisc_data *p = PRIV(sch);
int i;
- DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
+ DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+
if (walker->stop)
return;
+
for (i = 0; i < p->indices; i++) {
if (p->mask[i] == 0xff && !p->value[i])
goto ignore;
@@ -180,26 +198,20 @@ ignore:
}
}
-
static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
{
- struct dsmark_qdisc_data *p = PRIV(sch);
-
- return &p->filter_list;
+ return &PRIV(sch)->filter_list;
}
-
/* --------------------------- Qdisc operations ---------------------------- */
-
static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
- struct tcf_result res;
- int result;
- int ret = NET_XMIT_POLICED;
+ int err;
+
+ D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
- D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
if (p->set_tc_index) {
/* FIXME: Safe with non-linear skbs? --RR */
switch (skb->protocol) {
@@ -216,17 +228,21 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
break;
};
}
- result = TC_POLICE_OK; /* be nice to gcc */
- if (TC_H_MAJ(skb->priority) == sch->handle) {
+
+ if (TC_H_MAJ(skb->priority) == sch->handle)
skb->tc_index = TC_H_MIN(skb->priority);
- } else {
- result = tc_classify(skb,p->filter_list,&res);
- D2PRINTK("result %d class 0x%04x\n",result,res.classid);
+ else {
+ struct tcf_result res;
+ int result = tc_classify(skb, p->filter_list, &res);
+
+ D2PRINTK("result %d class 0x%04x\n", result, res.classid);
+
switch (result) {
#ifdef CONFIG_NET_CLS_POLICE
case TC_POLICE_SHOT:
kfree_skb(skb);
- break;
+ sch->qstats.drops++;
+ return NET_XMIT_POLICED;
#if 0
case TC_POLICE_RECLASSIFY:
/* FIXME: what to do here ??? */
@@ -243,43 +259,45 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
break;
};
}
- if (
-#ifdef CONFIG_NET_CLS_POLICE
- result == TC_POLICE_SHOT ||
-#endif
- ((ret = p->q->enqueue(skb,p->q)) != 0)) {
+ err = p->q->enqueue(skb,p->q);
+ if (err != NET_XMIT_SUCCESS) {
sch->qstats.drops++;
- return ret;
+ return err;
}
+
sch->bstats.bytes += skb->len;
sch->bstats.packets++;
sch->q.qlen++;
- return ret;
-}
+ return NET_XMIT_SUCCESS;
+}
static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct sk_buff *skb;
- int index;
+ u32 index;
+
+ D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
- D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
skb = p->q->ops->dequeue(p->q);
- if (!skb)
+ if (skb == NULL)
return NULL;
+
sch->q.qlen--;
- index = skb->tc_index & (p->indices-1);
- D2PRINTK("index %d->%d\n",skb->tc_index,index);
+
+ index = skb->tc_index & (p->indices - 1);
+ D2PRINTK("index %d->%d\n", skb->tc_index, index);
+
switch (skb->protocol) {
case __constant_htons(ETH_P_IP):
- ipv4_change_dsfield(skb->nh.iph,
- p->mask[index],p->value[index]);
+ ipv4_change_dsfield(skb->nh.iph, p->mask[index],
+ p->value[index]);
break;
case __constant_htons(ETH_P_IPV6):
- ipv6_change_dsfield(skb->nh.ipv6h,
- p->mask[index],p->value[index]);
+ ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index],
+ p->value[index]);
break;
default:
/*
@@ -293,152 +311,162 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
htons(skb->protocol));
break;
};
+
return skb;
}
-
static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
{
- int ret;
struct dsmark_qdisc_data *p = PRIV(sch);
+ int err;
- D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
- if ((ret = p->q->ops->requeue(skb, p->q)) == 0) {
- sch->q.qlen++;
- sch->qstats.requeues++;
- return 0;
+ D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+
+ err = p->q->ops->requeue(skb, p->q);
+ if (err != NET_XMIT_SUCCESS) {
+ sch->qstats.drops++;
+ return err;
}
- sch->qstats.drops++;
- return ret;
-}
+ sch->q.qlen++;
+ sch->qstats.requeues++;
+
+ return NET_XMIT_SUCCESS;
+}
static unsigned int dsmark_drop(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
unsigned int len;
- DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
- if (!p->q->ops->drop)
- return 0;
- if (!(len = p->q->ops->drop(p->q)))
+ DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+
+ if (p->q->ops->drop == NULL)
return 0;
- sch->q.qlen--;
+
+ len = p->q->ops->drop(p->q);
+ if (len)
+ sch->q.qlen--;
+
return len;
}
-
-static int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
+static int dsmark_init(struct Qdisc *sch, struct rtattr *opt)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct rtattr *tb[TCA_DSMARK_MAX];
- __u16 tmp;
-
- DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
- if (!opt ||
- rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 ||
- !tb[TCA_DSMARK_INDICES-1] ||
- RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16))
- return -EINVAL;
- p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]);
- if (!p->indices)
- return -EINVAL;
- for (tmp = p->indices; tmp != 1; tmp >>= 1) {
- if (tmp & 1)
- return -EINVAL;
- }
- p->default_index = NO_DEFAULT_INDEX;
- if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) {
- if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16))
- return -EINVAL;
- p->default_index =
- *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
+ int err = -EINVAL;
+ u32 default_index = NO_DEFAULT_INDEX;
+ u16 indices;
+ u8 *mask;
+
+ DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+
+ if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0)
+ goto errout;
+
+ indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]);
+ if (!indices || !dsmark_valid_indices(indices))
+ goto errout;
+
+ if (tb[TCA_DSMARK_DEFAULT_INDEX-1])
+ default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
+
+ mask = kmalloc(indices * 2, GFP_KERNEL);
+ if (mask == NULL) {
+ err = -ENOMEM;
+ goto errout;
}
- p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1];
- p->mask = kmalloc(p->indices*2,GFP_KERNEL);
- if (!p->mask)
- return -ENOMEM;
- p->value = p->mask+p->indices;
- memset(p->mask,0xff,p->indices);
- memset(p->value,0,p->indices);
- if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
+
+ p->mask = mask;
+ memset(p->mask, 0xff, indices);
+
+ p->value = p->mask + indices;
+ memset(p->value, 0, indices);
+
+ p->indices = indices;
+ p->default_index = default_index;
+ p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]);
+
+ p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+ if (p->q == NULL)
p->q = &noop_qdisc;
- DPRINTK("dsmark_init: qdisc %p\n",&p->q);
- return 0;
-}
+ DPRINTK("dsmark_init: qdisc %p\n", p->q);
+
+ err = 0;
+errout:
+rtattr_failure:
+ return err;
+}
static void dsmark_reset(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
- DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
+ DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
qdisc_reset(p->q);
sch->q.qlen = 0;
}
-
static void dsmark_destroy(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = PRIV(sch);
struct tcf_proto *tp;
- DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p);
+ DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
+
while (p->filter_list) {
tp = p->filter_list;
p->filter_list = tp->next;
tcf_destroy(tp);
}
+
qdisc_destroy(p->q);
kfree(p->mask);
}
-
static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
+ struct sk_buff *skb, struct tcmsg *tcm)
{
struct dsmark_qdisc_data *p = PRIV(sch);
- unsigned char *b = skb->tail;
- struct rtattr *rta;
+ struct rtattr *opts = NULL;
+
+ DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);
- DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl);
- if (!cl || cl > p->indices)
+ if (!dsmark_valid_index(p, cl))
return -EINVAL;
- tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1);
- rta = (struct rtattr *) b;
- RTA_PUT(skb,TCA_OPTIONS,0,NULL);
- RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]);
- RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]);
- rta->rta_len = skb->tail-b;
- return skb->len;
+
+ tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
+
+ opts = RTA_NEST(skb, TCA_OPTIONS);
+ RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]);
+ RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]);
+
+ return RTA_NEST_END(skb, opts);
rtattr_failure:
- skb_trim(skb,b-skb->data);
- return -1;
+ return RTA_NEST_CANCEL(skb, opts);
}
static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct dsmark_qdisc_data *p = PRIV(sch);
- unsigned char *b = skb->tail;
- struct rtattr *rta;
+ struct rtattr *opts = NULL;
- rta = (struct rtattr *) b;
- RTA_PUT(skb,TCA_OPTIONS,0,NULL);
- RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices);
- if (p->default_index != NO_DEFAULT_INDEX) {
- __u16 tmp = p->default_index;
+ opts = RTA_NEST(skb, TCA_OPTIONS);
+ RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
+
+ if (p->default_index != NO_DEFAULT_INDEX)
+ RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
- RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
- }
if (p->set_tc_index)
- RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL);
- rta->rta_len = skb->tail-b;
- return skb->len;
+ RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
+
+ return RTA_NEST_END(skb, opts);
rtattr_failure:
- skb_trim(skb,b-skb->data);
- return -1;
+ return RTA_NEST_CANCEL(skb, opts);
}
static struct Qdisc_class_ops dsmark_class_ops = {
@@ -476,10 +504,13 @@ static int __init dsmark_module_init(void)
{
return register_qdisc(&dsmark_qdisc_ops);
}
+
static void __exit dsmark_module_exit(void)
{
unregister_qdisc(&dsmark_qdisc_ops);
}
+
module_init(dsmark_module_init)
module_exit(dsmark_module_exit)
+
MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 4888305..033083b 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -11,131 +11,38 @@
#include <linux/config.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/notifier.h>
-#include <net/ip.h>
-#include <net/route.h>
#include <linux/skbuff.h>
-#include <net/sock.h>
#include <net/pkt_sched.h>
/* 1 band FIFO pseudo-"scheduler" */
struct fifo_sched_data
{
- unsigned limit;
+ u32 limit;
};
-static int
-bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct fifo_sched_data *q = qdisc_priv(sch);
- if (sch->qstats.backlog + skb->len <= q->limit) {
- __skb_queue_tail(&sch->q, skb);
- sch->qstats.backlog += skb->len;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
- return 0;
- }
- sch->qstats.drops++;
-#ifdef CONFIG_NET_CLS_POLICE
- if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
-#endif
- kfree_skb(skb);
- return NET_XMIT_DROP;
-}
-
-static int
-bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
-{
- __skb_queue_head(&sch->q, skb);
- sch->qstats.backlog += skb->len;
- sch->qstats.requeues++;
- return 0;
-}
-
-static struct sk_buff *
-bfifo_dequeue(struct Qdisc* sch)
-{
- struct sk_buff *skb;
+ if (likely(sch->qstats.backlog + skb->len <= q->limit))
+ return qdisc_enqueue_tail(skb, sch);
- skb = __skb_dequeue(&sch->q);
- if (skb)
- sch->qstats.backlog -= skb->len;
- return skb;
+ return qdisc_reshape_fail(skb, sch);
}
-static unsigned int
-fifo_drop(struct Qdisc* sch)
-{
- struct sk_buff *skb;
-
- skb = __skb_dequeue_tail(&sch->q);
- if (skb) {
- unsigned int len = skb->len;
- sch->qstats.backlog -= len;
- kfree_skb(skb);
- return len;
- }
- return 0;
-}
-
-static void
-fifo_reset(struct Qdisc* sch)
-{
- skb_queue_purge(&sch->q);
- sch->qstats.backlog = 0;
-}
-
-static int
-pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct fifo_sched_data *q = qdisc_priv(sch);
- if (sch->q.qlen < q->limit) {
- __skb_queue_tail(&sch->q, skb);
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
- return 0;
- }
- sch->qstats.drops++;
-#ifdef CONFIG_NET_CLS_POLICE
- if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
-#endif
- kfree_skb(skb);
- return NET_XMIT_DROP;
-}
-
-static int
-pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
-{
- __skb_queue_head(&sch->q, skb);
- sch->qstats.requeues++;
- return 0;
-}
-
+ if (likely(skb_queue_len(&sch->q) < q->limit))
+ return qdisc_enqueue_tail(skb, sch);
-static struct sk_buff *
-pfifo_dequeue(struct Qdisc* sch)
-{
- return __skb_dequeue(&sch->q);
+ return qdisc_reshape_fail(skb, sch);
}
static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
@@ -143,66 +50,59 @@ static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
struct fifo_sched_data *q = qdisc_priv(sch);
if (opt == NULL) {
- unsigned int limit = sch->dev->tx_queue_len ? : 1;
+ u32 limit = sch->dev->tx_queue_len ? : 1;
if (sch->ops == &bfifo_qdisc_ops)
- q->limit = limit*sch->dev->mtu;
- else
- q->limit = limit;
+ limit *= sch->dev->mtu;
+
+ q->limit = limit;
} else {
struct tc_fifo_qopt *ctl = RTA_DATA(opt);
- if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+
+ if (RTA_PAYLOAD(opt) < sizeof(*ctl))
return -EINVAL;
+
q->limit = ctl->limit;
}
+
return 0;
}
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fifo_sched_data *q = qdisc_priv(sch);
- unsigned char *b = skb->tail;
- struct tc_fifo_qopt opt;
+ struct tc_fifo_qopt opt = { .limit = q->limit };
- opt.limit = q->limit;
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
-
return skb->len;
rtattr_failure:
- skb_trim(skb, b - skb->data);
return -1;
}
struct Qdisc_ops pfifo_qdisc_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "pfifo",
.priv_size = sizeof(struct fifo_sched_data),
.enqueue = pfifo_enqueue,
- .dequeue = pfifo_dequeue,
- .requeue = pfifo_requeue,
- .drop = fifo_drop,
+ .dequeue = qdisc_dequeue_head,
+ .requeue = qdisc_requeue,
+ .drop = qdisc_queue_drop,
.init = fifo_init,
- .reset = fifo_reset,
- .destroy = NULL,
+ .reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
.owner = THIS_MODULE,
};
struct Qdisc_ops bfifo_qdisc_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "bfifo",
.priv_size = sizeof(struct fifo_sched_data),
.enqueue = bfifo_enqueue,
- .dequeue = bfifo_dequeue,
- .requeue = bfifo_requeue,
- .drop = fifo_drop,
+ .dequeue = qdisc_dequeue_head,
+ .requeue = qdisc_requeue,
+ .drop = qdisc_queue_drop,
.init = fifo_init,
- .reset = fifo_reset,
- .destroy = NULL,
+ .reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
.owner = THIS_MODULE,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 87e48a4..99ceb91 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -238,36 +238,46 @@ static void dev_watchdog_down(struct net_device *dev)
spin_unlock_bh(&dev->xmit_lock);
}
+void netif_carrier_on(struct net_device *dev)
+{
+ if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
+ linkwatch_fire_event(dev);
+ if (netif_running(dev))
+ __netdev_watchdog_up(dev);
+}
+
+void netif_carrier_off(struct net_device *dev)
+{
+ if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
+ linkwatch_fire_event(dev);
+}
+
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
under all circumstances. It is difficult to invent anything faster or
cheaper.
*/
-static int
-noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
kfree_skb(skb);
return NET_XMIT_CN;
}
-static struct sk_buff *
-noop_dequeue(struct Qdisc * qdisc)
+static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
return NULL;
}
-static int
-noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
if (net_ratelimit())
- printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
+ printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
+ skb->dev->name);
kfree_skb(skb);
return NET_XMIT_CN;
}
struct Qdisc_ops noop_qdisc_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "noop",
.priv_size = 0,
.enqueue = noop_enqueue,
@@ -285,8 +295,6 @@ struct Qdisc noop_qdisc = {
};
static struct Qdisc_ops noqueue_qdisc_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "noqueue",
.priv_size = 0,
.enqueue = noop_enqueue,
@@ -311,97 +319,86 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
generic prio+fifo combination.
*/
-static int
-pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+#define PFIFO_FAST_BANDS 3
+
+static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
+ struct Qdisc *qdisc)
{
struct sk_buff_head *list = qdisc_priv(qdisc);
+ return list + prio2band[skb->priority & TC_PRIO_MAX];
+}
- list += prio2band[skb->priority&TC_PRIO_MAX];
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+{
+ struct sk_buff_head *list = prio2list(skb, qdisc);
- if (list->qlen < qdisc->dev->tx_queue_len) {
- __skb_queue_tail(list, skb);
+ if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
qdisc->q.qlen++;
- qdisc->bstats.bytes += skb->len;
- qdisc->bstats.packets++;
- return 0;
+ return __qdisc_enqueue_tail(skb, qdisc, list);
}
- qdisc->qstats.drops++;
- kfree_skb(skb);
- return NET_XMIT_DROP;
+
+ return qdisc_drop(skb, qdisc);
}
-static struct sk_buff *
-pfifo_fast_dequeue(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
int prio;
struct sk_buff_head *list = qdisc_priv(qdisc);
- struct sk_buff *skb;
- for (prio = 0; prio < 3; prio++, list++) {
- skb = __skb_dequeue(list);
- if (skb) {
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ if (!skb_queue_empty(list + prio)) {
qdisc->q.qlen--;
- return skb;
+ return __qdisc_dequeue_head(qdisc, list + prio);
}
}
+
return NULL;
}
-static int
-pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
- struct sk_buff_head *list = qdisc_priv(qdisc);
-
- list += prio2band[skb->priority&TC_PRIO_MAX];
-
- __skb_queue_head(list, skb);
qdisc->q.qlen++;
- qdisc->qstats.requeues++;
- return 0;
+ return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}
-static void
-pfifo_fast_reset(struct Qdisc* qdisc)
+static void pfifo_fast_reset(struct Qdisc* qdisc)
{
int prio;
struct sk_buff_head *list = qdisc_priv(qdisc);
- for (prio=0; prio < 3; prio++)
- skb_queue_purge(list+prio);
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+ __qdisc_reset_queue(qdisc, list + prio);
+
+ qdisc->qstats.backlog = 0;
qdisc->q.qlen = 0;
}
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
- unsigned char *b = skb->tail;
- struct tc_prio_qopt opt;
+ struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- opt.bands = 3;
memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
rtattr_failure:
- skb_trim(skb, b - skb->data);
return -1;
}
static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
- int i;
+ int prio;
struct sk_buff_head *list = qdisc_priv(qdisc);
- for (i=0; i<3; i++)
- skb_queue_head_init(list+i);
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+ skb_queue_head_init(list + prio);
return 0;
}
static struct Qdisc_ops pfifo_fast_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "pfifo_fast",
- .priv_size = 3 * sizeof(struct sk_buff_head),
+ .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
.enqueue = pfifo_fast_enqueue,
.dequeue = pfifo_fast_dequeue,
.requeue = pfifo_fast_requeue,
@@ -411,24 +408,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
.owner = THIS_MODULE,
};
-struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
+struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
- int size;
+ unsigned int size;
+ int err = -ENOBUFS;
/* ensure that the Qdisc and the private data are 32-byte aligned */
- size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
- size += ops->priv_size + QDISC_ALIGN_CONST;
+ size = QDISC_ALIGN(sizeof(*sch));
+ size += ops->priv_size + (QDISC_ALIGNTO - 1);
p = kmalloc(size, GFP_KERNEL);
if (!p)
- return NULL;
+ goto errout;
memset(p, 0, size);
-
- sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
- & ~QDISC_ALIGN_CONST);
- sch->padded = (char *)sch - (char *)p;
+ sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
+ sch->padded = (char *) sch - (char *) p;
INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);
@@ -439,11 +435,25 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
dev_hold(dev);
sch->stats_lock = &dev->queue_lock;
atomic_set(&sch->refcnt, 1);
+
+ return sch;
+errout:
+ return ERR_PTR(-err);
+}
+
+struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
+{
+ struct Qdisc *sch;
+
+ sch = qdisc_alloc(dev, ops);
+ if (IS_ERR(sch))
+ goto errout;
+
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
- dev_put(dev);
- kfree(p);
+ qdisc_destroy(sch);
+errout:
return NULL;
}
@@ -604,9 +614,12 @@ void dev_shutdown(struct net_device *dev)
}
EXPORT_SYMBOL(__netdev_watchdog_up);
+EXPORT_SYMBOL(netif_carrier_on);
+EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
+EXPORT_SYMBOL(qdisc_alloc);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 664d0e4..7845d04 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt)
memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
q->qcount = -1;
- if (skb_queue_len(&sch->q) == 0)
+ if (skb_queue_empty(&sch->q))
PSCHED_SET_PASTPERFECT(q->qidlestart);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/simple.c b/net/sched/simple.c
index 3ab4c67..8a6ae4f 100644
--- a/net/sched/simple.c
+++ b/net/sched/simple.c
@@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock);
#include <net/pkt_act.h>
#include <net/act_generic.h>
-static int tcf_simp(struct sk_buff **pskb, struct tc_action *a)
+static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
{
struct sk_buff *skb = *pskb;
struct tcf_defact *p = PRIV(a, defact);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 663843d..5b24ae0 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
const struct sctp_endpoint *ep,
const struct sock *sk,
sctp_scope_t scope,
- int gfp)
+ unsigned int __nocast gfp)
{
struct sctp_sock *sp;
int i;
@@ -191,10 +191,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
asoc->last_cwr_tsn = asoc->ctsn_ack_point;
asoc->unack_data = 0;
- SCTP_DEBUG_PRINTK("myctsnap for %s INIT as 0x%x.\n",
- asoc->ep->debug_name,
- asoc->ctsn_ack_point);
-
/* ADDIP Section 4.1 Asconf Chunk Procedures
*
* When an endpoint has an ASCONF signaled change to be sent to the
@@ -207,10 +203,11 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
*/
asoc->addip_serial = asoc->c.initial_tsn;
- skb_queue_head_init(&asoc->addip_chunks);
+ INIT_LIST_HEAD(&asoc->addip_chunk_list);
/* Make an empty list of remote transport addresses. */
INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
+ asoc->peer.transport_count = 0;
/* RFC 2960 5.1 Normal Establishment of an Association
*
@@ -275,7 +272,8 @@ fail_init:
/* Allocate and initialize a new association */
struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
const struct sock *sk,
- sctp_scope_t scope, int gfp)
+ sctp_scope_t scope,
+ unsigned int __nocast gfp)
{
struct sctp_association *asoc;
@@ -288,6 +286,7 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
asoc->base.malloced = 1;
SCTP_DBG_OBJCNT_INC(assoc);
+ SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc);
return asoc;
@@ -356,6 +355,8 @@ void sctp_association_free(struct sctp_association *asoc)
sctp_transport_free(transport);
}
+ asoc->peer.transport_count = 0;
+
/* Free any cached ASCONF_ACK chunk. */
if (asoc->addip_last_asconf_ack)
sctp_chunk_free(asoc->addip_last_asconf_ack);
@@ -400,7 +401,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
/* If the primary path is changing, assume that the
* user wants to use this new path.
*/
- if (transport->active)
+ if (transport->state != SCTP_INACTIVE)
asoc->peer.active_path = transport;
/*
@@ -428,10 +429,58 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
transport->cacc.next_tsn_at_change = asoc->next_tsn;
}
+/* Remove a transport from an association. */
+void sctp_assoc_rm_peer(struct sctp_association *asoc,
+ struct sctp_transport *peer)
+{
+ struct list_head *pos;
+ struct sctp_transport *transport;
+
+ SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ",
+ " port: %d\n",
+ asoc,
+ (&peer->ipaddr),
+ peer->ipaddr.v4.sin_port);
+
+ /* If we are to remove the current retran_path, update it
+ * to the next peer before removing this peer from the list.
+ */
+ if (asoc->peer.retran_path == peer)
+ sctp_assoc_update_retran_path(asoc);
+
+ /* Remove this peer from the list. */
+ list_del(&peer->transports);
+
+ /* Get the first transport of asoc. */
+ pos = asoc->peer.transport_addr_list.next;
+ transport = list_entry(pos, struct sctp_transport, transports);
+
+ /* Update any entries that match the peer to be deleted. */
+ if (asoc->peer.primary_path == peer)
+ sctp_assoc_set_primary(asoc, transport);
+ if (asoc->peer.active_path == peer)
+ asoc->peer.active_path = transport;
+ if (asoc->peer.last_data_from == peer)
+ asoc->peer.last_data_from = transport;
+
+ /* If we remove the transport an INIT was last sent to, set it to
+ * NULL. Combined with the update of the retran path above, this
+ * will cause the next INIT to be sent to the next available
+ * transport, maintaining the cycle.
+ */
+ if (asoc->init_last_sent_to == peer)
+ asoc->init_last_sent_to = NULL;
+
+ asoc->peer.transport_count--;
+
+ sctp_transport_free(peer);
+}
+
/* Add a transport address to an association. */
struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
const union sctp_addr *addr,
- int gfp)
+ const unsigned int __nocast gfp,
+ const int peer_state)
{
struct sctp_transport *peer;
struct sctp_sock *sp;
@@ -442,14 +491,25 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* AF_INET and AF_INET6 share common port field. */
port = addr->v4.sin_port;
+ SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ",
+ " port: %d state:%s\n",
+ asoc,
+ addr,
+ addr->v4.sin_port,
+ peer_state == SCTP_UNKNOWN?"UNKNOWN":"ACTIVE");
+
/* Set the port if it has not been set yet. */
if (0 == asoc->peer.port)
asoc->peer.port = port;
/* Check to see if this is a duplicate. */
peer = sctp_assoc_lookup_paddr(asoc, addr);
- if (peer)
+ if (peer) {
+ if (peer_state == SCTP_ACTIVE &&
+ peer->state == SCTP_UNKNOWN)
+ peer->state = SCTP_ACTIVE;
return peer;
+ }
peer = sctp_transport_new(addr, gfp);
if (!peer)
@@ -516,8 +576,12 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* Set the transport's RTO.initial value */
peer->rto = asoc->rto_initial;
+ /* Set the peer's active state. */
+ peer->state = peer_state;
+
/* Attach the remote transport to our asoc. */
list_add_tail(&peer->transports, &asoc->peer.transport_addr_list);
+ asoc->peer.transport_count++;
/* If we do not yet have a primary path, set one. */
if (!asoc->peer.primary_path) {
@@ -525,8 +589,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
asoc->peer.retran_path = peer;
}
- if (asoc->peer.active_path == asoc->peer.retran_path)
+ if (asoc->peer.active_path == asoc->peer.retran_path) {
asoc->peer.retran_path = peer;
+ }
return peer;
}
@@ -537,37 +602,16 @@ void sctp_assoc_del_peer(struct sctp_association *asoc,
{
struct list_head *pos;
struct list_head *temp;
- struct sctp_transport *peer = NULL;
struct sctp_transport *transport;
list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
transport = list_entry(pos, struct sctp_transport, transports);
if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
- peer = transport;
- list_del(pos);
+ /* Do book keeping for removing the peer and free it. */
+ sctp_assoc_rm_peer(asoc, transport);
break;
}
}
-
- /* The address we want delete is not in the association. */
- if (!peer)
- return;
-
- /* Get the first transport of asoc. */
- pos = asoc->peer.transport_addr_list.next;
- transport = list_entry(pos, struct sctp_transport, transports);
-
- /* Update any entries that match the peer to be deleted. */
- if (asoc->peer.primary_path == peer)
- sctp_assoc_set_primary(asoc, transport);
- if (asoc->peer.active_path == peer)
- asoc->peer.active_path = transport;
- if (asoc->peer.retran_path == peer)
- asoc->peer.retran_path = transport;
- if (asoc->peer.last_data_from == peer)
- asoc->peer.last_data_from = transport;
-
- sctp_transport_free(peer);
}
/* Lookup a transport by address. */
@@ -608,12 +652,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
/* Record the transition on the transport. */
switch (command) {
case SCTP_TRANSPORT_UP:
- transport->active = SCTP_ACTIVE;
+ transport->state = SCTP_ACTIVE;
spc_state = SCTP_ADDR_AVAILABLE;
break;
case SCTP_TRANSPORT_DOWN:
- transport->active = SCTP_INACTIVE;
+ transport->state = SCTP_INACTIVE;
spc_state = SCTP_ADDR_UNREACHABLE;
break;
@@ -643,7 +687,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
list_for_each(pos, &asoc->peer.transport_addr_list) {
t = list_entry(pos, struct sctp_transport, transports);
- if (!t->active)
+ if (t->state == SCTP_INACTIVE)
continue;
if (!first || t->last_time_heard > first->last_time_heard) {
second = first;
@@ -663,7 +707,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
* [If the primary is active but not most recent, bump the most
* recently used transport.]
*/
- if (asoc->peer.primary_path->active &&
+ if (asoc->peer.primary_path->state != SCTP_INACTIVE &&
first != asoc->peer.primary_path) {
second = first;
first = asoc->peer.primary_path;
@@ -958,7 +1002,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
transports);
if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
sctp_assoc_add_peer(asoc, &trans->ipaddr,
- GFP_ATOMIC);
+ GFP_ATOMIC, SCTP_ACTIVE);
}
asoc->ctsn_ack_point = asoc->next_tsn - 1;
@@ -998,7 +1042,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
/* Try to find an active transport. */
- if (t->active) {
+ if (t->state != SCTP_INACTIVE) {
break;
} else {
/* Keep track of the next transport in case
@@ -1019,6 +1063,40 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
}
asoc->peer.retran_path = t;
+
+ SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
+ " %p addr: ",
+ " port: %d\n",
+ asoc,
+ (&t->ipaddr),
+ t->ipaddr.v4.sin_port);
+}
+
+/* Choose the transport for sending a INIT packet. */
+struct sctp_transport *sctp_assoc_choose_init_transport(
+ struct sctp_association *asoc)
+{
+ struct sctp_transport *t;
+
+ /* Use the retran path. If the last INIT was sent over the
+ * retran path, update the retran path and use it.
+ */
+ if (!asoc->init_last_sent_to) {
+ t = asoc->peer.active_path;
+ } else {
+ if (asoc->init_last_sent_to == asoc->peer.retran_path)
+ sctp_assoc_update_retran_path(asoc);
+ t = asoc->peer.retran_path;
+ }
+
+ SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
+ " %p addr: ",
+ " port: %d\n",
+ asoc,
+ (&t->ipaddr),
+ t->ipaddr.v4.sin_port);
+
+ return t;
}
/* Choose the transport for sending a SHUTDOWN packet. */
@@ -1152,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
/* Build the bind address list for the association based on info from the
* local endpoint and the remote peer.
*/
-int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
+int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
+ unsigned int __nocast gfp)
{
sctp_scope_t scope;
int flags;
@@ -1174,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
/* Build the association's bind address list from the cookie. */
int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
- struct sctp_cookie *cookie, int gfp)
+ struct sctp_cookie *cookie,
+ unsigned int __nocast gfp)
{
int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f90eadf..f715497 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,8 @@
/* Forward declarations for internal helpers. */
static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
- sctp_scope_t scope, int gfp, int flags);
+ sctp_scope_t scope, unsigned int __nocast gfp,
+ int flags);
static void sctp_bind_addr_clean(struct sctp_bind_addr *);
/* First Level Abstractions. */
@@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
*/
int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
const struct sctp_bind_addr *src,
- sctp_scope_t scope, int gfp, int flags)
+ sctp_scope_t scope, unsigned int __nocast gfp,
+ int flags)
{
struct sctp_sockaddr_entry *addr;
struct list_head *pos;
@@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
/* Add an address to the bind address list in the SCTP_bind_addr structure. */
int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
- int gfp)
+ unsigned int __nocast gfp)
{
struct sctp_sockaddr_entry *addr;
@@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
* The second argument is the return value for the length.
*/
union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
- int *addrs_len, int gfp)
+ int *addrs_len,
+ unsigned int __nocast gfp)
{
union sctp_params addrparms;
union sctp_params retval;
@@ -249,7 +252,7 @@ end_raw:
* address parameters).
*/
int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
- int addrs_len, __u16 port, int gfp)
+ int addrs_len, __u16 port, unsigned int __nocast gfp)
{
union sctp_addr_param *rawaddr;
struct sctp_paramhdr *param;
@@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
/* Copy out addresses from the global local address list. */
static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
union sctp_addr *addr,
- sctp_scope_t scope, int gfp, int flags)
+ sctp_scope_t scope, unsigned int __nocast gfp,
+ int flags)
{
int error = 0;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0c2ab78..61da293 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
}
/* Allocate and initialize datamsg. */
-SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp)
+SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
{
struct sctp_datamsg *msg;
msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 334f617..e22ccd6 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
* Initialize the base fields of the endpoint structure.
*/
static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
- struct sock *sk, int gfp)
+ struct sock *sk,
+ unsigned int __nocast gfp)
{
struct sctp_sock *sp = sctp_sk(sk);
memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -102,9 +103,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Set up the base timeout information. */
ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
- SCTP_DEFAULT_TIMEOUT_T1_COOKIE;
+ msecs_to_jiffies(sp->rtoinfo.srto_initial);
ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
- SCTP_DEFAULT_TIMEOUT_T1_INIT;
+ msecs_to_jiffies(sp->rtoinfo.srto_initial);
ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
msecs_to_jiffies(sp->rtoinfo.srto_initial);
ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +118,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
= 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
- ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] =
- SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
- ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
- SCTP_DEFAULT_TIMEOUT_SACK;
- ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
- sp->autoclose * HZ;
+ ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
+ ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
+ ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
/* Use SCTP specific send buffer space queues. */
ep->sndbuf_policy = sctp_sndbuf_policy;
@@ -134,14 +132,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
ep->last_key = ep->current_key = 0;
ep->key_changed_at = jiffies;
- ep->debug_name = "unnamedEndpoint";
return ep;
}
/* Create a sctp_endpoint with all that boring stuff initialized.
* Returns NULL if there isn't enough memory.
*/
-struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp)
+struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
+ unsigned int __nocast gfp)
{
struct sctp_endpoint *ep;
@@ -195,8 +193,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
sctp_unhash_endpoint(ep);
/* Free up the HMAC transform. */
- if (sctp_sk(ep->base.sk)->hmac)
- sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac);
+ sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac);
/* Cleanup. */
sctp_inq_free(&ep->base.inqueue);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b719a77..28f3224 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
}
+struct sctp_input_cb {
+ union {
+ struct inet_skb_parm h4;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ struct inet6_skb_parm h6;
+#endif
+ } header;
+ struct sctp_chunk *chunk;
+};
+#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
+
/*
* This is the routine which IP calls when receiving an SCTP packet.
*/
@@ -178,6 +189,37 @@ int sctp_rcv(struct sk_buff *skb)
asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
+ if (!asoc)
+ ep = __sctp_rcv_lookup_endpoint(&dest);
+
+ /* Retrieve the common input handling substructure. */
+ rcvr = asoc ? &asoc->base : &ep->base;
+ sk = rcvr->sk;
+
+ /*
+ * If a frame arrives on an interface and the receiving socket is
+ * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
+ */
+ if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb)))
+ {
+ sock_put(sk);
+ if (asoc) {
+ sctp_association_put(asoc);
+ asoc = NULL;
+ } else {
+ sctp_endpoint_put(ep);
+ ep = NULL;
+ }
+ sk = sctp_get_ctl_sock();
+ ep = sctp_sk(sk)->ep;
+ sctp_endpoint_hold(ep);
+ sock_hold(sk);
+ rcvr = &ep->base;
+ }
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ goto discard_release;
+
/*
* RFC 2960, 8.4 - Handle "Out of the blue" Packets.
* An SCTP packet is called an "out of the blue" (OOTB)
@@ -187,25 +229,15 @@ int sctp_rcv(struct sk_buff *skb)
* packet belongs.
*/
if (!asoc) {
- ep = __sctp_rcv_lookup_endpoint(&dest);
if (sctp_rcv_ootb(skb)) {
SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES);
goto discard_release;
}
}
- /* Retrieve the common input handling substructure. */
- rcvr = asoc ? &asoc->base : &ep->base;
- sk = rcvr->sk;
-
- if ((sk) && (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)) {
- goto discard_release;
- }
-
-
/* SCTP seems to always need a timestamp right now (FIXME) */
- if (skb->stamp.tv_sec == 0) {
- do_gettimeofday(&skb->stamp);
+ if (skb->tstamp.off_sec == 0) {
+ __net_timestamp(skb);
sock_enable_timestamp(sk);
}
@@ -222,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb)
ret = -ENOMEM;
goto discard_release;
}
+ SCTP_INPUT_CB(skb)->chunk = chunk;
sctp_rcv_set_owner_r(skb,sk);
@@ -244,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb)
sctp_bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- sk_add_backlog(sk, (struct sk_buff *) chunk);
+ sk_add_backlog(sk, skb);
else
- sctp_backlog_rcv(sk, (struct sk_buff *) chunk);
+ sctp_backlog_rcv(sk, skb);
/* Release the sock and any reference counts we took in the
* lookup calls.
@@ -265,13 +298,11 @@ discard_it:
discard_release:
/* Release any structures we may be holding. */
- if (asoc) {
- sock_put(asoc->base.sk);
+ sock_put(sk);
+ if (asoc)
sctp_association_put(asoc);
- } else {
- sock_put(ep->base.sk);
+ else
sctp_endpoint_put(ep);
- }
goto discard_it;
}
@@ -283,14 +314,8 @@ discard_release:
*/
int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
- struct sctp_chunk *chunk;
- struct sctp_inq *inqueue;
-
- /* One day chunk will live inside the skb, but for
- * now this works.
- */
- chunk = (struct sctp_chunk *) skb;
- inqueue = &chunk->rcvr->inqueue;
+ struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+ struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
sctp_inq_push(inqueue, chunk);
return 0;
@@ -326,7 +351,6 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
*
*/
void sctp_icmp_proto_unreachable(struct sock *sk,
- struct sctp_endpoint *ep,
struct sctp_association *asoc,
struct sctp_transport *t)
{
@@ -334,7 +358,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
sctp_do_sm(SCTP_EVENT_T_OTHER,
SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
- asoc->state, asoc->ep, asoc, NULL,
+ asoc->state, asoc->ep, asoc, t,
GFP_ATOMIC);
}
@@ -342,7 +366,6 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
/* Common lookup code for icmp/icmpv6 error handler. */
struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
struct sctphdr *sctphdr,
- struct sctp_endpoint **epp,
struct sctp_association **app,
struct sctp_transport **tpp)
{
@@ -350,11 +373,10 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
union sctp_addr daddr;
struct sctp_af *af;
struct sock *sk = NULL;
- struct sctp_endpoint *ep = NULL;
struct sctp_association *asoc = NULL;
struct sctp_transport *transport = NULL;
- *app = NULL; *epp = NULL; *tpp = NULL;
+ *app = NULL; *tpp = NULL;
af = sctp_get_af_specific(family);
if (unlikely(!af)) {
@@ -369,26 +391,15 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
* packet.
*/
asoc = __sctp_lookup_association(&saddr, &daddr, &transport);
- if (!asoc) {
- /* If there is no matching association, see if it matches any
- * endpoint. This may happen for an ICMP error generated in
- * response to an INIT_ACK.
- */
- ep = __sctp_rcv_lookup_endpoint(&daddr);
- if (!ep) {
- return NULL;
- }
- }
+ if (!asoc)
+ return NULL;
- if (asoc) {
- sk = asoc->base.sk;
+ sk = asoc->base.sk;
- if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
- goto out;
- }
- } else
- sk = ep->base.sk;
+ if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) {
+ ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ goto out;
+ }
sctp_bh_lock_sock(sk);
@@ -398,7 +409,6 @@ struct sock *sctp_err_lookup(int family, struct sk_buff *skb,
if (sock_owned_by_user(sk))
NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
- *epp = ep;
*app = asoc;
*tpp = transport;
return sk;
@@ -407,21 +417,16 @@ out:
sock_put(sk);
if (asoc)
sctp_association_put(asoc);
- if (ep)
- sctp_endpoint_put(ep);
return NULL;
}
/* Common cleanup code for icmp/icmpv6 error handler. */
-void sctp_err_finish(struct sock *sk, struct sctp_endpoint *ep,
- struct sctp_association *asoc)
+void sctp_err_finish(struct sock *sk, struct sctp_association *asoc)
{
sctp_bh_unlock_sock(sk);
sock_put(sk);
if (asoc)
sctp_association_put(asoc);
- if (ep)
- sctp_endpoint_put(ep);
}
/*
@@ -446,7 +451,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
int type = skb->h.icmph->type;
int code = skb->h.icmph->code;
struct sock *sk;
- struct sctp_endpoint *ep;
struct sctp_association *asoc;
struct sctp_transport *transport;
struct inet_sock *inet;
@@ -463,7 +467,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
savesctp = skb->h.raw;
skb->nh.iph = iph;
skb->h.raw = (char *)sh;
- sk = sctp_err_lookup(AF_INET, skb, sh, &ep, &asoc, &transport);
+ sk = sctp_err_lookup(AF_INET, skb, sh, &asoc, &transport);
/* Put back, the original pointers. */
skb->nh.raw = saveip;
skb->h.raw = savesctp;
@@ -490,7 +494,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
}
else {
if (ICMP_PROT_UNREACH == code) {
- sctp_icmp_proto_unreachable(sk, ep, asoc,
+ sctp_icmp_proto_unreachable(sk, asoc,
transport);
goto out_unlock;
}
@@ -519,7 +523,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
}
out_unlock:
- sctp_err_finish(sk, ep, asoc);
+ sctp_err_finish(sk, asoc);
}
/*
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cedf435..2d33922 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -50,7 +50,7 @@
/* Initialize an SCTP inqueue. */
void sctp_inq_init(struct sctp_inq *queue)
{
- skb_queue_head_init(&queue->in);
+ INIT_LIST_HEAD(&queue->in_chunk_list);
queue->in_progress = NULL;
/* Create a task for delivering data. */
@@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue)
/* Release the memory associated with an SCTP inqueue. */
void sctp_inq_free(struct sctp_inq *queue)
{
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
/* Empty the queue. */
- while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL)
+ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
+ list_del_init(&chunk->list);
sctp_chunk_free(chunk);
+ }
/* If there is a packet which is currently being worked on,
* free it as well.
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
* Eventually, we should clean up inqueue to not rely
* on the BH related data structures.
*/
- skb_queue_tail(&(q->in), (struct sk_buff *) packet);
+ list_add_tail(&packet->list, &q->in_chunk_list);
q->immediate.func(q->immediate.data);
}
@@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
/* Do we need to take the next packet out of the queue to process? */
if (!chunk) {
+ struct list_head *entry;
+
/* Is the queue empty? */
- if (skb_queue_empty(&queue->in))
+ if (list_empty(&queue->in_chunk_list))
return NULL;
+ entry = queue->in_chunk_list.next;
chunk = queue->in_progress =
- (struct sctp_chunk *) skb_dequeue(&queue->in);
+ list_entry(entry, struct sctp_chunk, list);
+ list_del_init(entry);
/* This is the first chunk in the packet. */
chunk->singleton = 1;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c9d9ea0..fa3be2b 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -66,8 +66,8 @@
#include <linux/seq_file.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <net/ndisc.h>
+#include <net/ip.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -91,7 +91,6 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct ipv6hdr *iph = (struct ipv6hdr *)skb->data;
struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
struct sock *sk;
- struct sctp_endpoint *ep;
struct sctp_association *asoc;
struct sctp_transport *transport;
struct ipv6_pinfo *np;
@@ -105,7 +104,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
savesctp = skb->h.raw;
skb->nh.ipv6h = iph;
skb->h.raw = (char *)sh;
- sk = sctp_err_lookup(AF_INET6, skb, sh, &ep, &asoc, &transport);
+ sk = sctp_err_lookup(AF_INET6, skb, sh, &asoc, &transport);
/* Put back, the original pointers. */
skb->nh.raw = saveip;
skb->h.raw = savesctp;
@@ -124,7 +123,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out_unlock;
case ICMPV6_PARAMPROB:
if (ICMPV6_UNK_NEXTHDR == code) {
- sctp_icmp_proto_unreachable(sk, ep, asoc, transport);
+ sctp_icmp_proto_unreachable(sk, asoc, transport);
goto out_unlock;
}
break;
@@ -142,7 +141,7 @@ SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
out_unlock:
- sctp_err_finish(sk, ep, asoc);
+ sctp_err_finish(sk, asoc);
out:
if (likely(idev != NULL))
in6_dev_put(idev);
@@ -642,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
else
newinet->pmtudisc = IP_PMTUDISC_WANT;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet6_sock_nr);
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(newsk);
if (newsk->sk_prot->init(newsk)) {
sk_common_release(newsk);
@@ -812,26 +808,23 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
if (addr->sa.sa_family != AF_INET6)
af = sctp_get_af_specific(addr->sa.sa_family);
else {
- struct sock *sk;
int type = ipv6_addr_type(&addr->v6.sin6_addr);
- sk = sctp_opt2sk(opt);
+ struct net_device *dev;
+
if (type & IPV6_ADDR_LINKLOCAL) {
- /* Note: Behavior similar to af_inet6.c:
- * 1) Overrides previous bound_dev_if
- * 2) Destructive even if bind isn't successful.
- */
-
- if (addr->v6.sin6_scope_id)
- sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
- if (!sk->sk_bound_dev_if)
+ if (!addr->v6.sin6_scope_id)
+ return 0;
+ dev = dev_get_by_index(addr->v6.sin6_scope_id);
+ if (!dev)
return 0;
+ dev_put(dev);
}
af = opt->pf->af;
}
return af->available(addr, opt);
}
-/* Verify that the provided sockaddr looks bindable. Common verification,
+/* Verify that the provided sockaddr looks sendable. Common verification,
* has already been taken care of.
*/
static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
@@ -842,19 +835,16 @@ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
if (addr->sa.sa_family != AF_INET6)
af = sctp_get_af_specific(addr->sa.sa_family);
else {
- struct sock *sk;
int type = ipv6_addr_type(&addr->v6.sin6_addr);
- sk = sctp_opt2sk(opt);
+ struct net_device *dev;
+
if (type & IPV6_ADDR_LINKLOCAL) {
- /* Note: Behavior similar to af_inet6.c:
- * 1) Overrides previous bound_dev_if
- * 2) Destructive even if bind isn't successful.
- */
-
- if (addr->v6.sin6_scope_id)
- sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
- if (!sk->sk_bound_dev_if)
+ if (!addr->v6.sin6_scope_id)
+ return 0;
+ dev = dev_get_by_index(addr->v6.sin6_scope_id);
+ if (!dev)
return 0;
+ dev_put(dev);
}
af = opt->pf->af;
}
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 0781e5d..8ff588f 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -127,8 +127,12 @@ done:
/* Initialize the objcount in the proc filesystem. */
void sctp_dbg_objcnt_init(void)
{
- create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp,
+ struct proc_dir_entry *ent;
+ ent = create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp,
sctp_dbg_objcnt_read, NULL);
+ if (!ent)
+ printk(KERN_WARNING
+ "sctp_dbg_objcnt: Unable to create /proc entry.\n");
}
/* Cleanup the objcount entry in the proc filesystem. */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 84b5b37..9313716 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
packet->transport = transport;
packet->source_port = sport;
packet->destination_port = dport;
- skb_queue_head_init(&packet->chunks);
+ INIT_LIST_HEAD(&packet->chunk_list);
if (asoc) {
struct sctp_sock *sp = sctp_sk(asoc->base.sk);
overhead = sp->pf->af->net_header_len;
@@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
/* Free a packet. */
void sctp_packet_free(struct sctp_packet *packet)
{
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
- while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL)
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
sctp_chunk_free(chunk);
+ }
if (packet->malloced)
kfree(packet);
@@ -276,7 +278,7 @@ append:
packet->has_sack = 1;
/* It is OK to send this chunk. */
- __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk);
+ list_add_tail(&chunk->list, &packet->chunk_list);
packet->size += chunk_len;
chunk->transport = packet->transport;
finish:
@@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
struct sctphdr *sh;
__u32 crc32;
struct sk_buff *nskb;
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
struct sock *sk;
int err = 0;
int padding; /* How much padding do we need? */
@@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet)
SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
/* Do NOT generate a chunkless packet. */
- chunk = (struct sctp_chunk *)skb_peek(&packet->chunks);
- if (unlikely(!chunk))
+ if (list_empty(&packet->chunk_list))
return err;
/* Set up convenience variables... */
+ chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
sk = chunk->skb->sk;
/* Allocate the new skb. */
@@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
* [This whole comment explains WORD_ROUND() below.]
*/
SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
- while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) {
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
if (sctp_chunk_is_data(chunk)) {
if (!chunk->has_tsn) {
@@ -511,7 +514,8 @@ err:
* will get resent or dropped later.
*/
- while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) {
+ list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
+ list_del_init(&chunk->list);
if (!sctp_chunk_is_data(chunk))
sctp_chunk_free(chunk);
}
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1b2d4adc..efb72fa 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
static inline void sctp_outq_head_data(struct sctp_outq *q,
struct sctp_chunk *ch)
{
- __skb_queue_head(&q->out, (struct sk_buff *)ch);
+ list_add(&ch->list, &q->out_chunk_list);
q->out_qlen += ch->skb->len;
return;
}
@@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
/* Take data from the front of the queue. */
static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
{
- struct sctp_chunk *ch;
- ch = (struct sctp_chunk *)__skb_dequeue(&q->out);
- if (ch)
+ struct sctp_chunk *ch = NULL;
+
+ if (!list_empty(&q->out_chunk_list)) {
+ struct list_head *entry = q->out_chunk_list.next;
+
+ ch = list_entry(entry, struct sctp_chunk, list);
+ list_del_init(entry);
q->out_qlen -= ch->skb->len;
+ }
return ch;
}
/* Add data chunk to the end of the queue. */
static inline void sctp_outq_tail_data(struct sctp_outq *q,
struct sctp_chunk *ch)
{
- __skb_queue_tail(&q->out, (struct sk_buff *)ch);
+ list_add_tail(&ch->list, &q->out_chunk_list);
q->out_qlen += ch->skb->len;
return;
}
@@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary,
void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
{
q->asoc = asoc;
- skb_queue_head_init(&q->out);
- skb_queue_head_init(&q->control);
+ INIT_LIST_HEAD(&q->out_chunk_list);
+ INIT_LIST_HEAD(&q->control_chunk_list);
INIT_LIST_HEAD(&q->retransmit);
INIT_LIST_HEAD(&q->sacked);
INIT_LIST_HEAD(&q->abandoned);
@@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q)
{
struct sctp_transport *transport;
struct list_head *lchunk, *pos, *temp;
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
/* Throw away unacknowledged chunks. */
list_for_each(pos, &q->asoc->peer.transport_addr_list) {
@@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q)
q->error = 0;
/* Throw away any leftover control chunks. */
- while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL)
+ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+ list_del_init(&chunk->list);
sctp_chunk_free(chunk);
+ }
}
/* Free the outqueue structure and any related pending chunks. */
@@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
break;
};
} else {
- __skb_queue_tail(&q->control, (struct sk_buff *) chunk);
+ list_add_tail(&chunk->list, &q->control_chunk_list);
SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
}
@@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
__u16 sport = asoc->base.bind_addr.port;
__u16 dport = asoc->peer.port;
__u32 vtag = asoc->peer.i.init_tag;
- struct sk_buff_head *queue;
struct sctp_transport *transport = NULL;
struct sctp_transport *new_transport;
- struct sctp_chunk *chunk;
+ struct sctp_chunk *chunk, *tmp;
sctp_xmit_t status;
int error = 0;
int start_timer = 0;
@@ -675,16 +681,17 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
* ...
*/
- queue = &q->control;
- while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) {
+ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+ list_del_init(&chunk->list);
+
/* Pick the right transport to use. */
new_transport = chunk->transport;
if (!new_transport) {
new_transport = asoc->peer.active_path;
- } else if (!new_transport->active) {
- /* If the chunk is Heartbeat or Heartbeat Ack,
- * send it to chunk->transport, even if it's
+ } else if (new_transport->state == SCTP_INACTIVE) {
+ /* If the chunk is Heartbeat or Heartbeat Ack,
+ * send it to chunk->transport, even if it's
* inactive.
*
* 3.3.6 Heartbeat Acknowledgement:
@@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
/* Finally, transmit new packets. */
start_timer = 0;
- queue = &q->out;
-
while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
/* RFC 2960 6.5 Every DATA chunk MUST carry a valid
* stream identifier.
@@ -840,7 +845,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
* Otherwise, we want to use the active path.
*/
new_transport = chunk->transport;
- if (!new_transport || !new_transport->active)
+ if (!new_transport ||
+ new_transport->state == SCTP_INACTIVE)
new_transport = asoc->peer.active_path;
/* Change packets if necessary. */
@@ -1148,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
/* See if all chunks are acked.
* Make sure the empty queue handler will get run later.
*/
- q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) &&
- list_empty(&q->retransmit);
+ q->empty = (list_empty(&q->out_chunk_list) &&
+ list_empty(&q->control_chunk_list) &&
+ list_empty(&q->retransmit));
if (!q->empty)
goto finish;
@@ -1454,7 +1461,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
/* Mark the destination transport address as
* active if it is not so marked.
*/
- if (!transport->active) {
+ if (transport->state == SCTP_INACTIVE) {
sctp_assoc_control_transport(
transport->asoc,
transport,
@@ -1678,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
if (TSN_lte(tsn, ctsn)) {
list_del_init(lchunk);
if (!chunk->tsn_gap_acked) {
- chunk->transport->flight_size -=
- sctp_data_size(chunk);
- q->outstanding_bytes -= sctp_data_size(chunk);
+ chunk->transport->flight_size -=
+ sctp_data_size(chunk);
+ q->outstanding_bytes -= sctp_data_size(chunk);
}
sctp_chunk_free(chunk);
} else {
@@ -1728,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
nskips, &ftsn_skip_arr[0]);
if (ftsn_chunk) {
- __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk);
+ list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
}
}
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index e42fd8c..b74f777 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -57,6 +57,7 @@ static struct snmp_mib sctp_snmp_list[] = {
SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
+ SNMP_MIB_SENTINEL
};
/* Return the current value of a particular entry in the mib by adding its
@@ -132,14 +133,25 @@ void sctp_snmp_proc_exit(void)
static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
{
struct list_head *pos;
+ struct sctp_association *asoc;
struct sctp_sockaddr_entry *laddr;
- union sctp_addr *addr;
+ struct sctp_transport *peer;
+ union sctp_addr *addr, *primary = NULL;
struct sctp_af *af;
+ if (epb->type == SCTP_EP_TYPE_ASSOCIATION) {
+ asoc = sctp_assoc(epb);
+ peer = asoc->peer.primary_path;
+ primary = &peer->saddr;
+ }
+
list_for_each(pos, &epb->bind_addr.address_list) {
laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
addr = (union sctp_addr *)&laddr->a;
af = sctp_get_af_specific(addr->sa.sa_family);
+ if (primary && af->cmp_addr(addr, primary)) {
+ seq_printf(seq, "*");
+ }
af->seq_dump_addr(seq, addr);
}
}
@@ -149,17 +161,54 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
{
struct list_head *pos;
struct sctp_transport *transport;
- union sctp_addr *addr;
+ union sctp_addr *addr, *primary;
struct sctp_af *af;
+ primary = &(assoc->peer.primary_addr);
list_for_each(pos, &assoc->peer.transport_addr_list) {
transport = list_entry(pos, struct sctp_transport, transports);
addr = (union sctp_addr *)&transport->ipaddr;
af = sctp_get_af_specific(addr->sa.sa_family);
+ if (af->cmp_addr(addr, primary)) {
+ seq_printf(seq, "*");
+ }
af->seq_dump_addr(seq, addr);
}
}
+static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos > sctp_ep_hashsize)
+ return NULL;
+
+ if (*pos < 0)
+ *pos = 0;
+
+ if (*pos == 0)
+ seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT UID INODE LADDRS\n");
+
+ ++*pos;
+
+ return (void *)pos;
+}
+
+static void sctp_eps_seq_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+
+static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ if (*pos > sctp_ep_hashsize)
+ return NULL;
+
+ ++*pos;
+
+ return pos;
+}
+
+
/* Display sctp endpoints (/proc/net/sctp/eps). */
static int sctp_eps_seq_show(struct seq_file *seq, void *v)
{
@@ -167,38 +216,50 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v)
struct sctp_ep_common *epb;
struct sctp_endpoint *ep;
struct sock *sk;
- int hash;
-
- seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT LADDRS\n");
- for (hash = 0; hash < sctp_ep_hashsize; hash++) {
- head = &sctp_ep_hashtable[hash];
- read_lock(&head->lock);
- for (epb = head->chain; epb; epb = epb->next) {
- ep = sctp_ep(epb);
- sk = epb->sk;
- seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d ", ep, sk,
- sctp_sk(sk)->type, sk->sk_state, hash,
- epb->bind_addr.port);
- sctp_seq_dump_local_addrs(seq, epb);
- seq_printf(seq, "\n");
- }
- read_unlock(&head->lock);
+ int hash = *(int *)v;
+
+ if (hash > sctp_ep_hashsize)
+ return -ENOMEM;
+
+ head = &sctp_ep_hashtable[hash-1];
+ sctp_local_bh_disable();
+ read_lock(&head->lock);
+ for (epb = head->chain; epb; epb = epb->next) {
+ ep = sctp_ep(epb);
+ sk = epb->sk;
+ seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk,
+ sctp_sk(sk)->type, sk->sk_state, hash-1,
+ epb->bind_addr.port,
+ sock_i_uid(sk), sock_i_ino(sk));
+
+ sctp_seq_dump_local_addrs(seq, epb);
+ seq_printf(seq, "\n");
}
+ read_unlock(&head->lock);
+ sctp_local_bh_enable();
return 0;
}
+static struct seq_operations sctp_eps_ops = {
+ .start = sctp_eps_seq_start,
+ .next = sctp_eps_seq_next,
+ .stop = sctp_eps_seq_stop,
+ .show = sctp_eps_seq_show,
+};
+
+
/* Initialize the seq file operations for 'eps' object. */
static int sctp_eps_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, sctp_eps_seq_show, NULL);
+ return seq_open(file, &sctp_eps_ops);
}
static struct file_operations sctp_eps_seq_fops = {
.open = sctp_eps_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release,
};
/* Set up the proc fs entry for 'eps' object. */
@@ -221,6 +282,40 @@ void sctp_eps_proc_exit(void)
remove_proc_entry("eps", proc_net_sctp);
}
+
+static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos > sctp_assoc_hashsize)
+ return NULL;
+
+ if (*pos < 0)
+ *pos = 0;
+
+ if (*pos == 0)
+ seq_printf(seq, " ASSOC SOCK STY SST ST HBKT ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
+ "RPORT LADDRS <-> RADDRS\n");
+
+ ++*pos;
+
+ return (void *)pos;
+}
+
+static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+{
+ return;
+}
+
+
+static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ if (*pos > sctp_assoc_hashsize)
+ return NULL;
+
+ ++*pos;
+
+ return pos;
+}
+
/* Display sctp associations (/proc/net/sctp/assocs). */
static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
{
@@ -228,43 +323,57 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
struct sctp_ep_common *epb;
struct sctp_association *assoc;
struct sock *sk;
- int hash;
-
- seq_printf(seq, " ASSOC SOCK STY SST ST HBKT LPORT RPORT "
- "LADDRS <-> RADDRS\n");
- for (hash = 0; hash < sctp_assoc_hashsize; hash++) {
- head = &sctp_assoc_hashtable[hash];
- read_lock(&head->lock);
- for (epb = head->chain; epb; epb = epb->next) {
- assoc = sctp_assoc(epb);
- sk = epb->sk;
- seq_printf(seq,
- "%8p %8p %-3d %-3d %-2d %-4d %-5d %-5d ",
- assoc, sk, sctp_sk(sk)->type, sk->sk_state,
- assoc->state, hash, epb->bind_addr.port,
- assoc->peer.port);
- sctp_seq_dump_local_addrs(seq, epb);
- seq_printf(seq, "<-> ");
- sctp_seq_dump_remote_addrs(seq, assoc);
- seq_printf(seq, "\n");
- }
- read_unlock(&head->lock);
+ int hash = *(int *)v;
+
+ if (hash > sctp_assoc_hashsize)
+ return -ENOMEM;
+
+ head = &sctp_assoc_hashtable[hash-1];
+ sctp_local_bh_disable();
+ read_lock(&head->lock);
+ for (epb = head->chain; epb; epb = epb->next) {
+ assoc = sctp_assoc(epb);
+ sk = epb->sk;
+ seq_printf(seq,
+ "%8p %8p %-3d %-3d %-2d %-4d %4d %8d %8d %7d %5lu %-5d %5d ",
+ assoc, sk, sctp_sk(sk)->type, sk->sk_state,
+ assoc->state, hash-1, assoc->assoc_id,
+ (sk->sk_rcvbuf - assoc->rwnd),
+ assoc->sndbuf_used,
+ sock_i_uid(sk), sock_i_ino(sk),
+ epb->bind_addr.port,
+ assoc->peer.port);
+
+ seq_printf(seq, " ");
+ sctp_seq_dump_local_addrs(seq, epb);
+ seq_printf(seq, "<-> ");
+ sctp_seq_dump_remote_addrs(seq, assoc);
+ seq_printf(seq, "\n");
}
+ read_unlock(&head->lock);
+ sctp_local_bh_enable();
return 0;
}
+static struct seq_operations sctp_assoc_ops = {
+ .start = sctp_assocs_seq_start,
+ .next = sctp_assocs_seq_next,
+ .stop = sctp_assocs_seq_stop,
+ .show = sctp_assocs_seq_show,
+};
+
/* Initialize the seq file operations for 'assocs' object. */
static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
{
- return single_open(file, sctp_assocs_seq_show, NULL);
+ return seq_open(file, &sctp_assoc_ops);
}
static struct file_operations sctp_assocs_seq_fops = {
.open = sctp_assocs_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release,
};
/* Set up the proc fs entry for 'assocs' object. */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2e1f9c3..e7025be 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -62,7 +62,7 @@
/* Global data structures. */
struct sctp_globals sctp_globals;
struct proc_dir_entry *proc_net_sctp;
-DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics);
+DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly;
struct idr sctp_assocs_id;
DEFINE_SPINLOCK(sctp_assocs_id_lock);
@@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific;
static struct sctp_af *sctp_af_v4_specific;
static struct sctp_af *sctp_af_v6_specific;
-kmem_cache_t *sctp_chunk_cachep;
-kmem_cache_t *sctp_bucket_cachep;
+kmem_cache_t *sctp_chunk_cachep __read_mostly;
+kmem_cache_t *sctp_bucket_cachep __read_mostly;
extern int sctp_snmp_proc_init(void);
extern int sctp_snmp_proc_exit(void);
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
/* Copy the local addresses which are valid for 'scope' into 'bp'. */
int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
- int gfp, int copy_flags)
+ unsigned int __nocast gfp, int copy_flags)
{
struct sctp_sockaddr_entry *addr;
int error = 0;
@@ -378,10 +378,13 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
{
int ret = inet_addr_type(addr->v4.sin_addr.s_addr);
- /* FIXME: ip_nonlocal_bind sysctl support. */
- if (addr->v4.sin_addr.s_addr != INADDR_ANY && ret != RTN_LOCAL)
+ if (addr->v4.sin_addr.s_addr != INADDR_ANY &&
+ ret != RTN_LOCAL &&
+ !sp->inet.freebind &&
+ !sysctl_ip_nonlocal_bind)
return 0;
+
return 1;
}
@@ -590,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
newinet->mc_index = 0;
newinet->mc_list = NULL;
-#ifdef INET_REFCNT_DEBUG
- atomic_inc(&inet_sock_nr);
-#endif
+ sk_refcnt_debug_inc(newsk);
if (newsk->sk_prot->init(newsk)) {
sk_common_release(newsk);
@@ -1047,7 +1048,10 @@ SCTP_STATIC __init int sctp_init(void)
sctp_sndbuf_policy = 0;
/* HB.interval - 30 seconds */
- sctp_hb_interval = 30 * HZ;
+ sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
+
+ /* delayed SACK timeout */
+ sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
/* Implementation specific variables. */
@@ -1238,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void)
module_init(sctp_init);
module_exit(sctp_exit);
+/*
+ * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly.
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
MODULE_LICENSE("GPL");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 33ac8bf..3868a8d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
static int sctp_process_param(struct sctp_association *asoc,
union sctp_params param,
const union sctp_addr *peer_addr,
- int gfp);
+ unsigned int __nocast gfp);
/* What was the inbound interface for this chunk? */
int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
*/
struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
const struct sctp_bind_addr *bp,
- int gfp, int vparam_len)
+ unsigned int __nocast gfp, int vparam_len)
{
sctp_inithdr_t init;
union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
const struct sctp_chunk *chunk,
- int gfp, int unkparam_len)
+ unsigned int __nocast gfp, int unkparam_len)
{
sctp_inithdr_t initack;
struct sctp_chunk *retval;
@@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
}
+ INIT_LIST_HEAD(&retval->list);
retval->skb = skb;
retval->asoc = (struct sctp_association *)asoc;
retval->resent = 0;
@@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk)
/* Possibly, free the chunk. */
void sctp_chunk_free(struct sctp_chunk *chunk)
{
- /* Make sure that we are not on any list. */
- skb_unlink((struct sk_buff *) chunk);
+ BUG_ON(!list_empty(&chunk->list));
list_del_init(&chunk->transmitted_list);
/* Release our reference on the message tracker. */
@@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
/* Create a CLOSED association to use with an incoming packet. */
struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
- struct sctp_chunk *chunk, int gfp)
+ struct sctp_chunk *chunk,
+ unsigned int __nocast gfp)
{
struct sctp_association *asoc;
struct sk_buff *skb;
@@ -1348,7 +1349,7 @@ nodata:
struct sctp_association *sctp_unpack_cookie(
const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
- struct sctp_chunk *chunk, int gfp,
+ struct sctp_chunk *chunk, unsigned int __nocast gfp,
int *error, struct sctp_chunk **errp)
{
struct sctp_association *retval = NULL;
@@ -1361,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie(
char *key;
sctp_scope_t scope;
struct sk_buff *skb = chunk->skb;
+ struct timeval tv;
headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE;
bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
@@ -1433,7 +1435,8 @@ no_hmac:
* an association, there is no need to check cookie's expiration
* for init collision case of lost COOKIE ACK.
*/
- if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) {
+ skb_get_timestamp(skb, &tv);
+ if (!asoc && tv_lt(bear_cookie->expiration, tv)) {
__u16 len;
/*
* Section 3.3.10.3 Stale Cookie Error (3)
@@ -1446,10 +1449,9 @@ no_hmac:
len = ntohs(chunk->chunk_hdr->length);
*errp = sctp_make_op_error_space(asoc, chunk, len);
if (*errp) {
- suseconds_t usecs = (skb->stamp.tv_sec -
+ suseconds_t usecs = (tv.tv_sec -
bear_cookie->expiration.tv_sec) * 1000000L +
- skb->stamp.tv_usec -
- bear_cookie->expiration.tv_usec;
+ tv.tv_usec - bear_cookie->expiration.tv_usec;
usecs = htonl(usecs);
sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
@@ -1812,7 +1814,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
*/
int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
const union sctp_addr *peer_addr,
- sctp_init_chunk_t *peer_init, int gfp)
+ sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
{
union sctp_params param;
struct sctp_transport *transport;
@@ -1830,7 +1832,7 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
* be a a better choice than any of the embedded addresses.
*/
if (peer_addr)
- if(!sctp_assoc_add_peer(asoc, peer_addr, gfp))
+ if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
goto nomem;
/* Process the initialization parameters. */
@@ -1841,6 +1843,14 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
goto clean_up;
}
+ /* Walk list of transports, removing transports in the UNKNOWN state. */
+ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+ transport = list_entry(pos, struct sctp_transport, transports);
+ if (transport->state == SCTP_UNKNOWN) {
+ sctp_assoc_rm_peer(asoc, transport);
+ }
+ }
+
/* The fixed INIT headers are always in network byte
* order.
*/
@@ -1906,7 +1916,8 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
* stream sequence number shall be set to 0.
*/
- /* Allocate storage for the negotiated streams if it is not a temporary * association.
+ /* Allocate storage for the negotiated streams if it is not a temporary
+ * association.
*/
if (!asoc->temp) {
int assoc_id;
@@ -1952,6 +1963,9 @@ clean_up:
list_del_init(pos);
sctp_transport_free(transport);
}
+
+ asoc->peer.transport_count = 0;
+
nomem:
return 0;
}
@@ -1971,7 +1985,7 @@ nomem:
static int sctp_process_param(struct sctp_association *asoc,
union sctp_params param,
const union sctp_addr *peer_addr,
- int gfp)
+ unsigned int __nocast gfp)
{
union sctp_addr addr;
int i;
@@ -1995,7 +2009,7 @@ static int sctp_process_param(struct sctp_association *asoc,
af->from_addr_param(&addr, param.addr, asoc->peer.port, 0);
scope = sctp_scope(peer_addr);
if (sctp_in_scope(&addr, scope))
- if (!sctp_assoc_add_peer(asoc, &addr, gfp))
+ if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_ACTIVE))
return 0;
break;
@@ -2396,7 +2410,7 @@ static __u16 sctp_process_asconf_param(struct sctp_association *asoc,
* Due to Resource Shortage'.
*/
- peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC);
+ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_ACTIVE);
if (!peer)
return SCTP_ERROR_RSRC_LOW;
@@ -2727,8 +2741,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
asoc->addip_last_asconf = NULL;
/* Send the next asconf chunk from the addip chunk queue. */
- asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks);
- if (asconf) {
+ if (!list_empty(&asoc->addip_chunk_list)) {
+ struct list_head *entry = asoc->addip_chunk_list.next;
+ asconf = list_entry(entry, struct sctp_chunk, list);
+
+ list_del_init(entry);
+
/* Hold the chunk until an ASCONF_ACK is received. */
sctp_chunk_hold(asconf);
if (sctp_primitive_ASCONF(asoc, asconf))
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f65fa44..39c970b 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- int gfp);
+ unsigned int __nocast gfp);
static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
sctp_state_t state,
struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- int gfp);
+ unsigned int __nocast gfp);
/********************************************************************
* Helper functions
@@ -414,11 +414,13 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
*/
asoc->overall_error_count++;
- if (transport->active &&
+ if (transport->state != SCTP_INACTIVE &&
(transport->error_count++ >= transport->max_retrans)) {
- SCTP_DEBUG_PRINTK("transport_strike: transport "
- "IP:%d.%d.%d.%d failed.\n",
- NIPQUAD(transport->ipaddr.v4.sin_addr));
+ SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
+ " transport IP: port:%d failed.\n",
+ asoc,
+ (&transport->ipaddr),
+ transport->ipaddr.v4.sin_port);
sctp_assoc_control_transport(asoc, transport,
SCTP_TRANSPORT_DOWN,
SCTP_FAILED_THRESHOLD);
@@ -495,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
struct sctp_association *asoc,
struct sctp_chunk *chunk,
- sctp_init_chunk_t *peer_init, int gfp)
+ sctp_init_chunk_t *peer_init,
+ unsigned int __nocast gfp)
{
int error;
@@ -593,7 +596,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
/* Mark the destination transport address as active if it is not so
* marked.
*/
- if (!t->active)
+ if (t->state == SCTP_INACTIVE)
sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
SCTP_HEARTBEAT_SUCCESS);
@@ -665,8 +668,11 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
asoc->state = state;
+ SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n",
+ asoc, sctp_state_tbl[state]);
+
if (sctp_style(sk, TCP)) {
- /* Change the sk->sk_state of a TCP-style socket that has
+ /* Change the sk->sk_state of a TCP-style socket that has
* sucessfully completed a connect() call.
*/
if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
@@ -678,6 +684,16 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
sk->sk_shutdown |= RCV_SHUTDOWN;
}
+ if (sctp_state(asoc, COOKIE_WAIT)) {
+ /* Reset init timeouts since they may have been
+ * increased due to timer expirations.
+ */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
+ asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
+ asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
+ }
+
if (sctp_state(asoc, ESTABLISHED) ||
sctp_state(asoc, CLOSED) ||
sctp_state(asoc, SHUTDOWN_RECEIVED)) {
@@ -837,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
struct sctp_endpoint *ep,
struct sctp_association *asoc,
void *event_arg,
- int gfp)
+ unsigned int __nocast gfp)
{
sctp_cmd_seq_t commands;
const sctp_sm_table_entry_t *state_fn;
@@ -882,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- int gfp)
+ unsigned int __nocast gfp)
{
int error;
@@ -970,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
- int gfp)
+ unsigned int __nocast gfp)
{
int error = 0;
int force;
@@ -1120,10 +1136,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
* to be executed only during failed attempts of
* association establishment.
*/
- if ((asoc->peer.retran_path !=
- asoc->peer.primary_path) &&
- (asoc->counters[SCTP_COUNTER_INIT_ERROR] > 0)) {
- sctp_add_cmd_sf(commands,
+ if ((asoc->peer.retran_path !=
+ asoc->peer.primary_path) &&
+ (asoc->init_err_counter > 0)) {
+ sctp_add_cmd_sf(commands,
SCTP_CMD_FORCE_PRIM_RETRAN,
SCTP_NULL());
}
@@ -1237,18 +1253,67 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_association_put(asoc);
break;
+ case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
+ chunk = cmd->obj.ptr;
+ t = sctp_assoc_choose_init_transport(asoc);
+ asoc->init_last_sent_to = t;
+ chunk->transport = t;
+ t->init_sent_count++;
+ break;
+
case SCTP_CMD_INIT_RESTART:
/* Do the needed accounting and updates
* associated with restarting an initialization
- * timer.
+ * timer. Only multiply the timeout by two if
+ * all transports have been tried at the current
+ * timeout.
+ */
+ t = asoc->init_last_sent_to;
+ asoc->init_err_counter++;
+
+ if (t->init_sent_count > (asoc->init_cycle + 1)) {
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] *= 2;
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] >
+ asoc->max_init_timeo) {
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
+ asoc->max_init_timeo;
+ }
+ asoc->init_cycle++;
+ SCTP_DEBUG_PRINTK(
+ "T1 INIT Timeout adjustment"
+ " init_err_counter: %d"
+ " cycle: %d"
+ " timeout: %d\n",
+ asoc->init_err_counter,
+ asoc->init_cycle,
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]);
+ }
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+ break;
+
+ case SCTP_CMD_COOKIEECHO_RESTART:
+ /* Do the needed accounting and updates
+ * associated with restarting an initialization
+ * timer. Only multiply the timeout by two if
+ * all transports have been tried at the current
+ * timeout.
*/
- asoc->counters[SCTP_COUNTER_INIT_ERROR]++;
- asoc->timeouts[cmd->obj.to] *= 2;
- if (asoc->timeouts[cmd->obj.to] >
+ asoc->init_err_counter++;
+
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] *= 2;
+ if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] >
asoc->max_init_timeo) {
- asoc->timeouts[cmd->obj.to] =
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
asoc->max_init_timeo;
}
+ SCTP_DEBUG_PRINTK(
+ "T1 COOKIE Timeout adjustment"
+ " init_err_counter: %d"
+ " timeout: %d\n",
+ asoc->init_err_counter,
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]);
/* If we've sent any data bundled with
* COOKIE-ECHO we need to resend.
@@ -1261,7 +1326,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_add_cmd_sf(commands,
SCTP_CMD_TIMER_RESTART,
- SCTP_TO(cmd->obj.to));
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
break;
case SCTP_CMD_INIT_FAILED:
@@ -1273,12 +1338,13 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
subtype, chunk, cmd->obj.u32);
break;
- case SCTP_CMD_COUNTER_INC:
- asoc->counters[cmd->obj.counter]++;
+ case SCTP_CMD_INIT_COUNTER_INC:
+ asoc->init_err_counter++;
break;
- case SCTP_CMD_COUNTER_RESET:
- asoc->counters[cmd->obj.counter] = 0;
+ case SCTP_CMD_INIT_COUNTER_RESET:
+ asoc->init_err_counter = 0;
+ asoc->init_cycle = 0;
break;
case SCTP_CMD_REPORT_DUP:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8e01b8f..86073df 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
sctp_cmd_seq_t *commands);
static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+ __u16 error,
+ const struct sctp_association *asoc,
+ struct sctp_transport *transport);
+
+static sctp_disposition_t sctp_sf_violation_chunklen(
+ const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands);
/* Small helper function that checks if the chunk length
* is of the appropriate length. The 'required_length' argument
@@ -533,6 +544,9 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
SCTP_PEER_INIT(initchunk));
+ /* Reset init error count upon receipt of INIT-ACK. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+
/* 5.1 C) "A" shall stop the T1-init timer and leave
* COOKIE-WAIT state. "A" shall then ... start the T1-cookie
* timer, and enter the COOKIE-ECHOED state.
@@ -775,8 +789,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep,
* from the COOKIE-ECHOED state to the COOKIE-WAIT
* state is performed.
*/
- sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_RESET,
- SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
/* RFC 2960 5.1 Normal Establishment of an Association
*
@@ -1019,10 +1032,22 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
link = sctp_assoc_lookup_paddr(asoc, &from_addr);
/* This should never happen, but lets log it if so. */
- if (!link) {
- printk(KERN_WARNING
- "%s: Could not find address %d.%d.%d.%d\n",
- __FUNCTION__, NIPQUAD(from_addr.v4.sin_addr));
+ if (unlikely(!link)) {
+ if (from_addr.sa.sa_family == AF_INET6) {
+ printk(KERN_WARNING
+ "%s association %p could not find address "
+ "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ __FUNCTION__,
+ asoc,
+ NIP6(from_addr.v6.sin6_addr));
+ } else {
+ printk(KERN_WARNING
+ "%s association %p could not find address "
+ "%u.%u.%u.%u\n",
+ __FUNCTION__,
+ asoc,
+ NIPQUAD(from_addr.v4.sin_addr.s_addr));
+ }
return SCTP_DISPOSITION_DISCARD;
}
@@ -2095,9 +2120,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
sctp_errhdr_t *err;
struct sctp_chunk *reply;
struct sctp_bind_addr *bp;
- int attempts;
-
- attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
+ int attempts = asoc->init_err_counter + 1;
if (attempts >= asoc->max_init_attempts) {
sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -2157,8 +2180,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
/* Cast away the const modifier, as we want to just
* rerun it through as a sideffect.
*/
- sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_INC,
- SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
@@ -2281,8 +2303,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep,
if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
- sctp_stop_t1_and_abort(commands, error);
- return SCTP_DISPOSITION_ABORT;
+ return sctp_stop_t1_and_abort(commands, error, asoc, chunk->transport);
}
/*
@@ -2294,8 +2315,8 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep
void *arg,
sctp_cmd_seq_t *commands)
{
- sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR);
- return SCTP_DISPOSITION_ABORT;
+ return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR, asoc,
+ (struct sctp_transport *)arg);
}
/*
@@ -2318,8 +2339,12 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
*
* This is common code called by several sctp_sf_*_abort() functions above.
*/
-void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+ __u16 error,
+ const struct sctp_association *asoc,
+ struct sctp_transport *transport)
{
+ SCTP_DEBUG_PRINTK("ABORT received (INIT).\n");
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
SCTP_STATE(SCTP_STATE_CLOSED));
SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
@@ -2328,6 +2353,7 @@ void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
/* CMD_INIT_FAILED will DELETE_TCB. */
sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
SCTP_U32(error));
+ return SCTP_DISPOSITION_ABORT;
}
/*
@@ -3672,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
*
* Generate an ABORT chunk and terminate the association.
*/
-sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep,
+static sctp_disposition_t sctp_sf_violation_chunklen(
+ const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
const sctp_subtype_t type,
void *arg,
@@ -3805,6 +3832,10 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
SCTP_ASOC((struct sctp_association *) asoc));
+ /* Choose transport for INIT. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+ SCTP_CHUNK(repl));
+
/* After sending the INIT, "A" starts the T1-init timer and
* enters the COOKIE-WAIT state.
*/
@@ -4589,7 +4620,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
}
/*
- * sctp_sf_t1_timer_expire
+ * sctp_sf_t1_init_timer_expire
*
* Section: 4 Note: 2
* Verification Tag:
@@ -4603,7 +4634,59 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
* endpoint MUST abort the initialization process and report the
* error to SCTP user.
*
- * 3) If the T1-cookie timer expires, the endpoint MUST retransmit
+ * Outputs
+ * (timers, events)
+ *
+ */
+sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ const sctp_subtype_t type,
+ void *arg,
+ sctp_cmd_seq_t *commands)
+{
+ struct sctp_chunk *repl = NULL;
+ struct sctp_bind_addr *bp;
+ int attempts = asoc->init_err_counter + 1;
+
+ SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
+
+ if (attempts < asoc->max_init_attempts) {
+ bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
+ repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
+ if (!repl)
+ return SCTP_DISPOSITION_NOMEM;
+
+ /* Choose transport for INIT. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+ SCTP_CHUNK(repl));
+
+ /* Issue a sideeffect to do the needed accounting. */
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+ } else {
+ SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d"
+ " max_init_attempts: %d\n",
+ attempts, asoc->max_init_attempts);
+ sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+ SCTP_U32(SCTP_ERROR_NO_ERROR));
+ return SCTP_DISPOSITION_DELETE_TCB;
+ }
+
+ return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_t1_cookie_timer_expire
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ * RFC 2960 Section 4 Notes
+ * 3) If the T1-cookie timer expires, the endpoint MUST retransmit
* COOKIE ECHO and re-start the T1-cookie timer without changing
* state. This MUST be repeated up to 'Max.Init.Retransmits' times.
* After that, the endpoint MUST abort the initialization process and
@@ -4613,46 +4696,26 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
* (timers, events)
*
*/
-sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
+sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
const sctp_subtype_t type,
void *arg,
sctp_cmd_seq_t *commands)
{
- struct sctp_chunk *repl;
- struct sctp_bind_addr *bp;
- sctp_event_timeout_t timer = (sctp_event_timeout_t) arg;
- int timeout;
- int attempts;
+ struct sctp_chunk *repl = NULL;
+ int attempts = asoc->init_err_counter + 1;
- timeout = asoc->timeouts[timer];
- attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
- repl = NULL;
-
- SCTP_DEBUG_PRINTK("Timer T1 expired.\n");
+ SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
if (attempts < asoc->max_init_attempts) {
- switch (timer) {
- case SCTP_EVENT_TIMEOUT_T1_INIT:
- bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
- repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
- break;
-
- case SCTP_EVENT_TIMEOUT_T1_COOKIE:
- repl = sctp_make_cookie_echo(asoc, NULL);
- break;
-
- default:
- BUG();
- break;
- };
-
+ repl = sctp_make_cookie_echo(asoc, NULL);
if (!repl)
- goto nomem;
+ return SCTP_DISPOSITION_NOMEM;
/* Issue a sideeffect to do the needed accounting. */
- sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
- SCTP_TO(timer));
+ sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
+ SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
} else {
sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -4661,9 +4724,6 @@ sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
}
return SCTP_DISPOSITION_CONSUME;
-
-nomem:
- return SCTP_DISPOSITION_NOMEM;
}
/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 8967846..75ef104 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -783,7 +783,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
/* SCTP_STATE_COOKIE_WAIT */ \
{.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
/* SCTP_STATE_COOKIE_ECHOED */ \
- {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \
+ {.fn = sctp_sf_t1_cookie_timer_expire, \
+ .name = "sctp_sf_t1_cookie_timer_expire"}, \
/* SCTP_STATE_ESTABLISHED */ \
{.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
/* SCTP_STATE_SHUTDOWN_PENDING */ \
@@ -802,7 +803,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
/* SCTP_STATE_CLOSED */ \
{.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
/* SCTP_STATE_COOKIE_WAIT */ \
- {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \
+ {.fn = sctp_sf_t1_init_timer_expire, \
+ .name = "sctp_sf_t1_init_timer_expire"}, \
/* SCTP_STATE_COOKIE_ECHOED */ \
{.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
/* SCTP_STATE_ESTABLISHED */ \
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0b338ec..91ec8c9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -262,18 +262,18 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
* sockaddr_in6 [RFC 2553]),
* addr_len - the size of the address structure.
*/
-SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
{
int retval = 0;
sctp_lock_sock(sk);
- SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, uaddr: %p, addr_len: %d)\n",
- sk, uaddr, addr_len);
+ SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n",
+ sk, addr, addr_len);
/* Disallow binding twice. */
if (!sctp_sk(sk)->ep->base.bind_addr.port)
- retval = sctp_do_bind(sk, (union sctp_addr *)uaddr,
+ retval = sctp_do_bind(sk, (union sctp_addr *)addr,
addr_len);
else
retval = -EINVAL;
@@ -318,23 +318,27 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
unsigned short snum;
int ret = 0;
- SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d)\n",
- sk, addr, len);
-
/* Common sockaddr verification. */
af = sctp_sockaddr_af(sp, addr, len);
- if (!af)
+ if (!af) {
+ SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n",
+ sk, addr, len);
return -EINVAL;
+ }
+
+ snum = ntohs(addr->v4.sin_port);
+
+ SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ",
+ ", port: %d, new port: %d, len: %d)\n",
+ sk,
+ addr,
+ bp->port, snum,
+ len);
/* PF specific bind() address verification. */
if (!sp->pf->bind_verify(sp, addr))
return -EADDRNOTAVAIL;
- snum= ntohs(addr->v4.sin_port);
-
- SCTP_DEBUG_PRINTK("sctp_do_bind: port: %d, new port: %d\n",
- bp->port, snum);
-
/* We must either be unbound, or bind to the same port. */
if (bp->port && (snum != bp->port)) {
SCTP_DEBUG_PRINTK("sctp_do_bind:"
@@ -402,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
* transmission.
*/
if (asoc->addip_last_asconf) {
- __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk);
+ list_add_tail(&chunk->list, &asoc->addip_chunk_list);
goto out;
}
@@ -816,7 +820,8 @@ out:
*
* Basically do nothing but copying the addresses from user to kernel
* land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
- * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace.
+ * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
+ * from userspace.
*
* We don't use copy_from_user() for optimization: we first do the
* sanity checks (buffer size -fast- and access check-healthy
@@ -913,6 +918,243 @@ out:
return err;
}
+/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
+ *
+ * Common routine for handling connect() and sctp_connectx().
+ * Connect will come in with just a single address.
+ */
+static int __sctp_connect(struct sock* sk,
+ struct sockaddr *kaddrs,
+ int addrs_size)
+{
+ struct sctp_sock *sp;
+ struct sctp_endpoint *ep;
+ struct sctp_association *asoc = NULL;
+ struct sctp_association *asoc2;
+ struct sctp_transport *transport;
+ union sctp_addr to;
+ struct sctp_af *af;
+ sctp_scope_t scope;
+ long timeo;
+ int err = 0;
+ int addrcnt = 0;
+ int walk_size = 0;
+ struct sockaddr *sa_addr;
+ void *addr_buf;
+
+ sp = sctp_sk(sk);
+ ep = sp->ep;
+
+ /* connect() cannot be done on a socket that is already in ESTABLISHED
+ * state - UDP-style peeled off socket or a TCP-style socket that
+ * is already connected.
+ * It cannot be done even on a TCP-style listening socket.
+ */
+ if (sctp_sstate(sk, ESTABLISHED) ||
+ (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
+ err = -EISCONN;
+ goto out_free;
+ }
+
+ /* Walk through the addrs buffer and count the number of addresses. */
+ addr_buf = kaddrs;
+ while (walk_size < addrs_size) {
+ sa_addr = (struct sockaddr *)addr_buf;
+ af = sctp_get_af_specific(sa_addr->sa_family);
+
+ /* If the address family is not supported or if this address
+ * causes the address buffer to overflow return EINVAL.
+ */
+ if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = sctp_verify_addr(sk, (union sctp_addr *)sa_addr,
+ af->sockaddr_len);
+ if (err)
+ goto out_free;
+
+ memcpy(&to, sa_addr, af->sockaddr_len);
+ to.v4.sin_port = ntohs(to.v4.sin_port);
+
+ /* Check if there already is a matching association on the
+ * endpoint (other than the one created here).
+ */
+ asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+ if (asoc2 && asoc2 != asoc) {
+ if (asoc2->state >= SCTP_STATE_ESTABLISHED)
+ err = -EISCONN;
+ else
+ err = -EALREADY;
+ goto out_free;
+ }
+
+ /* If we could not find a matching association on the endpoint,
+ * make sure that there is no peeled-off association matching
+ * the peer address even on another socket.
+ */
+ if (sctp_endpoint_is_peeled_off(ep, &to)) {
+ err = -EADDRNOTAVAIL;
+ goto out_free;
+ }
+
+ if (!asoc) {
+ /* If a bind() or sctp_bindx() is not called prior to
+ * an sctp_connectx() call, the system picks an
+ * ephemeral port and will choose an address set
+ * equivalent to binding with a wildcard address.
+ */
+ if (!ep->base.bind_addr.port) {
+ if (sctp_autobind(sk)) {
+ err = -EAGAIN;
+ goto out_free;
+ }
+ }
+
+ scope = sctp_scope(&to);
+ asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+ if (!asoc) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ }
+
+ /* Prime the peer's transport structures. */
+ transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
+ SCTP_UNKNOWN);
+ if (!transport) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ addrcnt++;
+ addr_buf += af->sockaddr_len;
+ walk_size += af->sockaddr_len;
+ }
+
+ err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
+ if (err < 0) {
+ goto out_free;
+ }
+
+ err = sctp_primitive_ASSOCIATE(asoc, NULL);
+ if (err < 0) {
+ goto out_free;
+ }
+
+ /* Initialize sk's dport and daddr for getpeername() */
+ inet_sk(sk)->dport = htons(asoc->peer.port);
+ af = sctp_get_af_specific(to.sa.sa_family);
+ af->to_sk_daddr(&to, sk);
+
+ timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
+ err = sctp_wait_for_connect(asoc, &timeo);
+
+ /* Don't free association on exit. */
+ asoc = NULL;
+
+out_free:
+
+ SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p"
+ " kaddrs: %p err: %d\n",
+ asoc, kaddrs, err);
+ if (asoc)
+ sctp_association_free(asoc);
+ return err;
+}
+
+/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
+ *
+ * API 8.9
+ * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt);
+ *
+ * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
+ * If the sd is an IPv6 socket, the addresses passed can either be IPv4
+ * or IPv6 addresses.
+ *
+ * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
+ * Section 3.1.2 for this usage.
+ *
+ * addrs is a pointer to an array of one or more socket addresses. Each
+ * address is contained in its appropriate structure (i.e. struct
+ * sockaddr_in or struct sockaddr_in6) the family of the address type
+ * must be used to distengish the address length (note that this
+ * representation is termed a "packed array" of addresses). The caller
+ * specifies the number of addresses in the array with addrcnt.
+ *
+ * On success, sctp_connectx() returns 0. On failure, sctp_connectx() returns
+ * -1, and sets errno to the appropriate error code.
+ *
+ * For SCTP, the port given in each socket address must be the same, or
+ * sctp_connectx() will fail, setting errno to EINVAL.
+ *
+ * An application can use sctp_connectx to initiate an association with
+ * an endpoint that is multi-homed. Much like sctp_bindx() this call
+ * allows a caller to specify multiple addresses at which a peer can be
+ * reached. The way the SCTP stack uses the list of addresses to set up
+ * the association is implementation dependant. This function only
+ * specifies that the stack will try to make use of all the addresses in
+ * the list when needed.
+ *
+ * Note that the list of addresses passed in is only used for setting up
+ * the association. It does not necessarily equal the set of addresses
+ * the peer uses for the resulting association. If the caller wants to
+ * find out the set of peer addresses, it must use sctp_getpaddrs() to
+ * retrieve them after the association has been set up.
+ *
+ * Basically do nothing but copying the addresses from user to kernel
+ * land and invoking either sctp_connectx(). This is used for tunneling
+ * the sctp_connectx() request through sctp_setsockopt() from userspace.
+ *
+ * We don't use copy_from_user() for optimization: we first do the
+ * sanity checks (buffer size -fast- and access check-healthy
+ * pointer); if all of those succeed, then we can alloc the memory
+ * (expensive operation) needed to copy the data to kernel. Then we do
+ * the copying without checking the user space area
+ * (__copy_from_user()).
+ *
+ * On exit there is no need to do sockfd_put(), sys_setsockopt() does
+ * it.
+ *
+ * sk The sk of the socket
+ * addrs The pointer to the addresses in user land
+ * addrssize Size of the addrs buffer
+ *
+ * Returns 0 if ok, <0 errno code on error.
+ */
+SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk,
+ struct sockaddr __user *addrs,
+ int addrs_size)
+{
+ int err = 0;
+ struct sockaddr *kaddrs;
+
+ SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n",
+ __FUNCTION__, sk, addrs, addrs_size);
+
+ if (unlikely(addrs_size <= 0))
+ return -EINVAL;
+
+ /* Check the user passed a healthy pointer. */
+ if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
+ return -EFAULT;
+
+ /* Alloc space for the address array in kernel memory. */
+ kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL);
+ if (unlikely(!kaddrs))
+ return -ENOMEM;
+
+ if (__copy_from_user(kaddrs, addrs, addrs_size)) {
+ err = -EFAULT;
+ } else {
+ err = __sctp_connect(sk, kaddrs, addrs_size);
+ }
+
+ kfree(kaddrs);
+ return err;
+}
+
/* API 3.1.4 close() - UDP Style Syntax
* Applications use close() to perform graceful shutdown (as described in
* Section 10.1 of [SCTP]) on ALL the associations currently represented
@@ -1095,7 +1337,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
sp = sctp_sk(sk);
ep = sp->ep;
- SCTP_DEBUG_PRINTK("Using endpoint: %s.\n", ep->debug_name);
+ SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep);
/* We cannot send a message over a TCP-style listening socket. */
if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
@@ -1306,7 +1548,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
}
/* Prime the peer's transport structures. */
- transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL);
+ transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
if (!transport) {
err = -ENOMEM;
goto out_free;
@@ -2208,6 +2450,12 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
optlen, SCTP_BINDX_REM_ADDR);
break;
+ case SCTP_SOCKOPT_CONNECTX:
+ /* 'optlen' is the size of the addresses buffer. */
+ retval = sctp_setsockopt_connectx(sk, (struct sockaddr __user *)optval,
+ optlen);
+ break;
+
case SCTP_DISABLE_FRAGMENTS:
retval = sctp_setsockopt_disable_fragments(sk, optval, optlen);
break;
@@ -2283,112 +2531,29 @@ out_nounlock:
*
* len: the size of the address.
*/
-SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *uaddr,
+SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr,
int addr_len)
{
- struct sctp_sock *sp;
- struct sctp_endpoint *ep;
- struct sctp_association *asoc;
- struct sctp_transport *transport;
- union sctp_addr to;
- struct sctp_af *af;
- sctp_scope_t scope;
- long timeo;
int err = 0;
+ struct sctp_af *af;
sctp_lock_sock(sk);
- SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d)\n",
- __FUNCTION__, sk, uaddr, addr_len);
-
- sp = sctp_sk(sk);
- ep = sp->ep;
-
- /* connect() cannot be done on a socket that is already in ESTABLISHED
- * state - UDP-style peeled off socket or a TCP-style socket that
- * is already connected.
- * It cannot be done even on a TCP-style listening socket.
- */
- if (sctp_sstate(sk, ESTABLISHED) ||
- (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
- err = -EISCONN;
- goto out_unlock;
- }
-
- err = sctp_verify_addr(sk, (union sctp_addr *)uaddr, addr_len);
- if (err)
- goto out_unlock;
+ SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n",
+ __FUNCTION__, sk, addr, addr_len);
- if (addr_len > sizeof(to))
- addr_len = sizeof(to);
- memcpy(&to, uaddr, addr_len);
- to.v4.sin_port = ntohs(to.v4.sin_port);
-
- asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
- if (asoc) {
- if (asoc->state >= SCTP_STATE_ESTABLISHED)
- err = -EISCONN;
- else
- err = -EALREADY;
- goto out_unlock;
- }
-
- /* If we could not find a matching association on the endpoint,
- * make sure that there is no peeled-off association matching the
- * peer address even on another socket.
- */
- if (sctp_endpoint_is_peeled_off(ep, &to)) {
- err = -EADDRNOTAVAIL;
- goto out_unlock;
- }
-
- /* If a bind() or sctp_bindx() is not called prior to a connect()
- * call, the system picks an ephemeral port and will choose an address
- * set equivalent to binding with a wildcard address.
- */
- if (!ep->base.bind_addr.port) {
- if (sctp_autobind(sk)) {
- err = -EAGAIN;
- goto out_unlock;
- }
- }
-
- scope = sctp_scope(&to);
- asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
- if (!asoc) {
- err = -ENOMEM;
- goto out_unlock;
- }
-
- /* Prime the peer's transport structures. */
- transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL);
- if (!transport) {
- sctp_association_free(asoc);
- goto out_unlock;
- }
- err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
- if (err < 0) {
- sctp_association_free(asoc);
- goto out_unlock;
- }
-
- err = sctp_primitive_ASSOCIATE(asoc, NULL);
- if (err < 0) {
- sctp_association_free(asoc);
- goto out_unlock;
+ /* Validate addr_len before calling common connect/connectx routine. */
+ af = sctp_get_af_specific(addr->sa_family);
+ if (!af || addr_len < af->sockaddr_len) {
+ err = -EINVAL;
+ } else {
+ /* Pass correct addr len to common routine (so it knows there
+ * is only one address being passed.
+ */
+ err = __sctp_connect(sk, addr, af->sockaddr_len);
}
- /* Initialize sk's dport and daddr for getpeername() */
- inet_sk(sk)->dport = htons(asoc->peer.port);
- af = sctp_get_af_specific(to.sa.sa_family);
- af->to_sk_daddr(&to, sk);
-
- timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
- err = sctp_wait_for_connect(asoc, &timeo);
-
-out_unlock:
sctp_release_sock(sk);
-
return err;
}
@@ -2677,12 +2842,15 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
/* Map ipv4 address into v4-mapped-on-v6 address. */
sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
(union sctp_addr *)&status.sstat_primary.spinfo_address);
- status.sstat_primary.spinfo_state = transport->active;
+ status.sstat_primary.spinfo_state = transport->state;
status.sstat_primary.spinfo_cwnd = transport->cwnd;
status.sstat_primary.spinfo_srtt = transport->srtt;
status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
status.sstat_primary.spinfo_mtu = transport->pmtu;
+ if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
+ status.sstat_primary.spinfo_state = SCTP_ACTIVE;
+
if (put_user(len, optlen)) {
retval = -EFAULT;
goto out;
@@ -2733,12 +2901,15 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
return -EINVAL;
pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
- pinfo.spinfo_state = transport->active;
+ pinfo.spinfo_state = transport->state;
pinfo.spinfo_cwnd = transport->cwnd;
pinfo.spinfo_srtt = transport->srtt;
pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
pinfo.spinfo_mtu = transport->pmtu;
+ if (pinfo.spinfo_state == SCTP_UNKNOWN)
+ pinfo.spinfo_state = SCTP_ACTIVE;
+
if (put_user(len, optlen)) {
retval = -EFAULT;
goto out;
@@ -3591,7 +3762,8 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
int retval = 0;
int len;
- SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p, ...)\n", sk);
+ SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... optname: %d)\n",
+ sk, optname);
/* I can hardly begin to describe how wrong this is. This is
* so broken as to be worse than useless. The API draft
@@ -4022,8 +4194,7 @@ out:
sctp_release_sock(sk);
return err;
cleanup:
- if (tfm)
- sctp_crypto_free_tfm(tfm);
+ sctp_crypto_free_tfm(tfm);
goto out;
}
@@ -4368,15 +4539,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
* However, this function was corrent in any case. 8)
*/
if (flags & MSG_PEEK) {
- unsigned long cpu_flags;
-
- sctp_spin_lock_irqsave(&sk->sk_receive_queue.lock,
- cpu_flags);
+ spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
atomic_inc(&skb->users);
- sctp_spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
- cpu_flags);
+ spin_unlock_bh(&sk->sk_receive_queue.lock);
} else {
skb = skb_dequeue(&sk->sk_receive_queue);
}
@@ -4600,8 +4767,7 @@ out:
return err;
do_error:
- if (asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1 >=
- asoc->max_init_attempts)
+ if (asoc->init_err_counter + 1 >= asoc->max_init_attempts)
err = -ETIMEDOUT;
else
err = -ECONNREFUSED;
@@ -4686,6 +4852,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
struct sctp_endpoint *newep = newsp->ep;
struct sk_buff *skb, *tmp;
struct sctp_ulpevent *event;
+ int flags = 0;
/* Migrate socket buffer sizes and all the socket level options to the
* new socket.
@@ -4707,13 +4874,24 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_sk(newsk)->bind_hash = pp;
inet_sk(newsk)->num = inet_sk(oldsk)->num;
+ /* Copy the bind_addr list from the original endpoint to the new
+ * endpoint so that we can handle restarts properly
+ */
+ if (assoc->peer.ipv4_address)
+ flags |= SCTP_ADDR4_PEERSUPP;
+ if (assoc->peer.ipv6_address)
+ flags |= SCTP_ADDR6_PEERSUPP;
+ sctp_bind_addr_copy(&newsp->ep->base.bind_addr,
+ &oldsp->ep->base.bind_addr,
+ SCTP_SCOPE_GLOBAL, GFP_KERNEL, flags);
+
/* Move any messages in the old socket's receive queue that are for the
* peeled off association to the new socket's receive queue.
*/
sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &oldsk->sk_receive_queue);
__skb_queue_tail(&newsk->sk_receive_queue, skb);
}
}
@@ -4742,7 +4920,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
- __skb_unlink(skb, skb->list);
+ __skb_unlink(skb, &oldsp->pd_lobby);
__skb_queue_tail(queue, skb);
}
}
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index e627d2b..25037da 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
/* Create a new sctp_ssnmap.
* Allocate room to store at least 'len' contiguous TSNs.
*/
-struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp)
+struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
+ unsigned int __nocast gfp)
{
struct sctp_ssnmap *retval;
int size;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc3184..75b28dd 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -42,11 +42,14 @@
*/
#include <net/sctp/structs.h>
+#include <net/sctp/sctp.h>
#include <linux/sysctl.h>
static ctl_handler sctp_sysctl_jiffies_ms;
static long rto_timer_min = 1;
static long rto_timer_max = 86400000; /* One day */
+static long sack_timer_min = 1;
+static long sack_timer_max = 500;
static ctl_table sctp_table[] = {
{
@@ -187,6 +190,17 @@ static ctl_table sctp_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec
},
+ {
+ .ctl_name = NET_SCTP_SACK_TIMEOUT,
+ .procname = "sack_timeout",
+ .data = &sctp_sack_timeout,
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
+ .strategy = &sctp_sysctl_jiffies_ms,
+ .extra1 = &sack_timer_min,
+ .extra2 = &sack_timer_max,
+ },
{ .ctl_name = 0 }
};
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f30882e..d2f04eb 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
/* Initialize a new transport from provided memory. */
static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
const union sctp_addr *addr,
- int gfp)
+ unsigned int __nocast gfp)
{
/* Copy in the address. */
peer->ipaddr = *addr;
@@ -83,7 +83,9 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
peer->last_time_used = jiffies;
peer->last_time_ecne_reduced = jiffies;
- peer->active = SCTP_ACTIVE;
+ peer->init_sent_count = 0;
+
+ peer->state = SCTP_ACTIVE;
peer->hb_allowed = 0;
/* Initialize the default path max_retrans. */
@@ -101,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
/* Set up the heartbeat timer. */
init_timer(&peer->hb_timer);
- peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
peer->hb_timer.function = sctp_generate_heartbeat_event;
peer->hb_timer.data = (unsigned long)peer;
@@ -120,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
}
/* Allocate and initialize a new transport. */
-struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp)
+struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
+ unsigned int __nocast gfp)
{
struct sctp_transport *transport;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 17d0ff5..0abd510 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
/* Create a new sctp_ulpevent. */
SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
- int gfp)
+ unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
const struct sctp_association *asoc,
__u16 flags, __u16 state, __u16 error, __u16 outbound,
- __u16 inbound, int gfp)
+ __u16 inbound, unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
const struct sctp_association *asoc,
const struct sockaddr_storage *aaddr,
- int flags, int state, int error, int gfp)
+ int flags, int state, int error, unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
const struct sctp_association *asoc, struct sctp_chunk *chunk,
- __u16 flags, int gfp)
+ __u16 flags, unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
const struct sctp_association *asoc, struct sctp_chunk *chunk,
- __u16 flags, __u32 error, int gfp)
+ __u16 flags, __u32 error, unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
const struct sctp_association *asoc,
- __u16 flags, int gfp)
+ __u16 flags, unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
* 5.3.1.6 SCTP_ADAPTION_INDICATION
*/
struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
- const struct sctp_association *asoc, int gfp)
+ const struct sctp_association *asoc, unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
*/
struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
struct sctp_chunk *chunk,
- int gfp)
+ unsigned int __nocast gfp)
{
struct sctp_ulpevent *event = NULL;
struct sk_buff *skb;
@@ -718,7 +718,8 @@ fail:
* various events.
*/
struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
- const struct sctp_association *asoc, __u32 indication, int gfp)
+ const struct sctp_association *asoc, __u32 indication,
+ unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d5dd2cf..ec2c857 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -50,9 +50,9 @@
/* Forward declarations for internal helpers. */
static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
- struct sctp_ulpevent *);
+ struct sctp_ulpevent *);
static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *,
- struct sctp_ulpevent *);
+ struct sctp_ulpevent *);
/* 1st Level Abstractions */
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
/* Process an incoming DATA chunk. */
int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
- int gfp)
+ unsigned int __nocast gfp)
{
struct sk_buff_head temp;
sctp_data_chunk_t *hdr;
@@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
event = sctp_ulpq_order(ulpq, event);
}
- /* Send event to the ULP. */
+ /* Send event to the ULP. 'event' is the sctp_ulpevent for
+ * very first SKB on the 'temp' list.
+ */
if (event)
sctp_ulpq_tail_event(ulpq, event);
@@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
return sctp_clear_pd(ulpq->asoc->base.sk);
}
-
-
+/* If the SKB of 'event' is on a list, it is the first such member
+ * of that list.
+ */
int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
{
struct sock *sk = ulpq->asoc->base.sk;
- struct sk_buff_head *queue;
+ struct sk_buff_head *queue, *skb_list;
+ struct sk_buff *skb = sctp_event2skb(event);
int clear_pd = 0;
+ skb_list = (struct sk_buff_head *) skb->prev;
+
/* If the socket is just going to throw this away, do not
* even try to deliver it.
*/
@@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
/* If we are harvesting multiple skbs they will be
* collected on a list.
*/
- if (sctp_event2skb(event)->list)
- sctp_skb_list_tail(sctp_event2skb(event)->list, queue);
+ if (skb_list)
+ sctp_skb_list_tail(skb_list, queue);
else
- __skb_queue_tail(queue, sctp_event2skb(event));
+ __skb_queue_tail(queue, skb);
/* Did we just complete partial delivery and need to get
* rolling again? Move pending data to the receive
@@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
return 1;
out_free:
- if (sctp_event2skb(event)->list)
- sctp_queue_purge_ulpevents(sctp_event2skb(event)->list);
+ if (skb_list)
+ sctp_queue_purge_ulpevents(skb_list);
else
sctp_ulpevent_free(event);
+
return 0;
}
@@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
* payload was fragmented on the way and ip had to reassemble them.
* We add the rest of skb's to the first skb's fraglist.
*/
-static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag)
+static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag)
{
struct sk_buff *pos;
struct sctp_ulpevent *event;
@@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
skb_shinfo(f_frag)->frag_list = pos;
/* Remove the first fragment from the reassembly queue. */
- __skb_unlink(f_frag, f_frag->list);
+ __skb_unlink(f_frag, queue);
while (pos) {
pnext = pos->next;
@@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
f_frag->data_len += pos->len;
/* Remove the fragment from the reassembly queue. */
- __skb_unlink(pos, pos->list);
+ __skb_unlink(pos, queue);
/* Break if we have reached the last fragment. */
if (pos == l_frag)
@@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u
done:
return retval;
found:
- retval = sctp_make_reassembled_event(first_frag, pos);
+ retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos);
if (retval)
retval->msg_flags |= MSG_EOR;
goto done;
@@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq
* further.
*/
done:
- retval = sctp_make_reassembled_event(first_frag, last_frag);
+ retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
if (retval && is_last)
retval->msg_flags |= MSG_EOR;
@@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u
* further.
*/
done:
- retval = sctp_make_reassembled_event(first_frag, last_frag);
+ retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
return retval;
}
@@ -537,6 +544,7 @@ done:
static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
struct sctp_ulpevent *event)
{
+ struct sk_buff_head *event_list;
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
struct sctp_stream *in;
@@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
ssn = event->ssn;
in = &ulpq->asoc->ssnmap->in;
+ event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
+
/* We are holding the chunks by stream, by SSN. */
sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
cevent = (struct sctp_ulpevent *) pos->cb;
@@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
/* Found it, so mark in the ssnmap. */
sctp_ssn_next(in, sid);
- __skb_unlink(pos, pos->list);
+ __skb_unlink(pos, &ulpq->lobby);
/* Attach all gathered skbs to the event. */
- __skb_queue_tail(sctp_event2skb(event)->list, pos);
+ __skb_queue_tail(event_list, pos);
}
}
@@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
}
static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
- struct sctp_ulpevent *event)
+ struct sctp_ulpevent *event)
{
__u16 sid, ssn;
struct sctp_stream *in;
@@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
{
struct sk_buff *pos, *tmp;
struct sctp_ulpevent *cevent;
- struct sctp_ulpevent *event = NULL;
+ struct sctp_ulpevent *event;
struct sctp_stream *in;
struct sk_buff_head temp;
__u16 csid, cssn;
@@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
in = &ulpq->asoc->ssnmap->in;
/* We are holding the chunks by stream, by SSN. */
+ skb_queue_head_init(&temp);
+ event = NULL;
sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
cevent = (struct sctp_ulpevent *) pos->cb;
csid = cevent->stream;
@@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
/* Found it, so mark in the ssnmap. */
sctp_ssn_next(in, csid);
- __skb_unlink(pos, pos->list);
+ __skb_unlink(pos, &ulpq->lobby);
if (!event) {
/* Create a temporary list to collect chunks on. */
event = sctp_skb2event(pos);
- skb_queue_head_init(&temp);
__skb_queue_tail(&temp, sctp_event2skb(event));
} else {
/* Attach all gathered skbs to the event. */
- __skb_queue_tail(sctp_event2skb(event)->list, pos);
+ __skb_queue_tail(&temp, pos);
}
}
- /* Send event to the ULP. */
+ /* Send event to the ULP. 'event' is the sctp_ulpevent for
+ * very first SKB on the 'temp' list.
+ */
if (event)
sctp_ulpq_tail_event(ulpq, event);
}
@@ -778,7 +791,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
/* Partial deliver the first message as there is pressure on rwnd. */
void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
- struct sctp_chunk *chunk, int gfp)
+ struct sctp_chunk *chunk,
+ unsigned int __nocast gfp)
{
struct sctp_ulpevent *event;
struct sctp_association *asoc;
@@ -802,7 +816,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
/* Renege some packets to make room for an incoming chunk. */
void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
- int gfp)
+ unsigned int __nocast gfp)
{
struct sctp_association *asoc;
__u16 needed, freed;
@@ -841,7 +855,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
/* Notify the application if an association is aborted and in
* partial delivery mode. Send up any pending received messages.
*/
-void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp)
+void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
{
struct sctp_ulpevent *ev = NULL;
struct sock *sk;
diff --git a/net/socket.c b/net/socket.c
index cec0cb3..c699e93 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -70,6 +70,8 @@
#include <linux/seq_file.h>
#include <linux/wanrouter.h>
#include <linux/if_bridge.h>
+#include <linux/if_frad.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
@@ -81,6 +83,7 @@
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
+#include <linux/audit.h>
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
@@ -226,7 +229,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
return 0;
if(copy_from_user(kaddr,uaddr,ulen))
return -EFAULT;
- return 0;
+ return audit_sockaddr(ulen, kaddr);
}
/**
@@ -271,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule
#define SOCKFS_MAGIC 0x534F434B
-static kmem_cache_t * sock_inode_cachep;
+static kmem_cache_t * sock_inode_cachep __read_mostly;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
@@ -330,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
}
-static struct vfsmount *sock_mnt;
+static struct vfsmount *sock_mnt __read_mostly;
static struct file_system_type sock_fs_type = {
.name = "sockfs",
@@ -382,9 +385,8 @@ int sock_map_fd(struct socket *sock)
goto out;
}
- sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
+ this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
this.name = name;
- this.len = strlen(name);
this.hash = SOCK_INODE(sock)->i_ino;
file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
@@ -404,6 +406,7 @@ int sock_map_fd(struct socket *sock)
file->f_mode = FMODE_READ | FMODE_WRITE;
file->f_flags = O_RDWR;
file->f_pos = 0;
+ file->private_data = sock;
fd_install(fd, file);
}
@@ -436,6 +439,9 @@ struct socket *sockfd_lookup(int fd, int *err)
return NULL;
}
+ if (file->f_op == &socket_file_ops)
+ return file->private_data; /* set in sock_map_fd */
+
inode = file->f_dentry->d_inode;
if (!S_ISSOCK(inode->i_mode)) {
*err = -ENOTSOCK;
@@ -661,7 +667,7 @@ static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
}
iocb->private = x;
x->kiocb = iocb;
- sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);
+ sock = iocb->ki_filp->private_data;
x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
@@ -703,7 +709,7 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
}
iocb->private = x;
x->kiocb = iocb;
- sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);
+ sock = iocb->ki_filp->private_data;
x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
@@ -720,13 +726,13 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
return __sock_sendmsg(iocb, sock, &x->async_msg, size);
}
-ssize_t sock_sendpage(struct file *file, struct page *page,
- int offset, size_t size, loff_t *ppos, int more)
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+ int offset, size_t size, loff_t *ppos, int more)
{
struct socket *sock;
int flags;
- sock = SOCKET_I(file->f_dentry->d_inode);
+ sock = file->private_data;
flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
if (more)
@@ -735,14 +741,14 @@ ssize_t sock_sendpage(struct file *file, struct page *page,
return sock->ops->sendpage(sock, page, offset, size, flags);
}
-static int sock_readv_writev(int type, struct inode * inode,
+static int sock_readv_writev(int type,
struct file * file, const struct iovec * iov,
long count, size_t size)
{
struct msghdr msg;
struct socket *sock;
- sock = SOCKET_I(inode);
+ sock = file->private_data;
msg.msg_name = NULL;
msg.msg_namelen = 0;
@@ -769,7 +775,7 @@ static ssize_t sock_readv(struct file *file, const struct iovec *vector,
int i;
for (i = 0 ; i < count ; i++)
tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_WRITE, file->f_dentry->d_inode,
+ return sock_readv_writev(VERIFY_WRITE,
file, vector, count, tot_len);
}
@@ -780,7 +786,7 @@ static ssize_t sock_writev(struct file *file, const struct iovec *vector,
int i;
for (i = 0 ; i < count ; i++)
tot_len += vector[i].iov_len;
- return sock_readv_writev(VERIFY_READ, file->f_dentry->d_inode,
+ return sock_readv_writev(VERIFY_READ,
file, vector, count, tot_len);
}
@@ -834,7 +840,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
int pid, err;
- sock = SOCKET_I(file->f_dentry->d_inode);
+ sock = file->private_data;
if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
err = dev_ioctl(cmd, argp);
} else
@@ -933,18 +939,18 @@ static unsigned int sock_poll(struct file *file, poll_table * wait)
/*
* We can't return errors to poll, so it's either yes or no.
*/
- sock = SOCKET_I(file->f_dentry->d_inode);
+ sock = file->private_data;
return sock->ops->poll(file, sock, wait);
}
static int sock_mmap(struct file * file, struct vm_area_struct * vma)
{
- struct socket *sock = SOCKET_I(file->f_dentry->d_inode);
+ struct socket *sock = file->private_data;
return sock->ops->mmap(file, sock, vma);
}
-int sock_close(struct inode *inode, struct file *filp)
+static int sock_close(struct inode *inode, struct file *filp)
{
/*
* It was possible the inode is NULL we were
@@ -989,7 +995,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
return -ENOMEM;
}
- sock = SOCKET_I(filp->f_dentry->d_inode);
+ sock = filp->private_data;
if ((sk=sock->sk) == NULL) {
kfree(fna);
@@ -1739,10 +1745,11 @@ asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags)
goto out_freeiov;
ctl_len = msg_sys.msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
- err = cmsghdr_from_user_compat_to_kern(&msg_sys, ctl, sizeof(ctl));
+ err = cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl, sizeof(ctl));
if (err)
goto out_freeiov;
ctl_buf = msg_sys.msg_control;
+ ctl_len = msg_sys.msg_controllen;
} else if (ctl_len) {
if (ctl_len > sizeof(ctl))
{
@@ -1906,7 +1913,11 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
/* copy_from_user should be SMP safe. */
if (copy_from_user(a, args, nargs[call]))
return -EFAULT;
-
+
+ err = audit_socketcall(nargs[call]/sizeof(unsigned long), a);
+ if (err)
+ return err;
+
a0=a[0];
a1=a[1];
@@ -2019,9 +2030,6 @@ int sock_unregister(int family)
return 0;
}
-
-extern void sk_init(void);
-
void __init sock_init(void)
{
/*
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 9bcec9b..505e2d4 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -66,10 +66,10 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
u32 flavor = pseudoflavor_to_flavor(pseudoflavor);
if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor]))
- return NULL;
+ return ERR_PTR(-EINVAL);
auth = ops->create(clnt, pseudoflavor);
- if (!auth)
- return NULL;
+ if (IS_ERR(auth))
+ return auth;
if (clnt->cl_auth)
rpcauth_destroy(clnt->cl_auth);
clnt->cl_auth = auth;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a33b627..2f7b867 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -660,14 +660,16 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
{
struct gss_auth *gss_auth;
struct rpc_auth * auth;
+ int err = -ENOMEM; /* XXX? */
dprintk("RPC: creating GSS authenticator for client %p\n",clnt);
if (!try_module_get(THIS_MODULE))
- return NULL;
+ return ERR_PTR(err);
if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
goto out_dec;
gss_auth->client = clnt;
+ err = -EINVAL;
gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
if (!gss_auth->mech) {
printk(KERN_WARNING "%s: Pseudoflavor %d not found!",
@@ -675,9 +677,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
goto err_free;
}
gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
- /* FIXME: Will go away once privacy support is merged in */
- if (gss_auth->service == RPC_GSS_SVC_PRIVACY)
- gss_auth->service = RPC_GSS_SVC_INTEGRITY;
+ if (gss_auth->service == 0)
+ goto err_put_mech;
INIT_LIST_HEAD(&gss_auth->upcalls);
spin_lock_init(&gss_auth->lock);
auth = &gss_auth->rpc_auth;
@@ -687,15 +688,18 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
auth->au_flavor = flavor;
atomic_set(&auth->au_count, 1);
- if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0)
+ err = rpcauth_init_credcache(auth, GSS_CRED_EXPIRE);
+ if (err)
goto err_put_mech;
snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s",
clnt->cl_pathname,
gss_auth->mech->gm_name);
gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
- if (IS_ERR(gss_auth->dentry))
+ if (IS_ERR(gss_auth->dentry)) {
+ err = PTR_ERR(gss_auth->dentry);
goto err_put_mech;
+ }
return auth;
err_put_mech:
@@ -704,7 +708,7 @@ err_free:
kfree(gss_auth);
out_dec:
module_put(THIS_MODULE);
- return NULL;
+ return ERR_PTR(err);
}
static void
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 24c21f2..ee6ae74 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -160,7 +160,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
" unsupported checksum %d", cksumtype);
goto out;
}
- if (!(tfm = crypto_alloc_tfm(cksumname, 0)))
+ if (!(tfm = crypto_alloc_tfm(cksumname, CRYPTO_TFM_REQ_MAY_SLEEP)))
goto out;
cksum->len = crypto_tfm_alg_digestsize(tfm);
if ((cksum->data = kmalloc(cksum->len, GFP_KERNEL)) == NULL)
@@ -185,9 +185,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
sg->page = body->pages[i];
sg->offset = offset;
sg->length = thislen;
- kmap(sg->page); /* XXX kmap_atomic? */
crypto_digest_update(tfm, sg, 1);
- kunmap(sg->page);
len -= thislen;
i++;
offset = 0;
@@ -201,8 +199,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
crypto_digest_final(tfm, cksum->data);
code = 0;
out:
- if (tfm)
- crypto_free_tfm(tfm);
+ crypto_free_tfm(tfm);
return code;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index cf72651..606a8a8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -185,12 +185,9 @@ static void
gss_delete_sec_context_kerberos(void *internal_ctx) {
struct krb5_ctx *kctx = internal_ctx;
- if (kctx->seq)
- crypto_free_tfm(kctx->seq);
- if (kctx->enc)
- crypto_free_tfm(kctx->enc);
- if (kctx->mech_used.data)
- kfree(kctx->mech_used.data);
+ crypto_free_tfm(kctx->seq);
+ crypto_free_tfm(kctx->enc);
+ kfree(kctx->mech_used.data);
kfree(kctx);
}
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
index dad0599..6c97d61 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c
@@ -214,14 +214,10 @@ static void
gss_delete_sec_context_spkm3(void *internal_ctx) {
struct spkm3_ctx *sctx = internal_ctx;
- if(sctx->derived_integ_key)
- crypto_free_tfm(sctx->derived_integ_key);
- if(sctx->derived_conf_key)
- crypto_free_tfm(sctx->derived_conf_key);
- if(sctx->share_key.data)
- kfree(sctx->share_key.data);
- if(sctx->mech_used.data)
- kfree(sctx->mech_used.data);
+ crypto_free_tfm(sctx->derived_integ_key);
+ crypto_free_tfm(sctx->derived_conf_key);
+ kfree(sctx->share_key.data);
+ kfree(sctx->mech_used.data);
kfree(sctx);
}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5c8fe3b..e330819 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -250,6 +250,7 @@ out:
}
static struct cache_detail rsi_cache = {
+ .owner = THIS_MODULE,
.hash_size = RSI_HASHMAX,
.hash_table = rsi_table,
.name = "auth.rpcsec.init",
@@ -436,6 +437,7 @@ out:
}
static struct cache_detail rsc_cache = {
+ .owner = THIS_MODULE,
.hash_size = RSC_HASHMAX,
.hash_table = rsc_table,
.name = "auth.rpcsec.context",
@@ -1074,7 +1076,9 @@ gss_svc_init(void)
void
gss_svc_shutdown(void)
{
- cache_unregister(&rsc_cache);
- cache_unregister(&rsi_cache);
+ if (cache_unregister(&rsc_cache))
+ printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n");
+ if (cache_unregister(&rsi_cache))
+ printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
svc_auth_unregister(RPC_AUTH_GSS);
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 900f5bc..f509e99 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -177,7 +177,7 @@ void cache_register(struct cache_detail *cd)
cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
if (cd->proc_ent) {
struct proc_dir_entry *p;
- cd->proc_ent->owner = THIS_MODULE;
+ cd->proc_ent->owner = cd->owner;
cd->channel_ent = cd->content_ent = NULL;
p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR,
@@ -185,7 +185,7 @@ void cache_register(struct cache_detail *cd)
cd->flush_ent = p;
if (p) {
p->proc_fops = &cache_flush_operations;
- p->owner = THIS_MODULE;
+ p->owner = cd->owner;
p->data = cd;
}
@@ -195,7 +195,7 @@ void cache_register(struct cache_detail *cd)
cd->channel_ent = p;
if (p) {
p->proc_fops = &cache_file_operations;
- p->owner = THIS_MODULE;
+ p->owner = cd->owner;
p->data = cd;
}
}
@@ -205,7 +205,7 @@ void cache_register(struct cache_detail *cd)
cd->content_ent = p;
if (p) {
p->proc_fops = &content_file_operations;
- p->owner = THIS_MODULE;
+ p->owner = cd->owner;
p->data = cd;
}
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 02bc029..f17e615 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,12 +97,13 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
* made to sleep too long.
*/
struct rpc_clnt *
-rpc_create_client(struct rpc_xprt *xprt, char *servname,
+rpc_new_client(struct rpc_xprt *xprt, char *servname,
struct rpc_program *program, u32 vers,
rpc_authflavor_t flavor)
{
struct rpc_version *version;
struct rpc_clnt *clnt = NULL;
+ struct rpc_auth *auth;
int err;
int len;
@@ -157,10 +158,11 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname,
if (err < 0)
goto out_no_path;
- err = -ENOMEM;
- if (!rpcauth_create(flavor, clnt)) {
+ auth = rpcauth_create(flavor, clnt);
+ if (IS_ERR(auth)) {
printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
flavor);
+ err = PTR_ERR(auth);
goto out_no_auth;
}
@@ -178,6 +180,37 @@ out_no_path:
kfree(clnt->cl_server);
kfree(clnt);
out_err:
+ xprt_destroy(xprt);
+ return ERR_PTR(err);
+}
+
+/**
+ * Create an RPC client
+ * @xprt - pointer to xprt struct
+ * @servname - name of server
+ * @info - rpc_program
+ * @version - rpc_program version
+ * @authflavor - rpc_auth flavour to use
+ *
+ * Creates an RPC client structure, then pings the server in order to
+ * determine if it is up, and if it supports this program and version.
+ *
+ * This function should never be called by asynchronous tasks such as
+ * the portmapper.
+ */
+struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
+ struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
+{
+ struct rpc_clnt *clnt;
+ int err;
+
+ clnt = rpc_new_client(xprt, servname, info, version, authflavor);
+ if (IS_ERR(clnt))
+ return clnt;
+ err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+ if (err == 0)
+ return clnt;
+ rpc_shutdown_client(clnt);
return ERR_PTR(err);
}
@@ -208,6 +241,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
if (new->cl_auth)
atomic_inc(&new->cl_auth->au_count);
+ new->cl_pmap = &new->cl_pmap_default;
+ rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
return new;
out_no_clnt:
printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -296,6 +331,44 @@ rpc_release_client(struct rpc_clnt *clnt)
rpc_destroy_client(clnt);
}
+/**
+ * rpc_bind_new_program - bind a new RPC program to an existing client
+ * @old - old rpc_client
+ * @program - rpc program to set
+ * @vers - rpc program version
+ *
+ * Clones the rpc client and sets up a new RPC program. This is mainly
+ * of use for enabling different RPC programs to share the same transport.
+ * The Sun NFSv2/v3 ACL protocol can do this.
+ */
+struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
+ struct rpc_program *program,
+ int vers)
+{
+ struct rpc_clnt *clnt;
+ struct rpc_version *version;
+ int err;
+
+ BUG_ON(vers >= program->nrvers || !program->version[vers]);
+ version = program->version[vers];
+ clnt = rpc_clone_client(old);
+ if (IS_ERR(clnt))
+ goto out;
+ clnt->cl_procinfo = version->procs;
+ clnt->cl_maxproc = version->nrprocs;
+ clnt->cl_protname = program->name;
+ clnt->cl_prog = program->number;
+ clnt->cl_vers = version->number;
+ clnt->cl_stats = program->stats;
+ err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+ if (err != 0) {
+ rpc_shutdown_client(clnt);
+ clnt = ERR_PTR(err);
+ }
+out:
+ return clnt;
+}
+
/*
* Default callback for async RPC calls
*/
@@ -305,38 +378,41 @@ rpc_default_callback(struct rpc_task *task)
}
/*
- * Export the signal mask handling for aysnchronous code that
+ * Export the signal mask handling for synchronous code that
* sleeps on RPC calls
*/
+#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL))
+static void rpc_save_sigmask(sigset_t *oldset, int intr)
+{
+ unsigned long sigallow = 0;
+ sigset_t sigmask;
+
+ /* Block all signals except those listed in sigallow */
+ if (intr)
+ sigallow |= RPC_INTR_SIGNALS;
+ siginitsetinv(&sigmask, sigallow);
+ sigprocmask(SIG_BLOCK, &sigmask, oldset);
+}
+
+static inline void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
+{
+ rpc_save_sigmask(oldset, !RPC_TASK_UNINTERRUPTIBLE(task));
+}
+
+static inline void rpc_restore_sigmask(sigset_t *oldset)
+{
+ sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
{
- unsigned long sigallow = sigmask(SIGKILL);
- unsigned long irqflags;
-
- /* Turn off various signals */
- if (clnt->cl_intr) {
- struct k_sigaction *action = current->sighand->action;
- if (action[SIGINT-1].sa.sa_handler == SIG_DFL)
- sigallow |= sigmask(SIGINT);
- if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL)
- sigallow |= sigmask(SIGQUIT);
- }
- spin_lock_irqsave(&current->sighand->siglock, irqflags);
- *oldset = current->blocked;
- siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+ rpc_save_sigmask(oldset, clnt->cl_intr);
}
void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
{
- unsigned long irqflags;
-
- spin_lock_irqsave(&current->sighand->siglock, irqflags);
- current->blocked = *oldset;
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+ rpc_restore_sigmask(oldset);
}
/*
@@ -354,26 +430,26 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
BUG_ON(flags & RPC_TASK_ASYNC);
- rpc_clnt_sigmask(clnt, &oldset);
-
status = -ENOMEM;
task = rpc_new_task(clnt, NULL, flags);
if (task == NULL)
goto out;
+ /* Mask signals on RPC calls _and_ GSS_AUTH upcalls */
+ rpc_task_sigmask(task, &oldset);
+
rpc_call_setup(task, msg, 0);
/* Set up the call info struct and execute the task */
- if (task->tk_status == 0)
+ if (task->tk_status == 0) {
status = rpc_execute(task);
- else {
+ } else {
status = task->tk_status;
rpc_release_task(task);
}
+ rpc_restore_sigmask(&oldset);
out:
- rpc_clnt_sigunmask(clnt, &oldset);
-
return status;
}
@@ -394,8 +470,6 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
flags |= RPC_TASK_ASYNC;
- rpc_clnt_sigmask(clnt, &oldset);
-
/* Create/initialize a new RPC task */
if (!callback)
callback = rpc_default_callback;
@@ -404,6 +478,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
goto out;
task->tk_calldata = data;
+ /* Mask signals on GSS_AUTH upcalls */
+ rpc_task_sigmask(task, &oldset);
+
rpc_call_setup(task, msg, 0);
/* Set up the call info struct and execute the task */
@@ -413,9 +490,8 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
else
rpc_release_task(task);
+ rpc_restore_sigmask(&oldset);
out:
- rpc_clnt_sigunmask(clnt, &oldset);
-
return status;
}
@@ -593,7 +669,7 @@ call_allocate(struct rpc_task *task)
return;
printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task);
- if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) {
+ if (RPC_IS_ASYNC(task) || !signalled()) {
xprt_release(task);
task->tk_action = call_reserve;
rpc_delay(task, HZ>>4);
@@ -957,7 +1033,9 @@ call_header(struct rpc_task *task)
*p++ = htonl(clnt->cl_prog); /* program number */
*p++ = htonl(clnt->cl_vers); /* program version */
*p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */
- return rpcauth_marshcred(task, p);
+ p = rpcauth_marshcred(task, p);
+ req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
+ return p;
}
/*
@@ -986,10 +1064,11 @@ call_verify(struct rpc_task *task)
case RPC_AUTH_ERROR:
break;
case RPC_MISMATCH:
- printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__);
- goto out_eio;
+ dprintk("%s: RPC call version mismatch!\n", __FUNCTION__);
+ error = -EPROTONOSUPPORT;
+ goto out_err;
default:
- printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
+ dprintk("%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
goto out_eio;
}
if (--len < 0)
@@ -1040,23 +1119,26 @@ call_verify(struct rpc_task *task)
case RPC_SUCCESS:
return p;
case RPC_PROG_UNAVAIL:
- printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n",
+ dprintk("RPC: call_verify: program %u is unsupported by server %s\n",
(unsigned int)task->tk_client->cl_prog,
task->tk_client->cl_server);
- goto out_eio;
+ error = -EPFNOSUPPORT;
+ goto out_err;
case RPC_PROG_MISMATCH:
- printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n",
+ dprintk("RPC: call_verify: program %u, version %u unsupported by server %s\n",
(unsigned int)task->tk_client->cl_prog,
(unsigned int)task->tk_client->cl_vers,
task->tk_client->cl_server);
- goto out_eio;
+ error = -EPROTONOSUPPORT;
+ goto out_err;
case RPC_PROC_UNAVAIL:
- printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
+ dprintk("RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
task->tk_msg.rpc_proc,
task->tk_client->cl_prog,
task->tk_client->cl_vers,
task->tk_client->cl_server);
- goto out_eio;
+ error = -EOPNOTSUPP;
+ goto out_err;
case RPC_GARBAGE_ARGS:
dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__);
break; /* retry */
@@ -1069,7 +1151,7 @@ out_retry:
task->tk_client->cl_stats->rpcgarbage++;
if (task->tk_garb_retry) {
task->tk_garb_retry--;
- dprintk(KERN_WARNING "RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
+ dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
task->tk_action = call_bind;
return NULL;
}
@@ -1083,3 +1165,30 @@ out_overflow:
printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
goto out_retry;
}
+
+static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
+{
+ return 0;
+}
+
+static int rpcproc_decode_null(void *rqstp, u32 *data, void *obj)
+{
+ return 0;
+}
+
+static struct rpc_procinfo rpcproc_null = {
+ .p_encode = rpcproc_encode_null,
+ .p_decode = rpcproc_decode_null,
+};
+
+int rpc_ping(struct rpc_clnt *clnt, int flags)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &rpcproc_null,
+ };
+ int err;
+ msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+ err = rpc_call_sync(clnt, &msg, flags);
+ put_rpccred(msg.rpc_cred);
+ return err;
+}
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index d0b1d2c..4e81f27 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -53,6 +53,9 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
task->tk_pid, clnt->cl_server,
map->pm_prog, map->pm_vers, map->pm_prot);
+ /* Autobind on cloned rpc clients is discouraged */
+ BUG_ON(clnt->cl_parent != clnt);
+
spin_lock(&pmap_lock);
if (map->pm_binding) {
rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
@@ -207,12 +210,10 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
xprt->addr.sin_port = htons(RPC_PMAP_PORT);
/* printk("pmap: create clnt\n"); */
- clnt = rpc_create_client(xprt, hostname,
+ clnt = rpc_new_client(xprt, hostname,
&pmap_program, RPC_PMAP_VERSION,
RPC_AUTH_UNIX);
- if (IS_ERR(clnt)) {
- xprt_destroy(xprt);
- } else {
+ if (!IS_ERR(clnt)) {
clnt->cl_softrtry = 1;
clnt->cl_chatty = 1;
clnt->cl_oneshot = 1;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 554f224..ded6c63 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -3,7 +3,7 @@
*
* Userland/kernel interface for rpcauth_gss.
* Code shamelessly plagiarized from fs/nfsd/nfsctl.c
- * and fs/driverfs/inode.c
+ * and fs/sysfs/inode.c
*
* Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no>
*
@@ -28,13 +28,13 @@
#include <linux/workqueue.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
-static struct vfsmount *rpc_mount;
+static struct vfsmount *rpc_mount __read_mostly;
static int rpc_mount_count;
static struct file_system_type rpc_pipe_fs_type;
-static kmem_cache_t *rpc_inode_cachep;
+static kmem_cache_t *rpc_inode_cachep __read_mostly;
#define RPC_UPCALL_TIMEOUT (30*HZ)
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c06614d..f310403 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -34,10 +34,10 @@ static int rpc_task_id;
#define RPC_BUFFER_MAXSIZE (2048)
#define RPC_BUFFER_POOLSIZE (8)
#define RPC_TASK_POOLSIZE (8)
-static kmem_cache_t *rpc_task_slabp;
-static kmem_cache_t *rpc_buffer_slabp;
-static mempool_t *rpc_task_mempool;
-static mempool_t *rpc_buffer_mempool;
+static kmem_cache_t *rpc_task_slabp __read_mostly;
+static kmem_cache_t *rpc_buffer_slabp __read_mostly;
+static mempool_t *rpc_task_mempool __read_mostly;
+static mempool_t *rpc_buffer_mempool __read_mostly;
static void __rpc_default_timer(struct rpc_task *task);
static void rpciod_killall(void);
@@ -290,7 +290,7 @@ static void rpc_make_runnable(struct rpc_task *task)
return;
}
} else
- wake_up(&task->u.tk_wait.waitq);
+ wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
}
/*
@@ -555,6 +555,38 @@ __rpc_atrun(struct rpc_task *task)
}
/*
+ * Helper that calls task->tk_exit if it exists and then returns
+ * true if we should exit __rpc_execute.
+ */
+static inline int __rpc_do_exit(struct rpc_task *task)
+{
+ if (task->tk_exit != NULL) {
+ lock_kernel();
+ task->tk_exit(task);
+ unlock_kernel();
+ /* If tk_action is non-null, we should restart the call */
+ if (task->tk_action != NULL) {
+ if (!RPC_ASSASSINATED(task)) {
+ /* Release RPC slot and buffer memory */
+ xprt_release(task);
+ rpc_free(task);
+ return 0;
+ }
+ printk(KERN_ERR "RPC: dead task tried to walk away.\n");
+ }
+ }
+ return 1;
+}
+
+static int rpc_wait_bit_interruptible(void *word)
+{
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ schedule();
+ return 0;
+}
+
+/*
* This is the RPC `scheduler' (or rather, the finite state machine).
*/
static int __rpc_execute(struct rpc_task *task)
@@ -566,8 +598,7 @@ static int __rpc_execute(struct rpc_task *task)
BUG_ON(RPC_IS_QUEUED(task));
- restarted:
- while (1) {
+ for (;;) {
/*
* Garbage collection of pending timers...
*/
@@ -600,11 +631,12 @@ static int __rpc_execute(struct rpc_task *task)
* by someone else.
*/
if (!RPC_IS_QUEUED(task)) {
- if (!task->tk_action)
+ if (task->tk_action != NULL) {
+ lock_kernel();
+ task->tk_action(task);
+ unlock_kernel();
+ } else if (__rpc_do_exit(task))
break;
- lock_kernel();
- task->tk_action(task);
- unlock_kernel();
}
/*
@@ -624,44 +656,26 @@ static int __rpc_execute(struct rpc_task *task)
/* sync task: sleep here */
dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
- if (RPC_TASK_UNINTERRUPTIBLE(task)) {
- __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task));
- } else {
- __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status);
+ /* Note: Caller should be using rpc_clnt_sigmask() */
+ status = out_of_line_wait_on_bit(&task->tk_runstate,
+ RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
+ TASK_INTERRUPTIBLE);
+ if (status == -ERESTARTSYS) {
/*
* When a sync task receives a signal, it exits with
* -ERESTARTSYS. In order to catch any callbacks that
* clean up after sleeping on some queue, we don't
* break the loop here, but go around once more.
*/
- if (status == -ERESTARTSYS) {
- dprintk("RPC: %4d got signal\n", task->tk_pid);
- task->tk_flags |= RPC_TASK_KILLED;
- rpc_exit(task, -ERESTARTSYS);
- rpc_wake_up_task(task);
- }
+ dprintk("RPC: %4d got signal\n", task->tk_pid);
+ task->tk_flags |= RPC_TASK_KILLED;
+ rpc_exit(task, -ERESTARTSYS);
+ rpc_wake_up_task(task);
}
rpc_set_running(task);
dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
}
- if (task->tk_exit) {
- lock_kernel();
- task->tk_exit(task);
- unlock_kernel();
- /* If tk_action is non-null, the user wants us to restart */
- if (task->tk_action) {
- if (!RPC_ASSASSINATED(task)) {
- /* Release RPC slot and buffer memory */
- if (task->tk_rqstp)
- xprt_release(task);
- rpc_free(task);
- goto restarted;
- }
- printk(KERN_ERR "RPC: dead task tries to walk away.\n");
- }
- }
-
dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status);
status = task->tk_status;
@@ -759,8 +773,6 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
/* Initialize workqueue for async tasks */
task->tk_workqueue = rpciod_workqueue;
- if (!RPC_IS_ASYNC(task))
- init_waitqueue_head(&task->u.tk_wait.waitq);
if (clnt) {
atomic_inc(&clnt->cl_users);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 9b67dc1..4979f22 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -35,13 +35,13 @@ static int rpc_proc_show(struct seq_file *seq, void *v) {
int i, j;
seq_printf(seq,
- "net %d %d %d %d\n",
+ "net %u %u %u %u\n",
statp->netcnt,
statp->netudpcnt,
statp->nettcpcnt,
statp->nettcpconn);
seq_printf(seq,
- "rpc %d %d %d\n",
+ "rpc %u %u %u\n",
statp->rpccnt,
statp->rpcretrans,
statp->rpcauthrefresh);
@@ -50,10 +50,10 @@ static int rpc_proc_show(struct seq_file *seq, void *v) {
const struct rpc_version *vers = prog->version[i];
if (!vers)
continue;
- seq_printf(seq, "proc%d %d",
+ seq_printf(seq, "proc%u %u",
vers->number, vers->nrprocs);
for (j = 0; j < vers->nrprocs; j++)
- seq_printf(seq, " %d",
+ seq_printf(seq, " %u",
vers->procs[j].p_count);
seq_putc(seq, '\n');
}
@@ -83,13 +83,13 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
int i, j;
seq_printf(seq,
- "net %d %d %d %d\n",
+ "net %u %u %u %u\n",
statp->netcnt,
statp->netudpcnt,
statp->nettcpcnt,
statp->nettcpconn);
seq_printf(seq,
- "rpc %d %d %d %d %d\n",
+ "rpc %u %u %u %u %u\n",
statp->rpccnt,
statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt,
statp->rpcbadfmt,
@@ -99,9 +99,9 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
for (i = 0; i < prog->pg_nvers; i++) {
if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc))
continue;
- seq_printf(seq, "proc%d %d", i, vers->vs_nproc);
+ seq_printf(seq, "proc%d %u", i, vers->vs_nproc);
for (j = 0; j < vers->vs_nproc; j++, proc++)
- seq_printf(seq, " %d", proc->pc_count);
+ seq_printf(seq, " %u", proc->pc_count);
seq_putc(seq, '\n');
}
}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d4f26bf..ed48ff0 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,7 +41,9 @@ EXPORT_SYMBOL(rpc_release_task);
/* RPC client functions */
EXPORT_SYMBOL(rpc_create_client);
+EXPORT_SYMBOL(rpc_new_client);
EXPORT_SYMBOL(rpc_clone_client);
+EXPORT_SYMBOL(rpc_bind_new_program);
EXPORT_SYMBOL(rpc_destroy_client);
EXPORT_SYMBOL(rpc_shutdown_client);
EXPORT_SYMBOL(rpc_release_client);
@@ -61,7 +63,6 @@ EXPORT_SYMBOL(rpc_mkpipe);
/* Client transport */
EXPORT_SYMBOL(xprt_create_proto);
-EXPORT_SYMBOL(xprt_destroy);
EXPORT_SYMBOL(xprt_set_timeout);
EXPORT_SYMBOL(xprt_udp_slot_table_entries);
EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
@@ -129,6 +130,10 @@ EXPORT_SYMBOL(xdr_encode_netobj);
EXPORT_SYMBOL(xdr_encode_pages);
EXPORT_SYMBOL(xdr_inline_pages);
EXPORT_SYMBOL(xdr_shift_buf);
+EXPORT_SYMBOL(xdr_encode_word);
+EXPORT_SYMBOL(xdr_decode_word);
+EXPORT_SYMBOL(xdr_encode_array2);
+EXPORT_SYMBOL(xdr_decode_array2);
EXPORT_SYMBOL(xdr_buf_from_iov);
EXPORT_SYMBOL(xdr_buf_subsegment);
EXPORT_SYMBOL(xdr_buf_read_netobj);
@@ -171,8 +176,10 @@ cleanup_sunrpc(void)
{
unregister_rpc_pipefs();
rpc_destroy_mempool();
- cache_unregister(&auth_domain_cache);
- cache_unregister(&ip_map_cache);
+ if (cache_unregister(&auth_domain_cache))
+ printk(KERN_ERR "sunrpc: failed to unregister auth_domain cache\n");
+ if (cache_unregister(&ip_map_cache))
+ printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
#ifdef RPC_DEBUG
rpc_unregister_sysctl();
#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb2d99f..e9bd912 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -35,20 +35,24 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL)))
return NULL;
memset(serv, 0, sizeof(*serv));
+ serv->sv_name = prog->pg_name;
serv->sv_program = prog;
serv->sv_nrthreads = 1;
serv->sv_stats = prog->pg_stats;
serv->sv_bufsz = bufsize? bufsize : 4096;
- prog->pg_lovers = prog->pg_nvers-1;
xdrsize = 0;
- for (vers=0; vers<prog->pg_nvers ; vers++)
- if (prog->pg_vers[vers]) {
- prog->pg_hivers = vers;
- if (prog->pg_lovers > vers)
- prog->pg_lovers = vers;
- if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
- xdrsize = prog->pg_vers[vers]->vs_xdrsize;
- }
+ while (prog) {
+ prog->pg_lovers = prog->pg_nvers-1;
+ for (vers=0; vers<prog->pg_nvers ; vers++)
+ if (prog->pg_vers[vers]) {
+ prog->pg_hivers = vers;
+ if (prog->pg_lovers > vers)
+ prog->pg_lovers = vers;
+ if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
+ xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+ }
+ prog = prog->pg_next;
+ }
serv->sv_xdrsize = xdrsize;
INIT_LIST_HEAD(&serv->sv_threads);
INIT_LIST_HEAD(&serv->sv_sockets);
@@ -56,8 +60,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
INIT_LIST_HEAD(&serv->sv_permsocks);
spin_lock_init(&serv->sv_lock);
- serv->sv_name = prog->pg_name;
-
/* Remove any stale portmap registrations */
svc_register(serv, 0, 0);
@@ -281,6 +283,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
rqstp->rq_res.len = 0;
rqstp->rq_res.page_base = 0;
rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.buflen = PAGE_SIZE;
rqstp->rq_res.tail[0].iov_len = 0;
/* tcp needs a space for the record length... */
if (rqstp->rq_prot == IPPROTO_TCP)
@@ -338,7 +341,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
goto sendit;
}
- if (prog != progp->pg_prog)
+ for (progp = serv->sv_program; progp; progp = progp->pg_next)
+ if (prog == progp->pg_prog)
+ break;
+ if (progp == NULL)
goto err_bad_prog;
if (vers >= progp->pg_nvers ||
@@ -451,11 +457,7 @@ err_bad_auth:
goto sendit;
err_bad_prog:
-#ifdef RPC_PARANOIA
- if (prog != 100227 || progp->pg_prog != 100003)
- printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog);
- /* else it is just a Solaris client seeing if ACLs are supported */
-#endif
+ dprintk("svc: unknown program %d\n", prog);
serv->sv_stats->rpcbadfmt++;
svc_putu32(resv, rpc_prog_unavail);
goto sendit;
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index bde8147..dda4f0c 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -143,6 +143,7 @@ static void auth_domain_drop(struct cache_head *item, struct cache_detail *cd)
struct cache_detail auth_domain_cache = {
+ .owner = THIS_MODULE,
.hash_size = DN_HASHMAX,
.hash_table = auth_domain_table,
.name = "auth.domain",
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b40..cac2e77 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
#include <linux/err.h>
#include <linux/seq_file.h>
#include <linux/hash.h>
+#include <linux/string.h>
#define RPCDBG_FACILITY RPCDBG_AUTH
@@ -20,14 +21,6 @@
*/
-static char *strdup(char *s)
-{
- char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
- if (rv)
- strcpy(rv, s);
- return rv;
-}
-
struct unix_domain {
struct auth_domain h;
int addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
if (new == NULL)
return NULL;
cache_init(&new->h.h);
- new->h.name = strdup(name);
+ new->h.name = kstrdup(name, GFP_KERNEL);
new->h.flavour = RPC_AUTH_UNIX;
new->addr_changes = 0;
new->h.h.expiry_time = NEVER;
@@ -249,6 +242,7 @@ static int ip_map_show(struct seq_file *m,
struct cache_detail ip_map_cache = {
+ .owner = THIS_MODULE,
.hash_size = IP_HASHMAX,
.hash_table = ip_table,
.name = "auth.unix.ip",
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0590703..05fe2e7 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -34,7 +34,7 @@
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -584,13 +584,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
/* possibly an icmp error */
dprintk("svc: recvfrom returned error %d\n", -err);
}
- if (skb->stamp.tv_sec == 0) {
- skb->stamp.tv_sec = xtime.tv_sec;
- skb->stamp.tv_usec = xtime.tv_nsec * 1000;
+ if (skb->tstamp.off_sec == 0) {
+ struct timeval tv;
+
+ tv.tv_sec = xtime.tv_sec;
+ tv.tv_usec = xtime.tv_nsec * 1000;
+ skb_set_timestamp(skb, &tv);
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
}
- svsk->sk_sk->sk_stamp = skb->stamp;
+ skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
/*
@@ -1185,8 +1188,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
arg->page_len = (pages-2)*PAGE_SIZE;
arg->len = (pages-1)*PAGE_SIZE;
arg->tail[0].iov_len = 0;
-
- try_to_freeze(PF_FREEZE);
+
+ try_to_freeze();
if (signalled())
return -EINTR;
@@ -1227,7 +1230,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
schedule_timeout(timeout);
- try_to_freeze(PF_FREEZE);
+ try_to_freeze();
spin_lock_bh(&serv->sv_lock);
remove_wait_queue(&rqstp->rq_wait, &wait);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 67b9f035..fde16f4 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,21 +176,23 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
xdr->buflen += len;
}
-void
+ssize_t
xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
skb_reader_t *desc,
skb_read_actor_t copy_actor)
{
struct page **ppage = xdr->pages;
unsigned int len, pglen = xdr->page_len;
+ ssize_t copied = 0;
int ret;
len = xdr->head[0].iov_len;
if (base < len) {
len -= base;
ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+ copied += ret;
if (ret != len || !desc->count)
- return;
+ goto out;
base = 0;
} else
base -= len;
@@ -210,6 +212,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
do {
char *kaddr;
+ /* ACL likes to be lazy in allocating pages - ACLs
+ * are small by default but can get huge. */
+ if (unlikely(*ppage == NULL)) {
+ *ppage = alloc_page(GFP_ATOMIC);
+ if (unlikely(*ppage == NULL)) {
+ if (copied == 0)
+ copied = -ENOMEM;
+ goto out;
+ }
+ }
+
len = PAGE_CACHE_SIZE;
kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
if (base) {
@@ -225,14 +238,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
}
flush_dcache_page(*ppage);
kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+ copied += ret;
if (ret != len || !desc->count)
- return;
+ goto out;
ppage++;
} while ((pglen -= len) != 0);
copy_tail:
len = xdr->tail[0].iov_len;
if (base < len)
- copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+ copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+ return copied;
}
@@ -616,12 +632,24 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p)
{
struct kvec *iov = buf->head;
+ int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
+ BUG_ON(scratch_len < 0);
xdr->buf = buf;
xdr->iov = iov;
- xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
- buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base;
- xdr->p = p;
+ xdr->p = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
+ xdr->end = (uint32_t *)((char *)iov->iov_base + scratch_len);
+ BUG_ON(iov->iov_len > scratch_len);
+
+ if (p != xdr->p && p != NULL) {
+ size_t len;
+
+ BUG_ON(p < xdr->p || p > xdr->end);
+ len = (char *)p - (char *)xdr->p;
+ xdr->p = p;
+ buf->len += len;
+ iov->iov_len += len;
+ }
}
EXPORT_SYMBOL(xdr_init_encode);
@@ -859,8 +887,34 @@ out:
return status;
}
-static int
-read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+/* obj is assumed to point to allocated memory of size at least len: */
+int
+write_bytes_to_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
+{
+ struct xdr_buf subbuf;
+ int this_len;
+ int status;
+
+ status = xdr_buf_subsegment(buf, &subbuf, base, len);
+ if (status)
+ goto out;
+ this_len = min(len, (int)subbuf.head[0].iov_len);
+ memcpy(subbuf.head[0].iov_base, obj, this_len);
+ len -= this_len;
+ obj += this_len;
+ this_len = min(len, (int)subbuf.page_len);
+ if (this_len)
+ _copy_to_pages(subbuf.pages, subbuf.page_base, obj, this_len);
+ len -= this_len;
+ obj += this_len;
+ this_len = min(len, (int)subbuf.tail[0].iov_len);
+ memcpy(subbuf.tail[0].iov_base, obj, this_len);
+out:
+ return status;
+}
+
+int
+xdr_decode_word(struct xdr_buf *buf, int base, u32 *obj)
{
u32 raw;
int status;
@@ -872,6 +926,14 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
return 0;
}
+int
+xdr_encode_word(struct xdr_buf *buf, int base, u32 obj)
+{
+ u32 raw = htonl(obj);
+
+ return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
+}
+
/* If the netobj starting offset bytes from the start of xdr_buf is contained
* entirely in the head or the tail, set object to point to it; otherwise
* try to find space for it at the end of the tail, copy it there, and
@@ -882,7 +944,7 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
u32 tail_offset = buf->head[0].iov_len + buf->page_len;
u32 obj_end_offset;
- if (read_u32_from_xdr_buf(buf, offset, &obj->len))
+ if (xdr_decode_word(buf, offset, &obj->len))
goto out;
obj_end_offset = offset + 4 + obj->len;
@@ -915,3 +977,220 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
out:
return -1;
}
+
+/* Returns 0 on success, or else a negative error code. */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc, int encode)
+{
+ char *elem = NULL, *c;
+ unsigned int copied = 0, todo, avail_here;
+ struct page **ppages = NULL;
+ int err;
+
+ if (encode) {
+ if (xdr_encode_word(buf, base, desc->array_len) != 0)
+ return -EINVAL;
+ } else {
+ if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+ desc->array_len > desc->array_maxlen ||
+ (unsigned long) base + 4 + desc->array_len *
+ desc->elem_size > buf->len)
+ return -EINVAL;
+ }
+ base += 4;
+
+ if (!desc->xcode)
+ return 0;
+
+ todo = desc->array_len * desc->elem_size;
+
+ /* process head */
+ if (todo && base < buf->head->iov_len) {
+ c = buf->head->iov_base + base;
+ avail_here = min_t(unsigned int, todo,
+ buf->head->iov_len - base);
+ todo -= avail_here;
+
+ while (avail_here >= desc->elem_size) {
+ err = desc->xcode(desc, c);
+ if (err)
+ goto out;
+ c += desc->elem_size;
+ avail_here -= desc->elem_size;
+ }
+ if (avail_here) {
+ if (!elem) {
+ elem = kmalloc(desc->elem_size, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!elem)
+ goto out;
+ }
+ if (encode) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ memcpy(c, elem, avail_here);
+ } else
+ memcpy(elem, c, avail_here);
+ copied = avail_here;
+ }
+ base = buf->head->iov_len; /* align to start of pages */
+ }
+
+ /* process pages array */
+ base -= buf->head->iov_len;
+ if (todo && base < buf->page_len) {
+ unsigned int avail_page;
+
+ avail_here = min(todo, buf->page_len - base);
+ todo -= avail_here;
+
+ base += buf->page_base;
+ ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+ base &= ~PAGE_CACHE_MASK;
+ avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+ avail_here);
+ c = kmap(*ppages) + base;
+
+ while (avail_here) {
+ avail_here -= avail_page;
+ if (copied || avail_page < desc->elem_size) {
+ unsigned int l = min(avail_page,
+ desc->elem_size - copied);
+ if (!elem) {
+ elem = kmalloc(desc->elem_size,
+ GFP_KERNEL);
+ err = -ENOMEM;
+ if (!elem)
+ goto out;
+ }
+ if (encode) {
+ if (!copied) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ }
+ memcpy(c, elem + copied, l);
+ copied += l;
+ if (copied == desc->elem_size)
+ copied = 0;
+ } else {
+ memcpy(elem + copied, c, l);
+ copied += l;
+ if (copied == desc->elem_size) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ copied = 0;
+ }
+ }
+ avail_page -= l;
+ c += l;
+ }
+ while (avail_page >= desc->elem_size) {
+ err = desc->xcode(desc, c);
+ if (err)
+ goto out;
+ c += desc->elem_size;
+ avail_page -= desc->elem_size;
+ }
+ if (avail_page) {
+ unsigned int l = min(avail_page,
+ desc->elem_size - copied);
+ if (!elem) {
+ elem = kmalloc(desc->elem_size,
+ GFP_KERNEL);
+ err = -ENOMEM;
+ if (!elem)
+ goto out;
+ }
+ if (encode) {
+ if (!copied) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ }
+ memcpy(c, elem + copied, l);
+ copied += l;
+ if (copied == desc->elem_size)
+ copied = 0;
+ } else {
+ memcpy(elem + copied, c, l);
+ copied += l;
+ if (copied == desc->elem_size) {
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ copied = 0;
+ }
+ }
+ }
+ if (avail_here) {
+ kunmap(*ppages);
+ ppages++;
+ c = kmap(*ppages);
+ }
+
+ avail_page = min(avail_here,
+ (unsigned int) PAGE_CACHE_SIZE);
+ }
+ base = buf->page_len; /* align to start of tail */
+ }
+
+ /* process tail */
+ base -= buf->page_len;
+ if (todo) {
+ c = buf->tail->iov_base + base;
+ if (copied) {
+ unsigned int l = desc->elem_size - copied;
+
+ if (encode)
+ memcpy(c, elem + copied, l);
+ else {
+ memcpy(elem + copied, c, l);
+ err = desc->xcode(desc, elem);
+ if (err)
+ goto out;
+ }
+ todo -= l;
+ c += l;
+ }
+ while (todo) {
+ err = desc->xcode(desc, c);
+ if (err)
+ goto out;
+ c += desc->elem_size;
+ todo -= desc->elem_size;
+ }
+ }
+ err = 0;
+
+out:
+ if (elem)
+ kfree(elem);
+ if (ppages)
+ kunmap(*ppages);
+ return err;
+}
+
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
+{
+ if (base >= buf->len)
+ return -EINVAL;
+
+ return xdr_xcode_array2(buf, base, desc, 0);
+}
+
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+ struct xdr_array2_desc *desc)
+{
+ if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
+ buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+ return -EINVAL;
+
+ return xdr_xcode_array2(buf, base, desc, 1);
+}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c74a6bb..3c654e0 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
if (task == xprt->snd_task)
return 1;
- if (task == NULL)
- return 0;
goto out_sleep;
}
if (xprt->nocong || __xprt_get_cong(xprt, task)) {
@@ -569,8 +567,11 @@ void xprt_connect(struct rpc_task *task)
if (xprt->sock != NULL)
schedule_delayed_work(&xprt->sock_connect,
RPC_REESTABLISH_TIMEOUT);
- else
+ else {
schedule_work(&xprt->sock_connect);
+ if (!RPC_IS_ASYNC(task))
+ flush_scheduled_work();
+ }
}
return;
out_write:
@@ -725,7 +726,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
goto no_checksum;
desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
- xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits);
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
+ return -1;
if (desc.offset != skb->len) {
unsigned int csum2;
csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
@@ -737,7 +739,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
return -1;
return 0;
no_checksum:
- xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits);
+ if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
+ return -1;
if (desc.count)
return -1;
return 0;
@@ -821,10 +824,15 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
{
if (len > desc->count)
len = desc->count;
- if (skb_copy_bits(desc->skb, desc->offset, p, len))
+ if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
+ dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
+ len, desc->count);
return 0;
+ }
desc->offset += len;
desc->count -= len;
+ dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
+ len, desc->count);
return len;
}
@@ -863,6 +871,8 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
static void
tcp_check_recm(struct rpc_xprt *xprt)
{
+ dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
+ xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
if (xprt->tcp_offset == xprt->tcp_reclen) {
xprt->tcp_flags |= XPRT_COPY_RECM;
xprt->tcp_offset = 0;
@@ -907,6 +917,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
struct rpc_rqst *req;
struct xdr_buf *rcvbuf;
size_t len;
+ ssize_t r;
/* Find and lock the request corresponding to this xid */
spin_lock(&xprt->sock_lock);
@@ -927,15 +938,40 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
len = xprt->tcp_reclen - xprt->tcp_offset;
memcpy(&my_desc, desc, sizeof(my_desc));
my_desc.count = len;
- xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+ r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
&my_desc, tcp_copy_data);
- desc->count -= len;
- desc->offset += len;
+ desc->count -= r;
+ desc->offset += r;
} else
- xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+ r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
desc, tcp_copy_data);
- xprt->tcp_copied += len;
- xprt->tcp_offset += len;
+
+ if (r > 0) {
+ xprt->tcp_copied += r;
+ xprt->tcp_offset += r;
+ }
+ if (r != len) {
+ /* Error when copying to the receive buffer,
+ * usually because we weren't able to allocate
+ * additional buffer pages. All we can do now
+ * is turn off XPRT_COPY_DATA, so the request
+ * will not receive any additional updates,
+ * and time out.
+ * Any remaining data from this record will
+ * be discarded.
+ */
+ xprt->tcp_flags &= ~XPRT_COPY_DATA;
+ dprintk("RPC: XID %08x truncated request\n",
+ ntohl(xprt->tcp_xid));
+ dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+ xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+ goto out;
+ }
+
+ dprintk("RPC: XID %08x read %Zd bytes\n",
+ ntohl(xprt->tcp_xid), r);
+ dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+ xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
if (xprt->tcp_copied == req->rq_private_buf.buflen)
xprt->tcp_flags &= ~XPRT_COPY_DATA;
@@ -944,6 +980,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
xprt->tcp_flags &= ~XPRT_COPY_DATA;
}
+out:
if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
dprintk("RPC: %4d received reply complete\n",
req->rq_task->tk_pid);
@@ -967,6 +1004,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
desc->count -= len;
desc->offset += len;
xprt->tcp_offset += len;
+ dprintk("RPC: discarded %Zu bytes\n", len);
tcp_check_recm(xprt);
}
@@ -1064,8 +1102,7 @@ tcp_state_change(struct sock *sk)
case TCP_SYN_RECV:
break;
default:
- if (xprt_test_and_clear_connected(xprt))
- rpc_wake_up_status(&xprt->pending, -ENOTCONN);
+ xprt_disconnect(xprt);
break;
}
out:
@@ -1203,6 +1240,8 @@ xprt_transmit(struct rpc_task *task)
list_add_tail(&req->rq_list, &xprt->recv);
spin_unlock_bh(&xprt->sock_lock);
xprt_reset_majortimeo(req);
+ /* Turn off autodisconnect */
+ del_singleshot_timer_sync(&xprt->timer);
}
} else if (!req->rq_bytes_sent)
return;
@@ -1333,8 +1372,6 @@ xprt_reserve(struct rpc_task *task)
spin_lock(&xprt->xprt_lock);
do_xprt_reserve(task);
spin_unlock(&xprt->xprt_lock);
- if (task->tk_rqstp)
- del_timer_sync(&xprt->timer);
}
}
@@ -1649,6 +1686,10 @@ xprt_shutdown(struct rpc_xprt *xprt)
rpc_wake_up(&xprt->backlog);
wake_up(&xprt->cong_wait);
del_timer_sync(&xprt->timer);
+
+ /* synchronously wait for connect worker to finish */
+ cancel_delayed_work(&xprt->sock_connect);
+ flush_scheduled_work();
}
/*
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 3f6e310..c5241fc 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -17,17 +17,15 @@
#include <linux/sysctl.h>
#ifdef CONFIG_INET
-extern struct ctl_table ipv4_table[];
+#include <net/ip.h>
#endif
-extern struct ctl_table core_table[];
-
#ifdef CONFIG_NET
-extern struct ctl_table ether_table[];
+#include <linux/if_ether.h>
#endif
#ifdef CONFIG_TR
-extern struct ctl_table tr_table[];
+#include <linux/if_tr.h>
#endif
struct ctl_table net_table[] = {
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 0000000..5a69733
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
+#
+# Unix Domain Sockets
+#
+
+config UNIX
+ tristate "Unix domain sockets"
+ ---help---
+ If you say Y here, you will include support for Unix domain sockets;
+ sockets are the standard Unix mechanism for establishing and
+ accessing network connections. Many commonly used programs such as
+ the X Window system and syslog use these sockets even if your
+ machine is not connected to any network. Unless you are working on
+ an embedded system or something similar, you therefore definitely
+ want to say Y here.
+
+ To compile this driver as a module, choose M here: the module will be
+ called unix. Note that several important services won't work
+ correctly if you say M here and then neglect to load the module.
+
+ Say Y unless you know what you are doing.
+
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c420eba..41feca3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -105,7 +105,7 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk)
* may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
- if (skb_queue_len(&sk->sk_receive_queue)) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
skb_queue_purge(&sk->sk_receive_queue);
wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo)
for (;;) {
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
@@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = {
.owner = THIS_MODULE,
};
-#ifdef CONFIG_SYSCTL
-extern void unix_sysctl_register(void);
-extern void unix_sysctl_unregister(void);
-#else
-static inline void unix_sysctl_register(void) {}
-static inline void unix_sysctl_unregister(void) {}
-#endif
-
static int __init af_unix_init(void)
{
int rc = -1;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 4bd95c8..6ffc64e 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -76,11 +76,11 @@
#include <linux/netdevice.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
-#include <linux/tcp.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/tcp_states.h>
/* Internal data structures and random procedures: */
@@ -286,16 +286,16 @@ void unix_gc(void)
skb = skb_peek(&s->sk_receive_queue);
while (skb &&
skb != (struct sk_buff *)&s->sk_receive_queue) {
- nextsk=skb->next;
+ nextsk = skb->next;
/*
* Do we have file descriptors ?
*/
- if(UNIXCB(skb).fp)
- {
- __skb_unlink(skb, skb->list);
- __skb_queue_tail(&hitlist,skb);
+ if (UNIXCB(skb).fp) {
+ __skb_unlink(skb,
+ &s->sk_receive_queue);
+ __skb_queue_tail(&hitlist, skb);
}
- skb=nextsk;
+ skb = nextsk;
}
spin_unlock(&s->sk_receive_queue.lock);
}
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index c974dac..690ffa5 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -12,7 +12,7 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
-extern int sysctl_unix_max_dgram_qlen;
+#include <net/af_unix.h>
static ctl_table unix_table[] = {
{
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 0000000..1debe1c
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,29 @@
+#
+# Configuration for WAN router
+#
+
+config WAN_ROUTER
+ tristate "WAN router"
+ depends on EXPERIMENTAL
+ ---help---
+ Wide Area Networks (WANs), such as X.25, frame relay and leased
+ lines, are used to interconnect Local Area Networks (LANs) over vast
+ distances with data transfer rates significantly higher than those
+ achievable with commonly used asynchronous modem connections.
+ Usually, a quite expensive external device called a `WAN router' is
+ needed to connect to a WAN.
+
+ As an alternative, WAN routing can be built into the Linux kernel.
+ With relatively inexpensive WAN interface cards available on the
+ market, a perfectly usable router can be built for less than half
+ the price of an external router. If you have one of those cards and
+ wish to use your Linux box as a WAN router, say Y here and also to
+ the WAN driver for your card, below. You will then need the
+ wan-tools package which is available from <ftp://ftp.sangoma.com/>.
+ Read <file:Documentation/networking/wan-router.txt> for more
+ information.
+
+ To compile WAN routing support as a module, choose M here: the
+ module will be called wanrouter.
+
+ If unsure, say N.
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index d93b19f..596cb96 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -57,7 +57,7 @@
#include <linux/wanpipe.h>
#include <linux/if_wanpipe.h>
#include <linux/pkt_sched.h>
-#include <linux/tcp.h>
+#include <linux/tcp_states.h>
#include <linux/if_wanpipe_common.h>
#include <linux/sdla_x25.h>
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac..13b650a 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
*/
-unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
{
int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */
- unsigned short ethertype;
+ __be16 ethertype;
switch (skb->data[cnt]) {
case NLPID_IP: /* IP datagramm */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
skb->data[cnt+3], dev->name);
return 0;
}
- ethertype = *((unsigned short*)&skb->data[cnt+4]);
+ ethertype = *((__be16*)&skb->data[cnt+4]);
cnt += 6;
break;
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 0000000..e6759c9
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
+#
+# CCITT X.25 Packet Layer
+#
+
+config X25
+ tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ ---help---
+ X.25 is a set of standardized network protocols, similar in scope to
+ frame relay; the one physical line from your box to the X.25 network
+ entry point can carry several logical point-to-point connections
+ (called "virtual circuits") to other computers connected to the X.25
+ network. Governments, banks, and other organizations tend to use it
+ to connect to each other or to form Wide Area Networks (WANs). Many
+ countries have public X.25 networks. X.25 consists of two
+ protocols: the higher level Packet Layer Protocol (PLP) (say Y here
+ if you want that) and the lower level data link layer protocol LAPB
+ (say Y to "LAPB Data Link Driver" below if you want that).
+
+ You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
+ <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
+ Information about X.25 for Linux is contained in the files
+ <file:Documentation/networking/x25.txt> and
+ <file:Documentation/networking/x25-iface.txt>.
+
+ One connects to an X.25 network either with a dedicated network card
+ using the X.21 protocol (not yet supported by Linux) or one can do
+ X.25 over a standard telephone line using an ordinary modem (say Y
+ to "X.25 async driver" below) or over Ethernet using an ordinary
+ Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
+ Driver" and "LAPB over Ethernet driver" below).
+
+ To compile this driver as a module, choose M here: the module
+ will be called x25. If unsure, say N.
+
+
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 2a24b24..020d73c 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -29,6 +29,10 @@
* 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN
* 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
* x25_proc.c, using seq_file
+ * 2005-04-02 Shaun Pereira Selective sub address matching
+ * with call user data
+ * 2005-04-15 Shaun Pereira Fast select with no restriction on
+ * response
*/
#include <linux/config.h>
@@ -43,7 +47,7 @@
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
@@ -219,7 +223,8 @@ static void x25_insert_socket(struct sock *sk)
* Note: if a listening socket has cud set it must only get calls
* with matching cud.
*/
-static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata)
+static struct sock *x25_find_listener(struct x25_address *addr,
+ struct sk_buff *skb)
{
struct sock *s;
struct sock *next_best;
@@ -230,22 +235,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu
sk_for_each(s, node, &x25_list)
if ((!strcmp(addr->x25_addr,
- x25_sk(s)->source_addr.x25_addr) ||
- !strcmp(addr->x25_addr,
- null_x25_address.x25_addr)) &&
- s->sk_state == TCP_LISTEN) {
-
+ x25_sk(s)->source_addr.x25_addr) ||
+ !strcmp(addr->x25_addr,
+ null_x25_address.x25_addr)) &&
+ s->sk_state == TCP_LISTEN) {
/*
* Found a listening socket, now check the incoming
* call user data vs this sockets call user data
*/
- if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) {
- sock_hold(s);
- goto found;
- }
- if (x25_sk(s)->calluserdata.cudlength == 0) {
+ if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
+ if((memcmp(x25_sk(s)->calluserdata.cuddata,
+ skb->data,
+ x25_sk(s)->cudmatchlength)) == 0) {
+ sock_hold(s);
+ goto found;
+ }
+ } else
next_best = s;
- }
}
if (next_best) {
s = next_best;
@@ -497,6 +503,9 @@ static int x25_create(struct socket *sock, int protocol)
x25->t23 = sysctl_x25_clear_request_timeout;
x25->t2 = sysctl_x25_ack_holdback_timeout;
x25->state = X25_STATE_0;
+ x25->cudmatchlength = 0;
+ x25->accptapprv = X25_DENY_ACCPT_APPRV; /* normally no cud */
+ /* on call accept */
x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE;
x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -545,6 +554,8 @@ static struct sock *x25_make_new(struct sock *osk)
x25->t2 = ox25->t2;
x25->facilities = ox25->facilities;
x25->qbitincl = ox25->qbitincl;
+ x25->cudmatchlength = ox25->cudmatchlength;
+ x25->accptapprv = ox25->accptapprv;
x25_init_timers(sk);
out:
@@ -822,7 +833,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
struct x25_sock *makex25;
struct x25_address source_addr, dest_addr;
struct x25_facilities facilities;
- struct x25_calluserdata calluserdata;
int len, rc;
/*
@@ -845,19 +855,10 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
skb_pull(skb,len);
/*
- * Incoming Call User Data.
- */
- if (skb->len >= 0) {
- memcpy(calluserdata.cuddata, skb->data, skb->len);
- calluserdata.cudlength = skb->len;
- }
-
- skb_push(skb,len);
-
- /*
* Find a listener for the particular address/cud pair.
*/
- sk = x25_find_listener(&source_addr,&calluserdata);
+ sk = x25_find_listener(&source_addr,skb);
+ skb_push(skb,len);
/*
* We can't accept the Call Request.
@@ -900,11 +901,23 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
makex25->neighbour = nb;
makex25->facilities = facilities;
makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
- makex25->calluserdata = calluserdata;
-
- x25_write_internal(make, X25_CALL_ACCEPTED);
+ /* ensure no reverse facil on accept */
+ makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+ makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
+
+ /* Normally all calls are accepted immediatly */
+ if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) {
+ x25_write_internal(make, X25_CALL_ACCEPTED);
+ makex25->state = X25_STATE_3;
+ }
- makex25->state = X25_STATE_3;
+ /*
+ * Incoming Call User Data.
+ */
+ if (skb->len >= 0) {
+ memcpy(makex25->calluserdata.cuddata, skb->data, skb->len);
+ makex25->calluserdata.cudlength = skb->len;
+ }
sk->sk_ack_backlog++;
@@ -1288,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
if (facilities.throughput < 0x03 ||
facilities.throughput > 0xDD)
break;
- if (facilities.reverse && facilities.reverse != 1)
+ if (facilities.reverse &&
+ (facilities.reverse | 0x81)!= 0x81)
break;
x25->facilities = facilities;
rc = 0;
@@ -1325,6 +1339,44 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
+ case SIOCX25SCUDMATCHLEN: {
+ struct x25_subaddr sub_addr;
+ rc = -EINVAL;
+ if(sk->sk_state != TCP_CLOSE)
+ break;
+ rc = -EFAULT;
+ if (copy_from_user(&sub_addr, argp,
+ sizeof(sub_addr)))
+ break;
+ rc = -EINVAL;
+ if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
+ break;
+ x25->cudmatchlength = sub_addr.cudmatchlength;
+ rc = 0;
+ break;
+ }
+
+ case SIOCX25CALLACCPTAPPRV: {
+ rc = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE)
+ break;
+ x25->accptapprv = X25_ALLOW_ACCPT_APPRV;
+ rc = 0;
+ break;
+ }
+
+ case SIOCX25SENDCALLACCPT: {
+ rc = -EINVAL;
+ if (sk->sk_state != TCP_ESTABLISHED)
+ break;
+ if (x25->accptapprv) /* must call accptapprv above */
+ break;
+ x25_write_internal(sk, X25_CALL_ACCEPTED);
+ x25->state = X25_STATE_3;
+ rc = 0;
+ break;
+ }
+
default:
rc = dev_ioctl(cmd, argp);
break;
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 36fc3bf..adfe7b8 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
}
int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *ptype)
+ struct packet_type *ptype, struct net_device *orig_dev)
{
struct sk_buff *nskb;
struct x25_neigh *nb;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21bdb9..54278b9 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -17,6 +17,8 @@
* X.25 001 Split from x25_subr.c
* mar/20/00 Daniela Squassoni Disabling/enabling of facilities
* negotiation.
+ * apr/14/05 Shaun Pereira - Allow fast select with no restriction
+ * on response.
*/
#include <linux/kernel.h>
@@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb,
case X25_FAC_CLASS_A:
switch (*p) {
case X25_FAC_REVERSE:
- facilities->reverse = p[1] & 0x01;
- *vc_fac_mask |= X25_MASK_REVERSE;
- break;
+ if((p[1] & 0x81) == 0x81) {
+ facilities->reverse = p[1] & 0x81;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ if((p[1] & 0x01) == 0x01) {
+ facilities->reverse = p[1] & 0x01;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ if((p[1] & 0x80) == 0x80) {
+ facilities->reverse = p[1] & 0x80;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
+ if(p[1] == 0x00) {
+ facilities->reverse
+ = X25_DEFAULT_REVERSE;
+ *vc_fac_mask |= X25_MASK_REVERSE;
+ break;
+ }
+
case X25_FAC_THROUGHPUT:
facilities->throughput = p[1];
*vc_fac_mask |= X25_MASK_THROUGHPUT;
@@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer,
if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
*p++ = X25_FAC_REVERSE;
- *p++ = !!facilities->reverse;
+ *p++ = facilities->reverse;
}
if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
@@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
/*
* They want reverse charging, we won't accept it.
*/
- if (theirs.reverse && ours->reverse) {
+ if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
SOCK_DEBUG(sk, "X.25: rejecting reverse charging request");
return -1;
}
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index b0197c7..2614687 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -28,7 +28,7 @@
#include <linux/string.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/x25.h>
static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 183fea3..8be9b8f 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -19,13 +19,15 @@
* mar/20/00 Daniela Squassoni Disabling/enabling of facilities
* negotiation.
* jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups
+ * apr/04/15 Shaun Pereira Fast select with no
+ * restriction on response.
*/
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/x25.h>
/*
@@ -78,7 +80,7 @@ void x25_requeue_frames(struct sock *sk)
if (!skb_prev)
skb_queue_head(&sk->sk_write_queue, skb);
else
- skb_append(skb_prev, skb);
+ skb_append(skb_prev, skb, &sk->sk_write_queue);
skb_prev = skb;
}
}
@@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype)
len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
X25_MAX_CUD_LEN;
break;
- case X25_CALL_ACCEPTED:
- len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+ case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
+ if(x25->facilities.reverse & 0x80) {
+ len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+ } else {
+ len += 1 + X25_MAX_FAC_LEN;
+ }
break;
case X25_CLEAR_REQUEST:
case X25_RESET_REQUEST:
@@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype)
x25->vc_facil_mask);
dptr = skb_put(skb, len);
memcpy(dptr, facilities, len);
- dptr = skb_put(skb, x25->calluserdata.cudlength);
- memcpy(dptr, x25->calluserdata.cuddata,
- x25->calluserdata.cudlength);
+
+ /* fast select with no restriction on response
+ allows call user data. Userland must
+ ensure it is ours and not theirs */
+ if(x25->facilities.reverse & 0x80) {
+ dptr = skb_put(skb,
+ x25->calluserdata.cudlength);
+ memcpy(dptr, x25->calluserdata.cuddata,
+ x25->calluserdata.cudlength);
+ }
x25->calluserdata.cudlength = 0;
break;
@@ -354,21 +367,3 @@ void x25_check_rbuf(struct sock *sk)
}
}
-/*
- * Compare 2 calluserdata structures, used to find correct listening sockets
- * when call user data is used.
- */
-int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
-{
- int i;
- if (ours->cudlength != theirs->cudlength)
- return 0;
-
- for (i=0;i<ours->cudlength;i++) {
- if (ours->cuddata[i] != theirs->cuddata[i]) {
- return 0;
- }
- }
- return 1;
-}
-
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index d6a21a3..0a92e1d 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -23,7 +23,7 @@
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
#include <net/x25.h>
static void x25_heartbeat_expiry(unsigned long);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 58ca6a9..0c1c043 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -1,6 +1,10 @@
#
# XFRM configuration
#
+config XFRM
+ bool
+ depends on NET
+
config XFRM_USER
tristate "IPsec user configuration interface"
depends on INET && XFRM
@@ -10,3 +14,14 @@ config XFRM_USER
If unsure, say Y.
+config NET_KEY
+ tristate "PF_KEY sockets"
+ select XFRM
+ ---help---
+ PF_KEYv2 socket family, compatible to KAME ones.
+ They are required if you are going to use IPsec tools ported
+ from KAME.
+
+ Say Y unless you know what you are doing.
+
+
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index c58a6f0..2407a70 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -12,7 +12,7 @@
#include <net/ip.h>
#include <net/xfrm.h>
-static kmem_cache_t *secpath_cachep;
+static kmem_cache_t *secpath_cachep __read_mostly;
void __secpath_destroy(struct sec_path *sp)
{
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d07f5ce..83c8135 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list);
static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
-static kmem_cache_t *xfrm_dst_cache;
+static kmem_cache_t *xfrm_dst_cache __read_mostly;
static struct work_struct xfrm_policy_gc_work;
static struct list_head xfrm_policy_gc_list =
@@ -118,7 +118,6 @@ retry:
xfrm_policy_put_afinfo(afinfo);
return type;
}
-EXPORT_SYMBOL(xfrm_get_type);
int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl,
unsigned short family)
@@ -216,8 +215,8 @@ out:
expired:
read_unlock(&xp->lock);
- km_policy_expired(xp, dir, 1);
- xfrm_policy_delete(xp, dir);
+ if (!xfrm_policy_delete(xp, dir))
+ km_policy_expired(xp, dir, 1);
xfrm_pol_put(xp);
}
@@ -555,7 +554,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
return NULL;
}
-void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
+int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
write_lock_bh(&xfrm_policy_lock);
pol = __xfrm_policy_unlink(pol, dir);
@@ -564,7 +563,9 @@ void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
if (dir < XFRM_POLICY_MAX)
atomic_inc(&flow_cache_genid);
xfrm_policy_kill(pol);
+ return 0;
}
+ return -ENOENT;
}
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d11747c..9d206c2 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -50,7 +50,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock);
static int xfrm_state_gc_flush_bundles;
-static void __xfrm_state_delete(struct xfrm_state *x);
+static int __xfrm_state_delete(struct xfrm_state *x);
static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
@@ -154,6 +154,7 @@ static void xfrm_timer_handler(unsigned long data)
next = tmo;
}
+ x->km.dying = warn;
if (warn)
km_state_expired(x, 0);
resched:
@@ -169,9 +170,8 @@ expired:
next = 2;
goto resched;
}
- if (x->id.spi != 0)
+ if (!__xfrm_state_delete(x) && x->id.spi)
km_state_expired(x, 1);
- __xfrm_state_delete(x);
out:
spin_unlock(&x->lock);
@@ -215,8 +215,10 @@ void __xfrm_state_destroy(struct xfrm_state *x)
}
EXPORT_SYMBOL(__xfrm_state_destroy);
-static void __xfrm_state_delete(struct xfrm_state *x)
+static int __xfrm_state_delete(struct xfrm_state *x)
{
+ int err = -ESRCH;
+
if (x->km.state != XFRM_STATE_DEAD) {
x->km.state = XFRM_STATE_DEAD;
spin_lock(&xfrm_state_lock);
@@ -245,14 +247,21 @@ static void __xfrm_state_delete(struct xfrm_state *x)
* is what we are dropping here.
*/
atomic_dec(&x->refcnt);
+ err = 0;
}
+
+ return err;
}
-void xfrm_state_delete(struct xfrm_state *x)
+int xfrm_state_delete(struct xfrm_state *x)
{
+ int err;
+
spin_lock_bh(&x->lock);
- __xfrm_state_delete(x);
+ err = __xfrm_state_delete(x);
spin_unlock_bh(&x->lock);
+
+ return err;
}
EXPORT_SYMBOL(xfrm_state_delete);
@@ -557,16 +566,18 @@ int xfrm_state_check_expire(struct xfrm_state *x)
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {
- km_state_expired(x, 1);
- if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ))
+ x->km.state = XFRM_STATE_EXPIRED;
+ if (!mod_timer(&x->timer, jiffies))
xfrm_state_hold(x);
return -EINVAL;
}
if (!x->km.dying &&
(x->curlft.bytes >= x->lft.soft_byte_limit ||
- x->curlft.packets >= x->lft.soft_packet_limit))
+ x->curlft.packets >= x->lft.soft_packet_limit)) {
+ x->km.dying = 1;
km_state_expired(x, 0);
+ }
return 0;
}
EXPORT_SYMBOL(xfrm_state_check_expire);
@@ -796,34 +807,56 @@ EXPORT_SYMBOL(xfrm_replay_advance);
static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list);
static DEFINE_RWLOCK(xfrm_km_lock);
-static void km_state_expired(struct xfrm_state *x, int hard)
+void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
{
struct xfrm_mgr *km;
- if (hard)
- x->km.state = XFRM_STATE_EXPIRED;
- else
- x->km.dying = 1;
+ read_lock(&xfrm_km_lock);
+ list_for_each_entry(km, &xfrm_km_list, list)
+ if (km->notify_policy)
+ km->notify_policy(xp, dir, c);
+ read_unlock(&xfrm_km_lock);
+}
+void km_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+ struct xfrm_mgr *km;
read_lock(&xfrm_km_lock);
list_for_each_entry(km, &xfrm_km_list, list)
- km->notify(x, hard);
+ if (km->notify)
+ km->notify(x, c);
read_unlock(&xfrm_km_lock);
+}
+
+EXPORT_SYMBOL(km_policy_notify);
+EXPORT_SYMBOL(km_state_notify);
+
+static void km_state_expired(struct xfrm_state *x, int hard)
+{
+ struct km_event c;
+
+ c.data.hard = hard;
+ c.event = XFRM_MSG_EXPIRE;
+ km_state_notify(x, &c);
if (hard)
wake_up(&km_waitq);
}
+/*
+ * We send to all registered managers regardless of failure
+ * We are happy with one success
+*/
static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
{
- int err = -EINVAL;
+ int err = -EINVAL, acqret;
struct xfrm_mgr *km;
read_lock(&xfrm_km_lock);
list_for_each_entry(km, &xfrm_km_list, list) {
- err = km->acquire(x, t, pol, XFRM_POLICY_OUT);
- if (!err)
- break;
+ acqret = km->acquire(x, t, pol, XFRM_POLICY_OUT);
+ if (!acqret)
+ err = acqret;
}
read_unlock(&xfrm_km_lock);
return err;
@@ -848,13 +881,11 @@ EXPORT_SYMBOL(km_new_mapping);
void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
{
- struct xfrm_mgr *km;
+ struct km_event c;
- read_lock(&xfrm_km_lock);
- list_for_each_entry(km, &xfrm_km_list, list)
- if (km->notify_policy)
- km->notify_policy(pol, dir, hard);
- read_unlock(&xfrm_km_lock);
+ c.data.hard = hard;
+ c.event = XFRM_MSG_POLEXPIRE;
+ km_policy_notify(pol, dir, &c);
if (hard)
wake_up(&km_waitq);
@@ -1024,6 +1055,43 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
}
EXPORT_SYMBOL(xfrm_state_mtu);
+
+int xfrm_init_state(struct xfrm_state *x)
+{
+ struct xfrm_state_afinfo *afinfo;
+ int family = x->props.family;
+ int err;
+
+ err = -EAFNOSUPPORT;
+ afinfo = xfrm_state_get_afinfo(family);
+ if (!afinfo)
+ goto error;
+
+ err = 0;
+ if (afinfo->init_flags)
+ err = afinfo->init_flags(x);
+
+ xfrm_state_put_afinfo(afinfo);
+
+ if (err)
+ goto error;
+
+ err = -EPROTONOSUPPORT;
+ x->type = xfrm_get_type(x->id.proto, family);
+ if (x->type == NULL)
+ goto error;
+
+ err = x->type->init_state(x);
+ if (err)
+ goto error;
+
+ x->km.state = XFRM_STATE_VALID;
+
+error:
+ return err;
+}
+
+EXPORT_SYMBOL(xfrm_init_state);
void __init xfrm_state_init(void)
{
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 9750901..c35336a 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -249,17 +249,10 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
goto error;
- err = -ENOENT;
- x->type = xfrm_get_type(x->id.proto, x->props.family);
- if (x->type == NULL)
- goto error;
-
- err = x->type->init_state(x, NULL);
+ err = xfrm_init_state(x);
if (err)
goto error;
- x->curlft.add_time = (unsigned long) xtime.tv_sec;
- x->km.state = XFRM_STATE_VALID;
x->km.seq = p->seq;
return x;
@@ -277,6 +270,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
struct xfrm_usersa_info *p = NLMSG_DATA(nlh);
struct xfrm_state *x;
int err;
+ struct km_event c;
err = verify_newsa_info(p, (struct rtattr **) xfrma);
if (err)
@@ -286,6 +280,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
if (!x)
return err;
+ xfrm_state_hold(x);
if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
err = xfrm_state_add(x);
else
@@ -294,14 +289,24 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
if (err < 0) {
x->km.state = XFRM_STATE_DEAD;
xfrm_state_put(x);
+ goto out;
}
+ c.seq = nlh->nlmsg_seq;
+ c.pid = nlh->nlmsg_pid;
+ c.event = nlh->nlmsg_type;
+
+ km_state_notify(x, &c);
+out:
+ xfrm_state_put(x);
return err;
}
static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
{
struct xfrm_state *x;
+ int err;
+ struct km_event c;
struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
@@ -313,10 +318,19 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
return -EPERM;
}
- xfrm_state_delete(x);
+ err = xfrm_state_delete(x);
+ if (err < 0) {
+ xfrm_state_put(x);
+ return err;
+ }
+
+ c.seq = nlh->nlmsg_seq;
+ c.pid = nlh->nlmsg_pid;
+ c.event = nlh->nlmsg_type;
+ km_state_notify(x, &c);
xfrm_state_put(x);
- return 0;
+ return err;
}
static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
@@ -681,6 +695,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
{
struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh);
struct xfrm_policy *xp;
+ struct km_event c;
int err;
int excl;
@@ -692,6 +707,10 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
if (!xp)
return err;
+ /* shouldnt excl be based on nlh flags??
+ * Aha! this is anti-netlink really i.e more pfkey derived
+ * in netlink excl is a flag and you wouldnt need
+ * a type XFRM_MSG_UPDPOLICY - JHS */
excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
err = xfrm_policy_insert(p->dir, xp, excl);
if (err) {
@@ -699,6 +718,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
return err;
}
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.pid = nlh->nlmsg_pid;
+ km_policy_notify(xp, p->dir, &c);
+
xfrm_pol_put(xp);
return 0;
@@ -816,6 +840,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
struct xfrm_policy *xp;
struct xfrm_userpolicy_id *p;
int err;
+ struct km_event c;
int delete;
p = NLMSG_DATA(nlh);
@@ -843,6 +868,12 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
NETLINK_CB(skb).pid,
MSG_DONTWAIT);
}
+ } else {
+ c.data.byid = p->index;
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.pid = nlh->nlmsg_pid;
+ km_policy_notify(xp, p->dir, &c);
}
xfrm_pol_put(xp);
@@ -852,15 +883,28 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
{
+ struct km_event c;
struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
xfrm_state_flush(p->proto);
+ c.data.proto = p->proto;
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.pid = nlh->nlmsg_pid;
+ km_state_notify(NULL, &c);
+
return 0;
}
static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
{
+ struct km_event c;
+
xfrm_policy_flush();
+ c.event = nlh->nlmsg_type;
+ c.seq = nlh->nlmsg_seq;
+ c.pid = nlh->nlmsg_pid;
+ km_policy_notify(NULL, 0, &c);
return 0;
}
@@ -1069,20 +1113,147 @@ nlmsg_failure:
return -1;
}
-static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
+static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
{
struct sk_buff *skb;
+ int len = NLMSG_LENGTH(sizeof(struct xfrm_user_expire));
- skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
+ skb = alloc_skb(len, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
- if (build_expire(skb, x, hard) < 0)
+ if (build_expire(skb, x, c->data.hard) < 0)
BUG();
- NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
+}
+
+static int xfrm_notify_sa_flush(struct km_event *c)
+{
+ struct xfrm_usersa_flush *p;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ unsigned char *b;
+ int len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush));
+
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+ b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, c->pid, c->seq,
+ XFRM_MSG_FLUSHSA, sizeof(*p));
+ nlh->nlmsg_flags = 0;
+
+ p = NLMSG_DATA(nlh);
+ p->proto = c->data.proto;
+
+ nlh->nlmsg_len = skb->tail - b;
+
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+ kfree_skb(skb);
+ return -1;
+}
+
+static int inline xfrm_sa_len(struct xfrm_state *x)
+{
+ int l = 0;
+ if (x->aalg)
+ l += RTA_SPACE(sizeof(*x->aalg) + (x->aalg->alg_key_len+7)/8);
+ if (x->ealg)
+ l += RTA_SPACE(sizeof(*x->ealg) + (x->ealg->alg_key_len+7)/8);
+ if (x->calg)
+ l += RTA_SPACE(sizeof(*x->calg));
+ if (x->encap)
+ l += RTA_SPACE(sizeof(*x->encap));
+
+ return l;
+}
+
+static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
+{
+ struct xfrm_usersa_info *p;
+ struct xfrm_usersa_id *id;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ unsigned char *b;
+ int len = xfrm_sa_len(x);
+ int headlen;
+
+ headlen = sizeof(*p);
+ if (c->event == XFRM_MSG_DELSA) {
+ len += RTA_SPACE(headlen);
+ headlen = sizeof(*id);
+ }
+ len += NLMSG_SPACE(headlen);
+
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+ b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
+ nlh->nlmsg_flags = 0;
+
+ p = NLMSG_DATA(nlh);
+ if (c->event == XFRM_MSG_DELSA) {
+ id = NLMSG_DATA(nlh);
+ memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
+ id->spi = x->id.spi;
+ id->family = x->props.family;
+ id->proto = x->id.proto;
+
+ p = RTA_DATA(__RTA_PUT(skb, XFRMA_SA, sizeof(*p)));
+ }
+
+ copy_to_user_state(x, p);
+
+ if (x->aalg)
+ RTA_PUT(skb, XFRMA_ALG_AUTH,
+ sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg);
+ if (x->ealg)
+ RTA_PUT(skb, XFRMA_ALG_CRYPT,
+ sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg);
+ if (x->calg)
+ RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
+
+ if (x->encap)
+ RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+
+ nlh->nlmsg_len = skb->tail - b;
+
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+ kfree_skb(skb);
+ return -1;
+}
+
+static int xfrm_send_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+
+ switch (c->event) {
+ case XFRM_MSG_EXPIRE:
+ return xfrm_exp_state_notify(x, c);
+ case XFRM_MSG_DELSA:
+ case XFRM_MSG_UPDSA:
+ case XFRM_MSG_NEWSA:
+ return xfrm_notify_sa(x, c);
+ case XFRM_MSG_FLUSHSA:
+ return xfrm_notify_sa_flush(c);
+ default:
+ printk("xfrm_user: Unknown SA event %d\n", c->event);
+ break;
+ }
+
+ return 0;
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
}
static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
@@ -1134,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
if (build_acquire(skb, x, xt, xp, dir) < 0)
BUG();
- NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE;
-
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC);
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC);
}
/* User gives us xfrm_user_policy_info followed by an array of 0
@@ -1180,6 +1350,9 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
if (nr > XFRM_MAX_DEPTH)
return NULL;
+ if (p->dir > XFRM_POLICY_OUT)
+ return NULL;
+
xp = xfrm_policy_alloc(GFP_KERNEL);
if (xp == NULL) {
*dir = -ENOBUFS;
@@ -1218,7 +1391,7 @@ nlmsg_failure:
return -1;
}
-static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
{
struct sk_buff *skb;
size_t len;
@@ -1229,12 +1402,110 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
if (skb == NULL)
return -ENOMEM;
- if (build_polexpire(skb, xp, dir, hard) < 0)
+ if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
BUG();
- NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
+}
+
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+ struct xfrm_userpolicy_info *p;
+ struct xfrm_userpolicy_id *id;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ unsigned char *b;
+ int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+ int headlen;
+
+ headlen = sizeof(*p);
+ if (c->event == XFRM_MSG_DELPOLICY) {
+ len += RTA_SPACE(headlen);
+ headlen = sizeof(*id);
+ }
+ len += NLMSG_SPACE(headlen);
+
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+ b = skb->tail;
+
+ nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
+
+ p = NLMSG_DATA(nlh);
+ if (c->event == XFRM_MSG_DELPOLICY) {
+ id = NLMSG_DATA(nlh);
+ memset(id, 0, sizeof(*id));
+ id->dir = dir;
+ if (c->data.byid)
+ id->index = xp->index;
+ else
+ memcpy(&id->sel, &xp->selector, sizeof(id->sel));
+
+ p = RTA_DATA(__RTA_PUT(skb, XFRMA_POLICY, sizeof(*p)));
+ }
+
+ nlh->nlmsg_flags = 0;
+
+ copy_to_user_policy(xp, p, dir);
+ if (copy_to_user_tmpl(xp, skb) < 0)
+ goto nlmsg_failure;
+
+ nlh->nlmsg_len = skb->tail - b;
+
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+ kfree_skb(skb);
+ return -1;
+}
+
+static int xfrm_notify_policy_flush(struct km_event *c)
+{
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ unsigned char *b;
+ int len = NLMSG_LENGTH(0);
+
+ skb = alloc_skb(len, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOMEM;
+ b = skb->tail;
+
+
+ nlh = NLMSG_PUT(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0);
+
+ nlh->nlmsg_len = skb->tail - b;
+
+ NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+ return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+ kfree_skb(skb);
+ return -1;
+}
+
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+
+ switch (c->event) {
+ case XFRM_MSG_NEWPOLICY:
+ case XFRM_MSG_UPDPOLICY:
+ case XFRM_MSG_DELPOLICY:
+ return xfrm_notify_policy(xp, dir, c);
+ case XFRM_MSG_FLUSHPOLICY:
+ return xfrm_notify_policy_flush(c);
+ case XFRM_MSG_POLEXPIRE:
+ return xfrm_exp_policy_notify(xp, dir, c);
+ default:
+ printk("xfrm_user: Unknown Policy event %d\n", c->event);
+ }
+
+ return 0;
- return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
}
static struct xfrm_mgr netlink_mgr = {
@@ -1249,7 +1520,8 @@ static int __init xfrm_user_init(void)
{
printk(KERN_INFO "Initializing IPsec netlink socket\n");
- xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv);
+ xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX,
+ xfrm_netlink_rcv, THIS_MODULE);
if (xfrm_nl == NULL)
return -ENOMEM;
@@ -1267,3 +1539,4 @@ static void __exit xfrm_user_exit(void)
module_init(xfrm_user_init);
module_exit(xfrm_user_exit);
MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
OpenPOWER on IntegriCloud