diff options
Diffstat (limited to 'net/openvswitch')
-rw-r--r-- | net/openvswitch/Kconfig | 23 | ||||
-rw-r--r-- | net/openvswitch/Makefile | 14 | ||||
-rw-r--r-- | net/openvswitch/actions.c | 355 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 346 | ||||
-rw-r--r-- | net/openvswitch/datapath.h | 24 | ||||
-rw-r--r-- | net/openvswitch/flow.c | 38 | ||||
-rw-r--r-- | net/openvswitch/flow.h | 88 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.c | 629 | ||||
-rw-r--r-- | net/openvswitch/flow_netlink.h | 18 | ||||
-rw-r--r-- | net/openvswitch/flow_table.c | 27 | ||||
-rw-r--r-- | net/openvswitch/flow_table.h | 10 | ||||
-rw-r--r-- | net/openvswitch/vport-geneve.c | 46 | ||||
-rw-r--r-- | net/openvswitch/vport-gre.c | 47 | ||||
-rw-r--r-- | net/openvswitch/vport-internal_dev.c | 22 | ||||
-rw-r--r-- | net/openvswitch/vport-netdev.c | 16 | ||||
-rw-r--r-- | net/openvswitch/vport-netdev.h | 3 | ||||
-rw-r--r-- | net/openvswitch/vport-vxlan.c | 47 | ||||
-rw-r--r-- | net/openvswitch/vport.c | 171 | ||||
-rw-r--r-- | net/openvswitch/vport.h | 34 |
19 files changed, 1316 insertions, 642 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index ba3bb82..b7d818c 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -4,7 +4,9 @@ config OPENVSWITCH tristate "Open vSwitch" + depends on INET select LIBCRC32C + select NET_MPLS_GSO ---help--- Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features @@ -29,11 +31,10 @@ config OPENVSWITCH If unsure, say N. config OPENVSWITCH_GRE - bool "Open vSwitch GRE tunneling support" - depends on INET + tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH - depends on NET_IPGRE_DEMUX && !(OPENVSWITCH=y && NET_IPGRE_DEMUX=m) - default y + depends on NET_IPGRE_DEMUX + default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create GRE vport. @@ -43,11 +44,10 @@ config OPENVSWITCH_GRE If unsure, say Y. config OPENVSWITCH_VXLAN - bool "Open vSwitch VXLAN tunneling support" - depends on INET + tristate "Open vSwitch VXLAN tunneling support" depends on OPENVSWITCH - depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m) - default y + depends on VXLAN + default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create vxlan vport. @@ -56,11 +56,10 @@ config OPENVSWITCH_VXLAN If unsure, say Y. config OPENVSWITCH_GENEVE - bool "Open vSwitch Geneve tunneling support" - depends on INET + tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH - depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) - default y + depends on GENEVE + default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. 
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 9a33a27..91b9478 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,14 +15,6 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o -ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) -openvswitch-y += vport-geneve.o -endif - -ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) -openvswitch-y += vport-vxlan.o -endif - -ifneq ($(CONFIG_OPENVSWITCH_GRE),) -openvswitch-y += vport-gre.o -endif +obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o +obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o +obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8c4229b..764fdc3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -28,10 +28,12 @@ #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> + #include <net/ip.h> #include <net/ipv6.h> #include <net/checksum.h> #include <net/dsfield.h> +#include <net/mpls.h> #include <net/sctp/checksum.h> #include "datapath.h" @@ -67,7 +69,7 @@ static void action_fifo_init(struct action_fifo *fifo) fifo->tail = 0; } -static bool action_fifo_is_empty(struct action_fifo *fifo) +static bool action_fifo_is_empty(const struct action_fifo *fifo) { return (fifo->head == fifo->tail); } @@ -90,7 +92,7 @@ static struct deferred_action *action_fifo_put(struct action_fifo *fifo) /* Return true if fifo is not full */ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, - struct sw_flow_key *key, + const struct sw_flow_key *key, const struct nlattr *attr) { struct action_fifo *fifo; @@ -107,100 +109,131 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, return da; } -static int make_writable(struct sk_buff *skb, int write_len) +static void invalidate_flow_key(struct sw_flow_key *key) +{ + key->eth.type = htons(0); +} + +static bool is_flow_key_valid(const struct sw_flow_key *key) { - if (!pskb_may_pull(skb, write_len)) + return 
!!key->eth.type; +} + +static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + struct ethhdr *hdr; + + /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) return -ENOMEM; - if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) - return 0; + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; - return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + hdr = eth_hdr(skb); + hdr->h_proto = mpls->mpls_ethertype; + + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + invalidate_flow_key(key); + return 0; } -/* remove VLAN header from packet and update csum accordingly. 
*/ -static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) +static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const __be16 ethertype) { - struct vlan_hdr *vhdr; + struct ethhdr *hdr; int err; - err = make_writable(skb, VLAN_ETH_HLEN); + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); if (unlikely(err)) return err; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_sub(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); - vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); - *current_tci = vhdr->h_vlan_TCI; + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); - __skb_pull(skb, VLAN_HLEN); + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); - vlan_set_encap_proto(skb, vhdr); - skb->mac_header += VLAN_HLEN; - if (skb_network_offset(skb) < ETH_HLEN) - skb_set_network_header(skb, ETH_HLEN); - skb_reset_mac_len(skb); + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. 
+ */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr->h_proto = ethertype; + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + invalidate_flow_key(key); return 0; } -static int pop_vlan(struct sk_buff *skb) +static int set_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const __be32 *mpls_lse) { - __be16 tci; + __be32 *stack; int err; - if (likely(vlan_tx_tag_present(skb))) { - skb->vlan_tci = 0; - } else { - if (unlikely(skb->protocol != htons(ETH_P_8021Q) || - skb->len < VLAN_ETH_HLEN)) - return 0; - - err = __pop_vlan_tci(skb, &tci); - if (err) - return err; - } - /* move next vlan tag to hw accel tag */ - if (likely(skb->protocol != htons(ETH_P_8021Q) || - skb->len < VLAN_ETH_HLEN)) - return 0; - - err = __pop_vlan_tci(skb, &tci); + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); if (unlikely(err)) return err; - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); + stack = (__be32 *)skb_mpls_header(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), *mpls_lse }; + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = *mpls_lse; + key->mpls.top_lse = *mpls_lse; return 0; } -static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) +static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) { - if (unlikely(vlan_tx_tag_present(skb))) { - u16 current_tag; - - /* push down current VLAN tag */ - current_tag = vlan_tx_tag_get(skb); - - if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) - return -ENOMEM; + int err; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + err = skb_vlan_pop(skb); + if (vlan_tx_tag_present(skb)) + invalidate_flow_key(key); + else + key->eth.tci = 0; + return err; +} - } - __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); - return 0; +static int push_vlan(struct sk_buff 
*skb, struct sw_flow_key *key, + const struct ovs_action_push_vlan *vlan) +{ + if (vlan_tx_tag_present(skb)) + invalidate_flow_key(key); + else + key->eth.tci = vlan->vlan_tci; + return skb_vlan_push(skb, vlan->vlan_tpid, + ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); } -static int set_eth_addr(struct sk_buff *skb, +static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_key_ethernet *eth_key) { int err; - err = make_writable(skb, ETH_HLEN); + err = skb_ensure_writable(skb, ETH_HLEN); if (unlikely(err)) return err; @@ -211,11 +244,13 @@ static int set_eth_addr(struct sk_buff *skb, ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + ether_addr_copy(key->eth.src, eth_key->eth_src); + ether_addr_copy(key->eth.dst, eth_key->eth_dst); return 0; } static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, - __be32 *addr, __be32 new_addr) + __be32 *addr, __be32 new_addr) { int transport_len = skb->len - skb_transport_offset(skb); @@ -298,42 +333,52 @@ static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) nh->ttl = new_ttl; } -static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) +static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ipv4 *ipv4_key) { struct iphdr *nh; int err; - err = make_writable(skb, skb_network_offset(skb) + - sizeof(struct iphdr)); + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); if (unlikely(err)) return err; nh = ip_hdr(skb); - if (ipv4_key->ipv4_src != nh->saddr) + if (ipv4_key->ipv4_src != nh->saddr) { set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); + key->ipv4.addr.src = ipv4_key->ipv4_src; + } - if (ipv4_key->ipv4_dst != nh->daddr) + if (ipv4_key->ipv4_dst != nh->daddr) { set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); + key->ipv4.addr.dst = ipv4_key->ipv4_dst; + } - if (ipv4_key->ipv4_tos != nh->tos) + if (ipv4_key->ipv4_tos != nh->tos) { ipv4_change_dsfield(nh, 0, 
ipv4_key->ipv4_tos); + key->ip.tos = nh->tos; + } - if (ipv4_key->ipv4_ttl != nh->ttl) + if (ipv4_key->ipv4_ttl != nh->ttl) { set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); + key->ip.ttl = ipv4_key->ipv4_ttl; + } return 0; } -static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) +static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ipv6 *ipv6_key) { struct ipv6hdr *nh; int err; __be32 *saddr; __be32 *daddr; - err = make_writable(skb, skb_network_offset(skb) + - sizeof(struct ipv6hdr)); + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr)); if (unlikely(err)) return err; @@ -341,9 +386,12 @@ static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) saddr = (__be32 *)&nh->saddr; daddr = (__be32 *)&nh->daddr; - if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) + if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) { set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, ipv6_key->ipv6_src, true); + memcpy(&key->ipv6.addr.src, ipv6_key->ipv6_src, + sizeof(ipv6_key->ipv6_src)); + } if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { unsigned int offset = 0; @@ -357,16 +405,22 @@ static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, ipv6_key->ipv6_dst, recalc_csum); + memcpy(&key->ipv6.addr.dst, ipv6_key->ipv6_dst, + sizeof(ipv6_key->ipv6_dst)); } set_ipv6_tc(nh, ipv6_key->ipv6_tclass); + key->ip.tos = ipv6_get_dsfield(nh); + set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); - nh->hop_limit = ipv6_key->ipv6_hlimit; + key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + nh->hop_limit = ipv6_key->ipv6_hlimit; + key->ip.ttl = ipv6_key->ipv6_hlimit; return 0; } -/* Must follow make_writable() since that can move the skb data. */ +/* Must follow skb_ensure_writable() since that can move the skb data. 
*/ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) { @@ -390,54 +444,64 @@ static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) } } -static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) +static int set_udp(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_udp *udp_port_key) { struct udphdr *uh; int err; - err = make_writable(skb, skb_transport_offset(skb) + - sizeof(struct udphdr)); + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); if (unlikely(err)) return err; uh = udp_hdr(skb); - if (udp_port_key->udp_src != uh->source) + if (udp_port_key->udp_src != uh->source) { set_udp_port(skb, &uh->source, udp_port_key->udp_src); + key->tp.src = udp_port_key->udp_src; + } - if (udp_port_key->udp_dst != uh->dest) + if (udp_port_key->udp_dst != uh->dest) { set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); + key->tp.dst = udp_port_key->udp_dst; + } return 0; } -static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) +static int set_tcp(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_tcp *tcp_port_key) { struct tcphdr *th; int err; - err = make_writable(skb, skb_transport_offset(skb) + - sizeof(struct tcphdr)); + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr)); if (unlikely(err)) return err; th = tcp_hdr(skb); - if (tcp_port_key->tcp_src != th->source) + if (tcp_port_key->tcp_src != th->source) { set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); + key->tp.src = tcp_port_key->tcp_src; + } - if (tcp_port_key->tcp_dst != th->dest) + if (tcp_port_key->tcp_dst != th->dest) { set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); + key->tp.dst = tcp_port_key->tcp_dst; + } return 0; } -static int set_sctp(struct sk_buff *skb, - const struct ovs_key_sctp *sctp_port_key) +static int set_sctp(struct sk_buff *skb, struct sw_flow_key 
*key, + const struct ovs_key_sctp *sctp_port_key) { struct sctphdr *sh; int err; unsigned int sctphoff = skb_transport_offset(skb); - err = make_writable(skb, sctphoff + sizeof(struct sctphdr)); + err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr)); if (unlikely(err)) return err; @@ -458,39 +522,35 @@ static int set_sctp(struct sk_buff *skb, sh->checksum = old_csum ^ old_correct_csum ^ new_csum; skb_clear_hash(skb); + key->tp.src = sctp_port_key->sctp_src; + key->tp.dst = sctp_port_key->sctp_dst; } return 0; } -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) { - struct vport *vport; - - if (unlikely(!skb)) - return -ENOMEM; + struct vport *vport = ovs_vport_rcu(dp, out_port); - vport = ovs_vport_rcu(dp, out_port); - if (unlikely(!vport)) { + if (likely(vport)) + ovs_vport_send(vport, skb); + else kfree_skb(skb); - return -ENODEV; - } - - ovs_vport_send(vport, skb); - return 0; } static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { + struct ovs_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = key; upcall.userdata = NULL; upcall.portid = 0; + upcall.egress_tun_info = NULL; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -502,15 +562,27 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, case OVS_USERSPACE_ATTR_PID: upcall.portid = nla_get_u32(a); break; + + case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { + /* Get out tunnel info. */ + struct vport *vport; + + vport = ovs_vport_rcu(dp, nla_get_u32(a)); + if (vport) { + int err; + + err = ovs_vport_get_egress_tun_info(vport, skb, + &info); + if (!err) + upcall.egress_tun_info = &info; + } + break; } - } - return ovs_dp_upcall(dp, skb, &upcall); -} + } /* End of switch. 
*/ + } -static bool last_action(const struct nlattr *a, int rem) -{ - return a->nla_len == rem; + return ovs_dp_upcall(dp, skb, key, &upcall); } static int sample(struct datapath *dp, struct sk_buff *skb, @@ -547,7 +619,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, * user space. This skb will be consumed by its caller. */ if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && - last_action(a, rem))) + nla_is_last(a, rem))) return output_userspace(dp, skb, key, a); skb = skb_clone(skb, GFP_ATOMIC); @@ -580,18 +652,20 @@ static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, key->ovs_flow_hash = hash; } -static int execute_set_action(struct sk_buff *skb, - const struct nlattr *nested_attr) +static int execute_set_action(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *nested_attr) { int err = 0; switch (nla_type(nested_attr)) { case OVS_KEY_ATTR_PRIORITY: skb->priority = nla_get_u32(nested_attr); + key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: skb->mark = nla_get_u32(nested_attr); + key->phy.skb_mark = skb->mark; break; case OVS_KEY_ATTR_TUNNEL_INFO: @@ -599,27 +673,31 @@ static int execute_set_action(struct sk_buff *skb, break; case OVS_KEY_ATTR_ETHERNET: - err = set_eth_addr(skb, nla_data(nested_attr)); + err = set_eth_addr(skb, key, nla_data(nested_attr)); break; case OVS_KEY_ATTR_IPV4: - err = set_ipv4(skb, nla_data(nested_attr)); + err = set_ipv4(skb, key, nla_data(nested_attr)); break; case OVS_KEY_ATTR_IPV6: - err = set_ipv6(skb, nla_data(nested_attr)); + err = set_ipv6(skb, key, nla_data(nested_attr)); break; case OVS_KEY_ATTR_TCP: - err = set_tcp(skb, nla_data(nested_attr)); + err = set_tcp(skb, key, nla_data(nested_attr)); break; case OVS_KEY_ATTR_UDP: - err = set_udp(skb, nla_data(nested_attr)); + err = set_udp(skb, key, nla_data(nested_attr)); break; case OVS_KEY_ATTR_SCTP: - err = set_sctp(skb, nla_data(nested_attr)); + err = set_sctp(skb, key, nla_data(nested_attr)); + break; 
+ + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, key, nla_data(nested_attr)); break; } @@ -631,13 +709,17 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a, int rem) { struct deferred_action *da; - int err; - err = ovs_flow_key_update(skb, key); - if (err) - return err; + if (!is_flow_key_valid(key)) { + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + BUG_ON(!is_flow_key_valid(key)); - if (!last_action(a, rem)) { + if (!nla_is_last(a, rem)) { /* Recirc action is the not the last action * of the action list, need to clone the skb. */ @@ -672,7 +754,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, /* Every output action needs a separate clone of 'skb', but the common * case is just a single output action, so that doing a clone and * then freeing the original skbuff is wasteful. So the following code - * is slightly obscure just to avoid that. */ + * is slightly obscure just to avoid that. + */ int prev_port = -1; const struct nlattr *a; int rem; @@ -681,8 +764,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; - if (prev_port != -1) { - do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); + if (unlikely(prev_port != -1)) { + struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); + + if (out_skb) + do_output(dp, out_skb, prev_port); + prev_port = -1; } @@ -699,19 +786,25 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, key, nla_get_be16(a)); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: - err = push_vlan(skb, nla_data(a)); - if (unlikely(err)) /* skb already freed. 
*/ - return err; + err = push_vlan(skb, key, nla_data(a)); break; case OVS_ACTION_ATTR_POP_VLAN: - err = pop_vlan(skb); + err = pop_vlan(skb, key); break; case OVS_ACTION_ATTR_RECIRC: err = execute_recirc(dp, skb, key, a, rem); - if (last_action(a, rem)) { + if (nla_is_last(a, rem)) { /* If this is the last action, the skb has * been consumed or freed. * Return immediately. @@ -721,7 +814,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SET: - err = execute_set_action(skb, nla_data(a)); + err = execute_set_action(skb, key, nla_data(a)); break; case OVS_ACTION_ATTR_SAMPLE: @@ -771,14 +864,12 @@ static void process_deferred_actions(struct datapath *dp) /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *acts, struct sw_flow_key *key) { int level = this_cpu_read(exec_actions_level); - struct sw_flow_actions *acts; int err; - acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); - this_cpu_inc(exec_actions_level); OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f9e556b..332b5a0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -59,6 +59,7 @@ #include "vport-netdev.h" int ovs_net_id __read_mostly; +EXPORT_SYMBOL_GPL(ovs_net_id); static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; @@ -130,27 +131,41 @@ int lockdep_ovsl_is_held(void) else return 1; } +EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); #endif static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, const struct dp_upcall_info *); static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, const struct dp_upcall_info *); -/* Must be called with rcu_read_lock or 
ovs_mutex. */ -static struct datapath *get_dp(struct net *net, int dp_ifindex) +/* Must be called with rcu_read_lock. */ +static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) { - struct datapath *dp = NULL; - struct net_device *dev; + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, dp_ifindex); if (dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); if (vport) - dp = vport->dp; + return vport->dp; } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); rcu_read_unlock(); return dp; @@ -163,7 +178,7 @@ const char *ovs_dp_name(const struct datapath *dp) return vport->ops->get_name(vport); } -static int get_dpifindex(struct datapath *dp) +static int get_dpifindex(const struct datapath *dp) { struct vport *local; int ifindex; @@ -185,6 +200,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); + ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); kfree(dp->ports); @@ -243,6 +259,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; @@ -256,10 +273,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) int error; upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.key = key; upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); - error = ovs_dp_upcall(dp, skb, &upcall); + upcall.egress_tun_info = NULL; + error = 
ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); else @@ -268,10 +285,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) goto out; } - OVS_CB(skb)->flow = flow; + ovs_flow_stats_update(flow, key->tp.flags, skb); + sf_acts = rcu_dereference(flow->sf_acts); + ovs_execute_actions(dp, skb, sf_acts, key); - ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb); - ovs_execute_actions(dp, skb, key); stats_counter = &stats->n_hit; out: @@ -283,6 +300,7 @@ out: } int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) { struct dp_stats_percpu *stats; @@ -294,9 +312,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, } if (!skb_is_gso(skb)) - err = queue_userspace_packet(dp, skb, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info); else - err = queue_gso_packets(dp, skb, upcall_info); + err = queue_gso_packets(dp, skb, key, upcall_info); if (err) goto err; @@ -313,39 +331,43 @@ err: } static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) { unsigned short gso_type = skb_shinfo(skb)->gso_type; - struct dp_upcall_info later_info; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; + struct ovs_skb_cb ovs_cb; int err; + ovs_cb = *OVS_CB(skb); segs = __skb_gso_segment(skb, NETIF_F_SG, false); + *OVS_CB(skb) = ovs_cb; if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) return -EINVAL; + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + /* Queue all of the segments. 
*/ skb = segs; do { - err = queue_userspace_packet(dp, skb, upcall_info); + *OVS_CB(skb) = ovs_cb; + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + + err = queue_userspace_packet(dp, skb, key, upcall_info); if (err) break; - if (skb == segs && gso_type & SKB_GSO_UDP) { - /* The initial flow key extracted by ovs_flow_extract() - * in this case is for a first fragment, so we need to - * properly mark later fragments. - */ - later_key = *upcall_info->key; - later_key.ip.frag = OVS_FRAG_TYPE_LATER; - - later_info = *upcall_info; - later_info.key = &later_key; - upcall_info = &later_info; - } } while ((skb = skb->next)); /* Free all of the segments. */ @@ -360,46 +382,26 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, return err; } -static size_t key_attr_size(void) -{ - return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ - + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ - + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ - + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ - + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ - + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */ - + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ - + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ -} - -static size_t upcall_msg_size(const struct nlattr *userdata, +static size_t 
upcall_msg_size(const struct dp_upcall_info *upcall_info, unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ - if (userdata) - size += NLA_ALIGN(userdata->nla_len); + if (upcall_info->userdata) + size += NLA_ALIGN(upcall_info->userdata->nla_len); + + /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ + if (upcall_info->egress_tun_info) + size += nla_total_size(ovs_tun_key_attr_size()); return size; } static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) { struct ovs_header *upcall; @@ -423,11 +425,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!nskb) return -ENOMEM; - nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); + nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; - nskb->vlan_tci = 0; skb = nskb; } @@ -450,7 +451,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info->userdata, hlen); + len = upcall_msg_size(upcall_info, hlen); user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; @@ -462,7 +463,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, upcall->dp_ifindex = dp_ifindex; nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - err = ovs_nla_put_flow(upcall_info->key, upcall_info->key, user_skb); + err = ovs_nla_put_flow(key, key, user_skb); BUG_ON(err); nla_nest_end(user_skb, nla); @@ -471,6 +472,14 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); + if (upcall_info->egress_tun_info) { + nla = nla_nest_start(user_skb, 
OVS_PACKET_ATTR_EGRESS_TUN_KEY); + err = ovs_nla_put_egress_tunnel_key(user_skb, + upcall_info->egress_tun_info); + BUG_ON(err); + nla_nest_end(user_skb, nla); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -510,11 +519,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; int len; int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || @@ -548,29 +559,22 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_kfree_skb; err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key); + &flow->key, log); if (err) goto err_flow_free; - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); - err = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, 0, &acts); + &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; - OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -583,9 +587,10 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_unlock; OVS_CB(packet)->input_vport = input_vport; + sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); - err = ovs_execute_actions(dp, packet, &flow->key); + err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); local_bh_enable(); 
rcu_read_unlock(); @@ -628,7 +633,7 @@ static struct genl_family dp_packet_genl_family = { .n_ops = ARRAY_SIZE(dp_packet_genl_ops), }; -static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, +static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, struct ovs_dp_megaflow_stats *mega_stats) { int i; @@ -662,8 +667,8 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats, static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) { return NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(ovs_key_attr_size()) /* OVS_FLOW_ATTR_MASK */ + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ @@ -671,58 +676,67 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) } /* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, - struct sk_buff *skb, u32 portid, - u32 seq, u32 flags, u8 cmd) +static int ovs_flow_cmd_fill_match(const struct sw_flow *flow, + struct sk_buff *skb) { - const int skb_orig_len = skb->len; - struct nlattr *start; - struct ovs_flow_stats stats; - __be16 tcp_flags; - unsigned long used; - struct ovs_header *ovs_header; struct nlattr *nla; int err; - ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); - if (!ovs_header) - return -EMSGSIZE; - - ovs_header->dp_ifindex = dp_ifindex; - /* Fill flow key. */ nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); if (!nla) - goto nla_put_failure; + return -EMSGSIZE; err = ovs_nla_put_flow(&flow->unmasked_key, &flow->unmasked_key, skb); if (err) - goto error; + return err; + nla_nest_end(skb, nla); + /* Fill flow mask. 
*/ nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK); if (!nla) - goto nla_put_failure; + return -EMSGSIZE; err = ovs_nla_put_flow(&flow->key, &flow->mask->key, skb); if (err) - goto error; + return err; nla_nest_end(skb, nla); + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, + struct sk_buff *skb) +{ + struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) - goto nla_put_failure; + return -EMSGSIZE; if (stats.n_packets && nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) - goto nla_put_failure; + return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) - goto nla_put_failure; + return -EMSGSIZE; + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, + struct sk_buff *skb, int skb_orig_len) +{ + struct nlattr *start; + int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for @@ -746,17 +760,47 @@ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, nla_nest_end(skb, start); else { if (skb_orig_len) - goto error; + return err; nla_nest_cancel(skb, start); } - } else if (skb_orig_len) - goto nla_put_failure; + } else if (skb_orig_len) { + return -EMSGSIZE; + } + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. 
*/ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd) +{ + const int skb_orig_len = skb->len; + struct ovs_header *ovs_header; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = dp_ifindex; + + err = ovs_flow_cmd_fill_match(flow, skb); + if (err) + goto error; + + err = ovs_flow_cmd_fill_stats(flow, skb); + if (err) + goto error; + + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; return genlmsg_end(skb, ovs_header); -nla_put_failure: - err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; @@ -811,13 +855,18 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *acts; struct sw_flow_match match; int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Must have key and actions. */ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attr not present in new flow."); goto error; - if (!a[OVS_FLOW_ATTR_ACTIONS]) + } + if (!a[OVS_FLOW_ATTR_ACTIONS]) { + OVS_NLERR(log, "Flow actions attr not present in new flow."); goto error; + } /* Most of the time we need to allocate a new flow, do it before * locking. @@ -830,24 +879,19 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ ovs_match_init(&match, &new_flow->unmasked_key, &mask); - error = ovs_nla_get_match(&match, - a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; ovs_flow_mask_key(&new_flow->key, &new_flow->unmasked_key, &mask); /* Validate actions. 
*/ - acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_kfree_flow; - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - 0, &acts); + &acts, log); if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - goto err_kfree_acts; + OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); + goto err_kfree_flow; } reply = ovs_flow_cmd_alloc_info(acts, info, false); @@ -899,6 +943,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* The unmasked key has to be the same for flow updates. */ if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { + /* Look for any overlapping flow. */ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { error = -ENOENT; @@ -938,23 +983,21 @@ error: return error; } +/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, const struct sw_flow_key *key, - const struct sw_flow_mask *mask) + const struct sw_flow_mask *mask, + bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; int error; - acts = ovs_nla_alloc_flow_actions(nla_len(a)); - if (IS_ERR(acts)) - return acts; - ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + error = ovs_nla_copy_actions(a, &masked_key, &acts, log); if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); - kfree(acts); + OVS_NLERR(log, + "Actions may not be safe on all matching packets"); return ERR_PTR(error); } @@ -973,29 +1016,31 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Extract key. 
*/ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attribute not present in set flow."); goto error; + } ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, - a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask); + acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, + log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; } - } - /* Can allocate before locking if have acts. */ - if (acts) { + /* Can allocate before locking if have acts. */ reply = ovs_flow_cmd_alloc_info(acts, info, false); if (IS_ERR(reply)) { error = PTR_ERR(reply); @@ -1070,14 +1115,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct sw_flow_match match; int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR("Flow get message rejected, Key attribute missing.\n"); + OVS_NLERR(log, + "Flow get message rejected, Key attribute missing."); return -EINVAL; } ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (err) return err; @@ -1118,10 +1165,12 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct sw_flow_match match; int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; if (likely(a[OVS_FLOW_ATTR_KEY])) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + log); if (unlikely(err)) return err; } @@ -1179,7 +1228,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) struct datapath *dp; rcu_read_lock(); 
- dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; @@ -1211,8 +1260,10 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, }; static const struct genl_ops dp_flow_genl_ops[] = { @@ -1313,7 +1364,7 @@ static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) /* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, - struct ovs_header *ovs_header, + const struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) { struct datapath *dp; @@ -1341,7 +1392,7 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *in dp->user_features = 0; } -static void ovs_dp_change(struct datapath *dp, struct nlattr **a) +static void ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { if (a[OVS_DP_ATTR_USER_FEATURES]) dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); @@ -1442,7 +1493,7 @@ err_destroy_ports_array: err_destroy_percpu: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(&dp->table, false); + ovs_flow_tbl_destroy(&dp->table); err_free_dp: release_net(ovs_dp_get_net(dp)); kfree(dp); @@ -1474,8 +1525,6 @@ static void __dp_destroy(struct datapath *dp) ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); /* RCU destroy the flow table */ - ovs_flow_tbl_destroy(&dp->table, true); - call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1707,7 +1756,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, /* Called with ovs_mutex or RCU read lock. 
*/ static struct vport *lookup_vport(struct net *net, - struct ovs_header *ovs_header, + const struct ovs_header *ovs_header, struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) { struct datapath *dp; @@ -1764,6 +1813,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); +restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) @@ -1795,8 +1845,11 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) vport = new_vport(&parms); err = PTR_ERR(vport); - if (IS_ERR(vport)) + if (IS_ERR(vport)) { + if (err == -EAGAIN) + goto restart; goto exit_unlock_free; + } err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW); @@ -1939,7 +1992,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, j = 0; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; @@ -2112,12 +2165,18 @@ static int __init dp_init(void) if (err) goto error_netns_exit; + err = ovs_netdev_init(); + if (err) + goto error_unreg_notifier; + err = dp_register_genl(); if (err < 0) - goto error_unreg_notifier; + goto error_unreg_netdev; return 0; +error_unreg_netdev: + ovs_netdev_exit(); error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: @@ -2137,6 +2196,7 @@ error: static void dp_cleanup(void) { dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); rcu_barrier(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 9741354..3ece945 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -94,14 +94,12 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @flow: The flow associated with this packet. 
May be %NULL if no flow. * @egress_tun_key: Tunnel information about this packet on egress path. * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. */ struct ovs_skb_cb { - struct sw_flow *flow; struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; }; @@ -110,18 +108,18 @@ struct ovs_skb_cb { /** * struct dp_upcall - metadata to include with a packet to send to userspace * @cmd: One of %OVS_PACKET_CMD_*. - * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull. * @userdata: If nonnull, its variable-length value is passed to userspace as * %OVS_PACKET_ATTR_USERDATA. - * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no - * packet is sent and the packet is accounted in the datapath's @n_lost + * @portid: Netlink portid to which packet should be sent. If @portid is 0 + * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. + * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. 
*/ struct dp_upcall_info { - u8 cmd; - const struct sw_flow_key *key; + const struct ovs_tunnel_info *egress_tun_info; const struct nlattr *userdata; u32 portid; + u8 cmd; }; /** @@ -151,7 +149,7 @@ int lockdep_ovsl_is_held(void); #define rcu_dereference_ovsl(p) \ rcu_dereference_check(p, lockdep_ovsl_is_held()) -static inline struct net *ovs_dp_get_net(struct datapath *dp) +static inline struct net *ovs_dp_get_net(const struct datapath *dp) { return read_pnet(&dp->net); } @@ -187,23 +185,23 @@ extern struct genl_family dp_vport_genl_family; void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, - const struct dp_upcall_info *); + const struct sw_flow_key *, const struct dp_upcall_info *); const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *); + const struct sw_flow_actions *, struct sw_flow_key *); void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); -#define OVS_NLERR(fmt, ...) \ +#define OVS_NLERR(logging_allowed, fmt, ...) 
\ do { \ - if (net_ratelimit()) \ - pr_info("netlink: " fmt, ##__VA_ARGS__); \ + if (logging_allowed && net_ratelimit()) \ + pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ } while (0) #endif /* datapath.h */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2b78789..70bef2a 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -32,6 +32,7 @@ #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/mpls.h> #include <linux/sctp.h> #include <linux/smp.h> #include <linux/tcp.h> @@ -42,6 +43,7 @@ #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> +#include <net/mpls.h> #include <net/ndisc.h> #include "datapath.h" @@ -64,7 +66,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies) #define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, - struct sk_buff *skb) + const struct sk_buff *skb) { struct flow_stats *stats; int node = numa_node_id(); @@ -480,6 +482,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -ENOMEM; skb_reset_network_header(skb); + skb_reset_mac_len(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); /* Network layer. */ @@ -584,6 +587,33 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv4, 0, sizeof(key->ipv4)); } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. mac_len corresponds to the end of the L2 header. 
+ */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ @@ -649,7 +679,7 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ @@ -682,12 +712,12 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, - struct sw_flow_key *key) + struct sw_flow_key *key, bool log) { int err; /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(attr, key); + err = ovs_nla_get_flow_metadata(attr, key, log); if (err) return err; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 7181331..a8b30f3 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -37,8 +37,8 @@ struct sk_buff; /* Used to memset ovs_key_ipv4_tunnel padding. */ #define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl)) + (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ + FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) struct ovs_key_ipv4_tunnel { __be64 tun_id; @@ -47,11 +47,13 @@ struct ovs_key_ipv4_tunnel { __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. 
*/ struct ovs_tunnel_info { struct ovs_key_ipv4_tunnel tunnel; - struct geneve_opt *options; + const struct geneve_opt *options; u8 options_len; }; @@ -64,27 +66,59 @@ struct ovs_tunnel_info { FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ opt_len)) -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be64 tun_id, __be16 tun_flags, - struct geneve_opt *opts, - u8 opts_len) +static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, + __be16 tp_dst, + __be64 tun_id, + __be16 tun_flags, + const struct geneve_opt *opts, + u8 opts_len) { tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = iph->saddr; - tun_info->tunnel.ipv4_dst = iph->daddr; - tun_info->tunnel.ipv4_tos = iph->tos; - tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.ipv4_src = saddr; + tun_info->tunnel.ipv4_dst = daddr; + tun_info->tunnel.ipv4_tos = tos; + tun_info->tunnel.ipv4_ttl = ttl; tun_info->tunnel.tun_flags = tun_flags; - /* clear struct padding. */ - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->tunnel.tp_src = tp_src; + tun_info->tunnel.tp_dst = tp_dst; + + /* Clear struct padding. 
*/ + if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); tun_info->options = opts; tun_info->options_len = opts_len; } +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, + __be16 tp_dst, + __be64 tun_id, + __be16 tun_flags, + const struct geneve_opt *opts, + u8 opts_len) +{ + __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, + tp_src, tp_dst, + tun_id, tun_flags, + opts, opts_len); +} + +#define OVS_SW_FLOW_KEY_METADATA_SIZE \ + (offsetof(struct sw_flow_key, recirc_id) + \ + FIELD_SIZEOF(struct sw_flow_key, recirc_id)) + struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; @@ -102,12 +136,17 @@ struct sw_flow_key { __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ __be16 type; /* Ethernet frame type. */ } eth; - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. 
*/ @@ -205,18 +244,19 @@ struct arp_eth_header { } __packed; void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, - struct sk_buff *); + const struct sk_buff *); void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, unsigned long *used, __be16 *tcp_flags); void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, +int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, + struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, - struct sw_flow_key *key); + struct sw_flow_key *key, bool log); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 918e966..9645a21 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -46,24 +46,22 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> +#include <net/mpls.h> #include "flow_netlink.h" -static void update_range__(struct sw_flow_match *match, - size_t offset, size_t size, bool is_mask) +static void update_range(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) { - struct sw_flow_key_range *range = NULL; + struct sw_flow_key_range *range; size_t start = rounddown(offset, sizeof(long)); size_t end = roundup(offset + size, sizeof(long)); if (!is_mask) range = &match->range; - else if (match->mask) + else range = &match->mask->range; - if (!range) - return; - if (range->start == range->end) { range->start = start; range->end = end; @@ -79,22 +77,20 @@ static void update_range__(struct sw_flow_match *match, #define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), 
is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - (match)->mask->key.field = value; \ - } else { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + (match)->mask->key.field = value; \ + else \ (match)->key->field = value; \ - } \ } while (0) #define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ do { \ - update_range__(match, offset, len, is_mask); \ + update_range(match, offset, len, is_mask); \ if (is_mask) \ memcpy((u8 *)&(match)->mask->key + offset, value_p, \ - len); \ + len); \ else \ memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) @@ -103,22 +99,20 @@ static void update_range__(struct sw_flow_match *match, SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ value_p, len, is_mask) -#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memset((u8 *)&(match)->mask->key.field, value,\ - sizeof((match)->mask->key.field)); \ - } else { \ +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + memset((u8 *)&(match)->mask->key.field, value, \ + sizeof((match)->mask->key.field)); \ + else \ memset((u8 *)&(match)->key->field, value, \ sizeof((match)->key->field)); \ - } \ } while (0) static bool match_validate(const struct sw_flow_match *match, - u64 key_attrs, u64 mask_attrs) + u64 key_attrs, u64 mask_attrs, bool log) { u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; u64 mask_allowed = key_attrs; /* At most allow all key attributes */ @@ -134,7 +128,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); + | 
(1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -149,6 +144,12 @@ static bool match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; if (match->mask && (match->mask->key.eth.type == htons(0xffff))) @@ -229,21 +230,65 @@ static bool match_validate(const struct sw_flow_match *match, if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ - OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", - (unsigned long long)key_attrs, (unsigned long long)key_expected); + OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", + (unsigned long long)key_attrs, + (unsigned long long)key_expected); return false; } if ((mask_attrs & mask_allowed) != mask_attrs) { /* Mask attributes check failed. */ - OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", - (unsigned long long)mask_attrs, (unsigned long long)mask_allowed); + OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)", + (unsigned long long)mask_attrs, + (unsigned long long)mask_allowed); return false; } return true; } +size_t ovs_tun_key_attr_size(void) +{ + /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider + * updating this function. 
+ */ + return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ +} + +size_t ovs_key_attr_size(void) +{ + /* Whenever adding new OVS_KEY_ FIELDS, we should consider + * updating this function. + */ + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + ovs_tun_key_attr_size() + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. 
*/ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ENCAP] = -1, @@ -266,6 +311,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), }; static bool is_all_zero(const u8 *fp, size_t size) @@ -284,7 +330,7 @@ static bool is_all_zero(const u8 *fp, size_t size) static int __parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], - u64 *attrsp, bool nz) + u64 *attrsp, bool log, bool nz) { const struct nlattr *nla; u64 attrs; @@ -296,21 +342,20 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, int expected_len; if (type > OVS_KEY_ATTR_MAX) { - OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + OVS_NLERR(log, "Key type %d is out of range max %d", type, OVS_KEY_ATTR_MAX); return -EINVAL; } if (attrs & (1 << type)) { - OVS_NLERR("Duplicate key attribute (type %d).\n", type); + OVS_NLERR(log, "Duplicate key (type %d).", type); return -EINVAL; } expected_len = ovs_key_lens[type]; if (nla_len(nla) != expected_len && expected_len != -1) { - OVS_NLERR("Key attribute has unexpected length (type=%d" - ", length=%d, expected=%d).\n", type, - nla_len(nla), expected_len); + OVS_NLERR(log, "Key %d has unexpected len %d expected %d", + type, nla_len(nla), expected_len); return -EINVAL; } @@ -320,7 +365,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } } if (rem) { - OVS_NLERR("Message has %d unknown bytes.\n", rem); + OVS_NLERR(log, "Message has %d unknown bytes.", rem); return -EINVAL; } @@ -329,28 +374,84 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } static int parse_flow_mask_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) + const struct nlattr *a[], u64 *attrsp, + bool log) { - return __parse_flow_nlattrs(attr, a, attrsp, true); + return __parse_flow_nlattrs(attr, a, attrsp, log, true); } static int 
parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) + const struct nlattr *a[], u64 *attrsp, + bool log) { - return __parse_flow_nlattrs(attr, a, attrsp, false); + return __parse_flow_nlattrs(attr, a, attrsp, log, false); +} + +static int genev_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "Geneve option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. 
+ */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR(log, "Geneve option len %d != mask len %d", + match->key->tun_opts_len, nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS((struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; } static int ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask) + struct sw_flow_match *match, bool is_mask, + bool log) { struct nlattr *a; int rem; bool ttl = false; __be16 tun_flags = 0; - unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); + int err; + static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), @@ -359,20 +460,21 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_TTL] = 1, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = sizeof(u16), + [OVS_TUNNEL_KEY_ATTR_TP_DST] = sizeof(u16), [OVS_TUNNEL_KEY_ATTR_OAM] = 0, [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { - OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", - type, OVS_TUNNEL_KEY_ATTR_MAX); + OVS_NLERR(log, "Tunnel attr %d out of range max %d", + type, OVS_TUNNEL_KEY_ATTR_MAX); return -EINVAL; } if (ovs_tunnel_key_lens[type] != nla_len(a) && ovs_tunnel_key_lens[type] != -1) { - OVS_NLERR("IPv4 tunnel attribute type has unexpected " - " length (type=%d, length=%d, expected=%d).\n", + OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type]); return -EINVAL; } @@ -406,62 +508,26 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_CSUM: tun_flags |= TUNNEL_CSUM; break; + case 
OVS_TUNNEL_KEY_ATTR_TP_SRC: + SW_FLOW_KEY_PUT(match, tun_key.tp_src, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TP_DST: + SW_FLOW_KEY_PUT(match, tun_key.tp_dst, + nla_get_be16(a), is_mask); + break; case OVS_TUNNEL_KEY_ATTR_OAM: tun_flags |= TUNNEL_OAM; break; case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: - tun_flags |= TUNNEL_OPTIONS_PRESENT; - if (nla_len(a) > sizeof(match->key->tun_opts)) { - OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", - nla_len(a), - sizeof(match->key->tun_opts)); - return -EINVAL; - } - - if (nla_len(a) % 4 != 0) { - OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", - nla_len(a)); - return -EINVAL; - } - - /* We need to record the length of the options passed - * down, otherwise packets with the same format but - * additional options will be silently matched. - */ - if (!is_mask) { - SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), - false); - } else { - /* This is somewhat unusual because it looks at - * both the key and mask while parsing the - * attributes (and by extension assumes the key - * is parsed first). Normally, we would verify - * that each is the correct length and that the - * attributes line up in the validate function. - * However, that is difficult because this is - * variable length and we won't have the - * information later. 
- */ - if (match->key->tun_opts_len != nla_len(a)) { - OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", - match->key->tun_opts_len, - nla_len(a)); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, - true); - } + err = genev_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; - opt_key_offset = (unsigned long)GENEVE_OPTS( - (struct sw_flow_key *)0, - nla_len(a)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, - nla_data(a), nla_len(a), - is_mask); + tun_flags |= TUNNEL_OPTIONS_PRESENT; break; default: - OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", type); return -EINVAL; } @@ -470,18 +536,19 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); if (rem > 0) { - OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); + OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + rem); return -EINVAL; } if (!is_mask) { if (!match->key->tun_key.ipv4_dst) { - OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } if (!ttl) { - OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + OVS_NLERR(log, "IPv4 tunnel TTL not specified."); return -EINVAL; } } @@ -514,6 +581,12 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_CSUM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; + if (output->tp_src && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src)) + return -EMSGSIZE; + if (output->tp_dst && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) + return -EMSGSIZE; if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; @@ -525,7 +598,6 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } - static int ipv4_tun_to_nlattr(struct sk_buff *skb, const struct 
ovs_key_ipv4_tunnel *output, const struct geneve_opt *tun_opts, @@ -546,8 +618,17 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } +int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, + const struct ovs_tunnel_info *egress_tun_info) +{ + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + egress_tun_info->options, + egress_tun_info->options_len); +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask) + const struct nlattr **a, bool is_mask, + bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -572,10 +653,13 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (is_mask) + if (is_mask) { in_port = 0xffffffff; /* Always exact match in_port. */ - else if (in_port >= DP_MAX_PORTS) + } else if (in_port >= DP_MAX_PORTS) { + OVS_NLERR(log, "Port %d exceeds max allowable %d", + in_port, DP_MAX_PORTS); return -EINVAL; + } SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); @@ -591,7 +675,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask)) + is_mask, log)) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } @@ -599,12 +683,12 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask) + const struct nlattr **a, bool is_mask, + bool log) { int err; - u64 orig_attrs = attrs; - err = metadata_from_nlattrs(match, &attrs, a, is_mask); + err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); if (err) return err; @@ -625,17 +709,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match 
*match, u64 attrs, tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); if (!(tci & htons(VLAN_TAG_PRESENT))) { if (is_mask) - OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); else - OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); + OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); return -EINVAL; } SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_VLAN); - } else if (!is_mask) - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { __be16 eth_type; @@ -645,8 +728,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, /* Always exact match EtherType. */ eth_type = htons(0xffff); } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { - OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", - ntohs(eth_type), ETH_P_802_3_MIN); + OVS_NLERR(log, "EtherType %x is less than min %x", + ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; } @@ -661,8 +744,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", - ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + OVS_NLERR(log, "IPv4 frag type %d is out of range max %d", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; } SW_FLOW_KEY_PUT(match, ip.proto, @@ -685,13 +768,13 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", - ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + OVS_NLERR(log, "IPv6 frag type %d is out of range max %d", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; } if (!is_mask && 
ipv6_key->ipv6_label & htonl(0xFFF00000)) { - OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n", + OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x).\n", ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); return -EINVAL; } @@ -723,7 +806,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); if (!is_mask && (arp_key->arp_op & htons(0xff00))) { - OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).", arp_key->arp_op); return -EINVAL; } @@ -742,6 +825,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { const struct ovs_key_tcp *tcp_key; @@ -752,15 +845,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, } if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { - if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - } else { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - } + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); } @@ -819,8 +906,11 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ND); } - if (attrs != 0) + if (attrs != 0) { + OVS_NLERR(log, "Unknown key attributes %llx", + (unsigned long long)attrs); return -EINVAL; + } return 0; } @@ -858,10 +948,14 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * of this flow. * @mask: Optional. 
Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink * attribute specifies the mask field of the wildcarded flow. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. */ int ovs_nla_get_match(struct sw_flow_match *match, - const struct nlattr *key, - const struct nlattr *mask) + const struct nlattr *nla_key, + const struct nlattr *nla_mask, + bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; const struct nlattr *encap; @@ -871,7 +965,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, bool encap_valid = false; int err; - err = parse_flow_nlattrs(key, a, &key_attrs); + err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); if (err) return err; @@ -882,7 +976,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR("Invalid Vlan frame.\n"); + OVS_NLERR(log, "Invalid Vlan frame."); return -EINVAL; } @@ -893,61 +987,68 @@ int ovs_nla_get_match(struct sw_flow_match *match, encap_valid = true; if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs); + err = parse_flow_nlattrs(encap, a, &key_attrs, log); if (err) return err; } else if (!tci) { /* Corner case for truncated 802.1Q header. */ if (nla_len(encap)) { - OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); return -EINVAL; } } else { - OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); return -EINVAL; } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false); + err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); if (err) return err; - if (match->mask && !mask) { - /* Create an exact match mask. 
We need to set to 0xff all the - * 'match->mask' fields that have been touched in 'match->key'. - * We cannot simply memset 'match->mask', because padding bytes - * and fields not specified in 'match->key' should be left to 0. - * Instead, we use a stream of netlink attributes, copied from - * 'key' and set to 0xff: ovs_key_from_nlattrs() will take care - * of filling 'match->mask' appropriately. - */ - newmask = kmemdup(key, nla_total_size(nla_len(key)), - GFP_KERNEL); - if (!newmask) - return -ENOMEM; + if (match->mask) { + if (!nla_mask) { + /* Create an exact match mask. We need to set to 0xff + * all the 'match->mask' fields that have been touched + * in 'match->key'. We cannot simply memset + * 'match->mask', because padding bytes and fields not + * specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, + * copied from 'key' and set to 0xff. + * ovs_key_from_nlattrs() will take care of filling + * 'match->mask' appropriately. + */ + newmask = kmemdup(nla_key, + nla_total_size(nla_len(nla_key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; - mask_set_nlattr(newmask, 0xff); + mask_set_nlattr(newmask, 0xff); - /* The userspace does not send tunnel attributes that are 0, - * but we should not wildcard them nonetheless. - */ - if (match->key->tun_key.ipv4_dst) - SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); + /* The userspace does not send tunnel attributes that + * are 0, but we should not wildcard them nonetheless. + */ + if (match->key->tun_key.ipv4_dst) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, + 0xff, true); - mask = newmask; - } + nla_mask = newmask; + } - if (mask) { - err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log); if (err) goto free_newmask; + /* Always match on tci. 
*/ + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { __be16 eth_type = 0; __be16 tci = 0; if (!encap_valid) { - OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); err = -EINVAL; goto free_newmask; } @@ -959,12 +1060,13 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (eth_type == htons(0xffff)) { mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + err = parse_flow_mask_nlattrs(encap, a, + &mask_attrs, log); if (err) goto free_newmask; } else { - OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", - ntohs(eth_type)); + OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", + ntohs(eth_type)); err = -EINVAL; goto free_newmask; } @@ -973,18 +1075,19 @@ int ovs_nla_get_match(struct sw_flow_match *match, tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", + ntohs(tci)); err = -EINVAL; goto free_newmask; } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); if (err) goto free_newmask; } - if (!match_validate(match, key_attrs, mask_attrs)) + if (!match_validate(match, key_attrs, mask_attrs, log)) err = -EINVAL; free_newmask: @@ -997,6 +1100,9 @@ free_newmask: * @key: Receives extracted in_port, priority, tun_key and skb_mark. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. 
* * This parses a series of Netlink attributes that form a flow key, which must * take the same form accepted by flow_from_nlattrs(), but only enough of it to @@ -1005,14 +1111,15 @@ free_newmask: */ int ovs_nla_get_flow_metadata(const struct nlattr *attr, - struct sw_flow_key *key) + struct sw_flow_key *key, + bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; struct sw_flow_match match; u64 attrs = 0; int err; - err = parse_flow_nlattrs(attr, a, &attrs); + err = parse_flow_nlattrs(attr, a, &attrs, log); if (err) return -EINVAL; @@ -1021,7 +1128,7 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false); + return metadata_from_nlattrs(&match, &attrs, a, false, log); } int ovs_nla_put_flow(const struct sw_flow_key *swkey, @@ -1147,6 +1254,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, arp_key->arp_op = htons(output->ip.proto); ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1233,12 +1348,14 @@ nla_put_failure: #define MAX_ACTIONS_BUFSIZE (32 * 1024) -struct sw_flow_actions *ovs_nla_alloc_flow_actions(int size) +static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) { struct sw_flow_actions *sfa; - if (size > MAX_ACTIONS_BUFSIZE) + if (size > MAX_ACTIONS_BUFSIZE) { + OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); return ERR_PTR(-EINVAL); + } sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -1256,7 +1373,7 @@ void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, - int 
attr_len) + int attr_len, bool log) { struct sw_flow_actions *acts; @@ -1276,7 +1393,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = ovs_nla_alloc_flow_actions(new_acts_size); + acts = nla_alloc_flow_actions(new_acts_size, log); if (IS_ERR(acts)) return (void *)acts; @@ -1291,11 +1408,11 @@ out: } static struct nlattr *__add_action(struct sw_flow_actions **sfa, - int attrtype, void *data, int len) + int attrtype, void *data, int len, bool log) { struct nlattr *a; - a = reserve_sfa_size(sfa, nla_attr_size(len)); + a = reserve_sfa_size(sfa, nla_attr_size(len), log); if (IS_ERR(a)) return a; @@ -1310,24 +1427,22 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, } static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len) + void *data, int len, bool log) { struct nlattr *a; - a = __add_action(sfa, attrtype, data, len); - if (IS_ERR(a)) - return PTR_ERR(a); + a = __add_action(sfa, attrtype, data, len, log); - return 0; + return PTR_ERR_OR_ZERO(a); } static inline int add_nested_action_start(struct sw_flow_actions **sfa, - int attrtype) + int attrtype, bool log) { int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0); + err = add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1343,9 +1458,15 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log); + static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ 
-1371,18 +1492,19 @@ static int validate_and_copy_sample(const struct nlattr *attr, return -EINVAL; /* validation done, copy sample action. */ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32)); + nla_data(probability), sizeof(u32), log); if (err) return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; - err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + eth_type, vlan_tci, log); if (err) return err; @@ -1392,10 +1514,10 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key) +static int validate_tp_port(const struct sw_flow_key *flow_key, + __be16 eth_type) { - if ((flow_key->eth.type == htons(ETH_P_IP) || - flow_key->eth.type == htons(ETH_P_IPV6)) && + if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) && (flow_key->tp.src || flow_key->tp.dst)) return 0; @@ -1419,7 +1541,7 @@ void ovs_match_init(struct sw_flow_match *match, } static int validate_and_copy_set_tun(const struct nlattr *attr, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, bool log) { struct sw_flow_match match; struct sw_flow_key key; @@ -1428,7 +1550,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, int err, start; ovs_match_init(&match, &key, NULL); - err = ipv4_tun_from_nlattr(nla_data(attr), &match, false); + err = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); if (err) return err; @@ -1457,12 +1579,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, key.tun_key.tun_flags |= crit_opt ? 
TUNNEL_CRIT_OPT : 0; }; - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); if (start < 0) return start; a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len); + sizeof(*tun_info) + key.tun_opts_len, log); if (IS_ERR(a)) return PTR_ERR(a); @@ -1490,7 +1612,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun) + bool *set_tun, __be16 eth_type, bool log) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -1515,14 +1637,17 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + *set_tun = true; - err = validate_and_copy_set_tun(a, sfa); + err = validate_and_copy_set_tun(a, sfa, log); if (err) return err; break; case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) + if (eth_type != htons(ETH_P_IP)) return -EINVAL; if (!flow_key->ip.proto) @@ -1538,7 +1663,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) + if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; if (!flow_key->ip.proto) @@ -1560,19 +1685,24 @@ static int validate_set(const struct nlattr *a, if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); case OVS_KEY_ATTR_UDP: if (flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; case OVS_KEY_ATTR_SCTP: if (flow_key->ip.proto != IPPROTO_SCTP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); default: return 
-EINVAL; @@ -1586,6 +1716,7 @@ static int validate_userspace(const struct nlattr *attr) static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 }, }; struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; @@ -1603,12 +1734,12 @@ static int validate_userspace(const struct nlattr *attr) } static int copy_action(const struct nlattr *from, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, bool log) { int totlen = NLA_ALIGN(from->nla_len); struct nlattr *to; - to = reserve_sfa_size(sfa, from->nla_len); + to = reserve_sfa_size(sfa, from->nla_len, log); if (IS_ERR(to)) return PTR_ERR(to); @@ -1616,12 +1747,13 @@ static int copy_action(const struct nlattr *from, return 0; } -int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) +static int __ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log) { const struct nlattr *a; + bool out_tnl_port = false; int rem, err; if (depth >= SAMPLE_ACTION_DEPTH) @@ -1633,6 +1765,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, @@ -1662,6 +1796,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_OUTPUT: if (nla_get_u32(a) >= DP_MAX_PORTS) return -EINVAL; + out_tnl_port = false; + break; case OVS_ACTION_ATTR_HASH: { @@ -1678,6 +1814,7 @@ int 
ovs_nla_copy_actions(const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -1686,29 +1823,77 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; + vlan_tci = vlan->vlan_tci; break; case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + /* Networking stack do not allow simultaneous Tunnel + * and MPLS GSO. + */ + if (out_tnl_port) + return -EINVAL; + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. + */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. 
+ */ + eth_type = htons(0); + break; + case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); + err = validate_set(a, key, sfa, + &out_tnl_port, eth_type, log); if (err) return err; + + skip_copy = out_tnl_port; break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); + err = validate_and_copy_sample(a, key, depth, sfa, + eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; default: + OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; } if (!skip_copy) { - err = copy_action(a, sfa); + err = copy_action(a, sfa, log); if (err) return err; } @@ -1720,6 +1905,24 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return 0; } +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + int err; + + *sfa = nla_alloc_flow_actions(nla_len(attr), log); + if (IS_ERR(*sfa)) + return PTR_ERR(*sfa); + + err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + key->eth.tci, log); + if (err) + kfree(*sfa); + + return err; +} + static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { const struct nlattr *a; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 206e45a..577f12b 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -37,24 +37,28 @@ #include "flow.h" +size_t ovs_tun_key_attr_size(void); +size_t ovs_key_attr_size(void); + void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, struct sw_flow_mask *mask); int ovs_nla_put_flow(const struct sw_flow_key *, const struct sw_flow_key *, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *); +int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, + bool log); -int ovs_nla_get_match(struct sw_flow_match *match, - const struct nlattr *, - const struct nlattr *); +int ovs_nla_get_match(struct 
sw_flow_match *, const struct nlattr *key, + const struct nlattr *mask, bool log); +int ovs_nla_put_egress_tunnel_key(struct sk_buff *, + const struct ovs_tunnel_info *); int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa); + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); -struct sw_flow_actions *ovs_nla_alloc_flow_actions(int actions_len); void ovs_nla_free_flow_actions(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index cf2d853..5899bf1 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -25,7 +25,7 @@ #include <linux/if_vlan.h> #include <net/llc_pdu.h> #include <linux/kernel.h> -#include <linux/hash.h> +#include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/llc.h> #include <linux/module.h> @@ -107,7 +107,7 @@ err: return ERR_PTR(-ENOMEM); } -int ovs_flow_tbl_count(struct flow_table *table) +int ovs_flow_tbl_count(const struct flow_table *table) { return table->count; } @@ -250,11 +250,14 @@ skip_flows: __table_instance_destroy(ti); } -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred) +/* No need for locking this function is called from RCU callback or + * error path. 
+ */ +void ovs_flow_tbl_destroy(struct flow_table *table) { - struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ti = rcu_dereference_raw(table->ti); - table_instance_destroy(ti, deferred); + table_instance_destroy(ti, false); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, @@ -363,7 +366,7 @@ static u32 flow_hash(const struct sw_flow_key *key, int key_start, /* Make sure number of hash bytes are multiple of u32. */ BUILD_BUG_ON(sizeof(long) % sizeof(u32)); - return arch_fast_hash2(hash_key, hash_u32s, 0); + return jhash2(hash_key, hash_u32s, 0); } static int flow_key_start(const struct sw_flow_key *key) @@ -398,7 +401,7 @@ static bool flow_cmp_masked_key(const struct sw_flow *flow, } bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - struct sw_flow_match *match) + const struct sw_flow_match *match) { struct sw_flow_key *key = match->key; int key_start = flow_key_start(key); @@ -409,7 +412,7 @@ bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, - struct sw_flow_mask *mask) + const struct sw_flow_mask *mask) { struct sw_flow *flow; struct hlist_head *head; @@ -457,7 +460,7 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, - struct sw_flow_match *match) + const struct sw_flow_match *match) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct sw_flow_mask *mask; @@ -560,7 +563,7 @@ static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, /* Add 'mask' into the mask list, if it is not already there. 
*/ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, - struct sw_flow_mask *new) + const struct sw_flow_mask *new) { struct sw_flow_mask *mask; mask = flow_mask_find(tbl, new); @@ -583,7 +586,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, /* Must be called with OVS mutex held. */ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_mask *mask) + const struct sw_flow_mask *mask) { struct table_instance *new_ti = NULL; struct table_instance *ti; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 5918bff..309fa64 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -61,12 +61,12 @@ struct sw_flow *ovs_flow_alloc(void); void ovs_flow_free(struct sw_flow *, bool deferred); int ovs_flow_tbl_init(struct flow_table *); -int ovs_flow_tbl_count(struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred); +int ovs_flow_tbl_count(const struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table); int ovs_flow_tbl_flush(struct flow_table *flow_table); int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_mask *mask); + const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, @@ -77,9 +77,9 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, - struct sw_flow_match *match); + const struct sw_flow_match *match); bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - struct sw_flow_match *match); + const struct sw_flow_match *match); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key 
*src, const struct sw_flow_mask *mask); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 106a9d8..347fa23 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -17,6 +17,7 @@ #include <linux/rculist.h> #include <linux/udp.h> #include <linux/if_vlan.h> +#include <linux/module.h> #include <net/geneve.h> #include <net/icmp.h> @@ -28,6 +29,8 @@ #include "datapath.h" #include "vport.h" +static struct vport_ops ovs_geneve_vport_ops; + /** * struct geneve_port - Keeps track of open UDP ports * @gs: The socket created for this port number. @@ -65,7 +68,7 @@ static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) } /* Convert 24 bit VNI to 64 bit tunnel ID. */ -static __be64 vni_to_tunnel_id(__u8 *vni) +static __be64 vni_to_tunnel_id(const __u8 *vni) { #ifdef __BIG_ENDIAN return (vni[0] << 16) | (vni[1] << 8) | vni[2]; @@ -94,7 +97,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) key = vni_to_tunnel_id(geneveh->vni); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, geneveh->options, opts_len); ovs_vport_receive(vport, skb, &tun_info); @@ -225,11 +230,46 @@ static const char *geneve_get_name(const struct vport *vport) return geneve_port->name; } -const struct vport_ops ovs_geneve_vport_ops = { +static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + + /* Get tp_src and tp_dst, refer to geneve_build_header(). 
+ */ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, sport, dport); +} + +static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, .create = geneve_tnl_create, .destroy = geneve_tnl_destroy, .get_name = geneve_get_name, .get_options = geneve_get_options, .send = geneve_tnl_send, + .owner = THIS_MODULE, + .get_egress_tun_info = geneve_get_egress_tun_info, }; + +static int __init ovs_geneve_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_geneve_vport_ops); +} + +static void __exit ovs_geneve_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_geneve_vport_ops); +} + +module_init(ovs_geneve_tnl_init); +module_exit(ovs_geneve_tnl_exit); + +MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-5"); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 108b82d..6b69df5 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -29,6 +29,7 @@ #include <linux/jhash.h> #include <linux/list.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <net/route.h> @@ -45,6 +46,8 @@ #include "datapath.h" #include "vport.h" +static struct vport_ops ovs_gre_vport_ops; + /* Returns the least-significant 32 bits of a __be64. 
*/ static __be32 be64_get_low32(__be64 x) { @@ -105,7 +108,7 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); @@ -172,14 +175,10 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_rt; } - if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { - err = -ENOMEM; - goto err_free_rt; - } - skb->vlan_tci = 0; + skb = vlan_hwaccel_push_inside(skb); + if (unlikely(!skb)) { + err = -ENOMEM; + goto err_free_rt; } /* Push Tunnel header. */ @@ -281,10 +280,38 @@ static void gre_tnl_destroy(struct vport *vport) gre_exit(); } -const struct vport_ops ovs_gre_vport_ops = { +static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_GRE, skb->mark, 0, 0); +} + +static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, .destroy = gre_tnl_destroy, .get_name = gre_get_name, .send = gre_tnl_send, + .get_egress_tun_info = gre_get_egress_tun_info, + .owner = THIS_MODULE, }; + +static int __init ovs_gre_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_gre_vport_ops); +} + +static void __exit ovs_gre_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_gre_vport_ops); +} + +module_init(ovs_gre_tnl_init); +module_exit(ovs_gre_tnl_exit); + +MODULE_DESCRIPTION("OVS: GRE switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-3"); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 8451612..6a55f71 100644 --- a/net/openvswitch/vport-internal_dev.c +++ 
b/net/openvswitch/vport-internal_dev.c @@ -36,6 +36,8 @@ struct internal_dev { struct vport *vport; }; +static struct vport_ops ovs_internal_vport_ops; + static struct internal_dev *internal_dev_priv(struct net_device *netdev) { return netdev_priv(netdev); @@ -222,6 +224,11 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) struct net_device *netdev = netdev_vport_priv(vport)->dev; int len; + if (unlikely(!(netdev->flags & IFF_UP))) { + kfree_skb(skb); + return 0; + } + len = skb->len; skb_dst_drop(skb); @@ -238,7 +245,7 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) return len; } -const struct vport_ops ovs_internal_vport_ops = { +static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, @@ -261,10 +268,21 @@ struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) int ovs_internal_dev_rtnl_link_register(void) { - return rtnl_link_register(&internal_dev_link_ops); + int err; + + err = rtnl_link_register(&internal_dev_link_ops); + if (err < 0) + return err; + + err = ovs_vport_ops_register(&ovs_internal_vport_ops); + if (err < 0) + rtnl_link_unregister(&internal_dev_link_ops); + + return err; } void ovs_internal_dev_rtnl_link_unregister(void) { + ovs_vport_ops_unregister(&ovs_internal_vport_ops); rtnl_link_unregister(&internal_dev_link_ops); } diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index d21f77d..4776282 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -33,6 +33,8 @@ #include "vport-internal_dev.h" #include "vport-netdev.h" +static struct vport_ops ovs_netdev_vport_ops; + /* Must be called with rcu_read_lock. 
*/ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) { @@ -75,7 +77,7 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) return RX_HANDLER_CONSUMED; } -static struct net_device *get_dpdev(struct datapath *dp) +static struct net_device *get_dpdev(const struct datapath *dp) { struct vport *local; @@ -224,10 +226,20 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev) return NULL; } -const struct vport_ops ovs_netdev_vport_ops = { +static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .get_name = ovs_netdev_get_name, .send = netdev_send, }; + +int __init ovs_netdev_init(void) +{ + return ovs_vport_ops_register(&ovs_netdev_vport_ops); +} + +void ovs_netdev_exit(void) +{ + ovs_vport_ops_unregister(&ovs_netdev_vport_ops); +} diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 8df01c11..6f7038e 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -41,4 +41,7 @@ netdev_vport_priv(const struct vport *vport) const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); +int __init ovs_netdev_init(void); +void ovs_netdev_exit(void); + #endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 2735e01..38f95a5 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -24,6 +24,7 @@ #include <linux/net.h> #include <linux/rculist.h> #include <linux/udp.h> +#include <linux/module.h> #include <net/icmp.h> #include <net/ip.h> @@ -50,6 +51,8 @@ struct vxlan_port { char name[IFNAMSIZ]; }; +static struct vport_ops ovs_vxlan_vport_ops; + static inline struct vxlan_port *vxlan_vport(const struct vport *vport) { return vport_priv(vport); @@ -66,7 +69,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = 
cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); + ovs_flow_tun_info_init(&tun_info, iph, + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, TUNNEL_KEY, NULL, 0); ovs_vport_receive(vport, skb, &tun_info); } @@ -186,17 +191,55 @@ error: return err; } +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + static const char *vxlan_get_name(const struct vport *vport) { struct vxlan_port *vxlan_port = vxlan_vport(vport); return vxlan_port->name; } -const struct vport_ops ovs_vxlan_vport_ops = { +static struct vport_ops ovs_vxlan_vport_ops = { .type = OVS_VPORT_TYPE_VXLAN, .create = vxlan_tnl_create, .destroy = vxlan_tnl_destroy, .get_name = vxlan_get_name, .get_options = vxlan_get_options, .send = vxlan_tnl_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, + .owner = THIS_MODULE, }; + +static int __init ovs_vxlan_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_vport_ops); +} + +static void __exit ovs_vxlan_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); +} + +module_init(ovs_vxlan_tnl_init); +module_exit(ovs_vxlan_tnl_exit); + +MODULE_DESCRIPTION("OVS: VXLAN switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6015802..9584526 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -28,6 +28,7 @@ #include <linux/rtnetlink.h> #include 
<linux/compat.h> #include <net/net_namespace.h> +#include <linux/module.h> #include "datapath.h" #include "vport.h" @@ -36,22 +37,7 @@ static void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); -/* List of statically compiled vport implementations. Don't forget to also - * add yours to the list at the bottom of vport.h. */ -static const struct vport_ops *vport_ops_list[] = { - &ovs_netdev_vport_ops, - &ovs_internal_vport_ops, - -#ifdef CONFIG_OPENVSWITCH_GRE - &ovs_gre_vport_ops, -#endif -#ifdef CONFIG_OPENVSWITCH_VXLAN - &ovs_vxlan_vport_ops, -#endif -#ifdef CONFIG_OPENVSWITCH_GENEVE - &ovs_geneve_vport_ops, -#endif -}; +static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; @@ -82,12 +68,38 @@ void ovs_vport_exit(void) kfree(dev_table); } -static struct hlist_head *hash_bucket(struct net *net, const char *name) +static struct hlist_head *hash_bucket(const struct net *net, const char *name) { unsigned int hash = jhash(name, strlen(name), (unsigned long) net); return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } +int ovs_vport_ops_register(struct vport_ops *ops) +{ + int err = -EEXIST; + struct vport_ops *o; + + ovs_lock(); + list_for_each_entry(o, &vport_ops_list, list) + if (ops->type == o->type) + goto errout; + + list_add_tail(&ops->list, &vport_ops_list); + err = 0; +errout: + ovs_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(ovs_vport_ops_register); + +void ovs_vport_ops_unregister(struct vport_ops *ops) +{ + ovs_lock(); + list_del(&ops->list); + ovs_unlock(); +} +EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); + /** * ovs_vport_locate - find a port that has already been created * @@ -95,7 +107,7 @@ static struct hlist_head *hash_bucket(struct net *net, const char *name) * * Must be called with ovs or RCU read lock. 
*/ -struct vport *ovs_vport_locate(struct net *net, const char *name) +struct vport *ovs_vport_locate(const struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; @@ -153,6 +165,7 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return vport; } +EXPORT_SYMBOL_GPL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport @@ -173,6 +186,18 @@ void ovs_vport_free(struct vport *vport) free_percpu(vport->percpu_stats); kfree(vport); } +EXPORT_SYMBOL_GPL(ovs_vport_free); + +static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) +{ + struct vport_ops *ops; + + list_for_each_entry(ops, &vport_ops_list, list) + if (ops->type == parms->type) + return ops; + + return NULL; +} /** * ovs_vport_add - add vport device (for kernel callers) @@ -184,31 +209,40 @@ void ovs_vport_free(struct vport *vport) */ struct vport *ovs_vport_add(const struct vport_parms *parms) { + struct vport_ops *ops; struct vport *vport; - int err = 0; - int i; - for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { - if (vport_ops_list[i]->type == parms->type) { - struct hlist_head *bucket; + ops = ovs_vport_lookup(parms); + if (ops) { + struct hlist_head *bucket; - vport = vport_ops_list[i]->create(parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto out; - } + if (!try_module_get(ops->owner)) + return ERR_PTR(-EAFNOSUPPORT); - bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); - hlist_add_head_rcu(&vport->hash_node, bucket); + vport = ops->create(parms); + if (IS_ERR(vport)) { + module_put(ops->owner); return vport; } + + bucket = hash_bucket(ovs_dp_get_net(vport->dp), + vport->ops->get_name(vport)); + hlist_add_head_rcu(&vport->hash_node, bucket); + return vport; } - err = -EAFNOSUPPORT; + /* Unlock to attempt module load and return -EAGAIN if load + * was successful as we need to restart the port addition + * workflow. 
+ */ + ovs_unlock(); + request_module("vport-type-%d", parms->type); + ovs_lock(); -out: - return ERR_PTR(err); + if (!ovs_vport_lookup(parms)) + return ERR_PTR(-EAFNOSUPPORT); + else + return ERR_PTR(-EAGAIN); } /** @@ -242,6 +276,8 @@ void ovs_vport_del(struct vport *vport) hlist_del_rcu(&vport->hash_node); vport->ops->destroy(vport); + + module_put(vport->ops->owner); } /** @@ -344,7 +380,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * * Must be called with ovs_mutex. */ -int ovs_vport_set_upcall_portids(struct vport *vport, struct nlattr *ids) +int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) { struct vport_portids *old, *vport_portids; @@ -435,7 +471,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *tun_info) + const struct ovs_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -457,6 +493,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, } ovs_dp_process_packet(skb, &key); } +EXPORT_SYMBOL_GPL(ovs_vport_receive); /** * ovs_vport_send - send a packet on a device @@ -535,3 +572,65 @@ void ovs_vport_deferred_free(struct vport *vport) call_rcu(&vport->rcu, free_vport_rcu); } +EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); + +int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, + struct net *net, + const struct ovs_tunnel_info *tun_info, + u8 ipproto, + u32 skb_mark, + __be16 tp_src, + __be16 tp_dst) +{ + const struct ovs_key_ipv4_tunnel *tun_key; + struct rtable *rt; + struct flowi4 fl; + + if (unlikely(!tun_info)) + return -EINVAL; + + tun_key = &tun_info->tunnel; + + /* Route lookup to get source IP address. + * The process may need to be changed if the corresponding process + * in vports ops changed. 
+ */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb_mark; + fl.flowi4_proto = ipproto; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + + /* Generate egress_tun_info based on tun_info, + * saddr, tp_src and tp_dst + */ + __ovs_flow_tun_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); + + return 0; +} +EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); + +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *info) +{ + /* get_egress_tun_info() is only implemented on tunnel ports. */ + if (unlikely(!vport->ops->get_egress_tun_info)) + return -EINVAL; + + return vport->ops->get_egress_tun_info(vport, skb, info); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 8942125..99c8e71 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -45,19 +45,29 @@ void ovs_vport_exit(void); struct vport *ovs_vport_add(const struct vport_parms *); void ovs_vport_del(struct vport *); -struct vport *ovs_vport_locate(struct net *net, const char *name); +struct vport *ovs_vport_locate(const struct net *net, const char *name); void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); -int ovs_vport_set_upcall_portids(struct vport *, struct nlattr *pids); +int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); +int 
ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, + struct net *net, + const struct ovs_tunnel_info *tun_info, + u8 ipproto, + u32 skb_mark, + __be16 tp_src, + __be16 tp_dst); +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *info); + /* The following definitions are for implementers of vport devices: */ struct vport_err_stats { @@ -146,6 +156,8 @@ struct vport_parms { * @get_name: Get the device's name. * @send: Send a packet on the device. Returns the length of the packet sent, * zero for dropped packets or negative for error. + * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for + * a packet. */ struct vport_ops { enum ovs_vport_type type; @@ -161,6 +173,11 @@ struct vport_ops { const char *(*get_name)(const struct vport *); int (*send)(struct vport *, struct sk_buff *); + int (*get_egress_tun_info)(struct vport *, struct sk_buff *, + struct ovs_tunnel_info *); + + struct module *owner; + struct list_head list; }; enum vport_err_type { @@ -207,15 +224,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); - -/* List of statically compiled vport implementations. Don't forget to also - * add yours to the list at the top of vport.c. 
*/ -extern const struct vport_ops ovs_netdev_vport_ops; -extern const struct vport_ops ovs_internal_vport_ops; -extern const struct vport_ops ovs_gre_vport_ops; -extern const struct vport_ops ovs_vxlan_vport_ops; -extern const struct vport_ops ovs_geneve_vport_ops; + const struct ovs_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -224,4 +233,7 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } +int ovs_vport_ops_register(struct vport_ops *ops); +void ovs_vport_ops_unregister(struct vport_ops *ops); + #endif /* vport.h */ |