diff options
Diffstat (limited to 'net')
106 files changed, 1427 insertions, 601 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index c7a581a9689..917ecb93ea2 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -205,7 +205,7 @@ int register_vlan_dev(struct net_device *dev) grp->nr_vlans++; if (ngrp) { - if (ops->ndo_vlan_rx_register) + if (ops->ndo_vlan_rx_register && (real_dev->features & NETIF_F_HW_VLAN_RX)) ops->ndo_vlan_rx_register(real_dev, ngrp); rcu_assign_pointer(real_dev->vlgrp, ngrp); } diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 41495dc2a4c..fcc684678af 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -23,6 +23,31 @@ bool vlan_do_receive(struct sk_buff **skbp) return false; skb->dev = vlan_dev; + if (skb->pkt_type == PACKET_OTHERHOST) { + /* Our lower layer thinks this is not local, let's make sure. + * This allows the VLAN to have a different MAC than the + * underlying device, and still route correctly. */ + if (!compare_ether_addr(eth_hdr(skb)->h_dest, + vlan_dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + } + + if (!(vlan_dev_info(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) { + unsigned int offset = skb->data - skb_mac_header(skb); + + /* + * vlan_insert_tag expect skb->data pointing to mac header. + * So change skb->data before calling it and change back to + * original position later + */ + skb_push(skb, offset); + skb = *skbp = vlan_insert_tag(skb, skb->vlan_tci); + if (!skb) + return false; + skb_pull(skb, offset + VLAN_HLEN); + skb_reset_mac_len(skb); + } + skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); skb->vlan_tci = 0; @@ -31,22 +56,8 @@ bool vlan_do_receive(struct sk_buff **skbp) u64_stats_update_begin(&rx_stats->syncp); rx_stats->rx_packets++; rx_stats->rx_bytes += skb->len; - - switch (skb->pkt_type) { - case PACKET_BROADCAST: - break; - case PACKET_MULTICAST: + if (skb->pkt_type == PACKET_MULTICAST) rx_stats->rx_multicast++; - break; - case PACKET_OTHERHOST: - /* Our lower layer thinks this is not local, let's make sure. - * This allows the VLAN to have a different MAC than the - * underlying device, and still route correctly. */ - if (!compare_ether_addr(eth_hdr(skb)->h_dest, - vlan_dev->dev_addr)) - skb->pkt_type = PACKET_HOST; - break; - } u64_stats_update_end(&rx_stats->syncp); return true; @@ -89,18 +100,13 @@ gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, } EXPORT_SYMBOL(vlan_gro_frags); -static struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) +static struct sk_buff *vlan_reorder_header(struct sk_buff *skb) { - if (vlan_dev_info(skb->dev)->flags & VLAN_FLAG_REORDER_HDR) { - if (skb_cow(skb, skb_headroom(skb)) < 0) - skb = NULL; - if (skb) { - /* Lifted from Gleb's VLAN code... */ - memmove(skb->data - ETH_HLEN, - skb->data - VLAN_ETH_HLEN, 12); - skb->mac_header += VLAN_HLEN; - } - } + if (skb_cow(skb, skb_headroom(skb)) < 0) + return NULL; + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN); + skb->mac_header += VLAN_HLEN; + skb_reset_mac_len(skb); return skb; } @@ -161,7 +167,7 @@ struct sk_buff *vlan_untag(struct sk_buff *skb) skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vhdr); - skb = vlan_check_reorder_header(skb); + skb = vlan_reorder_header(skb); if (unlikely(!skb)) goto err_free; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f247f5bff88..86bff9b1ac4 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -165,7 +165,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, u64_stats_update_begin(&stats->syncp); stats->tx_packets++; stats->tx_bytes += len; - u64_stats_update_begin(&stats->syncp); + u64_stats_update_end(&stats->syncp); } else { this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped); } @@ -586,9 +586,14 @@ static void vlan_dev_uninit(struct net_device *dev) static u32 vlan_dev_fix_features(struct net_device *dev, u32 features) { struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + u32 old_features = features; features &= real_dev->features; features &= real_dev->vlan_features; + + if (old_features & NETIF_F_SOFT_FEATURES) + features |= old_features & NETIF_F_SOFT_FEATURES; + if (dev_ethtool_get_rx_csum(real_dev)) features |= NETIF_F_RXCSUM; features |= NETIF_F_LLTX; diff --git a/net/9p/Kconfig b/net/9p/Kconfig index 7ed75c7bd5d..d9ea09b11cf 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -3,8 +3,8 @@ # menuconfig NET_9P - depends on NET && EXPERIMENTAL - tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" + depends on NET + tristate "Plan 9 Resource Sharing Support (9P2000)" help If you say Y here, you will get experimental support for Plan 9 resource sharing via the 9P2000 protocol. @@ -16,8 +16,8 @@ menuconfig NET_9P if NET_9P config NET_9P_VIRTIO - depends on EXPERIMENTAL && VIRTIO - tristate "9P Virtio Transport (Experimental)" + depends on VIRTIO + tristate "9P Virtio Transport" help This builds support for a transports between guest partitions and a host partition. diff --git a/net/9p/client.c b/net/9p/client.c index ceab943dfc4..9e3b0e640da 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -92,9 +92,6 @@ static int get_protocol_version(const substring_t *name) return version; } -static struct p9_req_t * -p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); - /** * parse_options - parse mount options into client structure * @opts: options string passed from mount @@ -307,12 +304,13 @@ static int p9_tag_init(struct p9_client *c) c->tagpool = p9_idpool_create(); if (IS_ERR(c->tagpool)) { err = PTR_ERR(c->tagpool); - c->tagpool = NULL; goto error; } - - p9_idpool_get(c->tagpool); /* reserve tag 0 */ - + err = p9_idpool_get(c->tagpool); /* reserve tag 0 */ + if (err < 0) { + p9_idpool_destroy(c->tagpool); + goto error; + } c->max_tag = 0; error: return err; @@ -518,12 +516,15 @@ out_err: return err; } +static struct p9_req_t * +p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); + /** * p9_client_flush - flush (cancel) a request * @c: client state * @oldreq: request to cancel * - * This sents a flush for a particular requests and links + * This sents a flush for a particular request and links * the flush request to the original request. The current * code only supports a single flush request although the protocol * allows for multiple flush requests to be sent for a single request. @@ -789,11 +790,13 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) spin_lock_init(&clnt->lock); INIT_LIST_HEAD(&clnt->fidlist); - p9_tag_init(clnt); + err = p9_tag_init(clnt); + if (err < 0) + goto free_client; err = parse_opts(options, clnt); if (err < 0) - goto free_client; + goto destroy_tagpool; if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); @@ -802,13 +805,12 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) err = -EPROTONOSUPPORT; P9_DPRINTK(P9_DEBUG_ERROR, "No transport defined or default transport\n"); - goto free_client; + goto destroy_tagpool; } clnt->fidpool = p9_idpool_create(); if (IS_ERR(clnt->fidpool)) { err = PTR_ERR(clnt->fidpool); - clnt->fidpool = NULL; goto put_trans; } @@ -834,6 +836,8 @@ destroy_fidpool: p9_idpool_destroy(clnt->fidpool); put_trans: v9fs_put_trans(clnt->trans_mod); +destroy_tagpool: + p9_idpool_destroy(clnt->tagpool); free_client: kfree(clnt); return ERR_PTR(err); @@ -1298,7 +1302,7 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset, if (count < rsize) rsize = count; - /* Don't bother zerocopy form small IO (< 1024) */ + /* Don't bother zerocopy for small IO (< 1024) */ if (((clnt->trans_mod->pref & P9_TRANS_PREF_PAYLOAD_MASK) == P9_TRANS_PREF_PAYLOAD_SEP) && (rsize > 1024)) { req = p9_client_rpc(clnt, P9_TREAD, "dqE", fid->fid, offset, diff --git a/net/9p/mod.c b/net/9p/mod.c index cf8a4128cd5..72c39827505 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -139,7 +139,7 @@ void v9fs_put_trans(struct p9_trans_module *m) } /** - * v9fs_init - Initialize module + * init_p9 - Initialize module * */ static int __init init_p9(void) @@ -154,7 +154,7 @@ static int __init init_p9(void) } /** - * v9fs_init - shutdown module + * exit_p9 - shutdown module * */ diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 4a9084395d3..fdfdb5747f6 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -916,8 +916,8 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) sin_server.sin_family = AF_INET; sin_server.sin_addr.s_addr = in_aton(addr); sin_server.sin_port = htons(opts.port); - err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &csocket); - + err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &csocket, 1); if (err) { P9_EPRINTK(KERN_ERR, "p9_trans_tcp: problem creating socket\n"); return err; @@ -954,7 +954,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) sun_server.sun_family = PF_UNIX; strcpy(sun_server.sun_path, addr); - err = sock_create_kern(PF_UNIX, SOCK_STREAM, 0, &csocket); + err = __sock_create(read_pnet(¤t->nsproxy->net_ns), PF_UNIX, + SOCK_STREAM, 0, &csocket, 1); if (err < 0) { P9_EPRINTK(KERN_ERR, "p9_trans_unix: problem creating socket\n"); return err; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 844a7a5607e..159c50f1c6b 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -589,7 +589,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) return -ENOMEM; /* Create the RDMA CM ID */ - rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP); + rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; diff --git a/net/9p/util.c b/net/9p/util.c index da6af81e59d..9c1c9348ac3 100644 --- a/net/9p/util.c +++ b/net/9p/util.c @@ -93,7 +93,7 @@ int p9_idpool_get(struct p9_idpool *p) retry: if (idr_pre_get(&p->pool, GFP_NOFS) == 0) - return 0; + return -1; spin_lock_irqsave(&p->lock, flags); diff --git a/net/atm/lec.c b/net/atm/lec.c index 25073b6ef47..ba48daa68c1 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1171,7 +1171,7 @@ static int __init lane_module_init(void) #endif register_atm_ioctl(&lane_ioctl_ops); - pr_info("lec.c: " __DATE__ " " __TIME__ " initialized\n"); + pr_info("lec.c: initialized\n"); return 0; } diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 644cdf07164..3ccca42e6f9 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1482,7 +1482,7 @@ static __init int atm_mpoa_init(void) if (mpc_proc_init() != 0) pr_info("failed to initialize /proc/mpoa\n"); - pr_info("mpc.c: " __DATE__ " " __TIME__ " initialized\n"); + pr_info("mpc.c: initialized\n"); return 0; } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index a6b2f86378c..32b8f9f7f79 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -49,7 +49,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_pull(skb, ETH_HLEN); rcu_read_lock(); - if (is_multicast_ether_addr(dest)) { + if (is_broadcast_ether_addr(dest)) + br_flood_deliver(br, skb); + else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { br_flood_deliver(br, skb); goto out; @@ -243,6 +245,7 @@ int br_netpoll_enable(struct net_bridge_port *p) goto out; np->dev = p->dev; + strlcpy(np->dev_name, p->dev->name, IFNAMSIZ); err = __netpoll_setup(np); if (err) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index f3ac1e858ee..f06ee39c73f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -60,7 +60,7 @@ int br_handle_frame_finish(struct sk_buff *skb) br = p->br; br_fdb_update(br, p, eth_hdr(skb)->h_source); - if (is_multicast_ether_addr(dest) && + if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && br_multicast_rcv(br, p, skb)) goto drop; @@ -77,7 +77,9 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; - if (is_multicast_ether_addr(dest)) { + if (is_broadcast_ether_addr(dest)) + skb2 = skb; + else if (is_multicast_ether_addr(dest)) { mdst = br_mdb_get(br, skb); if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) { if ((mdst && mdst->mglist) || diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 2f14eafdeea..2d85ca7111d 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1379,8 +1379,11 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) return -EINVAL; - if (iph->protocol != IPPROTO_IGMP) + if (iph->protocol != IPPROTO_IGMP) { + if ((iph->daddr & IGMP_LOCAL_GROUP_MASK) != IGMP_LOCAL_GROUP) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } len = ntohs(iph->tot_len); if (skb->len < len || len < ip_hdrlen(skb)) @@ -1424,7 +1427,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - BR_INPUT_SKB_CB(skb2)->mrouters_only = 1; + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip4_multicast_add_group(br, port, ih->group); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: @@ -1543,7 +1546,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, goto out; } mld = (struct mld_msg *)skb_transport_header(skb2); - BR_INPUT_SKB_CB(skb2)->mrouters_only = 1; + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca); break; } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3fa123185e8..56149ec36d7 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -104,10 +104,16 @@ static void fake_update_pmtu(struct dst_entry *dst, u32 mtu) { } +static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + return NULL; +} + static struct dst_ops fake_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .update_pmtu = fake_update_pmtu, + .cow_metrics = fake_cow_metrics, }; /* diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index 3a66b8c10e0..c23979e79df 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -255,7 +255,7 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) { - if ((ctrl == _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND || + if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && layer->id != 0) { diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 649ebacaf6b..adbb424403d 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -139,17 +139,14 @@ static void close_work(struct work_struct *work) struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; - /* May be called with or without RTNL lock held */ - int islocked = rtnl_is_locked(); - if (!islocked) - rtnl_lock(); + + rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); if (dev->state == CAIF_SHUTDOWN) dev_close(dev->netdev); } - if (!islocked) - rtnl_unlock(); + rtnl_unlock(); } static DECLARE_WORK(close_worker, close_work); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e15a82ccc05..78b55f49de7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -76,7 +76,8 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss) break; default: - sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %d)", + (int)ss->ss_family); } return s; @@ -598,7 +599,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static void prepare_connect_authorizer(struct ceph_connection *con) +static int prepare_connect_authorizer(struct ceph_connection *con) { void *auth_buf; int auth_len = 0; @@ -612,13 +613,20 @@ static void prepare_connect_authorizer(struct ceph_connection *con) con->auth_retry); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; + if (auth_len) { + con->out_kvec[con->out_kvec_left].iov_base = auth_buf; + con->out_kvec[con->out_kvec_left].iov_len = auth_len; + con->out_kvec_left++; + con->out_kvec_bytes += auth_len; + } + return 0; } /* @@ -640,9 +648,9 @@ static void prepare_write_banner(struct ceph_messenger *msgr, set_bit(WRITE_PENDING, &con->state); } -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int after_banner) +static int prepare_write_connect(struct ceph_messenger *msgr, + struct ceph_connection *con, + int after_banner) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -683,7 +691,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, con->out_more = 0; set_bit(WRITE_PENDING, &con->state); - prepare_connect_authorizer(con); + return prepare_connect_authorizer(con); } @@ -1065,8 +1073,10 @@ static void addr_set_port(struct sockaddr_storage *ss, int p) switch (ss->ss_family) { case AF_INET: ((struct sockaddr_in *)ss)->sin_port = htons(p); + break; case AF_INET6: ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + break; } } @@ -1216,6 +1226,7 @@ static int process_connect(struct ceph_connection *con) u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1250,7 +1261,9 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; - prepare_write_connect(con->msgr, con, 0); + ret = prepare_write_connect(con->msgr, con, 0); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1277,6 +1290,9 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) + return -EAGAIN; break; case CEPH_MSGR_TAG_RETRY_SESSION: @@ -1341,7 +1357,9 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect peer connecting WAIT\n"); + pr_err("process_connect got WAIT as client\n"); + con->error_msg = "protocol error, got WAIT as client"; + return -1; default: pr_err("connect protocol error, will retry\n"); @@ -1810,6 +1828,17 @@ static int try_read(struct ceph_connection *con) more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); + + /* + * process_connect and process_message drop and re-take + * con->mutex. make sure we handle a racing close or reopen. + */ + if (test_bit(CLOSED, &con->state) || + test_bit(OPENING, &con->state)) { + ret = -EAGAIN; + goto out; + } + if (test_bit(CONNECTING, &con->state)) { if (!test_bit(NEGOTIATING, &con->state)) { dout("try_read connecting\n"); @@ -1938,8 +1967,10 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); + int ret; mutex_lock(&con->mutex); +restart: if (test_and_clear_bit(BACKOFF, &con->state)) { dout("con_work %p backing off\n", con); if (queue_delayed_work(ceph_msgr_wq, &con->work, @@ -1969,18 +2000,31 @@ static void con_work(struct work_struct *work) con_close_socket(con); } - if (test_and_clear_bit(SOCK_CLOSED, &con->state) || - try_read(con) < 0 || - try_write(con) < 0) { - mutex_unlock(&con->mutex); - ceph_fault(con); /* error/fault path */ - goto done_unlocked; - } + if (test_and_clear_bit(SOCK_CLOSED, &con->state)) + goto fault; + + ret = try_read(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; + + ret = try_write(con); + if (ret == -EAGAIN) + goto restart; + if (ret < 0) + goto fault; done: mutex_unlock(&con->mutex); done_unlocked: con->ops->put(con); + return; + +fault: + mutex_unlock(&con->mutex); + ceph_fault(con); /* error/fault path */ + goto done_unlocked; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6b5dda1cb5d..7330c2757c0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -124,7 +124,7 @@ static void calc_layout(struct ceph_osd_client *osdc, ceph_calc_raw_layout(osdc, layout, vino.snap, off, plen, &bno, req, op); - sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); } @@ -477,8 +477,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, calc_layout(osdc, vino, layout, off, plen, req, ops); req->r_file_layout = *layout; /* keep a copy */ - /* in case it differs from natural alignment that calc_layout - filled in for us */ + /* in case it differs from natural (file) alignment that + calc_layout filled in for us */ + req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; ceph_osdc_build_request(req, off, plen, ops, @@ -1144,6 +1145,13 @@ static void handle_osds_timeout(struct work_struct *work) round_jiffies_relative(delay)); } +static void complete_request(struct ceph_osd_request *req) +{ + if (req->r_safe_callback) + req->r_safe_callback(req, NULL); + complete_all(&req->r_safe_completion); /* fsync waiter */ +} + /* * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. @@ -1226,11 +1234,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, else complete_all(&req->r_completion); - if (flags & CEPH_OSD_FLAG_ONDISK) { - if (req->r_safe_callback) - req->r_safe_callback(req, msg); - complete_all(&req->r_safe_completion); /* fsync waiter */ - } + if (flags & CEPH_OSD_FLAG_ONDISK) + complete_request(req); done: dout("req=%p req->r_linger=%d\n", req, req->r_linger); @@ -1421,6 +1426,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) done: downgrade_write(&osdc->map_sem); ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + + /* + * subscribe to subsequent osdmap updates if full to ensure + * we find out when we are no longer full and stop returning + * ENOSPC. + */ + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + ceph_monc_request_next_osdmap(&osdc->client->monc); + send_queued(osdc); up_read(&osdc->map_sem); wake_up_all(&osdc->client->auth_wq); @@ -1677,8 +1691,14 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, */ if (req->r_sent == 0) { rc = __map_request(osdc, req); - if (rc < 0) + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } goto out_unlock; + } if (req->r_osd == NULL) { dout("send_request %p no up osds in pg\n", req); ceph_monc_request_next_osdmap(&osdc->client->monc); @@ -1717,6 +1737,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc, __cancel_request(req); __unregister_request(osdc, req); mutex_unlock(&osdc->request_mutex); + complete_request(req); dout("wait_request tid %llu canceled/timed out\n", req->r_tid); return rc; } @@ -2007,8 +2028,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int want = calc_pages_for(req->r_page_alignment, data_len); if (unlikely(req->r_num_pages < want)) { - pr_warning("tid %lld reply %d > expected %d pages\n", - tid, want, m->nr_pages); + pr_warning("tid %lld reply has %d bytes %d pages, we" + " had only %d pages ready\n", tid, data_len, + want, req->r_num_pages); *skip = 1; ceph_msg_put(m); m = NULL; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 71603ac3dff..e97c3588c3e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -765,7 +765,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } map->epoch++; - map->modified = map->modified; + map->modified = modified; if (newcrush) { if (map->crush) crush_destroy(map->crush); @@ -830,15 +830,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, map->osd_addr[osd] = addr; } - /* new_down */ + /* new_state */ ceph_decode_32_safe(p, end, len, bad); while (len--) { u32 osd; + u8 xorstate; ceph_decode_32_safe(p, end, osd, bad); + xorstate = **(u8 **)p; (*p)++; /* clean flag */ - pr_info("osd%d down\n", osd); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + if (xorstate & CEPH_OSD_UP) + pr_info("osd%d down\n", osd); if (osd < map->max_osd) - map->osd_state[osd] &= ~CEPH_OSD_UP; + map->osd_state[osd] ^= xorstate; } /* new_weight */ diff --git a/net/core/dev.c b/net/core/dev.c index c7e305d13b7..9c58c1ec41a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2096,6 +2096,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, { const struct net_device_ops *ops = dev->netdev_ops; int rc = NETDEV_TX_OK; + unsigned int skb_len; if (likely(!skb->next)) { u32 features; @@ -2146,8 +2147,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } + skb_len = skb->len; rc = ops->ndo_start_xmit(skb, dev); - trace_net_dev_xmit(skb, rc); + trace_net_dev_xmit(skb, rc, dev, skb_len); if (rc == NETDEV_TX_OK) txq_trans_update(txq); return rc; @@ -2167,8 +2169,9 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); + skb_len = nskb->len; rc = ops->ndo_start_xmit(nskb, dev); - trace_net_dev_xmit(nskb, rc); + trace_net_dev_xmit(nskb, rc, dev, skb_len); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) goto out_kfree_gso_skb; @@ -3111,7 +3114,7 @@ static int __netif_receive_skb(struct sk_buff *skb) skb_reset_network_header(skb); skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; + skb_reset_mac_len(skb); pt_prev = NULL; @@ -6175,6 +6178,11 @@ static int dev_cpu_callback(struct notifier_block *nfb, oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } + /* Append NAPI poll list from offline CPU. */ + if (!list_empty(&oldsd->poll_list)) { + list_splice_init(&oldsd->poll_list, &sd->poll_list); + raise_softirq_irqoff(NET_RX_SOFTIRQ); + } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); @@ -6261,29 +6269,23 @@ err_name: /** * netdev_drivername - network driver for the device * @dev: network device - * @buffer: buffer for resulting name - * @len: size of buffer * * Determine network driver for device. */ -char *netdev_drivername(const struct net_device *dev, char *buffer, int len) +const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; - - if (len <= 0 || !buffer) - return buffer; - buffer[0] = 0; + const char *empty = ""; parent = dev->dev.parent; - if (!parent) - return buffer; + return empty; driver = parent->driver; if (driver && driver->name) - strlcpy(buffer, driver->name, len); - return buffer; + return driver->name; + return empty; } static int __netdev_printk(const char *level, const struct net_device *dev, diff --git a/net/core/dst.c b/net/core/dst.c index 9ccca038444..6135f367169 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -190,7 +190,8 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->lastuse = jiffies; dst->flags = flags; dst->next = NULL; - dst_entries_add(ops, 1); + if (!(flags & DST_NOCOUNT)) + dst_entries_add(ops, 1); return dst; } EXPORT_SYMBOL(dst_alloc); @@ -243,7 +244,8 @@ again: neigh_release(neigh); } - dst_entries_add(dst->ops, -1); + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); if (dst->ops->destroy) dst->ops->destroy(dst); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 11b98bc2aa8..33d2a1fba13 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1179,9 +1179,14 @@ static void remove_queue_kobjects(struct net_device *net) #endif } -static const void *net_current_ns(void) +static void *net_grab_current_ns(void) { - return current->nsproxy->net_ns; + struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS + if (ns) + atomic_inc(&ns->passive); +#endif + return ns; } static const void *net_initial_ns(void) @@ -1196,22 +1201,13 @@ static const void *net_netlink_ns(struct sock *sk) struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, - .current_ns = net_current_ns, + .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, + .drop_ns = net_drop_ns, }; EXPORT_SYMBOL_GPL(net_ns_type_operations); -static void net_kobj_ns_exit(struct net *net) -{ - kobj_ns_exit(KOBJ_NS_TYPE_NET, net); -} - -static struct pernet_operations kobj_net_ops = { - .exit = net_kobj_ns_exit, -}; - - #ifdef CONFIG_HOTPLUG static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) { @@ -1339,6 +1335,5 @@ EXPORT_SYMBOL(netdev_class_remove_file); int netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); - register_pernet_subsys(&kobj_net_ops); return class_register(&net_class); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2e2dce6583e..ea489db1bc2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -8,6 +8,8 @@ #include <linux/idr.h> #include <linux/rculist.h> #include <linux/nsproxy.h> +#include <linux/proc_fs.h> +#include <linux/file.h> #include <net/net_namespace.h> #include <net/netns/generic.h> @@ -126,6 +128,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); + atomic_set(&net->passive, 1); #ifdef NETNS_REFCNT_DEBUG atomic_set(&net->use_count, 0); @@ -208,6 +211,13 @@ static void net_free(struct net *net) kmem_cache_free(net_cachep, net); } +void net_drop_ns(void *p) +{ + struct net *ns = p; + if (ns && atomic_dec_and_test(&ns->passive)) + net_free(ns); +} + struct net *copy_net_ns(unsigned long flags, struct net *old_net) { struct net *net; @@ -228,7 +238,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) } mutex_unlock(&net_mutex); if (rv < 0) { - net_free(net); + net_drop_ns(net); return ERR_PTR(rv); } return net; @@ -284,7 +294,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - net_free(net); + net_drop_ns(net); } } static DECLARE_WORK(net_cleanup_work, cleanup_net); @@ -302,6 +312,26 @@ void __put_net(struct net *net) } EXPORT_SYMBOL_GPL(__put_net); +struct net *get_net_ns_by_fd(int fd) +{ + struct proc_inode *ei; + struct file *file; + struct net *net; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return ERR_CAST(file); + + ei = PROC_I(file->f_dentry->d_inode); + if (ei->ns_ops == &netns_operations) + net = get_net(ei->ns); + else + net = ERR_PTR(-EINVAL); + + fput(file); + return net; +} + #else struct net *copy_net_ns(unsigned long flags, struct net *old_net) { @@ -309,6 +339,11 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) return ERR_PTR(-EINVAL); return old_net; } + +struct net *get_net_ns_by_fd(int fd) +{ + return ERR_PTR(-EINVAL); +} #endif struct net *get_net_ns_by_pid(pid_t pid) @@ -561,3 +596,39 @@ void unregister_pernet_device(struct pernet_operations *ops) mutex_unlock(&net_mutex); } EXPORT_SYMBOL_GPL(unregister_pernet_device); + +#ifdef CONFIG_NET_NS +static void *netns_get(struct task_struct *task) +{ + struct net *net = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) + net = get_net(nsproxy->net_ns); + rcu_read_unlock(); + + return net; +} + +static void netns_put(void *ns) +{ + put_net(ns); +} + +static int netns_install(struct nsproxy *nsproxy, void *ns) +{ + put_net(nsproxy->net_ns); + nsproxy->net_ns = get_net(ns); + return 0; +} + +const struct proc_ns_operations netns_operations = { + .name = "net", + .type = CLONE_NEWNET, + .get = netns_get, + .put = netns_put, + .install = netns_install, +}; +#endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2d7d6d47378..18d9cbda3a3 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -792,6 +792,13 @@ int netpoll_setup(struct netpoll *np) return -ENODEV; } + if (ndev->master) { + printk(KERN_ERR "%s: %s is a slave device, aborting.\n", + np->name, np->dev_name); + err = -EBUSY; + goto put; + } + if (!netif_running(ndev)) { unsigned long atmost, atleast; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d56cb9b0b9..abd936d8a71 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1046,6 +1046,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_NET_NS_FD] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, @@ -1094,6 +1095,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else if (tb[IFLA_NET_NS_FD]) + net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); else net = get_net(src_net); return net; @@ -1224,7 +1227,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int send_addr_notify = 0; int err; - if (tb[IFLA_NET_NS_PID]) { + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { struct net *net = rtnl_link_get_net(dev_net(dev), tb); if (IS_ERR(net)) { err = PTR_ERR(net); diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index cfa7a5e1c5c..fa000d26dc6 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -212,10 +212,12 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m) int err = key->type_data.x[0]; seq_puts(m, key->description); - if (err) - seq_printf(m, ": %d", err); - else - seq_printf(m, ": %u", key->datalen); + if (key_is_instantiated(key)) { + if (err) + seq_printf(m, ": %d", err); + else + seq_printf(m, ": %u", key->datalen); + } } /* diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index ed0eab39f53..02548b292b5 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -44,7 +44,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 pid, pr_debug("%s\n", __func__); if (!buf) - goto out; + return -EMSGSIZE; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, IEEE802154_LIST_PHY); @@ -65,6 +65,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 pid, pages * sizeof(uint32_t), buf); mutex_unlock(&phy->pib_lock); + kfree(buf); return genlmsg_end(msg, hdr); nla_put_failure: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cc1463156cd..ef1528af7ab 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -465,6 +465,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; + if (addr->sin_family != AF_INET) { + err = -EAFNOSUPPORT; + goto out; + } + chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); /* Not specified by any standard per-se, however it breaks too @@ -673,6 +678,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) lock_sock(sk2); + sock_rps_record_flow(sk2); WARN_ON(!((1 << sk2->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 61fac4cabc7..c14d88ad348 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * This struct holds the first and last local port number. */ struct local_ports sysctl_local_ports __read_mostly = { - .lock = SEQLOCK_UNLOCKED, + .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), .range = { 32768, 61000 }, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 6ffe94ca5bc..3267d389843 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -437,7 +437,7 @@ static int valid_cc(const void *bc, int len, int cc) return 0; if (cc == len) return 1; - if (op->yes < 4) + if (op->yes < 4 || op->yes & 3) return 0; len -= op->yes; bc += op->yes; @@ -447,11 +447,11 @@ static int valid_cc(const void *bc, int len, int cc) static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { - const unsigned char *bc = bytecode; + const void *bc = bytecode; int len = bytecode_len; while (len > 0) { - struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; + const struct inet_diag_bc_op *op = bc; //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { @@ -462,22 +462,20 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: - if (op->yes < 4 || op->yes > len + 4) - return -EINVAL; case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len + 4) + if (op->no < 4 || op->no > len + 4 || op->no & 3) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len - op->no)) return -EINVAL; break; case INET_DIAG_BC_NOP: - if (op->yes < 4 || op->yes > len + 4) - return -EINVAL; break; default: return -EINVAL; } + if (op->yes < 4 || op->yes > len + 4 || op->yes & 3) + return -EINVAL; bc += op->yes; len -= op->yes; } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index c3118e1cd3b..ec93335901d 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <asm/uaccess.h> +#include <asm/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> @@ -350,7 +351,7 @@ int ip_options_compile(struct net *net, goto error; } if (optptr[2] <= optlen) { - __be32 *timeptr = NULL; + unsigned char *timeptr = NULL; if (optptr[2]+3 > optptr[1]) { pp_ptr = optptr + 2; goto error; @@ -359,7 +360,7 @@ int ip_options_compile(struct net *net, case IPOPT_TS_TSONLY: opt->ts = optptr - iph; if (skb) - timeptr = (__be32*)&optptr[optptr[2]-1]; + timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; @@ -371,7 +372,7 @@ int ip_options_compile(struct net *net, opt->ts = optptr - iph; if (rt) { memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); - timeptr = (__be32*)&optptr[optptr[2]+3]; + timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; @@ -389,7 +390,7 @@ int ip_options_compile(struct net *net, if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) - timeptr = (__be32*)&optptr[optptr[2]+3]; + timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; @@ -403,10 +404,10 @@ int ip_options_compile(struct net *net, } if (timeptr) { struct timespec tv; - __be32 midtime; + u32 midtime; getnstimeofday(&tv); - midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); - memcpy(timeptr, &midtime, sizeof(__be32)); + midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; + put_unaligned_be32(midtime, timeptr); opt->is_changed = 1; } } else { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 98af3697c71..84f26e8e6c6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -799,9 +799,9 @@ static int __ip_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; - exthdrlen = transhdrlen ? rt->dst.header_len : 0; - length += exthdrlen; - transhdrlen += exthdrlen; + skb = skb_peek_tail(queue); + + exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->fragsize; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -825,12 +825,10 @@ static int __ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; - skb = skb_peek_tail(queue); - cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -883,17 +881,16 @@ alloc_new_skb: else alloclen = fraglen; + alloclen += exthdrlen; + /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) { + if (datalen == length + fraggap) alloclen += rt->dst.trailer_len; - /* make sure mtu is not reached */ - if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) - datalen -= ALIGN(rt->dst.trailer_len, 8); - } + if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, @@ -926,11 +923,11 @@ alloc_new_skb: /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen); + data = skb_put(skb, fraglen + exthdrlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); - data += fragheaderlen; + data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( @@ -1064,7 +1061,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, */ *rtp = NULL; cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + rt->dst.dev->mtu : dst_mtu(&rt->dst); cork->dst = &rt->dst; cork->length = 0; cork->tx_flags = ipc->tx_flags; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 4614babdc45..2e97e3ec1eb 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -17,51 +17,35 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; - unsigned long orefdst; + __be32 saddr = iph->saddr; + __u8 flags = 0; unsigned int hh_len; - unsigned int type; - type = inet_addr_type(net, iph->saddr); - if (skb->sk && inet_sk(skb->sk)->transparent) - type = RTN_LOCAL; - if (addr_type == RTN_UNSPEC) - addr_type = type; + if (!skb->sk && addr_type != RTN_LOCAL) { + if (addr_type == RTN_UNSPEC) + addr_type = inet_addr_type(net, saddr); + if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) + flags |= FLOWI_FLAG_ANYSRC; + else + saddr = 0; + } /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ - if (addr_type == RTN_LOCAL) { - fl4.daddr = iph->daddr; - if (type == RTN_LOCAL) - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; - fl4.flowi4_mark = skb->mark; - fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return -1; - - /* Drop old route. */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl4.daddr = iph->saddr; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return -1; + fl4.daddr = iph->daddr; + fl4.saddr = saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return -1; - orefdst = skb->_skb_refdst; - if (ip_route_input(skb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->dst.dev) != 0) { - dst_release(&rt->dst); - return -1; - } - dst_release(&rt->dst); - refdst_drop(orefdst); - } + /* Drop old route. */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); if (skb_dst(skb)->error) return -1; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index d2c1311cb28..5c9b9d96391 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -203,7 +203,8 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) else pmsg->outdev_name[0] = '\0'; - if (entry->indev && entry->skb->dev) { + if (entry->indev && entry->skb->dev && + entry->skb->mac_header != entry->skb->network_header) { pmsg->hw_type = entry->skb->dev->type; pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); @@ -402,7 +403,8 @@ ipq_dev_drop(int ifindex) static inline void __ipq_rcv_skb(struct sk_buff *skb) { - int status, type, pid, flags, nlmsglen, skblen; + int status, type, pid, flags; + unsigned int nlmsglen, skblen; struct nlmsghdr *nlh; skblen = skb->len; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 76474384350..24e556e83a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -566,7 +566,7 @@ check_entry(const struct ipt_entry *e, const char *name) const struct xt_entry_target *t; if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, par->match->name); + duprintf("ip check failed %p %s.\n", e, name); return -EINVAL; } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index d609ac3cb9a..5c9e97c7901 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -307,7 +307,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * error messages (RELATED) and information requests (see below) */ if (ip_hdr(skb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) + ctinfo == IP_CT_RELATED_REPLY)) return XT_CONTINUE; /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, @@ -321,12 +321,12 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) ct->mark = hash; break; case IP_CT_RELATED: - case IP_CT_RELATED+IP_CT_IS_REPLY: + case IP_CT_RELATED_REPLY: /* FIXME: we don't handle expectations at the * moment. they can arrive on a different node than * the master connection (e.g. FTP passive mode) */ case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: + case IP_CT_ESTABLISHED_REPLY: break; default: break; diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index d2ed9dc74eb..9931152a78b 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) nat = nfct_nat(ct); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + ctinfo == IP_CT_RELATED_REPLY)); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 1ff79e557f9..51f13f8ec72 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -40,7 +40,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _otcph, *tcph; - unsigned int addr_type; /* IP header checks: fragment. */ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) @@ -55,6 +54,9 @@ static void send_reset(struct sk_buff *oldskb, int hook) if (oth->rst) return; + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) return; @@ -101,19 +103,11 @@ static void send_reset(struct sk_buff *oldskb, int hook) nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); - addr_type = RTN_UNSPEC; - if (hook != NF_INET_FORWARD -#ifdef CONFIG_BRIDGE_NETFILTER - || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) -#endif - ) - addr_type = RTN_LOCAL; - /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(nskb, addr_type)) + if (ip_route_me_harder(nskb, RTN_UNSPEC)) goto free_nskb; niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index af6e9c77834..2b57e52c746 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -25,7 +25,8 @@ MODULE_LICENSE("GPL"); static inline bool match_ip(const struct sk_buff *skb, const struct ipt_ecn_info *einfo) { - return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; + return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^ + !!(einfo->invert & IPT_ECN_OP_MATCH_IP); } static inline bool match_tcp(const struct sk_buff *skb, @@ -76,8 +77,6 @@ static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { - if (ip_hdr(skb)->protocol != IPPROTO_TCP) - return false; if (!match_tcp(skb, info, &par->hotdrop)) return false; } @@ -97,7 +96,7 @@ static int ecn_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && - ip->proto != IPPROTO_TCP) { + (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { pr_info("cannot match TCP bits in rule for non-tcp packets\n"); return -EINVAL; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a03c02af99..de9da21113a 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -101,7 +101,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) + if (!ct || ctinfo == IP_CT_RELATED_REPLY) goto out; help = nfct_help(ct); @@ -121,7 +121,9 @@ static unsigned int ipv4_confirm(unsigned int hooknum, return ret; } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { typeof(nf_nat_seq_adjust_hook) seq_adjust; seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 7404bde9599..ab5b27a2916 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Update skb to refer to this connection */ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; + return NF_ACCEPT; } /* Small and modified version of icmp_rcv */ diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 9c71b2755ce..3346de5d94d 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -433,7 +433,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* Must be RELATED */ NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || - skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); + skb->nfctinfo == IP_CT_RELATED_REPLY); /* Redirects on non-null nats must be dropped, else they'll start talking to each other without our translation, and be diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 99cfa28b6d3..ebc5f8894f9 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -160,7 +160,7 @@ static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data if (skb->ip_summed != CHECKSUM_PARTIAL) { if (!(rt->rt_flags & RTCF_LOCAL) && - skb->dev->features & NETIF_F_V4_CSUM) { + (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 21c30426480..733c9abc1cb 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -53,7 +53,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + ctinfo == IP_CT_RELATED_REPLY)); NF_CT_ASSERT(par->out != NULL); return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 7317bdf1d45..483b76d042d 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -116,7 +116,7 @@ nf_nat_fn(unsigned int hooknum, switch (ctinfo) { case IP_CT_RELATED: - case IP_CT_RELATED+IP_CT_IS_REPLY: + case IP_CT_RELATED_REPLY: if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(ct, ctinfo, hooknum, skb)) @@ -144,7 +144,7 @@ nf_nat_fn(unsigned int hooknum, default: /* ESTABLISHED */ NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); + ctinfo == IP_CT_ESTABLISHED_REPLY); } return nf_nat_packet(ct, ctinfo, hooknum, skb); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9aaa67165f4..39b403f854c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -41,7 +41,6 @@ #include <linux/proc_fs.h> #include <net/sock.h> #include <net/ping.h> -#include <net/icmp.h> #include <net/udp.h> #include <net/route.h> #include <net/inet_common.h> diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 52b0b956508..aa13ef10511 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1316,6 +1316,23 @@ reject_redirect: ; } +static bool peer_pmtu_expired(struct inet_peer *peer) +{ + unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + + return orig && + time_after_eq(jiffies, orig) && + cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +} + +static bool peer_pmtu_cleaned(struct inet_peer *peer) +{ + unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); + + return orig && + cmpxchg(&peer->pmtu_expires, orig, 0) == orig; +} + static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) { struct rtable *rt = (struct rtable *)dst; @@ -1331,14 +1348,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) rt_genid(dev_net(dst->dev))); rt_del(hash, rt); ret = NULL; - } else if (rt->peer && - rt->peer->pmtu_expires && - time_after_eq(jiffies, rt->peer->pmtu_expires)) { - unsigned long orig = rt->peer->pmtu_expires; - - if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) - dst_metric_set(dst, RTAX_MTU, - rt->peer->pmtu_orig); + } else if (rt->peer && peer_pmtu_expired(rt->peer)) { + dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); } } return ret; @@ -1531,8 +1542,10 @@ unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) { - unsigned long expires = peer->pmtu_expires; + unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); + if (!expires) + return; if (time_before(jiffies, expires)) { u32 orig_dst_mtu = dst_mtu(dst); if (peer->pmtu_learned < orig_dst_mtu) { @@ -1555,10 +1568,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) rt_bind_peer(rt, rt->rt_dst, 1); peer = rt->peer; if (peer) { + unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { - unsigned long pmtu_expires; + if (!pmtu_expires || mtu < peer->pmtu_learned) { pmtu_expires = jiffies + ip_rt_mtu_expires; if (!pmtu_expires) @@ -1612,13 +1626,14 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) rt_bind_peer(rt, rt->rt_dst, 0); peer = rt->peer; - if (peer && peer->pmtu_expires) + if (peer) { check_peer_pmtu(dst, peer); - if (peer && peer->redirect_learned.a4 && - peer->redirect_learned.a4 != rt->rt_gateway) { - if (check_peer_redir(dst, peer)) - return NULL; + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } } rt->rt_peer_genid = rt_peer_genid(); @@ -1649,14 +1664,8 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt && - rt->peer && - rt->peer->pmtu_expires) { - unsigned long orig = rt->peer->pmtu_expires; - - if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) - dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); - } + if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); } static int ip_rt_bug(struct sk_buff *skb) @@ -1770,8 +1779,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, sizeof(u32) * RTAX_MAX); dst_init_metrics(&rt->dst, peer->metrics, false); - if (peer->pmtu_expires) - check_peer_pmtu(&rt->dst, peer); + check_peer_pmtu(&rt->dst, peer); if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; @@ -1894,9 +1902,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); rth = rt_intern_hash(hash, rth, skb, dev->ifindex); - err = 0; - if (IS_ERR(rth)) - err = PTR_ERR(rth); + return IS_ERR(rth) ? PTR_ERR(rth) : 0; e_nobufs: return -ENOBUFS; @@ -2775,7 +2781,8 @@ static int rt_fill_info(struct net *net, struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; - long expires; + long expires = 0; + const struct inet_peer *peer = rt->peer; u32 id = 0, ts = 0, tsage = 0, error; nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); @@ -2823,15 +2830,16 @@ static int rt_fill_info(struct net *net, NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); error = rt->dst.error; - expires = (rt->peer && rt->peer->pmtu_expires) ? - rt->peer->pmtu_expires - jiffies : 0; - if (rt->peer) { + if (peer) { inet_peer_refcheck(rt->peer); - id = atomic_read(&rt->peer->ip_id_count) & 0xffff; - if (rt->peer->tcp_ts_stamp) { - ts = rt->peer->tcp_ts; - tsage = get_seconds() - rt->peer->tcp_ts_stamp; + id = atomic_read(&peer->ip_id_count) & 0xffff; + if (peer->tcp_ts_stamp) { + ts = peer->tcp_ts; + tsage = get_seconds() - peer->tcp_ts_stamp; } + expires = ACCESS_ONCE(peer->pmtu_expires); + if (expires) + expires -= jiffies; } if (rt_is_input_route(rt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 054a59d21eb..46febcacb72 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3220,7 +3220,7 @@ __setup("thash_entries=", set_thash_entries); void __init tcp_init(void) { struct sk_buff *skb = NULL; - unsigned long nr_pages, limit; + unsigned long limit; int i, max_share, cnt; unsigned long jiffy = jiffies; @@ -3277,13 +3277,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - /* Set the pressure threshold to be a fraction of global memory that - * is up to 1/2 at 256 MB, decreasing toward zero with the amount of - * memory, with a floor of 128 pages. - */ - nr_pages = totalram_pages - totalhigh_pages; - limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; sysctl_tcp_mem[1] = limit; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a7d6671e33b..708dc203b03 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1589,6 +1589,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; if (nsk != sk) { + sock_rps_save_rxhash(nsk, skb->rxhash); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index abca870d8ff..198f75b7bdd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1249,6 +1249,9 @@ csum_copy_err: if (noblock) return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; goto try_again; } @@ -2206,16 +2209,10 @@ void __init udp_table_init(struct udp_table *table, const char *name) void __init udp_init(void) { - unsigned long nr_pages, limit; + unsigned long limit; udp_table_init(&udp_table, "UDP"); - /* Set the pressure threshold up by the same strategy of TCP. It is a - * fraction of global memory that is up to 1/2 at 256 MB, decreasing - * toward zero with the amount of memory, with a floor of 128 pages. - */ - nr_pages = totalram_pages - totalhigh_pages; - limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 2d51840e53a..327a617d594 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -32,7 +32,12 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) dst = skb_dst(skb); mtu = dst_mtu(dst); if (skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + if (skb->sk) + ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr, + inet_sk(skb->sk)->inet_dport, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); ret = -EMSGSIZE; } out: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b7919f901fb..3b5669a2582 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -272,6 +272,10 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + addr_type = ipv6_addr_type(&addr->sin6_addr); if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) return -EINVAL; diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index 413ab0754e1..24939486328 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -204,7 +204,8 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) else pmsg->outdev_name[0] = '\0'; - if (entry->indev && entry->skb->dev) { + if (entry->indev && entry->skb->dev && + entry->skb->mac_header != entry->skb->network_header) { pmsg->hw_type = entry->skb->dev->type; pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); } @@ -403,7 +404,8 @@ ipq_dev_drop(int ifindex) static inline void __ipq_rcv_skb(struct sk_buff *skb) { - int status, type, pid, flags, nlmsglen, skblen; + int status, type, pid, flags; + unsigned int nlmsglen, skblen; struct nlmsghdr *nlh; skblen = skb->len; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index c8af58b2256..4111050a9fc 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -160,7 +160,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) + if (!ct || ctinfo == IP_CT_RELATED_REPLY) goto out; help = nfct_help(ct); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 1df3c8b6bf4..7c05e7eacbc 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -177,7 +177,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, /* Update skb to refer to this connection */ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; skb->nfctinfo = *ctinfo; - return -NF_ACCEPT; + return NF_ACCEPT; } static int diff --git a/net/ipv6/route.c b/net/ipv6/route.c index de2b1decd78..0ef1f086feb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -228,9 +228,10 @@ static struct rt6_info ip6_blk_hole_entry_template = { /* allocate dst with ip6_dst_ops */ static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops, - struct net_device *dev) + struct net_device *dev, + int flags) { - struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, 0); + struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags); memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry)); @@ -1042,7 +1043,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, if (unlikely(idev == NULL)) return NULL; - rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev); + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); if (unlikely(rt == NULL)) { in6_dev_put(idev); goto out; @@ -1062,14 +1063,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); rt->dst.output = ip6_output; -#if 0 /* there's no chance to use these for ndisc */ - rt->dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST - ? DST_HOST - : 0; - ipv6_addr_copy(&rt->rt6i_dst.addr, addr); - rt->rt6i_dst.plen = 128; -#endif - spin_lock_bh(&icmp6_dst_lock); rt->dst.next = icmp6_dst_gc_list; icmp6_dst_gc_list = &rt->dst; @@ -1214,7 +1207,7 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } - rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL); + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT); if (rt == NULL) { err = -ENOMEM; @@ -1244,7 +1237,7 @@ int ip6_route_add(struct fib6_config *cfg) ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) - rt->dst.flags = DST_HOST; + rt->dst.flags |= DST_HOST; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1734,7 +1727,7 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) { struct net *net = dev_net(ort->rt6i_dev); struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, - ort->dst.dev); + ort->dst.dev, 0); if (rt) { rt->dst.input = ort->dst.input; @@ -2013,7 +2006,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { struct net *net = dev_net(idev->dev); struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, - net->loopback_dev); + net->loopback_dev, 0); struct neighbour *neigh; if (rt == NULL) { @@ -2025,7 +2018,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, in6_dev_hold(idev); - rt->dst.flags = DST_HOST; + rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_idev = idev; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1fd28711ba..87551ca568c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1644,6 +1644,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * the new socket.. */ if(nsk != sk) { + sock_rps_save_rxhash(nsk, skb->rxhash); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 41f8c9c08db..328985c4088 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -453,8 +453,11 @@ csum_copy_err: } unlock_sock_fast(sk, slow); - if (flags & MSG_DONTWAIT) + if (noblock) return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; goto try_again; } diff --git a/net/irda/iriap.c b/net/irda/iriap.c index 36477538cea..f876eed7d4a 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -87,6 +87,8 @@ static inline void iriap_start_watchdog_timer(struct iriap_cb *self, iriap_watchdog_timer_expired); } +static struct lock_class_key irias_objects_key; + /* * Function iriap_init (void) * @@ -114,6 +116,9 @@ int __init iriap_init(void) return -ENOMEM; } + lockdep_set_class_and_name(&irias_objects->hb_spinlock, &irias_objects_key, + "irias_objects"); + /* * Register some default services for IrLMP */ diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index a15c0152495..7f9124914b1 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -54,7 +54,7 @@ #include <asm/atomic.h> #include <asm/ebcdic.h> #include <asm/io.h> -#include <asm/s390_ext.h> +#include <asm/irq.h> #include <asm/smp.h> /* diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index b8dbae82fab..76130134bfa 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -258,7 +258,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file) */ pd->net = get_net_ns_by_pid(current->pid); if (IS_ERR(pd->net)) { - rc = -PTR_ERR(pd->net); + rc = PTR_ERR(pd->net); goto err_free_pd; } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 8041befc655..42aa64b6b0b 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -767,7 +767,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, if (!attr[IPSET_ATTR_SETNAME]) { for (i = 0; i < ip_set_max; i++) { if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { - ret = IPSET_ERR_BUSY; + ret = -IPSET_ERR_BUSY; goto out; } } diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 4743e540252..565a7c5b881 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -146,8 +146,9 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet4_elem data = - { .cidr = h->nets[0].cidr || HOST_MASK }; + struct hash_ipportnet4_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; if (data.cidr == 0) return -EINVAL; @@ -394,8 +395,9 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_ipportnet6_elem data = - { .cidr = h->nets[0].cidr || HOST_MASK }; + struct hash_ipportnet6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; if (data.cidr == 0) return -EINVAL; diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c4db202b7da..2aeeabcd5a2 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -131,7 +131,9 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net4_elem data = { .cidr = h->nets[0].cidr || HOST_MASK }; + struct hash_net4_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; if (data.cidr == 0) return -EINVAL; @@ -296,7 +298,9 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, { const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_net6_elem data = { .cidr = h->nets[0].cidr || HOST_MASK }; + struct hash_net6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; if (data.cidr == 0) return -EINVAL; diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index d2a40362dd3..e50d9bb8820 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -144,7 +144,8 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem data = { - .cidr = h->nets[0].cidr || HOST_MASK }; + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; if (data.cidr == 0) return -EINVAL; @@ -357,7 +358,8 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct ip_set_hash *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport6_elem data = { - .cidr = h->nets[0].cidr || HOST_MASK }; + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; if (data.cidr == 0) return -EINVAL; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index bf28ac2fc99..782db275ac5 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -776,8 +776,16 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) + if (cp->flags & IP_VS_CONN_F_NFCT) { ip_vs_conn_drop_conntrack(cp); + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } ip_vs_pe_put(cp->pe); kfree(cp->pe_data); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index bfa808f4da1..24c28d238dc 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1772,7 +1772,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_IN, - .priority = 99, + .priority = NF_IP_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1782,7 +1782,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_IN, - .priority = 101, + .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { @@ -1790,7 +1790,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_OUT, - .priority = -99, + .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { @@ -1798,7 +1798,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_OUT, - .priority = -98, + .priority = NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1824,7 +1824,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_INET_LOCAL_IN, - .priority = 99, + .priority = NF_IP6_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1834,7 +1834,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_INET_LOCAL_IN, - .priority = 101, + .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { @@ -1842,7 +1842,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET, .hooknum = NF_INET_LOCAL_OUT, - .priority = -99, + .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { @@ -1850,7 +1850,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .owner = THIS_MODULE, .pf = PF_INET6, .hooknum = NF_INET_LOCAL_OUT, - .priority = -98, + .priority = NF_IP6_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1945,6 +1945,7 @@ static void __net_exit __ip_vs_dev_cleanup(struct net *net) { EnterFunction(2); net_ipvs(net)->enable = 0; /* Disable packet reception */ + smp_wmb(); __ip_vs_sync_cleanup(net); LeaveFunction(2); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2e1c11f7841..f7af8b86601 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -850,7 +850,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, /* It exists; we have (non-exclusive) reference. */ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { - *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + *ctinfo = IP_CT_ESTABLISHED_REPLY; /* Please set reply bit if this packet OK */ *set_reply = 1; } else { @@ -922,6 +922,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, ret = -ret; goto out; } + /* ICMP[v6] protocol trackers may assign one conntrack. */ + if (skb->nfct) + goto out; } ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, @@ -1143,7 +1146,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) /* This ICMP is in reverse direction to the packet which caused it */ ct = nf_ct_get(skb, &ctinfo); if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) - ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + ctinfo = IP_CT_RELATED_REPLY; else ctinfo = IP_CT_RELATED; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e17cb7c7dd8..6f5801eac99 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -368,7 +368,7 @@ static int help(struct sk_buff *skb, /* Until there's been traffic both ways, don't look in packets. */ if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + ctinfo != IP_CT_ESTABLISHED_REPLY) { pr_debug("ftp: Conntrackinfo = %u\n", ctinfo); return NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 18b2ce5c8ce..f03c2d4539f 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -571,10 +571,9 @@ static int h245_help(struct sk_buff *skb, unsigned int protoff, int ret; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; - } + pr_debug("nf_ct_h245: skblen = %u\n", skb->len); spin_lock_bh(&nf_h323_lock); @@ -1125,10 +1124,9 @@ static int q931_help(struct sk_buff *skb, unsigned int protoff, int ret; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; - } + pr_debug("nf_ct_q931: skblen = %u\n", skb->len); spin_lock_bh(&nf_h323_lock); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index b394aa31877..4f9390b9869 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -125,8 +125,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, return NF_ACCEPT; /* Until there's been traffic both ways, don't look in packets. */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; /* Not a full tcp header? */ diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 088944824e1..2fd4565144d 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -519,8 +519,7 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, u_int16_t msg; /* don't do any tracking before tcp handshake complete */ - if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; nexthdr_off = protoff; diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index d9e27734b2a..8501823b3f9 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -78,7 +78,7 @@ static int help(struct sk_buff *skb, ct_sane_info = &nfct_help(ct)->help.ct_sane_info; /* Until there's been traffic both ways, don't look in packets. */ if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) + ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; /* Not a full tcp header? */ diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index cb5a2858178..93faf6a3a63 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1423,7 +1423,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; if (ctinfo != IP_CT_ESTABLISHED && - ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) + ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; /* No Data ? */ diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index e0ee010935e..2e7ccbb43dd 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -456,7 +456,8 @@ __build_packet_message(struct nfulnl_instance *inst, if (skb->mark) NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark)); - if (indev && skb->dev) { + if (indev && skb->dev && + skb->mac_header != skb->network_header) { struct nfulnl_msg_packet_hw phw; int len = dev_parse_header(skb, phw.hw_addr); if (len > 0) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index b83123f12b4..fdd2fafe0a1 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -335,7 +335,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (entskb->mark) NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark)); - if (indev && entskb->dev) { + if (indev && entskb->dev && + entskb->mac_header != entskb->network_header) { struct nfqnl_msg_packet_hw phw; int len = dev_parse_header(entskb, phw.hw_addr); if (len) { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 9cc46356b57..fe39f7e913d 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -143,9 +143,9 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct) && ((iph->protocol != IPPROTO_ICMP && - ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) || + ctinfo == IP_CT_ESTABLISHED_REPLY) || (iph->protocol == IPPROTO_ICMP && - ctinfo == IP_CT_IS_REPLY + IP_CT_RELATED)) && + ctinfo == IP_CT_RELATED_REPLY)) && (ct->status & IPS_SRC_NAT_DONE)) { daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 925f715686a..c0c3cda1971 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -798,7 +798,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, getnstimeofday(&ts); h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; - h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); + if (vlan_tx_tag_present(skb)) { + h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); + status |= TP_STATUS_VLAN_VALID; + } else { + h.h2->tp_vlan_tci = 0; + } + h.h2->tp_padding = 0; hdrlen = sizeof(*h.h2); break; default: @@ -1725,8 +1731,13 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); - aux.tp_vlan_tci = vlan_tx_tag_get(skb); - + if (vlan_tx_tag_present(skb)) { + aux.tp_vlan_tci = vlan_tx_tag_get(skb); + aux.tp_status |= TP_STATUS_VLAN_VALID; + } else { + aux.tp_vlan_tci = 0; + } + aux.tp_padding = 0; put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } diff --git a/net/rds/ib.c b/net/rds/ib.c index cce19f95c62..3b83086bcc3 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -325,7 +325,7 @@ static int rds_ib_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index ee369d201a6..fd453dd5124 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -587,7 +587,7 @@ int rds_ib_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; diff --git a/net/rds/iw.c b/net/rds/iw.c index 5a9676fe594..f7474844f09 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -226,7 +226,7 @@ static int rds_iw_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 3a60a15d1b4..c12db66f24c 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -522,7 +522,7 @@ int rds_iw_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP); + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 4195a053982..f8760e1b668 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -158,7 +158,8 @@ static int rds_rdma_listen_init(void) struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); printk(KERN_ERR "RDS/RDMA: failed to setup listener, " diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index b1721d71c27..b4c680900d7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -251,9 +251,8 @@ static void dev_watchdog(unsigned long arg) } if (some_queue_timedout) { - char drivername[64]; WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n", - dev->name, netdev_drivername(dev, drivername, 64), i); + dev->name, netdev_drivername(dev), i); dev->netdev_ops->ndo_tx_timeout(dev); } if (!mod_timer(&dev->watchdog_timer, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 525f97c467e..4a62888f2e4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -444,15 +444,7 @@ void sctp_association_free(struct sctp_association *asoc) asoc->peer.transport_count = 0; - /* Free any cached ASCONF_ACK chunk. */ - sctp_assoc_free_asconf_acks(asoc); - - /* Free the ASCONF queue. */ - sctp_assoc_free_asconf_queue(asoc); - - /* Free any cached ASCONF chunk. */ - if (asoc->addip_last_asconf) - sctp_chunk_free(asoc->addip_last_asconf); + sctp_asconf_queue_teardown(asoc); /* AUTH - Free the endpoint shared keys */ sctp_auth_destroy_keys(&asoc->endpoint_shared_keys); @@ -1646,3 +1638,16 @@ struct sctp_chunk *sctp_assoc_lookup_asconf_ack( return NULL; } + +void sctp_asconf_queue_teardown(struct sctp_association *asoc) +{ + /* Free any cached ASCONF_ACK chunk. */ + sctp_assoc_free_asconf_acks(asoc); + + /* Free the ASCONF queue. */ + sctp_assoc_free_asconf_queue(asoc); + + /* Free any cached ASCONF chunk. */ + if (asoc->addip_last_asconf) + sctp_chunk_free(asoc->addip_last_asconf); +} diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 1c88c8911dc..d03682109b7 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1582,6 +1582,8 @@ static void sctp_check_transmitted(struct sctp_outq *q, #endif /* SCTP_DEBUG */ if (transport) { if (bytes_acked) { + struct sctp_association *asoc = transport->asoc; + /* We may have counted DATA that was migrated * to this transport due to DEL-IP operation. * Subtract those bytes, since the were never @@ -1600,6 +1602,17 @@ static void sctp_check_transmitted(struct sctp_outq *q, transport->error_count = 0; transport->asoc->overall_error_count = 0; + /* + * While in SHUTDOWN PENDING, we may have started + * the T5 shutdown guard timer after reaching the + * retransmission limit. Stop that timer as soon + * as the receiver acknowledged any data. + */ + if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && + del_timer(&asoc->timers + [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) + sctp_association_put(asoc); + /* Mark the destination transport address as * active if it is not so marked. */ @@ -1629,10 +1642,15 @@ static void sctp_check_transmitted(struct sctp_outq *q, * A sender is doing zero window probing when the * receiver's advertised window is zero, and there is * only one data chunk in flight to the receiver. + * + * Allow the association to timeout while in SHUTDOWN + * PENDING or SHUTDOWN RECEIVED in case the receiver + * stays in zero window mode forever. */ if (!q->asoc->peer.rwnd && !list_empty(&tlist) && - (sack_ctsn+2 == q->asoc->next_tsn)) { + (sack_ctsn+2 == q->asoc->next_tsn) && + q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) { SCTP_DEBUG_PRINTK("%s: SACK received for zero " "window probe: %u\n", __func__, sack_ctsn); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 67380a29e2e..207175b2f40 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1058,7 +1058,6 @@ SCTP_STATIC __init int sctp_init(void) int status = -EINVAL; unsigned long goal; unsigned long limit; - unsigned long nr_pages; int max_share; int order; @@ -1148,15 +1147,7 @@ SCTP_STATIC __init int sctp_init(void) /* Initialize handle used for association ids. */ idr_init(&sctp_assocs_id); - /* Set the pressure threshold to be a fraction of global memory that - * is up to 1/2 at 256 MB, decreasing toward zero with the amount of - * memory, with a floor of 128 pages. - * Note this initializes the data in sctpv6_prot too - * Unabashedly stolen from tcp_init - */ - nr_pages = totalram_pages - totalhigh_pages; - limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_sctp_mem[0] = limit / 4 * 3; sysctl_sctp_mem[1] = limit; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index d612ca1ca6c..6e0f88295aa 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -670,10 +670,19 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the * HEARTBEAT should clear the error counter of the destination * transport address to which the HEARTBEAT was sent. - * The association's overall error count is also cleared. */ t->error_count = 0; - t->asoc->overall_error_count = 0; + + /* + * Although RFC4960 specifies that the overall error count must + * be cleared when a HEARTBEAT ACK is received, we make an + * exception while in SHUTDOWN PENDING. If the peer keeps its + * window shut forever, we may never be able to transmit our + * outstanding data and rely on the retransmission limit be reached + * to shutdown the association. + */ + if (t->asoc->state != SCTP_STATE_SHUTDOWN_PENDING) + t->asoc->overall_error_count = 0; /* Clear the hb_sent flag to signal that we had a good * acknowledgement. @@ -1437,6 +1446,13 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_cmd_setup_t2(commands, asoc, cmd->obj.ptr); break; + case SCTP_CMD_TIMER_START_ONCE: + timer = &asoc->timers[cmd->obj.to]; + + if (timer_pending(timer)) + break; + /* fall through */ + case SCTP_CMD_TIMER_START: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; @@ -1670,6 +1686,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_SEND_NEXT_ASCONF: sctp_cmd_send_asconf(asoc); break; + case SCTP_CMD_PURGE_ASCONF_QUEUE: + sctp_asconf_queue_teardown(asoc); + break; default: pr_warn("Impossible command: %u, %p\n", cmd->verb, cmd->obj.ptr); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7f4a4f8368e..246117142b5 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1718,11 +1718,21 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(const struct sctp_endpoint *ep, return SCTP_DISPOSITION_CONSUME; } - /* For now, fail any unsent/unacked data. Consider the optional - * choice of resending of this data. + /* For now, stop pending T3-rtx and SACK timers, fail any unsent/unacked + * data. Consider the optional choice of resending of this data. */ + sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL()); + /* Stop pending T4-rto timer, teardown ASCONF queue, ASCONF-ACK queue + * and ASCONF-ACK cache. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL()); + repl = sctp_make_cookie_ack(new_asoc, chunk); if (!repl) goto nomem; @@ -5144,7 +5154,7 @@ sctp_disposition_t sctp_sf_do_9_2_start_shutdown( * The sender of the SHUTDOWN MAY also start an overall guard timer * 'T5-shutdown-guard' to bound the overall time for shutdown sequence. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); if (asoc->autoclose) @@ -5289,14 +5299,28 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(const struct sctp_endpoint *ep, SCTP_INC_STATS(SCTP_MIB_T3_RTX_EXPIREDS); if (asoc->overall_error_count >= asoc->max_retrans) { - sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, - SCTP_ERROR(ETIMEDOUT)); - /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ - sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, - SCTP_PERR(SCTP_ERROR_NO_ERROR)); - SCTP_INC_STATS(SCTP_MIB_ABORTEDS); - SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); - return SCTP_DISPOSITION_DELETE_TCB; + if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) { + /* + * We are here likely because the receiver had its rwnd + * closed for a while and we have not been able to + * transmit the locally queued data within the maximum + * retransmission attempts limit. Start the T5 + * shutdown guard timer to give the receiver one last + * chance and some additional time to recover before + * aborting. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START_ONCE, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, + SCTP_ERROR(ETIMEDOUT)); + /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } } /* E1) For the destination address for which the timer diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 0338dc6fdc9..7c211a7f90f 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -827,7 +827,7 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_ /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ - TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ + TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6766913a53e..08c6238802d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2073,10 +2073,33 @@ static int sctp_setsockopt_disable_fragments(struct sock *sk, static int sctp_setsockopt_events(struct sock *sk, char __user *optval, unsigned int optlen) { + struct sctp_association *asoc; + struct sctp_ulpevent *event; + if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) return -EFAULT; + + /* + * At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, + * if there is no data to be sent or retransmit, the stack will + * immediately send up this notification. + */ + if (sctp_ulpevent_type_enabled(SCTP_SENDER_DRY_EVENT, + &sctp_sk(sk)->subscribe)) { + asoc = sctp_id2assoc(sk, 0); + + if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { + event = sctp_ulpevent_make_sender_dry_event(asoc, + GFP_ATOMIC); + if (!event) + return -ENOMEM; + + sctp_ulpq_tail_event(&asoc->ulpq, event); + } + } + return 0; } diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 67e31276682..cd6e4aa19db 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -326,10 +326,12 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Run memory cache shrinker. */ static int -rpcauth_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +rpcauth_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(free); int res; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return (nr_to_scan == 0) ? 0 : -1; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 339ba64cce1..5daf6cc4fae 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -577,13 +577,13 @@ retry: } inode = &gss_msg->inode->vfs_inode; for (;;) { - prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_KILLABLE); spin_lock(&inode->i_lock); if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { break; } spin_unlock(&inode->i_lock); - if (signalled()) { + if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_intr; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 0a9a2ec2e46..c3b75333b82 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -43,6 +43,7 @@ #include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/xdr.h> #include <linux/crypto.h> +#include <linux/sunrpc/gss_krb5_enctypes.h> #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -750,7 +751,7 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = "18,17,16,23,3,1,2", + .gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES, }; static int __init init_kerberos_module(void) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8d83f9d4871..8c9141583d6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -13,10 +13,6 @@ * and need to be refreshed, or when a packet was damaged in transit. * This may be have to be moved to the VFS layer. * - * NB: BSD uses a more intelligent approach to guessing when a request - * or reply has been lost by keeping the RTO estimate for each procedure. - * We currently make do with a constant timeout value. - * * Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com> * Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de> */ @@ -32,7 +28,9 @@ #include <linux/slab.h> #include <linux/utsname.h> #include <linux/workqueue.h> +#include <linux/in.h> #include <linux/in6.h> +#include <linux/un.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/rpc_pipe_fs.h> @@ -298,22 +296,27 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) * up a string representation of the passed-in address. */ if (args->servername == NULL) { + struct sockaddr_un *sun = + (struct sockaddr_un *)args->address; + struct sockaddr_in *sin = + (struct sockaddr_in *)args->address; + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)args->address; + servername[0] = '\0'; switch (args->address->sa_family) { - case AF_INET: { - struct sockaddr_in *sin = - (struct sockaddr_in *)args->address; + case AF_LOCAL: + snprintf(servername, sizeof(servername), "%s", + sun->sun_path); + break; + case AF_INET: snprintf(servername, sizeof(servername), "%pI4", &sin->sin_addr.s_addr); break; - } - case AF_INET6: { - struct sockaddr_in6 *sin = - (struct sockaddr_in6 *)args->address; + case AF_INET6: snprintf(servername, sizeof(servername), "%pI6", - &sin->sin6_addr); + &sin6->sin6_addr); break; - } default: /* caller wants default server name, but * address family isn't recognized. */ @@ -1058,7 +1061,7 @@ call_allocate(struct rpc_task *task) dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); - if (RPC_IS_ASYNC(task) || !signalled()) { + if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) { task->tk_action = call_allocate; rpc_delay(task, HZ>>4); return; @@ -1172,6 +1175,9 @@ call_bind_status(struct rpc_task *task) status = -EOPNOTSUPP; break; } + if (task->tk_rebind_retry == 0) + break; + task->tk_rebind_retry--; rpc_delay(task, 3*HZ); goto retry_timeout; case -ETIMEDOUT: diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c652e4cc9fe..9a80a922c52 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -16,6 +16,7 @@ #include <linux/types.h> #include <linux/socket.h> +#include <linux/un.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/kernel.h> @@ -32,6 +33,8 @@ # define RPCDBG_FACILITY RPCDBG_BIND #endif +#define RPCBIND_SOCK_PATHNAME "/var/run/rpcbind.sock" + #define RPCBIND_PROGRAM (100000u) #define RPCBIND_PORT (111u) @@ -158,20 +161,69 @@ static void rpcb_map_release(void *data) kfree(map); } -static const struct sockaddr_in rpcb_inaddr_loopback = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - .sin_port = htons(RPCBIND_PORT), -}; +/* + * Returns zero on success, otherwise a negative errno value + * is returned. + */ +static int rpcb_create_local_unix(void) +{ + static const struct sockaddr_un rpcb_localaddr_rpcbind = { + .sun_family = AF_LOCAL, + .sun_path = RPCBIND_SOCK_PATHNAME, + }; + struct rpc_create_args args = { + .net = &init_net, + .protocol = XPRT_TRANSPORT_LOCAL, + .address = (struct sockaddr *)&rpcb_localaddr_rpcbind, + .addrsize = sizeof(rpcb_localaddr_rpcbind), + .servername = "localhost", + .program = &rpcb_program, + .version = RPCBVERS_2, + .authflavor = RPC_AUTH_NULL, + }; + struct rpc_clnt *clnt, *clnt4; + int result = 0; + + /* + * Because we requested an RPC PING at transport creation time, + * this works only if the user space portmapper is rpcbind, and + * it's listening on AF_LOCAL on the named socket. + */ + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create AF_LOCAL rpcbind " + "client (errno %ld).\n", PTR_ERR(clnt)); + result = -PTR_ERR(clnt); + goto out; + } + + clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); + if (IS_ERR(clnt4)) { + dprintk("RPC: failed to bind second program to " + "rpcbind v4 client (errno %ld).\n", + PTR_ERR(clnt4)); + clnt4 = NULL; + } + + /* Protected by rpcb_create_local_mutex */ + rpcb_local_clnt = clnt; + rpcb_local_clnt4 = clnt4; -static DEFINE_MUTEX(rpcb_create_local_mutex); +out: + return result; +} /* * Returns zero on success, otherwise a negative errno value * is returned. */ -static int rpcb_create_local(void) +static int rpcb_create_local_net(void) { + static const struct sockaddr_in rpcb_inaddr_loopback = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + .sin_port = htons(RPCBIND_PORT), + }; struct rpc_create_args args = { .net = &init_net, .protocol = XPRT_TRANSPORT_TCP, @@ -186,13 +238,6 @@ static int rpcb_create_local(void) struct rpc_clnt *clnt, *clnt4; int result = 0; - if (rpcb_local_clnt) - return result; - - mutex_lock(&rpcb_create_local_mutex); - if (rpcb_local_clnt) - goto out; - clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("RPC: failed to create local rpcbind " @@ -214,10 +259,34 @@ static int rpcb_create_local(void) clnt4 = NULL; } + /* Protected by rpcb_create_local_mutex */ rpcb_local_clnt = clnt; rpcb_local_clnt4 = clnt4; out: + return result; +} + +/* + * Returns zero on success, otherwise a negative errno value + * is returned. + */ +static int rpcb_create_local(void) +{ + static DEFINE_MUTEX(rpcb_create_local_mutex); + int result = 0; + + if (rpcb_local_clnt) + return result; + + mutex_lock(&rpcb_create_local_mutex); + if (rpcb_local_clnt) + goto out; + + if (rpcb_create_local_unix() != 0) + result = rpcb_create_local_net(); + +out: mutex_unlock(&rpcb_create_local_mutex); return result; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6b43ee7221d..a27406b1654 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -792,6 +792,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Initialize retry counters */ task->tk_garb_retry = 2; task->tk_cred_retry = 2; + task->tk_rebind_retry = 2; task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW; task->tk_owner = current->tgid; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 08e05a8ce02..2b90292e950 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -942,6 +942,8 @@ static void svc_unregister(const struct svc_serv *serv) if (progp->pg_vers[i]->vs_hidden) continue; + dprintk("svc: attempting to unregister %sv%u\n", + progp->pg_name, i); __svc_unregister(progp->pg_prog, i, progp->pg_name); } } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b7d435c3f19..af04f779ce9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -387,6 +387,33 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, return len; } +static int svc_partial_recvfrom(struct svc_rqst *rqstp, + struct kvec *iov, int nr, + int buflen, unsigned int base) +{ + size_t save_iovlen; + void __user *save_iovbase; + unsigned int i; + int ret; + + if (base == 0) + return svc_recvfrom(rqstp, iov, nr, buflen); + + for (i = 0; i < nr; i++) { + if (iov[i].iov_len > base) + break; + base -= iov[i].iov_len; + } + save_iovlen = iov[i].iov_len; + save_iovbase = iov[i].iov_base; + iov[i].iov_len -= base; + iov[i].iov_base += base; + ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen); + iov[i].iov_len = save_iovlen; + iov[i].iov_base = save_iovbase; + return ret; +} + /* * Set socket snd and rcv buffer lengths */ @@ -409,7 +436,6 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, lock_sock(sock->sk); sock->sk->sk_sndbuf = snd * 2; sock->sk->sk_rcvbuf = rcv * 2; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; sock->sk->sk_write_space(sock->sk); release_sock(sock->sk); #endif @@ -884,6 +910,56 @@ failed: return NULL; } +static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + return 0; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (rqstp->rq_pages[i] != NULL) + put_page(rqstp->rq_pages[i]); + BUG_ON(svsk->sk_pages[i] == NULL); + rqstp->rq_pages[i] = svsk->sk_pages[i]; + svsk->sk_pages[i] = NULL; + } + rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]); + return len; +} + +static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + return; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + svsk->sk_pages[i] = rqstp->rq_pages[i]; + rqstp->rq_pages[i] = NULL; + } +} + +static void svc_tcp_clear_pages(struct svc_sock *svsk) +{ + unsigned int i, len, npages; + + if (svsk->sk_tcplen <= sizeof(rpc_fraghdr)) + goto out; + len = svsk->sk_tcplen - sizeof(rpc_fraghdr); + npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + BUG_ON(svsk->sk_pages[i] == NULL); + put_page(svsk->sk_pages[i]); + svsk->sk_pages[i] = NULL; + } +out: + svsk->sk_tcplen = 0; +} + /* * Receive data. * If we haven't gotten the record length yet, get the next four bytes. @@ -893,31 +969,15 @@ failed: static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) { struct svc_serv *serv = svsk->sk_xprt.xpt_server; + unsigned int want; int len; - if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) - /* sndbuf needs to have room for one request - * per thread, otherwise we can stall even when the - * network isn't a bottleneck. - * - * We count all threads rather than threads in a - * particular pool, which provides an upper bound - * on the number of threads which will access the socket. - * - * rcvbuf just needs to be able to hold a few requests. - * Normally they will be removed from the queue - * as soon a a complete request arrives. - */ - svc_sock_setbufsize(svsk->sk_sock, - (serv->sv_nrthreads+3) * serv->sv_max_mesg, - 3 * serv->sv_max_mesg); - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (svsk->sk_tcplen < sizeof(rpc_fraghdr)) { - int want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; struct kvec iov; + want = sizeof(rpc_fraghdr) - svsk->sk_tcplen; iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; iov.iov_len = want; if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) @@ -927,7 +987,7 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) if (len < want) { dprintk("svc: short recvfrom while reading record " "length (%d of %d)\n", len, want); - goto err_again; /* record header not complete */ + return -EAGAIN; } svsk->sk_reclen = ntohl(svsk->sk_reclen); @@ -954,83 +1014,75 @@ static int svc_tcp_recv_record(struct svc_sock *svsk, struct svc_rqst *rqstp) } } - /* Check whether enough data is available */ - len = svc_recv_available(svsk); - if (len < 0) - goto error; + if (svsk->sk_reclen < 8) + goto err_delete; /* client is nuts. */ - if (len < svsk->sk_reclen) { - dprintk("svc: incomplete TCP record (%d of %d)\n", - len, svsk->sk_reclen); - goto err_again; /* record not complete */ - } len = svsk->sk_reclen; - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); return len; - error: - if (len == -EAGAIN) - dprintk("RPC: TCP recv_record got EAGAIN\n"); +error: + dprintk("RPC: TCP recv_record got %d\n", len); return len; - err_delete: +err_delete: set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - err_again: return -EAGAIN; } -static int svc_process_calldir(struct svc_sock *svsk, struct svc_rqst *rqstp, - struct rpc_rqst **reqpp, struct kvec *vec) +static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) { + struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt; struct rpc_rqst *req = NULL; - u32 *p; - u32 xid; - u32 calldir; - int len; - - len = svc_recvfrom(rqstp, vec, 1, 8); - if (len < 0) - goto error; + struct kvec *src, *dst; + __be32 *p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + __be32 xid; + __be32 calldir; - p = (u32 *)rqstp->rq_arg.head[0].iov_base; xid = *p++; calldir = *p; - if (calldir == 0) { - /* REQUEST is the most common case */ - vec[0] = rqstp->rq_arg.head[0]; - } else { - /* REPLY */ - struct rpc_xprt *bc_xprt = svsk->sk_xprt.xpt_bc_xprt; - - if (bc_xprt) - req = xprt_lookup_rqst(bc_xprt, xid); - - if (!req) { - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, xid); - vec[0] = rqstp->rq_arg.head[0]; - goto out; - } + if (bc_xprt) + req = xprt_lookup_rqst(bc_xprt, xid); - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, - sizeof(struct xdr_buf)); - /* copy the xid and call direction */ - memcpy(req->rq_private_buf.head[0].iov_base, - rqstp->rq_arg.head[0].iov_base, 8); - vec[0] = req->rq_private_buf.head[0]; + if (!req) { + printk(KERN_NOTICE + "%s: Got unrecognized reply: " + "calldir 0x%x xpt_bc_xprt %p xid %08x\n", + __func__, ntohl(calldir), + bc_xprt, xid); + return -EAGAIN; } - out: - vec[0].iov_base += 8; - vec[0].iov_len -= 8; - len = svsk->sk_reclen - 8; - error: - *reqpp = req; - return len; + + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + /* + * XXX!: cheating for now! Only copying HEAD. + * But we know this is good enough for now (in fact, for any + * callback reply in the forseeable future). + */ + dst = &req->rq_private_buf.head[0]; + src = &rqstp->rq_arg.head[0]; + if (dst->iov_len < src->iov_len) + return -EAGAIN; /* whatever; just giving up. */ + memcpy(dst->iov_base, src->iov_base, src->iov_len); + xprt_complete_rqst(req->rq_task, svsk->sk_reclen); + rqstp->rq_arg.len = 0; + return 0; } +static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) +{ + int i = 0; + int t = 0; + + while (t < len) { + vec[i].iov_base = page_address(pages[i]); + vec[i].iov_len = PAGE_SIZE; + i++; + t += PAGE_SIZE; + } + return i; +} + + /* * Receive data from a TCP socket. */ @@ -1041,8 +1093,10 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; - int pnum, vlen; - struct rpc_rqst *req = NULL; + unsigned int want, base; + __be32 *p; + __be32 calldir; + int pnum; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), @@ -1053,87 +1107,73 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < 0) goto error; + base = svc_tcp_restore_pages(svsk, rqstp); + want = svsk->sk_reclen - base; + vec = rqstp->rq_vec; - vec[0] = rqstp->rq_arg.head[0]; - vlen = PAGE_SIZE; - /* - * We have enough data for the whole tcp record. Let's try and read the - * first 8 bytes to get the xid and the call direction. We can use this - * to figure out if this is a call or a reply to a callback. If - * sk_reclen is < 8 (xid and calldir), then this is a malformed packet. - * In that case, don't bother with the calldir and just read the data. - * It will be rejected in svc_process. - */ - if (len >= 8) { - len = svc_process_calldir(svsk, rqstp, &req, vec); - if (len < 0) - goto err_again; - vlen -= 8; - } + pnum = copy_pages_to_kvecs(&vec[0], &rqstp->rq_pages[0], + svsk->sk_reclen); - pnum = 1; - while (vlen < len) { - vec[pnum].iov_base = (req) ? - page_address(req->rq_private_buf.pages[pnum - 1]) : - page_address(rqstp->rq_pages[pnum]); - vec[pnum].iov_len = PAGE_SIZE; - pnum++; - vlen += PAGE_SIZE; - } rqstp->rq_respages = &rqstp->rq_pages[pnum]; /* Now receive data */ - len = svc_recvfrom(rqstp, vec, pnum, len); - if (len < 0) - goto err_again; - - /* - * Account for the 8 bytes we read earlier - */ - len += 8; - - if (req) { - xprt_complete_rqst(req->rq_task, len); - len = 0; - goto out; + len = svc_partial_recvfrom(rqstp, vec, pnum, want, base); + if (len >= 0) + svsk->sk_tcplen += len; + if (len != want) { + if (len < 0 && len != -EAGAIN) + goto err_other; + svc_tcp_save_pages(svsk, rqstp); + dprintk("svc: incomplete TCP record (%d of %d)\n", + svsk->sk_tcplen, svsk->sk_reclen); + goto err_noclose; } - dprintk("svc: TCP complete record (%d bytes)\n", len); - rqstp->rq_arg.len = len; + + rqstp->rq_arg.len = svsk->sk_reclen; rqstp->rq_arg.page_base = 0; - if (len <= rqstp->rq_arg.head[0].iov_len) { - rqstp->rq_arg.head[0].iov_len = len; + if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; rqstp->rq_arg.page_len = 0; - } else { - rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; - } + } else + rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; -out: + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + calldir = p[1]; + if (calldir) + len = receive_cb_reply(svsk, rqstp); + /* Reset TCP read info */ svsk->sk_reclen = 0; svsk->sk_tcplen = 0; + /* If we have more data, signal svc_xprt_enqueue() to try again */ + if (svc_recv_available(svsk) > sizeof(rpc_fraghdr)) + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); + + if (len < 0) + goto error; svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpcnt++; - return len; + dprintk("svc: TCP complete record (%d bytes)\n", rqstp->rq_arg.len); + return rqstp->rq_arg.len; -err_again: - if (len == -EAGAIN) { - dprintk("RPC: TCP recvfrom got EAGAIN\n"); - return len; - } error: - if (len != -EAGAIN) { - printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_xprt.xpt_server->sv_name, -len); - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - } + if (len != -EAGAIN) + goto err_other; + dprintk("RPC: TCP recvfrom got EAGAIN\n"); return -EAGAIN; +err_other: + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_xprt.xpt_server->sv_name, -len); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); +err_noclose: + return -EAGAIN; /* record not complete */ } /* @@ -1304,18 +1344,10 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svsk->sk_reclen = 0; svsk->sk_tcplen = 0; + memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; - /* initialise setting must have enough space to - * receive and respond to one request. - * svc_tcp_recvfrom will re-adjust if necessary - */ - svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - - set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1379,8 +1411,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else + else { + /* initialise setting must have enough space to + * receive and respond to one request. + */ + svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, + 4 * serv->sv_max_mesg); svc_tcp_init(svsk, serv); + } dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1562,8 +1600,10 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) svc_sock_detach(xprt); - if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) + if (!test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + svc_tcp_clear_pages(svsk); kernel_sock_shutdown(svsk->sk_sock, SHUT_RDWR); + } } /* diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 679cd674b81..f008c14ad34 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -638,6 +638,25 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) } EXPORT_SYMBOL_GPL(xdr_init_decode); +/** + * xdr_init_decode - Initialize an xdr_stream for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer from which to decode data + * @pages: list of pages to decode into + * @len: length in bytes of buffer in pages + */ +void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, unsigned int len) +{ + memset(buf, 0, sizeof(*buf)); + buf->pages = pages; + buf->page_len = len; + buf->buflen = len; + buf->len = len; + xdr_init_decode(xdr, buf, NULL); +} +EXPORT_SYMBOL_GPL(xdr_init_decode_pages); + static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) { __be32 *p = xdr->p; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6c014dd3a20..c3c232a88d9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -695,7 +695,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(-ENOMEM); xprt = &cma_xprt->sc_xprt; - listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, + IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); dprintk("svcrdma: rdma_create_id failed = %d\n", ret); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d4297dc43dc..80f8da344df 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -387,7 +387,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, init_completion(&ia->ri_done); - id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); dprintk("RPC: %s: rdma_create_id() failed %i\n", diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bf005d3c65e..72abb735893 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -19,6 +19,7 @@ */ #include <linux/types.h> +#include <linux/string.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/capability.h> @@ -28,6 +29,7 @@ #include <linux/in.h> #include <linux/net.h> #include <linux/mm.h> +#include <linux/un.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/sunrpc/clnt.h> @@ -45,6 +47,9 @@ #include <net/tcp.h> #include "sunrpc.h" + +static void xs_close(struct rpc_xprt *xprt); + /* * xprtsock tunables */ @@ -261,6 +266,11 @@ static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) return (struct sockaddr *) &xprt->addr; } +static inline struct sockaddr_un *xs_addr_un(struct rpc_xprt *xprt) +{ + return (struct sockaddr_un *) &xprt->addr; +} + static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt) { return (struct sockaddr_in *) &xprt->addr; @@ -276,23 +286,34 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) struct sockaddr *sap = xs_addr(xprt); struct sockaddr_in6 *sin6; struct sockaddr_in *sin; + struct sockaddr_un *sun; char buf[128]; - (void)rpc_ntop(sap, buf, sizeof(buf)); - xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); - switch (sap->sa_family) { + case AF_LOCAL: + sun = xs_addr_un(xprt); + strlcpy(buf, sun->sun_path, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = + kstrdup(buf, GFP_KERNEL); + break; case AF_INET: + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = + kstrdup(buf, GFP_KERNEL); sin = xs_addr_in(xprt); snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); break; case AF_INET6: + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = + kstrdup(buf, GFP_KERNEL); sin6 = xs_addr_in6(xprt); snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); break; default: BUG(); } + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); } @@ -495,6 +516,70 @@ static int xs_nospace(struct rpc_task *task) return ret; } +/* + * Construct a stream transport record marker in @buf. + */ +static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | reclen); +} + +/** + * xs_local_send_request - write an RPC request to an AF_LOCAL socket + * @task: RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent + */ +static int xs_local_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct sock_xprt *transport = + container_of(xprt, struct sock_xprt, xprt); + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; + + xs_encode_stream_record_marker(&req->rq_snd_buf); + + xs_pktdump("packet data:", + req->rq_svec->iov_base, req->rq_svec->iov_len); + + status = xs_sendpages(transport->sock, NULL, 0, + xdr, req->rq_bytes_sent); + dprintk("RPC: %s(%u) = %d\n", + __func__, xdr->len - req->rq_bytes_sent, status); + if (likely(status >= 0)) { + req->rq_bytes_sent += status; + req->rq_xmit_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; + } + status = -EAGAIN; + } + + switch (status) { + case -EAGAIN: + status = xs_nospace(task); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + case -EPIPE: + xs_close(xprt); + status = -ENOTCONN; + } + + return status; +} + /** * xs_udp_send_request - write an RPC request to a UDP socket * @task: address of RPC task that manages the state of an RPC request @@ -574,13 +659,6 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) kernel_sock_shutdown(sock, SHUT_WR); } -static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) -{ - u32 reclen = buf->len - sizeof(rpc_fraghdr); - rpc_fraghdr *base = buf->head[0].iov_base; - *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); -} - /** * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request @@ -603,7 +681,7 @@ static int xs_tcp_send_request(struct rpc_task *task) struct xdr_buf *xdr = &req->rq_snd_buf; int status; - xs_encode_tcp_record_marker(&req->rq_snd_buf); + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, @@ -785,6 +863,88 @@ static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) return (struct rpc_xprt *) sk->sk_user_data; } +static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + struct xdr_skb_reader desc = { + .skb = skb, + .offset = sizeof(rpc_fraghdr), + .count = skb->len - sizeof(rpc_fraghdr), + }; + + if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) + return -1; + if (desc.count) + return -1; + return 0; +} + +/** + * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets + * @sk: socket with data to read + * @len: how much data to read + * + * Currently this assumes we can read the whole reply in a single gulp. + */ +static void xs_local_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid; + __be32 *xp; + + read_lock_bh(&sk->sk_callback_lock); + dprintk("RPC: %s...\n", __func__); + xprt = xprt_from_sock(sk); + if (xprt == NULL) + goto out; + + skb = skb_recv_datagram(sk, 0, 1, &err); + if (skb == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(rpc_fraghdr); + if (repsize < 4) { + dprintk("RPC: impossible RPC reply size %d\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->transport_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + copied = rovr->rq_private_buf.buflen; + if (copied > repsize) + copied = repsize; + + if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { + dprintk("RPC: sk_buff copy failed\n"); + goto out_unlock; + } + + xprt_complete_rqst(task, copied); + + out_unlock: + spin_unlock(&xprt->transport_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock_bh(&sk->sk_callback_lock); +} + /** * xs_udp_data_ready - "data ready" callback for UDP sockets * @sk: socket with data to read @@ -1344,7 +1504,6 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ xprt_force_disconnect(xprt); - case TCP_SYN_SENT: xprt->connect_cookie++; case TCP_CLOSING: /* @@ -1571,11 +1730,31 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) return err; } +/* + * We don't support autobind on AF_LOCAL sockets + */ +static void xs_local_rpcbind(struct rpc_task *task) +{ + xprt_set_bound(task->tk_xprt); +} + +static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) +{ +} #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key xs_key[2]; static struct lock_class_key xs_slock_key[2]; +static inline void xs_reclassify_socketu(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BUG_ON(sock_owned_by_user(sk)); + sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", + &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]); +} + static inline void xs_reclassify_socket4(struct socket *sock) { struct sock *sk = sock->sk; @@ -1597,6 +1776,9 @@ static inline void xs_reclassify_socket6(struct socket *sock) static inline void xs_reclassify_socket(int family, struct socket *sock) { switch (family) { + case AF_LOCAL: + xs_reclassify_socketu(sock); + break; case AF_INET: xs_reclassify_socket4(sock); break; @@ -1606,6 +1788,10 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } } #else +static inline void xs_reclassify_socketu(struct socket *sock) +{ +} + static inline void xs_reclassify_socket4(struct socket *sock) { } @@ -1644,6 +1830,94 @@ out: return ERR_PTR(err); } +static int xs_local_finish_connecting(struct rpc_xprt *xprt, + struct socket *sock) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, + xprt); + + if (!transport->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + xs_save_old_callbacks(transport, sk); + + sk->sk_user_data = xprt; + sk->sk_data_ready = xs_local_data_ready; + sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; + sk->sk_allocation = GFP_ATOMIC; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + transport->sock = sock; + transport->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ + xprt->stat.connect_count++; + xprt->stat.connect_start = jiffies; + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); +} + +/** + * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint + * @xprt: RPC transport to connect + * @transport: socket transport to connect + * @create_sock: function to create a socket of the correct type + * + * Invoked by a work queue tasklet. + */ +static void xs_local_setup_socket(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, connect_worker.work); + struct rpc_xprt *xprt = &transport->xprt; + struct socket *sock; + int status = -EIO; + + if (xprt->shutdown) + goto out; + + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + status = __sock_create(xprt->xprt_net, AF_LOCAL, + SOCK_STREAM, 0, &sock, 1); + if (status < 0) { + dprintk("RPC: can't create AF_LOCAL " + "transport socket (%d).\n", -status); + goto out; + } + xs_reclassify_socketu(sock); + + dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + + status = xs_local_finish_connecting(xprt, sock); + switch (status) { + case 0: + dprintk("RPC: xprt %p connected to %s\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt_set_connected(xprt); + break; + case -ENOENT: + dprintk("RPC: xprt %p: socket %s does not exist\n", + xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + break; + default: + printk(KERN_ERR "%s: unhandled error (%d) connecting to %s\n", + __func__, -status, + xprt->address_strings[RPC_DISPLAY_ADDR]); + } + +out: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, status); +} + static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1758,6 +2032,7 @@ static void xs_tcp_reuse_connection(struct sock_xprt *transport) static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + int ret = -ENOTCONN; if (!transport->inet) { struct sock *sk = sock->sk; @@ -1789,12 +2064,22 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) } if (!xprt_bound(xprt)) - return -ENOTCONN; + goto out; /* Tell the socket layer to start connecting... */ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + switch (ret) { + case 0: + case -EINPROGRESS: + /* SYN_SENT! */ + xprt->connect_cookie++; + if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + } +out: + return ret; } /** @@ -1917,6 +2202,32 @@ static void xs_connect(struct rpc_task *task) } /** + * xs_local_print_stats - display AF_LOCAL socket-specifc stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, "\txprt:\tlocal %lu %lu %lu %ld %lu %lu %lu " + "%llu %llu\n", + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +/** * xs_udp_print_stats - display UDP socket-specifc stats * @xprt: rpc_xprt struct containing statistics * @seq: output file @@ -2014,10 +2325,7 @@ static int bc_sendto(struct rpc_rqst *req) unsigned long headoff; unsigned long tailoff; - /* - * Set up the rpc header and record marker stuff - */ - xs_encode_tcp_record_marker(xbufp); + xs_encode_stream_record_marker(xbufp); tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK; headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK; @@ -2089,6 +2397,21 @@ static void bc_destroy(struct rpc_xprt *xprt) { } +static struct rpc_xprt_ops xs_local_ops = { + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xs_tcp_release_xprt, + .rpcbind = xs_local_rpcbind, + .set_port = xs_local_set_port, + .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_local_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xs_close, + .destroy = xs_destroy, + .print_stats = xs_local_print_stats, +}; + static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, @@ -2150,6 +2473,8 @@ static int xs_init_anyaddr(const int family, struct sockaddr *sap) }; switch (family) { + case AF_LOCAL: + break; case AF_INET: memcpy(sap, &sin, sizeof(sin)); break; @@ -2197,6 +2522,70 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, return xprt; } +static const struct rpc_timeout xs_local_default_timeout = { + .to_initval = 10 * HZ, + .to_maxval = 10 * HZ, + .to_retries = 2, +}; + +/** + * xs_setup_local - Set up transport to use an AF_LOCAL socket + * @args: rpc transport creation arguments + * + * AF_LOCAL is a "tpi_cots_ord" transport, just like TCP + */ +static struct rpc_xprt *xs_setup_local(struct xprt_create *args) +{ + struct sockaddr_un *sun = (struct sockaddr_un *)args->dstaddr; + struct sock_xprt *transport; + struct rpc_xprt *xprt; + struct rpc_xprt *ret; + + xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; + transport = container_of(xprt, struct sock_xprt, xprt); + + xprt->prot = 0; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; + + xprt->bind_timeout = XS_BIND_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; + + xprt->ops = &xs_local_ops; + xprt->timeout = &xs_local_default_timeout; + + switch (sun->sun_family) { + case AF_LOCAL: + if (sun->sun_path[0] != '/') { + dprintk("RPC: bad AF_LOCAL address: %s\n", + sun->sun_path); + ret = ERR_PTR(-EINVAL); + goto out_err; + } + xprt_set_bound(xprt); + INIT_DELAYED_WORK(&transport->connect_worker, + xs_local_setup_socket); + xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); + break; + default: + ret = ERR_PTR(-EAFNOSUPPORT); + goto out_err; + } + + dprintk("RPC: set up xprt to %s via AF_LOCAL\n", + xprt->address_strings[RPC_DISPLAY_ADDR]); + + if (try_module_get(THIS_MODULE)) + return xprt; + ret = ERR_PTR(-EINVAL); +out_err: + xprt_free(xprt); + return ret; +} + static const struct rpc_timeout xs_udp_default_timeout = { .to_initval = 5 * HZ, .to_maxval = 30 * HZ, @@ -2438,6 +2827,14 @@ out_err: return ret; } +static struct xprt_class xs_local_transport = { + .list = LIST_HEAD_INIT(xs_local_transport.list), + .name = "named UNIX socket", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_LOCAL, + .setup = xs_setup_local, +}; + static struct xprt_class xs_udp_transport = { .list = LIST_HEAD_INIT(xs_udp_transport.list), .name = "udp", @@ -2473,6 +2870,7 @@ int init_socket_xprt(void) sunrpc_table_header = register_sysctl_table(sunrpc_table); #endif + xprt_register_transport(&xs_local_transport); xprt_register_transport(&xs_udp_transport); xprt_register_transport(&xs_tcp_transport); xprt_register_transport(&xs_bc_tcp_transport); @@ -2493,6 +2891,7 @@ void cleanup_socket_xprt(void) } #endif + xprt_unregister_transport(&xs_local_transport); xprt_unregister_transport(&xs_udp_transport); xprt_unregister_transport(&xs_tcp_transport); xprt_unregister_transport(&xs_bc_tcp_transport); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9bec2e8a838..5ce74a38552 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -50,7 +50,7 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); -static int xfrm_bundle_ok(struct xfrm_dst *xdst, int family); +static int xfrm_bundle_ok(struct xfrm_dst *xdst); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, @@ -2241,7 +2241,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) static int stale_bundle(struct dst_entry *dst) { - return !xfrm_bundle_ok((struct xfrm_dst *)dst, AF_UNSPEC); + return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) @@ -2313,7 +2313,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst) * still valid. */ -static int xfrm_bundle_ok(struct xfrm_dst *first, int family) +static int xfrm_bundle_ok(struct xfrm_dst *first) { struct dst_entry *dst = &first->u.dst; struct xfrm_dst *last; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 47f1b8638df..b11ea692bd7 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -265,7 +265,7 @@ static void xfrm_replay_advance_bmp(struct xfrm_state *x, __be32 net_seq) bitnr = bitnr & 0x1F; replay_esn->bmp[nr] |= (1U << bitnr); } else { - nr = replay_esn->replay_window >> 5; + nr = (replay_esn->replay_window - 1) >> 5; for (i = 0; i <= nr; i++) replay_esn->bmp[i] = 0; @@ -471,7 +471,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) bitnr = bitnr & 0x1F; replay_esn->bmp[nr] |= (1U << bitnr); } else { - nr = replay_esn->replay_window >> 5; + nr = (replay_esn->replay_window - 1) >> 5; for (i = 0; i <= nr; i++) replay_esn->bmp[i] = 0; |