diff options
author | wanchao-xu <wanchao.xu@samsung.com> | 2024-02-23 09:56:11 +0800 |
---|---|---|
committer | wanchao-xu <wanchao.xu@samsung.com> | 2024-02-23 09:56:29 +0800 |
commit | d39f72b68d5a576b5cd2178a40e2e4013ae14398 (patch) | |
tree | e5cd61a749edfcd55a69209f0eb34abb66a38e67 /net | |
parent | 0889ee8339e51dfdf78c39a8f15b6e5fe5ab49d5 (diff) | |
parent | e021b4728ed2aea486975a63a342ece1d2178b18 (diff) | |
download | qemu-arm-static-d39f72b68d5a576b5cd2178a40e2e4013ae14398.tar.gz qemu-arm-static-d39f72b68d5a576b5cd2178a40e2e4013ae14398.tar.bz2 qemu-arm-static-d39f72b68d5a576b5cd2178a40e2e4013ae14398.zip |
Merge branch 'sandbox/xuwc/devel-riscv-py2' into devel-py2
Change-Id: I4afa9c5351396b4aa3a45713523b38de970dbe4a
Signed-off-by: wanchao-xu <wanchao.xu@samsung.com>
Diffstat (limited to 'net')
42 files changed, 8682 insertions, 1386 deletions
diff --git a/net/Makefile.objs b/net/Makefile.objs index 4854a14fe..c5d076d19 100644 --- a/net/Makefile.objs +++ b/net/Makefile.objs @@ -2,12 +2,31 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o common-obj-y += socket.o common-obj-y += dump.o common-obj-y += eth.o -common-obj-$(CONFIG_POSIX) += tap.o -common-obj-$(CONFIG_LINUX) += tap-linux.o -common-obj-$(CONFIG_WIN32) += tap-win32.o -common-obj-$(CONFIG_BSD) += tap-bsd.o -common-obj-$(CONFIG_SOLARIS) += tap-solaris.o -common-obj-$(CONFIG_AIX) += tap-aix.o -common-obj-$(CONFIG_HAIKU) += tap-haiku.o +common-obj-y += announce.o +common-obj-$(CONFIG_L2TPV3) += l2tpv3.o +common-obj-$(call land,$(CONFIG_VIRTIO_NET),$(CONFIG_VHOST_NET_USER)) += vhost-user.o +common-obj-$(call land,$(call lnot,$(CONFIG_VIRTIO_NET)),$(CONFIG_VHOST_NET_USER)) += vhost-user-stub.o +common-obj-$(CONFIG_ALL) += vhost-user-stub.o common-obj-$(CONFIG_SLIRP) += slirp.o +slirp.o-cflags := $(SLIRP_CFLAGS) +slirp.o-libs := $(SLIRP_LIBS) common-obj-$(CONFIG_VDE) += vde.o +common-obj-$(CONFIG_NETMAP) += netmap.o +common-obj-y += filter.o +common-obj-y += filter-buffer.o +common-obj-y += filter-mirror.o +common-obj-y += colo-compare.o +common-obj-y += colo.o +common-obj-y += filter-rewriter.o +common-obj-y += filter-replay.o + +tap-obj-$(CONFIG_LINUX) = tap-linux.o +tap-obj-$(CONFIG_BSD) = tap-bsd.o +tap-obj-$(CONFIG_SOLARIS) = tap-solaris.o +tap-obj-y ?= tap-stub.o +common-obj-$(CONFIG_POSIX) += tap.o $(tap-obj-y) +common-obj-$(CONFIG_WIN32) += tap-win32.o + +vde.o-libs = $(VDE_LIBS) + +common-obj-$(CONFIG_CAN_BUS) += can/ diff --git a/net/announce.c b/net/announce.c new file mode 100644 index 000000000..db90d3bd4 --- /dev/null +++ b/net/announce.c @@ -0,0 +1,203 @@ +/* + * Self-announce + * (c) 2017-2019 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "net/announce.h" +#include "net/net.h" +#include "qapi/clone-visitor.h" +#include "qapi/qapi-visit-net.h" +#include "qapi/qapi-commands-net.h" +#include "trace.h" + +static GData *named_timers; + +int64_t qemu_announce_timer_step(AnnounceTimer *timer) +{ + int64_t step; + + step = timer->params.initial + + (timer->params.rounds - timer->round - 1) * + timer->params.step; + + if (step < 0 || step > timer->params.max) { + step = timer->params.max; + } + timer_mod(timer->tm, qemu_clock_get_ms(timer->type) + step); + + return step; +} + +/* + * If 'free_named' is true, then remove the timer from the list + * and free the timer itself. + */ +void qemu_announce_timer_del(AnnounceTimer *timer, bool free_named) +{ + bool free_timer = false; + if (timer->tm) { + timer_del(timer->tm); + timer_free(timer->tm); + timer->tm = NULL; + } + qapi_free_strList(timer->params.interfaces); + timer->params.interfaces = NULL; + if (free_named && timer->params.has_id) { + AnnounceTimer *list_timer; + /* + * Sanity check: There should only be one timer on the list with + * the id. + */ + list_timer = g_datalist_get_data(&named_timers, timer->params.id); + assert(timer == list_timer); + free_timer = true; + g_datalist_remove_data(&named_timers, timer->params.id); + } + trace_qemu_announce_timer_del(free_named, free_timer, timer->params.id); + g_free(timer->params.id); + timer->params.id = NULL; + + if (free_timer) { + g_free(timer); + } +} + +/* + * Under BQL/main thread + * Reset the timer to the given parameters/type/notifier. + */ +void qemu_announce_timer_reset(AnnounceTimer *timer, + AnnounceParameters *params, + QEMUClockType type, + QEMUTimerCB *cb, + void *opaque) +{ + /* + * We're under the BQL, so the current timer can't + * be firing, so we should be able to delete it. 
+ */ + qemu_announce_timer_del(timer, false); + + QAPI_CLONE_MEMBERS(AnnounceParameters, &timer->params, params); + timer->round = params->rounds; + timer->type = type; + timer->tm = timer_new_ms(type, cb, opaque); +} + +#ifndef ETH_P_RARP +#define ETH_P_RARP 0x8035 +#endif +#define ARP_HTYPE_ETH 0x0001 +#define ARP_PTYPE_IP 0x0800 +#define ARP_OP_REQUEST_REV 0x3 + +static int announce_self_create(uint8_t *buf, + uint8_t *mac_addr) +{ + /* Ethernet header. */ + memset(buf, 0xff, 6); /* destination MAC addr */ + memcpy(buf + 6, mac_addr, 6); /* source MAC addr */ + *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */ + + /* RARP header. */ + *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */ + *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */ + *(buf + 18) = 6; /* hardware addr length (ethernet) */ + *(buf + 19) = 4; /* protocol addr length (IPv4) */ + *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */ + memcpy(buf + 22, mac_addr, 6); /* source hw addr */ + memset(buf + 28, 0x00, 4); /* source protocol addr */ + memcpy(buf + 32, mac_addr, 6); /* target hw addr */ + memset(buf + 38, 0x00, 4); /* target protocol addr */ + + /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */ + memset(buf + 42, 0x00, 18); + + return 60; /* len (FCS will be added by hardware) */ +} + +static void qemu_announce_self_iter(NICState *nic, void *opaque) +{ + AnnounceTimer *timer = opaque; + uint8_t buf[60]; + int len; + bool skip; + + if (timer->params.has_interfaces) { + strList *entry = timer->params.interfaces; + /* Skip unless we find our name in the requested list */ + skip = true; + + while (entry) { + if (!strcmp(entry->value, nic->ncs->name)) { + /* Found us */ + skip = false; + break; + } + entry = entry->next; + } + } else { + skip = false; + } + + trace_qemu_announce_self_iter(timer->params.has_id ? 
timer->params.id : "_", + nic->ncs->name, + qemu_ether_ntoa(&nic->conf->macaddr), skip); + + if (!skip) { + len = announce_self_create(buf, nic->conf->macaddr.a); + + qemu_send_packet_raw(qemu_get_queue(nic), buf, len); + + /* if the NIC provides it's own announcement support, use it as well */ + if (nic->ncs->info->announce) { + nic->ncs->info->announce(nic->ncs); + } + } +} +static void qemu_announce_self_once(void *opaque) +{ + AnnounceTimer *timer = (AnnounceTimer *)opaque; + + qemu_foreach_nic(qemu_announce_self_iter, timer); + + if (--timer->round) { + qemu_announce_timer_step(timer); + } else { + qemu_announce_timer_del(timer, true); + } +} + +void qemu_announce_self(AnnounceTimer *timer, AnnounceParameters *params) +{ + qemu_announce_timer_reset(timer, params, QEMU_CLOCK_REALTIME, + qemu_announce_self_once, timer); + if (params->rounds) { + qemu_announce_self_once(timer); + } else { + qemu_announce_timer_del(timer, true); + } +} + +void qmp_announce_self(AnnounceParameters *params, Error **errp) +{ + AnnounceTimer *named_timer; + if (!params->has_id) { + params->id = g_strdup(""); + params->has_id = true; + } + + named_timer = g_datalist_get_data(&named_timers, params->id); + + if (!named_timer) { + named_timer = g_new0(AnnounceTimer, 1); + g_datalist_set_data(&named_timers, params->id, named_timer); + } + + qemu_announce_self(named_timer, params); +} diff --git a/net/can/Makefile.objs b/net/can/Makefile.objs new file mode 100644 index 000000000..9f35dc5c8 --- /dev/null +++ b/net/can/Makefile.objs @@ -0,0 +1,2 @@ +common-obj-y += can_core.o can_host.o +common-obj-$(CONFIG_LINUX) += can_socketcan.o diff --git a/net/can/can_core.c b/net/can/can_core.c new file mode 100644 index 000000000..90f4d8576 --- /dev/null +++ b/net/can/can_core.c @@ -0,0 +1,140 @@ +/* + * CAN common CAN bus emulation support + * + * Copyright (c) 2013-2014 Jin Yang + * Copyright (c) 2014-2018 Pavel Pisa + * + * Initial development supported by Google GSoC 2013 from RTEMS project slot + 
* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "chardev/char.h" +#include "qemu/module.h" +#include "qemu/sockets.h" +#include "qapi/error.h" +#include "net/can_emu.h" +#include "qom/object_interfaces.h" + +struct CanBusState { + Object object; + + QTAILQ_HEAD(, CanBusClientState) clients; +}; + +static void can_bus_instance_init(Object *object) +{ + CanBusState *bus = (CanBusState *)object; + + QTAILQ_INIT(&bus->clients); +} + +int can_bus_insert_client(CanBusState *bus, CanBusClientState *client) +{ + client->bus = bus; + QTAILQ_INSERT_TAIL(&bus->clients, client, next); + return 0; +} + +int can_bus_remove_client(CanBusClientState *client) +{ + CanBusState *bus = client->bus; + if (bus == NULL) { + return 0; + } + + QTAILQ_REMOVE(&bus->clients, client, next); + client->bus = NULL; + return 1; +} + +ssize_t can_bus_client_send(CanBusClientState *client, + const struct qemu_can_frame *frames, size_t frames_cnt) +{ + int ret = 0; + CanBusState *bus = client->bus; + CanBusClientState *peer; + if (bus == NULL) { + return -1; + } + + QTAILQ_FOREACH(peer, &bus->clients, next) { + if (peer->info->can_receive(peer)) { + if (peer == client) { + /* No loopback support for now */ + continue; + } + if (peer->info->receive(peer, frames, frames_cnt) > 0) { + ret = 1; + } + } + } + + return ret; +} + +int can_bus_filter_match(struct qemu_can_filter *filter, qemu_canid_t can_id) +{ + int m; + if (((can_id | filter->can_mask) & QEMU_CAN_ERR_FLAG)) { + return (filter->can_mask & QEMU_CAN_ERR_FLAG) != 0; + } + m = (can_id & filter->can_mask) == (filter->can_id & filter->can_mask); + return filter->can_id & QEMU_CAN_INV_FILTER ? 
!m : m; +} + +int can_bus_client_set_filters(CanBusClientState *client, + const struct qemu_can_filter *filters, size_t filters_cnt) +{ + return 0; +} + + +static bool can_bus_can_be_deleted(UserCreatable *uc) +{ + return false; +} + +static void can_bus_class_init(ObjectClass *klass, + void *class_data G_GNUC_UNUSED) +{ + UserCreatableClass *uc_klass = USER_CREATABLE_CLASS(klass); + + uc_klass->can_be_deleted = can_bus_can_be_deleted; +} + +static const TypeInfo can_bus_info = { + .parent = TYPE_OBJECT, + .name = TYPE_CAN_BUS, + .instance_size = sizeof(CanBusState), + .instance_init = can_bus_instance_init, + .class_init = can_bus_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void can_bus_register_types(void) +{ + type_register_static(&can_bus_info); +} + +type_init(can_bus_register_types); diff --git a/net/can/can_host.c b/net/can/can_host.c new file mode 100644 index 000000000..1dfaf0ced --- /dev/null +++ b/net/can/can_host.c @@ -0,0 +1,114 @@ +/* + * CAN generic CAN host connection support + * + * Copyright (c) 2013-2014 Jin Yang + * Copyright (c) 2014-2018 Pavel Pisa + * + * Initial development supported by Google GSoC 2013 from RTEMS project slot + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "chardev/char.h" +#include "qemu/module.h" +#include "qemu/sockets.h" +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "net/can_emu.h" +#include "net/can_host.h" + +struct CanBusState { + Object object; + + QTAILQ_HEAD(, CanBusClientState) clients; +}; + +static void can_host_disconnect(CanHostState *ch) +{ + CanHostClass *chc = CAN_HOST_GET_CLASS(ch); + + can_bus_remove_client(&ch->bus_client); + chc->disconnect(ch); +} + +static void can_host_connect(CanHostState *ch, Error **errp) +{ + CanHostClass *chc = CAN_HOST_GET_CLASS(ch); + Error *local_err = NULL; + + chc->connect(ch, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + can_bus_insert_client(ch->bus, &ch->bus_client); +} + +static void can_host_unparent(Object *obj) +{ + can_host_disconnect(CAN_HOST(obj)); +} + +static void can_host_complete(UserCreatable *uc, Error **errp) +{ + can_host_connect(CAN_HOST(uc), errp); +} + +static void can_host_instance_init(Object *obj) +{ + CanHostState *ch = CAN_HOST(obj); + + object_property_add_link(obj, "canbus", TYPE_CAN_BUS, + (Object **)&ch->bus, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG, + &error_abort); +} + +static void can_host_class_init(ObjectClass *klass, + void *class_data G_GNUC_UNUSED) +{ + UserCreatableClass *uc_klass = USER_CREATABLE_CLASS(klass); + + klass->unparent = can_host_unparent; + uc_klass->complete = can_host_complete; +} + +static const TypeInfo 
can_host_info = { + .parent = TYPE_OBJECT, + .name = TYPE_CAN_HOST, + .instance_size = sizeof(CanHostState), + .class_size = sizeof(CanHostClass), + .abstract = true, + .instance_init = can_host_instance_init, + .class_init = can_host_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void can_host_register_types(void) +{ + type_register_static(&can_host_info); +} + +type_init(can_host_register_types); diff --git a/net/can/can_socketcan.c b/net/can/can_socketcan.c new file mode 100644 index 000000000..8a6ffad40 --- /dev/null +++ b/net/can/can_socketcan.c @@ -0,0 +1,289 @@ +/* + * CAN c support to connect to the Linux host SocketCAN interfaces + * + * Copyright (c) 2013-2014 Jin Yang + * Copyright (c) 2014-2018 Pavel Pisa + * + * Initial development supported by Google GSoC 2013 from RTEMS project slot + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qemu/main-loop.h" +#include "qemu/module.h" +#include "qapi/error.h" +#include "chardev/char.h" +#include "qemu/sockets.h" +#include "qemu/error-report.h" +#include "net/can_emu.h" +#include "net/can_host.h" + +#include <sys/ioctl.h> +#include <net/if.h> +#include <linux/can.h> +#include <linux/can/raw.h> + +#ifndef DEBUG_CAN +#define DEBUG_CAN 0 +#endif /*DEBUG_CAN*/ + +#define TYPE_CAN_HOST_SOCKETCAN "can-host-socketcan" +#define CAN_HOST_SOCKETCAN(obj) \ + OBJECT_CHECK(CanHostSocketCAN, (obj), TYPE_CAN_HOST_SOCKETCAN) + +#define CAN_READ_BUF_LEN 5 +typedef struct CanHostSocketCAN { + CanHostState parent; + char *ifname; + + qemu_can_filter *rfilter; + int rfilter_num; + can_err_mask_t err_mask; + + qemu_can_frame buf[CAN_READ_BUF_LEN]; + int bufcnt; + int bufptr; + + int fd; +} CanHostSocketCAN; + +/* Check that QEMU and Linux kernel flags encoding and structure matches */ +QEMU_BUILD_BUG_ON(QEMU_CAN_EFF_FLAG != CAN_EFF_FLAG); +QEMU_BUILD_BUG_ON(QEMU_CAN_RTR_FLAG != CAN_RTR_FLAG); +QEMU_BUILD_BUG_ON(QEMU_CAN_ERR_FLAG != CAN_ERR_FLAG); +QEMU_BUILD_BUG_ON(QEMU_CAN_INV_FILTER != CAN_INV_FILTER); +QEMU_BUILD_BUG_ON(offsetof(qemu_can_frame, data) + != offsetof(struct can_frame, data)); + +static void can_host_socketcan_display_msg(struct qemu_can_frame *msg) +{ + int i; + + qemu_log_lock(); + qemu_log("[cansocketcan]: %03X [%01d] %s %s", + msg->can_id & QEMU_CAN_EFF_MASK, + msg->can_dlc, + msg->can_id & QEMU_CAN_EFF_FLAG ? "EFF" : "SFF", + msg->can_id & QEMU_CAN_RTR_FLAG ? 
"RTR" : "DAT"); + + for (i = 0; i < msg->can_dlc; i++) { + qemu_log(" %02X", msg->data[i]); + } + qemu_log("\n"); + qemu_log_flush(); + qemu_log_unlock(); +} + +static void can_host_socketcan_read(void *opaque) +{ + CanHostSocketCAN *c = opaque; + CanHostState *ch = CAN_HOST(c); + + /* CAN_READ_BUF_LEN for multiple messages syscall is possible for future */ + c->bufcnt = read(c->fd, c->buf, sizeof(qemu_can_frame)); + if (c->bufcnt < 0) { + warn_report("CAN bus host read failed (%s)", strerror(errno)); + return; + } + + can_bus_client_send(&ch->bus_client, c->buf, 1); + + if (DEBUG_CAN) { + can_host_socketcan_display_msg(c->buf); + } +} + +static int can_host_socketcan_can_receive(CanBusClientState *client) +{ + return 1; +} + +static ssize_t can_host_socketcan_receive(CanBusClientState *client, + const qemu_can_frame *frames, size_t frames_cnt) +{ + CanHostState *ch = container_of(client, CanHostState, bus_client); + CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(ch); + + size_t len = sizeof(qemu_can_frame); + int res; + + if (c->fd < 0) { + return -1; + } + + res = write(c->fd, frames, len); + + if (!res) { + warn_report("[cansocketcan]: write message to host returns zero"); + return -1; + } + + if (res != len) { + if (res < 0) { + warn_report("[cansocketcan]: write to host failed (%s)", + strerror(errno)); + } else { + warn_report("[cansocketcan]: write to host truncated"); + } + return -1; + } + + return 1; +} + +static void can_host_socketcan_disconnect(CanHostState *ch) +{ + CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(ch); + + if (c->fd >= 0) { + qemu_set_fd_handler(c->fd, NULL, NULL, c); + close(c->fd); + c->fd = -1; + } + + g_free(c->rfilter); + c->rfilter = NULL; + c->rfilter_num = 0; +} + +static CanBusClientInfo can_host_socketcan_bus_client_info = { + .can_receive = can_host_socketcan_can_receive, + .receive = can_host_socketcan_receive, +}; + +static void can_host_socketcan_connect(CanHostState *ch, Error **errp) +{ + CanHostSocketCAN *c = 
CAN_HOST_SOCKETCAN(ch); + int s; /* can raw socket */ + struct sockaddr_can addr; + struct ifreq ifr; + + /* open socket */ + s = qemu_socket(PF_CAN, SOCK_RAW, CAN_RAW); + if (s < 0) { + error_setg_errno(errp, errno, "failed to create CAN_RAW socket"); + return; + } + + addr.can_family = AF_CAN; + memset(&ifr.ifr_name, 0, sizeof(ifr.ifr_name)); + strcpy(ifr.ifr_name, c->ifname); + if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) { + error_setg_errno(errp, errno, + "SocketCAN host interface %s not available", c->ifname); + goto fail; + } + addr.can_ifindex = ifr.ifr_ifindex; + + c->err_mask = 0xffffffff; /* Receive error frame. */ + setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER, + &c->err_mask, sizeof(c->err_mask)); + + c->rfilter_num = 1; + c->rfilter = g_new(struct qemu_can_filter, c->rfilter_num); + + /* Receive all data frame. If |= CAN_INV_FILTER no data. */ + c->rfilter[0].can_id = 0; + c->rfilter[0].can_mask = 0; + c->rfilter[0].can_mask &= ~CAN_ERR_FLAG; + + setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, c->rfilter, + c->rfilter_num * sizeof(struct qemu_can_filter)); + + if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + error_setg_errno(errp, errno, "failed to bind to host interface %s", + c->ifname); + goto fail; + } + + c->fd = s; + ch->bus_client.info = &can_host_socketcan_bus_client_info; + qemu_set_fd_handler(c->fd, can_host_socketcan_read, NULL, c); + return; + +fail: + close(s); + g_free(c->rfilter); + c->rfilter = NULL; + c->rfilter_num = 0; +} + +static char *can_host_socketcan_get_if(Object *obj, Error **errp) +{ + CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(obj); + + return g_strdup(c->ifname); +} + +static void can_host_socketcan_set_if(Object *obj, const char *value, Error **errp) +{ + CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(obj); + struct ifreq ifr; + + if (strlen(value) >= sizeof(ifr.ifr_name)) { + error_setg(errp, "CAN interface name longer than %zd characters", + sizeof(ifr.ifr_name) - 1); + return; + } + + if (c->fd != -1) { + error_setg(errp, 
"CAN interface already connected"); + return; + } + + g_free(c->ifname); + c->ifname = g_strdup(value); +} + +static void can_host_socketcan_instance_init(Object *obj) +{ + CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(obj); + + c->fd = -1; +} + +static void can_host_socketcan_class_init(ObjectClass *klass, + void *class_data G_GNUC_UNUSED) +{ + CanHostClass *chc = CAN_HOST_CLASS(klass); + + object_class_property_add_str(klass, "if", + can_host_socketcan_get_if, + can_host_socketcan_set_if, + &error_abort); + chc->connect = can_host_socketcan_connect; + chc->disconnect = can_host_socketcan_disconnect; +} + +static const TypeInfo can_host_socketcan_info = { + .parent = TYPE_CAN_HOST, + .name = TYPE_CAN_HOST_SOCKETCAN, + .instance_size = sizeof(CanHostSocketCAN), + .instance_init = can_host_socketcan_instance_init, + .class_init = can_host_socketcan_class_init, +}; + +static void can_host_register_types(void) +{ + type_register_static(&can_host_socketcan_info); +} + +type_init(can_host_register_types); diff --git a/net/checksum.c b/net/checksum.c index 14c08550e..aaa400023 100644 --- a/net/checksum.c +++ b/net/checksum.c @@ -15,31 +15,34 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include "qemu-common.h" +#include "qemu/osdep.h" #include "net/checksum.h" - -#define PROTO_TCP 6 -#define PROTO_UDP 17 +#include "net/eth.h" uint32_t net_checksum_add_cont(int len, uint8_t *buf, int seq) { - uint32_t sum = 0; + uint32_t sum1 = 0, sum2 = 0; int i; - for (i = seq; i < seq + len; i++) { - if (i & 1) { - sum += (uint32_t)buf[i - seq]; - } else { - sum += (uint32_t)buf[i - seq] << 8; - } + for (i = 0; i < len - 1; i += 2) { + sum1 += (uint32_t)buf[i]; + sum2 += (uint32_t)buf[i + 1]; + } + if (i < len) { + sum1 += (uint32_t)buf[i]; + } + + if (seq & 1) { + return sum1 + (sum2 << 8); + } else { + return sum2 + (sum1 << 8); } - return sum; } uint16_t net_checksum_finish(uint32_t sum) { while (sum>>16) - sum = (sum & 0xFFFF)+(sum >> 16); + sum = (sum & 0xFFFF)+(sum >> 16); return ~sum; } @@ -56,44 +59,118 @@ uint16_t net_checksum_tcpudp(uint16_t length, uint16_t proto, void net_checksum_calculate(uint8_t *data, int length) { - int hlen, plen, proto, csum_offset; - uint16_t csum; - - if ((data[14] & 0xf0) != 0x40) - return; /* not IPv4 */ - hlen = (data[14] & 0x0f) * 4; - plen = (data[16] << 8 | data[17]) - hlen; - proto = data[23]; - - switch (proto) { - case PROTO_TCP: - csum_offset = 16; - break; - case PROTO_UDP: - csum_offset = 6; - break; + int mac_hdr_len, ip_len; + struct ip_header *ip; + + /* + * Note: We cannot assume "data" is aligned, so the all code uses + * some macros that take care of possible unaligned access for + * struct members (just in case). 
+ */ + + /* Ensure we have at least an Eth header */ + if (length < sizeof(struct eth_header)) { + return; + } + + /* Handle the optionnal VLAN headers */ + switch (lduw_be_p(&PKT_GET_ETH_HDR(data)->h_proto)) { + case ETH_P_VLAN: + mac_hdr_len = sizeof(struct eth_header) + + sizeof(struct vlan_header); + break; + case ETH_P_DVLAN: + if (lduw_be_p(&PKT_GET_VLAN_HDR(data)->h_proto) == ETH_P_VLAN) { + mac_hdr_len = sizeof(struct eth_header) + + 2 * sizeof(struct vlan_header); + } else { + mac_hdr_len = sizeof(struct eth_header) + + sizeof(struct vlan_header); + } + break; default: - return; + mac_hdr_len = sizeof(struct eth_header); + break; + } + + length -= mac_hdr_len; + + /* Now check we have an IP header (with an optionnal VLAN header) */ + if (length < sizeof(struct ip_header)) { + return; + } + + ip = (struct ip_header *)(data + mac_hdr_len); + + if (IP_HEADER_VERSION(ip) != IP_HEADER_VERSION_4) { + return; /* not IPv4 */ + } + + ip_len = lduw_be_p(&ip->ip_len); + + /* Last, check that we have enough data for the all IP frame */ + if (length < ip_len) { + return; + } + + ip_len -= IP_HDR_GET_LEN(ip); + + switch (ip->ip_p) { + case IP_PROTO_TCP: + { + uint16_t csum; + tcp_header *tcp = (tcp_header *)(ip + 1); + + if (ip_len < sizeof(tcp_header)) { + return; + } + + /* Set csum to 0 */ + stw_he_p(&tcp->th_sum, 0); + + csum = net_checksum_tcpudp(ip_len, ip->ip_p, + (uint8_t *)&ip->ip_src, + (uint8_t *)tcp); + + /* Store computed csum */ + stw_be_p(&tcp->th_sum, csum); + + break; } + case IP_PROTO_UDP: + { + uint16_t csum; + udp_header *udp = (udp_header *)(ip + 1); + + if (ip_len < sizeof(udp_header)) { + return; + } + + /* Set csum to 0 */ + stw_he_p(&udp->uh_sum, 0); - if (plen < csum_offset+2) - return; + csum = net_checksum_tcpudp(ip_len, ip->ip_p, + (uint8_t *)&ip->ip_src, + (uint8_t *)udp); - data[14+hlen+csum_offset] = 0; - data[14+hlen+csum_offset+1] = 0; - csum = net_checksum_tcpudp(plen, proto, data+14+12, data+14+hlen); - data[14+hlen+csum_offset] = 
csum >> 8; - data[14+hlen+csum_offset+1] = csum & 0xff; + /* Store computed csum */ + stw_be_p(&udp->uh_sum, csum); + + break; + } + default: + /* Can't handle any other protocol */ + break; + } } uint32_t net_checksum_add_iov(const struct iovec *iov, const unsigned int iov_cnt, - uint32_t iov_off, uint32_t size) + uint32_t iov_off, uint32_t size, uint32_t csum_offset) { size_t iovec_off, buf_off; unsigned int i; uint32_t res = 0; - uint32_t seq = 0; iovec_off = 0; buf_off = 0; @@ -102,8 +179,8 @@ net_checksum_add_iov(const struct iovec *iov, const unsigned int iov_cnt, size_t len = MIN((iovec_off + iov[i].iov_len) - iov_off , size); void *chunk_buf = iov[i].iov_base + (iov_off - iovec_off); - res += net_checksum_add_cont(len, chunk_buf, seq); - seq += len; + res += net_checksum_add_cont(len, chunk_buf, csum_offset); + csum_offset += len; buf_off += len; iov_off += len; diff --git a/net/clients.h b/net/clients.h index 77932942b..a6ef267e1 100644 --- a/net/clients.h +++ b/net/clients.h @@ -25,31 +25,40 @@ #define QEMU_NET_CLIENTS_H #include "net/net.h" -#include "qapi-types.h" -int net_init_dump(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int net_init_dump(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); #ifdef CONFIG_SLIRP -int net_init_slirp(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int net_init_slirp(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); #endif -int net_init_hubport(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int net_init_hubport(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); -int net_init_socket(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int net_init_socket(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); -int net_init_tap(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int 
net_init_tap(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); -int net_init_bridge(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int net_init_bridge(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); +int net_init_l2tpv3(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); #ifdef CONFIG_VDE -int net_init_vde(const NetClientOptions *opts, const char *name, - NetClientState *peer); +int net_init_vde(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); #endif +#ifdef CONFIG_NETMAP +int net_init_netmap(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); +#endif + +int net_init_vhost_user(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp); + #endif /* QEMU_NET_CLIENTS_H */ diff --git a/net/colo-compare.c b/net/colo-compare.c new file mode 100644 index 000000000..7ee17f2cf --- /dev/null +++ b/net/colo-compare.c @@ -0,0 +1,1259 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 FUJITSU LIMITED + * Copyright (c) 2016 Intel Corporation + * + * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "qapi/error.h" +#include "net/net.h" +#include "net/eth.h" +#include "qom/object_interfaces.h" +#include "qemu/iov.h" +#include "qom/object.h" +#include "net/queue.h" +#include "chardev/char-fe.h" +#include "qemu/sockets.h" +#include "colo.h" +#include "sysemu/iothread.h" +#include "net/colo-compare.h" +#include "migration/colo.h" +#include "migration/migration.h" +#include "util.h" + +#define TYPE_COLO_COMPARE "colo-compare" +#define COLO_COMPARE(obj) \ + OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE) + +static QTAILQ_HEAD(, CompareState) net_compares = + QTAILQ_HEAD_INITIALIZER(net_compares); + +static NotifierList colo_compare_notifiers = + NOTIFIER_LIST_INITIALIZER(colo_compare_notifiers); + +#define COMPARE_READ_LEN_MAX NET_BUFSIZE +#define MAX_QUEUE_SIZE 1024 + +#define COLO_COMPARE_FREE_PRIMARY 0x01 +#define COLO_COMPARE_FREE_SECONDARY 0x02 + +/* TODO: Should be configurable */ +#define REGULAR_PACKET_CHECK_MS 3000 + +static QemuMutex event_mtx; +static QemuCond event_complete_cond; +static int event_unhandled_count; + +/* + * + CompareState ++ + * | | + * +---------------+ +---------------+ +---------------+ + * | conn list + - > conn + ------- > conn + -- > ...... 
+ * +---------------+ +---------------+ +---------------+ + * | | | | | | + * +---------------+ +---v----+ +---v----+ +---v----+ +---v----+ + * |primary | |secondary |primary | |secondary + * |packet | |packet + |packet | |packet + + * +--------+ +--------+ +--------+ +--------+ + * | | | | + * +---v----+ +---v----+ +---v----+ +---v----+ + * |primary | |secondary |primary | |secondary + * |packet | |packet + |packet | |packet + + * +--------+ +--------+ +--------+ +--------+ + * | | | | + * +---v----+ +---v----+ +---v----+ +---v----+ + * |primary | |secondary |primary | |secondary + * |packet | |packet + |packet | |packet + + * +--------+ +--------+ +--------+ +--------+ + */ +typedef struct CompareState { + Object parent; + + char *pri_indev; + char *sec_indev; + char *outdev; + char *notify_dev; + CharBackend chr_pri_in; + CharBackend chr_sec_in; + CharBackend chr_out; + CharBackend chr_notify_dev; + SocketReadState pri_rs; + SocketReadState sec_rs; + SocketReadState notify_rs; + bool vnet_hdr; + + /* + * Record the connection that through the NIC + * Element type: Connection + */ + GQueue conn_list; + /* Record the connection without repetition */ + GHashTable *connection_track_table; + + IOThread *iothread; + GMainContext *worker_context; + QEMUTimer *packet_check_timer; + + QEMUBH *event_bh; + enum colo_event event; + + QTAILQ_ENTRY(CompareState) next; +} CompareState; + +typedef struct CompareClass { + ObjectClass parent_class; +} CompareClass; + +enum { + PRIMARY_IN = 0, + SECONDARY_IN, +}; + + +static int compare_chr_send(CompareState *s, + const uint8_t *buf, + uint32_t size, + uint32_t vnet_hdr_len, + bool notify_remote_frame); + +static bool packet_matches_str(const char *str, + const uint8_t *buf, + uint32_t packet_len) +{ + if (packet_len != strlen(str)) { + return false; + } + + return !memcmp(str, buf, strlen(str)); +} + +static void notify_remote_frame(CompareState *s) +{ + char msg[] = "DO_CHECKPOINT"; + int ret = 0; + + ret = compare_chr_send(s, 
(uint8_t *)msg, strlen(msg), 0, true); + if (ret < 0) { + error_report("Notify Xen COLO-frame failed"); + } +} + +static void colo_compare_inconsistency_notify(CompareState *s) +{ + if (s->notify_dev) { + notify_remote_frame(s); + } else { + notifier_list_notify(&colo_compare_notifiers, + migrate_get_current()); + } +} + +static gint seq_sorter(Packet *a, Packet *b, gpointer data) +{ + struct tcp_hdr *atcp, *btcp; + + atcp = (struct tcp_hdr *)(a->transport_header); + btcp = (struct tcp_hdr *)(b->transport_header); + return ntohl(atcp->th_seq) - ntohl(btcp->th_seq); +} + +static void fill_pkt_tcp_info(void *data, uint32_t *max_ack) +{ + Packet *pkt = data; + struct tcp_hdr *tcphd; + + tcphd = (struct tcp_hdr *)pkt->transport_header; + + pkt->tcp_seq = ntohl(tcphd->th_seq); + pkt->tcp_ack = ntohl(tcphd->th_ack); + *max_ack = *max_ack > pkt->tcp_ack ? *max_ack : pkt->tcp_ack; + pkt->header_size = pkt->transport_header - (uint8_t *)pkt->data + + (tcphd->th_off << 2) - pkt->vnet_hdr_len; + pkt->payload_size = pkt->size - pkt->header_size; + pkt->seq_end = pkt->tcp_seq + pkt->payload_size; + pkt->flags = tcphd->th_flags; +} + +/* + * Return 1 on success, if return 0 means the + * packet will be dropped + */ +static int colo_insert_packet(GQueue *queue, Packet *pkt, uint32_t *max_ack) +{ + if (g_queue_get_length(queue) <= MAX_QUEUE_SIZE) { + if (pkt->ip->ip_p == IPPROTO_TCP) { + fill_pkt_tcp_info(pkt, max_ack); + g_queue_insert_sorted(queue, + pkt, + (GCompareDataFunc)seq_sorter, + NULL); + } else { + g_queue_push_tail(queue, pkt); + } + return 1; + } + return 0; +} + +/* + * Return 0 on success, if return -1 means the pkt + * is unsupported(arp and ipv6) and will be sent later + */ +static int packet_enqueue(CompareState *s, int mode, Connection **con) +{ + ConnectionKey key; + Packet *pkt = NULL; + Connection *conn; + + if (mode == PRIMARY_IN) { + pkt = packet_new(s->pri_rs.buf, + s->pri_rs.packet_len, + s->pri_rs.vnet_hdr_len); + } else { + pkt = 
packet_new(s->sec_rs.buf, + s->sec_rs.packet_len, + s->sec_rs.vnet_hdr_len); + } + + if (parse_packet_early(pkt)) { + packet_destroy(pkt, NULL); + pkt = NULL; + return -1; + } + fill_connection_key(pkt, &key); + + conn = connection_get(s->connection_track_table, + &key, + &s->conn_list); + + if (!conn->processing) { + g_queue_push_tail(&s->conn_list, conn); + conn->processing = true; + } + + if (mode == PRIMARY_IN) { + if (!colo_insert_packet(&conn->primary_list, pkt, &conn->pack)) { + error_report("colo compare primary queue size too big," + "drop packet"); + } + } else { + if (!colo_insert_packet(&conn->secondary_list, pkt, &conn->sack)) { + error_report("colo compare secondary queue size too big," + "drop packet"); + } + } + *con = conn; + + return 0; +} + +static inline bool after(uint32_t seq1, uint32_t seq2) +{ + return (int32_t)(seq1 - seq2) > 0; +} + +static void colo_release_primary_pkt(CompareState *s, Packet *pkt) +{ + int ret; + ret = compare_chr_send(s, + pkt->data, + pkt->size, + pkt->vnet_hdr_len, + false); + if (ret < 0) { + error_report("colo send primary packet failed"); + } + trace_colo_compare_main("packet same and release packet"); + packet_destroy(pkt, NULL); +} + +/* + * The IP packets sent by primary and secondary + * will be compared in here + * TODO support ip fragment, Out-Of-Order + * return: 0 means packet same + * > 0 || < 0 means packet different + */ +static int colo_compare_packet_payload(Packet *ppkt, + Packet *spkt, + uint16_t poffset, + uint16_t soffset, + uint16_t len) + +{ + if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { + char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20]; + + strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src)); + strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst)); + strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src)); + strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst)); + + trace_colo_compare_ip_info(ppkt->size, pri_ip_src, + pri_ip_dst, spkt->size, + sec_ip_src, sec_ip_dst); + } + + 
return memcmp(ppkt->data + poffset, spkt->data + soffset, len); +} + +/* + * return true means that the payload is consist and + * need to make the next comparison, false means do + * the checkpoint +*/ +static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt, + int8_t *mark, uint32_t max_ack) +{ + *mark = 0; + + if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) { + if (!colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size, spkt->header_size, + ppkt->payload_size)) { + *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY; + return true; + } + } + + /* one part of secondary packet payload still need to be compared */ + if (!after(ppkt->seq_end, spkt->seq_end)) { + if (!colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size + ppkt->offset, + spkt->header_size + spkt->offset, + ppkt->payload_size - ppkt->offset)) { + if (!after(ppkt->tcp_ack, max_ack)) { + *mark = COLO_COMPARE_FREE_PRIMARY; + spkt->offset += ppkt->payload_size - ppkt->offset; + return true; + } else { + /* secondary guest hasn't ack the data, don't send + * out this packet + */ + return false; + } + } + } else { + /* primary packet is longer than secondary packet, compare + * the same part and mark the primary packet offset + */ + if (!colo_compare_packet_payload(ppkt, spkt, + ppkt->header_size + ppkt->offset, + spkt->header_size + spkt->offset, + spkt->payload_size - spkt->offset)) { + *mark = COLO_COMPARE_FREE_SECONDARY; + ppkt->offset += spkt->payload_size - spkt->offset; + return true; + } + } + + return false; +} + +static void colo_compare_tcp(CompareState *s, Connection *conn) +{ + Packet *ppkt = NULL, *spkt = NULL; + int8_t mark; + + /* + * If ppkt and spkt have the same payload, but ppkt's ACK + * is greater than spkt's ACK, in this case we can not + * send the ppkt because it will cause the secondary guest + * to miss sending some data in the next. 
Therefore, we + * record the maximum ACK in the current queue at both + * primary side and secondary side. Only when the ack is + * less than the smaller of the two maximum ack, then we + * can ensure that the packet's payload is acknowledged by + * primary and secondary. + */ + uint32_t min_ack = conn->pack > conn->sack ? conn->sack : conn->pack; + +pri: + if (g_queue_is_empty(&conn->primary_list)) { + return; + } + ppkt = g_queue_pop_head(&conn->primary_list); +sec: + if (g_queue_is_empty(&conn->secondary_list)) { + g_queue_push_head(&conn->primary_list, ppkt); + return; + } + spkt = g_queue_pop_head(&conn->secondary_list); + + if (ppkt->tcp_seq == ppkt->seq_end) { + colo_release_primary_pkt(s, ppkt); + ppkt = NULL; + } + + if (ppkt && conn->compare_seq && !after(ppkt->seq_end, conn->compare_seq)) { + trace_colo_compare_main("pri: this packet has compared"); + colo_release_primary_pkt(s, ppkt); + ppkt = NULL; + } + + if (spkt->tcp_seq == spkt->seq_end) { + packet_destroy(spkt, NULL); + if (!ppkt) { + goto pri; + } else { + goto sec; + } + } else { + if (conn->compare_seq && !after(spkt->seq_end, conn->compare_seq)) { + trace_colo_compare_main("sec: this packet has compared"); + packet_destroy(spkt, NULL); + if (!ppkt) { + goto pri; + } else { + goto sec; + } + } + if (!ppkt) { + g_queue_push_head(&conn->secondary_list, spkt); + goto pri; + } + } + + if (colo_mark_tcp_pkt(ppkt, spkt, &mark, min_ack)) { + trace_colo_compare_tcp_info("pri", + ppkt->tcp_seq, ppkt->tcp_ack, + ppkt->header_size, ppkt->payload_size, + ppkt->offset, ppkt->flags); + + trace_colo_compare_tcp_info("sec", + spkt->tcp_seq, spkt->tcp_ack, + spkt->header_size, spkt->payload_size, + spkt->offset, spkt->flags); + + if (mark == COLO_COMPARE_FREE_PRIMARY) { + conn->compare_seq = ppkt->seq_end; + colo_release_primary_pkt(s, ppkt); + g_queue_push_head(&conn->secondary_list, spkt); + goto pri; + } + if (mark == COLO_COMPARE_FREE_SECONDARY) { + conn->compare_seq = spkt->seq_end; + packet_destroy(spkt, 
NULL); + goto sec; + } + if (mark == (COLO_COMPARE_FREE_PRIMARY | COLO_COMPARE_FREE_SECONDARY)) { + conn->compare_seq = ppkt->seq_end; + colo_release_primary_pkt(s, ppkt); + packet_destroy(spkt, NULL); + goto pri; + } + } else { + g_queue_push_head(&conn->primary_list, ppkt); + g_queue_push_head(&conn->secondary_list, spkt); + + qemu_hexdump((char *)ppkt->data, stderr, + "colo-compare ppkt", ppkt->size); + qemu_hexdump((char *)spkt->data, stderr, + "colo-compare spkt", spkt->size); + + colo_compare_inconsistency_notify(s); + } +} + + +/* + * Called from the compare thread on the primary + * for compare udp packet + */ +static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt) +{ + uint16_t network_header_length = ppkt->ip->ip_hl << 2; + uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len; + + trace_colo_compare_main("compare udp"); + + /* + * Because of ppkt and spkt are both in the same connection, + * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are + * same with spkt. In addition, IP header's Identification is a random + * field, we can handle it in IP fragmentation function later. + * COLO just concern the response net packet payload from primary guest + * and secondary guest are same or not, So we ignored all IP header include + * other field like TOS,TTL,IP Checksum. we only need to compare + * the ip payload here. 
+ */ + if (ppkt->size != spkt->size) { + trace_colo_compare_main("UDP: payload size of packets are different"); + return -1; + } + if (colo_compare_packet_payload(ppkt, spkt, offset, offset, + ppkt->size - offset)) { + trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size); + trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size); + if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { + qemu_hexdump((char *)ppkt->data, stderr, "colo-compare pri pkt", + ppkt->size); + qemu_hexdump((char *)spkt->data, stderr, "colo-compare sec pkt", + spkt->size); + } + return -1; + } else { + return 0; + } +} + +/* + * Called from the compare thread on the primary + * for compare icmp packet + */ +static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt) +{ + uint16_t network_header_length = ppkt->ip->ip_hl << 2; + uint16_t offset = network_header_length + ETH_HLEN + ppkt->vnet_hdr_len; + + trace_colo_compare_main("compare icmp"); + + /* + * Because of ppkt and spkt are both in the same connection, + * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are + * same with spkt. In addition, IP header's Identification is a random + * field, we can handle it in IP fragmentation function later. + * COLO just concern the response net packet payload from primary guest + * and secondary guest are same or not, So we ignored all IP header include + * other field like TOS,TTL,IP Checksum. we only need to compare + * the ip payload here. 
+ */ + if (ppkt->size != spkt->size) { + trace_colo_compare_main("ICMP: payload size of packets are different"); + return -1; + } + if (colo_compare_packet_payload(ppkt, spkt, offset, offset, + ppkt->size - offset)) { + trace_colo_compare_icmp_miscompare("primary pkt size", + ppkt->size); + trace_colo_compare_icmp_miscompare("Secondary pkt size", + spkt->size); + if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { + qemu_hexdump((char *)ppkt->data, stderr, "colo-compare pri pkt", + ppkt->size); + qemu_hexdump((char *)spkt->data, stderr, "colo-compare sec pkt", + spkt->size); + } + return -1; + } else { + return 0; + } +} + +/* + * Called from the compare thread on the primary + * for compare other packet + */ +static int colo_packet_compare_other(Packet *spkt, Packet *ppkt) +{ + uint16_t offset = ppkt->vnet_hdr_len; + + trace_colo_compare_main("compare other"); + if (trace_event_get_state_backends(TRACE_COLO_COMPARE_MISCOMPARE)) { + char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20]; + + strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src)); + strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst)); + strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src)); + strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst)); + + trace_colo_compare_ip_info(ppkt->size, pri_ip_src, + pri_ip_dst, spkt->size, + sec_ip_src, sec_ip_dst); + } + + if (ppkt->size != spkt->size) { + trace_colo_compare_main("Other: payload size of packets are different"); + return -1; + } + return colo_compare_packet_payload(ppkt, spkt, offset, offset, + ppkt->size - offset); +} + +static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time) +{ + int64_t now = qemu_clock_get_ms(QEMU_CLOCK_HOST); + + if ((now - pkt->creation_ms) > (*check_time)) { + trace_colo_old_packet_check_found(pkt->creation_ms); + return 0; + } else { + return 1; + } +} + +void colo_compare_register_notifier(Notifier *notify) +{ + notifier_list_add(&colo_compare_notifiers, notify); +} + +void 
colo_compare_unregister_notifier(Notifier *notify) +{ + notifier_remove(notify); +} + +static int colo_old_packet_check_one_conn(Connection *conn, + CompareState *s) +{ + GList *result = NULL; + int64_t check_time = REGULAR_PACKET_CHECK_MS; + + result = g_queue_find_custom(&conn->primary_list, + &check_time, + (GCompareFunc)colo_old_packet_check_one); + + if (result) { + /* Do checkpoint will flush old packet */ + colo_compare_inconsistency_notify(s); + return 0; + } + + return 1; +} + +/* + * Look for old packets that the secondary hasn't matched, + * if we have some then we have to checkpoint to wake + * the secondary up. + */ +static void colo_old_packet_check(void *opaque) +{ + CompareState *s = opaque; + + /* + * If we find one old packet, stop finding job and notify + * COLO frame do checkpoint. + */ + g_queue_find_custom(&s->conn_list, s, + (GCompareFunc)colo_old_packet_check_one_conn); +} + +static void colo_compare_packet(CompareState *s, Connection *conn, + int (*HandlePacket)(Packet *spkt, + Packet *ppkt)) +{ + Packet *pkt = NULL; + GList *result = NULL; + + while (!g_queue_is_empty(&conn->primary_list) && + !g_queue_is_empty(&conn->secondary_list)) { + pkt = g_queue_pop_head(&conn->primary_list); + result = g_queue_find_custom(&conn->secondary_list, + pkt, (GCompareFunc)HandlePacket); + + if (result) { + colo_release_primary_pkt(s, pkt); + g_queue_remove(&conn->secondary_list, result->data); + } else { + /* + * If one packet arrive late, the secondary_list or + * primary_list will be empty, so we can't compare it + * until next comparison. If the packets in the list are + * timeout, it will trigger a checkpoint request. + */ + trace_colo_compare_main("packet different"); + g_queue_push_head(&conn->primary_list, pkt); + + colo_compare_inconsistency_notify(s); + break; + } + } +} + +/* + * Called from the compare thread on the primary + * for compare packet with secondary list of the + * specified connection when a new packet was + * queued to it. 
+ */ +static void colo_compare_connection(void *opaque, void *user_data) +{ + CompareState *s = user_data; + Connection *conn = opaque; + + switch (conn->ip_proto) { + case IPPROTO_TCP: + colo_compare_tcp(s, conn); + break; + case IPPROTO_UDP: + colo_compare_packet(s, conn, colo_packet_compare_udp); + break; + case IPPROTO_ICMP: + colo_compare_packet(s, conn, colo_packet_compare_icmp); + break; + default: + colo_compare_packet(s, conn, colo_packet_compare_other); + break; + } +} + +static int compare_chr_send(CompareState *s, + const uint8_t *buf, + uint32_t size, + uint32_t vnet_hdr_len, + bool notify_remote_frame) +{ + int ret = 0; + uint32_t len = htonl(size); + + if (!size) { + return 0; + } + + if (notify_remote_frame) { + ret = qemu_chr_fe_write_all(&s->chr_notify_dev, + (uint8_t *)&len, + sizeof(len)); + } else { + ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)&len, sizeof(len)); + } + + if (ret != sizeof(len)) { + goto err; + } + + if (s->vnet_hdr) { + /* + * We send vnet header len make other module(like filter-redirector) + * know how to parse net packet correctly. + */ + len = htonl(vnet_hdr_len); + + if (!notify_remote_frame) { + ret = qemu_chr_fe_write_all(&s->chr_out, + (uint8_t *)&len, + sizeof(len)); + } + + if (ret != sizeof(len)) { + goto err; + } + } + + if (notify_remote_frame) { + ret = qemu_chr_fe_write_all(&s->chr_notify_dev, + (uint8_t *)buf, + size); + } else { + ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)buf, size); + } + + if (ret != size) { + goto err; + } + + return 0; + +err: + return ret < 0 ? ret : -EIO; +} + +static int compare_chr_can_read(void *opaque) +{ + return COMPARE_READ_LEN_MAX; +} + +/* + * Called from the main thread on the primary for packets + * arriving over the socket from the primary. 
+ */ +static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size) +{ + CompareState *s = COLO_COMPARE(opaque); + int ret; + + ret = net_fill_rstate(&s->pri_rs, buf, size); + if (ret == -1) { + qemu_chr_fe_set_handlers(&s->chr_pri_in, NULL, NULL, NULL, NULL, + NULL, NULL, true); + error_report("colo-compare primary_in error"); + } +} + +/* + * Called from the main thread on the primary for packets + * arriving over the socket from the secondary. + */ +static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size) +{ + CompareState *s = COLO_COMPARE(opaque); + int ret; + + ret = net_fill_rstate(&s->sec_rs, buf, size); + if (ret == -1) { + qemu_chr_fe_set_handlers(&s->chr_sec_in, NULL, NULL, NULL, NULL, + NULL, NULL, true); + error_report("colo-compare secondary_in error"); + } +} + +static void compare_notify_chr(void *opaque, const uint8_t *buf, int size) +{ + CompareState *s = COLO_COMPARE(opaque); + int ret; + + ret = net_fill_rstate(&s->notify_rs, buf, size); + if (ret == -1) { + qemu_chr_fe_set_handlers(&s->chr_notify_dev, NULL, NULL, NULL, NULL, + NULL, NULL, true); + error_report("colo-compare notify_dev error"); + } +} + +/* + * Check old packet regularly so it can watch for any packets + * that the secondary hasn't produced equivalents of. 
+ */ +static void check_old_packet_regular(void *opaque) +{ + CompareState *s = opaque; + + /* if have old packet we will notify checkpoint */ + colo_old_packet_check(s); + timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + REGULAR_PACKET_CHECK_MS); +} + +/* Public API, Used for COLO frame to notify compare event */ +void colo_notify_compares_event(void *opaque, int event, Error **errp) +{ + CompareState *s; + + qemu_mutex_lock(&event_mtx); + QTAILQ_FOREACH(s, &net_compares, next) { + s->event = event; + qemu_bh_schedule(s->event_bh); + event_unhandled_count++; + } + /* Wait all compare threads to finish handling this event */ + while (event_unhandled_count > 0) { + qemu_cond_wait(&event_complete_cond, &event_mtx); + } + + qemu_mutex_unlock(&event_mtx); +} + +static void colo_compare_timer_init(CompareState *s) +{ + AioContext *ctx = iothread_get_aio_context(s->iothread); + + s->packet_check_timer = aio_timer_new(ctx, QEMU_CLOCK_VIRTUAL, + SCALE_MS, check_old_packet_regular, + s); + timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + REGULAR_PACKET_CHECK_MS); +} + +static void colo_compare_timer_del(CompareState *s) +{ + if (s->packet_check_timer) { + timer_del(s->packet_check_timer); + timer_free(s->packet_check_timer); + s->packet_check_timer = NULL; + } + } + +static void colo_flush_packets(void *opaque, void *user_data); + +static void colo_compare_handle_event(void *opaque) +{ + CompareState *s = opaque; + + switch (s->event) { + case COLO_EVENT_CHECKPOINT: + g_queue_foreach(&s->conn_list, colo_flush_packets, s); + break; + case COLO_EVENT_FAILOVER: + break; + default: + break; + } + + qemu_mutex_lock(&event_mtx); + assert(event_unhandled_count > 0); + event_unhandled_count--; + qemu_cond_broadcast(&event_complete_cond); + qemu_mutex_unlock(&event_mtx); +} + +static void colo_compare_iothread(CompareState *s) +{ + object_ref(OBJECT(s->iothread)); + s->worker_context = iothread_get_g_main_context(s->iothread); + + 
qemu_chr_fe_set_handlers(&s->chr_pri_in, compare_chr_can_read, + compare_pri_chr_in, NULL, NULL, + s, s->worker_context, true); + qemu_chr_fe_set_handlers(&s->chr_sec_in, compare_chr_can_read, + compare_sec_chr_in, NULL, NULL, + s, s->worker_context, true); + if (s->notify_dev) { + qemu_chr_fe_set_handlers(&s->chr_notify_dev, compare_chr_can_read, + compare_notify_chr, NULL, NULL, + s, s->worker_context, true); + } + + colo_compare_timer_init(s); + s->event_bh = qemu_bh_new(colo_compare_handle_event, s); +} + +static char *compare_get_pri_indev(Object *obj, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + return g_strdup(s->pri_indev); +} + +static void compare_set_pri_indev(Object *obj, const char *value, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + g_free(s->pri_indev); + s->pri_indev = g_strdup(value); +} + +static char *compare_get_sec_indev(Object *obj, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + return g_strdup(s->sec_indev); +} + +static void compare_set_sec_indev(Object *obj, const char *value, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + g_free(s->sec_indev); + s->sec_indev = g_strdup(value); +} + +static char *compare_get_outdev(Object *obj, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + return g_strdup(s->outdev); +} + +static void compare_set_outdev(Object *obj, const char *value, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + g_free(s->outdev); + s->outdev = g_strdup(value); +} + +static bool compare_get_vnet_hdr(Object *obj, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + return s->vnet_hdr; +} + +static void compare_set_vnet_hdr(Object *obj, + bool value, + Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + s->vnet_hdr = value; +} + +static char *compare_get_notify_dev(Object *obj, Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + return g_strdup(s->notify_dev); +} + +static void compare_set_notify_dev(Object *obj, const char *value, 
Error **errp) +{ + CompareState *s = COLO_COMPARE(obj); + + g_free(s->notify_dev); + s->notify_dev = g_strdup(value); +} + +static void compare_pri_rs_finalize(SocketReadState *pri_rs) +{ + CompareState *s = container_of(pri_rs, CompareState, pri_rs); + Connection *conn = NULL; + + if (packet_enqueue(s, PRIMARY_IN, &conn)) { + trace_colo_compare_main("primary: unsupported packet in"); + compare_chr_send(s, + pri_rs->buf, + pri_rs->packet_len, + pri_rs->vnet_hdr_len, + false); + } else { + /* compare packet in the specified connection */ + colo_compare_connection(conn, s); + } +} + +static void compare_sec_rs_finalize(SocketReadState *sec_rs) +{ + CompareState *s = container_of(sec_rs, CompareState, sec_rs); + Connection *conn = NULL; + + if (packet_enqueue(s, SECONDARY_IN, &conn)) { + trace_colo_compare_main("secondary: unsupported packet in"); + } else { + /* compare packet in the specified connection */ + colo_compare_connection(conn, s); + } +} + +static void compare_notify_rs_finalize(SocketReadState *notify_rs) +{ + CompareState *s = container_of(notify_rs, CompareState, notify_rs); + + const char msg[] = "COLO_COMPARE_GET_XEN_INIT"; + int ret; + + if (packet_matches_str("COLO_USERSPACE_PROXY_INIT", + notify_rs->buf, + notify_rs->packet_len)) { + ret = compare_chr_send(s, (uint8_t *)msg, strlen(msg), 0, true); + if (ret < 0) { + error_report("Notify Xen COLO-frame INIT failed"); + } + } else if (packet_matches_str("COLO_CHECKPOINT", + notify_rs->buf, + notify_rs->packet_len)) { + /* colo-compare do checkpoint, flush pri packet and remove sec packet */ + g_queue_foreach(&s->conn_list, colo_flush_packets, s); + } else { + error_report("COLO compare got unsupported instruction"); + } +} + +/* + * Return 0 is success. + * Return 1 is failed. 
+ */ +static int find_and_check_chardev(Chardev **chr, + char *chr_name, + Error **errp) +{ + *chr = qemu_chr_find(chr_name); + if (*chr == NULL) { + error_setg(errp, "Device '%s' not found", + chr_name); + return 1; + } + + if (!qemu_chr_has_feature(*chr, QEMU_CHAR_FEATURE_RECONNECTABLE)) { + error_setg(errp, "chardev \"%s\" is not reconnectable", + chr_name); + return 1; + } + + if (!qemu_chr_has_feature(*chr, QEMU_CHAR_FEATURE_GCONTEXT)) { + error_setg(errp, "chardev \"%s\" cannot switch context", + chr_name); + return 1; + } + + return 0; +} + +/* + * Called from the main thread on the primary + * to setup colo-compare. + */ +static void colo_compare_complete(UserCreatable *uc, Error **errp) +{ + CompareState *s = COLO_COMPARE(uc); + Chardev *chr; + + if (!s->pri_indev || !s->sec_indev || !s->outdev || !s->iothread) { + error_setg(errp, "colo compare needs 'primary_in' ," + "'secondary_in','outdev','iothread' property set"); + return; + } else if (!strcmp(s->pri_indev, s->outdev) || + !strcmp(s->sec_indev, s->outdev) || + !strcmp(s->pri_indev, s->sec_indev)) { + error_setg(errp, "'indev' and 'outdev' could not be same " + "for compare module"); + return; + } + + if (find_and_check_chardev(&chr, s->pri_indev, errp) || + !qemu_chr_fe_init(&s->chr_pri_in, chr, errp)) { + return; + } + + if (find_and_check_chardev(&chr, s->sec_indev, errp) || + !qemu_chr_fe_init(&s->chr_sec_in, chr, errp)) { + return; + } + + if (find_and_check_chardev(&chr, s->outdev, errp) || + !qemu_chr_fe_init(&s->chr_out, chr, errp)) { + return; + } + + net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr); + net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr); + + /* Try to enable remote notify chardev, currently just for Xen COLO */ + if (s->notify_dev) { + if (find_and_check_chardev(&chr, s->notify_dev, errp) || + !qemu_chr_fe_init(&s->chr_notify_dev, chr, errp)) { + return; + } + + net_socket_rs_init(&s->notify_rs, compare_notify_rs_finalize, + s->vnet_hdr); 
+ } + + QTAILQ_INSERT_TAIL(&net_compares, s, next); + + g_queue_init(&s->conn_list); + + qemu_mutex_init(&event_mtx); + qemu_cond_init(&event_complete_cond); + + s->connection_track_table = g_hash_table_new_full(connection_key_hash, + connection_key_equal, + g_free, + connection_destroy); + + colo_compare_iothread(s); + return; +} + +static void colo_flush_packets(void *opaque, void *user_data) +{ + CompareState *s = user_data; + Connection *conn = opaque; + Packet *pkt = NULL; + + while (!g_queue_is_empty(&conn->primary_list)) { + pkt = g_queue_pop_head(&conn->primary_list); + compare_chr_send(s, + pkt->data, + pkt->size, + pkt->vnet_hdr_len, + false); + packet_destroy(pkt, NULL); + } + while (!g_queue_is_empty(&conn->secondary_list)) { + pkt = g_queue_pop_head(&conn->secondary_list); + packet_destroy(pkt, NULL); + } +} + +static void colo_compare_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->complete = colo_compare_complete; +} + +static void colo_compare_init(Object *obj) +{ + CompareState *s = COLO_COMPARE(obj); + + object_property_add_str(obj, "primary_in", + compare_get_pri_indev, compare_set_pri_indev, + NULL); + object_property_add_str(obj, "secondary_in", + compare_get_sec_indev, compare_set_sec_indev, + NULL); + object_property_add_str(obj, "outdev", + compare_get_outdev, compare_set_outdev, + NULL); + object_property_add_link(obj, "iothread", TYPE_IOTHREAD, + (Object **)&s->iothread, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG, NULL); + /* This parameter just for Xen COLO */ + object_property_add_str(obj, "notify_dev", + compare_get_notify_dev, compare_set_notify_dev, + NULL); + + s->vnet_hdr = false; + object_property_add_bool(obj, "vnet_hdr_support", compare_get_vnet_hdr, + compare_set_vnet_hdr, NULL); +} + +static void colo_compare_finalize(Object *obj) +{ + CompareState *s = COLO_COMPARE(obj); + CompareState *tmp = NULL; + + qemu_chr_fe_deinit(&s->chr_pri_in, false); + 
qemu_chr_fe_deinit(&s->chr_sec_in, false); + qemu_chr_fe_deinit(&s->chr_out, false); + if (s->notify_dev) { + qemu_chr_fe_deinit(&s->chr_notify_dev, false); + } + + if (s->iothread) { + colo_compare_timer_del(s); + } + + qemu_bh_delete(s->event_bh); + + QTAILQ_FOREACH(tmp, &net_compares, next) { + if (tmp == s) { + QTAILQ_REMOVE(&net_compares, s, next); + break; + } + } + + /* Release all unhandled packets after compare thead exited */ + g_queue_foreach(&s->conn_list, colo_flush_packets, s); + + g_queue_clear(&s->conn_list); + + if (s->connection_track_table) { + g_hash_table_destroy(s->connection_track_table); + } + + if (s->iothread) { + object_unref(OBJECT(s->iothread)); + } + + qemu_mutex_destroy(&event_mtx); + qemu_cond_destroy(&event_complete_cond); + + g_free(s->pri_indev); + g_free(s->sec_indev); + g_free(s->outdev); + g_free(s->notify_dev); +} + +static const TypeInfo colo_compare_info = { + .name = TYPE_COLO_COMPARE, + .parent = TYPE_OBJECT, + .instance_size = sizeof(CompareState), + .instance_init = colo_compare_init, + .instance_finalize = colo_compare_finalize, + .class_size = sizeof(CompareClass), + .class_init = colo_compare_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&colo_compare_info); +} + +type_init(register_types); diff --git a/net/colo-compare.h b/net/colo-compare.h new file mode 100644 index 000000000..22ddd512e --- /dev/null +++ b/net/colo-compare.h @@ -0,0 +1,24 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (c) 2017 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2017 FUJITSU LIMITED + * Copyright (c) 2017 Intel Corporation + * + * Authors: + * zhanghailiang <zhang.zhanghailiang@huawei.com> + * Zhang Chen <zhangckid@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. 
See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_COLO_COMPARE_H +#define QEMU_COLO_COMPARE_H + +void colo_notify_compares_event(void *opaque, int event, Error **errp); +void colo_compare_register_notifier(Notifier *notify); +void colo_compare_unregister_notifier(Notifier *notify); + +#endif /* QEMU_COLO_COMPARE_H */ diff --git a/net/colo.c b/net/colo.c new file mode 100644 index 000000000..8196b3583 --- /dev/null +++ b/net/colo.c @@ -0,0 +1,232 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 FUJITSU LIMITED + * Copyright (c) 2016 Intel Corporation + * + * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "trace.h" +#include "colo.h" +#include "util.h" + +uint32_t connection_key_hash(const void *opaque) +{ + const ConnectionKey *key = opaque; + uint32_t a, b, c; + + /* Jenkins hash */ + a = b = c = JHASH_INITVAL + sizeof(*key); + a += key->src.s_addr; + b += key->dst.s_addr; + c += (key->src_port | key->dst_port << 16); + __jhash_mix(a, b, c); + + a += key->ip_proto; + __jhash_final(a, b, c); + + return c; +} + +int connection_key_equal(const void *key1, const void *key2) +{ + return memcmp(key1, key2, sizeof(ConnectionKey)) == 0; +} + +int parse_packet_early(Packet *pkt) +{ + int network_length; + static const uint8_t vlan[] = {0x81, 0x00}; + uint8_t *data = pkt->data + pkt->vnet_hdr_len; + uint16_t l3_proto; + ssize_t l2hdr_len = eth_get_l2_hdr_length(data); + + if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) { + trace_colo_proxy_main("pkt->size < ETH_HLEN"); + return 1; + } + + /* + * TODO: support vlan. 
+ */ + if (!memcmp(&data[12], vlan, sizeof(vlan))) { + trace_colo_proxy_main("COLO-proxy don't support vlan"); + return 1; + } + + pkt->network_header = data + l2hdr_len; + + const struct iovec l2vec = { + .iov_base = (void *) data, + .iov_len = l2hdr_len + }; + l3_proto = eth_get_l3_proto(&l2vec, 1, l2hdr_len); + + if (l3_proto != ETH_P_IP) { + return 1; + } + + network_length = pkt->ip->ip_hl * 4; + if (pkt->size < l2hdr_len + network_length + pkt->vnet_hdr_len) { + trace_colo_proxy_main("pkt->size < network_header + network_length"); + return 1; + } + pkt->transport_header = pkt->network_header + network_length; + + return 0; +} + +void extract_ip_and_port(uint32_t tmp_ports, ConnectionKey *key, Packet *pkt) +{ + key->src = pkt->ip->ip_src; + key->dst = pkt->ip->ip_dst; + key->src_port = ntohs(tmp_ports >> 16); + key->dst_port = ntohs(tmp_ports & 0xffff); +} + +void fill_connection_key(Packet *pkt, ConnectionKey *key) +{ + uint32_t tmp_ports; + + memset(key, 0, sizeof(*key)); + key->ip_proto = pkt->ip->ip_p; + + switch (key->ip_proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + tmp_ports = *(uint32_t *)(pkt->transport_header); + extract_ip_and_port(tmp_ports, key, pkt); + break; + case IPPROTO_AH: + tmp_ports = *(uint32_t *)(pkt->transport_header + 4); + extract_ip_and_port(tmp_ports, key, pkt); + break; + default: + break; + } +} + +void reverse_connection_key(ConnectionKey *key) +{ + struct in_addr tmp_ip; + uint16_t tmp_port; + + tmp_ip = key->src; + key->src = key->dst; + key->dst = tmp_ip; + + tmp_port = key->src_port; + key->src_port = key->dst_port; + key->dst_port = tmp_port; +} + +Connection *connection_new(ConnectionKey *key) +{ + Connection *conn = g_slice_new(Connection); + + conn->ip_proto = key->ip_proto; + conn->processing = false; + conn->offset = 0; + conn->tcp_state = TCPS_CLOSED; + conn->pack = 0; + conn->sack = 0; + g_queue_init(&conn->primary_list); + 
g_queue_init(&conn->secondary_list); + + return conn; +} + +void connection_destroy(void *opaque) +{ + Connection *conn = opaque; + + g_queue_foreach(&conn->primary_list, packet_destroy, NULL); + g_queue_clear(&conn->primary_list); + g_queue_foreach(&conn->secondary_list, packet_destroy, NULL); + g_queue_clear(&conn->secondary_list); + g_slice_free(Connection, conn); +} + +Packet *packet_new(const void *data, int size, int vnet_hdr_len) +{ + Packet *pkt = g_slice_new(Packet); + + pkt->data = g_memdup(data, size); + pkt->size = size; + pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST); + pkt->vnet_hdr_len = vnet_hdr_len; + pkt->tcp_seq = 0; + pkt->tcp_ack = 0; + pkt->seq_end = 0; + pkt->header_size = 0; + pkt->payload_size = 0; + pkt->offset = 0; + pkt->flags = 0; + + return pkt; +} + +void packet_destroy(void *opaque, void *user_data) +{ + Packet *pkt = opaque; + + g_free(pkt->data); + g_slice_free(Packet, pkt); +} + +/* + * Clear hashtable, stop this hash growing really huge + */ +void connection_hashtable_reset(GHashTable *connection_track_table) +{ + g_hash_table_remove_all(connection_track_table); +} + +/* if not found, create a new connection and add to hash table */ +Connection *connection_get(GHashTable *connection_track_table, + ConnectionKey *key, + GQueue *conn_list) +{ + Connection *conn = g_hash_table_lookup(connection_track_table, key); + + if (conn == NULL) { + ConnectionKey *new_key = g_memdup(key, sizeof(*key)); + + conn = connection_new(key); + + if (g_hash_table_size(connection_track_table) > HASHTABLE_MAX_SIZE) { + trace_colo_proxy_main("colo proxy connection hashtable full," + " clear it"); + connection_hashtable_reset(connection_track_table); + /* + * clear the conn_list + */ + while (!g_queue_is_empty(conn_list)) { + connection_destroy(g_queue_pop_head(conn_list)); + } + } + + g_hash_table_insert(connection_track_table, new_key, conn); + } + + return conn; +} + +bool connection_has_tracked(GHashTable *connection_track_table, + 
ConnectionKey *key) +{ + Connection *conn = g_hash_table_lookup(connection_track_table, key); + + return conn ? true : false; +} diff --git a/net/colo.h b/net/colo.h new file mode 100644 index 000000000..679314b1c --- /dev/null +++ b/net/colo.h @@ -0,0 +1,106 @@ +/* + * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO) + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 FUJITSU LIMITED + * Copyright (c) 2016 Intel Corporation + * + * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef NET_COLO_H +#define NET_COLO_H + +#include "qemu/jhash.h" +#include "qemu/timer.h" +#include "net/eth.h" + +#define HASHTABLE_MAX_SIZE 16384 + +#ifndef IPPROTO_DCCP +#define IPPROTO_DCCP 33 +#endif + +#ifndef IPPROTO_SCTP +#define IPPROTO_SCTP 132 +#endif + +#ifndef IPPROTO_UDPLITE +#define IPPROTO_UDPLITE 136 +#endif + +typedef struct Packet { + void *data; + union { + uint8_t *network_header; + struct ip *ip; + }; + uint8_t *transport_header; + int size; + /* Time of packet creation, in wall clock ms */ + int64_t creation_ms; + /* Get vnet_hdr_len from filter */ + uint32_t vnet_hdr_len; + uint32_t tcp_seq; /* sequence number */ + uint32_t tcp_ack; /* acknowledgement number */ + /* the sequence number of the last byte of the packet */ + uint32_t seq_end; + uint8_t header_size; /* the header length */ + uint16_t payload_size; /* the payload length */ + /* record the payload offset(the length that has been compared) */ + uint16_t offset; + uint8_t flags; /* Flags(aka Control bits) */ +} Packet; + +typedef struct ConnectionKey { + /* (src, dst) must be grouped, in the same way than in IP header */ + struct in_addr src; + struct in_addr dst; + uint16_t src_port; + uint16_t dst_port; + uint8_t ip_proto; +} QEMU_PACKED ConnectionKey; + 
+typedef struct Connection { + /* connection primary send queue: element type: Packet */ + GQueue primary_list; + /* connection secondary send queue: element type: Packet */ + GQueue secondary_list; + /* flag to enqueue unprocessed_connections */ + bool processing; + uint8_t ip_proto; + /* record the sequence number that has been compared */ + uint32_t compare_seq; + /* the maximum of acknowledgement number in primary_list queue */ + uint32_t pack; + /* the maximum of acknowledgement number in secondary_list queue */ + uint32_t sack; + /* offset = secondary_seq - primary_seq */ + uint32_t offset; + + int tcp_state; /* TCP FSM state */ + uint32_t fin_ack_seq; /* the seq of 'fin=1,ack=1' */ +} Connection; + +uint32_t connection_key_hash(const void *opaque); +int connection_key_equal(const void *opaque1, const void *opaque2); +int parse_packet_early(Packet *pkt); +void extract_ip_and_port(uint32_t tmp_ports, ConnectionKey *key, Packet *pkt); +void fill_connection_key(Packet *pkt, ConnectionKey *key); +void reverse_connection_key(ConnectionKey *key); +Connection *connection_new(ConnectionKey *key); +void connection_destroy(void *opaque); +Connection *connection_get(GHashTable *connection_track_table, + ConnectionKey *key, + GQueue *conn_list); +bool connection_has_tracked(GHashTable *connection_track_table, + ConnectionKey *key); +void connection_hashtable_reset(GHashTable *connection_track_table); +Packet *packet_new(const void *data, int size, int vnet_hdr_len); +void packet_destroy(void *opaque, void *user_data); + +#endif /* NET_COLO_H */ diff --git a/net/dump.c b/net/dump.c index 411972172..23b3628dd 100644 --- a/net/dump.c +++ b/net/dump.c @@ -22,15 +22,19 @@ * THE SOFTWARE. 
*/ -#include "clients.h" +#include "qemu/osdep.h" #include "qemu-common.h" +#include "clients.h" +#include "qapi/error.h" #include "qemu/error-report.h" +#include "qemu/iov.h" #include "qemu/log.h" +#include "qemu/module.h" #include "qemu/timer.h" -#include "hub.h" +#include "qapi/visitor.h" +#include "net/filter.h" typedef struct DumpState { - NetClientState nc; int64_t start_ts; int fd; int pcap_caplen; @@ -57,28 +61,33 @@ struct pcap_sf_pkthdr { uint32_t len; }; -static ssize_t dump_receive(NetClientState *nc, const uint8_t *buf, size_t size) +static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt) { - DumpState *s = DO_UPCAST(DumpState, nc, nc); struct pcap_sf_pkthdr hdr; int64_t ts; int caplen; + size_t size = iov_size(iov, cnt); + struct iovec dumpiov[cnt + 1]; /* Early return in case of previous error. */ if (s->fd < 0) { return size; } - ts = muldiv64(qemu_get_clock_ns(vm_clock), 1000000, get_ticks_per_sec()); + ts = qemu_clock_get_us(QEMU_CLOCK_VIRTUAL); caplen = size > s->pcap_caplen ? 
s->pcap_caplen : size; hdr.ts.tv_sec = ts / 1000000 + s->start_ts; hdr.ts.tv_usec = ts % 1000000; hdr.caplen = caplen; hdr.len = size; - if (write(s->fd, &hdr, sizeof(hdr)) != sizeof(hdr) || - write(s->fd, buf, caplen) != caplen) { - qemu_log("-net dump write error - stop dump\n"); + + dumpiov[0].iov_base = &hdr; + dumpiov[0].iov_len = sizeof(hdr); + cnt = iov_copy(&dumpiov[1], cnt, iov, cnt, 0, caplen); + + if (writev(s->fd, dumpiov, cnt + 1) != sizeof(hdr) + caplen) { + error_report("network dump write error - stopping dump"); close(s->fd); s->fd = -1; } @@ -86,32 +95,22 @@ static ssize_t dump_receive(NetClientState *nc, const uint8_t *buf, size_t size) return size; } -static void dump_cleanup(NetClientState *nc) +static void dump_cleanup(DumpState *s) { - DumpState *s = DO_UPCAST(DumpState, nc, nc); - close(s->fd); + s->fd = -1; } -static NetClientInfo net_dump_info = { - .type = NET_CLIENT_OPTIONS_KIND_DUMP, - .size = sizeof(DumpState), - .receive = dump_receive, - .cleanup = dump_cleanup, -}; - -static int net_dump_init(NetClientState *peer, const char *device, - const char *name, const char *filename, int len) +static int net_dump_state_init(DumpState *s, const char *filename, + int len, Error **errp) { struct pcap_file_hdr hdr; - NetClientState *nc; - DumpState *s; struct tm tm; int fd; fd = open(filename, O_CREAT | O_TRUNC | O_WRONLY | O_BINARY, 0644); if (fd < 0) { - error_report("-net dump: can't open %s", filename); + error_setg_errno(errp, errno, "net dump: can't open %s", filename); return -1; } @@ -124,18 +123,11 @@ static int net_dump_init(NetClientState *peer, const char *device, hdr.linktype = 1; if (write(fd, &hdr, sizeof(hdr)) < sizeof(hdr)) { - error_report("-net dump write error: %s", strerror(errno)); + error_setg_errno(errp, errno, "net dump write error"); close(fd); return -1; } - nc = qemu_new_net_client(&net_dump_info, peer, device, name); - - snprintf(nc->info_str, sizeof(nc->info_str), - "dump to %s (len=%d)", filename, len); - - s = 
DO_UPCAST(DumpState, nc, nc); - s->fd = fd; s->pcap_caplen = len; @@ -145,41 +137,134 @@ static int net_dump_init(NetClientState *peer, const char *device, return 0; } -int net_init_dump(const NetClientOptions *opts, const char *name, - NetClientState *peer) +#define TYPE_FILTER_DUMP "filter-dump" + +#define FILTER_DUMP(obj) \ + OBJECT_CHECK(NetFilterDumpState, (obj), TYPE_FILTER_DUMP) + +struct NetFilterDumpState { + NetFilterState nfs; + DumpState ds; + char *filename; + uint32_t maxlen; +}; +typedef struct NetFilterDumpState NetFilterDumpState; + +static ssize_t filter_dump_receive_iov(NetFilterState *nf, NetClientState *sndr, + unsigned flags, const struct iovec *iov, + int iovcnt, NetPacketSent *sent_cb) { - int len; - const char *file; - char def_file[128]; - const NetdevDumpOptions *dump; + NetFilterDumpState *nfds = FILTER_DUMP(nf); - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_DUMP); - dump = opts->dump; + dump_receive_iov(&nfds->ds, iov, iovcnt); + return 0; +} - assert(peer); +static void filter_dump_cleanup(NetFilterState *nf) +{ + NetFilterDumpState *nfds = FILTER_DUMP(nf); - if (dump->has_file) { - file = dump->file; - } else { - int id; - int ret; + dump_cleanup(&nfds->ds); +} - ret = net_hub_id_for_client(peer, &id); - assert(ret == 0); /* peer must be on a hub */ +static void filter_dump_setup(NetFilterState *nf, Error **errp) +{ + NetFilterDumpState *nfds = FILTER_DUMP(nf); - snprintf(def_file, sizeof(def_file), "qemu-vlan%d.pcap", id); - file = def_file; + if (!nfds->filename) { + error_setg(errp, "dump filter needs 'file' property set!"); + return; } - if (dump->has_len) { - if (dump->len > INT_MAX) { - error_report("invalid length: %"PRIu64, dump->len); - return -1; - } - len = dump->len; - } else { - len = 65536; + net_dump_state_init(&nfds->ds, nfds->filename, nfds->maxlen, errp); +} + +static void filter_dump_get_maxlen(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + NetFilterDumpState *nfds = FILTER_DUMP(obj); 
+ uint32_t value = nfds->maxlen; + + visit_type_uint32(v, name, &value, errp); +} + +static void filter_dump_set_maxlen(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + NetFilterDumpState *nfds = FILTER_DUMP(obj); + Error *local_err = NULL; + uint32_t value; + + visit_type_uint32(v, name, &value, &local_err); + if (local_err) { + goto out; + } + if (value == 0) { + error_setg(&local_err, "Property '%s.%s' doesn't take value '%u'", + object_get_typename(obj), name, value); + goto out; } + nfds->maxlen = value; - return net_dump_init(peer, "dump", name, file, len); +out: + error_propagate(errp, local_err); } + +static char *file_dump_get_filename(Object *obj, Error **errp) +{ + NetFilterDumpState *nfds = FILTER_DUMP(obj); + + return g_strdup(nfds->filename); +} + +static void file_dump_set_filename(Object *obj, const char *value, Error **errp) +{ + NetFilterDumpState *nfds = FILTER_DUMP(obj); + + g_free(nfds->filename); + nfds->filename = g_strdup(value); +} + +static void filter_dump_instance_init(Object *obj) +{ + NetFilterDumpState *nfds = FILTER_DUMP(obj); + + nfds->maxlen = 65536; + + object_property_add(obj, "maxlen", "uint32", filter_dump_get_maxlen, + filter_dump_set_maxlen, NULL, NULL, NULL); + object_property_add_str(obj, "file", file_dump_get_filename, + file_dump_set_filename, NULL); +} + +static void filter_dump_instance_finalize(Object *obj) +{ + NetFilterDumpState *nfds = FILTER_DUMP(obj); + + g_free(nfds->filename); +} + +static void filter_dump_class_init(ObjectClass *oc, void *data) +{ + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + nfc->setup = filter_dump_setup; + nfc->cleanup = filter_dump_cleanup; + nfc->receive_iov = filter_dump_receive_iov; +} + +static const TypeInfo filter_dump_info = { + .name = TYPE_FILTER_DUMP, + .parent = TYPE_NETFILTER, + .class_init = filter_dump_class_init, + .instance_init = filter_dump_instance_init, + .instance_finalize = filter_dump_instance_finalize, + .instance_size = 
sizeof(NetFilterDumpState), +}; + +static void filter_dump_register_types(void) +{ + type_register_static(&filter_dump_info); +} + +type_init(filter_dump_register_types); @@ -15,13 +15,14 @@ * */ +#include "qemu/osdep.h" +#include "qemu/log.h" #include "net/eth.h" #include "net/checksum.h" -#include "qemu-common.h" #include "net/tap.h" -void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, - bool *is_new) +void eth_setup_vlan_headers_ex(struct eth_header *ehdr, uint16_t vlan_tag, + uint16_t vlan_ethtype, bool *is_new) { struct vlan_header *vhdr = PKT_GET_VLAN_HDR(ehdr); @@ -35,7 +36,7 @@ void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag, default: /* No VLAN header, put a new one */ vhdr->h_proto = ehdr->h_proto; - ehdr->h_proto = cpu_to_be16(ETH_P_VLAN); + ehdr->h_proto = cpu_to_be16(vlan_ethtype); *is_new = true; break; } @@ -71,33 +72,106 @@ eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto) return VIRTIO_NET_HDR_GSO_TCPV6 | ecn_state; } } - - /* Unsupported offload */ - g_assert_not_reached(); + qemu_log_mask(LOG_UNIMP, "%s: probably not GSO frame, " + "unknown L3 protocol: 0x%04"PRIx16"\n", __func__, l3_proto); return VIRTIO_NET_HDR_GSO_NONE | ecn_state; } -void eth_get_protocols(const uint8_t *headers, - uint32_t hdr_length, +uint16_t +eth_get_l3_proto(const struct iovec *l2hdr_iov, int iovcnt, size_t l2hdr_len) +{ + uint16_t proto; + size_t copied; + size_t size = iov_size(l2hdr_iov, iovcnt); + size_t proto_offset = l2hdr_len - sizeof(proto); + + if (size < proto_offset) { + return ETH_P_UNKNOWN; + } + + copied = iov_to_buf(l2hdr_iov, iovcnt, proto_offset, + &proto, sizeof(proto)); + + return (copied == sizeof(proto)) ? 
be16_to_cpu(proto) : ETH_P_UNKNOWN; +} + +static bool +_eth_copy_chunk(size_t input_size, + const struct iovec *iov, int iovcnt, + size_t offset, size_t length, + void *buffer) +{ + size_t copied; + + if (input_size < offset) { + return false; + } + + copied = iov_to_buf(iov, iovcnt, offset, buffer, length); + + if (copied < length) { + return false; + } + + return true; +} + +static bool +_eth_tcp_has_data(bool is_ip4, + const struct ip_header *ip4_hdr, + const struct ip6_header *ip6_hdr, + size_t full_ip6hdr_len, + const struct tcp_header *tcp) +{ + uint32_t l4len; + + if (is_ip4) { + l4len = be16_to_cpu(ip4_hdr->ip_len) - IP_HDR_GET_LEN(ip4_hdr); + } else { + size_t opts_len = full_ip6hdr_len - sizeof(struct ip6_header); + l4len = be16_to_cpu(ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen) - opts_len; + } + + return l4len > TCP_HEADER_DATA_OFFSET(tcp); +} + +void eth_get_protocols(const struct iovec *iov, int iovcnt, bool *isip4, bool *isip6, - bool *isudp, bool *istcp) + bool *isudp, bool *istcp, + size_t *l3hdr_off, + size_t *l4hdr_off, + size_t *l5hdr_off, + eth_ip6_hdr_info *ip6hdr_info, + eth_ip4_hdr_info *ip4hdr_info, + eth_l4_hdr_info *l4hdr_info) { int proto; - size_t l2hdr_len = eth_get_l2_hdr_length(headers); - assert(hdr_length >= eth_get_l2_hdr_length(headers)); + bool fragment = false; + size_t l2hdr_len = eth_get_l2_hdr_length_iov(iov, iovcnt); + size_t input_size = iov_size(iov, iovcnt); + size_t copied; + *isip4 = *isip6 = *isudp = *istcp = false; - proto = eth_get_l3_proto(headers, l2hdr_len); + proto = eth_get_l3_proto(iov, iovcnt, l2hdr_len); + + *l3hdr_off = l2hdr_len; + if (proto == ETH_P_IP) { - *isip4 = true; + struct ip_header *iphdr = &ip4hdr_info->ip4_hdr; + + if (input_size < l2hdr_len) { + return; + } - struct ip_header *iphdr; + copied = iov_to_buf(iov, iovcnt, l2hdr_len, iphdr, sizeof(*iphdr)); - assert(hdr_length >= - eth_get_l2_hdr_length(headers) + sizeof(struct ip_header)); + *isip4 = true; - iphdr = PKT_GET_IP_HDR(headers); + if 
(copied < sizeof(*iphdr)) { + return; + } if (IP_HEADER_VERSION(iphdr) == IP_HEADER_VERSION_4) { if (iphdr->ip_p == IP_PROTO_TCP) { @@ -106,33 +180,152 @@ void eth_get_protocols(const uint8_t *headers, *isudp = true; } } - } else if (proto == ETH_P_IPV6) { - uint8_t l4proto; - size_t full_ip6hdr_len; - struct iovec hdr_vec; - hdr_vec.iov_base = (void *) headers; - hdr_vec.iov_len = hdr_length; + ip4hdr_info->fragment = IP4_IS_FRAGMENT(iphdr); + *l4hdr_off = l2hdr_len + IP_HDR_GET_LEN(iphdr); + + fragment = ip4hdr_info->fragment; + } else if (proto == ETH_P_IPV6) { *isip6 = true; - if (eth_parse_ipv6_hdr(&hdr_vec, 1, l2hdr_len, - &l4proto, &full_ip6hdr_len)) { - if (l4proto == IP_PROTO_TCP) { + if (eth_parse_ipv6_hdr(iov, iovcnt, l2hdr_len, + ip6hdr_info)) { + if (ip6hdr_info->l4proto == IP_PROTO_TCP) { *istcp = true; - } else if (l4proto == IP_PROTO_UDP) { + } else if (ip6hdr_info->l4proto == IP_PROTO_UDP) { *isudp = true; } + } else { + return; + } + + *l4hdr_off = l2hdr_len + ip6hdr_info->full_hdr_len; + fragment = ip6hdr_info->fragment; + } + + if (!fragment) { + if (*istcp) { + *istcp = _eth_copy_chunk(input_size, + iov, iovcnt, + *l4hdr_off, sizeof(l4hdr_info->hdr.tcp), + &l4hdr_info->hdr.tcp); + + if (*istcp) { + *l5hdr_off = *l4hdr_off + + TCP_HEADER_DATA_OFFSET(&l4hdr_info->hdr.tcp); + + l4hdr_info->has_tcp_data = + _eth_tcp_has_data(proto == ETH_P_IP, + &ip4hdr_info->ip4_hdr, + &ip6hdr_info->ip6_hdr, + *l4hdr_off - *l3hdr_off, + &l4hdr_info->hdr.tcp); + } + } else if (*isudp) { + *isudp = _eth_copy_chunk(input_size, + iov, iovcnt, + *l4hdr_off, sizeof(l4hdr_info->hdr.udp), + &l4hdr_info->hdr.udp); + *l5hdr_off = *l4hdr_off + sizeof(l4hdr_info->hdr.udp); } } } +size_t +eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff, + uint8_t *new_ehdr_buf, + uint16_t *payload_offset, uint16_t *tci) +{ + struct vlan_header vlan_hdr; + struct eth_header *new_ehdr = (struct eth_header *) new_ehdr_buf; + + size_t copied = iov_to_buf(iov, iovcnt, iovoff, + 
new_ehdr, sizeof(*new_ehdr)); + + if (copied < sizeof(*new_ehdr)) { + return 0; + } + + switch (be16_to_cpu(new_ehdr->h_proto)) { + case ETH_P_VLAN: + case ETH_P_DVLAN: + copied = iov_to_buf(iov, iovcnt, iovoff + sizeof(*new_ehdr), + &vlan_hdr, sizeof(vlan_hdr)); + + if (copied < sizeof(vlan_hdr)) { + return 0; + } + + new_ehdr->h_proto = vlan_hdr.h_proto; + + *tci = be16_to_cpu(vlan_hdr.h_tci); + *payload_offset = iovoff + sizeof(*new_ehdr) + sizeof(vlan_hdr); + + if (be16_to_cpu(new_ehdr->h_proto) == ETH_P_VLAN) { + + copied = iov_to_buf(iov, iovcnt, *payload_offset, + PKT_GET_VLAN_HDR(new_ehdr), sizeof(vlan_hdr)); + + if (copied < sizeof(vlan_hdr)) { + return 0; + } + + *payload_offset += sizeof(vlan_hdr); + + return sizeof(struct eth_header) + sizeof(struct vlan_header); + } else { + return sizeof(struct eth_header); + } + default: + return 0; + } +} + +size_t +eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff, + uint16_t vet, uint8_t *new_ehdr_buf, + uint16_t *payload_offset, uint16_t *tci) +{ + struct vlan_header vlan_hdr; + struct eth_header *new_ehdr = (struct eth_header *) new_ehdr_buf; + + size_t copied = iov_to_buf(iov, iovcnt, iovoff, + new_ehdr, sizeof(*new_ehdr)); + + if (copied < sizeof(*new_ehdr)) { + return 0; + } + + if (be16_to_cpu(new_ehdr->h_proto) == vet) { + copied = iov_to_buf(iov, iovcnt, iovoff + sizeof(*new_ehdr), + &vlan_hdr, sizeof(vlan_hdr)); + + if (copied < sizeof(vlan_hdr)) { + return 0; + } + + new_ehdr->h_proto = vlan_hdr.h_proto; + + *tci = be16_to_cpu(vlan_hdr.h_tci); + *payload_offset = iovoff + sizeof(*new_ehdr) + sizeof(vlan_hdr); + return sizeof(struct eth_header); + } + + return 0; +} + void eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len, void *l3hdr, size_t l3hdr_len, size_t l3payload_len, size_t frag_offset, bool more_frags) { - if (eth_get_l3_proto(l2hdr, l2hdr_len) == ETH_P_IP) { + const struct iovec l2vec = { + .iov_base = (void *) l2hdr, + .iov_len = l2hdr_len + }; + + if 
(eth_get_l3_proto(&l2vec, 1, l2hdr_len) == ETH_P_IP) { uint16_t orig_flags; struct ip_header *iphdr = (struct ip_header *) l3hdr; uint16_t frag_off_units = frag_offset / IP_FRAG_UNIT_SIZE; @@ -157,7 +350,9 @@ eth_fix_ip4_checksum(void *l3hdr, size_t l3hdr_len) } uint32_t -eth_calc_pseudo_hdr_csum(struct ip_header *iphdr, uint16_t csl) +eth_calc_ip4_pseudo_hdr_csum(struct ip_header *iphdr, + uint16_t csl, + uint32_t *cso) { struct ip_pseudo_header ipph; ipph.ip_src = iphdr->ip_src; @@ -165,7 +360,26 @@ eth_calc_pseudo_hdr_csum(struct ip_header *iphdr, uint16_t csl) ipph.ip_payload = cpu_to_be16(csl); ipph.ip_proto = iphdr->ip_p; ipph.zeros = 0; - return net_checksum_add(sizeof(ipph), (uint8_t *) &ipph); + *cso = sizeof(ipph); + return net_checksum_add(*cso, (uint8_t *) &ipph); +} + +uint32_t +eth_calc_ip6_pseudo_hdr_csum(struct ip6_header *iphdr, + uint16_t csl, + uint8_t l4_proto, + uint32_t *cso) +{ + struct ip6_pseudo_header ipph; + ipph.ip6_src = iphdr->ip6_src; + ipph.ip6_dst = iphdr->ip6_dst; + ipph.len = cpu_to_be16(csl); + ipph.zero[0] = 0; + ipph.zero[1] = 0; + ipph.zero[2] = 0; + ipph.next_hdr = l4_proto; + *cso = sizeof(ipph); + return net_checksum_add(*cso, (uint8_t *)&ipph); } static bool @@ -185,33 +399,152 @@ eth_is_ip6_extension_header_type(uint8_t hdr_type) } } -bool eth_parse_ipv6_hdr(struct iovec *pkt, int pkt_frags, - size_t ip6hdr_off, uint8_t *l4proto, - size_t *full_hdr_len) +static bool +_eth_get_rss_ex_dst_addr(const struct iovec *pkt, int pkt_frags, + size_t rthdr_offset, + struct ip6_ext_hdr *ext_hdr, + struct in6_address *dst_addr) +{ + struct ip6_ext_hdr_routing *rthdr = (struct ip6_ext_hdr_routing *) ext_hdr; + + if ((rthdr->rtype == 2) && + (rthdr->len == sizeof(struct in6_address) / 8) && + (rthdr->segleft == 1)) { + + size_t input_size = iov_size(pkt, pkt_frags); + size_t bytes_read; + + if (input_size < rthdr_offset + sizeof(*ext_hdr)) { + return false; + } + + bytes_read = iov_to_buf(pkt, pkt_frags, + rthdr_offset + 
sizeof(*ext_hdr), + dst_addr, sizeof(*dst_addr)); + + return bytes_read == sizeof(*dst_addr); + } + + return false; +} + +static bool +_eth_get_rss_ex_src_addr(const struct iovec *pkt, int pkt_frags, + size_t dsthdr_offset, + struct ip6_ext_hdr *ext_hdr, + struct in6_address *src_addr) +{ + size_t bytes_left = (ext_hdr->ip6r_len + 1) * 8 - sizeof(*ext_hdr); + struct ip6_option_hdr opthdr; + size_t opt_offset = dsthdr_offset + sizeof(*ext_hdr); + + while (bytes_left > sizeof(opthdr)) { + size_t input_size = iov_size(pkt, pkt_frags); + size_t bytes_read, optlen; + + if (input_size < opt_offset) { + return false; + } + + bytes_read = iov_to_buf(pkt, pkt_frags, opt_offset, + &opthdr, sizeof(opthdr)); + + if (bytes_read != sizeof(opthdr)) { + return false; + } + + optlen = (opthdr.type == IP6_OPT_PAD1) ? 1 + : (opthdr.len + sizeof(opthdr)); + + if (optlen > bytes_left) { + return false; + } + + if (opthdr.type == IP6_OPT_HOME) { + size_t input_size = iov_size(pkt, pkt_frags); + + if (input_size < opt_offset + sizeof(opthdr)) { + return false; + } + + bytes_read = iov_to_buf(pkt, pkt_frags, + opt_offset + sizeof(opthdr), + src_addr, sizeof(*src_addr)); + + return bytes_read == sizeof(*src_addr); + } + + opt_offset += optlen; + bytes_left -= optlen; + } + + return false; +} + +bool eth_parse_ipv6_hdr(const struct iovec *pkt, int pkt_frags, + size_t ip6hdr_off, eth_ip6_hdr_info *info) { - struct ip6_header ip6_hdr; struct ip6_ext_hdr ext_hdr; size_t bytes_read; + uint8_t curr_ext_hdr_type; + size_t input_size = iov_size(pkt, pkt_frags); + + info->rss_ex_dst_valid = false; + info->rss_ex_src_valid = false; + info->fragment = false; + + if (input_size < ip6hdr_off) { + return false; + } bytes_read = iov_to_buf(pkt, pkt_frags, ip6hdr_off, - &ip6_hdr, sizeof(ip6_hdr)); - if (bytes_read < sizeof(ip6_hdr)) { + &info->ip6_hdr, sizeof(info->ip6_hdr)); + if (bytes_read < sizeof(info->ip6_hdr)) { return false; } - *full_hdr_len = sizeof(struct ip6_header); + info->full_hdr_len = 
sizeof(struct ip6_header); + + curr_ext_hdr_type = info->ip6_hdr.ip6_nxt; - if (!eth_is_ip6_extension_header_type(ip6_hdr.ip6_nxt)) { - *l4proto = ip6_hdr.ip6_nxt; + if (!eth_is_ip6_extension_header_type(curr_ext_hdr_type)) { + info->l4proto = info->ip6_hdr.ip6_nxt; + info->has_ext_hdrs = false; return true; } + info->has_ext_hdrs = true; + do { - bytes_read = iov_to_buf(pkt, pkt_frags, ip6hdr_off + *full_hdr_len, + if (input_size < ip6hdr_off + info->full_hdr_len) { + return false; + } + + bytes_read = iov_to_buf(pkt, pkt_frags, ip6hdr_off + info->full_hdr_len, &ext_hdr, sizeof(ext_hdr)); - *full_hdr_len += (ext_hdr.ip6r_len + 1) * IP6_EXT_GRANULARITY; - } while (eth_is_ip6_extension_header_type(ext_hdr.ip6r_nxt)); - *l4proto = ext_hdr.ip6r_nxt; + if (bytes_read < sizeof(ext_hdr)) { + return false; + } + + if (curr_ext_hdr_type == IP6_ROUTING) { + info->rss_ex_dst_valid = + _eth_get_rss_ex_dst_addr(pkt, pkt_frags, + ip6hdr_off + info->full_hdr_len, + &ext_hdr, &info->rss_ex_dst); + } else if (curr_ext_hdr_type == IP6_DESTINATON) { + info->rss_ex_src_valid = + _eth_get_rss_ex_src_addr(pkt, pkt_frags, + ip6hdr_off + info->full_hdr_len, + &ext_hdr, &info->rss_ex_src); + } else if (curr_ext_hdr_type == IP6_FRAGMENT) { + info->fragment = true; + } + + info->full_hdr_len += (ext_hdr.ip6r_len + 1) * IP6_EXT_GRANULARITY; + curr_ext_hdr_type = ext_hdr.ip6r_nxt; + } while (eth_is_ip6_extension_header_type(curr_ext_hdr_type)); + + info->l4proto = ext_hdr.ip6r_nxt; return true; } diff --git a/net/filter-buffer.c b/net/filter-buffer.c new file mode 100644 index 000000000..88da78f82 --- /dev/null +++ b/net/filter-buffer.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2015 FUJITSU LIMITED + * Author: Yang Hongyang <yanghy@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "net/filter.h" +#include "net/queue.h" +#include "qapi/error.h" +#include "qemu/timer.h" +#include "qemu/iov.h" +#include "qapi/qapi-builtin-visit.h" +#include "qapi/qmp/qerror.h" +#include "qom/object.h" + +#define TYPE_FILTER_BUFFER "filter-buffer" + +#define FILTER_BUFFER(obj) \ + OBJECT_CHECK(FilterBufferState, (obj), TYPE_FILTER_BUFFER) + +typedef struct FilterBufferState { + NetFilterState parent_obj; + + NetQueue *incoming_queue; + uint32_t interval; + QEMUTimer release_timer; +} FilterBufferState; + +static void filter_buffer_flush(NetFilterState *nf) +{ + FilterBufferState *s = FILTER_BUFFER(nf); + + if (!qemu_net_queue_flush(s->incoming_queue)) { + /* Unable to empty the queue, purge remaining packets */ + qemu_net_queue_purge(s->incoming_queue, nf->netdev); + } +} + +static void filter_buffer_release_timer(void *opaque) +{ + NetFilterState *nf = opaque; + FilterBufferState *s = FILTER_BUFFER(nf); + + /* + * Note: filter_buffer_flush() drops packets that can't be sent + * TODO: We should leave them queued. But currently there's no way + * for the next filter or receiver to notify us that it can receive + * more packets. + */ + filter_buffer_flush(nf); + /* Timer rearmed to fire again in s->interval microseconds. */ + timer_mod(&s->release_timer, + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval); +} + +/* filter APIs */ +static ssize_t filter_buffer_receive_iov(NetFilterState *nf, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) +{ + FilterBufferState *s = FILTER_BUFFER(nf); + + /* + * We return size when buffer a packet, the sender will take it as + * a already sent packet, so sent_cb should not be called later. + * + * FIXME: Even if the guest can't receive packets for some reasons, + * the filter can still accept packets until its internal queue is full. 
+ * For example: + * For some reason, receiver could not receive more packets + * (.can_receive() returns zero). Without a filter, at most one packet + * will be queued in incoming queue and sender's poll will be disabled + * unit its sent_cb() was called. With a filter, it will keep receiving + * the packets without caring about the receiver. This is suboptimal. + * May need more thoughts (e.g keeping sent_cb). + */ + qemu_net_queue_append_iov(s->incoming_queue, sender, flags, + iov, iovcnt, NULL); + return iov_size(iov, iovcnt); +} + +static void filter_buffer_cleanup(NetFilterState *nf) +{ + FilterBufferState *s = FILTER_BUFFER(nf); + + if (s->interval) { + timer_del(&s->release_timer); + } + + /* flush packets */ + if (s->incoming_queue) { + filter_buffer_flush(nf); + g_free(s->incoming_queue); + } +} + +static void filter_buffer_setup_timer(NetFilterState *nf) +{ + FilterBufferState *s = FILTER_BUFFER(nf); + + if (s->interval) { + timer_init_us(&s->release_timer, QEMU_CLOCK_VIRTUAL, + filter_buffer_release_timer, nf); + /* Timer armed to fire in s->interval microseconds. */ + timer_mod(&s->release_timer, + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval); + } +} + +static void filter_buffer_setup(NetFilterState *nf, Error **errp) +{ + FilterBufferState *s = FILTER_BUFFER(nf); + + /* + * We may want to accept zero interval when VM FT solutions like MC + * or COLO use this filter to release packets on demand. 
+ */ + if (!s->interval) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval", + "a non-zero interval"); + return; + } + + s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf); + filter_buffer_setup_timer(nf); +} + +static void filter_buffer_status_changed(NetFilterState *nf, Error **errp) +{ + FilterBufferState *s = FILTER_BUFFER(nf); + + if (!nf->on) { + if (s->interval) { + timer_del(&s->release_timer); + } + filter_buffer_flush(nf); + } else { + filter_buffer_setup_timer(nf); + } +} + +static void filter_buffer_class_init(ObjectClass *oc, void *data) +{ + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + nfc->setup = filter_buffer_setup; + nfc->cleanup = filter_buffer_cleanup; + nfc->receive_iov = filter_buffer_receive_iov; + nfc->status_changed = filter_buffer_status_changed; +} + +static void filter_buffer_get_interval(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + FilterBufferState *s = FILTER_BUFFER(obj); + uint32_t value = s->interval; + + visit_type_uint32(v, name, &value, errp); +} + +static void filter_buffer_set_interval(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + FilterBufferState *s = FILTER_BUFFER(obj); + Error *local_err = NULL; + uint32_t value; + + visit_type_uint32(v, name, &value, &local_err); + if (local_err) { + goto out; + } + if (!value) { + error_setg(&local_err, "Property '%s.%s' requires a positive value", + object_get_typename(obj), name); + goto out; + } + s->interval = value; + +out: + error_propagate(errp, local_err); +} + +static void filter_buffer_init(Object *obj) +{ + object_property_add(obj, "interval", "uint32", + filter_buffer_get_interval, + filter_buffer_set_interval, NULL, NULL, NULL); +} + +static const TypeInfo filter_buffer_info = { + .name = TYPE_FILTER_BUFFER, + .parent = TYPE_NETFILTER, + .class_init = filter_buffer_class_init, + .instance_init = filter_buffer_init, + .instance_size = sizeof(FilterBufferState), +}; + +static 
void register_types(void) +{ + type_register_static(&filter_buffer_info); +} + +type_init(register_types); diff --git a/net/filter-mirror.c b/net/filter-mirror.c new file mode 100644 index 000000000..8d36009c5 --- /dev/null +++ b/net/filter-mirror.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 FUJITSU LIMITED + * Copyright (c) 2016 Intel Corporation + * + * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "net/filter.h" +#include "net/net.h" +#include "qapi/error.h" +#include "qom/object.h" +#include "qemu/main-loop.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "chardev/char-fe.h" +#include "qemu/iov.h" +#include "qemu/sockets.h" + +#define FILTER_MIRROR(obj) \ + OBJECT_CHECK(MirrorState, (obj), TYPE_FILTER_MIRROR) + +#define FILTER_REDIRECTOR(obj) \ + OBJECT_CHECK(MirrorState, (obj), TYPE_FILTER_REDIRECTOR) + +#define TYPE_FILTER_MIRROR "filter-mirror" +#define TYPE_FILTER_REDIRECTOR "filter-redirector" +#define REDIRECTOR_MAX_LEN NET_BUFSIZE + +typedef struct MirrorState { + NetFilterState parent_obj; + char *indev; + char *outdev; + CharBackend chr_in; + CharBackend chr_out; + SocketReadState rs; + bool vnet_hdr; +} MirrorState; + +static int filter_send(MirrorState *s, + const struct iovec *iov, + int iovcnt) +{ + NetFilterState *nf = NETFILTER(s); + int ret = 0; + ssize_t size = 0; + uint32_t len = 0; + char *buf; + + size = iov_size(iov, iovcnt); + if (!size) { + return 0; + } + + len = htonl(size); + ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)&len, sizeof(len)); + if (ret != sizeof(len)) { + goto err; + } + + if (s->vnet_hdr) { + /* + * If vnet_hdr = on, we send vnet header len to make other + * module(like colo-compare) know how to parse net + * packet correctly. 
+ */ + ssize_t vnet_hdr_len; + + vnet_hdr_len = nf->netdev->vnet_hdr_len; + + len = htonl(vnet_hdr_len); + ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)&len, sizeof(len)); + if (ret != sizeof(len)) { + goto err; + } + } + + buf = g_malloc(size); + iov_to_buf(iov, iovcnt, 0, buf, size); + ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)buf, size); + g_free(buf); + if (ret != size) { + goto err; + } + + return 0; + +err: + return ret < 0 ? ret : -EIO; +} + +static void redirector_to_filter(NetFilterState *nf, + const uint8_t *buf, + int len) +{ + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = len, + }; + + if (nf->direction == NET_FILTER_DIRECTION_ALL || + nf->direction == NET_FILTER_DIRECTION_TX) { + qemu_netfilter_pass_to_next(nf->netdev, 0, &iov, 1, nf); + } + + if (nf->direction == NET_FILTER_DIRECTION_ALL || + nf->direction == NET_FILTER_DIRECTION_RX) { + qemu_netfilter_pass_to_next(nf->netdev->peer, 0, &iov, 1, nf); + } +} + +static int redirector_chr_can_read(void *opaque) +{ + return REDIRECTOR_MAX_LEN; +} + +static void redirector_chr_read(void *opaque, const uint8_t *buf, int size) +{ + NetFilterState *nf = opaque; + MirrorState *s = FILTER_REDIRECTOR(nf); + int ret; + + ret = net_fill_rstate(&s->rs, buf, size); + + if (ret == -1) { + qemu_chr_fe_set_handlers(&s->chr_in, NULL, NULL, NULL, + NULL, NULL, NULL, true); + } +} + +static void redirector_chr_event(void *opaque, int event) +{ + NetFilterState *nf = opaque; + MirrorState *s = FILTER_REDIRECTOR(nf); + + switch (event) { + case CHR_EVENT_CLOSED: + qemu_chr_fe_set_handlers(&s->chr_in, NULL, NULL, NULL, + NULL, NULL, NULL, true); + break; + default: + break; + } +} + +static ssize_t filter_mirror_receive_iov(NetFilterState *nf, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) +{ + MirrorState *s = FILTER_MIRROR(nf); + int ret; + + ret = filter_send(s, iov, iovcnt); + if (ret) { + error_report("filter mirror send 
failed(%s)", strerror(-ret)); + } + + /* + * we don't hope this error interrupt the normal + * path of net packet, so we always return zero. + */ + return 0; +} + +static ssize_t filter_redirector_receive_iov(NetFilterState *nf, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) +{ + MirrorState *s = FILTER_REDIRECTOR(nf); + int ret; + + if (qemu_chr_fe_backend_connected(&s->chr_out)) { + ret = filter_send(s, iov, iovcnt); + if (ret) { + error_report("filter redirector send failed(%s)", strerror(-ret)); + } + return iov_size(iov, iovcnt); + } else { + return 0; + } +} + +static void filter_mirror_cleanup(NetFilterState *nf) +{ + MirrorState *s = FILTER_MIRROR(nf); + + qemu_chr_fe_deinit(&s->chr_out, false); +} + +static void filter_redirector_cleanup(NetFilterState *nf) +{ + MirrorState *s = FILTER_REDIRECTOR(nf); + + qemu_chr_fe_deinit(&s->chr_in, false); + qemu_chr_fe_deinit(&s->chr_out, false); +} + +static void filter_mirror_setup(NetFilterState *nf, Error **errp) +{ + MirrorState *s = FILTER_MIRROR(nf); + Chardev *chr; + + if (s->outdev == NULL) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "filter-mirror parameter"\ + " 'outdev' cannot be empty"); + return; + } + + chr = qemu_chr_find(s->outdev); + if (chr == NULL) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, + "Device '%s' not found", s->outdev); + return; + } + + qemu_chr_fe_init(&s->chr_out, chr, errp); +} + +static void redirector_rs_finalize(SocketReadState *rs) +{ + MirrorState *s = container_of(rs, MirrorState, rs); + NetFilterState *nf = NETFILTER(s); + + redirector_to_filter(nf, rs->buf, rs->packet_len); +} + +static void filter_redirector_setup(NetFilterState *nf, Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(nf); + Chardev *chr; + + if (!s->indev && !s->outdev) { + error_setg(errp, "filter redirector needs 'indev' or " + "'outdev' at least one property set"); + return; + } else if (s->indev && s->outdev) { + if 
(!strcmp(s->indev, s->outdev)) { + error_setg(errp, "'indev' and 'outdev' could not be same " + "for filter redirector"); + return; + } + } + + net_socket_rs_init(&s->rs, redirector_rs_finalize, s->vnet_hdr); + + if (s->indev) { + chr = qemu_chr_find(s->indev); + if (chr == NULL) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, + "IN Device '%s' not found", s->indev); + return; + } + + if (!qemu_chr_fe_init(&s->chr_in, chr, errp)) { + return; + } + + qemu_chr_fe_set_handlers(&s->chr_in, redirector_chr_can_read, + redirector_chr_read, redirector_chr_event, + NULL, nf, NULL, true); + } + + if (s->outdev) { + chr = qemu_chr_find(s->outdev); + if (chr == NULL) { + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, + "OUT Device '%s' not found", s->outdev); + return; + } + if (!qemu_chr_fe_init(&s->chr_out, chr, errp)) { + return; + } + } +} + +static void filter_mirror_class_init(ObjectClass *oc, void *data) +{ + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + nfc->setup = filter_mirror_setup; + nfc->cleanup = filter_mirror_cleanup; + nfc->receive_iov = filter_mirror_receive_iov; +} + +static void filter_redirector_class_init(ObjectClass *oc, void *data) +{ + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + nfc->setup = filter_redirector_setup; + nfc->cleanup = filter_redirector_cleanup; + nfc->receive_iov = filter_redirector_receive_iov; +} + +static char *filter_redirector_get_indev(Object *obj, Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + return g_strdup(s->indev); +} + +static void filter_redirector_set_indev(Object *obj, + const char *value, + Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + g_free(s->indev); + s->indev = g_strdup(value); +} + +static char *filter_mirror_get_outdev(Object *obj, Error **errp) +{ + MirrorState *s = FILTER_MIRROR(obj); + + return g_strdup(s->outdev); +} + +static void filter_mirror_set_outdev(Object *obj, + const char *value, + Error **errp) +{ + MirrorState *s = FILTER_MIRROR(obj); + + g_free(s->outdev); + 
s->outdev = g_strdup(value); + if (!s->outdev) { + error_setg(errp, "filter mirror needs 'outdev' " + "property set"); + return; + } +} + +static bool filter_mirror_get_vnet_hdr(Object *obj, Error **errp) +{ + MirrorState *s = FILTER_MIRROR(obj); + + return s->vnet_hdr; +} + +static void filter_mirror_set_vnet_hdr(Object *obj, bool value, Error **errp) +{ + MirrorState *s = FILTER_MIRROR(obj); + + s->vnet_hdr = value; +} + +static char *filter_redirector_get_outdev(Object *obj, Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + return g_strdup(s->outdev); +} + +static void filter_redirector_set_outdev(Object *obj, + const char *value, + Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + g_free(s->outdev); + s->outdev = g_strdup(value); +} + +static bool filter_redirector_get_vnet_hdr(Object *obj, Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + return s->vnet_hdr; +} + +static void filter_redirector_set_vnet_hdr(Object *obj, + bool value, + Error **errp) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + s->vnet_hdr = value; +} + +static void filter_mirror_init(Object *obj) +{ + MirrorState *s = FILTER_MIRROR(obj); + + object_property_add_str(obj, "outdev", filter_mirror_get_outdev, + filter_mirror_set_outdev, NULL); + + s->vnet_hdr = false; + object_property_add_bool(obj, "vnet_hdr_support", + filter_mirror_get_vnet_hdr, + filter_mirror_set_vnet_hdr, NULL); +} + +static void filter_redirector_init(Object *obj) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + object_property_add_str(obj, "indev", filter_redirector_get_indev, + filter_redirector_set_indev, NULL); + object_property_add_str(obj, "outdev", filter_redirector_get_outdev, + filter_redirector_set_outdev, NULL); + + s->vnet_hdr = false; + object_property_add_bool(obj, "vnet_hdr_support", + filter_redirector_get_vnet_hdr, + filter_redirector_set_vnet_hdr, NULL); +} + +static void filter_mirror_fini(Object *obj) +{ + MirrorState *s = FILTER_MIRROR(obj); + + 
g_free(s->outdev); +} + +static void filter_redirector_fini(Object *obj) +{ + MirrorState *s = FILTER_REDIRECTOR(obj); + + g_free(s->indev); + g_free(s->outdev); +} + +static const TypeInfo filter_redirector_info = { + .name = TYPE_FILTER_REDIRECTOR, + .parent = TYPE_NETFILTER, + .class_init = filter_redirector_class_init, + .instance_init = filter_redirector_init, + .instance_finalize = filter_redirector_fini, + .instance_size = sizeof(MirrorState), +}; + +static const TypeInfo filter_mirror_info = { + .name = TYPE_FILTER_MIRROR, + .parent = TYPE_NETFILTER, + .class_init = filter_mirror_class_init, + .instance_init = filter_mirror_init, + .instance_finalize = filter_mirror_fini, + .instance_size = sizeof(MirrorState), +}; + +static void register_types(void) +{ + type_register_static(&filter_mirror_info); + type_register_static(&filter_redirector_info); +} + +type_init(register_types); diff --git a/net/filter-replay.c b/net/filter-replay.c new file mode 100644 index 000000000..9dda19392 --- /dev/null +++ b/net/filter-replay.c @@ -0,0 +1,91 @@ +/* + * filter-replay.c + * + * Copyright (c) 2010-2016 Institute for System Programming + * of the Russian Academy of Sciences. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "clients.h" +#include "qemu/error-report.h" +#include "qemu/iov.h" +#include "qemu/log.h" +#include "qemu/module.h" +#include "qemu/timer.h" +#include "qapi/visitor.h" +#include "net/filter.h" +#include "sysemu/replay.h" + +#define TYPE_FILTER_REPLAY "filter-replay" + +#define FILTER_REPLAY(obj) \ + OBJECT_CHECK(NetFilterReplayState, (obj), TYPE_FILTER_REPLAY) + +struct NetFilterReplayState { + NetFilterState nfs; + ReplayNetState *rns; +}; +typedef struct NetFilterReplayState NetFilterReplayState; + +static ssize_t filter_replay_receive_iov(NetFilterState *nf, + NetClientState *sndr, + unsigned flags, + const struct iovec *iov, + int iovcnt, NetPacketSent *sent_cb) +{ + NetFilterReplayState *nfrs = FILTER_REPLAY(nf); + switch (replay_mode) { + case REPLAY_MODE_RECORD: + if (nf->netdev == sndr) { + replay_net_packet_event(nfrs->rns, flags, iov, iovcnt); + return iov_size(iov, iovcnt); + } + return 0; + case REPLAY_MODE_PLAY: + /* Drop all packets in replay mode. + Packets from the log will be injected by the replay module. */ + return iov_size(iov, iovcnt); + default: + /* Pass all the packets. 
*/ + return 0; + } +} + +static void filter_replay_instance_init(Object *obj) +{ + NetFilterReplayState *nfrs = FILTER_REPLAY(obj); + nfrs->rns = replay_register_net(&nfrs->nfs); +} + +static void filter_replay_instance_finalize(Object *obj) +{ + NetFilterReplayState *nfrs = FILTER_REPLAY(obj); + replay_unregister_net(nfrs->rns); +} + +static void filter_replay_class_init(ObjectClass *oc, void *data) +{ + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + nfc->receive_iov = filter_replay_receive_iov; +} + +static const TypeInfo filter_replay_info = { + .name = TYPE_FILTER_REPLAY, + .parent = TYPE_NETFILTER, + .class_init = filter_replay_class_init, + .instance_init = filter_replay_instance_init, + .instance_finalize = filter_replay_instance_finalize, + .instance_size = sizeof(NetFilterReplayState), +}; + +static void filter_replay_register_types(void) +{ + type_register_static(&filter_replay_info); +} + +type_init(filter_replay_register_types); diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c new file mode 100644 index 000000000..31da08a2f --- /dev/null +++ b/net/filter-rewriter.c @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 FUJITSU LIMITED + * Copyright (c) 2016 Intel Corporation + * + * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "trace.h" +#include "colo.h" +#include "net/filter.h" +#include "net/net.h" +#include "qemu/error-report.h" +#include "qom/object.h" +#include "qemu/main-loop.h" +#include "qemu/iov.h" +#include "net/checksum.h" +#include "net/colo.h" +#include "migration/colo.h" +#include "util.h" + +#define FILTER_COLO_REWRITER(obj) \ + OBJECT_CHECK(RewriterState, (obj), TYPE_FILTER_REWRITER) + +#define TYPE_FILTER_REWRITER "filter-rewriter" +#define FAILOVER_MODE_ON true +#define FAILOVER_MODE_OFF false + +typedef struct RewriterState { + NetFilterState parent_obj; + NetQueue *incoming_queue; + /* hashtable to save connection */ + GHashTable *connection_track_table; + bool vnet_hdr; + bool failover_mode; +} RewriterState; + +static void filter_rewriter_failover_mode(RewriterState *s) +{ + s->failover_mode = FAILOVER_MODE_ON; +} + +static void filter_rewriter_flush(NetFilterState *nf) +{ + RewriterState *s = FILTER_COLO_REWRITER(nf); + + if (!qemu_net_queue_flush(s->incoming_queue)) { + /* Unable to empty the queue, purge remaining packets */ + qemu_net_queue_purge(s->incoming_queue, nf->netdev); + } +} + +/* + * Return 1 on success, if return 0 means the pkt + * is not TCP packet + */ +static int is_tcp_packet(Packet *pkt) +{ + if (!parse_packet_early(pkt) && + pkt->ip->ip_p == IPPROTO_TCP) { + return 1; + } else { + return 0; + } +} + +/* handle tcp packet from primary guest */ +static int handle_primary_tcp_pkt(RewriterState *rf, + Connection *conn, + Packet *pkt, ConnectionKey *key) +{ + struct tcp_hdr *tcp_pkt; + + tcp_pkt = (struct tcp_hdr *)pkt->transport_header; + if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_DEBUG)) { + trace_colo_filter_rewriter_pkt_info(__func__, + inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst), + ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack), + tcp_pkt->th_flags); + trace_colo_filter_rewriter_conn_offset(conn->offset); + } + + if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | 
TH_SYN)) && + conn->tcp_state == TCPS_SYN_SENT) { + conn->tcp_state = TCPS_ESTABLISHED; + } + + if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) { + /* + * we use this flag update offset func + * run once in independent tcp connection + */ + conn->tcp_state = TCPS_SYN_RECEIVED; + } + + if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) { + if (conn->tcp_state == TCPS_SYN_RECEIVED) { + /* + * offset = secondary_seq - primary seq + * ack packet sent by guest from primary node, + * so we use th_ack - 1 get primary_seq + */ + conn->offset -= (ntohl(tcp_pkt->th_ack) - 1); + conn->tcp_state = TCPS_ESTABLISHED; + } + if (conn->offset) { + /* handle packets to the secondary from the primary */ + tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset); + + net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len, + pkt->size - pkt->vnet_hdr_len); + } + + /* + * Passive close step 3 + */ + if ((conn->tcp_state == TCPS_LAST_ACK) && + (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) { + conn->tcp_state = TCPS_CLOSED; + g_hash_table_remove(rf->connection_track_table, key); + } + } + + if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) { + /* + * Passive close. + * Step 1: + * The *server* side of this connect is VM, *client* tries to close + * the connection. We will into CLOSE_WAIT status. + * + * Step 2: + * In this step we will into LAST_ACK status. + * + * We got 'fin=1, ack=1' packet from server side, we need to + * record the seq of 'fin=1, ack=1' packet. + * + * Step 3: + * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1' + * packet from server side. From this point, we can ensure that there + * will be no packets in the connection, except that, some errors + * happen between the path of 'filter object' and vNIC, if this rare + * case really happen, we can still create a new connection, + * So it is safe to remove the connection from connection_track_table. 
+ * + */ + if (conn->tcp_state == TCPS_ESTABLISHED) { + conn->tcp_state = TCPS_CLOSE_WAIT; + } + + /* + * Active close step 2. + */ + if (conn->tcp_state == TCPS_FIN_WAIT_1) { + /* + * For simplify implementation, we needn't wait 2MSL time + * in filter rewriter. Because guest kernel will track the + * TCP status and wait 2MSL time, if client resend the FIN + * packet, guest will apply the last ACK too. + * So, we skip the TCPS_TIME_WAIT state here and go straight + * to TCPS_CLOSED state. + */ + conn->tcp_state = TCPS_CLOSED; + g_hash_table_remove(rf->connection_track_table, key); + } + } + + return 0; +} + +/* handle tcp packet from secondary guest */ +static int handle_secondary_tcp_pkt(RewriterState *rf, + Connection *conn, + Packet *pkt, ConnectionKey *key) +{ + struct tcp_hdr *tcp_pkt; + + tcp_pkt = (struct tcp_hdr *)pkt->transport_header; + + if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_DEBUG)) { + trace_colo_filter_rewriter_pkt_info(__func__, + inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst), + ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack), + tcp_pkt->th_flags); + trace_colo_filter_rewriter_conn_offset(conn->offset); + } + + if (conn->tcp_state == TCPS_SYN_RECEIVED && + ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) { + /* + * save offset = secondary_seq and then + * in handle_primary_tcp_pkt make offset + * = secondary_seq - primary_seq + */ + conn->offset = ntohl(tcp_pkt->th_seq); + } + + /* VM active connect */ + if (conn->tcp_state == TCPS_CLOSED && + ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) { + conn->tcp_state = TCPS_SYN_SENT; + } + + if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) { + /* Only need to adjust seq while offset is Non-zero */ + if (conn->offset) { + /* handle packets to the primary from the secondary*/ + tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset); + + net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len, + pkt->size - pkt->vnet_hdr_len); + } + } + 
+ /* + * Passive close step 2: + */ + if (conn->tcp_state == TCPS_CLOSE_WAIT && + (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) { + conn->fin_ack_seq = ntohl(tcp_pkt->th_seq); + conn->tcp_state = TCPS_LAST_ACK; + } + + /* + * Active close + * + * Step 1: + * The *server* side of this connect is VM, *server* tries to close + * the connection. + * + * Step 2: + * We will into CLOSE_WAIT status. + * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and + * CLOSING status. + */ + if (conn->tcp_state == TCPS_ESTABLISHED && + (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) { + conn->tcp_state = TCPS_FIN_WAIT_1; + } + + return 0; +} + +static ssize_t colo_rewriter_receive_iov(NetFilterState *nf, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) +{ + RewriterState *s = FILTER_COLO_REWRITER(nf); + Connection *conn; + ConnectionKey key; + Packet *pkt; + ssize_t size = iov_size(iov, iovcnt); + ssize_t vnet_hdr_len = 0; + char *buf = g_malloc0(size); + + iov_to_buf(iov, iovcnt, 0, buf, size); + + if (s->vnet_hdr) { + vnet_hdr_len = nf->netdev->vnet_hdr_len; + } + + pkt = packet_new(buf, size, vnet_hdr_len); + g_free(buf); + + /* + * if we get tcp packet + * we will rewrite it to make secondary guest's + * connection established successfully + */ + if (pkt && is_tcp_packet(pkt)) { + + fill_connection_key(pkt, &key); + + if (sender == nf->netdev) { + /* + * We need make tcp TX and RX packet + * into one connection. 
+ */ + reverse_connection_key(&key); + } + + /* After failover we needn't change new TCP packet */ + if (s->failover_mode && + !connection_has_tracked(s->connection_track_table, &key)) { + goto out; + } + + conn = connection_get(s->connection_track_table, + &key, + NULL); + + if (sender == nf->netdev) { + /* NET_FILTER_DIRECTION_TX */ + if (!handle_primary_tcp_pkt(s, conn, pkt, &key)) { + qemu_net_queue_send(s->incoming_queue, sender, 0, + (const uint8_t *)pkt->data, pkt->size, NULL); + packet_destroy(pkt, NULL); + pkt = NULL; + /* + * We block the packet here,after rewrite pkt + * and will send it + */ + return 1; + } + } else { + /* NET_FILTER_DIRECTION_RX */ + if (!handle_secondary_tcp_pkt(s, conn, pkt, &key)) { + qemu_net_queue_send(s->incoming_queue, sender, 0, + (const uint8_t *)pkt->data, pkt->size, NULL); + packet_destroy(pkt, NULL); + pkt = NULL; + /* + * We block the packet here,after rewrite pkt + * and will send it + */ + return 1; + } + } + } + +out: + packet_destroy(pkt, NULL); + pkt = NULL; + return 0; +} + +static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data) +{ + Connection *conn = (Connection *)value; + + conn->offset = 0; +} + +static gboolean offset_is_nonzero(gpointer key, + gpointer value, + gpointer user_data) +{ + Connection *conn = (Connection *)value; + + return conn->offset ? 
true : false; +} + +static void colo_rewriter_handle_event(NetFilterState *nf, int event, + Error **errp) +{ + RewriterState *rs = FILTER_COLO_REWRITER(nf); + + switch (event) { + case COLO_EVENT_CHECKPOINT: + g_hash_table_foreach(rs->connection_track_table, + reset_seq_offset, NULL); + break; + case COLO_EVENT_FAILOVER: + if (!g_hash_table_find(rs->connection_track_table, + offset_is_nonzero, NULL)) { + filter_rewriter_failover_mode(rs); + } + break; + default: + break; + } +} + +static void colo_rewriter_cleanup(NetFilterState *nf) +{ + RewriterState *s = FILTER_COLO_REWRITER(nf); + + /* flush packets */ + if (s->incoming_queue) { + filter_rewriter_flush(nf); + g_free(s->incoming_queue); + } +} + +static void colo_rewriter_setup(NetFilterState *nf, Error **errp) +{ + RewriterState *s = FILTER_COLO_REWRITER(nf); + + s->connection_track_table = g_hash_table_new_full(connection_key_hash, + connection_key_equal, + g_free, + connection_destroy); + s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf); +} + +static bool filter_rewriter_get_vnet_hdr(Object *obj, Error **errp) +{ + RewriterState *s = FILTER_COLO_REWRITER(obj); + + return s->vnet_hdr; +} + +static void filter_rewriter_set_vnet_hdr(Object *obj, + bool value, + Error **errp) +{ + RewriterState *s = FILTER_COLO_REWRITER(obj); + + s->vnet_hdr = value; +} + +static void filter_rewriter_init(Object *obj) +{ + RewriterState *s = FILTER_COLO_REWRITER(obj); + + s->vnet_hdr = false; + s->failover_mode = FAILOVER_MODE_OFF; + object_property_add_bool(obj, "vnet_hdr_support", + filter_rewriter_get_vnet_hdr, + filter_rewriter_set_vnet_hdr, NULL); +} + +static void colo_rewriter_class_init(ObjectClass *oc, void *data) +{ + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + nfc->setup = colo_rewriter_setup; + nfc->cleanup = colo_rewriter_cleanup; + nfc->receive_iov = colo_rewriter_receive_iov; + nfc->handle_event = colo_rewriter_handle_event; +} + +static const TypeInfo colo_rewriter_info = { + .name = 
TYPE_FILTER_REWRITER, + .parent = TYPE_NETFILTER, + .class_init = colo_rewriter_class_init, + .instance_init = filter_rewriter_init, + .instance_size = sizeof(RewriterState), +}; + +static void register_types(void) +{ + type_register_static(&colo_rewriter_info); +} + +type_init(register_types); diff --git a/net/filter.c b/net/filter.c new file mode 100644 index 000000000..4b932e79f --- /dev/null +++ b/net/filter.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2015 FUJITSU LIMITED + * Author: Yang Hongyang <yanghy@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" + +#include "net/filter.h" +#include "net/net.h" +#include "net/vhost_net.h" +#include "qom/object_interfaces.h" +#include "qemu/iov.h" +#include "qemu/module.h" +#include "net/colo.h" +#include "migration/colo.h" + +static inline bool qemu_can_skip_netfilter(NetFilterState *nf) +{ + return !nf->on; +} + +ssize_t qemu_netfilter_receive(NetFilterState *nf, + NetFilterDirection direction, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) +{ + if (qemu_can_skip_netfilter(nf)) { + return 0; + } + if (nf->direction == direction || + nf->direction == NET_FILTER_DIRECTION_ALL) { + return NETFILTER_GET_CLASS(OBJECT(nf))->receive_iov( + nf, sender, flags, iov, iovcnt, sent_cb); + } + + return 0; +} + +static NetFilterState *netfilter_next(NetFilterState *nf, + NetFilterDirection dir) +{ + NetFilterState *next; + + if (dir == NET_FILTER_DIRECTION_TX) { + /* forward walk through filters */ + next = QTAILQ_NEXT(nf, next); + } else { + /* reverse order */ + next = QTAILQ_PREV(nf, next); + } + + return next; +} + +ssize_t qemu_netfilter_pass_to_next(NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + void *opaque) +{ + 
int ret = 0; + int direction; + NetFilterState *nf = opaque; + NetFilterState *next = NULL; + + if (!sender || !sender->peer) { + /* no receiver, or sender been deleted, no need to pass it further */ + goto out; + } + + if (nf->direction == NET_FILTER_DIRECTION_ALL) { + if (sender == nf->netdev) { + /* This packet is sent by netdev itself */ + direction = NET_FILTER_DIRECTION_TX; + } else { + direction = NET_FILTER_DIRECTION_RX; + } + } else { + direction = nf->direction; + } + + next = netfilter_next(nf, direction); + while (next) { + /* + * if qemu_netfilter_pass_to_next been called, means that + * the packet has been hold by filter and has already retured size + * to the sender, so sent_cb shouldn't be called later, just + * pass NULL to next. + */ + ret = qemu_netfilter_receive(next, direction, sender, flags, iov, + iovcnt, NULL); + if (ret) { + return ret; + } + next = netfilter_next(next, direction); + } + + /* + * We have gone through all filters, pass it to receiver. + * Do the valid check again incase sender or receiver been + * deleted while we go through filters. 
+ */ + if (sender && sender->peer) { + qemu_net_queue_send_iov(sender->peer->incoming_queue, + sender, flags, iov, iovcnt, NULL); + } + +out: + /* no receiver, or sender been deleted */ + return iov_size(iov, iovcnt); +} + +static char *netfilter_get_netdev_id(Object *obj, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + return g_strdup(nf->netdev_id); +} + +static void netfilter_set_netdev_id(Object *obj, const char *str, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + nf->netdev_id = g_strdup(str); +} + +static int netfilter_get_direction(Object *obj, Error **errp G_GNUC_UNUSED) +{ + NetFilterState *nf = NETFILTER(obj); + return nf->direction; +} + +static void netfilter_set_direction(Object *obj, int direction, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + nf->direction = direction; +} + +static char *netfilter_get_status(Object *obj, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + return nf->on ? g_strdup("on") : g_strdup("off"); +} + +static void netfilter_set_status(Object *obj, const char *str, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + NetFilterClass *nfc = NETFILTER_GET_CLASS(obj); + + if (strcmp(str, "on") && strcmp(str, "off")) { + error_setg(errp, "Invalid value for netfilter status, " + "should be 'on' or 'off'"); + return; + } + if (nf->on == !strcmp(str, "on")) { + return; + } + nf->on = !nf->on; + if (nf->netdev && nfc->status_changed) { + nfc->status_changed(nf, errp); + } +} + +static void netfilter_init(Object *obj) +{ + NetFilterState *nf = NETFILTER(obj); + + nf->on = true; + + object_property_add_str(obj, "netdev", + netfilter_get_netdev_id, netfilter_set_netdev_id, + NULL); + object_property_add_enum(obj, "queue", "NetFilterDirection", + &NetFilterDirection_lookup, + netfilter_get_direction, netfilter_set_direction, + NULL); + object_property_add_str(obj, "status", + netfilter_get_status, netfilter_set_status, + NULL); +} + +static void netfilter_complete(UserCreatable *uc, Error 
**errp) +{ + NetFilterState *nf = NETFILTER(uc); + NetClientState *ncs[MAX_QUEUE_NUM]; + NetFilterClass *nfc = NETFILTER_GET_CLASS(uc); + int queues; + Error *local_err = NULL; + + if (!nf->netdev_id) { + error_setg(errp, "Parameter 'netdev' is required"); + return; + } + + queues = qemu_find_net_clients_except(nf->netdev_id, ncs, + NET_CLIENT_DRIVER_NIC, + MAX_QUEUE_NUM); + if (queues < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "netdev", + "a network backend id"); + return; + } else if (queues > 1) { + error_setg(errp, "multiqueue is not supported"); + return; + } + + if (get_vhost_net(ncs[0])) { + error_setg(errp, "Vhost is not supported"); + return; + } + + nf->netdev = ncs[0]; + + if (nfc->setup) { + nfc->setup(nf, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next); +} + +static void netfilter_finalize(Object *obj) +{ + NetFilterState *nf = NETFILTER(obj); + NetFilterClass *nfc = NETFILTER_GET_CLASS(obj); + + if (nfc->cleanup) { + nfc->cleanup(nf); + } + + if (nf->netdev && !QTAILQ_EMPTY(&nf->netdev->filters) && + QTAILQ_IN_USE(nf, next)) { + QTAILQ_REMOVE(&nf->netdev->filters, nf, next); + } + g_free(nf->netdev_id); +} + +static void default_handle_event(NetFilterState *nf, int event, Error **errp) +{ + switch (event) { + case COLO_EVENT_CHECKPOINT: + break; + case COLO_EVENT_FAILOVER: + object_property_set_str(OBJECT(nf), "off", "status", errp); + break; + default: + break; + } +} + +static void netfilter_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + NetFilterClass *nfc = NETFILTER_CLASS(oc); + + ucc->complete = netfilter_complete; + nfc->handle_event = default_handle_event; +} + +static const TypeInfo netfilter_info = { + .name = TYPE_NETFILTER, + .parent = TYPE_OBJECT, + .abstract = true, + .class_size = sizeof(NetFilterClass), + .class_init = netfilter_class_init, + .instance_size = sizeof(NetFilterState), 
+ .instance_init = netfilter_init, + .instance_finalize = netfilter_finalize, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&netfilter_info); +} + +type_init(register_types); @@ -12,16 +12,19 @@ * */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "monitor/monitor.h" #include "net/net.h" #include "clients.h" #include "hub.h" #include "qemu/iov.h" +#include "qemu/error-report.h" +#include "sysemu/qtest.h" /* * A hub broadcasts incoming packets to all its ports except the source port. - * Hubs can be used to provide independent network segments, also confusingly - * named the QEMU 'vlan' feature. + * Hubs can be used to provide independent emulated network segments. */ typedef struct NetHub NetHub; @@ -130,7 +133,7 @@ static void net_hub_port_cleanup(NetClientState *nc) } static NetClientInfo net_hub_port_info = { - .type = NET_CLIENT_OPTIONS_KIND_HUBPORT, + .type = NET_CLIENT_DRIVER_HUBPORT, .size = sizeof(NetHubPort), .can_receive = net_hub_port_can_receive, .receive = net_hub_port_receive, @@ -138,7 +141,8 @@ static NetClientInfo net_hub_port_info = { .cleanup = net_hub_port_cleanup, }; -static NetHubPort *net_hub_port_new(NetHub *hub, const char *name) +static NetHubPort *net_hub_port_new(NetHub *hub, const char *name, + NetClientState *hubpeer) { NetClientState *nc; NetHubPort *port; @@ -151,7 +155,7 @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name) name = default_name; } - nc = qemu_new_net_client(&net_hub_port_info, NULL, "hub", name); + nc = qemu_new_net_client(&net_hub_port_info, hubpeer, "hub", name); port = DO_UPCAST(NetHubPort, nc, nc); port->id = id; port->hub = hub; @@ -163,11 +167,14 @@ static NetHubPort *net_hub_port_new(NetHub *hub, const char *name) /** * Create a port on a given hub + * @hub_id: Number of the hub * @name: Net client name or NULL for default name. 
+ * @hubpeer: Peer to use (if "netdev=id" has been specified) * * If there is no existing hub with the given id then a new hub is created. */ -NetClientState *net_hub_add_port(int hub_id, const char *name) +NetClientState *net_hub_add_port(int hub_id, const char *name, + NetClientState *hubpeer) { NetHub *hub; NetHubPort *port; @@ -182,7 +189,7 @@ NetClientState *net_hub_add_port(int hub_id, const char *name) hub = net_hub_new(hub_id); } - port = net_hub_port_new(hub, name); + port = net_hub_port_new(hub, name, hubpeer); return &port->nc; } @@ -230,7 +237,7 @@ NetClientState *net_hub_port_find(int hub_id) } } - nc = net_hub_add_port(hub_id, NULL); + nc = net_hub_add_port(hub_id, NULL, NULL); return nc; } @@ -245,9 +252,12 @@ void net_hub_info(Monitor *mon) QLIST_FOREACH(hub, &hubs, next) { monitor_printf(mon, "hub %d\n", hub->id); QLIST_FOREACH(port, &hub->ports, next) { + monitor_printf(mon, " \\ %s", port->nc.name); if (port->nc.peer) { - monitor_printf(mon, " \\ "); + monitor_printf(mon, ": "); print_net_client(mon, port->nc.peer); + } else { + monitor_printf(mon, "\n"); } } } @@ -262,10 +272,10 @@ int net_hub_id_for_client(NetClientState *nc, int *id) { NetHubPort *port; - if (nc->info->type == NET_CLIENT_OPTIONS_KIND_HUBPORT) { + if (nc->info->type == NET_CLIENT_DRIVER_HUBPORT) { port = DO_UPCAST(NetHubPort, nc, nc); } else if (nc->peer != NULL && nc->peer->info->type == - NET_CLIENT_OPTIONS_KIND_HUBPORT) { + NET_CLIENT_DRIVER_HUBPORT) { port = DO_UPCAST(NetHubPort, nc, nc->peer); } else { return -ENOENT; @@ -277,20 +287,26 @@ int net_hub_id_for_client(NetClientState *nc, int *id) return 0; } -int net_init_hubport(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_hubport(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { const NetdevHubPortOptions *hubport; + NetClientState *hubpeer = NULL; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_HUBPORT); - hubport = opts->hubport; + assert(netdev->type 
== NET_CLIENT_DRIVER_HUBPORT); + assert(!peer); + hubport = &netdev->u.hubport; - /* Treat hub port like a backend, NIC must be the one to peer */ - if (peer) { - return -EINVAL; + if (hubport->has_netdev) { + hubpeer = qemu_find_netdev(hubport->netdev); + if (!hubpeer) { + error_setg(errp, "netdev '%s' not found", hubport->netdev); + return -1; + } } - net_hub_add_port(hubport->hubid, name); + net_hub_add_port(hubport->hubid, name, hubpeer); + return 0; } @@ -309,19 +325,19 @@ void net_hub_check_clients(void) QLIST_FOREACH(port, &hub->ports, next) { peer = port->nc.peer; if (!peer) { - fprintf(stderr, "Warning: hub port %s has no peer\n", - port->nc.name); + warn_report("hub port %s has no peer", port->nc.name); continue; } switch (peer->info->type) { - case NET_CLIENT_OPTIONS_KIND_NIC: + case NET_CLIENT_DRIVER_NIC: has_nic = 1; break; - case NET_CLIENT_OPTIONS_KIND_USER: - case NET_CLIENT_OPTIONS_KIND_TAP: - case NET_CLIENT_OPTIONS_KIND_SOCKET: - case NET_CLIENT_OPTIONS_KIND_VDE: + case NET_CLIENT_DRIVER_USER: + case NET_CLIENT_DRIVER_TAP: + case NET_CLIENT_DRIVER_SOCKET: + case NET_CLIENT_DRIVER_VDE: + case NET_CLIENT_DRIVER_VHOST_USER: has_host_dev = 1; break; default: @@ -329,12 +345,10 @@ void net_hub_check_clients(void) } } if (has_host_dev && !has_nic) { - fprintf(stderr, "Warning: vlan %d with no nics\n", hub->id); + warn_report("hub %d with no nics", hub->id); } - if (has_nic && !has_host_dev) { - fprintf(stderr, - "Warning: vlan %d is not connected to host network\n", - hub->id); + if (has_nic && !has_host_dev && !qtest_enabled()) { + warn_report("hub %d is not connected to host network", hub->id); } } } @@ -347,7 +361,7 @@ bool net_hub_flush(NetClientState *nc) QLIST_FOREACH(port, &source_port->hub->ports, next) { if (port != source_port) { - ret += qemu_net_queue_flush(port->nc.send_queue); + ret += qemu_net_queue_flush(port->nc.incoming_queue); } } return ret ? 
true : false; @@ -15,9 +15,9 @@ #ifndef NET_HUB_H #define NET_HUB_H -#include "qemu-common.h" -NetClientState *net_hub_add_port(int hub_id, const char *name); +NetClientState *net_hub_add_port(int hub_id, const char *name, + NetClientState *hubpeer); NetClientState *net_hub_find_client_by_name(int hub_id, const char *name); void net_hub_info(Monitor *mon); void net_hub_check_clients(void); diff --git a/net/l2tpv3.c b/net/l2tpv3.c new file mode 100644 index 000000000..55fea17c0 --- /dev/null +++ b/net/l2tpv3.c @@ -0,0 +1,742 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2012-2014 Cisco Systems + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include <linux/ip.h> +#include <netdb.h> +#include "net/net.h" +#include "clients.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/sockets.h" +#include "qemu/iov.h" +#include "qemu/main-loop.h" + + +/* The buffer size needs to be investigated for optimum numbers and + * optimum means of paging in on different systems. This size is + * chosen to be sufficient to accommodate one packet with some headers + */ + +#define BUFFER_ALIGN sysconf(_SC_PAGESIZE) +#define BUFFER_SIZE 2048 +#define IOVSIZE 2 +#define MAX_L2TPV3_MSGCNT 64 +#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE) + +/* Header set to 0x30000 signifies a data packet */ + +#define L2TPV3_DATA_PACKET 0x30000 + +/* IANA-assigned IP protocol ID for L2TPv3 */ + +#ifndef IPPROTO_L2TP +#define IPPROTO_L2TP 0x73 +#endif + +typedef struct NetL2TPV3State { + NetClientState nc; + int fd; + + /* + * these are used for xmit - that happens packet a time + * and for first sign of life packet (easier to parse that once) + */ + + uint8_t *header_buf; + struct iovec *vec; + + /* + * these are used for receive - try to "eat" up to 32 packets at a time + */ + + struct mmsghdr *msgvec; + + /* + * peer address + */ + + struct sockaddr_storage *dgram_dst; + uint32_t dst_size; + + /* + * L2TPv3 parameters + */ + + uint64_t rx_cookie; + uint64_t tx_cookie; + uint32_t rx_session; + uint32_t tx_session; + uint32_t header_size; + uint32_t counter; + + /* + * DOS avoidance in error handling + */ + + bool header_mismatch; + + /* + * Ring buffer handling + */ + + int queue_head; + int queue_tail; + int queue_depth; + + /* + * Precomputed offsets + */ + + uint32_t offset; + uint32_t cookie_offset; + uint32_t counter_offset; + uint32_t session_offset; + + /* Poll Control */ + + bool read_poll; + bool write_poll; + + /* Flags */ + + bool ipv6; + bool udp; + bool has_counter; + bool pin_counter; + bool cookie; + bool cookie_is_64; + +} 
NetL2TPV3State; + +static void net_l2tpv3_send(void *opaque); +static void l2tpv3_writable(void *opaque); + +static void l2tpv3_update_fd_handler(NetL2TPV3State *s) +{ + qemu_set_fd_handler(s->fd, + s->read_poll ? net_l2tpv3_send : NULL, + s->write_poll ? l2tpv3_writable : NULL, + s); +} + +static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable) +{ + if (s->read_poll != enable) { + s->read_poll = enable; + l2tpv3_update_fd_handler(s); + } +} + +static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable) +{ + if (s->write_poll != enable) { + s->write_poll = enable; + l2tpv3_update_fd_handler(s); + } +} + +static void l2tpv3_writable(void *opaque) +{ + NetL2TPV3State *s = opaque; + l2tpv3_write_poll(s, false); + qemu_flush_queued_packets(&s->nc); +} + +static void l2tpv3_send_completed(NetClientState *nc, ssize_t len) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + l2tpv3_read_poll(s, true); +} + +static void l2tpv3_poll(NetClientState *nc, bool enable) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + l2tpv3_write_poll(s, enable); + l2tpv3_read_poll(s, enable); +} + +static void l2tpv3_form_header(NetL2TPV3State *s) +{ + uint32_t *counter; + + if (s->udp) { + stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET); + } + stl_be_p( + (uint32_t *) (s->header_buf + s->session_offset), + s->tx_session + ); + if (s->cookie) { + if (s->cookie_is_64) { + stq_be_p( + (uint64_t *)(s->header_buf + s->cookie_offset), + s->tx_cookie + ); + } else { + stl_be_p( + (uint32_t *) (s->header_buf + s->cookie_offset), + s->tx_cookie + ); + } + } + if (s->has_counter) { + counter = (uint32_t *)(s->header_buf + s->counter_offset); + if (s->pin_counter) { + *counter = 0; + } else { + stl_be_p(counter, ++s->counter); + } + } +} + +static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc, + const struct iovec *iov, + int iovcnt) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + + struct msghdr message; + int ret; + + if (iovcnt > 
MAX_L2TPV3_IOVCNT - 1) { + error_report( + "iovec too long %d > %d, change l2tpv3.h", + iovcnt, MAX_L2TPV3_IOVCNT + ); + return -1; + } + l2tpv3_form_header(s); + memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec)); + s->vec->iov_base = s->header_buf; + s->vec->iov_len = s->offset; + message.msg_name = s->dgram_dst; + message.msg_namelen = s->dst_size; + message.msg_iov = s->vec; + message.msg_iovlen = iovcnt + 1; + message.msg_control = NULL; + message.msg_controllen = 0; + message.msg_flags = 0; + do { + ret = sendmsg(s->fd, &message, 0); + } while ((ret == -1) && (errno == EINTR)); + if (ret > 0) { + ret -= s->offset; + } else if (ret == 0) { + /* belt and braces - should not occur on DGRAM + * we should get an error and never a 0 send + */ + ret = iov_size(iov, iovcnt); + } else { + /* signal upper layer that socket buffer is full */ + ret = -errno; + if (ret == -EAGAIN || ret == -ENOBUFS) { + l2tpv3_write_poll(s, true); + ret = 0; + } + } + return ret; +} + +static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc, + const uint8_t *buf, + size_t size) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + + struct iovec *vec; + struct msghdr message; + ssize_t ret = 0; + + l2tpv3_form_header(s); + vec = s->vec; + vec->iov_base = s->header_buf; + vec->iov_len = s->offset; + vec++; + vec->iov_base = (void *) buf; + vec->iov_len = size; + message.msg_name = s->dgram_dst; + message.msg_namelen = s->dst_size; + message.msg_iov = s->vec; + message.msg_iovlen = 2; + message.msg_control = NULL; + message.msg_controllen = 0; + message.msg_flags = 0; + do { + ret = sendmsg(s->fd, &message, 0); + } while ((ret == -1) && (errno == EINTR)); + if (ret > 0) { + ret -= s->offset; + } else if (ret == 0) { + /* belt and braces - should not occur on DGRAM + * we should get an error and never a 0 send + */ + ret = size; + } else { + ret = -errno; + if (ret == -EAGAIN || ret == -ENOBUFS) { + /* signal upper layer that socket buffer is full */ + l2tpv3_write_poll(s, 
true); + ret = 0; + } + } + return ret; +} + +static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf) +{ + + uint32_t *session; + uint64_t cookie; + + if ((!s->udp) && (!s->ipv6)) { + buf += sizeof(struct iphdr) /* fix for ipv4 raw */; + } + + /* we do not do a strict check for "data" packets as per + * the RFC spec because the pure IP spec does not have + * that anyway. + */ + + if (s->cookie) { + if (s->cookie_is_64) { + cookie = ldq_be_p(buf + s->cookie_offset); + } else { + cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL; + } + if (cookie != s->rx_cookie) { + if (!s->header_mismatch) { + error_report("unknown cookie id"); + } + return -1; + } + } + session = (uint32_t *) (buf + s->session_offset); + if (ldl_be_p(session) != s->rx_session) { + if (!s->header_mismatch) { + error_report("session mismatch"); + } + return -1; + } + return 0; +} + +static void net_l2tpv3_process_queue(NetL2TPV3State *s) +{ + int size = 0; + struct iovec *vec; + bool bad_read; + int data_size; + struct mmsghdr *msgvec; + + /* go into ring mode only if there is a "pending" tail */ + if (s->queue_depth > 0) { + do { + msgvec = s->msgvec + s->queue_tail; + if (msgvec->msg_len > 0) { + data_size = msgvec->msg_len - s->header_size; + vec = msgvec->msg_hdr.msg_iov; + if ((data_size > 0) && + (l2tpv3_verify_header(s, vec->iov_base) == 0)) { + vec++; + /* Use the legacy delivery for now, we will + * switch to using our own ring as a queueing mechanism + * at a later date + */ + size = qemu_send_packet_async( + &s->nc, + vec->iov_base, + data_size, + l2tpv3_send_completed + ); + if (size == 0) { + l2tpv3_read_poll(s, false); + } + bad_read = false; + } else { + bad_read = true; + if (!s->header_mismatch) { + /* report error only once */ + error_report("l2tpv3 header verification failed"); + s->header_mismatch = true; + } + } + } else { + bad_read = true; + } + s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT; + s->queue_depth--; + } while ( + (s->queue_depth > 0) && + 
qemu_can_send_packet(&s->nc) && + ((size > 0) || bad_read) + ); + } +} + +static void net_l2tpv3_send(void *opaque) +{ + NetL2TPV3State *s = opaque; + int target_count, count; + struct mmsghdr *msgvec; + + /* go into ring mode only if there is a "pending" tail */ + + if (s->queue_depth) { + + /* The ring buffer we use has variable intake + * count of how much we can read varies - adjust accordingly + */ + + target_count = MAX_L2TPV3_MSGCNT - s->queue_depth; + + /* Ensure we do not overrun the ring when we have + * a lot of enqueued packets + */ + + if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) { + target_count = MAX_L2TPV3_MSGCNT - s->queue_head; + } + } else { + + /* we do not have any pending packets - we can use + * the whole message vector linearly instead of using + * it as a ring + */ + + s->queue_head = 0; + s->queue_tail = 0; + target_count = MAX_L2TPV3_MSGCNT; + } + + msgvec = s->msgvec + s->queue_head; + if (target_count > 0) { + do { + count = recvmmsg( + s->fd, + msgvec, + target_count, MSG_DONTWAIT, NULL); + } while ((count == -1) && (errno == EINTR)); + if (count < 0) { + /* Recv error - we still need to flush packets here, + * (re)set queue head to current position + */ + count = 0; + } + s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT; + s->queue_depth += count; + } + net_l2tpv3_process_queue(s); +} + +static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount) +{ + int i, j; + struct iovec *iov; + struct mmsghdr *cleanup = msgvec; + if (cleanup) { + for (i = 0; i < count; i++) { + if (cleanup->msg_hdr.msg_iov) { + iov = cleanup->msg_hdr.msg_iov; + for (j = 0; j < iovcount; j++) { + g_free(iov->iov_base); + iov++; + } + g_free(cleanup->msg_hdr.msg_iov); + } + cleanup++; + } + g_free(msgvec); + } +} + +static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count) +{ + int i; + struct iovec *iov; + struct mmsghdr *msgvec, *result; + + msgvec = g_new(struct mmsghdr, count); + result = msgvec; + for (i = 0; 
i < count ; i++) { + msgvec->msg_hdr.msg_name = NULL; + msgvec->msg_hdr.msg_namelen = 0; + iov = g_new(struct iovec, IOVSIZE); + msgvec->msg_hdr.msg_iov = iov; + iov->iov_base = g_malloc(s->header_size); + iov->iov_len = s->header_size; + iov++ ; + iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE); + iov->iov_len = BUFFER_SIZE; + msgvec->msg_hdr.msg_iovlen = 2; + msgvec->msg_hdr.msg_control = NULL; + msgvec->msg_hdr.msg_controllen = 0; + msgvec->msg_hdr.msg_flags = 0; + msgvec++; + } + return result; +} + +static void net_l2tpv3_cleanup(NetClientState *nc) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + qemu_purge_queued_packets(nc); + l2tpv3_read_poll(s, false); + l2tpv3_write_poll(s, false); + if (s->fd >= 0) { + close(s->fd); + } + destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE); + g_free(s->vec); + g_free(s->header_buf); + g_free(s->dgram_dst); +} + +static NetClientInfo net_l2tpv3_info = { + .type = NET_CLIENT_DRIVER_L2TPV3, + .size = sizeof(NetL2TPV3State), + .receive = net_l2tpv3_receive_dgram, + .receive_iov = net_l2tpv3_receive_dgram_iov, + .poll = l2tpv3_poll, + .cleanup = net_l2tpv3_cleanup, +}; + +int net_init_l2tpv3(const Netdev *netdev, + const char *name, + NetClientState *peer, Error **errp) +{ + const NetdevL2TPv3Options *l2tpv3; + NetL2TPV3State *s; + NetClientState *nc; + int fd = -1, gairet; + struct addrinfo hints; + struct addrinfo *result = NULL; + char *srcport, *dstport; + + nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name); + + s = DO_UPCAST(NetL2TPV3State, nc, nc); + + s->queue_head = 0; + s->queue_tail = 0; + s->header_mismatch = false; + + assert(netdev->type == NET_CLIENT_DRIVER_L2TPV3); + l2tpv3 = &netdev->u.l2tpv3; + + if (l2tpv3->has_ipv6 && l2tpv3->ipv6) { + s->ipv6 = l2tpv3->ipv6; + } else { + s->ipv6 = false; + } + + if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) { + error_setg(errp, "offset must be less than 256 bytes"); + goto outerr; + } + + if (l2tpv3->has_rxcookie || 
l2tpv3->has_txcookie) { + if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) { + s->cookie = true; + } else { + error_setg(errp, + "require both 'rxcookie' and 'txcookie' or neither"); + goto outerr; + } + } else { + s->cookie = false; + } + + if (l2tpv3->has_cookie64 || l2tpv3->cookie64) { + s->cookie_is_64 = true; + } else { + s->cookie_is_64 = false; + } + + if (l2tpv3->has_udp && l2tpv3->udp) { + s->udp = true; + if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) { + error_setg(errp, "need both src and dst port for udp"); + goto outerr; + } else { + srcport = l2tpv3->srcport; + dstport = l2tpv3->dstport; + } + } else { + s->udp = false; + srcport = NULL; + dstport = NULL; + } + + + s->offset = 4; + s->session_offset = 0; + s->cookie_offset = 4; + s->counter_offset = 4; + + s->tx_session = l2tpv3->txsession; + if (l2tpv3->has_rxsession) { + s->rx_session = l2tpv3->rxsession; + } else { + s->rx_session = s->tx_session; + } + + if (s->cookie) { + s->rx_cookie = l2tpv3->rxcookie; + s->tx_cookie = l2tpv3->txcookie; + if (s->cookie_is_64 == true) { + /* 64 bit cookie */ + s->offset += 8; + s->counter_offset += 8; + } else { + /* 32 bit cookie */ + s->offset += 4; + s->counter_offset += 4; + } + } + + memset(&hints, 0, sizeof(hints)); + + if (s->ipv6) { + hints.ai_family = AF_INET6; + } else { + hints.ai_family = AF_INET; + } + if (s->udp) { + hints.ai_socktype = SOCK_DGRAM; + hints.ai_protocol = 0; + s->offset += 4; + s->counter_offset += 4; + s->session_offset += 4; + s->cookie_offset += 4; + } else { + hints.ai_socktype = SOCK_RAW; + hints.ai_protocol = IPPROTO_L2TP; + } + + gairet = getaddrinfo(l2tpv3->src, srcport, &hints, &result); + + if ((gairet != 0) || (result == NULL)) { + error_setg(errp, "could not resolve src, errno = %s", + gai_strerror(gairet)); + goto outerr; + } + fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol); + if (fd == -1) { + fd = -errno; + error_setg(errp, "socket creation failed, errno = %d", + -fd); + goto outerr; + } 
+ if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) { + error_setg(errp, "could not bind socket err=%i", errno); + goto outerr; + } + if (result) { + freeaddrinfo(result); + } + + memset(&hints, 0, sizeof(hints)); + + if (s->ipv6) { + hints.ai_family = AF_INET6; + } else { + hints.ai_family = AF_INET; + } + if (s->udp) { + hints.ai_socktype = SOCK_DGRAM; + hints.ai_protocol = 0; + } else { + hints.ai_socktype = SOCK_RAW; + hints.ai_protocol = IPPROTO_L2TP; + } + + result = NULL; + gairet = getaddrinfo(l2tpv3->dst, dstport, &hints, &result); + if ((gairet != 0) || (result == NULL)) { + error_setg(errp, "could not resolve dst, error = %s", + gai_strerror(gairet)); + goto outerr; + } + + s->dgram_dst = g_new0(struct sockaddr_storage, 1); + memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen); + s->dst_size = result->ai_addrlen; + + if (result) { + freeaddrinfo(result); + } + + if (l2tpv3->has_counter && l2tpv3->counter) { + s->has_counter = true; + s->offset += 4; + } else { + s->has_counter = false; + } + + if (l2tpv3->has_pincounter && l2tpv3->pincounter) { + s->has_counter = true; /* pin counter implies that there is counter */ + s->pin_counter = true; + } else { + s->pin_counter = false; + } + + if (l2tpv3->has_offset) { + /* extra offset */ + s->offset += l2tpv3->offset; + } + + if ((s->ipv6) || (s->udp)) { + s->header_size = s->offset; + } else { + s->header_size = s->offset + sizeof(struct iphdr); + } + + s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT); + s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT); + s->header_buf = g_malloc(s->header_size); + + qemu_set_nonblock(fd); + + s->fd = fd; + s->counter = 0; + + l2tpv3_read_poll(s, true); + + snprintf(s->nc.info_str, sizeof(s->nc.info_str), + "l2tpv3: connected"); + return 0; +outerr: + qemu_del_net_client(nc); + if (fd >= 0) { + close(fd); + } + if (result) { + freeaddrinfo(result); + } + return -1; +} + @@ -21,113 +21,109 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include "config-host.h" + +#include "qemu/osdep.h" #include "net/net.h" #include "clients.h" #include "hub.h" +#include "hw/qdev-properties.h" #include "net/slirp.h" +#include "net/eth.h" #include "util.h" #include "monitor/monitor.h" -#include "qemu-common.h" +#include "qemu/help_option.h" +#include "qapi/qapi-commands-net.h" +#include "qapi/qapi-visit-net.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" #include "qemu/sockets.h" +#include "qemu/cutils.h" #include "qemu/config-file.h" -#include "qmp-commands.h" -#include "hw/qdev.h" +#include "qemu/ctype.h" #include "qemu/iov.h" -#include "qapi-visit.h" +#include "qemu/main-loop.h" +#include "qemu/option.h" +#include "qapi/error.h" #include "qapi/opts-visitor.h" -#include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" +#include "sysemu/qtest.h" +#include "sysemu/runstate.h" +#include "sysemu/sysemu.h" +#include "net/filter.h" +#include "qapi/string-output-visitor.h" /* Net bridge is currently not supported for W32. 
*/ #if !defined(_WIN32) # define CONFIG_NET_BRIDGE #endif +static VMChangeStateEntry *net_change_state_entry; static QTAILQ_HEAD(, NetClientState) net_clients; -int default_net = 1; - /***********************************************************/ /* network device redirectors */ -#if defined(DEBUG_NET) -static void hex_dump(FILE *f, const uint8_t *buf, int size) -{ - int len, i, j, c; - - for(i=0;i<size;i+=16) { - len = size - i; - if (len > 16) - len = 16; - fprintf(f, "%08x ", i); - for(j=0;j<16;j++) { - if (j < len) - fprintf(f, " %02x", buf[i+j]); - else - fprintf(f, " "); - } - fprintf(f, " "); - for(j=0;j<len;j++) { - c = buf[i+j]; - if (c < ' ' || c > '~') - c = '.'; - fprintf(f, "%c", c); - } - fprintf(f, "\n"); - } -} -#endif - -static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) +int parse_host_port(struct sockaddr_in *saddr, const char *str, + Error **errp) { - const char *p, *p1; - int len; - p = *pp; - p1 = strchr(p, sep); - if (!p1) - return -1; - len = p1 - p; - p1++; - if (buf_size > 0) { - if (len > buf_size - 1) - len = buf_size - 1; - memcpy(buf, p, len); - buf[len] = '\0'; - } - *pp = p1; - return 0; -} - -int parse_host_port(struct sockaddr_in *saddr, const char *str) -{ - char buf[512]; + gchar **substrings; struct hostent *he; - const char *p, *r; - int port; + const char *addr, *p, *r; + int port, ret = 0; + + substrings = g_strsplit(str, ":", 2); + if (!substrings || !substrings[0] || !substrings[1]) { + error_setg(errp, "host address '%s' doesn't contain ':' " + "separating host from port", str); + ret = -1; + goto out; + } + + addr = substrings[0]; + p = substrings[1]; - p = str; - if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) - return -1; saddr->sin_family = AF_INET; - if (buf[0] == '\0') { + if (addr[0] == '\0') { saddr->sin_addr.s_addr = 0; } else { - if (qemu_isdigit(buf[0])) { - if (!inet_aton(buf, &saddr->sin_addr)) - return -1; + if (qemu_isdigit(addr[0])) { + if (!inet_aton(addr, &saddr->sin_addr)) { + 
error_setg(errp, "host address '%s' is not a valid " + "IPv4 address", addr); + ret = -1; + goto out; + } } else { - if ((he = gethostbyname(buf)) == NULL) - return - 1; + he = gethostbyname(addr); + if (he == NULL) { + error_setg(errp, "can't resolve host address '%s'", addr); + ret = -1; + goto out; + } saddr->sin_addr = *(struct in_addr *)he->h_addr; } } port = strtol(p, (char **)&r, 0); - if (r == p) - return -1; + if (r == p) { + error_setg(errp, "port number '%s' is invalid", p); + ret = -1; + goto out; + } saddr->sin_port = htons(port); - return 0; + +out: + g_strfreev(substrings); + return ret; +} + +char *qemu_mac_strdup_printf(const uint8_t *macaddr) +{ + return g_strdup_printf("%.2x:%.2x:%.2x:%.2x:%.2x:%.2x", + macaddr[0], macaddr[1], macaddr[2], + macaddr[3], macaddr[4], macaddr[5]); } void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]) @@ -139,19 +135,68 @@ void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]) macaddr[3], macaddr[4], macaddr[5]); } +static int mac_table[256] = {0}; + +static void qemu_macaddr_set_used(MACAddr *macaddr) +{ + int index; + + for (index = 0x56; index < 0xFF; index++) { + if (macaddr->a[5] == index) { + mac_table[index]++; + } + } +} + +static void qemu_macaddr_set_free(MACAddr *macaddr) +{ + int index; + static const MACAddr base = { .a = { 0x52, 0x54, 0x00, 0x12, 0x34, 0 } }; + + if (memcmp(macaddr->a, &base.a, (sizeof(base.a) - 1)) != 0) { + return; + } + for (index = 0x56; index < 0xFF; index++) { + if (macaddr->a[5] == index) { + mac_table[index]--; + } + } +} + +static int qemu_macaddr_get_free(void) +{ + int index; + + for (index = 0x56; index < 0xFF; index++) { + if (mac_table[index] == 0) { + return index; + } + } + + return -1; +} + void qemu_macaddr_default_if_unset(MACAddr *macaddr) { - static int index = 0; static const MACAddr zero = { .a = { 0,0,0,0,0,0 } }; + static const MACAddr base = { .a = { 0x52, 0x54, 0x00, 0x12, 0x34, 0 } }; + + if (memcmp(macaddr, &zero, 
sizeof(zero)) != 0) { + if (memcmp(macaddr->a, &base.a, (sizeof(base.a) - 1)) != 0) { + return; + } else { + qemu_macaddr_set_used(macaddr); + return; + } + } - if (memcmp(macaddr, &zero, sizeof(zero)) != 0) - return; macaddr->a[0] = 0x52; macaddr->a[1] = 0x54; macaddr->a[2] = 0x00; macaddr->a[3] = 0x12; macaddr->a[4] = 0x34; - macaddr->a[5] = 0x56 + index++; + macaddr->a[5] = qemu_macaddr_get_free(); + qemu_macaddr_set_used(macaddr); } /** @@ -162,7 +207,6 @@ void qemu_macaddr_default_if_unset(MACAddr *macaddr) static char *assign_name(NetClientState *nc1, const char *model) { NetClientState *nc; - char buf[256]; int id = 0; QTAILQ_FOREACH(nc, &net_clients, next) { @@ -174,15 +218,18 @@ static char *assign_name(NetClientState *nc1, const char *model) } } - snprintf(buf, sizeof(buf), "%s.%d", model, id); - - return g_strdup(buf); + return g_strdup_printf("%s.%d", model, id); } static void qemu_net_client_destructor(NetClientState *nc) { g_free(nc); } +static ssize_t qemu_deliver_packet_iov(NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + void *opaque); static void qemu_net_client_setup(NetClientState *nc, NetClientInfo *info, @@ -206,8 +253,9 @@ static void qemu_net_client_setup(NetClientState *nc, } QTAILQ_INSERT_TAIL(&net_clients, nc, next); - nc->send_queue = qemu_new_net_queue(nc); + nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc); nc->destructor = destructor; + QTAILQ_INIT(&nc->filters); } NetClientState *qemu_new_net_client(NetClientInfo *info, @@ -234,9 +282,9 @@ NICState *qemu_new_nic(NetClientInfo *info, { NetClientState **peers = conf->peers.ncs; NICState *nic; - int i, queues = MAX(1, conf->queues); + int i, queues = MAX(1, conf->peers.queues); - assert(info->type == NET_CLIENT_OPTIONS_KIND_NIC); + assert(info->type == NET_CLIENT_DRIVER_NIC); assert(info->size >= sizeof(NICState)); nic = g_malloc0(info->size + sizeof(NetClientState) * queues); @@ -288,8 +336,8 @@ static void 
qemu_cleanup_net_client(NetClientState *nc) static void qemu_free_net_client(NetClientState *nc) { - if (nc->send_queue) { - qemu_del_net_queue(nc->send_queue); + if (nc->incoming_queue) { + qemu_del_net_queue(nc->incoming_queue); } if (nc->peer) { nc->peer->peer = NULL; @@ -305,17 +353,24 @@ void qemu_del_net_client(NetClientState *nc) { NetClientState *ncs[MAX_QUEUE_NUM]; int queues, i; + NetFilterState *nf, *next; + + assert(nc->info->type != NET_CLIENT_DRIVER_NIC); /* If the NetClientState belongs to a multiqueue backend, we will change all * other NetClientStates also. */ queues = qemu_find_net_clients_except(nc->name, ncs, - NET_CLIENT_OPTIONS_KIND_NIC, + NET_CLIENT_DRIVER_NIC, MAX_QUEUE_NUM); assert(queues != 0); + QTAILQ_FOREACH_SAFE(nf, &nc->filters, next, next) { + object_unparent(OBJECT(nf)); + } + /* If there is a peer NIC, delete and cleanup client, but do not free. */ - if (nc->peer && nc->peer->info->type == NET_CLIENT_OPTIONS_KIND_NIC) { + if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) { NICState *nic = qemu_get_nic(nc->peer); if (nic->peer_deleted) { return; @@ -337,8 +392,6 @@ void qemu_del_net_client(NetClientState *nc) return; } - assert(nc->info->type != NET_CLIENT_OPTIONS_KIND_NIC); - for (i = 0; i < queues; i++) { qemu_cleanup_net_client(ncs[i]); qemu_free_net_client(ncs[i]); @@ -347,7 +400,9 @@ void qemu_del_net_client(NetClientState *nc) void qemu_del_nic(NICState *nic) { - int i, queues = MAX(nic->conf->queues, 1); + int i, queues = MAX(nic->conf->peers.queues, 1); + + qemu_macaddr_set_free(&nic->conf->macaddr); /* If this is a peer NIC and peer has already been deleted, free it now. 
*/ if (nic->peer_deleted) { @@ -371,7 +426,7 @@ void qemu_foreach_nic(qemu_nic_foreach func, void *opaque) NetClientState *nc; QTAILQ_FOREACH(nc, &net_clients, next) { - if (nc->info->type == NET_CLIENT_OPTIONS_KIND_NIC) { + if (nc->info->type == NET_CLIENT_DRIVER_NIC) { if (nc->queue_index == 0) { func(qemu_get_nic(nc), opaque); } @@ -379,49 +434,160 @@ void qemu_foreach_nic(qemu_nic_foreach func, void *opaque) } } -int qemu_can_send_packet(NetClientState *sender) +bool qemu_has_ufo(NetClientState *nc) { - if (!sender->peer) { - return 1; + if (!nc || !nc->info->has_ufo) { + return false; + } + + return nc->info->has_ufo(nc); +} + +bool qemu_has_vnet_hdr(NetClientState *nc) +{ + if (!nc || !nc->info->has_vnet_hdr) { + return false; + } + + return nc->info->has_vnet_hdr(nc); +} + +bool qemu_has_vnet_hdr_len(NetClientState *nc, int len) +{ + if (!nc || !nc->info->has_vnet_hdr_len) { + return false; + } + + return nc->info->has_vnet_hdr_len(nc, len); +} + +void qemu_using_vnet_hdr(NetClientState *nc, bool enable) +{ + if (!nc || !nc->info->using_vnet_hdr) { + return; + } + + nc->info->using_vnet_hdr(nc, enable); +} + +void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6, + int ecn, int ufo) +{ + if (!nc || !nc->info->set_offload) { + return; } - if (sender->peer->receive_disabled) { + nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo); +} + +void qemu_set_vnet_hdr_len(NetClientState *nc, int len) +{ + if (!nc || !nc->info->set_vnet_hdr_len) { + return; + } + + nc->vnet_hdr_len = len; + nc->info->set_vnet_hdr_len(nc, len); +} + +int qemu_set_vnet_le(NetClientState *nc, bool is_le) +{ +#ifdef HOST_WORDS_BIGENDIAN + if (!nc || !nc->info->set_vnet_le) { + return -ENOSYS; + } + + return nc->info->set_vnet_le(nc, is_le); +#else + return 0; +#endif +} + +int qemu_set_vnet_be(NetClientState *nc, bool is_be) +{ +#ifdef HOST_WORDS_BIGENDIAN + return 0; +#else + if (!nc || !nc->info->set_vnet_be) { + return -ENOSYS; + } + + return nc->info->set_vnet_be(nc, 
is_be); +#endif +} + +int qemu_can_receive_packet(NetClientState *nc) +{ + if (nc->receive_disabled) { return 0; - } else if (sender->peer->info->can_receive && - !sender->peer->info->can_receive(sender->peer)) { + } else if (nc->info->can_receive && + !nc->info->can_receive(nc)) { return 0; } return 1; } -ssize_t qemu_deliver_packet(NetClientState *sender, - unsigned flags, - const uint8_t *data, - size_t size, - void *opaque) +int qemu_can_send_packet(NetClientState *sender) { - NetClientState *nc = opaque; - ssize_t ret; + int vm_running = runstate_is_running(); - if (nc->link_down) { - return size; + if (!vm_running) { + return 0; } - if (nc->receive_disabled) { - return 0; + if (!sender->peer) { + return 1; } - if (flags & QEMU_NET_PACKET_FLAG_RAW && nc->info->receive_raw) { - ret = nc->info->receive_raw(nc, data, size); + return qemu_can_receive_packet(sender->peer); +} + +static ssize_t filter_receive_iov(NetClientState *nc, + NetFilterDirection direction, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) +{ + ssize_t ret = 0; + NetFilterState *nf = NULL; + + if (direction == NET_FILTER_DIRECTION_TX) { + QTAILQ_FOREACH(nf, &nc->filters, next) { + ret = qemu_netfilter_receive(nf, direction, sender, flags, iov, + iovcnt, sent_cb); + if (ret) { + return ret; + } + } } else { - ret = nc->info->receive(nc, data, size); + QTAILQ_FOREACH_REVERSE(nf, &nc->filters, next) { + ret = qemu_netfilter_receive(nf, direction, sender, flags, iov, + iovcnt, sent_cb); + if (ret) { + return ret; + } + } } - if (ret == 0) { - nc->receive_disabled = 1; + return ret; +} + +static ssize_t filter_receive(NetClientState *nc, + NetFilterDirection direction, + NetClientState *sender, + unsigned flags, + const uint8_t *data, + size_t size, + NetPacketSent *sent_cb) +{ + struct iovec iov = { + .iov_base = (void *)data, + .iov_len = size }; - return ret; + return filter_receive_iov(nc, direction, sender, flags, &iov, 1, sent_cb); 
} void qemu_purge_queued_packets(NetClientState *nc) @@ -430,44 +596,65 @@ void qemu_purge_queued_packets(NetClientState *nc) return; } - qemu_net_queue_purge(nc->peer->send_queue, nc); + qemu_net_queue_purge(nc->peer->incoming_queue, nc); } -void qemu_flush_queued_packets(NetClientState *nc) +void qemu_flush_or_purge_queued_packets(NetClientState *nc, bool purge) { nc->receive_disabled = 0; - if (nc->peer && nc->peer->info->type == NET_CLIENT_OPTIONS_KIND_HUBPORT) { + if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_HUBPORT) { if (net_hub_flush(nc->peer)) { qemu_notify_event(); } - return; } - if (qemu_net_queue_flush(nc->send_queue)) { + if (qemu_net_queue_flush(nc->incoming_queue)) { /* We emptied the queue successfully, signal to the IO thread to repoll * the file descriptor (for tap, for example). */ qemu_notify_event(); + } else if (purge) { + /* Unable to empty the queue, purge remaining packets */ + qemu_net_queue_purge(nc->incoming_queue, nc); } } +void qemu_flush_queued_packets(NetClientState *nc) +{ + qemu_flush_or_purge_queued_packets(nc, false); +} + static ssize_t qemu_send_packet_async_with_flags(NetClientState *sender, unsigned flags, const uint8_t *buf, int size, NetPacketSent *sent_cb) { NetQueue *queue; + int ret; #ifdef DEBUG_NET printf("qemu_send_packet_async:\n"); - hex_dump(stdout, buf, size); + qemu_hexdump((const char *)buf, stdout, "net", size); #endif if (sender->link_down || !sender->peer) { return size; } - queue = sender->peer->send_queue; + /* Let filters handle the packet first */ + ret = filter_receive(sender, NET_FILTER_DIRECTION_TX, + sender, flags, buf, size, sent_cb); + if (ret) { + return ret; + } + + ret = filter_receive(sender->peer, NET_FILTER_DIRECTION_RX, + sender, flags, buf, size, sent_cb); + if (ret) { + return ret; + } + + queue = sender->peer->incoming_queue; return qemu_net_queue_send(queue, sender, flags, buf, size, sent_cb); } @@ -480,9 +667,28 @@ ssize_t qemu_send_packet_async(NetClientState *sender, buf, 
size, sent_cb); } -void qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size) +ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size) +{ + return qemu_send_packet_async(nc, buf, size, NULL); +} + +ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size) +{ + if (!qemu_can_receive_packet(nc)) { + return 0; + } + + return qemu_net_queue_receive(nc->incoming_queue, buf, size); +} + +ssize_t qemu_receive_packet_iov(NetClientState *nc, const struct iovec *iov, + int iovcnt) { - qemu_send_packet_async(nc, buf, size, NULL); + if (!qemu_can_receive_packet(nc)) { + return 0; + } + + return qemu_net_queue_receive_iov(nc->incoming_queue, iov, iovcnt); } ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size) @@ -492,25 +698,46 @@ ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size) } static ssize_t nc_sendv_compat(NetClientState *nc, const struct iovec *iov, - int iovcnt) + int iovcnt, unsigned flags) { - uint8_t buffer[NET_BUFSIZE]; + uint8_t *buf = NULL; + uint8_t *buffer; size_t offset; + ssize_t ret; + + if (iovcnt == 1) { + buffer = iov[0].iov_base; + offset = iov[0].iov_len; + } else { + offset = iov_size(iov, iovcnt); + if (offset > NET_BUFSIZE) { + return -1; + } + buf = g_malloc(offset); + buffer = buf; + offset = iov_to_buf(iov, iovcnt, 0, buf, offset); + } - offset = iov_to_buf(iov, iovcnt, 0, buffer, sizeof(buffer)); + if (flags & QEMU_NET_PACKET_FLAG_RAW && nc->info->receive_raw) { + ret = nc->info->receive_raw(nc, buffer, offset); + } else { + ret = nc->info->receive(nc, buffer, offset); + } - return nc->info->receive(nc, buffer, offset); + g_free(buf); + return ret; } -ssize_t qemu_deliver_packet_iov(NetClientState *sender, - unsigned flags, - const struct iovec *iov, - int iovcnt, - void *opaque) +static ssize_t qemu_deliver_packet_iov(NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + void *opaque) { NetClientState *nc = 
opaque; int ret; + if (nc->link_down) { return iov_size(iov, iovcnt); } @@ -519,10 +746,10 @@ ssize_t qemu_deliver_packet_iov(NetClientState *sender, return 0; } - if (nc->info->receive_iov) { + if (nc->info->receive_iov && !(flags & QEMU_NET_PACKET_FLAG_RAW)) { ret = nc->info->receive_iov(nc, iov, iovcnt); } else { - ret = nc_sendv_compat(nc, iov, iovcnt); + ret = nc_sendv_compat(nc, iov, iovcnt, flags); } if (ret == 0) { @@ -537,12 +764,31 @@ ssize_t qemu_sendv_packet_async(NetClientState *sender, NetPacketSent *sent_cb) { NetQueue *queue; + size_t size = iov_size(iov, iovcnt); + int ret; + + if (size > NET_BUFSIZE) { + return size; + } if (sender->link_down || !sender->peer) { - return iov_size(iov, iovcnt); + return size; + } + + /* Let filters handle the packet first */ + ret = filter_receive_iov(sender, NET_FILTER_DIRECTION_TX, sender, + QEMU_NET_PACKET_FLAG_NONE, iov, iovcnt, sent_cb); + if (ret) { + return ret; + } + + ret = filter_receive_iov(sender->peer, NET_FILTER_DIRECTION_RX, sender, + QEMU_NET_PACKET_FLAG_NONE, iov, iovcnt, sent_cb); + if (ret) { + return ret; } - queue = sender->peer->send_queue; + queue = sender->peer->incoming_queue; return qemu_net_queue_send_iov(queue, sender, QEMU_NET_PACKET_FLAG_NONE, @@ -560,7 +806,7 @@ NetClientState *qemu_find_netdev(const char *id) NetClientState *nc; QTAILQ_FOREACH(nc, &net_clients, next) { - if (nc->info->type == NET_CLIENT_OPTIONS_KIND_NIC) + if (nc->info->type == NET_CLIENT_DRIVER_NIC) continue; if (!strcmp(nc->name, id)) { return nc; @@ -571,7 +817,7 @@ NetClientState *qemu_find_netdev(const char *id) } int qemu_find_net_clients_except(const char *id, NetClientState **ncs, - NetClientOptionsKind type, int max) + NetClientDriver type, int max) { NetClientState *nc; int ret = 0; @@ -580,7 +826,7 @@ int qemu_find_net_clients_except(const char *id, NetClientState **ncs, if (nc->info->type == type) { continue; } - if (!strcmp(nc->name, id)) { + if (!id || !strcmp(nc->name, id)) { if (ret < max) { ncs[ret] 
= nc; } @@ -609,9 +855,10 @@ int qemu_show_nic_models(const char *arg, const char *const *models) return 0; } - fprintf(stderr, "qemu: Supported NIC models: "); - for (i = 0 ; models[i]; i++) - fprintf(stderr, "%s%c", models[i], models[i+1] ? ',' : '\n'); + printf("Supported NIC models:\n"); + for (i = 0 ; models[i]; i++) { + printf("%s\n", models[i]); + } return 1; } @@ -645,19 +892,19 @@ int qemu_find_nic_model(NICInfo *nd, const char * const *models, return -1; } -static int net_init_nic(const NetClientOptions *opts, const char *name, - NetClientState *peer) +static int net_init_nic(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { int idx; NICInfo *nd; const NetLegacyNicOptions *nic; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_NIC); - nic = opts->nic; + assert(netdev->type == NET_CLIENT_DRIVER_NIC); + nic = &netdev->u.nic; idx = nic_get_free_idx(); if (idx == -1 || nb_nics >= MAX_NICS) { - error_report("Too Many NICs"); + error_setg(errp, "too many NICs"); return -1; } @@ -668,7 +915,7 @@ static int net_init_nic(const NetClientOptions *opts, const char *name, if (nic->has_netdev) { nd->netdev = qemu_find_netdev(nic->netdev); if (!nd->netdev) { - error_report("netdev '%s' not found", nic->netdev); + error_setg(errp, "netdev '%s' not found", nic->netdev); return -1; } } else { @@ -685,14 +932,20 @@ static int net_init_nic(const NetClientOptions *opts, const char *name, if (nic->has_macaddr && net_parse_macaddr(nd->macaddr.a, nic->macaddr) < 0) { - error_report("invalid syntax for ethernet address"); + error_setg(errp, "invalid syntax for ethernet address"); + return -1; + } + if (nic->has_macaddr && + is_multicast_ether_addr(nd->macaddr.a)) { + error_setg(errp, + "NIC cannot have multicast MAC address (odd 1st byte)"); return -1; } qemu_macaddr_default_if_unset(&nd->macaddr); if (nic->has_vectors) { if (nic->vectors > 0x7ffffff) { - error_report("invalid # of vectors: %"PRIu32, nic->vectors); + error_setg(errp, "invalid # of 
vectors: %"PRIu32, nic->vectors); return -1; } nd->nvectors = nic->vectors; @@ -707,229 +960,267 @@ static int net_init_nic(const NetClientOptions *opts, const char *name, } -static int (* const net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])( - const NetClientOptions *opts, +static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])( + const Netdev *netdev, const char *name, - NetClientState *peer) = { - [NET_CLIENT_OPTIONS_KIND_NIC] = net_init_nic, + NetClientState *peer, Error **errp) = { + [NET_CLIENT_DRIVER_NIC] = net_init_nic, #ifdef CONFIG_SLIRP - [NET_CLIENT_OPTIONS_KIND_USER] = net_init_slirp, + [NET_CLIENT_DRIVER_USER] = net_init_slirp, #endif - [NET_CLIENT_OPTIONS_KIND_TAP] = net_init_tap, - [NET_CLIENT_OPTIONS_KIND_SOCKET] = net_init_socket, + [NET_CLIENT_DRIVER_TAP] = net_init_tap, + [NET_CLIENT_DRIVER_SOCKET] = net_init_socket, #ifdef CONFIG_VDE - [NET_CLIENT_OPTIONS_KIND_VDE] = net_init_vde, + [NET_CLIENT_DRIVER_VDE] = net_init_vde, +#endif +#ifdef CONFIG_NETMAP + [NET_CLIENT_DRIVER_NETMAP] = net_init_netmap, #endif - [NET_CLIENT_OPTIONS_KIND_DUMP] = net_init_dump, #ifdef CONFIG_NET_BRIDGE - [NET_CLIENT_OPTIONS_KIND_BRIDGE] = net_init_bridge, + [NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge, +#endif + [NET_CLIENT_DRIVER_HUBPORT] = net_init_hubport, +#ifdef CONFIG_VHOST_NET_USER + [NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user, +#endif +#ifdef CONFIG_L2TPV3 + [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3, #endif - [NET_CLIENT_OPTIONS_KIND_HUBPORT] = net_init_hubport, }; -static int net_client_init1(const void *object, int is_netdev, Error **errp) +static int net_client_init1(const void *object, bool is_netdev, Error **errp) { - union { - const Netdev *netdev; - const NetLegacy *net; - } u; - const NetClientOptions *opts; + Netdev legacy = {0}; + const Netdev *netdev; const char *name; + NetClientState *peer = NULL; if (is_netdev) { - u.netdev = object; - opts = u.netdev->opts; - name = u.netdev->id; + netdev = object; + name = netdev->id; - 
switch (opts->kind) { -#ifdef CONFIG_SLIRP - case NET_CLIENT_OPTIONS_KIND_USER: -#endif - case NET_CLIENT_OPTIONS_KIND_TAP: - case NET_CLIENT_OPTIONS_KIND_SOCKET: -#ifdef CONFIG_VDE - case NET_CLIENT_OPTIONS_KIND_VDE: -#endif -#ifdef CONFIG_NET_BRIDGE - case NET_CLIENT_OPTIONS_KIND_BRIDGE: -#endif - case NET_CLIENT_OPTIONS_KIND_HUBPORT: - break; - - default: - error_set(errp, QERR_INVALID_PARAMETER_VALUE, "type", - "a netdev backend type"); + if (netdev->type == NET_CLIENT_DRIVER_NIC || + !net_client_init_fun[netdev->type]) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type", + "a netdev backend type"); return -1; } } else { - u.net = object; - opts = u.net->opts; + const NetLegacy *net = object; + const NetLegacyOptions *opts = net->opts; + legacy.id = net->id; + netdev = &legacy; /* missing optional values have been initialized to "all bits zero" */ - name = u.net->has_id ? u.net->id : u.net->name; - } + name = net->has_id ? net->id : net->name; - if (net_client_init_fun[opts->kind]) { - NetClientState *peer = NULL; + if (net->has_name) { + warn_report("The 'name' parameter is deprecated, use 'id' instead"); + } - /* Do not add to a vlan if it's a -netdev or a nic with a netdev= - * parameter. */ - if (!is_netdev && - (opts->kind != NET_CLIENT_OPTIONS_KIND_NIC || - !opts->nic->has_netdev)) { - peer = net_hub_add_port(u.net->has_vlan ? 
u.net->vlan : 0, NULL); + /* Map the old options to the new flat type */ + switch (opts->type) { + case NET_LEGACY_OPTIONS_TYPE_NONE: + return 0; /* nothing to do */ + case NET_LEGACY_OPTIONS_TYPE_NIC: + legacy.type = NET_CLIENT_DRIVER_NIC; + legacy.u.nic = opts->u.nic; + break; + case NET_LEGACY_OPTIONS_TYPE_USER: + legacy.type = NET_CLIENT_DRIVER_USER; + legacy.u.user = opts->u.user; + break; + case NET_LEGACY_OPTIONS_TYPE_TAP: + legacy.type = NET_CLIENT_DRIVER_TAP; + legacy.u.tap = opts->u.tap; + break; + case NET_LEGACY_OPTIONS_TYPE_L2TPV3: + legacy.type = NET_CLIENT_DRIVER_L2TPV3; + legacy.u.l2tpv3 = opts->u.l2tpv3; + break; + case NET_LEGACY_OPTIONS_TYPE_SOCKET: + legacy.type = NET_CLIENT_DRIVER_SOCKET; + legacy.u.socket = opts->u.socket; + break; + case NET_LEGACY_OPTIONS_TYPE_VDE: + legacy.type = NET_CLIENT_DRIVER_VDE; + legacy.u.vde = opts->u.vde; + break; + case NET_LEGACY_OPTIONS_TYPE_BRIDGE: + legacy.type = NET_CLIENT_DRIVER_BRIDGE; + legacy.u.bridge = opts->u.bridge; + break; + case NET_LEGACY_OPTIONS_TYPE_NETMAP: + legacy.type = NET_CLIENT_DRIVER_NETMAP; + legacy.u.netmap = opts->u.netmap; + break; + case NET_LEGACY_OPTIONS_TYPE_VHOST_USER: + legacy.type = NET_CLIENT_DRIVER_VHOST_USER; + legacy.u.vhost_user = opts->u.vhost_user; + break; + default: + abort(); } - if (net_client_init_fun[opts->kind](opts, name, peer) < 0) { - /* TODO push error reporting into init() methods */ - error_set(errp, QERR_DEVICE_INIT_FAILED, - NetClientOptionsKind_lookup[opts->kind]); + if (!net_client_init_fun[netdev->type]) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type", + "a net backend type (maybe it is not compiled " + "into this binary)"); return -1; } + + /* Do not add to a hub if it's a nic with a netdev= parameter. 
*/ + if (netdev->type != NET_CLIENT_DRIVER_NIC || + !opts->u.nic.has_netdev) { + peer = net_hub_add_port(0, NULL, NULL); + } + } + + if (net_client_init_fun[netdev->type](netdev, name, peer, errp) < 0) { + /* FIXME drop when all init functions store an Error */ + if (errp && !*errp) { + error_setg(errp, QERR_DEVICE_INIT_FAILED, + NetClientDriver_str(netdev->type)); + } + return -1; } return 0; } - -static void net_visit(Visitor *v, int is_netdev, void **object, Error **errp) +static void show_netdevs(void) { - if (is_netdev) { - visit_type_Netdev(v, (Netdev **)object, NULL, errp); - } else { - visit_type_NetLegacy(v, (NetLegacy **)object, NULL, errp); + int idx; + const char *available_netdevs[] = { + "socket", + "hubport", + "tap", +#ifdef CONFIG_SLIRP + "user", +#endif +#ifdef CONFIG_L2TPV3 + "l2tpv3", +#endif +#ifdef CONFIG_VDE + "vde", +#endif +#ifdef CONFIG_NET_BRIDGE + "bridge", +#endif +#ifdef CONFIG_NETMAP + "netmap", +#endif +#ifdef CONFIG_POSIX + "vhost-user", +#endif + }; + + printf("Available netdev backend types:\n"); + for (idx = 0; idx < ARRAY_SIZE(available_netdevs); idx++) { + puts(available_netdevs[idx]); } } - -int net_client_init(QemuOpts *opts, int is_netdev, Error **errp) +static int net_client_init(QemuOpts *opts, bool is_netdev, Error **errp) { + gchar **substrings = NULL; void *object = NULL; Error *err = NULL; int ret = -1; + Visitor *v = opts_visitor_new(opts); - { - OptsVisitor *ov = opts_visitor_new(opts); + const char *type = qemu_opt_get(opts, "type"); - net_visit(opts_get_visitor(ov), is_netdev, &object, &err); - opts_visitor_cleanup(ov); - } - - if (!err) { - ret = net_client_init1(object, is_netdev, &err); - } - - if (object) { - QapiDeallocVisitor *dv = qapi_dealloc_visitor_new(); + if (is_netdev && type && is_help_option(type)) { + show_netdevs(); + exit(0); + } else { + /* Parse convenience option format ip6-net=fec0::0[/64] */ + const char *ip6_net = qemu_opt_get(opts, "ipv6-net"); + + if (ip6_net) { + char *prefix_addr; + 
unsigned long prefix_len = 64; /* Default 64bit prefix length. */ + + substrings = g_strsplit(ip6_net, "/", 2); + if (!substrings || !substrings[0]) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "ipv6-net", + "a valid IPv6 prefix"); + goto out; + } - net_visit(qapi_dealloc_get_visitor(dv), is_netdev, &object, NULL); - qapi_dealloc_visitor_cleanup(dv); - } + prefix_addr = substrings[0]; - error_propagate(errp, err); - return ret; -} + if (substrings[1]) { + /* User-specified prefix length. */ + int err; + err = qemu_strtoul(substrings[1], NULL, 10, &prefix_len); + if (err) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "ipv6-prefixlen", "a number"); + goto out; + } + } -static int net_host_check_device(const char *device) -{ - int i; - const char *valid_param_list[] = { "tap", "socket", "dump" -#ifdef CONFIG_NET_BRIDGE - , "bridge" -#endif -#ifdef CONFIG_SLIRP - ,"user" -#endif -#ifdef CONFIG_VDE - ,"vde" -#endif - }; - for (i = 0; i < sizeof(valid_param_list) / sizeof(char *); i++) { - if (!strncmp(valid_param_list[i], device, - strlen(valid_param_list[i]))) - return 1; + qemu_opt_set(opts, "ipv6-prefix", prefix_addr, &error_abort); + qemu_opt_set_number(opts, "ipv6-prefixlen", prefix_len, + &error_abort); + qemu_opt_unset(opts, "ipv6-net"); + } } - return 0; -} - -void net_host_device_add(Monitor *mon, const QDict *qdict) -{ - const char *device = qdict_get_str(qdict, "device"); - const char *opts_str = qdict_get_try_str(qdict, "opts"); - Error *local_err = NULL; - QemuOpts *opts; - - if (!net_host_check_device(device)) { - monitor_printf(mon, "invalid host network device %s\n", device); - return; + if (is_netdev) { + visit_type_Netdev(v, NULL, (Netdev **)&object, &err); + } else { + visit_type_NetLegacy(v, NULL, (NetLegacy **)&object, &err); } - opts = qemu_opts_parse(qemu_find_opts("net"), opts_str ? 
opts_str : "", 0); - if (!opts) { - return; + if (!err) { + ret = net_client_init1(object, is_netdev, &err); } - qemu_opt_set(opts, "type", device); - - net_client_init(opts, 0, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - monitor_printf(mon, "adding host network device %s failed\n", device); + if (is_netdev) { + qapi_free_Netdev(object); + } else { + qapi_free_NetLegacy(object); } -} -void net_host_device_remove(Monitor *mon, const QDict *qdict) -{ - NetClientState *nc; - int vlan_id = qdict_get_int(qdict, "vlan_id"); - const char *device = qdict_get_str(qdict, "device"); - - nc = net_hub_find_client_by_name(vlan_id, device); - if (!nc) { - return; - } - if (!net_host_check_device(nc->model)) { - monitor_printf(mon, "invalid host network device %s\n", device); - return; - } - qemu_del_net_client(nc); +out: + error_propagate(errp, err); + g_strfreev(substrings); + visit_free(v); + return ret; } void netdev_add(QemuOpts *opts, Error **errp) { - net_client_init(opts, 1, errp); + net_client_init(opts, true, errp); } -int qmp_netdev_add(Monitor *mon, const QDict *qdict, QObject **ret) +void qmp_netdev_add(QDict *qdict, QObject **ret, Error **errp) { Error *local_err = NULL; QemuOptsList *opts_list; QemuOpts *opts; opts_list = qemu_find_opts_err("netdev", &local_err); - if (error_is_set(&local_err)) { - goto exit_err; + if (local_err) { + goto out; } opts = qemu_opts_from_qdict(opts_list, qdict, &local_err); - if (error_is_set(&local_err)) { - goto exit_err; + if (local_err) { + goto out; } netdev_add(opts, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qemu_opts_del(opts); - goto exit_err; + goto out; } - return 0; - -exit_err: - qerror_report_err(local_err); - error_free(local_err); - return -1; +out: + error_propagate(errp, local_err); } void qmp_netdev_del(const char *id, Error **errp) @@ -939,7 +1230,8 @@ void qmp_netdev_del(const char *id, Error **errp) nc = qemu_find_netdev(id); if 
(!nc) { - error_set(errp, QERR_DEVICE_NOT_FOUND, id); + error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, + "Device '%s' not found", id); return; } @@ -953,12 +1245,48 @@ void qmp_netdev_del(const char *id, Error **errp) qemu_opts_del(opts); } +static void netfilter_print_info(Monitor *mon, NetFilterState *nf) +{ + char *str; + ObjectProperty *prop; + ObjectPropertyIterator iter; + Visitor *v; + + /* generate info str */ + object_property_iter_init(&iter, OBJECT(nf)); + while ((prop = object_property_iter_next(&iter))) { + if (!strcmp(prop->name, "type")) { + continue; + } + v = string_output_visitor_new(false, &str); + object_property_get(OBJECT(nf), v, prop->name, NULL); + visit_complete(v, &str); + visit_free(v); + monitor_printf(mon, ",%s=%s", prop->name, str); + g_free(str); + } + monitor_printf(mon, "\n"); +} + void print_net_client(Monitor *mon, NetClientState *nc) { + NetFilterState *nf; + monitor_printf(mon, "%s: index=%d,type=%s,%s\n", nc->name, nc->queue_index, - NetClientOptionsKind_lookup[nc->info->type], + NetClientDriver_str(nc->info->type), nc->info_str); + if (!QTAILQ_EMPTY(&nc->filters)) { + monitor_printf(mon, "filters:\n"); + } + QTAILQ_FOREACH(nf, &nc->filters, next) { + char *path = object_get_canonical_path_component(OBJECT(nf)); + + monitor_printf(mon, " - %s: type=%s", path, + object_get_typename(OBJECT(nf))); + netfilter_print_info(mon, nf); + g_free(path); + } } RxFilterInfoList *qmp_query_rx_filter(bool has_name, const char *name, @@ -976,14 +1304,20 @@ RxFilterInfoList *qmp_query_rx_filter(bool has_name, const char *name, } /* only query rx-filter information of NIC */ - if (nc->info->type != NET_CLIENT_OPTIONS_KIND_NIC) { + if (nc->info->type != NET_CLIENT_DRIVER_NIC) { if (has_name) { error_setg(errp, "net client(%s) isn't a NIC", name); - break; + return NULL; } continue; } + /* only query information on queue 0 since the info is per nic, + * not per queue + */ + if (nc->queue_index != 0) + continue; + if (nc->info->query_rx_filter) { 
info = nc->info->query_rx_filter(nc); entry = g_malloc0(sizeof(*entry)); @@ -998,21 +1332,25 @@ RxFilterInfoList *qmp_query_rx_filter(bool has_name, const char *name, } else if (has_name) { error_setg(errp, "net client(%s) doesn't support" " rx-filter querying", name); + return NULL; + } + + if (has_name) { break; } } - if (filter_list == NULL && !error_is_set(errp) && has_name) { + if (filter_list == NULL && has_name) { error_setg(errp, "invalid net client name: %s", name); } return filter_list; } -void do_info_network(Monitor *mon, const QDict *qdict) +void hmp_info_network(Monitor *mon, const QDict *qdict) { NetClientState *nc, *peer; - NetClientOptionsKind type; + NetClientDriver type; net_hub_info(mon); @@ -1025,16 +1363,35 @@ void do_info_network(Monitor *mon, const QDict *qdict) continue; } - if (!peer || type == NET_CLIENT_OPTIONS_KIND_NIC) { + if (!peer || type == NET_CLIENT_DRIVER_NIC) { print_net_client(mon, nc); } /* else it's a netdev connected to a NIC, printed with the NIC */ - if (peer && type == NET_CLIENT_OPTIONS_KIND_NIC) { + if (peer && type == NET_CLIENT_DRIVER_NIC) { monitor_printf(mon, " \\ "); print_net_client(mon, peer); } } } +void colo_notify_filters_event(int event, Error **errp) +{ + NetClientState *nc; + NetFilterState *nf; + NetFilterClass *nfc = NULL; + Error *local_err = NULL; + + QTAILQ_FOREACH(nc, &net_clients, next) { + QTAILQ_FOREACH(nf, &nc->filters, next) { + nfc = NETFILTER_GET_CLASS(OBJECT(nf)); + nfc->handle_event(nf, event, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + } +} + void qmp_set_link(const char *name, bool up, Error **errp) { NetClientState *ncs[MAX_QUEUE_NUM]; @@ -1042,11 +1399,12 @@ void qmp_set_link(const char *name, bool up, Error **errp) int queues, i; queues = qemu_find_net_clients_except(name, ncs, - NET_CLIENT_OPTIONS_KIND_MAX, + NET_CLIENT_DRIVER__MAX, MAX_QUEUE_NUM); if (queues == 0) { - error_set(errp, QERR_DEVICE_NOT_FOUND, name); + error_set(errp, 
ERROR_CLASS_DEVICE_NOT_FOUND, + "Device '%s' not found", name); return; } nc = ncs[0]; @@ -1059,15 +1417,44 @@ void qmp_set_link(const char *name, bool up, Error **errp) nc->info->link_status_changed(nc); } - /* Notify peer. Don't update peer link status: this makes it possible to - * disconnect from host network without notifying the guest. - * FIXME: is disconnected link status change operation useful? - * - * Current behaviour is compatible with qemu vlans where there could be - * multiple clients that can still communicate with each other in - * disconnected mode. For now maintain this compatibility. */ - if (nc->peer && nc->peer->info->link_status_changed) { - nc->peer->info->link_status_changed(nc->peer); + if (nc->peer) { + /* Change peer link only if the peer is NIC and then notify peer. + * If the peer is a HUBPORT or a backend, we do not change the + * link status. + * + * This behavior is compatible with qemu hubs where there could be + * multiple clients that can still communicate with each other in + * disconnected mode. For now maintain this compatibility. + */ + if (nc->peer->info->type == NET_CLIENT_DRIVER_NIC) { + for (i = 0; i < queues; i++) { + ncs[i]->peer->link_down = !up; + } + } + if (nc->peer->info->link_status_changed) { + nc->peer->info->link_status_changed(nc->peer); + } + } +} + +static void net_vm_change_state_handler(void *opaque, int running, + RunState state) +{ + NetClientState *nc; + NetClientState *tmp; + + QTAILQ_FOREACH_SAFE(nc, &net_clients, next, tmp) { + if (running) { + /* Flush queued packets and wake up backends. */ + if (nc->peer && qemu_can_send_packet(nc)) { + qemu_flush_queued_packets(nc->peer); + } + } else { + /* Complete all queued packets, to guarantee we don't modify + * state later when VM is not running. 
+ */ + qemu_flush_or_purge_queued_packets(nc, true); + } } } @@ -1080,12 +1467,14 @@ void net_cleanup(void) */ while (!QTAILQ_EMPTY(&net_clients)) { nc = QTAILQ_FIRST(&net_clients); - if (nc->info->type == NET_CLIENT_OPTIONS_KIND_NIC) { + if (nc->info->type == NET_CLIENT_DRIVER_NIC) { qemu_del_nic(qemu_get_nic(nc)); } else { qemu_del_net_client(nc); } } + + qemu_del_vm_change_state_handler(net_change_state_entry); } void net_check_clients(void) @@ -1093,25 +1482,14 @@ void net_check_clients(void) NetClientState *nc; int i; - /* Don't warn about the default network setup that you get if - * no command line -net or -netdev options are specified. There - * are two cases that we would otherwise complain about: - * (1) board doesn't support a NIC but the implicit "-net nic" - * requested one - * (2) CONFIG_SLIRP not set, in which case the implicit "-net nic" - * sets up a nic that isn't connected to anything. - */ - if (default_net) { - return; - } - net_hub_check_clients(); QTAILQ_FOREACH(nc, &net_clients, next) { if (!nc->peer) { - fprintf(stderr, "Warning: %s %s has no peer\n", - nc->info->type == NET_CLIENT_OPTIONS_KIND_NIC ? - "nic" : "netdev", nc->name); + warn_report("%s %s has no peer", + nc->info->type == NET_CLIENT_DRIVER_NIC + ? "nic" : "netdev", + nc->name); } } @@ -1122,61 +1500,104 @@ void net_check_clients(void) for (i = 0; i < MAX_NICS; i++) { NICInfo *nd = &nd_table[i]; if (nd->used && !nd->instantiated) { - fprintf(stderr, "Warning: requested NIC (%s, model %s) " - "was not created (not supported by this machine?)\n", - nd->name ? nd->name : "anonymous", - nd->model ? nd->model : "unspecified"); + warn_report("requested NIC (%s, model %s) " + "was not created (not supported by this machine?)", + nd->name ? nd->name : "anonymous", + nd->model ? 
nd->model : "unspecified"); } } } -static int net_init_client(QemuOpts *opts, void *dummy) +static int net_init_client(void *dummy, QemuOpts *opts, Error **errp) { - Error *local_err = NULL; - - net_client_init(opts, 0, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - return -1; - } + return net_client_init(opts, false, errp); +} - return 0; +static int net_init_netdev(void *dummy, QemuOpts *opts, Error **errp) +{ + return net_client_init(opts, true, errp); } -static int net_init_netdev(QemuOpts *opts, void *dummy) +/* For the convenience "--nic" parameter */ +static int net_param_nic(void *dummy, QemuOpts *opts, Error **errp) { - Error *local_err = NULL; - int ret; + char *mac, *nd_id; + int idx, ret; + NICInfo *ni; + const char *type; + + type = qemu_opt_get(opts, "type"); + if (type && g_str_equal(type, "none")) { + return 0; /* Nothing to do, default_net is cleared in vl.c */ + } - ret = net_client_init(opts, 1, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + idx = nic_get_free_idx(); + if (idx == -1 || nb_nics >= MAX_NICS) { + error_setg(errp, "no more on-board/default NIC slots available"); return -1; } + if (!type) { + qemu_opt_set(opts, "type", "user", &error_abort); + } + + ni = &nd_table[idx]; + memset(ni, 0, sizeof(*ni)); + ni->model = qemu_opt_get_del(opts, "model"); + + /* Create an ID if the user did not specify one */ + nd_id = g_strdup(qemu_opts_id(opts)); + if (!nd_id) { + nd_id = g_strdup_printf("__org.qemu.nic%i", idx); + qemu_opts_set_id(opts, nd_id); + } + + /* Handle MAC address */ + mac = qemu_opt_get_del(opts, "mac"); + if (mac) { + ret = net_parse_macaddr(ni->macaddr.a, mac); + g_free(mac); + if (ret) { + error_setg(errp, "invalid syntax for ethernet address"); + goto out; + } + if (is_multicast_ether_addr(ni->macaddr.a)) { + error_setg(errp, "NIC cannot have multicast MAC address"); + ret = -1; + goto out; + } + } + 
qemu_macaddr_default_if_unset(&ni->macaddr); + + ret = net_client_init(opts, true, errp); + if (ret == 0) { + ni->netdev = qemu_find_netdev(nd_id); + ni->used = true; + nb_nics++; + } + +out: + g_free(nd_id); return ret; } -int net_init_clients(void) +int net_init_clients(Error **errp) { - QemuOptsList *net = qemu_find_opts("net"); - - if (default_net) { - /* if no clients, we use a default config */ - qemu_opts_set(net, NULL, "type", "nic"); -#ifdef CONFIG_SLIRP - qemu_opts_set(net, NULL, "type", "user"); -#endif - } + net_change_state_entry = + qemu_add_vm_change_state_handler(net_vm_change_state_handler, NULL); QTAILQ_INIT(&net_clients); - if (qemu_opts_foreach(qemu_find_opts("netdev"), net_init_netdev, NULL, 1) == -1) + if (qemu_opts_foreach(qemu_find_opts("netdev"), + net_init_netdev, NULL, errp)) { return -1; + } - if (qemu_opts_foreach(net, net_init_client, NULL, 1) == -1) { + if (qemu_opts_foreach(qemu_find_opts("nic"), net_param_nic, NULL, errp)) { + return -1; + } + + if (qemu_opts_foreach(qemu_find_opts("net"), net_init_client, NULL, errp)) { return -1; } @@ -1185,42 +1606,57 @@ int net_init_clients(void) int net_client_parse(QemuOptsList *opts_list, const char *optarg) { -#if defined(CONFIG_SLIRP) - int ret; - if (net_slirp_parse_legacy(opts_list, optarg, &ret)) { - return ret; - } -#endif - - if (!qemu_opts_parse(opts_list, optarg, 1)) { + if (!qemu_opts_parse_noisily(opts_list, optarg, true)) { return -1; } - default_net = 0; return 0; } /* From FreeBSD */ /* XXX: optimize */ -unsigned compute_mcast_idx(const uint8_t *ep) +uint32_t net_crc32(const uint8_t *p, int len) { uint32_t crc; int carry, i, j; uint8_t b; crc = 0xffffffff; - for (i = 0; i < 6; i++) { - b = *ep++; + for (i = 0; i < len; i++) { + b = *p++; for (j = 0; j < 8; j++) { carry = ((crc & 0x80000000L) ? 
1 : 0) ^ (b & 0x01); crc <<= 1; b >>= 1; if (carry) { - crc = ((crc ^ POLYNOMIAL) | carry); + crc = ((crc ^ POLYNOMIAL_BE) | carry); } } } - return crc >> 26; + + return crc; +} + +uint32_t net_crc32_le(const uint8_t *p, int len) +{ + uint32_t crc; + int carry, i, j; + uint8_t b; + + crc = 0xffffffff; + for (i = 0; i < len; i++) { + b = *p++; + for (j = 0; j < 8; j++) { + carry = (crc & 0x1) ^ (b & 0x01); + crc >>= 1; + b >>= 1; + if (carry) { + crc ^= POLYNOMIAL_LE; + } + } + } + + return crc; } QemuOptsList qemu_netdev_opts = { @@ -1236,6 +1672,19 @@ QemuOptsList qemu_netdev_opts = { }, }; +QemuOptsList qemu_nic_opts = { + .name = "nic", + .implied_opt_name = "type", + .head = QTAILQ_HEAD_INITIALIZER(qemu_nic_opts.head), + .desc = { + /* + * no elements => accept any params + * validation will happen later + */ + { /* end of list */ } + }, +}; + QemuOptsList qemu_net_opts = { .name = "net", .implied_opt_name = "type", @@ -1248,3 +1697,100 @@ QemuOptsList qemu_net_opts = { { /* end of list */ } }, }; + +void net_socket_rs_init(SocketReadState *rs, + SocketReadStateFinalize *finalize, + bool vnet_hdr) +{ + rs->state = 0; + rs->vnet_hdr = vnet_hdr; + rs->index = 0; + rs->packet_len = 0; + rs->vnet_hdr_len = 0; + memset(rs->buf, 0, sizeof(rs->buf)); + rs->finalize = finalize; +} + +/* + * Returns + * 0: success + * -1: error occurs + */ +int net_fill_rstate(SocketReadState *rs, const uint8_t *buf, int size) +{ + unsigned int l; + + while (size > 0) { + /* Reassemble a packet from the network. + * 0 = getting length. + * 1 = getting vnet header length. + * 2 = getting data. 
+ */ + switch (rs->state) { + case 0: + l = 4 - rs->index; + if (l > size) { + l = size; + } + memcpy(rs->buf + rs->index, buf, l); + buf += l; + size -= l; + rs->index += l; + if (rs->index == 4) { + /* got length */ + rs->packet_len = ntohl(*(uint32_t *)rs->buf); + rs->index = 0; + if (rs->vnet_hdr) { + rs->state = 1; + } else { + rs->state = 2; + rs->vnet_hdr_len = 0; + } + } + break; + case 1: + l = 4 - rs->index; + if (l > size) { + l = size; + } + memcpy(rs->buf + rs->index, buf, l); + buf += l; + size -= l; + rs->index += l; + if (rs->index == 4) { + /* got vnet header length */ + rs->vnet_hdr_len = ntohl(*(uint32_t *)rs->buf); + rs->index = 0; + rs->state = 2; + } + break; + case 2: + l = rs->packet_len - rs->index; + if (l > size) { + l = size; + } + if (rs->index + l <= sizeof(rs->buf)) { + memcpy(rs->buf + rs->index, buf, l); + } else { + fprintf(stderr, "serious error: oversized packet received," + "connection terminated.\n"); + rs->index = rs->state = 0; + return -1; + } + + rs->index += l; + buf += l; + size -= l; + if (rs->index >= rs->packet_len) { + rs->index = 0; + rs->state = 0; + assert(rs->finalize); + rs->finalize(rs); + } + break; + } + } + + assert(size == 0); + return 0; +} diff --git a/net/netmap.c b/net/netmap.c new file mode 100644 index 000000000..350f097f9 --- /dev/null +++ b/net/netmap.c @@ -0,0 +1,432 @@ +/* + * netmap access for qemu + * + * Copyright (c) 2012-2013 Luigi Rizzo + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or 
substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include <net/if.h> +#define NETMAP_WITH_LIBS +#include <net/netmap.h> +#include <net/netmap_user.h> + +#include "net/net.h" +#include "net/tap.h" +#include "clients.h" +#include "sysemu/sysemu.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "qemu/iov.h" +#include "qemu/cutils.h" +#include "qemu/main-loop.h" + +typedef struct NetmapState { + NetClientState nc; + struct nm_desc *nmd; + char ifname[IFNAMSIZ]; + struct netmap_ring *tx; + struct netmap_ring *rx; + bool read_poll; + bool write_poll; + struct iovec iov[IOV_MAX]; + int vnet_hdr_len; /* Current virtio-net header length. */ +} NetmapState; + +#ifndef __FreeBSD__ +#define pkt_copy bcopy +#else +/* A fast copy routine only for multiples of 64 bytes, non overlapped. */ +static inline void +pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + bcopy(src, dst, l); + return; + } + for (; l > 0; l -= 64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} +#endif /* __FreeBSD__ */ + +/* + * Open a netmap device. We assume there is only one queue + * (which is the case for the VALE bridge). 
+ */ +static struct nm_desc *netmap_open(const NetdevNetmapOptions *nm_opts, + Error **errp) +{ + struct nm_desc *nmd; + struct nmreq req; + + memset(&req, 0, sizeof(req)); + + nmd = nm_open(nm_opts->ifname, &req, NETMAP_NO_TX_POLL, + NULL); + if (nmd == NULL) { + error_setg_errno(errp, errno, "Failed to nm_open() %s", + nm_opts->ifname); + return NULL; + } + + return nmd; +} + +static void netmap_send(void *opaque); +static void netmap_writable(void *opaque); + +/* Set the event-loop handlers for the netmap backend. */ +static void netmap_update_fd_handler(NetmapState *s) +{ + qemu_set_fd_handler(s->nmd->fd, + s->read_poll ? netmap_send : NULL, + s->write_poll ? netmap_writable : NULL, + s); +} + +/* Update the read handler. */ +static void netmap_read_poll(NetmapState *s, bool enable) +{ + if (s->read_poll != enable) { /* Do nothing if not changed. */ + s->read_poll = enable; + netmap_update_fd_handler(s); + } +} + +/* Update the write handler. */ +static void netmap_write_poll(NetmapState *s, bool enable) +{ + if (s->write_poll != enable) { + s->write_poll = enable; + netmap_update_fd_handler(s); + } +} + +static void netmap_poll(NetClientState *nc, bool enable) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + + if (s->read_poll != enable || s->write_poll != enable) { + s->write_poll = enable; + s->read_poll = enable; + netmap_update_fd_handler(s); + } +} + +/* + * The fd_write() callback, invoked if the fd is marked as + * writable after a poll. Unregister the handler and flush any + * buffered packets. 
+ */ +static void netmap_writable(void *opaque) +{ + NetmapState *s = opaque; + + netmap_write_poll(s, false); + qemu_flush_queued_packets(&s->nc); +} + +static ssize_t netmap_receive_iov(NetClientState *nc, + const struct iovec *iov, int iovcnt) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + struct netmap_ring *ring = s->tx; + unsigned int tail = ring->tail; + ssize_t totlen = 0; + uint32_t last; + uint32_t idx; + uint8_t *dst; + int j; + uint32_t i; + + last = i = ring->head; + + if (nm_ring_space(ring) < iovcnt) { + /* Not enough netmap slots. Tell the kernel that we have seen the new + * available slots (so that it notifies us again when it has more + * ones), but without publishing any new slots to be processed + * (e.g., we don't advance ring->head). */ + ring->cur = tail; + netmap_write_poll(s, true); + return 0; + } + + for (j = 0; j < iovcnt; j++) { + int iov_frag_size = iov[j].iov_len; + int offset = 0; + int nm_frag_size; + + totlen += iov_frag_size; + + /* Split each iovec fragment over more netmap slots, if + necessary. */ + while (iov_frag_size) { + nm_frag_size = MIN(iov_frag_size, ring->nr_buf_size); + + if (unlikely(i == tail)) { + /* We ran out of netmap slots while splitting the + iovec fragments. */ + ring->cur = tail; + netmap_write_poll(s, true); + return 0; + } + + idx = ring->slot[i].buf_idx; + dst = (uint8_t *)NETMAP_BUF(ring, idx); + + ring->slot[i].len = nm_frag_size; + ring->slot[i].flags = NS_MOREFRAG; + pkt_copy(iov[j].iov_base + offset, dst, nm_frag_size); + + last = i; + i = nm_ring_next(ring, i); + + offset += nm_frag_size; + iov_frag_size -= nm_frag_size; + } + } + /* The last slot must not have NS_MOREFRAG set. */ + ring->slot[last].flags &= ~NS_MOREFRAG; + + /* Now update ring->head and ring->cur to publish the new slots and + * the new wakeup point. 
*/ + ring->head = ring->cur = i; + + ioctl(s->nmd->fd, NIOCTXSYNC, NULL); + + return totlen; +} + +static ssize_t netmap_receive(NetClientState *nc, + const uint8_t *buf, size_t size) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = size; + + return netmap_receive_iov(nc, &iov, 1); +} + +/* Complete a previous send (backend --> guest) and enable the + fd_read callback. */ +static void netmap_send_completed(NetClientState *nc, ssize_t len) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + + netmap_read_poll(s, true); +} + +static void netmap_send(void *opaque) +{ + NetmapState *s = opaque; + struct netmap_ring *ring = s->rx; + unsigned int tail = ring->tail; + + /* Keep sending while there are available slots in the netmap + RX ring and the forwarding path towards the peer is open. */ + while (ring->head != tail) { + uint32_t i = ring->head; + uint32_t idx; + bool morefrag; + int iovcnt = 0; + int iovsize; + + /* Get a (possibly multi-slot) packet. */ + do { + idx = ring->slot[i].buf_idx; + morefrag = (ring->slot[i].flags & NS_MOREFRAG); + s->iov[iovcnt].iov_base = (void *)NETMAP_BUF(ring, idx); + s->iov[iovcnt].iov_len = ring->slot[i].len; + iovcnt++; + i = nm_ring_next(ring, i); + } while (i != tail && morefrag); + + /* Advance ring->cur to tell the kernel that we have seen the slots. */ + ring->cur = i; + + if (unlikely(morefrag)) { + /* This is a truncated packet, so we can stop without releasing the + * incomplete slots by updating ring->head. We will hopefully + * re-read the complete packet the next time we are called. */ + break; + } + + iovsize = qemu_sendv_packet_async(&s->nc, s->iov, iovcnt, + netmap_send_completed); + + /* Release the slots to the kernel. */ + ring->head = i; + + if (iovsize == 0) { + /* The peer does not receive anymore. Packet is queued, stop + * reading from the backend until netmap_send_completed(). */ + netmap_read_poll(s, false); + break; + } + } +} + +/* Flush and close. 
*/ +static void netmap_cleanup(NetClientState *nc) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + + qemu_purge_queued_packets(nc); + + netmap_poll(nc, false); + nm_close(s->nmd); + s->nmd = NULL; +} + +/* Offloading manipulation support callbacks. */ +static int netmap_fd_set_vnet_hdr_len(NetmapState *s, int len) +{ + struct nmreq req; + + /* Issue a NETMAP_BDG_VNET_HDR command to change the virtio-net header + * length for the netmap adapter associated to 's->ifname'. + */ + memset(&req, 0, sizeof(req)); + pstrcpy(req.nr_name, sizeof(req.nr_name), s->ifname); + req.nr_version = NETMAP_API; + req.nr_cmd = NETMAP_BDG_VNET_HDR; + req.nr_arg1 = len; + + return ioctl(s->nmd->fd, NIOCREGIF, &req); +} + +static bool netmap_has_vnet_hdr_len(NetClientState *nc, int len) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + int prev_len = s->vnet_hdr_len; + + /* Check that we can set the new length. */ + if (netmap_fd_set_vnet_hdr_len(s, len)) { + return false; + } + + /* Restore the previous length. */ + if (netmap_fd_set_vnet_hdr_len(s, prev_len)) { + error_report("Failed to restore vnet-hdr length %d on %s: %s", + prev_len, s->ifname, strerror(errno)); + abort(); + } + + return true; +} + +/* A netmap interface that supports virtio-net headers always + * supports UFO, so we use this callback also for the has_ufo hook. */ +static bool netmap_has_vnet_hdr(NetClientState *nc) +{ + return netmap_has_vnet_hdr_len(nc, sizeof(struct virtio_net_hdr)); +} + +static void netmap_using_vnet_hdr(NetClientState *nc, bool enable) +{ +} + +static void netmap_set_vnet_hdr_len(NetClientState *nc, int len) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + int err; + + err = netmap_fd_set_vnet_hdr_len(s, len); + if (err) { + error_report("Unable to set vnet-hdr length %d on %s: %s", + len, s->ifname, strerror(errno)); + } else { + /* Keep track of the current length. 
*/ + s->vnet_hdr_len = len; + } +} + +static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, + int ecn, int ufo) +{ + NetmapState *s = DO_UPCAST(NetmapState, nc, nc); + + /* Setting a virtio-net header length greater than zero automatically + * enables the offloadings. */ + if (!s->vnet_hdr_len) { + netmap_set_vnet_hdr_len(nc, sizeof(struct virtio_net_hdr)); + } +} + +/* NetClientInfo methods */ +static NetClientInfo net_netmap_info = { + .type = NET_CLIENT_DRIVER_NETMAP, + .size = sizeof(NetmapState), + .receive = netmap_receive, + .receive_iov = netmap_receive_iov, + .poll = netmap_poll, + .cleanup = netmap_cleanup, + .has_ufo = netmap_has_vnet_hdr, + .has_vnet_hdr = netmap_has_vnet_hdr, + .has_vnet_hdr_len = netmap_has_vnet_hdr_len, + .using_vnet_hdr = netmap_using_vnet_hdr, + .set_offload = netmap_set_offload, + .set_vnet_hdr_len = netmap_set_vnet_hdr_len, +}; + +/* The exported init function + * + * ... -net netmap,ifname="..." + */ +int net_init_netmap(const Netdev *netdev, + const char *name, NetClientState *peer, Error **errp) +{ + const NetdevNetmapOptions *netmap_opts = &netdev->u.netmap; + struct nm_desc *nmd; + NetClientState *nc; + Error *err = NULL; + NetmapState *s; + + nmd = netmap_open(netmap_opts, &err); + if (err) { + error_propagate(errp, err); + return -1; + } + /* Create the object. */ + nc = qemu_new_net_client(&net_netmap_info, peer, "netmap", name); + s = DO_UPCAST(NetmapState, nc, nc); + s->nmd = nmd; + s->tx = NETMAP_TXRING(nmd->nifp, 0); + s->rx = NETMAP_RXRING(nmd->nifp, 0); + s->vnet_hdr_len = 0; + pstrcpy(s->ifname, sizeof(s->ifname), netmap_opts->ifname); + netmap_read_poll(s, true); /* Initially only poll for reads. */ + + return 0; +} + diff --git a/net/queue.c b/net/queue.c index 859d02a13..7c0b72c8e 100644 --- a/net/queue.c +++ b/net/queue.c @@ -21,6 +21,7 @@ * THE SOFTWARE. 
*/ +#include "qemu/osdep.h" #include "net/queue.h" #include "qemu/queue.h" #include "net/net.h" @@ -52,21 +53,23 @@ struct NetQueue { void *opaque; uint32_t nq_maxlen; uint32_t nq_count; + NetQueueDeliverFunc *deliver; - QTAILQ_HEAD(packets, NetPacket) packets; + QTAILQ_HEAD(, NetPacket) packets; unsigned delivering : 1; }; -NetQueue *qemu_new_net_queue(void *opaque) +NetQueue *qemu_new_net_queue(NetQueueDeliverFunc *deliver, void *opaque) { NetQueue *queue; - queue = g_malloc0(sizeof(NetQueue)); + queue = g_new0(NetQueue, 1); queue->opaque = opaque; queue->nq_maxlen = 10000; queue->nq_count = 0; + queue->deliver = deliver; QTAILQ_INIT(&queue->packets); @@ -110,12 +113,12 @@ static void qemu_net_queue_append(NetQueue *queue, QTAILQ_INSERT_TAIL(&queue->packets, packet, entry); } -static void qemu_net_queue_append_iov(NetQueue *queue, - NetClientState *sender, - unsigned flags, - const struct iovec *iov, - int iovcnt, - NetPacketSent *sent_cb) +void qemu_net_queue_append_iov(NetQueue *queue, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) { NetPacket *packet; size_t max_len = 0; @@ -152,9 +155,13 @@ static ssize_t qemu_net_queue_deliver(NetQueue *queue, size_t size) { ssize_t ret = -1; + struct iovec iov = { + .iov_base = (void *)data, + .iov_len = size + }; queue->delivering = 1; - ret = qemu_deliver_packet(sender, flags, data, size, queue->opaque); + ret = queue->deliver(sender, flags, &iov, 1, queue->opaque); queue->delivering = 0; return ret; @@ -169,12 +176,34 @@ static ssize_t qemu_net_queue_deliver_iov(NetQueue *queue, ssize_t ret = -1; queue->delivering = 1; - ret = qemu_deliver_packet_iov(sender, flags, iov, iovcnt, queue->opaque); + ret = queue->deliver(sender, flags, iov, iovcnt, queue->opaque); queue->delivering = 0; return ret; } +ssize_t qemu_net_queue_receive(NetQueue *queue, + const uint8_t *data, + size_t size) +{ + if (queue->delivering) { + return 0; + } + + return 
qemu_net_queue_deliver(queue, NULL, 0, data, size); +} + +ssize_t qemu_net_queue_receive_iov(NetQueue *queue, + const struct iovec *iov, + int iovcnt) +{ + if (queue->delivering) { + return 0; + } + + return qemu_net_queue_deliver_iov(queue, NULL, 0, iov, iovcnt); +} + ssize_t qemu_net_queue_send(NetQueue *queue, NetClientState *sender, unsigned flags, @@ -233,6 +262,9 @@ void qemu_net_queue_purge(NetQueue *queue, NetClientState *from) if (packet->sender == from) { QTAILQ_REMOVE(&queue->packets, packet, entry); queue->nq_count--; + if (packet->sent_cb) { + packet->sent_cb(packet->sender, 0); + } g_free(packet); } } diff --git a/net/slirp.c b/net/slirp.c index 124e953d9..c4334ee87 100644 --- a/net/slirp.c +++ b/net/slirp.c @@ -21,9 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ + +#include "qemu/osdep.h" +#include "qemu/log.h" #include "net/slirp.h" -#include "config-host.h" #ifndef _WIN32 #include <pwd.h> @@ -33,9 +35,17 @@ #include "clients.h" #include "hub.h" #include "monitor/monitor.h" +#include "qemu/error-report.h" #include "qemu/sockets.h" -#include "slirp/libslirp.h" -#include "sysemu/char.h" +#include <libslirp.h> +#include "chardev/char-fe.h" +#include "sysemu/sysemu.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "util.h" +#include "migration/register.h" +#include "migration/qemu-file-types.h" static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) { @@ -60,50 +70,53 @@ static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) /* slirp network adapter */ #define SLIRP_CFG_HOSTFWD 1 -#define SLIRP_CFG_LEGACY 2 struct slirp_config_str { struct slirp_config_str *next; int flags; char str[1024]; - int legacy_format; +}; + +struct GuestFwd { + CharBackend hd; + struct in_addr server; + int port; + Slirp *slirp; }; typedef struct SlirpState { NetClientState nc; QTAILQ_ENTRY(SlirpState) entry; Slirp *slirp; + Notifier 
poll_notifier; + Notifier exit_notifier; #ifndef _WIN32 - char smb_dir[128]; + gchar *smb_dir; #endif + GSList *fwd; } SlirpState; static struct slirp_config_str *slirp_configs; -const char *legacy_tftp_prefix; -const char *legacy_bootp_filename; -static QTAILQ_HEAD(slirp_stacks, SlirpState) slirp_stacks = +static QTAILQ_HEAD(, SlirpState) slirp_stacks = QTAILQ_HEAD_INITIALIZER(slirp_stacks); -static int slirp_hostfwd(SlirpState *s, const char *redir_str, - int legacy_format); -static int slirp_guestfwd(SlirpState *s, const char *config_str, - int legacy_format); +static int slirp_hostfwd(SlirpState *s, const char *redir_str, Error **errp); +static int slirp_guestfwd(SlirpState *s, const char *config_str, Error **errp); #ifndef _WIN32 -static const char *legacy_smb_export; - static int slirp_smb(SlirpState *s, const char *exported_dir, - struct in_addr vserver_addr); + struct in_addr vserver_addr, Error **errp); static void slirp_smb_cleanup(SlirpState *s); #else static inline void slirp_smb_cleanup(SlirpState *s) { } #endif -void slirp_output(void *opaque, const uint8_t *pkt, int pkt_len) +static ssize_t net_slirp_send_packet(const void *pkt, size_t pkt_len, + void *opaque) { SlirpState *s = opaque; - qemu_send_packet(&s->nc, pkt, pkt_len); + return qemu_send_packet(&s->nc, pkt, pkt_len); } static ssize_t net_slirp_receive(NetClientState *nc, const uint8_t *buf, size_t size) @@ -115,29 +128,236 @@ static ssize_t net_slirp_receive(NetClientState *nc, const uint8_t *buf, size_t return size; } +static void slirp_smb_exit(Notifier *n, void *data) +{ + SlirpState *s = container_of(n, SlirpState, exit_notifier); + slirp_smb_cleanup(s); +} + +static void slirp_free_fwd(gpointer data) +{ + struct GuestFwd *fwd = data; + + qemu_chr_fe_deinit(&fwd->hd, true); + g_free(data); +} + static void net_slirp_cleanup(NetClientState *nc) { SlirpState *s = DO_UPCAST(SlirpState, nc, nc); + g_slist_free_full(s->fwd, slirp_free_fwd); + main_loop_poll_remove_notifier(&s->poll_notifier); 
+ unregister_savevm(NULL, "slirp", s->slirp); slirp_cleanup(s->slirp); + if (s->exit_notifier.notify) { + qemu_remove_exit_notifier(&s->exit_notifier); + } slirp_smb_cleanup(s); QTAILQ_REMOVE(&slirp_stacks, s, entry); } static NetClientInfo net_slirp_info = { - .type = NET_CLIENT_OPTIONS_KIND_USER, + .type = NET_CLIENT_DRIVER_USER, .size = sizeof(SlirpState), .receive = net_slirp_receive, .cleanup = net_slirp_cleanup, }; +static void net_slirp_guest_error(const char *msg, void *opaque) +{ + qemu_log_mask(LOG_GUEST_ERROR, "%s", msg); +} + +static int64_t net_slirp_clock_get_ns(void *opaque) +{ + return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); +} + +static void *net_slirp_timer_new(SlirpTimerCb cb, + void *cb_opaque, void *opaque) +{ + return timer_new_full(NULL, QEMU_CLOCK_VIRTUAL, + SCALE_MS, QEMU_TIMER_ATTR_EXTERNAL, + cb, cb_opaque); +} + +static void net_slirp_timer_free(void *timer, void *opaque) +{ + timer_del(timer); + timer_free(timer); +} + +static void net_slirp_timer_mod(void *timer, int64_t expire_timer, + void *opaque) +{ + timer_mod(timer, expire_timer); +} + +static void net_slirp_register_poll_fd(int fd, void *opaque) +{ + qemu_fd_register(fd); +} + +static void net_slirp_unregister_poll_fd(int fd, void *opaque) +{ + /* no qemu_fd_unregister */ +} + +static void net_slirp_notify(void *opaque) +{ + qemu_notify_event(); +} + +static const SlirpCb slirp_cb = { + .send_packet = net_slirp_send_packet, + .guest_error = net_slirp_guest_error, + .clock_get_ns = net_slirp_clock_get_ns, + .timer_new = net_slirp_timer_new, + .timer_free = net_slirp_timer_free, + .timer_mod = net_slirp_timer_mod, + .register_poll_fd = net_slirp_register_poll_fd, + .unregister_poll_fd = net_slirp_unregister_poll_fd, + .notify = net_slirp_notify, +}; + +static int slirp_poll_to_gio(int events) +{ + int ret = 0; + + if (events & SLIRP_POLL_IN) { + ret |= G_IO_IN; + } + if (events & SLIRP_POLL_OUT) { + ret |= G_IO_OUT; + } + if (events & SLIRP_POLL_PRI) { + ret |= G_IO_PRI; + } + if 
(events & SLIRP_POLL_ERR) { + ret |= G_IO_ERR; + } + if (events & SLIRP_POLL_HUP) { + ret |= G_IO_HUP; + } + + return ret; +} + +static int net_slirp_add_poll(int fd, int events, void *opaque) +{ + GArray *pollfds = opaque; + GPollFD pfd = { + .fd = fd, + .events = slirp_poll_to_gio(events), + }; + int idx = pollfds->len; + g_array_append_val(pollfds, pfd); + return idx; +} + +static int slirp_gio_to_poll(int events) +{ + int ret = 0; + + if (events & G_IO_IN) { + ret |= SLIRP_POLL_IN; + } + if (events & G_IO_OUT) { + ret |= SLIRP_POLL_OUT; + } + if (events & G_IO_PRI) { + ret |= SLIRP_POLL_PRI; + } + if (events & G_IO_ERR) { + ret |= SLIRP_POLL_ERR; + } + if (events & G_IO_HUP) { + ret |= SLIRP_POLL_HUP; + } + + return ret; +} + +static int net_slirp_get_revents(int idx, void *opaque) +{ + GArray *pollfds = opaque; + + return slirp_gio_to_poll(g_array_index(pollfds, GPollFD, idx).revents); +} + +static void net_slirp_poll_notify(Notifier *notifier, void *data) +{ + MainLoopPoll *poll = data; + SlirpState *s = container_of(notifier, SlirpState, poll_notifier); + + switch (poll->state) { + case MAIN_LOOP_POLL_FILL: + slirp_pollfds_fill(s->slirp, &poll->timeout, + net_slirp_add_poll, poll->pollfds); + break; + case MAIN_LOOP_POLL_OK: + case MAIN_LOOP_POLL_ERR: + slirp_pollfds_poll(s->slirp, poll->state == MAIN_LOOP_POLL_ERR, + net_slirp_get_revents, poll->pollfds); + break; + default: + g_assert_not_reached(); + } +} + +static ssize_t +net_slirp_stream_read(void *buf, size_t size, void *opaque) +{ + QEMUFile *f = opaque; + + return qemu_get_buffer(f, buf, size); +} + +static ssize_t +net_slirp_stream_write(const void *buf, size_t size, void *opaque) +{ + QEMUFile *f = opaque; + + qemu_put_buffer(f, buf, size); + if (qemu_file_get_error(f)) { + return -1; + } + + return size; +} + +static int net_slirp_state_load(QEMUFile *f, void *opaque, int version_id) +{ + Slirp *slirp = opaque; + + return slirp_state_load(slirp, version_id, net_slirp_stream_read, f); +} + +static 
void net_slirp_state_save(QEMUFile *f, void *opaque) +{ + Slirp *slirp = opaque; + + slirp_state_save(slirp, net_slirp_stream_write, f); +} + +static SaveVMHandlers savevm_slirp_state = { + .save_state = net_slirp_state_save, + .load_state = net_slirp_state_load, +}; + static int net_slirp_init(NetClientState *peer, const char *model, const char *name, int restricted, - const char *vnetwork, const char *vhost, + bool ipv4, const char *vnetwork, const char *vhost, + bool ipv6, const char *vprefix6, int vprefix6_len, + const char *vhost6, const char *vhostname, const char *tftp_export, const char *bootfile, const char *vdhcp_start, - const char *vnameserver, const char *smb_export, - const char *vsmbserver, const char **dnssearch) + const char *vnameserver, const char *vnameserver6, + const char *smb_export, const char *vsmbserver, + const char **dnssearch, const char *vdomainname, + const char *tftp_server_name, + Error **errp) { /* default settings according to historic slirp */ struct in_addr net = { .s_addr = htonl(0x0a000200) }; /* 10.0.2.0 */ @@ -145,6 +365,9 @@ static int net_slirp_init(NetClientState *peer, const char *model, struct in_addr host = { .s_addr = htonl(0x0a000202) }; /* 10.0.2.2 */ struct in_addr dhcp = { .s_addr = htonl(0x0a00020f) }; /* 10.0.2.15 */ struct in_addr dns = { .s_addr = htonl(0x0a000203) }; /* 10.0.2.3 */ + struct in6_addr ip6_prefix; + struct in6_addr ip6_host; + struct in6_addr ip6_dns; #ifndef _WIN32 struct in_addr smbsrv = { .s_addr = 0 }; #endif @@ -156,16 +379,26 @@ static int net_slirp_init(NetClientState *peer, const char *model, char *end; struct slirp_config_str *config; - if (!tftp_export) { - tftp_export = legacy_tftp_prefix; + if (!ipv4 && (vnetwork || vhost || vnameserver)) { + error_setg(errp, "IPv4 disabled but netmask/host/dns provided"); + return -1; + } + + if (!ipv6 && (vprefix6 || vhost6 || vnameserver6)) { + error_setg(errp, "IPv6 disabled but prefix/host6/dns6 provided"); + return -1; } - if (!bootfile) { - 
bootfile = legacy_bootp_filename; + + if (!ipv4 && !ipv6) { + /* It doesn't make sense to disable both */ + error_setg(errp, "IPv4 and IPv6 disabled"); + return -1; } if (vnetwork) { if (get_str_sep(buf, sizeof(buf), &vnetwork, '/') < 0) { if (!inet_aton(vnetwork, &net)) { + error_setg(errp, "Failed to parse netmask"); return -1; } addr = ntohl(net.s_addr); @@ -186,14 +419,19 @@ static int net_slirp_init(NetClientState *peer, const char *model, } } else { if (!inet_aton(buf, &net)) { + error_setg(errp, "Failed to parse netmask"); return -1; } shift = strtol(vnetwork, &end, 10); if (*end != '\0') { if (!inet_aton(vnetwork, &mask)) { + error_setg(errp, + "Failed to parse netmask (trailing chars)"); return -1; } } else if (shift < 4 || shift > 32) { + error_setg(errp, + "Invalid netmask provided (must be in range 4-32)"); return -1; } else { mask.s_addr = htonl(0xffffffff << (32 - shift)); @@ -206,34 +444,113 @@ static int net_slirp_init(NetClientState *peer, const char *model, } if (vhost && !inet_aton(vhost, &host)) { + error_setg(errp, "Failed to parse host"); return -1; } if ((host.s_addr & mask.s_addr) != net.s_addr) { + error_setg(errp, "Host doesn't belong to network"); return -1; } if (vnameserver && !inet_aton(vnameserver, &dns)) { + error_setg(errp, "Failed to parse DNS"); return -1; } - if ((dns.s_addr & mask.s_addr) != net.s_addr || - dns.s_addr == host.s_addr) { + if (restricted && (dns.s_addr & mask.s_addr) != net.s_addr) { + error_setg(errp, "DNS doesn't belong to network"); + return -1; + } + if (dns.s_addr == host.s_addr) { + error_setg(errp, "DNS must be different from host"); return -1; } if (vdhcp_start && !inet_aton(vdhcp_start, &dhcp)) { + error_setg(errp, "Failed to parse DHCP start address"); + return -1; + } + if ((dhcp.s_addr & mask.s_addr) != net.s_addr) { + error_setg(errp, "DHCP doesn't belong to network"); return -1; } - if ((dhcp.s_addr & mask.s_addr) != net.s_addr || - dhcp.s_addr == host.s_addr || dhcp.s_addr == dns.s_addr) { + if 
(dhcp.s_addr == host.s_addr || dhcp.s_addr == dns.s_addr) { + error_setg(errp, "DNS must be different from host and DNS"); return -1; } #ifndef _WIN32 if (vsmbserver && !inet_aton(vsmbserver, &smbsrv)) { + error_setg(errp, "Failed to parse SMB address"); return -1; } #endif + if (!vprefix6) { + vprefix6 = "fec0::"; + } + if (!inet_pton(AF_INET6, vprefix6, &ip6_prefix)) { + error_setg(errp, "Failed to parse IPv6 prefix"); + return -1; + } + + if (!vprefix6_len) { + vprefix6_len = 64; + } + if (vprefix6_len < 0 || vprefix6_len > 126) { + error_setg(errp, + "Invalid IPv6 prefix provided " + "(IPv6 prefix length must be between 0 and 126)"); + return -1; + } + + if (vhost6) { + if (!inet_pton(AF_INET6, vhost6, &ip6_host)) { + error_setg(errp, "Failed to parse IPv6 host"); + return -1; + } + if (!in6_equal_net(&ip6_prefix, &ip6_host, vprefix6_len)) { + error_setg(errp, "IPv6 Host doesn't belong to network"); + return -1; + } + } else { + ip6_host = ip6_prefix; + ip6_host.s6_addr[15] |= 2; + } + + if (vnameserver6) { + if (!inet_pton(AF_INET6, vnameserver6, &ip6_dns)) { + error_setg(errp, "Failed to parse IPv6 DNS"); + return -1; + } + if (restricted && !in6_equal_net(&ip6_prefix, &ip6_dns, vprefix6_len)) { + error_setg(errp, "IPv6 DNS doesn't belong to network"); + return -1; + } + } else { + ip6_dns = ip6_prefix; + ip6_dns.s6_addr[15] |= 3; + } + + if (vdomainname && !*vdomainname) { + error_setg(errp, "'domainname' parameter cannot be empty"); + return -1; + } + + if (vdomainname && strlen(vdomainname) > 255) { + error_setg(errp, "'domainname' parameter cannot exceed 255 bytes"); + return -1; + } + + if (vhostname && strlen(vhostname) > 255) { + error_setg(errp, "'vhostname' parameter cannot exceed 255 bytes"); + return -1; + } + + if (tftp_server_name && strlen(tftp_server_name) > 255) { + error_setg(errp, "'tftp-server-name' parameter cannot exceed 255 bytes"); + return -1; + } + nc = qemu_new_net_client(&net_slirp_info, peer, model, name); snprintf(nc->info_str, 
sizeof(nc->info_str), @@ -242,31 +559,50 @@ static int net_slirp_init(NetClientState *peer, const char *model, s = DO_UPCAST(SlirpState, nc, nc); - s->slirp = slirp_init(restricted, net, mask, host, vhostname, - tftp_export, bootfile, dhcp, dns, dnssearch, s); + s->slirp = slirp_init(restricted, ipv4, net, mask, host, + ipv6, ip6_prefix, vprefix6_len, ip6_host, + vhostname, tftp_server_name, + tftp_export, bootfile, dhcp, + dns, ip6_dns, dnssearch, vdomainname, + &slirp_cb, s); QTAILQ_INSERT_TAIL(&slirp_stacks, s, entry); + /* + * Make sure the current bitstream version of slirp is 4, to avoid + * QEMU migration incompatibilities, if upstream slirp bumped the + * version. + * + * FIXME: use bitfields of features? teach libslirp to save with + * specific version? + */ + g_assert(slirp_state_version() == 4); + register_savevm_live("slirp", 0, slirp_state_version(), + &savevm_slirp_state, s->slirp); + + s->poll_notifier.notify = net_slirp_poll_notify; + main_loop_poll_add_notifier(&s->poll_notifier); + for (config = slirp_configs; config; config = config->next) { if (config->flags & SLIRP_CFG_HOSTFWD) { - if (slirp_hostfwd(s, config->str, - config->flags & SLIRP_CFG_LEGACY) < 0) + if (slirp_hostfwd(s, config->str, errp) < 0) { goto error; + } } else { - if (slirp_guestfwd(s, config->str, - config->flags & SLIRP_CFG_LEGACY) < 0) + if (slirp_guestfwd(s, config->str, errp) < 0) { goto error; + } } } #ifndef _WIN32 - if (!smb_export) { - smb_export = legacy_smb_export; - } if (smb_export) { - if (slirp_smb(s, smb_export, smbsrv) < 0) + if (slirp_smb(s, smb_export, smbsrv, errp) < 0) { goto error; + } } #endif + s->exit_notifier.notify = slirp_smb_exit; + qemu_add_exit_notifier(&s->exit_notifier); return 0; error: @@ -274,15 +610,25 @@ error: return -1; } -static SlirpState *slirp_lookup(Monitor *mon, const char *vlan, - const char *stack) +static SlirpState *slirp_lookup(Monitor *mon, const char *hub_id, + const char *name) { - - if (vlan) { + if (name) { NetClientState 
*nc; - nc = net_hub_find_client_by_name(strtol(vlan, NULL, 0), stack); - if (!nc) { - return NULL; + if (hub_id) { + nc = net_hub_find_client_by_name(strtol(hub_id, NULL, 0), name); + if (!nc) { + monitor_printf(mon, "unrecognized (hub-id, stackname) pair\n"); + return NULL; + } + warn_report("Using 'hub-id' is deprecated, specify the netdev id " + "directly instead"); + } else { + nc = qemu_find_netdev(name); + if (!nc) { + monitor_printf(mon, "unrecognized netdev id '%s'\n", name); + return NULL; + } } if (strcmp(nc->model, "user")) { monitor_printf(mon, "invalid device specified\n"); @@ -298,7 +644,7 @@ static SlirpState *slirp_lookup(Monitor *mon, const char *vlan, } } -void net_slirp_hostfwd_remove(Monitor *mon, const QDict *qdict) +void hmp_hostfwd_remove(Monitor *mon, const QDict *qdict) { struct in_addr host_addr = { .s_addr = INADDR_ANY }; int host_port; @@ -311,9 +657,12 @@ void net_slirp_hostfwd_remove(Monitor *mon, const QDict *qdict) const char *arg2 = qdict_get_try_str(qdict, "arg2"); const char *arg3 = qdict_get_try_str(qdict, "arg3"); - if (arg2) { + if (arg3) { s = slirp_lookup(mon, arg1, arg2); src_str = arg3; + } else if (arg2) { + s = slirp_lookup(mon, NULL, arg1); + src_str = arg2; } else { s = slirp_lookup(mon, NULL, NULL); src_str = arg1; @@ -342,10 +691,11 @@ void net_slirp_hostfwd_remove(Monitor *mon, const QDict *qdict) goto fail_syntax; } - host_port = atoi(p); + if (qemu_strtoi(p, NULL, 10, &host_port)) { + goto fail_syntax; + } - err = slirp_remove_hostfwd(QTAILQ_FIRST(&slirp_stacks)->slirp, is_udp, - host_addr, host_port); + err = slirp_remove_hostfwd(s->slirp, is_udp, host_addr, host_port); monitor_printf(mon, "host forwarding rule for %s %s\n", src_str, err ? 
"not found" : "removed"); @@ -355,8 +705,7 @@ void net_slirp_hostfwd_remove(Monitor *mon, const QDict *qdict) monitor_printf(mon, "invalid format\n"); } -static int slirp_hostfwd(SlirpState *s, const char *redir_str, - int legacy_format) +static int slirp_hostfwd(SlirpState *s, const char *redir_str, Error **errp) { struct in_addr host_addr = { .s_addr = INADDR_ANY }; struct in_addr guest_addr = { .s_addr = 0 }; @@ -365,9 +714,11 @@ static int slirp_hostfwd(SlirpState *s, const char *redir_str, char buf[256]; int is_udp; char *end; + const char *fail_reason = "Unknown reason"; p = redir_str; if (!p || get_str_sep(buf, sizeof(buf), &p, ':') < 0) { + fail_reason = "No : separators"; goto fail_syntax; } if (!strcmp(buf, "tcp") || buf[0] == '\0') { @@ -375,52 +726,59 @@ static int slirp_hostfwd(SlirpState *s, const char *redir_str, } else if (!strcmp(buf, "udp")) { is_udp = 1; } else { + fail_reason = "Bad protocol name"; goto fail_syntax; } - if (!legacy_format) { - if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { - goto fail_syntax; - } - if (buf[0] != '\0' && !inet_aton(buf, &host_addr)) { - goto fail_syntax; - } + if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { + fail_reason = "Missing : separator"; + goto fail_syntax; + } + if (buf[0] != '\0' && !inet_aton(buf, &host_addr)) { + fail_reason = "Bad host address"; + goto fail_syntax; } - if (get_str_sep(buf, sizeof(buf), &p, legacy_format ? 
':' : '-') < 0) { + if (get_str_sep(buf, sizeof(buf), &p, '-') < 0) { + fail_reason = "Bad host port separator"; goto fail_syntax; } host_port = strtol(buf, &end, 0); - if (*end != '\0' || host_port < 1 || host_port > 65535) { + if (*end != '\0' || host_port < 0 || host_port > 65535) { + fail_reason = "Bad host port"; goto fail_syntax; } if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { + fail_reason = "Missing guest address"; goto fail_syntax; } if (buf[0] != '\0' && !inet_aton(buf, &guest_addr)) { + fail_reason = "Bad guest address"; goto fail_syntax; } guest_port = strtol(p, &end, 0); if (*end != '\0' || guest_port < 1 || guest_port > 65535) { + fail_reason = "Bad guest port"; goto fail_syntax; } if (slirp_add_hostfwd(s->slirp, is_udp, host_addr, host_port, guest_addr, guest_port) < 0) { - error_report("could not set up host forwarding rule '%s'", - redir_str); + error_setg(errp, "Could not set up host forwarding rule '%s'", + redir_str); return -1; } return 0; fail_syntax: - error_report("invalid host forwarding rule '%s'", redir_str); + error_setg(errp, "Invalid host forwarding rule '%s' (%s)", redir_str, + fail_reason); return -1; } -void net_slirp_hostfwd_add(Monitor *mon, const QDict *qdict) +void hmp_hostfwd_add(Monitor *mon, const QDict *qdict) { const char *redir_str; SlirpState *s; @@ -428,33 +786,23 @@ void net_slirp_hostfwd_add(Monitor *mon, const QDict *qdict) const char *arg2 = qdict_get_try_str(qdict, "arg2"); const char *arg3 = qdict_get_try_str(qdict, "arg3"); - if (arg2) { + if (arg3) { s = slirp_lookup(mon, arg1, arg2); redir_str = arg3; + } else if (arg2) { + s = slirp_lookup(mon, NULL, arg1); + redir_str = arg2; } else { s = slirp_lookup(mon, NULL, NULL); redir_str = arg1; } if (s) { - slirp_hostfwd(s, redir_str, 0); - } - -} - -int net_slirp_redir(const char *redir_str) -{ - struct slirp_config_str *config; - - if (QTAILQ_EMPTY(&slirp_stacks)) { - config = g_malloc(sizeof(*config)); - pstrcpy(config->str, sizeof(config->str), redir_str); - 
config->flags = SLIRP_CFG_HOSTFWD | SLIRP_CFG_LEGACY; - config->next = slirp_configs; - slirp_configs = config; - return 0; + Error *err = NULL; + if (slirp_hostfwd(s, redir_str, &err) < 0) { + error_report_err(err); + } } - return slirp_hostfwd(QTAILQ_FIRST(&slirp_stacks), redir_str, 1); } #ifndef _WIN32 @@ -462,11 +810,10 @@ int net_slirp_redir(const char *redir_str) /* automatic user mode samba server configuration */ static void slirp_smb_cleanup(SlirpState *s) { - char cmd[128]; int ret; - if (s->smb_dir[0] != '\0') { - snprintf(cmd, sizeof(cmd), "rm -rf %s", s->smb_dir); + if (s->smb_dir) { + gchar *cmd = g_strdup_printf("rm -rf %s", s->smb_dir); ret = system(cmd); if (ret == -1 || !WIFEXITED(ret)) { error_report("'%s' failed.", cmd); @@ -474,62 +821,72 @@ static void slirp_smb_cleanup(SlirpState *s) error_report("'%s' failed. Error code: %d", cmd, WEXITSTATUS(ret)); } - s->smb_dir[0] = '\0'; + g_free(cmd); + g_free(s->smb_dir); + s->smb_dir = NULL; } } static int slirp_smb(SlirpState* s, const char *exported_dir, - struct in_addr vserver_addr) + struct in_addr vserver_addr, Error **errp) { - static int instance; - char smb_conf[128]; - char smb_cmdline[128]; + char *smb_conf; + char *smb_cmdline; struct passwd *passwd; FILE *f; passwd = getpwuid(geteuid()); if (!passwd) { - error_report("failed to retrieve user name"); + error_setg(errp, "Failed to retrieve user name"); return -1; } if (access(CONFIG_SMBD_COMMAND, F_OK)) { - error_report("could not find '%s', please install it", - CONFIG_SMBD_COMMAND); + error_setg(errp, "Could not find '%s', please install it", + CONFIG_SMBD_COMMAND); return -1; } if (access(exported_dir, R_OK | X_OK)) { - error_report("error accessing shared directory '%s': %s", - exported_dir, strerror(errno)); + error_setg(errp, "Error accessing shared directory '%s': %s", + exported_dir, strerror(errno)); return -1; } - snprintf(s->smb_dir, sizeof(s->smb_dir), "/tmp/qemu-smb.%ld-%d", - (long)getpid(), instance++); - if 
(mkdir(s->smb_dir, 0700) < 0) { - error_report("could not create samba server dir '%s'", s->smb_dir); + s->smb_dir = g_dir_make_tmp("qemu-smb.XXXXXX", NULL); + if (!s->smb_dir) { + error_setg(errp, "Could not create samba server dir"); return -1; } - snprintf(smb_conf, sizeof(smb_conf), "%s/%s", s->smb_dir, "smb.conf"); + smb_conf = g_strdup_printf("%s/%s", s->smb_dir, "smb.conf"); f = fopen(smb_conf, "w"); if (!f) { slirp_smb_cleanup(s); - error_report("could not create samba server configuration file '%s'", - smb_conf); + error_setg(errp, + "Could not create samba server configuration file '%s'", + smb_conf); + g_free(smb_conf); return -1; } fprintf(f, "[global]\n" "private dir=%s\n" - "socket address=127.0.0.1\n" + "interfaces=127.0.0.1\n" + "bind interfaces only=yes\n" "pid directory=%s\n" "lock directory=%s\n" "state directory=%s\n" + "cache directory=%s\n" + "ncalrpc dir=%s/ncalrpc\n" "log file=%s/log.smbd\n" "smb passwd file=%s/smbpasswd\n" - "security = share\n" + "security = user\n" + "map to guest = Bad User\n" + "load printers = no\n" + "printing = bsd\n" + "disable spoolss = yes\n" + "usershare max shares = 0\n" "[qemu]\n" "path=%s\n" "read only=no\n" @@ -541,48 +898,30 @@ static int slirp_smb(SlirpState* s, const char *exported_dir, s->smb_dir, s->smb_dir, s->smb_dir, + s->smb_dir, + s->smb_dir, exported_dir, passwd->pw_name ); fclose(f); - snprintf(smb_cmdline, sizeof(smb_cmdline), "%s -s %s", - CONFIG_SMBD_COMMAND, smb_conf); + smb_cmdline = g_strdup_printf("%s -l %s -s %s", + CONFIG_SMBD_COMMAND, s->smb_dir, smb_conf); + g_free(smb_conf); - if (slirp_add_exec(s->slirp, 0, smb_cmdline, &vserver_addr, 139) < 0) { + if (slirp_add_exec(s->slirp, smb_cmdline, &vserver_addr, 139) < 0 || + slirp_add_exec(s->slirp, smb_cmdline, &vserver_addr, 445) < 0) { slirp_smb_cleanup(s); - error_report("conflicting/invalid smbserver address"); + g_free(smb_cmdline); + error_setg(errp, "Conflicting/invalid smbserver address"); return -1; } - return 0; -} - -/* automatic 
user mode samba server configuration (legacy interface) */ -int net_slirp_smb(const char *exported_dir) -{ - struct in_addr vserver_addr = { .s_addr = 0 }; - - if (legacy_smb_export) { - fprintf(stderr, "-smb given twice\n"); - return -1; - } - legacy_smb_export = exported_dir; - if (!QTAILQ_EMPTY(&slirp_stacks)) { - return slirp_smb(QTAILQ_FIRST(&slirp_stacks), exported_dir, - vserver_addr); - } + g_free(smb_cmdline); return 0; } #endif /* !defined(_WIN32) */ -struct GuestFwd { - CharDriverState *hd; - struct in_addr server; - int port; - Slirp *slirp; -}; - static int guestfwd_can_read(void *opaque) { struct GuestFwd *fwd = opaque; @@ -595,9 +934,14 @@ static void guestfwd_read(void *opaque, const uint8_t *buf, int size) slirp_socket_recv(fwd->slirp, fwd->server, fwd->port, buf, size); } -static int slirp_guestfwd(SlirpState *s, const char *config_str, - int legacy_format) +static ssize_t guestfwd_write(const void *buf, size_t len, void *chr) { + return qemu_chr_fe_write_all(chr, buf, len); +} + +static int slirp_guestfwd(SlirpState *s, const char *config_str, Error **errp) +{ + /* TODO: IPv6 */ struct in_addr server = { .s_addr = 0 }; struct GuestFwd *fwd; const char *p; @@ -606,53 +950,62 @@ static int slirp_guestfwd(SlirpState *s, const char *config_str, int port; p = config_str; - if (legacy_format) { - if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { - goto fail_syntax; - } - } else { - if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { - goto fail_syntax; - } - if (strcmp(buf, "tcp") && buf[0] != '\0') { - goto fail_syntax; - } - if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { - goto fail_syntax; - } - if (buf[0] != '\0' && !inet_aton(buf, &server)) { - goto fail_syntax; - } - if (get_str_sep(buf, sizeof(buf), &p, '-') < 0) { - goto fail_syntax; - } + if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { + goto fail_syntax; + } + if (strcmp(buf, "tcp") && buf[0] != '\0') { + goto fail_syntax; + } + if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) { + goto 
fail_syntax; + } + if (buf[0] != '\0' && !inet_aton(buf, &server)) { + goto fail_syntax; + } + if (get_str_sep(buf, sizeof(buf), &p, '-') < 0) { + goto fail_syntax; } port = strtol(buf, &end, 10); if (*end != '\0' || port < 1 || port > 65535) { goto fail_syntax; } - fwd = g_malloc(sizeof(struct GuestFwd)); snprintf(buf, sizeof(buf), "guestfwd.tcp.%d", port); - if ((strlen(p) > 4) && !strncmp(p, "cmd:", 4)) { - if (slirp_add_exec(s->slirp, 0, &p[4], &server, port) < 0) { - error_report("conflicting/invalid host:port in guest forwarding " - "rule '%s'", config_str); - g_free(fwd); + if (g_str_has_prefix(p, "cmd:")) { + if (slirp_add_exec(s->slirp, &p[4], &server, port) < 0) { + error_setg(errp, "Conflicting/invalid host:port in guest " + "forwarding rule '%s'", config_str); return -1; } } else { - fwd->hd = qemu_chr_new(buf, p, NULL); - if (!fwd->hd) { - error_report("could not open guest forwarding device '%s'", buf); + Error *err = NULL; + /* + * FIXME: sure we want to support implicit + * muxed monitors here? 
+ */ + Chardev *chr = qemu_chr_new_mux_mon(buf, p, NULL); + + if (!chr) { + error_setg(errp, "Could not open guest forwarding device '%s'", + buf); + return -1; + } + + fwd = g_new(struct GuestFwd, 1); + qemu_chr_fe_init(&fwd->hd, chr, &err); + if (err) { + error_propagate(errp, err); + object_unparent(OBJECT(chr)); g_free(fwd); return -1; } - if (slirp_add_exec(s->slirp, 3, fwd->hd, &server, port) < 0) { - error_report("conflicting/invalid host:port in guest forwarding " - "rule '%s'", config_str); + if (slirp_add_guestfwd(s->slirp, guestfwd_write, &fwd->hd, + &server, port) < 0) { + error_setg(errp, "Conflicting/invalid host:port in guest " + "forwarding rule '%s'", config_str); + qemu_chr_fe_deinit(&fwd->hd, true); g_free(fwd); return -1; } @@ -660,28 +1013,29 @@ static int slirp_guestfwd(SlirpState *s, const char *config_str, fwd->port = port; fwd->slirp = s->slirp; - qemu_chr_fe_claim_no_fail(fwd->hd); - qemu_chr_add_handlers(fwd->hd, guestfwd_can_read, guestfwd_read, - NULL, fwd); + qemu_chr_fe_set_handlers(&fwd->hd, guestfwd_can_read, guestfwd_read, + NULL, NULL, fwd, NULL, true); + s->fwd = g_slist_append(s->fwd, fwd); } return 0; fail_syntax: - error_report("invalid guest forwarding rule '%s'", config_str); + error_setg(errp, "Invalid guest forwarding rule '%s'", config_str); return -1; } -void do_info_usernet(Monitor *mon, const QDict *qdict) +void hmp_info_usernet(Monitor *mon, const QDict *qdict) { SlirpState *s; QTAILQ_FOREACH(s, &slirp_stacks, entry) { int id; - bool got_vlan_id = net_hub_id_for_client(&s->nc, &id) == 0; - monitor_printf(mon, "VLAN %d (%s):\n", - got_vlan_id ? id : -1, - s->nc.name); - slirp_connection_info(s->slirp, mon); + bool got_hub_id = net_hub_id_for_client(&s->nc, &id) == 0; + char *info = slirp_connection_info(s->slirp); + monitor_printf(mon, "Hub %d (%s):\n%s", + got_hub_id ? 
id : -1, + s->nc.name, info); + g_free(info); } } @@ -726,17 +1080,27 @@ static const char **slirp_dnssearch(const StringList *dnsname) return ret; } -int net_init_slirp(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_slirp(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { struct slirp_config_str *config; char *vnet; int ret; const NetdevUserOptions *user; const char **dnssearch; + bool ipv4 = true, ipv6 = true; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_USER); - user = opts->user; + assert(netdev->type == NET_CLIENT_DRIVER_USER); + user = &netdev->u.user; + + if ((user->has_ipv6 && user->ipv6 && !user->has_ipv4) || + (user->has_ipv4 && !user->ipv4)) { + ipv4 = 0; + } + if ((user->has_ipv4 && user->ipv4 && !user->has_ipv6) || + (user->has_ipv6 && !user->ipv6)) { + ipv6 = 0; + } vnet = user->has_net ? g_strdup(user->net) : user->has_ip ? g_strdup_printf("%s/24", user->ip) : @@ -749,10 +1113,14 @@ int net_init_slirp(const NetClientOptions *opts, const char *name, net_init_slirp_configs(user->hostfwd, SLIRP_CFG_HOSTFWD); net_init_slirp_configs(user->guestfwd, 0); - ret = net_slirp_init(peer, "user", name, user->q_restrict, vnet, - user->host, user->hostname, user->tftp, - user->bootfile, user->dhcpstart, user->dns, user->smb, - user->smbserver, dnssearch); + ret = net_slirp_init(peer, "user", name, user->q_restrict, + ipv4, vnet, user->host, + ipv6, user->ipv6_prefix, user->ipv6_prefixlen, + user->ipv6_host, user->hostname, user->tftp, + user->bootfile, user->dhcpstart, + user->dns, user->ipv6_dns, user->smb, + user->smbserver, dnssearch, user->domainname, + user->tftp_server_name, errp); while (slirp_configs) { config = slirp_configs; @@ -765,30 +1133,3 @@ int net_init_slirp(const NetClientOptions *opts, const char *name, return ret; } - -int net_slirp_parse_legacy(QemuOptsList *opts_list, const char *optarg, int *ret) -{ - if (strcmp(opts_list->name, "net") != 0 || - strncmp(optarg, "channel,", 
strlen("channel,")) != 0) { - return 0; - } - - /* handle legacy -net channel,port:chr */ - optarg += strlen("channel,"); - - if (QTAILQ_EMPTY(&slirp_stacks)) { - struct slirp_config_str *config; - - config = g_malloc(sizeof(*config)); - pstrcpy(config->str, sizeof(config->str), optarg); - config->flags = SLIRP_CFG_LEGACY; - config->next = slirp_configs; - slirp_configs = config; - *ret = 0; - } else { - *ret = slirp_guestfwd(QTAILQ_FIRST(&slirp_stacks), optarg, 1); - } - - return 1; -} - diff --git a/net/socket.c b/net/socket.c index 87af1d3d3..c92354049 100644 --- a/net/socket.c +++ b/net/socket.c @@ -21,26 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include "config-host.h" +#include "qemu/osdep.h" #include "net/net.h" #include "clients.h" #include "monitor/monitor.h" +#include "qapi/error.h" #include "qemu-common.h" #include "qemu/error-report.h" #include "qemu/option.h" #include "qemu/sockets.h" #include "qemu/iov.h" +#include "qemu/main-loop.h" typedef struct NetSocketState { NetClientState nc; int listen_fd; int fd; - int state; /* 0 = getting length, 1 = getting data */ - unsigned int index; - unsigned int packet_len; + SocketReadState rs; unsigned int send_index; /* number of bytes sent (only SOCK_STREAM) */ - uint8_t buf[NET_BUFSIZE]; struct sockaddr_in dgram_dst; /* contains inet host and port destination iff connectionless (SOCK_DGRAM) */ IOHandler *send_fn; /* differs between SOCK_STREAM/SOCK_DGRAM */ bool read_poll; /* waiting to receive data? */ @@ -50,21 +49,12 @@ typedef struct NetSocketState { static void net_socket_accept(void *opaque); static void net_socket_writable(void *opaque); -/* Only read packets from socket when peer can receive them */ -static int net_socket_can_send(void *opaque) -{ - NetSocketState *s = opaque; - - return qemu_can_send_packet(&s->nc); -} - static void net_socket_update_fd_handler(NetSocketState *s) { - qemu_set_fd_handler2(s->fd, - s->read_poll ? 
net_socket_can_send : NULL, - s->read_poll ? s->send_fn : NULL, - s->write_poll ? net_socket_writable : NULL, - s); + qemu_set_fd_handler(s->fd, + s->read_poll ? s->send_fn : NULL, + s->write_poll ? net_socket_writable : NULL, + s); } static void net_socket_read_poll(NetSocketState *s, bool enable) @@ -129,9 +119,13 @@ static ssize_t net_socket_receive_dgram(NetClientState *nc, const uint8_t *buf, ssize_t ret; do { - ret = qemu_sendto(s->fd, buf, size, 0, - (struct sockaddr *)&s->dgram_dst, - sizeof(s->dgram_dst)); + if (s->dgram_dst.sin_family != AF_UNIX) { + ret = qemu_sendto(s->fd, buf, size, 0, + (struct sockaddr *)&s->dgram_dst, + sizeof(s->dgram_dst)); + } else { + ret = send(s->fd, buf, size, 0); + } } while (ret == -1 && errno == EINTR); if (ret == -1 && errno == EAGAIN) { @@ -141,18 +135,37 @@ static ssize_t net_socket_receive_dgram(NetClientState *nc, const uint8_t *buf, return ret; } +static void net_socket_send_completed(NetClientState *nc, ssize_t len) +{ + NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc); + + if (!s->read_poll) { + net_socket_read_poll(s, true); + } +} + +static void net_socket_rs_finalize(SocketReadState *rs) +{ + NetSocketState *s = container_of(rs, NetSocketState, rs); + + if (qemu_send_packet_async(&s->nc, rs->buf, + rs->packet_len, + net_socket_send_completed) == 0) { + net_socket_read_poll(s, false); + } +} + static void net_socket_send(void *opaque) { NetSocketState *s = opaque; - int size, err; - unsigned l; + int size; + int ret; uint8_t buf1[NET_BUFSIZE]; const uint8_t *buf; size = qemu_recv(s->fd, buf1, sizeof(buf1), 0); if (size < 0) { - err = socket_error(); - if (err != EWOULDBLOCK) + if (errno != EWOULDBLOCK) goto eoc; } else if (size == 0) { /* end of connection */ @@ -165,57 +178,18 @@ static void net_socket_send(void *opaque) closesocket(s->fd); s->fd = -1; - s->state = 0; - s->index = 0; - s->packet_len = 0; + net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); s->nc.link_down = true; - memset(s->buf, 0, 
sizeof(s->buf)); memset(s->nc.info_str, 0, sizeof(s->nc.info_str)); return; } buf = buf1; - while (size > 0) { - /* reassemble a packet from the network */ - switch(s->state) { - case 0: - l = 4 - s->index; - if (l > size) - l = size; - memcpy(s->buf + s->index, buf, l); - buf += l; - size -= l; - s->index += l; - if (s->index == 4) { - /* got length */ - s->packet_len = ntohl(*(uint32_t *)s->buf); - s->index = 0; - s->state = 1; - } - break; - case 1: - l = s->packet_len - s->index; - if (l > size) - l = size; - if (s->index + l <= sizeof(s->buf)) { - memcpy(s->buf + s->index, buf, l); - } else { - fprintf(stderr, "serious error: oversized packet received," - "connection terminated.\n"); - s->state = 0; - goto eoc; - } - s->index += l; - buf += l; - size -= l; - if (s->index >= s->packet_len) { - qemu_send_packet(&s->nc, s->buf, s->packet_len); - s->index = 0; - s->state = 0; - } - break; - } + ret = net_fill_rstate(&s->rs, buf, size); + + if (ret == -1) { + goto eoc; } } @@ -224,7 +198,7 @@ static void net_socket_send_dgram(void *opaque) NetSocketState *s = opaque; int size; - size = qemu_recv(s->fd, s->buf, sizeof(s->buf), 0); + size = qemu_recv(s->fd, s->rs.buf, sizeof(s->rs.buf), 0); if (size < 0) return; if (size == 0) { @@ -233,10 +207,15 @@ static void net_socket_send_dgram(void *opaque) net_socket_write_poll(s, false); return; } - qemu_send_packet(&s->nc, s->buf, size); + if (qemu_send_packet_async(&s->nc, s->rs.buf, size, + net_socket_send_completed) == 0) { + net_socket_read_poll(s, false); + } } -static int net_socket_mcast_create(struct sockaddr_in *mcastaddr, struct in_addr *localaddr) +static int net_socket_mcast_create(struct sockaddr_in *mcastaddr, + struct in_addr *localaddr, + Error **errp) { struct ip_mreq imr; int fd; @@ -248,29 +227,36 @@ static int net_socket_mcast_create(struct sockaddr_in *mcastaddr, struct in_addr #endif if (!IN_MULTICAST(ntohl(mcastaddr->sin_addr.s_addr))) { - fprintf(stderr, "qemu: error: specified mcastaddr \"%s\" 
(0x%08x) " - "does not contain a multicast address\n", - inet_ntoa(mcastaddr->sin_addr), - (int)ntohl(mcastaddr->sin_addr.s_addr)); + error_setg(errp, "specified mcastaddr %s (0x%08x) " + "does not contain a multicast address", + inet_ntoa(mcastaddr->sin_addr), + (int)ntohl(mcastaddr->sin_addr.s_addr)); return -1; - } + fd = qemu_socket(PF_INET, SOCK_DGRAM, 0); if (fd < 0) { - perror("socket(PF_INET, SOCK_DGRAM)"); + error_setg_errno(errp, errno, "can't create datagram socket"); return -1; } + /* Allow multiple sockets to bind the same multicast ip and port by setting + * SO_REUSEADDR. This is the only situation where SO_REUSEADDR should be set + * on windows. Use socket_set_fast_reuse otherwise as it sets SO_REUSEADDR + * only on posix systems. + */ val = 1; ret = qemu_setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); if (ret < 0) { - perror("setsockopt(SOL_SOCKET, SO_REUSEADDR)"); + error_setg_errno(errp, errno, + "can't set socket option SO_REUSEADDR"); goto fail; } ret = bind(fd, (struct sockaddr *)mcastaddr, sizeof(*mcastaddr)); if (ret < 0) { - perror("bind"); + error_setg_errno(errp, errno, "can't bind ip=%s to socket", + inet_ntoa(mcastaddr->sin_addr)); goto fail; } @@ -285,7 +271,9 @@ static int net_socket_mcast_create(struct sockaddr_in *mcastaddr, struct in_addr ret = qemu_setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &imr, sizeof(struct ip_mreq)); if (ret < 0) { - perror("setsockopt(IP_ADD_MEMBERSHIP)"); + error_setg_errno(errp, errno, + "can't add socket to multicast group %s", + inet_ntoa(imr.imr_multiaddr)); goto fail; } @@ -294,7 +282,8 @@ static int net_socket_mcast_create(struct sockaddr_in *mcastaddr, struct in_addr ret = qemu_setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); if (ret < 0) { - perror("setsockopt(SOL_IP, IP_MULTICAST_LOOP)"); + error_setg_errno(errp, errno, + "can't force multicast message to loopback"); goto fail; } @@ -303,7 +292,8 @@ static int net_socket_mcast_create(struct sockaddr_in *mcastaddr, 
struct in_addr ret = qemu_setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, localaddr, sizeof(*localaddr)); if (ret < 0) { - perror("setsockopt(IP_MULTICAST_IF)"); + error_setg_errno(errp, errno, + "can't set the default network send interface"); goto fail; } } @@ -333,7 +323,7 @@ static void net_socket_cleanup(NetClientState *nc) } static NetClientInfo net_dgram_socket_info = { - .type = NET_CLIENT_OPTIONS_KIND_SOCKET, + .type = NET_CLIENT_DRIVER_SOCKET, .size = sizeof(NetSocketState), .receive = net_socket_receive_dgram, .cleanup = net_socket_cleanup, @@ -342,62 +332,72 @@ static NetClientInfo net_dgram_socket_info = { static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer, const char *model, const char *name, - int fd, int is_connected) + int fd, int is_connected, + const char *mcast, + Error **errp) { struct sockaddr_in saddr; int newfd; - socklen_t saddr_len; NetClientState *nc; NetSocketState *s; + SocketAddress *sa; + SocketAddressType sa_type; + + sa = socket_local_address(fd, errp); + if (!sa) { + return NULL; + } + sa_type = sa->type; + qapi_free_SocketAddress(sa); /* fd passed: multicast: "learn" dgram_dst address from bound address and save it * Because this may be "shared" socket from a "master" process, datagrams would be recv() * by ONLY ONE process: we must "clone" this dgram socket --jjo */ - if (is_connected) { - if (getsockname(fd, (struct sockaddr *) &saddr, &saddr_len) == 0) { + if (is_connected && mcast != NULL) { + if (parse_host_port(&saddr, mcast, errp) < 0) { + goto err; + } /* must be bound */ if (saddr.sin_addr.s_addr == 0) { - fprintf(stderr, "qemu: error: init_dgram: fd=%d unbound, " - "cannot setup multicast dst addr\n", fd); + error_setg(errp, "can't setup multicast destination address"); goto err; } /* clone dgram socket */ - newfd = net_socket_mcast_create(&saddr, NULL); + newfd = net_socket_mcast_create(&saddr, NULL, errp); if (newfd < 0) { - /* error already reported by net_socket_mcast_create() */ goto err; } /* clone 
newfd to fd, close newfd */ dup2(newfd, fd); close(newfd); - } else { - fprintf(stderr, - "qemu: error: init_dgram: fd=%d failed getsockname(): %s\n", - fd, strerror(errno)); - goto err; - } } nc = qemu_new_net_client(&net_dgram_socket_info, peer, model, name); - snprintf(nc->info_str, sizeof(nc->info_str), - "socket: fd=%d (%s mcast=%s:%d)", - fd, is_connected ? "cloned" : "", - inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); - s = DO_UPCAST(NetSocketState, nc, nc); s->fd = fd; s->listen_fd = -1; s->send_fn = net_socket_send_dgram; + net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); net_socket_read_poll(s, true); /* mcast: save bound address as dst */ - if (is_connected) { + if (is_connected && mcast != NULL) { s->dgram_dst = saddr; + snprintf(nc->info_str, sizeof(nc->info_str), + "socket: fd=%d (cloned mcast=%s:%d)", + fd, inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); + } else { + if (sa_type == SOCKET_ADDRESS_TYPE_UNIX) { + s->dgram_dst.sin_family = AF_UNIX; + } + + snprintf(nc->info_str, sizeof(nc->info_str), + "socket: fd=%d %s", fd, SocketAddressType_str(sa_type)); } return s; @@ -415,7 +415,7 @@ static void net_socket_connect(void *opaque) } static NetClientInfo net_socket_info = { - .type = NET_CLIENT_OPTIONS_KIND_SOCKET, + .type = NET_CLIENT_DRIVER_SOCKET, .size = sizeof(NetSocketState), .receive = net_socket_receive, .cleanup = net_socket_cleanup, @@ -437,6 +437,7 @@ static NetSocketState *net_socket_fd_init_stream(NetClientState *peer, s->fd = fd; s->listen_fd = -1; + net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); /* Disable Nagle algorithm on TCP sockets to reduce latency */ socket_set_nodelay(fd); @@ -451,26 +452,27 @@ static NetSocketState *net_socket_fd_init_stream(NetClientState *peer, static NetSocketState *net_socket_fd_init(NetClientState *peer, const char *model, const char *name, - int fd, int is_connected) + int fd, int is_connected, + const char *mc, Error **errp) { int so_type = -1, optlen=sizeof(so_type); 
if(getsockopt(fd, SOL_SOCKET, SO_TYPE, (char *)&so_type, (socklen_t *)&optlen)< 0) { - fprintf(stderr, "qemu: error: getsockopt(SO_TYPE) for fd=%d failed\n", - fd); + error_setg(errp, "can't get socket option SO_TYPE"); closesocket(fd); return NULL; } switch(so_type) { case SOCK_DGRAM: - return net_socket_fd_init_dgram(peer, model, name, fd, is_connected); + return net_socket_fd_init_dgram(peer, model, name, fd, is_connected, + mc, errp); case SOCK_STREAM: return net_socket_fd_init_stream(peer, model, name, fd, is_connected); default: - /* who knows ... this could be a eg. a pty, do warn and continue as stream */ - fprintf(stderr, "qemu: warning: socket type=%d for fd=%d is not SOCK_DGRAM or SOCK_STREAM\n", so_type, fd); - return net_socket_fd_init_stream(peer, model, name, fd, is_connected); + error_setg(errp, "socket type=%d for fd=%d must be either" + " SOCK_DGRAM or SOCK_STREAM", so_type, fd); + closesocket(fd); } return NULL; } @@ -504,36 +506,37 @@ static void net_socket_accept(void *opaque) static int net_socket_listen_init(NetClientState *peer, const char *model, const char *name, - const char *host_str) + const char *host_str, + Error **errp) { NetClientState *nc; NetSocketState *s; struct sockaddr_in saddr; - int fd, val, ret; + int fd, ret; - if (parse_host_port(&saddr, host_str) < 0) + if (parse_host_port(&saddr, host_str, errp) < 0) { return -1; + } fd = qemu_socket(PF_INET, SOCK_STREAM, 0); if (fd < 0) { - perror("socket"); + error_setg_errno(errp, errno, "can't create stream socket"); return -1; } qemu_set_nonblock(fd); - /* allow fast reuse */ - val = 1; - qemu_setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); + socket_set_fast_reuse(fd); ret = bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)); if (ret < 0) { - perror("bind"); + error_setg_errno(errp, errno, "can't bind ip=%s to socket", + inet_ntoa(saddr.sin_addr)); closesocket(fd); return -1; } ret = listen(fd, 0); if (ret < 0) { - perror("listen"); + error_setg_errno(errp, errno, 
"can't listen on socket"); closesocket(fd); return -1; } @@ -543,6 +546,7 @@ static int net_socket_listen_init(NetClientState *peer, s->fd = -1; s->listen_fd = fd; s->nc.link_down = true; + net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s); return 0; @@ -551,18 +555,20 @@ static int net_socket_listen_init(NetClientState *peer, static int net_socket_connect_init(NetClientState *peer, const char *model, const char *name, - const char *host_str) + const char *host_str, + Error **errp) { NetSocketState *s; - int fd, connected, ret, err; + int fd, connected, ret; struct sockaddr_in saddr; - if (parse_host_port(&saddr, host_str) < 0) + if (parse_host_port(&saddr, host_str, errp) < 0) { return -1; + } fd = qemu_socket(PF_INET, SOCK_STREAM, 0); if (fd < 0) { - perror("socket"); + error_setg_errno(errp, errno, "can't create stream socket"); return -1; } qemu_set_nonblock(fd); @@ -571,16 +577,14 @@ static int net_socket_connect_init(NetClientState *peer, for(;;) { ret = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr)); if (ret < 0) { - err = socket_error(); - if (err == EINTR || err == EWOULDBLOCK) { - } else if (err == EINPROGRESS) { - break; -#ifdef _WIN32 - } else if (err == WSAEALREADY || err == WSAEINVAL) { + if (errno == EINTR || errno == EWOULDBLOCK) { + /* continue */ + } else if (errno == EINPROGRESS || + errno == EALREADY || + errno == EINVAL) { break; -#endif } else { - perror("connect"); + error_setg_errno(errp, errno, "can't connect socket"); closesocket(fd); return -1; } @@ -589,9 +593,11 @@ static int net_socket_connect_init(NetClientState *peer, break; } } - s = net_socket_fd_init(peer, model, name, fd, connected); - if (!s) + s = net_socket_fd_init(peer, model, name, fd, connected, NULL, errp); + if (!s) { return -1; + } + snprintf(s->nc.info_str, sizeof(s->nc.info_str), "socket: connect to %s:%d", inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); @@ -602,31 +608,38 @@ static int 
net_socket_mcast_init(NetClientState *peer, const char *model, const char *name, const char *host_str, - const char *localaddr_str) + const char *localaddr_str, + Error **errp) { NetSocketState *s; int fd; struct sockaddr_in saddr; struct in_addr localaddr, *param_localaddr; - if (parse_host_port(&saddr, host_str) < 0) + if (parse_host_port(&saddr, host_str, errp) < 0) { return -1; + } if (localaddr_str != NULL) { - if (inet_aton(localaddr_str, &localaddr) == 0) + if (inet_aton(localaddr_str, &localaddr) == 0) { + error_setg(errp, "localaddr '%s' is not a valid IPv4 address", + localaddr_str); return -1; + } param_localaddr = &localaddr; } else { param_localaddr = NULL; } - fd = net_socket_mcast_create(&saddr, param_localaddr); - if (fd < 0) + fd = net_socket_mcast_create(&saddr, param_localaddr, errp); + if (fd < 0) { return -1; + } - s = net_socket_fd_init(peer, model, name, fd, 0); - if (!s) + s = net_socket_fd_init(peer, model, name, fd, 0, NULL, errp); + if (!s) { return -1; + } s->dgram_dst = saddr; @@ -641,42 +654,44 @@ static int net_socket_udp_init(NetClientState *peer, const char *model, const char *name, const char *rhost, - const char *lhost) + const char *lhost, + Error **errp) { NetSocketState *s; - int fd, val, ret; + int fd, ret; struct sockaddr_in laddr, raddr; - if (parse_host_port(&laddr, lhost) < 0) { + if (parse_host_port(&laddr, lhost, errp) < 0) { return -1; } - if (parse_host_port(&raddr, rhost) < 0) { + if (parse_host_port(&raddr, rhost, errp) < 0) { return -1; } fd = qemu_socket(PF_INET, SOCK_DGRAM, 0); if (fd < 0) { - perror("socket(PF_INET, SOCK_DGRAM)"); + error_setg_errno(errp, errno, "can't create datagram socket"); return -1; } - val = 1; - ret = qemu_setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - &val, sizeof(val)); + + ret = socket_set_fast_reuse(fd); if (ret < 0) { - perror("setsockopt(SOL_SOCKET, SO_REUSEADDR)"); + error_setg_errno(errp, errno, + "can't set socket option SO_REUSEADDR"); closesocket(fd); return -1; } ret = bind(fd, 
(struct sockaddr *)&laddr, sizeof(laddr)); if (ret < 0) { - perror("bind"); + error_setg_errno(errp, errno, "can't bind ip=%s to socket", + inet_ntoa(laddr.sin_addr)); closesocket(fd); return -1; } qemu_set_nonblock(fd); - s = net_socket_fd_init(peer, model, name, fd, 0); + s = net_socket_fd_init(peer, model, name, fd, 0, NULL, errp); if (!s) { return -1; } @@ -689,50 +704,52 @@ static int net_socket_udp_init(NetClientState *peer, return 0; } -int net_init_socket(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_socket(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { const NetdevSocketOptions *sock; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_SOCKET); - sock = opts->socket; + assert(netdev->type == NET_CLIENT_DRIVER_SOCKET); + sock = &netdev->u.socket; if (sock->has_fd + sock->has_listen + sock->has_connect + sock->has_mcast + sock->has_udp != 1) { - error_report("exactly one of fd=, listen=, connect=, mcast= or udp=" - " is required"); + error_setg(errp, "exactly one of listen=, connect=, mcast= or udp=" + " is required"); return -1; } if (sock->has_localaddr && !sock->has_mcast && !sock->has_udp) { - error_report("localaddr= is only valid with mcast= or udp="); + error_setg(errp, "localaddr= is only valid with mcast= or udp="); return -1; } if (sock->has_fd) { int fd; - fd = monitor_handle_fd_param(cur_mon, sock->fd); + fd = monitor_fd_param(cur_mon, sock->fd, errp); if (fd == -1) { return -1; } qemu_set_nonblock(fd); - if (!net_socket_fd_init(peer, "socket", name, fd, 1)) { + if (!net_socket_fd_init(peer, "socket", name, fd, 1, sock->mcast, + errp)) { return -1; } return 0; } if (sock->has_listen) { - if (net_socket_listen_init(peer, "socket", name, sock->listen) == -1) { + if (net_socket_listen_init(peer, "socket", name, sock->listen, errp) + < 0) { return -1; } return 0; } if (sock->has_connect) { - if (net_socket_connect_init(peer, "socket", name, sock->connect) == - -1) { + if 
(net_socket_connect_init(peer, "socket", name, sock->connect, errp) + < 0) { return -1; } return 0; @@ -742,7 +759,7 @@ int net_init_socket(const NetClientOptions *opts, const char *name, /* if sock->localaddr is missing, it has been initialized to "all bits * zero" */ if (net_socket_mcast_init(peer, "socket", name, sock->mcast, - sock->localaddr) == -1) { + sock->localaddr, errp) < 0) { return -1; } return 0; @@ -750,11 +767,11 @@ int net_init_socket(const NetClientOptions *opts, const char *name, assert(sock->has_udp); if (!sock->has_localaddr) { - error_report("localaddr= is mandatory with udp="); + error_setg(errp, "localaddr= is mandatory with udp="); return -1; } - if (net_socket_udp_init(peer, "socket", name, sock->udp, sock->localaddr) == - -1) { + if (net_socket_udp_init(peer, "socket", name, sock->udp, sock->localaddr, + errp) < 0) { return -1; } return 0; diff --git a/net/tap-bsd.c b/net/tap-bsd.c index f61d58096..a5c3707f8 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -22,19 +22,26 @@ * THE SOFTWARE. */ -#include "tap_int.h" +#include "qemu/osdep.h" #include "qemu-common.h" -#include "sysemu/sysemu.h" +#include "qapi/error.h" +#include "tap_int.h" +#include "qemu/cutils.h" #include "qemu/error-report.h" -#ifdef __NetBSD__ +#if defined(__NetBSD__) || defined(__FreeBSD__) #include <sys/ioctl.h> #include <net/if.h> #include <net/if_tap.h> #endif +#if defined(__OpenBSD__) +#include <sys/param.h> +#endif + +#ifndef __FreeBSD__ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, - int vnet_hdr_required, int mq_required) + int vnet_hdr_required, int mq_required, Error **errp) { int fd; #ifdef TAPGIFNAME @@ -44,8 +51,6 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, struct stat s; #endif -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || \ - defined(__OpenBSD__) || defined(__APPLE__) /* if no ifname is given, always start the search from tap0/tun0. 
*/ int i; char dname[100]; @@ -54,7 +59,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, if (*ifname) { snprintf(dname, sizeof dname, "/dev/%s", ifname); } else { -#if defined(__OpenBSD__) +#if defined(__OpenBSD__) && OpenBSD < 201605 snprintf(dname, sizeof dname, "/dev/tun%d", i); #else snprintf(dname, sizeof dname, "/dev/tap%d", i); @@ -72,32 +77,19 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, } } if (fd < 0) { - error_report("warning: could not open %s (%s): no virtual network emulation", - dname, strerror(errno)); - return -1; - } -#else - TFR(fd = open("/dev/tap", O_RDWR)); - if (fd < 0) { - fprintf(stderr, - "warning: could not open /dev/tap: no virtual network emulation: %s\n", - strerror(errno)); + error_setg_errno(errp, errno, "could not open %s", dname); return -1; } -#endif #ifdef TAPGIFNAME if (ioctl(fd, TAPGIFNAME, (void *)&ifr) < 0) { - fprintf(stderr, "warning: could not get tap name: %s\n", - strerror(errno)); + error_setg_errno(errp, errno, "could not get tap name"); return -1; } pstrcpy(ifname, ifname_size, ifr.ifr_name); #else if (fstat(fd, &s) < 0) { - fprintf(stderr, - "warning: could not stat /dev/tap: no virtual network emulation: %s\n", - strerror(errno)); + error_setg_errno(errp, errno, "could not stat %s", dname); return -1; } dev = devname(s.st_rdev, S_IFCHR); @@ -109,8 +101,8 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, *vnet_hdr = 0; if (vnet_hdr_required && !*vnet_hdr) { - error_report("vnet_hdr=1 requested, but no kernel " - "support for IFF_VNET_HDR available"); + error_setg(errp, "vnet_hdr=1 requested, but no kernel " + "support for IFF_VNET_HDR available"); close(fd); return -1; } @@ -119,9 +111,104 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, return fd; } -int tap_set_sndbuf(int fd, const NetdevTapOptions *tap) +#else /* __FreeBSD__ */ + +#define PATH_NET_TAP "/dev/tap" + +static int tap_open_clone(char *ifname, int ifname_size, Error **errp) +{ + int fd, s, ret; + 
struct ifreq ifr; + + TFR(fd = open(PATH_NET_TAP, O_RDWR)); + if (fd < 0) { + error_setg_errno(errp, errno, "could not open %s", PATH_NET_TAP); + return -1; + } + + memset(&ifr, 0, sizeof(ifr)); + + ret = ioctl(fd, TAPGIFNAME, (void *)&ifr); + if (ret < 0) { + error_setg_errno(errp, errno, "could not get tap interface name"); + close(fd); + return -1; + } + + if (ifname[0] != '\0') { + /* User requested the interface to have a specific name */ + s = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (s < 0) { + error_setg_errno(errp, errno, + "could not open socket to set interface name"); + close(fd); + return -1; + } + ifr.ifr_data = ifname; + ret = ioctl(s, SIOCSIFNAME, (void *)&ifr); + close(s); + if (ret < 0) { + error_setg(errp, "could not set tap interface name"); + close(fd); + return -1; + } + } else { + pstrcpy(ifname, ifname_size, ifr.ifr_name); + } + + return fd; +} + +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required, Error **errp) +{ + int fd = -1; + + /* If the specified tap device already exists just use it. */ + if (ifname[0] != '\0') { + char dname[100]; + snprintf(dname, sizeof dname, "/dev/%s", ifname); + TFR(fd = open(dname, O_RDWR)); + if (fd < 0 && errno != ENOENT) { + error_setg_errno(errp, errno, "could not open %s", dname); + return -1; + } + } + + if (fd < 0) { + /* Tap device not specified or does not exist. 
*/ + if ((fd = tap_open_clone(ifname, ifname_size, errp)) < 0) { + return -1; + } + } + + if (*vnet_hdr) { + /* BSD doesn't have IFF_VNET_HDR */ + *vnet_hdr = 0; + + if (vnet_hdr_required && !*vnet_hdr) { + error_setg(errp, "vnet_hdr=1 requested, but no kernel " + "support for IFF_VNET_HDR available"); + goto error; + } + } + if (mq_required) { + error_setg(errp, "mq_required requested, but no kernel support" + " for IFF_MULTI_QUEUE available"); + goto error; + } + + fcntl(fd, F_SETFL, O_NONBLOCK); + return fd; + +error: + close(fd); + return -1; +} +#endif /* __FreeBSD__ */ + +void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) { - return 0; } int tap_probe_vnet_hdr(int fd) @@ -143,6 +230,16 @@ void tap_fd_set_vnet_hdr_len(int fd, int len) { } +int tap_fd_set_vnet_le(int fd, int is_le) +{ + return -EINVAL; +} + +int tap_fd_set_vnet_be(int fd, int is_be) +{ + return -EINVAL; +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-haiku.c b/net/tap-haiku.c deleted file mode 100644 index e5ce436d2..000000000 --- a/net/tap-haiku.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "tap_int.h" -#include <stdio.h> - -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, - int vnet_hdr_required, int mq_required) -{ - fprintf(stderr, "no tap on Haiku\n"); - return -1; -} - -int tap_set_sndbuf(int fd, const NetdevTapOptions *tap) -{ - return 0; -} - -int tap_probe_vnet_hdr(int fd) -{ - return 0; -} - -int tap_probe_has_ufo(int fd) -{ - return 0; -} - -int tap_probe_vnet_hdr_len(int fd, int len) -{ - return 0; -} - -void tap_fd_set_vnet_hdr_len(int fd, int len) -{ -} - -void tap_fd_set_offload(int fd, int csum, int tso4, - int tso6, int ecn, int ufo) -{ -} - -int tap_fd_enable(int fd) -{ - return -1; -} - -int tap_fd_disable(int fd) -{ - return -1; -} - -int tap_fd_get_ifname(int fd, char *ifname) -{ - return -1; -} diff --git a/net/tap-linux.c b/net/tap-linux.c index 36c09e24d..e0dd442ee 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -23,6 +23,8 @@ * THE SOFTWARE. 
*/ +#include "qemu/osdep.h" +#include "qemu-common.h" #include "tap_int.h" #include "tap-linux.h" #include "net/tap.h" @@ -30,14 +32,14 @@ #include <net/if.h> #include <sys/ioctl.h> -#include "sysemu/sysemu.h" -#include "qemu-common.h" +#include "qapi/error.h" #include "qemu/error-report.h" +#include "qemu/cutils.h" #define PATH_NET_TUN "/dev/net/tun" int tap_open(char *ifname, int ifname_size, int *vnet_hdr, - int vnet_hdr_required, int mq_required) + int vnet_hdr_required, int mq_required, Error **errp) { struct ifreq ifr; int fd, ret; @@ -46,20 +48,23 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, TFR(fd = open(PATH_NET_TUN, O_RDWR)); if (fd < 0) { - error_report("could not open %s: %m", PATH_NET_TUN); + error_setg_errno(errp, errno, "could not open %s", PATH_NET_TUN); return -1; } memset(&ifr, 0, sizeof(ifr)); ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - if (ioctl(fd, TUNGETFEATURES, &features) == 0 && - features & IFF_ONE_QUEUE) { + if (ioctl(fd, TUNGETFEATURES, &features) == -1) { + warn_report("TUNGETFEATURES failed: %s", strerror(errno)); + features = 0; + } + + if (features & IFF_ONE_QUEUE) { ifr.ifr_flags |= IFF_ONE_QUEUE; } if (*vnet_hdr) { - if (ioctl(fd, TUNGETFEATURES, &features) == 0 && - features & IFF_VNET_HDR) { + if (features & IFF_VNET_HDR) { *vnet_hdr = 1; ifr.ifr_flags |= IFF_VNET_HDR; } else { @@ -67,8 +72,8 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, } if (vnet_hdr_required && !*vnet_hdr) { - error_report("vnet_hdr=1 requested, but no kernel " - "support for IFF_VNET_HDR available"); + error_setg(errp, "vnet_hdr=1 requested, but no kernel " + "support for IFF_VNET_HDR available"); close(fd); return -1; } @@ -82,10 +87,9 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, } if (mq_required) { - if ((ioctl(fd, TUNGETFEATURES, &features) != 0) || - !(features & IFF_MULTI_QUEUE)) { - error_report("multiqueue required, but no kernel " - "support for IFF_MULTI_QUEUE available"); + if (!(features & 
IFF_MULTI_QUEUE)) { + error_setg(errp, "multiqueue required, but no kernel " + "support for IFF_MULTI_QUEUE available"); close(fd); return -1; } else { @@ -100,9 +104,11 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, ret = ioctl(fd, TUNSETIFF, (void *) &ifr); if (ret != 0) { if (ifname[0] != '\0') { - error_report("could not configure %s (%s): %m", PATH_NET_TUN, ifr.ifr_name); + error_setg_errno(errp, errno, "could not configure %s (%s)", + PATH_NET_TUN, ifr.ifr_name); } else { - error_report("could not configure %s: %m", PATH_NET_TUN); + error_setg_errno(errp, errno, "could not configure %s", + PATH_NET_TUN); } close(fd); return -1; @@ -124,7 +130,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, */ #define TAP_DEFAULT_SNDBUF 0 -int tap_set_sndbuf(int fd, const NetdevTapOptions *tap) +void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) { int sndbuf; @@ -137,10 +143,8 @@ int tap_set_sndbuf(int fd, const NetdevTapOptions *tap) } if (ioctl(fd, TUNSETSNDBUF, &sndbuf) == -1 && tap->has_sndbuf) { - error_report("TUNSETSNDBUF ioctl failed: %s", strerror(errno)); - return -1; + error_setg_errno(errp, errno, "TUNSETSNDBUF ioctl failed"); } - return 0; } int tap_probe_vnet_hdr(int fd) @@ -196,6 +200,40 @@ void tap_fd_set_vnet_hdr_len(int fd, int len) } } +int tap_fd_set_vnet_le(int fd, int is_le) +{ + int arg = is_le ? 1 : 0; + + if (!ioctl(fd, TUNSETVNETLE, &arg)) { + return 0; + } + + /* Check if our kernel supports TUNSETVNETLE */ + if (errno == EINVAL) { + return -errno; + } + + error_report("TUNSETVNETLE ioctl() failed: %s.", strerror(errno)); + abort(); +} + +int tap_fd_set_vnet_be(int fd, int is_be) +{ + int arg = is_be ? 
1 : 0; + + if (!ioctl(fd, TUNSETVNETBE, &arg)) { + return 0; + } + + /* Check if our kernel supports TUNSETVNETBE */ + if (errno == EINVAL) { + return -errno; + } + + error_report("TUNSETVNETBE ioctl() failed: %s.", strerror(errno)); + abort(); +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-linux.h b/net/tap-linux.h index 1cf35d41b..2f36d100f 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -16,7 +16,6 @@ #ifndef QEMU_TAP_LINUX_H #define QEMU_TAP_LINUX_H -#include <stdint.h> #ifdef __linux__ #include <linux/ioctl.h> @@ -30,6 +29,8 @@ #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) #define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETVNETLE _IOW('T', 220, int) +#define TUNSETVNETBE _IOW('T', 222, int) #endif @@ -49,4 +50,4 @@ #define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ #define TUN_F_UFO 0x10 /* I can handle UFO packets */ -#endif /* QEMU_TAP_H */ +#endif /* QEMU_TAP_LINUX_H */ diff --git a/net/tap-solaris.c b/net/tap-solaris.c index 9c7278f1b..4725d2314 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -22,10 +22,12 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "tap_int.h" -#include "sysemu/sysemu.h" +#include "qemu/ctype.h" +#include "qemu/cutils.h" -#include <sys/stat.h> #include <sys/ethernet.h> #include <sys/sockio.h> #include <netinet/arp.h> @@ -36,7 +38,6 @@ #include <netinet/udp.h> #include <netinet/tcp.h> #include <net/if.h> -#include <syslog.h> #include <stropts.h> #include "qemu/error-report.h" @@ -56,8 +57,10 @@ ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen) * Allocate TAP device, returns opened fd. * Stores dev name in the first arg(must be large enough). 
*/ -static int tap_alloc(char *dev, size_t dev_size) +static int tap_alloc(char *dev, size_t dev_size, Error **errp) { + /* FIXME leaks like a sieve on error paths */ + /* FIXME suspicious: many errors are reported, then ignored */ int tap_fd, if_fd, ppa = -1; static int ip_fd = 0; char *ptr; @@ -83,14 +86,14 @@ static int tap_alloc(char *dev, size_t dev_size) TFR(ip_fd = open("/dev/udp", O_RDWR, 0)); if (ip_fd < 0) { - syslog(LOG_ERR, "Can't open /dev/ip (actually /dev/udp)"); - return -1; + error_setg(errp, "Can't open /dev/ip (actually /dev/udp)"); + return -1; } TFR(tap_fd = open("/dev/tap", O_RDWR, 0)); if (tap_fd < 0) { - syslog(LOG_ERR, "Can't open /dev/tap"); - return -1; + error_setg(errp, "Can't open /dev/tap"); + return -1; } /* Assign a new PPA and get its unit number. */ @@ -99,20 +102,20 @@ static int tap_alloc(char *dev, size_t dev_size) strioc_ppa.ic_len = sizeof(ppa); strioc_ppa.ic_dp = (char *)&ppa; if ((ppa = ioctl (tap_fd, I_STR, &strioc_ppa)) < 0) - syslog (LOG_ERR, "Can't assign new interface"); + error_report("Can't assign new interface"); TFR(if_fd = open("/dev/tap", O_RDWR, 0)); if (if_fd < 0) { - syslog(LOG_ERR, "Can't open /dev/tap (2)"); - return -1; + error_setg(errp, "Can't open /dev/tap (2)"); + return -1; } if(ioctl(if_fd, I_PUSH, "ip") < 0){ - syslog(LOG_ERR, "Can't push IP module"); - return -1; + error_setg(errp, "Can't push IP module"); + return -1; } if (ioctl(if_fd, SIOCGLIFFLAGS, &ifr) < 0) - syslog(LOG_ERR, "Can't get flags\n"); + error_report("Can't get flags"); snprintf (actual_name, 32, "tap%d", ppa); pstrcpy(ifr.lifr_name, sizeof(ifr.lifr_name), actual_name); @@ -121,22 +124,22 @@ static int tap_alloc(char *dev, size_t dev_size) /* Assign ppa according to the unit number returned by tun device */ if (ioctl (if_fd, SIOCSLIFNAME, &ifr) < 0) - syslog (LOG_ERR, "Can't set PPA %d", ppa); + error_report("Can't set PPA %d", ppa); if (ioctl(if_fd, SIOCGLIFFLAGS, &ifr) <0) - syslog (LOG_ERR, "Can't get flags\n"); + 
error_report("Can't get flags"); /* Push arp module to if_fd */ if (ioctl (if_fd, I_PUSH, "arp") < 0) - syslog (LOG_ERR, "Can't push ARP module (2)"); + error_report("Can't push ARP module (2)"); /* Push arp module to ip_fd */ if (ioctl (ip_fd, I_POP, NULL) < 0) - syslog (LOG_ERR, "I_POP failed\n"); + error_report("I_POP failed"); if (ioctl (ip_fd, I_PUSH, "arp") < 0) - syslog (LOG_ERR, "Can't push ARP module (3)\n"); + error_report("Can't push ARP module (3)"); /* Open arp_fd */ TFR(arp_fd = open ("/dev/tap", O_RDWR, 0)); if (arp_fd < 0) - syslog (LOG_ERR, "Can't open %s\n", "/dev/tap"); + error_report("Can't open %s", "/dev/tap"); /* Set ifname to arp */ strioc_if.ic_cmd = SIOCSLIFNAME; @@ -144,16 +147,16 @@ static int tap_alloc(char *dev, size_t dev_size) strioc_if.ic_len = sizeof(ifr); strioc_if.ic_dp = (char *)&ifr; if (ioctl(arp_fd, I_STR, &strioc_if) < 0){ - syslog (LOG_ERR, "Can't set ifname to arp\n"); + error_report("Can't set ifname to arp"); } if((ip_muxid = ioctl(ip_fd, I_LINK, if_fd)) < 0){ - syslog(LOG_ERR, "Can't link TAP device to IP"); - return -1; + error_setg(errp, "Can't link TAP device to IP"); + return -1; } if ((arp_muxid = ioctl (ip_fd, link_type, arp_fd)) < 0) - syslog (LOG_ERR, "Can't link TAP device to ARP"); + error_report("Can't link TAP device to ARP"); close (if_fd); @@ -166,7 +169,7 @@ static int tap_alloc(char *dev, size_t dev_size) { ioctl (ip_fd, I_PUNLINK , arp_muxid); ioctl (ip_fd, I_PUNLINK, ip_muxid); - syslog (LOG_ERR, "Can't set multiplexor id"); + error_report("Can't set multiplexor id"); } snprintf(dev, dev_size, "tap%d", ppa); @@ -174,13 +177,14 @@ static int tap_alloc(char *dev, size_t dev_size) } int tap_open(char *ifname, int ifname_size, int *vnet_hdr, - int vnet_hdr_required, int mq_required) + int vnet_hdr_required, int mq_required, Error **errp) { char dev[10]=""; int fd; - if( (fd = tap_alloc(dev, sizeof(dev))) < 0 ){ - fprintf(stderr, "Cannot allocate TAP device\n"); - return -1; + + fd = tap_alloc(dev, sizeof(dev), 
errp); + if (fd < 0) { + return -1; } pstrcpy(ifname, ifname_size, dev); if (*vnet_hdr) { @@ -188,8 +192,8 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, *vnet_hdr = 0; if (vnet_hdr_required && !*vnet_hdr) { - error_report("vnet_hdr=1 requested, but no kernel " - "support for IFF_VNET_HDR available"); + error_setg(errp, "vnet_hdr=1 requested, but no kernel " + "support for IFF_VNET_HDR available"); close(fd); return -1; } @@ -198,9 +202,8 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, return fd; } -int tap_set_sndbuf(int fd, const NetdevTapOptions *tap) +void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) { - return 0; } int tap_probe_vnet_hdr(int fd) @@ -222,6 +225,16 @@ void tap_fd_set_vnet_hdr_len(int fd, int len) { } +int tap_fd_set_vnet_le(int fd, int is_le) +{ + return -EINVAL; +} + +int tap_fd_set_vnet_be(int fd, int is_be) +{ + return -EINVAL; +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { diff --git a/net/tap-aix.c b/net/tap-stub.c index 804d16448..a9ab8f829 100644 --- a/net/tap-aix.c +++ b/net/tap-stub.c @@ -22,19 +22,19 @@ * THE SOFTWARE. 
*/ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "tap_int.h" -#include <stdio.h> int tap_open(char *ifname, int ifname_size, int *vnet_hdr, - int vnet_hdr_required, int mq_required) + int vnet_hdr_required, int mq_required, Error **errp) { - fprintf(stderr, "no tap on AIX\n"); + error_setg(errp, "tap is not supported in this build"); return -1; } -int tap_set_sndbuf(int fd, const NetdevTapOptions *tap) +void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp) { - return 0; } int tap_probe_vnet_hdr(int fd) @@ -56,6 +56,16 @@ void tap_fd_set_vnet_hdr_len(int fd, int len) { } +int tap_fd_set_vnet_le(int fd, int is_le) +{ + return -EINVAL; +} + +int tap_fd_set_vnet_be(int fd, int is_be) +{ + return -EINVAL; +} + void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { @@ -75,4 +85,3 @@ int tap_fd_get_ifname(int fd, char *ifname) { return -1; } - diff --git a/net/tap-win32.c b/net/tap-win32.c index 91e9e844a..2b5dcda36 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -26,15 +26,15 @@ * distribution); if not, see <http://www.gnu.org/licenses/>. */ +#include "qemu/osdep.h" #include "tap_int.h" #include "qemu-common.h" #include "clients.h" /* net_init_tap */ #include "net/net.h" #include "net/tap.h" /* tap_has_ufo, ... */ -#include "sysemu/sysemu.h" #include "qemu/error-report.h" -#include <stdio.h> +#include "qemu/main-loop.h" #include <windows.h> #include <winioctl.h> @@ -77,7 +77,12 @@ //#define DEBUG_TAP_WIN32 -#define TUN_ASYNCHRONOUS_WRITES 1 +/* FIXME: The asynch write path appears to be broken at + * present. WriteFile() ignores the lpNumberOfBytesWritten parameter + * for overlapped writes, with the result we return zero bytes sent, + * and after handling a single packet, receive is disabled for this + * interface. 
*/ +/* #define TUN_ASYNCHRONOUS_WRITES 1 */ #define TUN_BUFFER_SIZE 1560 #define TUN_MAX_BUFFER_COUNT 32 @@ -356,7 +361,8 @@ static int get_device_guid( &len); if (status != ERROR_SUCCESS || name_type != REG_SZ) { - return -1; + ++i; + continue; } else { if (is_tap_win32_dev(enum_name)) { @@ -460,27 +466,48 @@ static int tap_win32_write(tap_win32_overlapped_t *overlapped, BOOL result; DWORD error; +#ifdef TUN_ASYNCHRONOUS_WRITES result = GetOverlappedResult( overlapped->handle, &overlapped->write_overlapped, &write_size, FALSE); if (!result && GetLastError() == ERROR_IO_INCOMPLETE) WaitForSingleObject(overlapped->write_event, INFINITE); +#endif result = WriteFile(overlapped->handle, buffer, size, &write_size, &overlapped->write_overlapped); +#ifdef TUN_ASYNCHRONOUS_WRITES + /* FIXME: we can't sensibly set write_size here, without waiting + * for the IO to complete! Moreover, we can't return zero, + * because that will disable receive on this interface, and we + * also can't assume it will succeed and return the full size, + * because that will result in the buffer being reclaimed while + * the IO is in progress. */ +#error Async writes are broken. Please disable TUN_ASYNCHRONOUS_WRITES. 
+#else /* !TUN_ASYNCHRONOUS_WRITES */ if (!result) { - switch (error = GetLastError()) - { - case ERROR_IO_PENDING: -#ifndef TUN_ASYNCHRONOUS_WRITES - WaitForSingleObject(overlapped->write_event, INFINITE); -#endif - break; - default: - return -1; + error = GetLastError(); + if (error == ERROR_IO_PENDING) { + result = GetOverlappedResult(overlapped->handle, + &overlapped->write_overlapped, + &write_size, TRUE); } } +#endif + + if (!result) { +#ifdef DEBUG_TAP_WIN32 + LPTSTR msgbuf; + error = GetLastError(); + FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM, + NULL, error, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + &msgbuf, 0, NULL); + fprintf(stderr, "Tap-Win32: Error WriteFile %d - %s\n", error, msgbuf); + LocalFree(msgbuf); +#endif + return 0; + } return write_size; } @@ -669,11 +696,70 @@ static void tap_win32_send(void *opaque) } } +static bool tap_has_ufo(NetClientState *nc) +{ + return false; +} + +static bool tap_has_vnet_hdr(NetClientState *nc) +{ + return false; +} + +int tap_probe_vnet_hdr_len(int fd, int len) +{ + return 0; +} + +void tap_fd_set_vnet_hdr_len(int fd, int len) +{ +} + +int tap_fd_set_vnet_le(int fd, int is_le) +{ + return -EINVAL; +} + +int tap_fd_set_vnet_be(int fd, int is_be) +{ + return -EINVAL; +} + +static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) +{ +} + +static void tap_set_offload(NetClientState *nc, int csum, int tso4, + int tso6, int ecn, int ufo) +{ +} + +struct vhost_net *tap_get_vhost_net(NetClientState *nc) +{ + return NULL; +} + +static bool tap_has_vnet_hdr_len(NetClientState *nc, int len) +{ + return false; +} + +static void tap_set_vnet_hdr_len(NetClientState *nc, int len) +{ + abort(); +} + static NetClientInfo net_tap_win32_info = { - .type = NET_CLIENT_OPTIONS_KIND_TAP, + .type = NET_CLIENT_DRIVER_TAP, .size = sizeof(TAPState), .receive = tap_receive, .cleanup = tap_cleanup, + .has_ufo = tap_has_ufo, + .has_vnet_hdr = tap_has_vnet_hdr, + .has_vnet_hdr_len = 
tap_has_vnet_hdr_len, + .using_vnet_hdr = tap_using_vnet_hdr, + .set_offload = tap_set_offload, + .set_vnet_hdr_len = tap_set_vnet_hdr_len, }; static int tap_win32_init(NetClientState *peer, const char *model, @@ -702,13 +788,14 @@ static int tap_win32_init(NetClientState *peer, const char *model, return 0; } -int net_init_tap(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_tap(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { + /* FIXME error_setg(errp, ...) on failure */ const NetdevTapOptions *tap; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_TAP); - tap = opts->tap; + assert(netdev->type == NET_CLIENT_DRIVER_TAP); + tap = &netdev->u.tap; if (!tap->has_ifname) { error_report("tap: no interface name"); @@ -722,49 +809,6 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return 0; } -bool tap_has_ufo(NetClientState *nc) -{ - return false; -} - -int tap_has_vnet_hdr(NetClientState *nc) -{ - return 0; -} - -int tap_probe_vnet_hdr_len(int fd, int len) -{ - return 0; -} - -void tap_fd_set_vnet_hdr_len(int fd, int len) -{ -} - -void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) -{ -} - -void tap_set_offload(NetClientState *nc, int csum, int tso4, - int tso6, int ecn, int ufo) -{ -} - -struct vhost_net *tap_get_vhost_net(NetClientState *nc) -{ - return NULL; -} - -int tap_has_vnet_hdr_len(NetClientState *nc, int len) -{ - return 0; -} - -void tap_set_vnet_hdr_len(NetClientState *nc, int len) -{ - abort(); -} - int tap_enable(NetClientState *nc) { abort(); @@ -23,12 +23,11 @@ * THE SOFTWARE. 
*/ +#include "qemu/osdep.h" #include "tap_int.h" -#include "config-host.h" #include <sys/ioctl.h> -#include <sys/stat.h> #include <sys/wait.h> #include <sys/socket.h> #include <net/if.h> @@ -37,8 +36,12 @@ #include "clients.h" #include "monitor/monitor.h" #include "sysemu/sysemu.h" +#include "qapi/error.h" #include "qemu-common.h" +#include "qemu/cutils.h" #include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/sockets.h" #include "net/tap.h" @@ -57,21 +60,21 @@ typedef struct TAPState { bool enabled; VHostNetState *vhost_net; unsigned host_vnet_hdr_len; + Notifier exit; } TAPState; -static int launch_script(const char *setup_script, const char *ifname, int fd); +static void launch_script(const char *setup_script, const char *ifname, + int fd, Error **errp); -static int tap_can_send(void *opaque); static void tap_send(void *opaque); static void tap_writable(void *opaque); static void tap_update_fd_handler(TAPState *s) { - qemu_set_fd_handler2(s->fd, - s->read_poll && s->enabled ? tap_can_send : NULL, - s->read_poll && s->enabled ? tap_send : NULL, - s->write_poll && s->enabled ? tap_writable : NULL, - s); + qemu_set_fd_handler(s->fd, + s->read_poll && s->enabled ? tap_send : NULL, + s->write_poll && s->enabled ? 
tap_writable : NULL, + s); } static void tap_read_poll(TAPState *s, bool enable) @@ -165,13 +168,6 @@ static ssize_t tap_receive(NetClientState *nc, const uint8_t *buf, size_t size) return tap_write_packet(s, iov, 1); } -static int tap_can_send(void *opaque) -{ - TAPState *s = opaque; - - return qemu_can_send_packet(&s->nc); -} - #ifndef __sun__ ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen) { @@ -189,8 +185,9 @@ static void tap_send(void *opaque) { TAPState *s = opaque; int size; + int packets = 0; - do { + while (true) { uint8_t *buf = s->buf; size = tap_read_packet(s->fd, s->buf, sizeof(s->buf)); @@ -206,42 +203,56 @@ static void tap_send(void *opaque) size = qemu_send_packet_async(&s->nc, buf, size, tap_send_completed); if (size == 0) { tap_read_poll(s, false); + break; + } else if (size < 0) { + break; + } + + /* + * When the host keeps receiving more packets while tap_send() is + * running we can hog the QEMU global mutex. Limit the number of + * packets that are processed per tap_send() callback to prevent + * stalling the guest. 
+ */ + packets++; + if (packets >= 50) { + break; } - } while (size > 0 && qemu_can_send_packet(&s->nc)); + } } -bool tap_has_ufo(NetClientState *nc) +static bool tap_has_ufo(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); return s->has_ufo; } -int tap_has_vnet_hdr(NetClientState *nc) +static bool tap_has_vnet_hdr(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); return !!s->host_vnet_hdr_len; } -int tap_has_vnet_hdr_len(NetClientState *nc, int len) +static bool tap_has_vnet_hdr_len(NetClientState *nc, int len) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); - return tap_probe_vnet_hdr_len(s->fd, len); + return !!tap_probe_vnet_hdr_len(s->fd, len); } -void tap_set_vnet_hdr_len(NetClientState *nc, int len) +static void tap_set_vnet_hdr_len(NetClientState *nc, int len) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) || len == sizeof(struct virtio_net_hdr)); @@ -249,17 +260,31 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len) s->host_vnet_hdr_len = len; } -void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) +static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); assert(!!s->host_vnet_hdr_len == using_vnet_hdr); s->using_vnet_hdr = using_vnet_hdr; } -void tap_set_offload(NetClientState *nc, int csum, int tso4, +static int tap_set_vnet_le(NetClientState *nc, bool is_le) +{ + 
TAPState *s = DO_UPCAST(TAPState, nc, nc); + + return tap_fd_set_vnet_le(s->fd, is_le); +} + +static int tap_set_vnet_be(NetClientState *nc, bool is_be) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + + return tap_fd_set_vnet_be(s->fd, is_be); +} + +static void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, int ufo) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -270,19 +295,33 @@ void tap_set_offload(NetClientState *nc, int csum, int tso4, tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo); } +static void tap_exit_notify(Notifier *notifier, void *data) +{ + TAPState *s = container_of(notifier, TAPState, exit); + Error *err = NULL; + + if (s->down_script[0]) { + launch_script(s->down_script, s->down_script_arg, s->fd, &err); + if (err) { + error_report_err(err); + } + } +} + static void tap_cleanup(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); if (s->vhost_net) { vhost_net_cleanup(s->vhost_net); + g_free(s->vhost_net); s->vhost_net = NULL; } qemu_purge_queued_packets(nc); - if (s->down_script[0]) - launch_script(s->down_script, s->down_script_arg, s->fd); + tap_exit_notify(&s->exit, NULL); + qemu_remove_exit_notifier(&s->exit); tap_read_poll(s, false); tap_write_poll(s, false); @@ -300,20 +339,28 @@ static void tap_poll(NetClientState *nc, bool enable) int tap_get_fd(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); return s->fd; } /* fd support */ static NetClientInfo net_tap_info = { - .type = NET_CLIENT_OPTIONS_KIND_TAP, + .type = NET_CLIENT_DRIVER_TAP, .size = sizeof(TAPState), .receive = tap_receive, .receive_raw = tap_receive_raw, .receive_iov = tap_receive_iov, .poll = tap_poll, .cleanup = tap_cleanup, + .has_ufo = tap_has_ufo, + .has_vnet_hdr = tap_has_vnet_hdr, + .has_vnet_hdr_len = tap_has_vnet_hdr_len, + .using_vnet_hdr = tap_using_vnet_hdr, + .set_offload = tap_set_offload, + 
.set_vnet_hdr_len = tap_set_vnet_hdr_len, + .set_vnet_le = tap_set_vnet_le, + .set_vnet_be = tap_set_vnet_be, }; static TAPState *net_tap_fd_init(NetClientState *peer, @@ -344,10 +391,15 @@ static TAPState *net_tap_fd_init(NetClientState *peer, } tap_read_poll(s, true); s->vhost_net = NULL; + + s->exit.notify = tap_exit_notify; + qemu_add_exit_notifier(&s->exit); + return s; } -static int launch_script(const char *setup_script, const char *ifname, int fd) +static void launch_script(const char *setup_script, const char *ifname, + int fd, Error **errp) { int pid, status; char *args[3]; @@ -355,14 +407,16 @@ static int launch_script(const char *setup_script, const char *ifname, int fd) /* try to launch network script */ pid = fork(); + if (pid < 0) { + error_setg_errno(errp, errno, "could not launch network script %s", + setup_script); + return; + } if (pid == 0) { int open_max = sysconf(_SC_OPEN_MAX), i; - for (i = 0; i < open_max; i++) { - if (i != STDIN_FILENO && - i != STDOUT_FILENO && - i != STDERR_FILENO && - i != fd) { + for (i = 3; i < open_max; i++) { + if (i != fd) { close(i); } } @@ -372,17 +426,17 @@ static int launch_script(const char *setup_script, const char *ifname, int fd) *parg = NULL; execv(setup_script, args); _exit(1); - } else if (pid > 0) { + } else { while (waitpid(pid, &status, 0) != pid) { /* loop */ } if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { - return 0; + return; } + error_setg(errp, "network script %s failed with status %d", + setup_script, status); } - fprintf(stderr, "%s: could not launch network script\n", setup_script); - return -1; } static int recv_fd(int c) @@ -419,7 +473,8 @@ static int recv_fd(int c) return len; } -static int net_bridge_run_helper(const char *helper, const char *bridge) +static int net_bridge_run_helper(const char *helper, const char *bridge, + Error **errp) { sigset_t oldmask, mask; int pid, status; @@ -432,37 +487,39 @@ static int net_bridge_run_helper(const char *helper, const char *bridge) 
sigprocmask(SIG_BLOCK, &mask, &oldmask); if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) { + error_setg_errno(errp, errno, "socketpair() failed"); return -1; } /* try to launch bridge helper */ pid = fork(); + if (pid < 0) { + error_setg_errno(errp, errno, "Can't fork bridge helper"); + return -1; + } if (pid == 0) { int open_max = sysconf(_SC_OPEN_MAX), i; - char fd_buf[6+10]; - char br_buf[6+IFNAMSIZ] = {0}; - char helper_cmd[PATH_MAX + sizeof(fd_buf) + sizeof(br_buf) + 15]; - - for (i = 0; i < open_max; i++) { - if (i != STDIN_FILENO && - i != STDOUT_FILENO && - i != STDERR_FILENO && - i != sv[1]) { + char *fd_buf = NULL; + char *br_buf = NULL; + char *helper_cmd = NULL; + + for (i = 3; i < open_max; i++) { + if (i != sv[1]) { close(i); } } - snprintf(fd_buf, sizeof(fd_buf), "%s%d", "--fd=", sv[1]); + fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]); if (strrchr(helper, ' ') || strrchr(helper, '\t')) { /* assume helper is a command */ if (strstr(helper, "--br=") == NULL) { - snprintf(br_buf, sizeof(br_buf), "%s%s", "--br=", bridge); + br_buf = g_strdup_printf("%s%s", "--br=", bridge); } - snprintf(helper_cmd, sizeof(helper_cmd), "%s %s %s %s", - helper, "--use-vnet", fd_buf, br_buf); + helper_cmd = g_strdup_printf("%s %s %s %s", helper, + "--use-vnet", fd_buf, br_buf ? 
br_buf : ""); parg = args; *parg++ = (char *)"sh"; @@ -471,10 +528,11 @@ static int net_bridge_run_helper(const char *helper, const char *bridge) *parg++ = NULL; execv("/bin/sh", args); + g_free(helper_cmd); } else { /* assume helper is just the executable path name */ - snprintf(br_buf, sizeof(br_buf), "%s%s", "--br=", bridge); + br_buf = g_strdup_printf("%s%s", "--br=", bridge); parg = args; *parg++ = (char *)helper; @@ -485,16 +543,20 @@ static int net_bridge_run_helper(const char *helper, const char *bridge) execv(helper, args); } + g_free(fd_buf); + g_free(br_buf); _exit(1); - } else if (pid > 0) { + } else { int fd; + int saved_errno; close(sv[1]); do { fd = recv_fd(sv[0]); } while (fd == -1 && errno == EINTR); + saved_errno = errno; close(sv[0]); @@ -503,47 +565,40 @@ static int net_bridge_run_helper(const char *helper, const char *bridge) } sigprocmask(SIG_SETMASK, &oldmask, NULL); if (fd < 0) { - fprintf(stderr, "failed to recv file descriptor\n"); + error_setg_errno(errp, saved_errno, + "failed to recv file descriptor"); return -1; } - - if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { - return fd; + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + error_setg(errp, "bridge helper failed"); + return -1; } + return fd; } - fprintf(stderr, "failed to launch bridge helper\n"); - return -1; } -int net_init_bridge(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_bridge(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { const NetdevBridgeOptions *bridge; const char *helper, *br; - TAPState *s; int fd, vnet_hdr; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_BRIDGE); - bridge = opts->bridge; + assert(netdev->type == NET_CLIENT_DRIVER_BRIDGE); + bridge = &netdev->u.bridge; helper = bridge->has_helper ? bridge->helper : DEFAULT_BRIDGE_HELPER; br = bridge->has_br ? 
bridge->br : DEFAULT_BRIDGE_INTERFACE; - fd = net_bridge_run_helper(helper, br); + fd = net_bridge_run_helper(helper, br, errp); if (fd == -1) { return -1; } - fcntl(fd, F_SETFL, O_NONBLOCK); - + qemu_set_nonblock(fd); vnet_hdr = tap_probe_vnet_hdr(fd); - s = net_tap_fd_init(peer, "bridge", name, fd, vnet_hdr); - if (!s) { - close(fd); - return -1; - } snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s,br=%s", helper, br); @@ -553,8 +608,9 @@ int net_init_bridge(const NetClientOptions *opts, const char *name, static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, const char *setup_script, char *ifname, - size_t ifname_sz, int mq_required) + size_t ifname_sz, int mq_required, Error **errp) { + Error *err = NULL; int fd, vnet_hdr_required; if (tap->has_vnet_hdr) { @@ -566,17 +622,20 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, } TFR(fd = tap_open(ifname, ifname_sz, vnet_hdr, vnet_hdr_required, - mq_required)); + mq_required, errp)); if (fd < 0) { return -1; } if (setup_script && setup_script[0] != '\0' && - strcmp(setup_script, "no") != 0 && - launch_script(setup_script, ifname, fd)) { - close(fd); - return -1; + strcmp(setup_script, "no") != 0) { + launch_script(setup_script, ifname, fd, &err); + if (err) { + error_propagate(errp, err); + close(fd); + return -1; + } } return fd; @@ -584,22 +643,20 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, #define MAX_TAP_QUEUES 1024 -static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, - const char *model, const char *name, - const char *ifname, const char *script, - const char *downscript, const char *vhostfdname, - int vnet_hdr, int fd) +static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, + const char *model, const char *name, + const char *ifname, const char *script, + const char *downscript, const char *vhostfdname, + int vnet_hdr, int fd, Error **errp) { - TAPState *s; + Error *err = NULL; + 
TAPState *s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); + int vhostfd; - s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); - if (!s) { - close(fd); - return -1; - } - - if (tap_set_sndbuf(s->fd, tap) < 0) { - return -1; + tap_set_sndbuf(s->fd, tap, &err); + if (err) { + error_propagate(errp, err); + return; } if (tap->has_fd || tap->has_fds) { @@ -621,29 +678,55 @@ static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, if (tap->has_vhost ? tap->vhost : vhostfdname || (tap->has_vhostforce && tap->vhostforce)) { - int vhostfd; + VhostNetOptions options; - if (tap->has_vhostfd || tap->has_vhostfds) { - vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname); + options.backend_type = VHOST_BACKEND_TYPE_KERNEL; + options.net_backend = &s->nc; + if (tap->has_poll_us) { + options.busyloop_timeout = tap->poll_us; + } else { + options.busyloop_timeout = 0; + } + + if (vhostfdname) { + vhostfd = monitor_fd_param(cur_mon, vhostfdname, &err); if (vhostfd == -1) { - return -1; + if (tap->has_vhostforce && tap->vhostforce) { + error_propagate(errp, err); + } else { + warn_report_err(err); + } + return; } + qemu_set_nonblock(vhostfd); } else { - vhostfd = -1; + vhostfd = open("/dev/vhost-net", O_RDWR); + if (vhostfd < 0) { + if (tap->has_vhostforce && tap->vhostforce) { + error_setg_errno(errp, errno, + "tap: open vhost char device failed"); + } else { + warn_report("tap: open vhost char device failed: %s", + strerror(errno)); + } + return; + } + qemu_set_nonblock(vhostfd); } + options.opaque = (void *)(uintptr_t)vhostfd; - s->vhost_net = vhost_net_init(&s->nc, vhostfd, - tap->has_vhostforce && tap->vhostforce); + s->vhost_net = vhost_net_init(&options); if (!s->vhost_net) { - error_report("vhost-net requested but could not be initialized"); - return -1; + if (tap->has_vhostforce && tap->vhostforce) { + error_setg(errp, VHOST_NET_INIT_FAILED); + } else { + warn_report(VHOST_NET_INIT_FAILED); + } + return; } - } else if (tap->has_vhostfd || 
tap->has_vhostfds) { - error_report("vhostfd= is not valid without vhost"); - return -1; + } else if (vhostfdname) { + error_setg(errp, "vhostfd(s)= is not valid without vhost"); } - - return 0; } static int get_fds(char *str, char *fds[], int max) @@ -672,26 +755,27 @@ static int get_fds(char *str, char *fds[], int max) return i; } -int net_init_tap(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_tap(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { const NetdevTapOptions *tap; int fd, vnet_hdr = 0, i = 0, queues; /* for the no-fd, no-helper case */ const char *script = NULL; /* suppress wrong "uninit'd use" gcc warning */ const char *downscript = NULL; + Error *err = NULL; const char *vhostfdname; char ifname[128]; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_TAP); - tap = opts->tap; + assert(netdev->type == NET_CLIENT_DRIVER_TAP); + tap = &netdev->u.tap; queues = tap->has_queues ? tap->queues : 1; vhostfdname = tap->has_vhostfd ? tap->vhostfd : NULL; - /* QEMU vlans does not support multiqueue tap, in this case peer is set. + /* QEMU hubs do not support multiqueue tap, in this case peer is set. * For -netdev, peer is always NULL. 
*/ if (peer && (tap->has_queues || tap->has_fds || tap->has_vhostfds)) { - error_report("Multiqueue tap cannot be used with QEMU vlans"); + error_setg(errp, "Multiqueue tap cannot be used with hubs"); return -1; } @@ -699,96 +783,128 @@ int net_init_tap(const NetClientOptions *opts, const char *name, if (tap->has_ifname || tap->has_script || tap->has_downscript || tap->has_vnet_hdr || tap->has_helper || tap->has_queues || tap->has_fds || tap->has_vhostfds) { - error_report("ifname=, script=, downscript=, vnet_hdr=, " - "helper=, queues=, fds=, and vhostfds= " - "are invalid with fd="); + error_setg(errp, "ifname=, script=, downscript=, vnet_hdr=, " + "helper=, queues=, fds=, and vhostfds= " + "are invalid with fd="); return -1; } - fd = monitor_handle_fd_param(cur_mon, tap->fd); + fd = monitor_fd_param(cur_mon, tap->fd, &err); if (fd == -1) { + error_propagate(errp, err); return -1; } - fcntl(fd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(fd); vnet_hdr = tap_probe_vnet_hdr(fd); - if (net_init_tap_one(tap, peer, "tap", name, NULL, - script, downscript, - vhostfdname, vnet_hdr, fd)) { + net_init_tap_one(tap, peer, "tap", name, NULL, + script, downscript, + vhostfdname, vnet_hdr, fd, &err); + if (err) { + error_propagate(errp, err); return -1; } } else if (tap->has_fds) { - char *fds[MAX_TAP_QUEUES]; - char *vhost_fds[MAX_TAP_QUEUES]; - int nfds, nvhosts; + char **fds; + char **vhost_fds; + int nfds = 0, nvhosts = 0; + int ret = 0; if (tap->has_ifname || tap->has_script || tap->has_downscript || tap->has_vnet_hdr || tap->has_helper || tap->has_queues || tap->has_vhostfd) { - error_report("ifname=, script=, downscript=, vnet_hdr=, " - "helper=, queues=, and vhostfd= " - "are invalid with fds="); + error_setg(errp, "ifname=, script=, downscript=, vnet_hdr=, " + "helper=, queues=, and vhostfd= " + "are invalid with fds="); return -1; } + fds = g_new0(char *, MAX_TAP_QUEUES); + vhost_fds = g_new0(char *, MAX_TAP_QUEUES); + nfds = get_fds(tap->fds, fds, MAX_TAP_QUEUES); if 
(tap->has_vhostfds) { nvhosts = get_fds(tap->vhostfds, vhost_fds, MAX_TAP_QUEUES); if (nfds != nvhosts) { - error_report("The number of fds passed does not match the " - "number of vhostfds passed"); - return -1; + error_setg(errp, "The number of fds passed does not match " + "the number of vhostfds passed"); + ret = -1; + goto free_fail; } } for (i = 0; i < nfds; i++) { - fd = monitor_handle_fd_param(cur_mon, fds[i]); + fd = monitor_fd_param(cur_mon, fds[i], &err); if (fd == -1) { - return -1; + error_propagate(errp, err); + ret = -1; + goto free_fail; } - fcntl(fd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(fd); if (i == 0) { vnet_hdr = tap_probe_vnet_hdr(fd); } else if (vnet_hdr != tap_probe_vnet_hdr(fd)) { - error_report("vnet_hdr not consistent across given tap fds"); - return -1; + error_setg(errp, + "vnet_hdr not consistent across given tap fds"); + ret = -1; + goto free_fail; } - if (net_init_tap_one(tap, peer, "tap", name, ifname, - script, downscript, - tap->has_vhostfds ? vhost_fds[i] : NULL, - vnet_hdr, fd)) { - return -1; + net_init_tap_one(tap, peer, "tap", name, ifname, + script, downscript, + tap->has_vhostfds ? vhost_fds[i] : NULL, + vnet_hdr, fd, &err); + if (err) { + error_propagate(errp, err); + ret = -1; + goto free_fail; } } + +free_fail: + for (i = 0; i < nvhosts; i++) { + g_free(vhost_fds[i]); + } + for (i = 0; i < nfds; i++) { + g_free(fds[i]); + } + g_free(fds); + g_free(vhost_fds); + return ret; } else if (tap->has_helper) { if (tap->has_ifname || tap->has_script || tap->has_downscript || tap->has_vnet_hdr || tap->has_queues || tap->has_vhostfds) { - error_report("ifname=, script=, downscript=, and vnet_hdr= " - "queues=, and vhostfds= are invalid with helper="); + error_setg(errp, "ifname=, script=, downscript=, vnet_hdr=, " + "queues=, and vhostfds= are invalid with helper="); return -1; } - fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE); + fd = net_bridge_run_helper(tap->helper, + tap->has_br ? 
+ tap->br : DEFAULT_BRIDGE_INTERFACE, + errp); if (fd == -1) { return -1; } - fcntl(fd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(fd); vnet_hdr = tap_probe_vnet_hdr(fd); - if (net_init_tap_one(tap, peer, "bridge", name, ifname, - script, downscript, vhostfdname, - vnet_hdr, fd)) { + net_init_tap_one(tap, peer, "bridge", name, ifname, + script, downscript, vhostfdname, + vnet_hdr, fd, &err); + if (err) { + error_propagate(errp, err); + close(fd); return -1; } } else { if (tap->has_vhostfds) { - error_report("vhostfds= is invalid if fds= wasn't specified"); + error_setg(errp, "vhostfds= is invalid if fds= wasn't specified"); return -1; } script = tap->has_script ? tap->script : DEFAULT_NETWORK_SCRIPT; @@ -803,22 +919,26 @@ int net_init_tap(const NetClientOptions *opts, const char *name, for (i = 0; i < queues; i++) { fd = net_tap_init(tap, &vnet_hdr, i >= 1 ? "no" : script, - ifname, sizeof ifname, queues > 1); + ifname, sizeof ifname, queues > 1, errp); if (fd == -1) { return -1; } if (queues > 1 && i == 0 && !tap->has_ifname) { if (tap_fd_get_ifname(fd, ifname)) { - error_report("Fail to get ifname"); + error_setg(errp, "Fail to get ifname"); + close(fd); return -1; } } - if (net_init_tap_one(tap, peer, "tap", name, ifname, - i >= 1 ? "no" : script, - i >= 1 ? "no" : downscript, - vhostfdname, vnet_hdr, fd)) { + net_init_tap_one(tap, peer, "tap", name, ifname, + i >= 1 ? "no" : script, + i >= 1 ? 
"no" : downscript, + vhostfdname, vnet_hdr, fd, &err); + if (err) { + error_propagate(errp, err); + close(fd); return -1; } } @@ -830,7 +950,7 @@ int net_init_tap(const NetClientOptions *opts, const char *name, VHostNetState *tap_get_vhost_net(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); - assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); + assert(nc->info->type == NET_CLIENT_DRIVER_TAP); return s->vhost_net; } diff --git a/net/tap_int.h b/net/tap_int.h index 86bb224bc..e3194b23f 100644 --- a/net/tap_int.h +++ b/net/tap_int.h @@ -23,28 +23,26 @@ * THE SOFTWARE. */ -#ifndef QEMU_TAP_H -#define QEMU_TAP_H +#ifndef NET_TAP_INT_H +#define NET_TAP_INT_H -#include "qemu-common.h" -#include "qapi-types.h" - -#define DEFAULT_NETWORK_SCRIPT "/etc/qemu-ifup" -#define DEFAULT_NETWORK_DOWN_SCRIPT "/etc/qemu-ifdown" +#include "qapi/qapi-types-net.h" int tap_open(char *ifname, int ifname_size, int *vnet_hdr, - int vnet_hdr_required, int mq_required); + int vnet_hdr_required, int mq_required, Error **errp); ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen); -int tap_set_sndbuf(int fd, const NetdevTapOptions *tap); +void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp); int tap_probe_vnet_hdr(int fd); int tap_probe_vnet_hdr_len(int fd, int len); int tap_probe_has_ufo(int fd); void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); void tap_fd_set_vnet_hdr_len(int fd, int len); +int tap_fd_set_vnet_le(int fd, int vnet_is_le); +int tap_fd_set_vnet_be(int fd, int vnet_is_be); int tap_fd_enable(int fd); int tap_fd_disable(int fd); int tap_fd_get_ifname(int fd, char *ifname); -#endif /* QEMU_TAP_H */ +#endif /* NET_TAP_INT_H */ diff --git a/net/trace-events b/net/trace-events new file mode 100644 index 000000000..02c13fd0b --- /dev/null +++ b/net/trace-events @@ -0,0 +1,25 @@ +# See docs/devel/tracing.txt for syntax documentation. 
+ +# announce.c +qemu_announce_self_iter(const char *id, const char *name, const char *mac, int skip) "%s:%s:%s skip: %d" +qemu_announce_timer_del(bool free_named, bool free_timer, char *id) "free named: %d free timer: %d id: %s" + +# vhost-user.c +vhost_user_event(const char *chr, int event) "chr: %s got event: %d" + +# colo.c +colo_proxy_main(const char *chr) ": %s" + +# colo-compare.c +colo_compare_main(const char *chr) ": %s" +colo_compare_udp_miscompare(const char *sta, int size) ": %s = %d" +colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d" +colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s" +colo_old_packet_check_found(int64_t old_time) "%" PRId64 +colo_compare_miscompare(void) "" +colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, int pdlen, int offset, int flags) "%s: seq/ack= %u/%u hdlen= %d pdlen= %d offset= %d flags=%d" + +# filter-rewriter.c +colo_filter_rewriter_debug(void) "" +colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u flags=0x%x" +colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u" diff --git a/net/util.c b/net/util.c index 7e9507679..0b3dbfe5d 100644 --- a/net/util.c +++ b/net/util.c @@ -22,9 +22,8 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "util.h" -#include <errno.h> -#include <stdlib.h> int net_parse_macaddr(uint8_t *macaddr, const char *p) { diff --git a/net/util.h b/net/util.h index 10c7da95f..358185fd5 100644 --- a/net/util.h +++ b/net/util.h @@ -25,7 +25,61 @@ #ifndef QEMU_NET_UTIL_H #define QEMU_NET_UTIL_H -#include <stdint.h> + +/* + * Structure of an internet header, naked of options. 
+ */ +struct ip { +#ifdef HOST_WORDS_BIGENDIAN + uint8_t ip_v:4, /* version */ + ip_hl:4; /* header length */ +#else + uint8_t ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif + uint8_t ip_tos; /* type of service */ + uint16_t ip_len; /* total length */ + uint16_t ip_id; /* identification */ + uint16_t ip_off; /* fragment offset field */ +#define IP_DF 0x4000 /* don't fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + uint8_t ip_ttl; /* time to live */ + uint8_t ip_p; /* protocol */ + uint16_t ip_sum; /* checksum */ + struct in_addr ip_src, ip_dst; /* source and dest address */ +} QEMU_PACKED; + +static inline bool in6_equal_net(const struct in6_addr *a, + const struct in6_addr *b, + int prefix_len) +{ + if (memcmp(a, b, prefix_len / 8) != 0) { + return 0; + } + + if (prefix_len % 8 == 0) { + return 1; + } + + return a->s6_addr[prefix_len / 8] >> (8 - (prefix_len % 8)) + == b->s6_addr[prefix_len / 8] >> (8 - (prefix_len % 8)); +} + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ int net_parse_macaddr(uint8_t *macaddr, const char *p); @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
DEALINGS IN * THE SOFTWARE. */ -#include "config-host.h" +#include "qemu/osdep.h" #include <libvdeplug.h> @@ -30,6 +30,7 @@ #include "qemu-common.h" #include "qemu/option.h" #include "qemu/main-loop.h" +#include "qapi/error.h" typedef struct VDEState { NetClientState nc; @@ -68,7 +69,7 @@ static void vde_cleanup(NetClientState *nc) } static NetClientInfo net_vde_info = { - .type = NET_CLIENT_OPTIONS_KIND_VDE, + .type = NET_CLIENT_DRIVER_VDE, .size = sizeof(VDEState), .receive = vde_receive, .cleanup = vde_cleanup, @@ -76,7 +77,7 @@ static NetClientInfo net_vde_info = { static int net_vde_init(NetClientState *peer, const char *model, const char *name, const char *sock, - int port, const char *group, int mode) + int port, const char *group, int mode, Error **errp) { NetClientState *nc; VDEState *s; @@ -92,6 +93,7 @@ static int net_vde_init(NetClientState *peer, const char *model, vde = vde_open(init_sock, (char *)"QEMU", &args); if (!vde){ + error_setg_errno(errp, errno, "Could not open vde"); return -1; } @@ -109,17 +111,17 @@ static int net_vde_init(NetClientState *peer, const char *model, return 0; } -int net_init_vde(const NetClientOptions *opts, const char *name, - NetClientState *peer) +int net_init_vde(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) { const NetdevVdeOptions *vde; - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_VDE); - vde = opts->vde; + assert(netdev->type == NET_CLIENT_DRIVER_VDE); + vde = &netdev->u.vde; /* missing optional values have been initialized to "all bits zero" */ if (net_vde_init(peer, "vde", name, vde->sock, vde->port, vde->group, - vde->has_mode ? vde->mode : 0700) == -1) { + vde->has_mode ? vde->mode : 0700, errp) == -1) { return -1; } diff --git a/net/vhost-user-stub.c b/net/vhost-user-stub.c new file mode 100644 index 000000000..52ab4e13f --- /dev/null +++ b/net/vhost-user-stub.c @@ -0,0 +1,23 @@ +/* + * vhost-user-stub.c + * + * Copyright (c) 2018 Red Hat, Inc. 
+ * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "clients.h" +#include "net/vhost_net.h" +#include "net/vhost-user.h" +#include "qemu/error-report.h" +#include "qapi/error.h" + +int net_init_vhost_user(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) +{ + error_setg(errp, "vhost-user requires frontend driver virtio-net-*"); + return -1; +} diff --git a/net/vhost-user.c b/net/vhost-user.c new file mode 100644 index 000000000..014199d60 --- /dev/null +++ b/net/vhost-user.c @@ -0,0 +1,441 @@ +/* + * vhost-user.c + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "clients.h" +#include "net/vhost_net.h" +#include "net/vhost-user.h" +#include "hw/virtio/vhost-user.h" +#include "chardev/char-fe.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-net.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "trace.h" + +typedef struct NetVhostUserState { + NetClientState nc; + CharBackend chr; /* only queue index 0 */ + VhostUserState *vhost_user; + VHostNetState *vhost_net; + guint watch; + uint64_t acked_features; + bool started; +} NetVhostUserState; + +VHostNetState *vhost_user_get_vhost_net(NetClientState *nc) +{ + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); + return s->vhost_net; +} + +uint64_t vhost_user_get_acked_features(NetClientState *nc) +{ + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); + return s->acked_features; +} + +static void vhost_user_stop(int queues, NetClientState *ncs[]) +{ + NetVhostUserState *s; + int i; + + for (i 
= 0; i < queues; i++) { + assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER); + + s = DO_UPCAST(NetVhostUserState, nc, ncs[i]); + + if (s->vhost_net) { + /* save acked features */ + uint64_t features = vhost_net_get_acked_features(s->vhost_net); + if (features) { + s->acked_features = features; + } + vhost_net_cleanup(s->vhost_net); + } + } +} + +static int vhost_user_start(int queues, NetClientState *ncs[], + VhostUserState *be) +{ + VhostNetOptions options; + struct vhost_net *net = NULL; + NetVhostUserState *s; + int max_queues; + int i; + + options.backend_type = VHOST_BACKEND_TYPE_USER; + + for (i = 0; i < queues; i++) { + assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER); + + s = DO_UPCAST(NetVhostUserState, nc, ncs[i]); + + options.net_backend = ncs[i]; + options.opaque = be; + options.busyloop_timeout = 0; + net = vhost_net_init(&options); + if (!net) { + error_report("failed to init vhost_net for queue %d", i); + goto err; + } + + if (i == 0) { + max_queues = vhost_net_get_max_queues(net); + if (queues > max_queues) { + error_report("you are asking more queues than supported: %d", + max_queues); + goto err; + } + } + + if (s->vhost_net) { + vhost_net_cleanup(s->vhost_net); + g_free(s->vhost_net); + } + s->vhost_net = net; + } + + return 0; + +err: + if (net) { + vhost_net_cleanup(net); + g_free(net); + } + vhost_user_stop(i, ncs); + return -1; +} + +static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf, + size_t size) +{ + /* In case of RARP (message size is 60) notify backup to send a fake RARP. + This fake RARP will be sent by backend only for guest + without GUEST_ANNOUNCE capability. 
+ */ + if (size == 60) { + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); + int r; + static int display_rarp_failure = 1; + char mac_addr[6]; + + /* extract guest mac address from the RARP message */ + memcpy(mac_addr, &buf[6], 6); + + r = vhost_net_notify_migration_done(s->vhost_net, mac_addr); + + if ((r != 0) && (display_rarp_failure)) { + fprintf(stderr, + "Vhost user backend fails to broadcast fake RARP\n"); + fflush(stderr); + display_rarp_failure = 0; + } + } + + return size; +} + +static void net_vhost_user_cleanup(NetClientState *nc) +{ + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); + + if (s->vhost_net) { + vhost_net_cleanup(s->vhost_net); + g_free(s->vhost_net); + s->vhost_net = NULL; + } + if (nc->queue_index == 0) { + if (s->watch) { + g_source_remove(s->watch); + s->watch = 0; + } + qemu_chr_fe_deinit(&s->chr, true); + if (s->vhost_user) { + vhost_user_cleanup(s->vhost_user); + g_free(s->vhost_user); + s->vhost_user = NULL; + } + } + + qemu_purge_queued_packets(nc); +} + +static int vhost_user_set_vnet_endianness(NetClientState *nc, + bool enable) +{ + /* Nothing to do. If the server supports + * VHOST_USER_PROTOCOL_F_CROSS_ENDIAN, it will get the + * vnet header endianness from there. If it doesn't, negotiation + * fails. 
+ */ + return 0; +} + +static bool vhost_user_has_vnet_hdr(NetClientState *nc) +{ + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); + + return true; +} + +static bool vhost_user_has_ufo(NetClientState *nc) +{ + assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); + + return true; +} + +static NetClientInfo net_vhost_user_info = { + .type = NET_CLIENT_DRIVER_VHOST_USER, + .size = sizeof(NetVhostUserState), + .receive = vhost_user_receive, + .cleanup = net_vhost_user_cleanup, + .has_vnet_hdr = vhost_user_has_vnet_hdr, + .has_ufo = vhost_user_has_ufo, + .set_vnet_be = vhost_user_set_vnet_endianness, + .set_vnet_le = vhost_user_set_vnet_endianness, +}; + +static gboolean net_vhost_user_watch(GIOChannel *chan, GIOCondition cond, + void *opaque) +{ + NetVhostUserState *s = opaque; + + qemu_chr_fe_disconnect(&s->chr); + + return TRUE; +} + +static void net_vhost_user_event(void *opaque, int event); + +static void chr_closed_bh(void *opaque) +{ + const char *name = opaque; + NetClientState *ncs[MAX_QUEUE_NUM]; + NetVhostUserState *s; + Error *err = NULL; + int queues; + + queues = qemu_find_net_clients_except(name, ncs, + NET_CLIENT_DRIVER_NIC, + MAX_QUEUE_NUM); + assert(queues < MAX_QUEUE_NUM); + + s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); + + if (s->vhost_net) { + s->acked_features = vhost_net_get_acked_features(s->vhost_net); + } + + qmp_set_link(name, false, &err); + + qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, net_vhost_user_event, + NULL, opaque, NULL, true); + + if (err) { + error_report_err(err); + } +} + +static void net_vhost_user_event(void *opaque, int event) +{ + const char *name = opaque; + NetClientState *ncs[MAX_QUEUE_NUM]; + NetVhostUserState *s; + Chardev *chr; + Error *err = NULL; + int queues; + + queues = qemu_find_net_clients_except(name, ncs, + NET_CLIENT_DRIVER_NIC, + MAX_QUEUE_NUM); + assert(queues < MAX_QUEUE_NUM); + + s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); + chr = qemu_chr_fe_get_driver(&s->chr); + 
trace_vhost_user_event(chr->label, event); + switch (event) { + case CHR_EVENT_OPENED: + if (vhost_user_start(queues, ncs, s->vhost_user) < 0) { + qemu_chr_fe_disconnect(&s->chr); + return; + } + s->watch = qemu_chr_fe_add_watch(&s->chr, G_IO_HUP, + net_vhost_user_watch, s); + qmp_set_link(name, true, &err); + s->started = true; + break; + case CHR_EVENT_CLOSED: + /* a close event may happen during a read/write, but vhost + * code assumes the vhost_dev remains setup, so delay the + * stop & clear to idle. + * FIXME: better handle failure in vhost code, remove bh + */ + if (s->watch) { + AioContext *ctx = qemu_get_current_aio_context(); + + g_source_remove(s->watch); + s->watch = 0; + qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, NULL, NULL, + NULL, NULL, false); + + aio_bh_schedule_oneshot(ctx, chr_closed_bh, opaque); + } + break; + } + + if (err) { + error_report_err(err); + } +} + +static int net_vhost_user_init(NetClientState *peer, const char *device, + const char *name, Chardev *chr, + int queues) +{ + Error *err = NULL; + NetClientState *nc, *nc0 = NULL; + NetVhostUserState *s = NULL; + VhostUserState *user; + int i; + + assert(name); + assert(queues > 0); + + user = g_new0(struct VhostUserState, 1); + for (i = 0; i < queues; i++) { + nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name); + snprintf(nc->info_str, sizeof(nc->info_str), "vhost-user%d to %s", + i, chr->label); + nc->queue_index = i; + if (!nc0) { + nc0 = nc; + s = DO_UPCAST(NetVhostUserState, nc, nc); + if (!qemu_chr_fe_init(&s->chr, chr, &err) || + !vhost_user_init(user, &s->chr, &err)) { + error_report_err(err); + goto err; + } + } + s = DO_UPCAST(NetVhostUserState, nc, nc); + s->vhost_user = user; + } + + s = DO_UPCAST(NetVhostUserState, nc, nc0); + do { + if (qemu_chr_fe_wait_connected(&s->chr, &err) < 0) { + error_report_err(err); + goto err; + } + qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, + net_vhost_user_event, NULL, nc0->name, NULL, + true); + } while (!s->started); + + 
assert(s->vhost_net); + + return 0; + +err: + if (user) { + vhost_user_cleanup(user); + g_free(user); + if (s) { + s->vhost_user = NULL; + } + } + if (nc0) { + qemu_del_net_client(nc0); + } + + return -1; +} + +static Chardev *net_vhost_claim_chardev( + const NetdevVhostUserOptions *opts, Error **errp) +{ + Chardev *chr = qemu_chr_find(opts->chardev); + + if (chr == NULL) { + error_setg(errp, "chardev \"%s\" not found", opts->chardev); + return NULL; + } + + if (!qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_RECONNECTABLE)) { + error_setg(errp, "chardev \"%s\" is not reconnectable", + opts->chardev); + return NULL; + } + if (!qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_FD_PASS)) { + error_setg(errp, "chardev \"%s\" does not support FD passing", + opts->chardev); + return NULL; + } + + return chr; +} + +static int net_vhost_check_net(void *opaque, QemuOpts *opts, Error **errp) +{ + const char *name = opaque; + const char *driver, *netdev; + + driver = qemu_opt_get(opts, "driver"); + netdev = qemu_opt_get(opts, "netdev"); + + if (!driver || !netdev) { + return 0; + } + + if (strcmp(netdev, name) == 0 && + !g_str_has_prefix(driver, "virtio-net-")) { + error_setg(errp, "vhost-user requires frontend driver virtio-net-*"); + return -1; + } + + return 0; +} + +int net_init_vhost_user(const Netdev *netdev, const char *name, + NetClientState *peer, Error **errp) +{ + int queues; + const NetdevVhostUserOptions *vhost_user_opts; + Chardev *chr; + + assert(netdev->type == NET_CLIENT_DRIVER_VHOST_USER); + vhost_user_opts = &netdev->u.vhost_user; + + chr = net_vhost_claim_chardev(vhost_user_opts, errp); + if (!chr) { + return -1; + } + + /* verify net frontend */ + if (qemu_opts_foreach(qemu_find_opts("device"), net_vhost_check_net, + (char *)name, errp)) { + return -1; + } + + queues = vhost_user_opts->has_queues ? 
vhost_user_opts->queues : 1; + if (queues < 1 || queues > MAX_QUEUE_NUM) { + error_setg(errp, + "vhost-user number of queues must be in range [1, %d]", + MAX_QUEUE_NUM); + return -1; + } + + return net_vhost_user_init(peer, "vhost_user", name, chr, queues); +} |