diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2015-10-29 09:49:52 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2015-10-29 09:49:52 +0000 |
commit | 7bc8e0c967a4ef77657174d28af775691e18b4ce (patch) | |
tree | eb44098ade96974f811b5861135cdd8a9746b587 | |
parent | 331c5e2091009f170554fed4ef884aeea871e4bb (diff) | |
parent | 3f1e1478db2d67098d98f2c3acf5a4946b7fb643 (diff) | |
download | qemu-7bc8e0c967a4ef77657174d28af775691e18b4ce.tar.gz qemu-7bc8e0c967a4ef77657174d28af775691e18b4ce.tar.bz2 qemu-7bc8e0c967a4ef77657174d28af775691e18b4ce.zip |
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
virtio, pc, memory: fixes+features for 2.5
New features:
This enables hotplug for multifunction devices.
Patches are very small, so I think it's OK to merge
at this stage.
There's also some new infrastructure for vhost-user testing
not enabled yet so it's harmless to merge.
I've reverted the "gap between DIMMs" workaround, as it seems too risky, and
applied my own patch in virtio, but not in dataplane code. This means that
dataplane is broken for some complex DIMM configurations for now. Waiting for
Stefan to review the dataplane fix.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Thu 29 Oct 2015 09:36:16 GMT using RSA key ID D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
* remotes/mst/tags/for_upstream:
enable multi-function hot-add
remove function during multi-function hot-add
tests/vhost-user-bridge: add vhost-user bridge application
Revert "memhp: extend address auto assignment to support gaps"
Revert "pc: memhp: force gaps between DIMM's GPA"
virtio: drop virtqueue_map_sg
virtio-scsi: convert to virtqueue_map
virtio-serial: convert to virtio_map
virtio-blk: convert to virtqueue_map
virtio: switch to virtio_map
virtio: introduce virtio_map
mmap-alloc: fix error handling
pc: memhp: do not emit inserting event for coldplugged DIMMs
vhost-user-test: fix up rhel6 build
vhost-user: cleanup msg size math
vhost-user: cleanup struct size math
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | hw/acpi/memory_hotplug.c | 8 | ||||
-rw-r--r-- | hw/block/virtio-blk.c | 5 | ||||
-rw-r--r-- | hw/char/virtio-serial-bus.c | 5 | ||||
-rw-r--r-- | hw/i386/pc.c | 5 | ||||
-rw-r--r-- | hw/i386/pc_piix.c | 1 | ||||
-rw-r--r-- | hw/i386/pc_q35.c | 1 | ||||
-rw-r--r-- | hw/mem/pc-dimm.c | 15 | ||||
-rw-r--r-- | hw/pci/pci.c | 40 | ||||
-rw-r--r-- | hw/pci/pci_host.c | 15 | ||||
-rw-r--r-- | hw/pci/pcie.c | 39 | ||||
-rw-r--r-- | hw/ppc/spapr.c | 2 | ||||
-rw-r--r-- | hw/scsi/virtio-scsi.c | 16 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 22 | ||||
-rw-r--r-- | hw/virtio/virtio.c | 52 | ||||
-rw-r--r-- | include/hw/i386/pc.h | 1 | ||||
-rw-r--r-- | include/hw/mem/pc-dimm.h | 7 | ||||
-rw-r--r-- | include/hw/pci/pci.h | 1 | ||||
-rw-r--r-- | include/hw/virtio/virtio.h | 3 | ||||
-rw-r--r-- | tests/Makefile | 1 | ||||
-rw-r--r-- | tests/vhost-user-bridge.c | 1110 | ||||
-rw-r--r-- | tests/vhost-user-test.c | 18 | ||||
-rw-r--r-- | util/mmap-alloc.c | 4 |
22 files changed, 1276 insertions, 95 deletions
diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index 2ff0d5ce1b..ce428dfc18 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -238,10 +238,12 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st, mdev->dimm = dev; mdev->is_enabled = true; - mdev->is_inserting = true; + if (dev->hotplugged) { + mdev->is_inserting = true; - /* do ACPI magic */ - acpi_send_gpe_event(ar, irq, ACPI_MEMORY_HOTPLUG_STATUS); + /* do ACPI magic */ + acpi_send_gpe_event(ar, irq, ACPI_MEMORY_HOTPLUG_STATUS); + } return; } diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 8beb26b4db..3e230debb8 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -839,10 +839,7 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f, req->next = s->rq; s->rq = req; - virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr, - req->elem.in_num, 1); - virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr, - req->elem.out_num, 0); + virtqueue_map(&req->elem); } return 0; diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index be97058712..497b0afd9f 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -705,10 +705,7 @@ static int fetch_active_ports_list(QEMUFile *f, int version_id, qemu_get_buffer(f, (unsigned char *)&port->elem, sizeof(port->elem)); - virtqueue_map_sg(port->elem.in_sg, port->elem.in_addr, - port->elem.in_num, 1); - virtqueue_map_sg(port->elem.out_sg, port->elem.out_addr, - port->elem.out_num, 1); + virtqueue_map(&port->elem); /* * Port was throttled on source machine. Let's diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 3d958bae5b..0cb8afd2c2 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1616,7 +1616,6 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, HotplugHandlerClass *hhc; Error *local_err = NULL; PCMachineState *pcms = PC_MACHINE(hotplug_dev); - PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); PCDIMMDevice *dimm = PC_DIMM(dev); PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); MemoryRegion *mr = ddc->get_memory_region(dimm); @@ -1632,8 +1631,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } - pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, - pcmc->inter_dimm_gap, &local_err); + pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, &local_err); if (local_err) { goto out; } @@ -1953,7 +1951,6 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) PCMachineClass *pcmc = PC_MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); - pcmc->inter_dimm_gap = true; pcmc->get_hotplug_handler = mc->get_hotplug_handler; mc->get_hotplug_handler = pc_get_hotpug_handler; mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9d4425a5b9..393dcc4544 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -487,7 +487,6 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m) m->alias = NULL; m->is_default = 0; pcmc->broken_reserved_end = true; - pcmc->inter_dimm_gap = false; SET_MACHINE_COMPAT(m, PC_COMPAT_2_4); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 3744abd397..2f8f3963c4 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -385,7 +385,6 @@ static void pc_q35_2_4_machine_options(MachineClass *m) pc_q35_2_5_machine_options(m); m->alias = NULL; pcmc->broken_reserved_end = true; - pcmc->inter_dimm_gap = false; SET_MACHINE_COMPAT(m, PC_COMPAT_2_4); } diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 2bae994667..80f424b442 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -33,8 +33,7 @@ typedef struct pc_dimms_capacity { } pc_dimms_capacity; void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, - MemoryRegion *mr, uint64_t align, bool gap, - Error **errp) + MemoryRegion *mr, uint64_t align, Error **errp) { int slot; MachineState *machine = MACHINE(qdev_get_machine()); @@ -50,7 +49,7 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, addr = pc_dimm_get_free_addr(hpms->base, memory_region_size(&hpms->mr), - !addr ? NULL : &addr, align, gap, + !addr ? NULL : &addr, align, memory_region_size(mr), &local_err); if (local_err) { goto out; @@ -295,8 +294,8 @@ static int pc_dimm_built_list(Object *obj, void *opaque) uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, - uint64_t *hint, uint64_t align, bool gap, - uint64_t size, Error **errp) + uint64_t *hint, uint64_t align, uint64_t size, + Error **errp) { GSList *list = NULL, *item; uint64_t new_addr, ret = 0; @@ -341,15 +340,13 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, goto out; } - if (ranges_overlap(dimm->addr, dimm_size, new_addr, - size + (gap ? 1 : 0))) { + if (ranges_overlap(dimm->addr, dimm_size, new_addr, size)) { if (hint) { DeviceState *d = DEVICE(dimm); error_setg(errp, "address range conflicts with '%s'", d->id); goto out; } - new_addr = QEMU_ALIGN_UP(dimm->addr + dimm_size + (gap ? 1 : 0), - align); + new_addr = QEMU_ALIGN_UP(dimm->addr + dimm_size, align); } } ret = new_addr; diff --git a/hw/pci/pci.c b/hw/pci/pci.c index b0bf54061f..168b9cc56b 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -847,6 +847,9 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, PCIConfigWriteFunc *config_write = pc->config_write; Error *local_err = NULL; AddressSpace *dma_as; + DeviceState *dev = DEVICE(pci_dev); + + pci_dev->bus = bus; if (devfn < 0) { for(devfn = bus->devfn_min ; devfn < ARRAY_SIZE(bus->devices); @@ -864,9 +867,17 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, PCI_SLOT(devfn), PCI_FUNC(devfn), name, bus->devices[devfn]->name); return NULL; + } else if (dev->hotplugged && + pci_get_function_0(pci_dev)) { + error_setg(errp, "PCI: slot %d function 0 already ocuppied by %s," + " new func %s cannot be exposed to guest.", + PCI_SLOT(devfn), + bus->devices[PCI_DEVFN(PCI_SLOT(devfn), 0)]->name, + name); + + return NULL; } - pci_dev->bus = bus; pci_dev->devfn = devfn; dma_as = pci_device_iommu_address_space(pci_dev); @@ -2454,6 +2465,33 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range) pci_for_each_device_under_bus(bus, pci_dev_get_w64, range); } +static bool pcie_has_upstream_port(PCIDevice *dev) +{ + PCIDevice *parent_dev = pci_bridge_get_device(dev->bus); + + /* Device associated with an upstream port. + * As there are several types of these, it's easier to check the + * parent device: upstream ports are always connected to + * root or downstream ports. + */ + return parent_dev && + pci_is_express(parent_dev) && + parent_dev->exp.exp_cap && + (pcie_cap_get_type(parent_dev) == PCI_EXP_TYPE_ROOT_PORT || + pcie_cap_get_type(parent_dev) == PCI_EXP_TYPE_DOWNSTREAM); +} + +PCIDevice *pci_get_function_0(PCIDevice *pci_dev) +{ + if(pcie_has_upstream_port(pci_dev)) { + /* With an upstream PCIe port, we only support 1 device at slot 0 */ + return pci_dev->bus->devices[0]; + } else { + /* Other bus types might support multiple devices at slots 0-31 */ + return pci_dev->bus->devices[PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 0)]; + } +} + static const TypeInfo pci_device_type_info = { .name = TYPE_PCI_DEVICE, .parent = TYPE_DEVICE, diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index 3e26f9256c..49f59a5dbc 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -20,6 +20,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pci_host.h" +#include "hw/pci/pci_bus.h" #include "trace.h" /* debug PCI */ @@ -52,6 +53,13 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr, uint32_t limit, uint32_t val, uint32_t len) { assert(len <= 4); + /* non-zero functions are only exposed when function 0 is present, + * allowing direct removal of unexposed functions. + */ + if (pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) { + return; + } + trace_pci_cfg_write(pci_dev->name, PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn), addr, val); pci_dev->config_write(pci_dev, addr, val, MIN(len, limit - addr)); @@ -63,6 +71,13 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, uint32_t ret; assert(len <= 4); + /* non-zero functions are only exposed when function 0 is present, + * allowing direct removal of unexposed functions. + */ + if (pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) { + return ~0x0; + } + ret = pci_dev->config_read(pci_dev, addr, MIN(len, limit - addr)); trace_pci_cfg_read(pci_dev->name, PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn), addr, ret); diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 6e28985bd1..32c65c27a4 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -249,25 +249,43 @@ void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - /* TODO: multifunction hot-plug. - * Right now, only a device of function = 0 is allowed to be - * hot plugged/unplugged. + /* To enable multifunction hot-plug, we just ensure the function + * 0 added last. When function 0 is added, we set the sltsta and + * inform OS via event notification. */ - assert(PCI_FUNC(pci_dev->devfn) == 0); + if (pci_get_function_0(pci_dev)) { + pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA, + PCI_EXP_SLTSTA_PDS); + pcie_cap_slot_event(PCI_DEVICE(hotplug_dev), + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } +} - pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA, - PCI_EXP_SLTSTA_PDS); - pcie_cap_slot_event(PCI_DEVICE(hotplug_dev), - PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); +static void pcie_unplug_device(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + object_unparent(OBJECT(dev)); } void pcie_cap_slot_hot_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { uint8_t *exp_cap; + PCIDevice *pci_dev = PCI_DEVICE(dev); + PCIBus *bus = pci_dev->bus; pcie_cap_slot_hotplug_common(PCI_DEVICE(hotplug_dev), dev, &exp_cap, errp); + /* In case user cancel the operation of multi-function hot-add, + * remove the function that is unexposed to guest individually, + * without interaction with guest. + */ + if (pci_dev->devfn && + !bus->devices[0]) { + pcie_unplug_device(bus, pci_dev, NULL); + + return; + } + pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev)); } @@ -378,11 +396,6 @@ void pcie_cap_slot_reset(PCIDevice *dev) hotplug_event_update_event_status(dev); } -static void pcie_unplug_device(PCIBus *bus, PCIDevice *dev, void *opaque) -{ - object_unparent(OBJECT(dev)); -} - void pcie_cap_slot_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len) { diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index e1202cec9f..288b57e24f 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2157,7 +2157,7 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev, goto out; } - pc_dimm_memory_plug(dev, &ms->hotplug_memory, mr, align, false, &local_err); + pc_dimm_memory_plug(dev, &ms->hotplug_memory, mr, align, &local_err); if (local_err) { goto out; } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 20885fb7f1..46553e1e68 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -205,20 +205,8 @@ static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq) assert(n < vs->conf.num_queues); req = virtio_scsi_init_req(s, vs->cmd_vqs[n]); qemu_get_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem)); - /* TODO: add a way for SCSIBusInfo's load_request to fail, - * and fail migration instead of asserting here. - * When we do, we might be able to re-enable NDEBUG below. - */ -#ifdef NDEBUG -#error building with NDEBUG is not supported -#endif - assert(req->elem.in_num <= ARRAY_SIZE(req->elem.in_sg)); - assert(req->elem.out_num <= ARRAY_SIZE(req->elem.out_sg)); - - virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr, - req->elem.in_num, 1); - virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr, - req->elem.out_num, 0); + + virtqueue_map(&req->elem); if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size, sizeof(VirtIOSCSICmdResp) + vs->sense_size) < 0) { diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 0aa8e0d93a..83c84f1cd6 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -201,7 +201,7 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, .request = VHOST_USER_SET_LOG_BASE, .flags = VHOST_USER_VERSION, .payload.u64 = base, - .size = sizeof(m.payload.u64), + .size = sizeof(msg.payload.u64), }; if (shmfd && log->fd != -1) { @@ -265,8 +265,8 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, return -1; } - msg.size = sizeof(m.payload.memory.nregions); - msg.size += sizeof(m.payload.memory.padding); + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); msg.size += fd_num * sizeof(VhostUserMemoryRegion); vhost_user_write(dev, &msg, fds, fd_num); @@ -281,7 +281,7 @@ static int vhost_user_set_vring_addr(struct vhost_dev *dev, .request = VHOST_USER_SET_VRING_ADDR, .flags = VHOST_USER_VERSION, .payload.addr = *addr, - .size = sizeof(*addr), + .size = sizeof(msg.payload.addr), }; vhost_user_write(dev, &msg, NULL, 0); @@ -304,7 +304,7 @@ static int vhost_set_vring(struct vhost_dev *dev, .request = request, .flags = VHOST_USER_VERSION, .payload.state = *ring, - .size = sizeof(*ring), + .size = sizeof(msg.payload.state), }; vhost_user_write(dev, &msg, NULL, 0); @@ -346,7 +346,7 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev, .request = VHOST_USER_GET_VRING_BASE, .flags = VHOST_USER_VERSION, .payload.state = *ring, - .size = sizeof(*ring), + .size = sizeof(msg.payload.state), }; vhost_user_write(dev, &msg, NULL, 0); @@ -361,7 +361,7 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev, return -1; } - if (msg.size != sizeof(m.payload.state)) { + if (msg.size != sizeof(msg.payload.state)) { error_report("Received bad msg size."); return -1; } @@ -381,7 +381,7 @@ static int vhost_set_vring_file(struct vhost_dev *dev, .request = request, .flags = VHOST_USER_VERSION, .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK, - .size = sizeof(m.payload.u64), + .size = sizeof(msg.payload.u64), }; if (ioeventfd_enabled() && file->fd > 0) { @@ -413,7 +413,7 @@ static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64) .request = request, .flags = VHOST_USER_VERSION, .payload.u64 = u64, - .size = sizeof(m.payload.u64), + .size = sizeof(msg.payload.u64), }; vhost_user_write(dev, &msg, NULL, 0); @@ -456,7 +456,7 @@ static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64) return -1; } - if (msg.size != sizeof(m.payload.u64)) { + if (msg.size != sizeof(msg.payload.u64)) { error_report("Received bad msg size."); return -1; } @@ -592,7 +592,7 @@ static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) msg.request = VHOST_USER_SEND_RARP; msg.flags = VHOST_USER_VERSION; memcpy((char *)&msg.payload.u64, mac_addr, 6); - msg.size = sizeof(m.payload.u64); + msg.size = sizeof(msg.payload.u64); err = vhost_user_write(dev, &msg, NULL, 0); return err; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d0bc72e0e4..939f802110 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -448,28 +448,59 @@ int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, return in_bytes <= in_total && out_bytes <= out_total; } -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write) +static void virtqueue_map_iovec(struct iovec *sg, hwaddr *addr, + unsigned int *num_sg, unsigned int max_size, + int is_write) { unsigned int i; hwaddr len; - if (num_sg > VIRTQUEUE_MAX_SIZE) { - error_report("virtio: map attempt out of bounds: %zd > %d", - num_sg, VIRTQUEUE_MAX_SIZE); - exit(1); - } + /* Note: this function MUST validate input, some callers + * are passing in num_sg values received over the network. + */ + /* TODO: teach all callers that this can fail, and return failure instead + * of asserting here. + * When we do, we might be able to re-enable NDEBUG below. + */ +#ifdef NDEBUG +#error building with NDEBUG is not supported +#endif + assert(*num_sg <= max_size); - for (i = 0; i < num_sg; i++) { + for (i = 0; i < *num_sg; i++) { len = sg[i].iov_len; sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write); - if (sg[i].iov_base == NULL || len != sg[i].iov_len) { + if (!sg[i].iov_base) { error_report("virtio: error trying to map MMIO memory"); exit(1); } + if (len == sg[i].iov_len) { + continue; + } + if (*num_sg >= max_size) { + error_report("virtio: memory split makes iovec too large"); + exit(1); + } + memmove(sg + i + 1, sg + i, sizeof(*sg) * (*num_sg - i)); + memmove(addr + i + 1, addr + i, sizeof(*addr) * (*num_sg - i)); + assert(len < sg[i + 1].iov_len); + sg[i].iov_len = len; + addr[i + 1] += len; + sg[i + 1].iov_len -= len; + ++*num_sg; } } +void virtqueue_map(VirtQueueElement *elem) +{ + virtqueue_map_iovec(elem->in_sg, elem->in_addr, &elem->in_num, + MIN(ARRAY_SIZE(elem->in_sg), ARRAY_SIZE(elem->in_addr)), + 1); + virtqueue_map_iovec(elem->out_sg, elem->out_addr, &elem->out_num, + MIN(ARRAY_SIZE(elem->out_sg), ARRAY_SIZE(elem->out_addr)), + 0); +} + int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) { unsigned int i, head, max; @@ -531,8 +562,7 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max); /* Now map what we have collected */ - virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1); - virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0); + virtqueue_map(elem); elem->index = head; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 7037de044d..606dbc2854 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -60,7 +60,6 @@ struct PCMachineClass { /*< public >*/ bool broken_reserved_end; - bool inter_dimm_gap; HotplugHandler *(*get_hotplug_handler)(MachineState *machine, DeviceState *dev); }; diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index c1ee7b0408..d83bf30ea9 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -83,16 +83,15 @@ typedef struct MemoryHotplugState { uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, - uint64_t *hint, uint64_t align, bool gap, - uint64_t size, Error **errp); + uint64_t *hint, uint64_t align, uint64_t size, + Error **errp); int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp); int qmp_pc_dimm_device_list(Object *obj, void *opaque); uint64_t pc_existing_dimms_capacity(Error **errp); void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, - MemoryRegion *mr, uint64_t align, bool gap, - Error **errp); + MemoryRegion *mr, uint64_t align, Error **errp); void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms, MemoryRegion *mr); #endif diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index f5e7fd818a..379b6e1a45 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -397,6 +397,7 @@ void pci_for_each_bus_depth_first(PCIBus *bus, void *(*begin)(PCIBus *bus, void *parent_state), void (*end)(PCIBus *bus, void *state), void *parent_state); +PCIDevice *pci_get_function_0(PCIDevice *pci_dev); /* Use this wrapper when specific scan order is not required. */ static inline diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 9d09115fab..205fadf234 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -151,8 +151,7 @@ void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len, unsigned int idx); -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write); +void virtqueue_map(VirtQueueElement *elem); int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, unsigned int out_bytes); diff --git a/tests/Makefile b/tests/Makefile index 1c57e39c53..0739bfe1bf 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -525,6 +525,7 @@ tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o $(test-util-obj-y) tests/test-write-threshold$(EXESUF): tests/test-write-threshold.o $(test-block-obj-y) tests/test-netfilter$(EXESUF): tests/test-netfilter.o $(qtest-obj-y) tests/ivshmem-test$(EXESUF): tests/ivshmem-test.o contrib/ivshmem-server/ivshmem-server.o $(libqos-pc-obj-y) +tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o ifeq ($(CONFIG_POSIX),y) LIBS += -lutil diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c new file mode 100644 index 0000000000..fa18ad55fb --- /dev/null +++ b/tests/vhost-user-bridge.c @@ -0,0 +1,1110 @@ +/* + * Vhost User Bridge + * + * Copyright (c) 2015 Red Hat, Inc. + * + * Authors: + * Victor Kaplansky <victork@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +/* + * TODO: + * - main should get parameters from the command line. + * - implement all request handlers. + * - test for broken requests and virtqueue. + * - implement features defined by Virtio 1.0 spec. + * - support mergeable buffers and indirect descriptors. + * - implement RESET_DEVICE request. + * - implement clean shutdown. + * - implement non-blocking writes to UDP backend. + * - implement polling strategy. + */ + +#include <stddef.h> +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <inttypes.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/unistd.h> +#include <sys/mman.h> +#include <sys/eventfd.h> +#include <arpa/inet.h> + +#include <linux/vhost.h> + +#include "qemu/atomic.h" +#include "standard-headers/linux/virtio_net.h" +#include "standard-headers/linux/virtio_ring.h" + +#define VHOST_USER_BRIDGE_DEBUG 1 + +#define DPRINT(...) \ + do { \ + if (VHOST_USER_BRIDGE_DEBUG) { \ + printf(__VA_ARGS__); \ + } \ + } while (0) + +typedef void (*CallbackFunc)(int sock, void *ctx); + +typedef struct Event { + void *ctx; + CallbackFunc callback; +} Event; + +typedef struct Dispatcher { + int max_sock; + fd_set fdset; + Event events[FD_SETSIZE]; +} Dispatcher; + +static void +vubr_die(const char *s) +{ + perror(s); + exit(1); +} + +static int +dispatcher_init(Dispatcher *dispr) +{ + FD_ZERO(&dispr->fdset); + dispr->max_sock = -1; + return 0; +} + +static int +dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb) +{ + if (sock >= FD_SETSIZE) { + fprintf(stderr, + "Error: Failed to add new event. sock %d should be less than %d\n", + sock, FD_SETSIZE); + return -1; + } + + dispr->events[sock].ctx = ctx; + dispr->events[sock].callback = cb; + + FD_SET(sock, &dispr->fdset); + if (sock > dispr->max_sock) { + dispr->max_sock = sock; + } + DPRINT("Added sock %d for watching. max_sock: %d\n", + sock, dispr->max_sock); + return 0; +} + +#if 0 +/* dispatcher_remove() is not currently in use but may be useful + * in the future. */ +static int +dispatcher_remove(Dispatcher *dispr, int sock) +{ + if (sock >= FD_SETSIZE) { + fprintf(stderr, + "Error: Failed to remove event. sock %d should be less than %d\n", + sock, FD_SETSIZE); + return -1; + } + + FD_CLR(sock, &dispr->fdset); + return 0; +} +#endif + +/* timeout in us */ +static int +dispatcher_wait(Dispatcher *dispr, uint32_t timeout) +{ + struct timeval tv; + tv.tv_sec = timeout / 1000000; + tv.tv_usec = timeout % 1000000; + + fd_set fdset = dispr->fdset; + + /* wait until some of sockets become readable. */ + int rc = select(dispr->max_sock + 1, &fdset, 0, 0, &tv); + + if (rc == -1) { + vubr_die("select"); + } + + /* Timeout */ + if (rc == 0) { + return 0; + } + + /* Now call callback for every ready socket. */ + + int sock; + for (sock = 0; sock < dispr->max_sock + 1; sock++) + if (FD_ISSET(sock, &fdset)) { + Event *e = &dispr->events[sock]; + e->callback(sock, e->ctx); + } + + return 0; +} + +typedef struct VubrVirtq { + int call_fd; + int kick_fd; + uint32_t size; + uint16_t last_avail_index; + uint16_t last_used_index; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; +} VubrVirtq; + +/* Based on qemu/hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_DEVICE = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1<<2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num; +} QEMU_PACKED VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION (0x1) + +#define MAX_NR_VIRTQUEUE (8) + +typedef struct VubrDevRegion { + /* Guest Physical address. */ + uint64_t gpa; + /* Memory region size. */ + uint64_t size; + /* QEMU virtual address (userspace). */ + uint64_t qva; + /* Starting offset in our mmaped space. */ + uint64_t mmap_offset; + /* Start address of mmaped space. */ + uint64_t mmap_addr; +} VubrDevRegion; + +typedef struct VubrDev { + int sock; + Dispatcher dispatcher; + uint32_t nregions; + VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VubrVirtq vq[MAX_NR_VIRTQUEUE]; + int backend_udp_sock; + struct sockaddr_in backend_udp_dest; +} VubrDev; + +static const char *vubr_request_str[] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_DEVICE] = "VHOST_USER_RESET_DEVICE", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_MAX] = "VHOST_USER_MAX", +}; + +static void +print_buffer(uint8_t *buf, size_t len) +{ + int i; + printf("Raw buffer:\n"); + for (i = 0; i < len; i++) { + if (i % 16 == 0) { + printf("\n"); + } + if (i % 4 == 0) { + printf(" "); + } + printf("%02x ", buf[i]); + } + printf("\n............................................................\n"); +} + +/* Translate guest physical address to our virtual address. */ +static uint64_t +gpa_to_va(VubrDev *dev, uint64_t guest_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VubrDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; + } + } + + assert(!"address not found in regions"); + return 0; +} + +/* Translate qemu virtual address to our virtual address. */ +static uint64_t +qva_to_va(VubrDev *dev, uint64_t qemu_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VubrDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; + } + } + + assert(!"address not found in regions"); + return 0; +} + +static void +vubr_message_read(int conn_fd, VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + int rc; + + rc = recvmsg(conn_fd, &msg, 0); + + if (rc <= 0) { + vubr_die("recvmsg"); + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); + cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) + { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + if (vmsg->size > sizeof(vmsg->payload)) { + fprintf(stderr, + "Error: too big message request: %d, size: vmsg->size: %u, " + "while sizeof(vmsg->payload) = %lu\n", + vmsg->request, vmsg->size, sizeof(vmsg->payload)); + exit(1); + } + + if (vmsg->size) { + rc = read(conn_fd, &vmsg->payload, vmsg->size); + if (rc <= 0) { + vubr_die("recvmsg"); + } + + assert(rc == vmsg->size); + } +} + +static void +vubr_message_write(int conn_fd, VhostUserMsg *vmsg) +{ + int rc; + + do { + rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size); + } while (rc < 0 && errno == EINTR); + + if (rc < 0) { + vubr_die("write"); + } +} + +static void +vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len) +{ + int slen = sizeof(struct sockaddr_in); + + if (sendto(dev->backend_udp_sock, buf, len, 0, + (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) { + vubr_die("sendto()"); + } +} + +static int +vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen) +{ + int slen = sizeof(struct sockaddr_in); + int rc; + + rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0, + (struct sockaddr *) &dev->backend_udp_dest, + (socklen_t *)&slen); + if (rc == -1) { + vubr_die("recvfrom()"); + } + + return rc; +} + +static void +vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len) +{ + int hdrlen = sizeof(struct virtio_net_hdr_v1); + + if (VHOST_USER_BRIDGE_DEBUG) { + print_buffer(buf, len); + } + vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen); +} + +/* Kick the guest if necessary. */ +static void +vubr_virtqueue_kick(VubrVirtq *vq) +{ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + DPRINT("Kicking the guest...\n"); + eventfd_write(vq->call_fd, 1); + } +} + +static void +vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) +{ + struct vring_desc *desc = vq->desc; + struct vring_avail *avail = vq->avail; + struct vring_used *used = vq->used; + + unsigned int size = vq->size; + + uint16_t avail_index = atomic_mb_read(&avail->idx); + + /* We check the available descriptors before posting the + * buffer, so here we assume that enough available + * descriptors. */ + assert(vq->last_avail_index != avail_index); + uint16_t a_index = vq->last_avail_index % size; + uint16_t u_index = vq->last_used_index % size; + uint16_t d_index = avail->ring[a_index]; + + int i = d_index; + + DPRINT("Post packet to guest on vq:\n"); + DPRINT(" size = %d\n", vq->size); + DPRINT(" last_avail_index = %d\n", vq->last_avail_index); + DPRINT(" last_used_index = %d\n", vq->last_used_index); + DPRINT(" a_index = %d\n", a_index); + DPRINT(" u_index = %d\n", u_index); + DPRINT(" d_index = %d\n", d_index); + DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr); + DPRINT(" desc[%d].len = %d\n", i, desc[i].len); + DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags); + DPRINT(" avail->idx = %d\n", avail_index); + DPRINT(" used->idx = %d\n", used->idx); + + if (!(desc[i].flags & VRING_DESC_F_WRITE)) { + /* FIXME: we should find writable descriptor. */ + fprintf(stderr, "Error: descriptor is not writable. Exiting.\n"); + exit(1); + } + + void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr); + uint32_t chunk_len = desc[i].len; + + if (len <= chunk_len) { + memcpy(chunk_start, buf, len); + } else { + fprintf(stderr, + "Received too long packet from the backend. Dropping...\n"); + return; + } + + /* Add descriptor to the used ring. */ + used->ring[u_index].id = d_index; + used->ring[u_index].len = len; + + vq->last_avail_index++; + vq->last_used_index++; + + atomic_mb_set(&used->idx, vq->last_used_index); + + /* Kick the guest if necessary. */ + vubr_virtqueue_kick(vq); +} + +static int +vubr_process_desc(VubrDev *dev, VubrVirtq *vq) +{ + struct vring_desc *desc = vq->desc; + struct vring_avail *avail = vq->avail; + struct vring_used *used = vq->used; + + unsigned int size = vq->size; + + uint16_t a_index = vq->last_avail_index % size; + uint16_t u_index = vq->last_used_index % size; + uint16_t d_index = avail->ring[a_index]; + + uint32_t i, len = 0; + size_t buf_size = 4096; + uint8_t buf[4096]; + + DPRINT("Chunks: "); + i = d_index; + do { + void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr); + uint32_t chunk_len = desc[i].len; + + if (len + chunk_len < buf_size) { + memcpy(buf + len, chunk_start, chunk_len); + DPRINT("%d ", chunk_len); + } else { + fprintf(stderr, "Error: too long packet. Dropping...\n"); + break; + } + + len += chunk_len; + + if (!(desc[i].flags & VRING_DESC_F_NEXT)) { + break; + } + + i = desc[i].next; + } while (1); + DPRINT("\n"); + + if (!len) { + return -1; + } + + /* Add descriptor to the used ring. */ + used->ring[u_index].id = d_index; + used->ring[u_index].len = len; + + vubr_consume_raw_packet(dev, buf, len); + + return 0; +} + +static void +vubr_process_avail(VubrDev *dev, VubrVirtq *vq) +{ + struct vring_avail *avail = vq->avail; + struct vring_used *used = vq->used; + + while (vq->last_avail_index != atomic_mb_read(&avail->idx)) { + vubr_process_desc(dev, vq); + vq->last_avail_index++; + vq->last_used_index++; + } + + atomic_mb_set(&used->idx, vq->last_used_index); +} + +static void +vubr_backend_recv_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *) ctx; + VubrVirtq *rx_vq = &dev->vq[0]; + uint8_t buf[4096]; + struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf; + int hdrlen = sizeof(struct virtio_net_hdr_v1); + int buflen = sizeof(buf); + int len; + + DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); + + uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx); + + /* If there is no available descriptors, just do nothing. + * The buffer will be handled by next arrived UDP packet, + * or next kick on receive virtq. */ + if (rx_vq->last_avail_index == avail_index) { + DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n"); + return; + } + + len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen); + + *hdr = (struct virtio_net_hdr_v1) { }; + hdr->num_buffers = 1; + vubr_post_buffer(dev, rx_vq, buf, len + hdrlen); +} + +static void +vubr_kick_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *) ctx; + eventfd_t kick_data; + ssize_t rc; + + rc = eventfd_read(sock, &kick_data); + if (rc == -1) { + vubr_die("eventfd_read()"); + } else { + DPRINT("Got kick_data: %016"PRIx64"\n", kick_data); + vubr_process_avail(dev, &dev->vq[1]); + } +} + +static int +vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + vmsg->payload.u64 = + ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_NET_F_CTRL_VQ) | + (1ULL << VIRTIO_NET_F_CTRL_RX) | + (1ULL << VHOST_F_LOG_ALL)); + vmsg->size = sizeof(vmsg->payload.u64); + + DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + /* reply */ + return 1; +} + +static int +vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + return 0; +} + +static int +vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + int i; + VhostUserMemory *memory = &vmsg->payload.memory; + dev->nregions = memory->nregions; + + DPRINT("Nregions: %d\n", memory->nregions); + for (i = 0; i < dev->nregions; i++) { + void *mmap_addr; + VhostUserMemoryRegion *msg_region = &memory->regions[i]; + VubrDevRegion *dev_region = &dev->regions[i]; + + DPRINT("Region %d\n", i); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, + vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vubr_die("mmap"); + } + + dev_region->mmap_addr = (uint64_t) mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr); + } + + return 0; +} + +static int +vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].size = num; + return 0; +} + +static int +vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + struct vhost_vring_addr *vra = &vmsg->payload.addr; + unsigned int index = vra->index; + VubrVirtq *vq = &dev->vq[index]; + + DPRINT("vhost_vring_addr:\n"); + DPRINT(" index: %d\n", vra->index); + DPRINT(" flags: %d\n", vra->flags); + DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr); + DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr); + DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr); + DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr); + + vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr); + vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr); + vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr); + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->desc); + DPRINT(" vring_used at %p\n", vq->used); + DPRINT(" vring_avail at %p\n", vq->avail); + + vq->last_used_index = vq->used->idx; + return 0; +} + +static int +vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].last_avail_index = num; + + return 0; +} + +static int +vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + uint64_t u64_arg = vmsg->payload.u64; + int index = u64_arg & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); + assert(vmsg->fd_num == 1); + + dev->vq[index].kick_fd = vmsg->fds[0]; + DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); + + if (index % 2 == 1) { + /* TX queue. */ + dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd, + dev, vubr_kick_cb); + + DPRINT("Waiting for kicks on fd: %d for vq: %d\n", + dev->vq[index].kick_fd, index); + } + return 0; +} + +static int +vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + uint64_t u64_arg = vmsg->payload.u64; + int index = u64_arg & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); + assert(vmsg->fd_num == 1); + + dev->vq[index].call_fd = vmsg->fds[0]; + DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); + + return 0; +} + +static int +vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + /* FIXME: unimplented */ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + /* FIXME: unimplented */ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg) +{ + /* Print out generic part of the request. */ + DPRINT( + "================== Vhost user message from QEMU ==================\n"); + DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request], + vmsg->request); + DPRINT("Flags: 0x%x\n", vmsg->flags); + DPRINT("Size: %d\n", vmsg->size); + + if (vmsg->fd_num) { + int i; + DPRINT("Fds:"); + for (i = 0; i < vmsg->fd_num; i++) { + DPRINT(" %d", vmsg->fds[i]); + } + DPRINT("\n"); + } + + switch (vmsg->request) { + case VHOST_USER_NONE: + return vubr_none_exec(dev, vmsg); + case VHOST_USER_GET_FEATURES: + return vubr_get_features_exec(dev, vmsg); + case VHOST_USER_SET_FEATURES: + return vubr_set_features_exec(dev, vmsg); + case VHOST_USER_SET_OWNER: + return vubr_set_owner_exec(dev, vmsg); + case VHOST_USER_RESET_DEVICE: + return vubr_reset_device_exec(dev, vmsg); + case VHOST_USER_SET_MEM_TABLE: + return vubr_set_mem_table_exec(dev, vmsg); + case VHOST_USER_SET_LOG_BASE: + return vubr_set_log_base_exec(dev, vmsg); + case VHOST_USER_SET_LOG_FD: + return vubr_set_log_fd_exec(dev, vmsg); + case VHOST_USER_SET_VRING_NUM: + return vubr_set_vring_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ADDR: + return vubr_set_vring_addr_exec(dev, vmsg); + case VHOST_USER_SET_VRING_BASE: + return vubr_set_vring_base_exec(dev, vmsg); + case VHOST_USER_GET_VRING_BASE: + return vubr_get_vring_base_exec(dev, vmsg); + case VHOST_USER_SET_VRING_KICK: + return vubr_set_vring_kick_exec(dev, vmsg); + case VHOST_USER_SET_VRING_CALL: + return vubr_set_vring_call_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ERR: + return vubr_set_vring_err_exec(dev, vmsg); + case VHOST_USER_GET_PROTOCOL_FEATURES: + return vubr_get_protocol_features_exec(dev, vmsg); + case VHOST_USER_SET_PROTOCOL_FEATURES: + return vubr_set_protocol_features_exec(dev, vmsg); + case VHOST_USER_GET_QUEUE_NUM: + return vubr_get_queue_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ENABLE: + return vubr_set_vring_enable_exec(dev, vmsg); + case VHOST_USER_SEND_RARP: + return vubr_send_rarp_exec(dev, vmsg); + + case VHOST_USER_MAX: + assert(vmsg->request != VHOST_USER_MAX); + } + return 0; +} + +static void +vubr_receive_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *) ctx; + VhostUserMsg vmsg; + int reply_requested; + + vubr_message_read(sock, &vmsg); + reply_requested = vubr_execute_request(dev, &vmsg); + if (reply_requested) { + /* Set the version in the flags when sending the reply */ + vmsg.flags &= ~VHOST_USER_VERSION_MASK; + vmsg.flags |= VHOST_USER_VERSION; + vmsg.flags |= VHOST_USER_REPLY_MASK; + vubr_message_write(sock, &vmsg); + } +} + +static void +vubr_accept_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *)ctx; + int conn_fd; + struct sockaddr_un un; + socklen_t len = sizeof(un); + + conn_fd = accept(sock, (struct sockaddr *) &un, &len); + if (conn_fd == -1) { + vubr_die("accept()"); + } + DPRINT("Got connection from remote peer on sock %d\n", conn_fd); + dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb); +} + +static VubrDev * +vubr_new(const char *path) +{ + VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev)); + dev->nregions = 0; + int i; + struct sockaddr_un un; + size_t len; + + for (i = 0; i < MAX_NR_VIRTQUEUE; i++) { + dev->vq[i] = (VubrVirtq) { + .call_fd = -1, .kick_fd = -1, + .size = 0, + .last_avail_index = 0, .last_used_index = 0, + .desc = 0, .avail = 0, .used = 0, + }; + } + + /* Get a UNIX socket. */ + dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (dev->sock == -1) { + vubr_die("socket"); + } + + un.sun_family = AF_UNIX; + strcpy(un.sun_path, path); + len = sizeof(un.sun_family) + strlen(path); + unlink(path); + + if (bind(dev->sock, (struct sockaddr *) &un, len) == -1) { + vubr_die("bind"); + } + + if (listen(dev->sock, 1) == -1) { + vubr_die("listen"); + } + + dispatcher_init(&dev->dispatcher); + dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev, + vubr_accept_cb); + + DPRINT("Waiting for connections on UNIX socket %s ...\n", path); + return dev; +} + +static void +vubr_backend_udp_setup(VubrDev *dev, + const char *local_host, + uint16_t local_port, + const char *dest_host, + uint16_t dest_port) +{ + int sock; + struct sockaddr_in si_local = { + .sin_family = AF_INET, + .sin_port = htons(local_port), + }; + + if (inet_aton(local_host, &si_local.sin_addr) == 0) { + fprintf(stderr, "inet_aton() failed.\n"); + exit(1); + } + + /* setup destination for sends */ + dev->backend_udp_dest = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(dest_port), + }; + if (inet_aton(dest_host, &dev->backend_udp_dest.sin_addr) == 0) { + fprintf(stderr, "inet_aton() failed.\n"); + exit(1); + } + + sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sock == -1) { + vubr_die("socket"); + } + + if (bind(sock, (struct sockaddr *)&si_local, sizeof(si_local)) == -1) { + vubr_die("bind"); + } + + dev->backend_udp_sock = sock; + dispatcher_add(&dev->dispatcher, sock, dev, vubr_backend_recv_cb); + DPRINT("Waiting for data from udp backend on %s:%d...\n", + local_host, local_port); +} + +static void +vubr_run(VubrDev *dev) +{ + while (1) { + /* timeout 200ms */ + dispatcher_wait(&dev->dispatcher, 200000); + /* Here one can try polling strategy. */ + } +} + +int +main(int argc, char *argv[]) +{ + VubrDev *dev; + + dev = vubr_new("/tmp/vubr.sock"); + if (!dev) { + return 1; + } + + vubr_backend_udp_setup(dev, + "127.0.0.1", 4444, + "127.0.0.1", 5555); + vubr_run(dev); + return 0; +} diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c index a74c934cc0..b6dde753f8 100644 --- a/tests/vhost-user-test.c +++ b/tests/vhost-user-test.c @@ -98,7 +98,7 @@ typedef struct VhostUserMsg { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; - }; + } payload; } QEMU_PACKED VhostUserMsg; static VhostUserMsg m __attribute__ ((unused)); @@ -242,23 +242,23 @@ static void chr_read(void *opaque, const uint8_t *buf, int size) case VHOST_USER_GET_FEATURES: /* send back features to qemu */ msg.flags |= VHOST_USER_REPLY_MASK; - msg.size = sizeof(m.u64); - msg.u64 = 0x1ULL << VHOST_F_LOG_ALL | + msg.size = sizeof(m.payload.u64); + msg.payload.u64 = 0x1ULL << VHOST_F_LOG_ALL | 0x1ULL << VHOST_USER_F_PROTOCOL_FEATURES; p = (uint8_t *) &msg; qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); break; case VHOST_USER_SET_FEATURES: - g_assert_cmpint(msg.u64 & (0x1ULL << VHOST_USER_F_PROTOCOL_FEATURES), + g_assert_cmpint(msg.payload.u64 & (0x1ULL << VHOST_USER_F_PROTOCOL_FEATURES), !=, 0ULL); break; case VHOST_USER_GET_PROTOCOL_FEATURES: /* send back features to qemu */ msg.flags |= VHOST_USER_REPLY_MASK; - msg.size = sizeof(m.u64); - msg.u64 = 1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD; + msg.size = sizeof(m.payload.u64); + msg.payload.u64 = 1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD; p = (uint8_t *) &msg; qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); break; @@ -266,15 +266,15 @@ static void chr_read(void *opaque, const uint8_t *buf, int size) case VHOST_USER_GET_VRING_BASE: /* send back vring base to qemu */ msg.flags |= VHOST_USER_REPLY_MASK; - msg.size = sizeof(m.state); - msg.state.num = 0; + msg.size = sizeof(m.payload.state); + msg.payload.state.num = 0; p = (uint8_t *) &msg; qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); break; case VHOST_USER_SET_MEM_TABLE: /* received the mem table */ - memcpy(&s->memory, &msg.memory, sizeof(msg.memory)); + memcpy(&s->memory, &msg.payload.memory, sizeof(msg.payload.memory)); s->fds_num = qemu_chr_fe_get_msgfds(chr, s->fds, G_N_ELEMENTS(s->fds)); /* signal the test that it can continue */ diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 13942694cc..c37acbe58e 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -26,7 +26,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) void *ptr1; if (ptr == MAP_FAILED) { - return NULL; + return MAP_FAILED; } /* Make sure align is a power of 2 */ @@ -41,7 +41,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) fd, 0); if (ptr1 == MAP_FAILED) { munmap(ptr, total); - return NULL; + return MAP_FAILED; } ptr += offset; |