diff options
-rw-r--r-- | Documentation/lguest/lguest.c | 231 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 1 | ||||
-rw-r--r-- | drivers/Kconfig | 2 | ||||
-rw-r--r-- | drivers/block/Kconfig | 3 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 106 | ||||
-rw-r--r-- | drivers/char/virtio_console.c | 4 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 146 | ||||
-rw-r--r-- | drivers/net/Kconfig | 3 | ||||
-rw-r--r-- | drivers/net/virtio_net.c | 155 | ||||
-rw-r--r-- | drivers/virtio/Kconfig | 31 | ||||
-rw-r--r-- | drivers/virtio/Makefile | 2 | ||||
-rw-r--r-- | drivers/virtio/virtio.c | 65 | ||||
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 284 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci.c | 446 | ||||
-rw-r--r-- | drivers/virtio/virtio_ring.c | 51 | ||||
-rw-r--r-- | include/linux/lguest_launcher.h | 9 | ||||
-rw-r--r-- | include/linux/skbuff.h | 1 | ||||
-rw-r--r-- | include/linux/virtio.h | 19 | ||||
-rw-r--r-- | include/linux/virtio_balloon.h | 18 | ||||
-rw-r--r-- | include/linux/virtio_blk.h | 22 | ||||
-rw-r--r-- | include/linux/virtio_config.h | 104 | ||||
-rw-r--r-- | include/linux/virtio_net.h | 32 | ||||
-rw-r--r-- | include/linux/virtio_pci.h | 57 | ||||
-rw-r--r-- | include/linux/virtio_ring.h | 14 | ||||
-rw-r--r-- | net/9p/trans_virtio.c | 8 | ||||
-rw-r--r-- | net/core/skbuff.c | 29 |
26 files changed, 1422 insertions, 421 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 6c8a2386cd5..0f23d67f958 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -34,6 +34,8 @@ #include <zlib.h> #include <assert.h> #include <sched.h> +#include <limits.h> +#include <stddef.h> #include "linux/lguest_launcher.h" #include "linux/virtio_config.h" #include "linux/virtio_net.h" @@ -99,13 +101,11 @@ struct device_list /* The descriptor page for the devices. */ u8 *descpage; - /* The tail of the last descriptor. */ - unsigned int desc_used; - /* A single linked list of devices. */ struct device *dev; - /* ... And an end pointer so we can easily append new devices */ - struct device **lastdev; + /* And a pointer to the last device for easy append and also for + * configuration appending. */ + struct device *lastdev; }; /* The list of Guest devices, based on command line arguments. */ @@ -191,7 +191,14 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, #define cpu_to_le64(v64) (v64) #define le16_to_cpu(v16) (v16) #define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v32) (v64) +#define le64_to_cpu(v64) (v64) + +/* The device virtqueue descriptors are followed by feature bitmasks. */ +static u8 *get_feature_bits(struct device *dev) +{ + return (u8 *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig); +} /*L:100 The Launcher code itself takes us out into userspace, that scary place * where pointers run wild and free! Unfortunately, like most userspace @@ -914,21 +921,58 @@ static void enable_fd(int fd, struct virtqueue *vq) write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } +/* Resetting a device is fairly easy. */ +static void reset_device(struct device *dev) +{ + struct virtqueue *vq; + + verbose("Resetting device %s\n", dev->name); + /* Clear the status. */ + dev->desc->status = 0; + + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->desc->feature_len, 0, + dev->desc->feature_len); + + /* Zero out the virtqueues. */ + for (vq = dev->vq; vq; vq = vq->next) { + memset(vq->vring.desc, 0, + vring_size(vq->config.num, getpagesize())); + vq->last_avail_idx = 0; + } +} + /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ static void handle_output(int fd, unsigned long addr) { struct device *i; struct virtqueue *vq; - /* Check each virtqueue. */ + /* Check each device and virtqueue. */ for (i = devices.dev; i; i = i->next) { + /* Notifications to device descriptors reset the device. */ + if (from_guest_phys(addr) == i->desc) { + reset_device(i); + return; + } + + /* Notifications to virtqueues mean output has occurred. */ for (vq = i->vq; vq; vq = vq->next) { - if (vq->config.pfn == addr/getpagesize() - && vq->handle_output) { - verbose("Output to %s\n", vq->dev->name); - vq->handle_output(fd, vq); + if (vq->config.pfn != addr/getpagesize()) + continue; + + /* Guest should acknowledge (and set features!) before + * using the device. */ + if (i->desc->status == 0) { + warnx("%s gave early output", i->name); return; } + + if (strcmp(vq->dev->name, "console") != 0) + verbose("Output to %s\n", vq->dev->name); + if (vq->handle_output) + vq->handle_output(fd, vq); + return; } } @@ -986,54 +1030,44 @@ static void handle_input(int fd) * * All devices need a descriptor so the Guest knows it exists, and a "struct * device" so the Launcher can keep track of it. We have common helper - * routines to allocate them. - * - * This routine allocates a new "struct lguest_device_desc" from descriptor - * table just above the Guest's normal memory. It returns a pointer to that - * descriptor. */ -static struct lguest_device_desc *new_dev_desc(u16 type) -{ - struct lguest_device_desc *d; + * routines to allocate and manage them. */ - /* We only have one page for all the descriptors. */ - if (devices.desc_used + sizeof(*d) > getpagesize()) - errx(1, "Too many devices"); - - /* We don't need to set config_len or status: page is 0 already. */ - d = (void *)devices.descpage + devices.desc_used; - d->type = type; - devices.desc_used += sizeof(*d); - - return d; +/* The layout of the device page is a "struct lguest_device_desc" followed by a + * number of virtqueue descriptors, then two sets of feature bits, then an + * array of configuration bytes. This routine returns the configuration + * pointer. */ +static u8 *device_config(const struct device *dev) +{ + return (void *)(dev->desc + 1) + + dev->desc->num_vq * sizeof(struct lguest_vqconfig) + + dev->desc->feature_len * 2; } -/* Each device descriptor is followed by some configuration information. - * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. - * - * This routine adds a new field to an existing device's descriptor. It only - * works for the last device, but that's OK because that's how we use it. */ -static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) +/* This routine allocates a new "struct lguest_device_desc" from descriptor + * table page just above the Guest's normal memory. It returns a pointer to + * that descriptor. */ +static struct lguest_device_desc *new_dev_desc(u16 type) { - /* This is the last descriptor, right? */ - assert(devices.descpage + devices.desc_used - == (u8 *)(dev->desc + 1) + dev->desc->config_len); + struct lguest_device_desc d = { .type = type }; + void *p; - /* We only have one page of device descriptions. */ - if (devices.desc_used + 2 + len > getpagesize()) - errx(1, "Too many devices"); + /* Figure out where the next device config is, based on the last one. */ + if (devices.lastdev) + p = device_config(devices.lastdev) + + devices.lastdev->desc->config_len; + else + p = devices.descpage; - /* Copy in the new config header: type then length. */ - devices.descpage[devices.desc_used++] = type; - devices.descpage[devices.desc_used++] = len; - memcpy(devices.descpage + devices.desc_used, c, len); - devices.desc_used += len; + /* We only have one page for all the descriptors. */ + if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) + errx(1, "Too many devices"); - /* Update the device descriptor length: two byte head then data. */ - dev->desc->config_len += 2 + len; + /* p might not be aligned, so we memcpy in. */ + return memcpy(p, &d, sizeof(d)); } -/* This routine adds a virtqueue to a device. We specify how many descriptors - * the virtqueue is to have. */ +/* Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*handle_output)(int fd, struct virtqueue *me)) { @@ -1059,9 +1093,15 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, getpagesize()); - /* Add the configuration information to this device's descriptor. */ - add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, - sizeof(vq->config), &vq->config); + /* Append virtqueue to this device's descriptor. We use + * device_config() to get the end of the device's current virtqueues; + * we check that we haven't added any config or feature information + * yet, otherwise we'd be overwriting them. */ + assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); + memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->desc->num_vq++; + + verbose("Virtqueue page %#lx\n", to_guest_phys(p)); /* Add to tail of list, so dev->vq is first vq, dev->vq->next is * second. */ @@ -1072,11 +1112,41 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * virtqueue. */ vq->handle_output = handle_output; - /* Set the "Don't Notify Me" flag if we don't have a handler */ + /* As an optimization, set the advisory "Don't Notify Me" flag if we + * don't have a handler */ if (!handle_output) vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } +/* The first half of the feature bitmask is for us to advertise features. The + * second half if for the Guest to accept features. */ +static void add_feature(struct device *dev, unsigned bit) +{ + u8 *features = get_feature_bits(dev); + + /* We can't extend the feature bits once we've added config bytes */ + if (dev->desc->feature_len <= bit / CHAR_BIT) { + assert(dev->desc->config_len == 0); + dev->desc->feature_len = (bit / CHAR_BIT) + 1; + } + + features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); +} + +/* This routine sets the configuration fields for an existing device's + * descriptor. It only works for the last device, but that's OK because that's + * how we use it. */ +static void set_config(struct device *dev, unsigned len, const void *conf) +{ + /* Check we haven't overflowed our single page. */ + if (device_config(dev) + len > devices.descpage + getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the config information, and store the length. */ + memcpy(device_config(dev), conf, len); + dev->desc->config_len = len; +} + /* This routine does all the creation and setup of a new device, including * calling new_dev_desc() to allocate the descriptor and device memory. */ static struct device *new_device(const char *name, u16 type, int fd, @@ -1084,14 +1154,6 @@ static struct device *new_device(const char *name, u16 type, int fd, { struct device *dev = malloc(sizeof(*dev)); - /* Append to device list. Prepending to a single-linked list is - * easier, but the user expects the devices to be arranged on the bus - * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ - *devices.lastdev = dev; - dev->next = NULL; - devices.lastdev = &dev->next; - /* Now we populate the fields one at a time. */ dev->fd = fd; /* If we have an input handler for this file descriptor, then we add it @@ -1102,6 +1164,17 @@ static struct device *new_device(const char *name, u16 type, int fd, dev->handle_input = handle_input; dev->name = name; dev->vq = NULL; + + /* Append to device list. Prepending to a single-linked list is + * easier, but the user expects the devices to be arranged on the bus + * in command-line order. The first network device on the command line + * is eth0, the first block device /dev/vda, etc. */ + if (devices.lastdev) + devices.lastdev->next = dev; + else + devices.dev = dev; + devices.lastdev = dev; + return dev; } @@ -1226,7 +1299,7 @@ static void setup_tun_net(const char *arg) int netfd, ipfd; u32 ip; const char *br_name = NULL; - u8 hwaddr[6]; + struct virtio_net_config conf; /* We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell @@ -1265,12 +1338,13 @@ static void setup_tun_net(const char *arg) ip = str2ip(arg); /* Set up the tun device, and get the mac address for the interface. */ - configure_device(ipfd, ifr.ifr_name, ip, hwaddr); + configure_device(ipfd, ifr.ifr_name, ip, conf.mac); /* Tell Guest what MAC address to use. */ - add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); + add_feature(dev, VIRTIO_NET_F_MAC); + set_config(dev, sizeof(conf), &conf); - /* We don't seed the socket any more; setup is done. */ + /* We don't need the socket any more; setup is done. */ close(ipfd); verbose("device %u: tun net %u.%u.%u.%u\n", @@ -1458,8 +1532,7 @@ static void setup_block_file(const char *filename) struct device *dev; struct vblk_info *vblk; void *stack; - u64 cap; - unsigned int val; + struct virtio_blk_config conf; /* This is the pipe the I/O thread will use to tell us I/O is done. */ pipe(p); @@ -1477,14 +1550,18 @@ static void setup_block_file(const char *filename) vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); vblk->len = lseek64(vblk->fd, 0, SEEK_END); + /* We support barriers. */ + add_feature(dev, VIRTIO_BLK_F_BARRIER); + /* Tell Guest how many sectors this device has. */ - cap = cpu_to_le64(vblk->len / 512); - add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); + conf.capacity = cpu_to_le64(vblk->len / 512); /* Tell Guest not to put in too many descriptors at once: two are used * for the in and out elements. */ - val = cpu_to_le32(VIRTQUEUE_NUM - 2); - add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); + add_feature(dev, VIRTIO_BLK_F_SEG_MAX); + conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); + + set_config(dev, sizeof(conf), &conf); /* The I/O thread writes to this end of the pipe when done. */ vblk->done_fd = p[1]; @@ -1505,7 +1582,7 @@ static void setup_block_file(const char *filename) close(vblk->workpipe[0]); verbose("device %u: virtblock %llu sectors\n", - devices.device_num, cap); + devices.device_num, le64_to_cpu(conf.capacity)); } /* That's the end of device setup. :*/ @@ -1610,12 +1687,12 @@ int main(int argc, char *argv[]) /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the - * list. We also keep a pointer to the last device, for easy appending - * to the list. Finally, we keep the next interrupt number to hand out - * (1: remember that 0 is used by the timer). */ + * list. We also keep a pointer to the last device. Finally, we keep + * the next interrupt number to hand out (1: remember that 0 is used by + * the timer). */ FD_ZERO(&devices.infds); devices.max_infd = -1; - devices.lastdev = &devices.dev; + devices.lastdev = NULL; devices.next_irq = 1; cpu_id = 0; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c83e1c9b512..41962e793c0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -53,5 +53,6 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig +source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/drivers/Kconfig b/drivers/Kconfig index 08d4ae20159..3f8a231fe75 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -91,6 +91,4 @@ source "drivers/dca/Kconfig" source "drivers/auxdisplay/Kconfig" source "drivers/uio/Kconfig" - -source "drivers/virtio/Kconfig" endmenu diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index f2122855d4e..64e5148d82b 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -440,6 +440,7 @@ config VIRTIO_BLK tristate "Virtio block driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO ---help--- - This is the virtual block driver for lguest. Say Y or M. + This is the virtual block driver for virtio. It can be used with + lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. endif # BLK_DEV diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 924ddd8bccd..3b1a68d6edd 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -7,8 +7,10 @@ #include <linux/scatterlist.h> #define VIRTIO_MAX_SG (3+MAX_PHYS_SEGMENTS) +#define PART_BITS 4 + +static int major, index; -static unsigned char virtblk_index = 'a'; struct virtio_blk { spinlock_t lock; @@ -36,7 +38,7 @@ struct virtblk_req struct virtio_blk_inhdr in_hdr; }; -static bool blk_done(struct virtqueue *vq) +static void blk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; struct virtblk_req *vbr; @@ -65,7 +67,6 @@ static bool blk_done(struct virtqueue *vq) /* In case queue is stopped waiting for more buffers. */ blk_start_queue(vblk->disk->queue); spin_unlock_irqrestore(&vblk->lock, flags); - return true; } static bool do_req(struct request_queue *q, struct virtio_blk *vblk, @@ -153,20 +154,37 @@ static int virtblk_ioctl(struct inode *inode, struct file *filp, (void __user *)data); } +/* We provide getgeo only to please some old bootloader/partitioning tools */ +static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values, similar to sd */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + static struct block_device_operations virtblk_fops = { - .ioctl = virtblk_ioctl, - .owner = THIS_MODULE, + .ioctl = virtblk_ioctl, + .owner = THIS_MODULE, + .getgeo = virtblk_getgeo, }; +static int index_to_minor(int index) +{ + return index << PART_BITS; +} + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; - int err, major; - void *token; - unsigned int len; + int err; u64 cap; u32 v; + if (index_to_minor(index) >= 1 << MINORBITS) + return -ENOSPC; + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); if (!vblk) { err = -ENOMEM; @@ -178,7 +196,7 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->vdev = vdev; /* We expect one virtqueue, for output. */ - vblk->vq = vdev->config->find_vq(vdev, blk_done); + vblk->vq = vdev->config->find_vq(vdev, 0, blk_done); if (IS_ERR(vblk->vq)) { err = PTR_ERR(vblk->vq); goto out_free_vblk; @@ -190,17 +208,11 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_vq; } - major = register_blkdev(0, "virtblk"); - if (major < 0) { - err = major; - goto out_mempool; - } - /* FIXME: How many partitions? How long is a piece of string? */ - vblk->disk = alloc_disk(1 << 4); + vblk->disk = alloc_disk(1 << PART_BITS); if (!vblk->disk) { err = -ENOMEM; - goto out_unregister_blkdev; + goto out_mempool; } vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); @@ -209,22 +221,32 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_put_disk; } - sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); + if (index < 26) { + sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); + } else if (index < (26 + 1) * 26) { + sprintf(vblk->disk->disk_name, "vd%c%c", + 'a' + index / 26 - 1, 'a' + index % 26); + } else { + const unsigned int m1 = (index / 26 - 1) / 26 - 1; + const unsigned int m2 = (index / 26 - 1) % 26; + const unsigned int m3 = index % 26; + sprintf(vblk->disk->disk_name, "vd%c%c%c", + 'a' + m1, 'a' + m2, 'a' + m3); + } + vblk->disk->major = major; - vblk->disk->first_minor = 0; + vblk->disk->first_minor = index_to_minor(index); vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; + index++; /* If barriers are supported, tell block layer that queue is ordered */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len); - if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER)) + if (vdev->config->feature(vdev, VIRTIO_BLK_F_BARRIER)) blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); - err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap); - if (err) { - dev_err(&vdev->dev, "Bad/missing capacity in config\n"); - goto out_cleanup_queue; - } + /* Host must always specify the capacity. */ + __virtio_config_val(vdev, offsetof(struct virtio_blk_config, capacity), + &cap); /* If capacity is too big, truncate with warning. */ if ((sector_t)cap != cap) { @@ -234,31 +256,25 @@ static int virtblk_probe(struct virtio_device *vdev) } set_capacity(vblk->disk, cap); - err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v); + /* Host can optionally specify maximum segment size and number of + * segments. */ + err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX, + offsetof(struct virtio_blk_config, size_max), + &v); if (!err) blk_queue_max_segment_size(vblk->disk->queue, v); - else if (err != -ENOENT) { - dev_err(&vdev->dev, "Bad SIZE_MAX in config\n"); - goto out_cleanup_queue; - } - err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v); + err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX, + offsetof(struct virtio_blk_config, seg_max), + &v); if (!err) blk_queue_max_hw_segments(vblk->disk->queue, v); - else if (err != -ENOENT) { - dev_err(&vdev->dev, "Bad SEG_MAX in config\n"); - goto out_cleanup_queue; - } add_disk(vblk->disk); return 0; -out_cleanup_queue: - blk_cleanup_queue(vblk->disk->queue); out_put_disk: put_disk(vblk->disk); -out_unregister_blkdev: - unregister_blkdev(major, "virtblk"); out_mempool: mempool_destroy(vblk->pool); out_free_vq: @@ -274,12 +290,16 @@ static void virtblk_remove(struct virtio_device *vdev) struct virtio_blk *vblk = vdev->priv; int major = vblk->disk->major; + /* Nothing should be pending. */ BUG_ON(!list_empty(&vblk->reqs)); + + /* Stop all the virtqueues. */ + vdev->config->reset(vdev); + blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); unregister_blkdev(major, "virtblk"); mempool_destroy(vblk->pool); - /* There should be nothing in the queue now, so no need to shutdown */ vdev->config->del_vq(vblk->vq); kfree(vblk); } @@ -299,11 +319,15 @@ static struct virtio_driver virtio_blk = { static int __init init(void) { + major = register_blkdev(0, "virtblk"); + if (major < 0) + return major; return register_virtio_driver(&virtio_blk); } static void __exit fini(void) { + unregister_blkdev(major, "virtblk"); unregister_virtio_driver(&virtio_blk); } module_init(init); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index e34da5c9719..dc17fe3a88b 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -158,13 +158,13 @@ static int __devinit virtcons_probe(struct virtio_device *dev) /* Find the input queue. */ /* FIXME: This is why we want to wean off hvc: we do nothing * when input comes in. */ - in_vq = vdev->config->find_vq(vdev, NULL); + in_vq = vdev->config->find_vq(vdev, 0, NULL); if (IS_ERR(in_vq)) { err = PTR_ERR(in_vq); goto free; } - out_vq = vdev->config->find_vq(vdev, NULL); + out_vq = vdev->config->find_vq(vdev, 1, NULL); if (IS_ERR(out_vq)) { err = PTR_ERR(out_vq); goto free_in_vq; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e2eec38c83c..84f85e23cca 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -52,57 +52,82 @@ struct lguest_device { /*D:130 * Device configurations * - * The configuration information for a device consists of a series of fields. - * We don't really care what they are: the Launcher set them up, and the driver - * will look at them during setup. + * The configuration information for a device consists of one or more + * virtqueues, a feature bitmaks, and some configuration bytes. The + * configuration bytes don't really matter to us: the Launcher sets them up, and + * the driver will look at them during setup. * - * For us these fields come immediately after that device's descriptor in the - * lguest_devices page. - * - * Each field starts with a "type" byte, a "length" byte, then that number of - * bytes of configuration information. The device descriptor tells us the - * total configuration length so we know when we've reached the last field. */ + * A convenient routine to return the device's virtqueue config array: + * immediately after the descriptor. */ +static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) +{ + return (void *)(desc + 1); +} -/* type + length bytes */ -#define FHDR_LEN 2 +/* The features come immediately after the virtqueues. */ +static u8 *lg_features(const struct lguest_device_desc *desc) +{ + return (void *)(lg_vq(desc) + desc->num_vq); +} -/* This finds the first field of a given type for a device's configuration. */ -static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len) +/* The config space comes after the two feature bitmasks. */ +static u8 *lg_config(const struct lguest_device_desc *desc) { - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - int i; - - for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) { - if (desc->config[i] == type) { - /* Mark it used, so Host can know we looked at it, and - * also so we won't find the same one twice. */ - desc->config[i] |= 0x80; - /* Remember, the second byte is the length. */ - *len = desc->config[i+1]; - /* We return a pointer to the field header. */ - return desc->config + i; - } - } + return lg_features(desc) + desc->feature_len * 2; +} - /* Not found: return NULL for failure. */ - return NULL; +/* The total size of the config page used by this device (incl. desc) */ +static unsigned desc_size(const struct lguest_device_desc *desc) +{ + return sizeof(*desc) + + desc->num_vq * sizeof(struct lguest_vqconfig) + + desc->feature_len * 2 + + desc->config_len; +} + +/* This tests (and acknowleges) a feature bit. */ +static bool lg_feature(struct virtio_device *vdev, unsigned fbit) +{ + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; + u8 *features; + + /* Obviously if they ask for a feature off the end of our feature + * bitmap, it's not set. */ + if (fbit / 8 > desc->feature_len) + return false; + + /* The feature bitmap comes after the virtqueues. */ + features = lg_features(desc); + if (!(features[fbit / 8] & (1 << (fbit % 8)))) + return false; + + /* We set the matching bit in the other half of the bitmap to tell the + * Host we want to use this feature. We don't use this yet, but we + * could in future. */ + features[desc->feature_len + fbit / 8] |= (1 << (fbit % 8)); + return true; } /* Once they've found a field, getting a copy of it is easy. */ -static void lg_get(struct virtio_device *vdev, void *token, +static void lg_get(struct virtio_device *vdev, unsigned int offset, void *buf, unsigned len) { - /* Check they didn't ask for more than the length of the field! */ - BUG_ON(len > ((u8 *)token)[1]); - memcpy(buf, token + FHDR_LEN, len); + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; + + /* Check they didn't ask for more than the length of the config! */ + BUG_ON(offset + len > desc->config_len); + memcpy(buf, lg_config(desc) + offset, len); } /* Setting the contents is also trivial. */ -static void lg_set(struct virtio_device *vdev, void *token, +static void lg_set(struct virtio_device *vdev, unsigned int offset, const void *buf, unsigned len) { - BUG_ON(len > ((u8 *)token)[1]); - memcpy(token + FHDR_LEN, buf, len); + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; + + /* Check they didn't ask for more than the length of the config! */ + BUG_ON(offset + len > desc->config_len); + memcpy(lg_config(desc) + offset, buf, len); } /* The operations to get and set the status word just access the status field @@ -114,9 +139,20 @@ static u8 lg_get_status(struct virtio_device *vdev) static void lg_set_status(struct virtio_device *vdev, u8 status) { + BUG_ON(!status); to_lgdev(vdev)->desc->status = status; } +/* To reset the device, we (ab)use the NOTIFY hypercall, with the descriptor + * address of the device. The Host will zero the status and all the + * features. */ +static void lg_reset(struct virtio_device *vdev) +{ + unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; + + hcall(LHCALL_NOTIFY, (max_pfn<<PAGE_SHIFT) + offset, 0, 0); +} + /* * Virtqueues * @@ -165,39 +201,29 @@ static void lg_notify(struct virtqueue *vq) * * So we provide devices with a "find virtqueue and set it up" function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, - bool (*callback)(struct virtqueue *vq)) + unsigned index, + void (*callback)(struct virtqueue *vq)) { + struct lguest_device *ldev = to_lgdev(vdev); struct lguest_vq_info *lvq; struct virtqueue *vq; - unsigned int len; - void *token; int err; - /* Look for a field of the correct type to mark a virtqueue. Note that - * if this succeeds, then the type will be changed so it won't be found - * again, and future lg_find_vq() calls will find the next - * virtqueue (if any). */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len); - if (!token) + /* We must have this many virtqueues. */ + if (index >= ldev->desc->num_vq) return ERR_PTR(-ENOENT); lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); if (!lvq) return ERR_PTR(-ENOMEM); - /* Note: we could use a configuration space inside here, just like we - * do for the device. This would allow expansion in future, because - * our configuration system is designed to be expansible. But this is - * way easier. */ - if (len != sizeof(lvq->config)) { - dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len); - err = -EIO; - goto free_lvq; - } - /* Make a copy of the "struct lguest_vqconfig" field. We need a copy - * because the config space might not be aligned correctly. */ - vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config)); + /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + * the descriptor. We need a copy because the config space might not + * be aligned correctly. */ + memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); + printk("Mapping virtqueue %i addr %lx\n", index, + (unsigned long)lvq->config.pfn << PAGE_SHIFT); /* Figure out how many pages the ring will take, and map that memory */ lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, DIV_ROUND_UP(vring_size(lvq->config.num, @@ -259,11 +285,12 @@ static void lg_del_vq(struct virtqueue *vq) /* The ops structure which hooks everything together. */ static struct virtio_config_ops lguest_config_ops = { - .find = lg_find, + .feature = lg_feature, .get = lg_get, .set = lg_set, .get_status = lg_get_status, .set_status = lg_set_status, + .reset = lg_reset, .find_vq = lg_find_vq, .del_vq = lg_del_vq, }; @@ -329,13 +356,14 @@ static void scan_devices(void) struct lguest_device_desc *d; /* We start at the page beginning, and skip over each entry. */ - for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) { + for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { d = lguest_devices + i; /* Once we hit a zero, stop. */ if (d->type == 0) break; + printk("Device at %i has size %u\n", i, desc_size(d)); add_lguest_device(d); } } diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 55d224c8a0b..f234ba3f040 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -3114,6 +3114,7 @@ config VIRTIO_NET tristate "Virtio network driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO ---help--- - This is the virtual network driver for lguest. Say Y or M. + This is the virtual network driver for virtio. It can be used with + lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. endif # NETDEVICES diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 5413dbf3d4a..e66de0c12fc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -24,6 +24,13 @@ #include <linux/virtio_net.h> #include <linux/scatterlist.h> +static int napi_weight = 128; +module_param(napi_weight, int, 0444); + +static int csum = 1, gso = 1; +module_param(csum, bool, 0444); +module_param(gso, bool, 0444); + /* FIXME: MTU in config. */ #define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) @@ -52,13 +59,14 @@ static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); } -static bool skb_xmit_done(struct virtqueue *rvq) +static void skb_xmit_done(struct virtqueue *svq) { - struct virtnet_info *vi = rvq->vdev->priv; + struct virtnet_info *vi = svq->vdev->priv; - /* In case we were waiting for output buffers. */ + /* Suppress further interrupts. */ + svq->vq_ops->disable_cb(svq); + /* We were waiting for more output buffers. */ netif_wake_queue(vi->dev); - return true; } static void receive_skb(struct net_device *dev, struct sk_buff *skb, @@ -83,28 +91,16 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { pr_debug("Needs csum!\n"); - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = hdr->csum_start; - skb->csum_offset = hdr->csum_offset; - if (skb->csum_start > skb->len - 2 - || skb->csum_offset > skb->len - 2) { - if (net_ratelimit()) - printk(KERN_WARNING "%s: csum=%u/%u len=%u\n", - dev->name, skb->csum_start, - skb->csum_offset, skb->len); + if (!skb_partial_csum_set(skb,hdr->csum_start,hdr->csum_offset)) goto frame_err; - } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { pr_debug("GSO!\n"); - switch (hdr->gso_type) { + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; break; - case VIRTIO_NET_HDR_GSO_TCPV4_ECN: - skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN; - break; case VIRTIO_NET_HDR_GSO_UDP: skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; @@ -118,6 +114,9 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb, goto frame_err; } + if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + skb_shinfo(skb)->gso_size = hdr->gso_size; if (skb_shinfo(skb)->gso_size == 0) { if (net_ratelimit()) @@ -170,12 +169,14 @@ static void try_fill_recv(struct virtnet_info *vi) vi->rvq->vq_ops->kick(vi->rvq); } -static bool skb_recv_done(struct virtqueue *rvq) +static void skb_recv_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; - netif_rx_schedule(vi->dev, &vi->napi); - /* Suppress further interrupts. */ - return false; + /* Schedule NAPI, Suppress further interrupts if successful. */ + if (netif_rx_schedule_prep(vi->dev, &vi->napi)) { + rvq->vq_ops->disable_cb(rvq); + __netif_rx_schedule(vi->dev, &vi->napi); + } } static int virtnet_poll(struct napi_struct *napi, int budget) @@ -201,7 +202,7 @@ again: /* Out of packets? */ if (received < budget) { netif_rx_complete(vi->dev, napi); - if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq)) + if (unlikely(!vi->rvq->vq_ops->enable_cb(vi->rvq)) && netif_rx_reschedule(vi->dev, napi)) goto again; } @@ -236,8 +237,6 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest)); - free_old_xmit_skbs(vi); - /* Encode metadata header at front. */ hdr = skb_vnet_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { @@ -250,10 +249,9 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) } if (skb_is_gso(skb)) { + hdr->hdr_len = skb_transport_header(skb) - skb->data; hdr->gso_size = skb_shinfo(skb)->gso_size; - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) - hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN; - else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; @@ -261,19 +259,34 @@ static int start_xmit(struct sk_buff *skb, struct net_device *dev) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; else BUG(); + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) + hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else { hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; - hdr->gso_size = 0; + hdr->gso_size = hdr->hdr_len = 0; } vnet_hdr_to_sg(sg, skb); num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; __skb_queue_head(&vi->send, skb); + +again: + /* Free up any pending old buffers before queueing new ones. */ + free_old_xmit_skbs(vi); err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); if (err) { pr_debug("%s: virtio not prepared to send\n", dev->name); - skb_unlink(skb, &vi->send); netif_stop_queue(dev); + + /* Activate callback for using skbs: if this fails it + * means some were used in the meantime. */ + if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) { + printk("Unlikely: restart svq failed\n"); + netif_start_queue(dev); + goto again; + } + __skb_unlink(skb, &vi->send); + return NETDEV_TX_BUSY; } vi->svq->vq_ops->kick(vi->svq); @@ -285,45 +298,31 @@ static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - try_fill_recv(vi); + napi_enable(&vi->napi); - /* If we didn't even get one input buffer, we're useless. */ - if (vi->num == 0) - return -ENOMEM; + /* If all buffers were filled by other side before we napi_enabled, we + * won't get another interrupt, so process any outstanding packets + * now. virtnet_poll wants re-enable the queue, so we disable here. */ + vi->rvq->vq_ops->disable_cb(vi->rvq); + netif_rx_schedule(vi->dev, &vi->napi); - napi_enable(&vi->napi); return 0; } static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); - struct sk_buff *skb; napi_disable(&vi->napi); - /* networking core has neutered skb_xmit_done/skb_recv_done, so don't - * worry about races vs. get(). */ - vi->rvq->vq_ops->shutdown(vi->rvq); - while ((skb = __skb_dequeue(&vi->recv)) != NULL) { - kfree_skb(skb); - vi->num--; - } - vi->svq->vq_ops->shutdown(vi->svq); - while ((skb = __skb_dequeue(&vi->send)) != NULL) - kfree_skb(skb); - - BUG_ON(vi->num != 0); return 0; } static int virtnet_probe(struct virtio_device *vdev) { int err; - unsigned int len; struct net_device *dev; struct virtnet_info *vi; - void *token; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev(sizeof(struct virtnet_info)); @@ -331,7 +330,6 @@ static int virtnet_probe(struct virtio_device *vdev) return -ENOMEM; /* Set up network device as normal. */ - ether_setup(dev); dev->open = virtnet_open; dev->stop = virtnet_close; dev->hard_start_xmit = start_xmit; @@ -339,42 +337,37 @@ static int virtnet_probe(struct virtio_device *vdev) SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len); - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) { + if (csum && vdev->config->feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4)) - dev->features |= NETIF_F_TSO; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO)) - dev->features |= NETIF_F_UFO; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN)) - dev->features |= NETIF_F_TSO_ECN; - if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6)) - dev->features |= NETIF_F_TSO6; + if (gso && vdev->config->feature(vdev, VIRTIO_NET_F_GSO)) { + dev->features |= NETIF_F_TSO | NETIF_F_UFO + | NETIF_F_TSO_ECN | NETIF_F_TSO6; + } } /* Configuration may specify what MAC to use. Otherwise random. */ - token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len); - if (token) { - dev->addr_len = len; - vdev->config->get(vdev, token, dev->dev_addr, len); + if (vdev->config->feature(vdev, VIRTIO_NET_F_MAC)) { + vdev->config->get(vdev, + offsetof(struct virtio_net_config, mac), + dev->dev_addr, dev->addr_len); } else random_ether_addr(dev->dev_addr); /* Set up our device-specific information */ vi = netdev_priv(dev); - netif_napi_add(dev, &vi->napi, virtnet_poll, 16); + netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight); vi->dev = dev; vi->vdev = vdev; /* We expect two virtqueues, receive then send. */ - vi->rvq = vdev->config->find_vq(vdev, skb_recv_done); + vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done); if (IS_ERR(vi->rvq)) { err = PTR_ERR(vi->rvq); goto free; } - vi->svq = vdev->config->find_vq(vdev, skb_xmit_done); + vi->svq = vdev->config->find_vq(vdev, 1, skb_xmit_done); if (IS_ERR(vi->svq)) { err = PTR_ERR(vi->svq); goto free_recv; @@ -389,10 +382,22 @@ static int virtnet_probe(struct virtio_device *vdev) pr_debug("virtio_net: registering device failed\n"); goto free_send; } + + /* Last of all, set up some receive buffers. */ + try_fill_recv(vi); + + /* If we didn't even get one input buffer, we're useless. */ + if (vi->num == 0) { + err = -ENOMEM; + goto unregister; + } + pr_debug("virtnet: registered device %s\n", dev->name); vdev->priv = vi; return 0; +unregister: + unregister_netdev(dev); free_send: vdev->config->del_vq(vi->svq); free_recv: @@ -405,6 +410,20 @@ free: static void virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; + struct sk_buff *skb; + + /* Stop all the virtqueues. */ + vdev->config->reset(vdev); + + /* Free our skbs in send and recv queues, if any. */ + while ((skb = __skb_dequeue(&vi->recv)) != NULL) { + kfree_skb(skb); + vi->num--; + } + while ((skb = __skb_dequeue(&vi->send)) != NULL) + kfree_skb(skb); + + BUG_ON(vi->num != 0); vdev->config->del_vq(vi->svq); vdev->config->del_vq(vi->rvq); diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 9e33fc4da87..3dd6294d10b 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -1,8 +1,35 @@ # Virtio always gets selected by whoever wants it. config VIRTIO - bool + tristate # Similarly the virtio ring implementation. config VIRTIO_RING - bool + tristate depends on VIRTIO + +config VIRTIO_PCI + tristate "PCI driver for virtio devices (EXPERIMENTAL)" + depends on PCI && EXPERIMENTAL + select VIRTIO + select VIRTIO_RING + ---help--- + This drivers provides support for virtio based paravirtual device + drivers over PCI. This requires that your VMM has appropriate PCI + virtio backends. Most QEMU based VMMs should support these devices + (like KVM or Xen). + + Currently, the ABI is not considered stable so there is no guarantee + that this version of the driver will work with your VMM. + + If unsure, say M. + +config VIRTIO_BALLOON + tristate "Virtio balloon driver (EXPERIMENTAL)" + select VIRTIO + select VIRTIO_RING + ---help--- + This driver supports increasing and decreasing the amount + of memory within a KVM guest. + + If unsure, say M. + diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index f70e40971dd..6738c446c19 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,2 +1,4 @@ obj-$(CONFIG_VIRTIO) += virtio.o obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o +obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o +obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 69d7ea02cd4..b535483bc55 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -102,9 +102,13 @@ static int virtio_dev_remove(struct device *_d) struct virtio_driver *drv = container_of(dev->dev.driver, struct virtio_driver, driver); - dev->config->set_status(dev, dev->config->get_status(dev) - & ~VIRTIO_CONFIG_S_DRIVER); drv->remove(dev); + + /* Driver should have reset device. */ + BUG_ON(dev->config->get_status(dev)); + + /* Acknowledge the device's existence again. */ + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); return 0; } @@ -130,6 +134,10 @@ int register_virtio_device(struct virtio_device *dev) dev->dev.bus = &virtio_bus; sprintf(dev->dev.bus_id, "%u", dev->index); + /* We always start by resetting the device, in case a previous + * driver messed it up. This also tests that code path a little. */ + dev->config->reset(dev); + /* Acknowledge that we've seen the device. */ add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); @@ -148,55 +156,18 @@ void unregister_virtio_device(struct virtio_device *dev) } EXPORT_SYMBOL_GPL(unregister_virtio_device); -int __virtio_config_val(struct virtio_device *vdev, - u8 type, void *val, size_t size) -{ - void *token; - unsigned int len; - - token = vdev->config->find(vdev, type, &len); - if (!token) - return -ENOENT; - - if (len != size) - return -EIO; - - vdev->config->get(vdev, token, val, size); - return 0; -} -EXPORT_SYMBOL_GPL(__virtio_config_val); - -int virtio_use_bit(struct virtio_device *vdev, - void *token, unsigned int len, unsigned int bitnum) -{ - unsigned long bits[16]; - - /* This makes it convenient to pass-through find() results. */ - if (!token) - return 0; - - /* bit not in range of this bitfield? */ - if (bitnum * 8 >= len / 2) - return 0; - - /* Giant feature bitfields are silly. */ - BUG_ON(len > sizeof(bits)); - vdev->config->get(vdev, token, bits, len); - - if (!test_bit(bitnum, bits)) - return 0; - - /* Set acknowledge bit, and write it back. */ - set_bit(bitnum + len * 8 / 2, bits); - vdev->config->set(vdev, token, bits, len); - return 1; -} -EXPORT_SYMBOL_GPL(virtio_use_bit); - static int virtio_init(void) { if (bus_register(&virtio_bus) != 0) panic("virtio bus registration failed"); return 0; } + +static void __exit virtio_exit(void) +{ + bus_unregister(&virtio_bus); +} core_initcall(virtio_init); +module_exit(virtio_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c new file mode 100644 index 00000000000..622aece1acc --- /dev/null +++ b/drivers/virtio/virtio_balloon.c @@ -0,0 +1,284 @@ +/* Virtio balloon implementation, inspired by Dor Loar and Marcelo + * Tosatti's implementations. + * + * Copyright 2008 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +//#define DEBUG +#include <linux/virtio.h> +#include <linux/virtio_balloon.h> +#include <linux/swap.h> +#include <linux/kthread.h> +#include <linux/freezer.h> + +struct virtio_balloon +{ + struct virtio_device *vdev; + struct virtqueue *inflate_vq, *deflate_vq; + + /* Where the ballooning thread waits for config to change. */ + wait_queue_head_t config_change; + + /* The thread servicing the balloon. */ + struct task_struct *thread; + + /* Waiting for host to ack the pages we released. */ + struct completion acked; + + /* Do we have to tell Host *before* we reuse pages? */ + bool tell_host_first; + + /* The pages we've told the Host we're not using. */ + unsigned int num_pages; + struct list_head pages; + + /* The array of pfns we tell the Host about. */ + unsigned int num_pfns; + u32 pfns[256]; +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static void balloon_ack(struct virtqueue *vq) +{ + struct virtio_balloon *vb; + unsigned int len; + + vb = vq->vq_ops->get_buf(vq, &len); + if (vb) + complete(&vb->acked); +} + +static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) +{ + struct scatterlist sg; + + sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + + init_completion(&vb->acked); + + /* We should always be able to add one buffer to an empty queue. */ + if (vq->vq_ops->add_buf(vq, &sg, 1, 0, vb) != 0) + BUG(); + vq->vq_ops->kick(vq); + + /* When host has read buffer, this completes via balloon_ack */ + wait_for_completion(&vb->acked); +} + +static void fill_balloon(struct virtio_balloon *vb, size_t num) +{ + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) { + struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY); + if (!page) { + if (printk_ratelimit()) + dev_printk(KERN_INFO, &vb->vdev->dev, + "Out of puff! Can't get %zu pages\n", + num); + /* Sleep for at least 1/5 of a second before retry. */ + msleep(200); + break; + } + vb->pfns[vb->num_pfns] = page_to_pfn(page); + totalram_pages--; + vb->num_pages++; + list_add(&page->lru, &vb->pages); + } + + /* Didn't get any? Oh well. */ + if (vb->num_pfns == 0) + return; + + tell_host(vb, vb->inflate_vq); +} + +static void release_pages_by_pfn(const u32 pfns[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) { + __free_page(pfn_to_page(pfns[i])); + totalram_pages++; + } +} + +static void leak_balloon(struct virtio_balloon *vb, size_t num) +{ + struct page *page; + + /* We can only do one array worth at a time. */ + num = min(num, ARRAY_SIZE(vb->pfns)); + + for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) { + page = list_first_entry(&vb->pages, struct page, lru); + list_del(&page->lru); + vb->pfns[vb->num_pfns] = page_to_pfn(page); + vb->num_pages--; + } + + if (vb->tell_host_first) { + tell_host(vb, vb->deflate_vq); + release_pages_by_pfn(vb->pfns, vb->num_pfns); + } else { + release_pages_by_pfn(vb->pfns, vb->num_pfns); + tell_host(vb, vb->deflate_vq); + } +} + +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + wake_up(&vb->config_change); +} + +static inline int towards_target(struct virtio_balloon *vb) +{ + u32 v; + __virtio_config_val(vb->vdev, + offsetof(struct virtio_balloon_config, num_pages), + &v); + return v - vb->num_pages; +} + +static void update_balloon_size(struct virtio_balloon *vb) +{ + __le32 actual = cpu_to_le32(vb->num_pages); + + vb->vdev->config->set(vb->vdev, + offsetof(struct virtio_balloon_config, actual), + &actual, sizeof(actual)); +} + +static int balloon(void *_vballoon) +{ + struct virtio_balloon *vb = _vballoon; + + set_freezable(); + while (!kthread_should_stop()) { + int diff; + + try_to_freeze(); + wait_event_interruptible(vb->config_change, + (diff = towards_target(vb)) != 0 + || kthread_should_stop()); + if (diff > 0) + fill_balloon(vb, diff); + else if (diff < 0) + leak_balloon(vb, -diff); + update_balloon_size(vb); + } + return 0; +} + +static int virtballoon_probe(struct virtio_device *vdev) +{ + struct virtio_balloon *vb; + int err; + + vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); + if (!vb) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&vb->pages); + vb->num_pages = 0; + init_waitqueue_head(&vb->config_change); + vb->vdev = vdev; + + /* We expect two virtqueues. */ + vb->inflate_vq = vdev->config->find_vq(vdev, 0, balloon_ack); + if (IS_ERR(vb->inflate_vq)) { + err = PTR_ERR(vb->inflate_vq); + goto out_free_vb; + } + + vb->deflate_vq = vdev->config->find_vq(vdev, 1, balloon_ack); + if (IS_ERR(vb->deflate_vq)) { + err = PTR_ERR(vb->deflate_vq); + goto out_del_inflate_vq; + } + + vb->thread = kthread_run(balloon, vb, "vballoon"); + if (IS_ERR(vb->thread)) { + err = PTR_ERR(vb->thread); + goto out_del_deflate_vq; + } + + vb->tell_host_first + = vdev->config->feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); + + return 0; + +out_del_deflate_vq: + vdev->config->del_vq(vb->deflate_vq); +out_del_inflate_vq: + vdev->config->del_vq(vb->inflate_vq); +out_free_vb: + kfree(vb); +out: + return err; +} + +static void virtballoon_remove(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + + kthread_stop(vb->thread); + + /* There might be pages left in the balloon: free them. */ + while (vb->num_pages) + leak_balloon(vb, vb->num_pages); + + /* Now we reset the device so we can clean up the queues. */ + vdev->config->reset(vdev); + + vdev->config->del_vq(vb->deflate_vq); + vdev->config->del_vq(vb->inflate_vq); + kfree(vb); +} + +static struct virtio_driver virtio_balloon = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtballoon_probe, + .remove = __devexit_p(virtballoon_remove), + .config_changed = virtballoon_changed, +}; + +static int __init init(void) +{ + return register_virtio_driver(&virtio_balloon); +} + +static void __exit fini(void) +{ + unregister_virtio_driver(&virtio_balloon); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio balloon driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c new file mode 100644 index 00000000000..26f787ddd5f --- /dev/null +++ b/drivers/virtio/virtio_pci.c @@ -0,0 +1,446 @@ +/* + * Virtio PCI driver + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> +#include <linux/virtio_pci.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> + +MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); +MODULE_DESCRIPTION("virtio-pci"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); + +/* Our device structure */ +struct virtio_pci_device +{ + struct virtio_device vdev; + struct pci_dev *pci_dev; + + /* the IO mapping for the PCI config space */ + void *ioaddr; + + /* a list of queues so we can dispatch IRQs */ + spinlock_t lock; + struct list_head virtqueues; +}; + +struct virtio_pci_vq_info +{ + /* the actual virtqueue */ + struct virtqueue *vq; + + /* the number of entries in the queue */ + int num; + + /* the index of the queue */ + int queue_index; + + /* the virtual address of the ring queue */ + void *queue; + + /* the list node for the virtqueues list */ + struct list_head node; +}; + +/* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ +static struct pci_device_id virtio_pci_id_table[] = { + { 0x1af4, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); + +/* A PCI device has it's own struct device and so does a virtio device so + * we create a place for the virtio devices to show up in sysfs. I think it + * would make more sense for virtio to not insist on having it's own device. */ +static struct device virtio_pci_root = { + .parent = NULL, + .bus_id = "virtio-pci", +}; + +/* Unique numbering for devices under the kvm root */ +static unsigned int dev_index; + +/* Convert a generic virtio device to our structure */ +static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev) +{ + return container_of(vdev, struct virtio_pci_device, vdev); +} + +/* virtio config->feature() implementation */ +static bool vp_feature(struct virtio_device *vdev, unsigned bit) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u32 mask; + + /* Since this function is supposed to have the side effect of + * enabling a queried feature, we simulate that by doing a read + * from the host feature bitmask and then writing to the guest + * feature bitmask */ + mask = ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES); + if (mask & (1 << bit)) { + mask |= (1 << bit); + iowrite32(mask, vp_dev->ioaddr + VIRTIO_PCI_GUEST_FEATURES); + } + + return !!(mask & (1 << bit)); +} + +/* virtio config->get() implementation */ +static void vp_get(struct virtio_device *vdev, unsigned offset, + void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; + u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + ptr[i] = ioread8(ioaddr + i); +} + +/* the config->set() implementation. it's symmetric to the config->get() + * implementation */ +static void vp_set(struct virtio_device *vdev, unsigned offset, + const void *buf, unsigned len) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void *ioaddr = vp_dev->ioaddr + VIRTIO_PCI_CONFIG + offset; + const u8 *ptr = buf; + int i; + + for (i = 0; i < len; i++) + iowrite8(ptr[i], ioaddr + i); +} + +/* config->{get,set}_status() implementations */ +static u8 vp_get_status(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_set_status(struct virtio_device *vdev, u8 status) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* We should never be setting status to 0. */ + BUG_ON(status == 0); + return iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +static void vp_reset(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + /* 0 status means a reset. */ + return iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS); +} + +/* the notify function used when creating a virt queue */ +static void vp_notify(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vq->priv; + + /* we write the queue's selector into the notification register to + * signal the other end */ + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); +} + +/* A small wrapper to also acknowledge the interrupt when it's handled. + * I really need an EIO hook for the vring so I can ack the interrupt once we + * know that we'll be handling the IRQ but before we invoke the callback since + * the callback may notify the host which results in the host attempting to + * raise an interrupt that we would then mask once we acknowledged the + * interrupt. */ +static irqreturn_t vp_interrupt(int irq, void *opaque) +{ + struct virtio_pci_device *vp_dev = opaque; + struct virtio_pci_vq_info *info; + irqreturn_t ret = IRQ_NONE; + u8 isr; + + /* reading the ISR has the effect of also clearing it so it's very + * important to save off the value. */ + isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); + + /* It's definitely not us if the ISR was not high */ + if (!isr) + return IRQ_NONE; + + /* Configuration change? Tell driver if it wants to know. */ + if (isr & VIRTIO_PCI_ISR_CONFIG) { + struct virtio_driver *drv; + drv = container_of(vp_dev->vdev.dev.driver, + struct virtio_driver, driver); + + if (drv->config_changed) + drv->config_changed(&vp_dev->vdev); + } + + spin_lock(&vp_dev->lock); + list_for_each_entry(info, &vp_dev->virtqueues, node) { + if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) + ret = IRQ_HANDLED; + } + spin_unlock(&vp_dev->lock); + + return ret; +} + +/* the config->find_vq() implementation */ +static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, + void (*callback)(struct virtqueue *vq)) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info; + struct virtqueue *vq; + u16 num; + int err; + + /* Select the queue we're interested in */ + iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + + /* Check if queue is either not available or already active. */ + num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM); + if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN)) + return ERR_PTR(-ENOENT); + + /* allocate and fill out our structure the represents an active + * queue */ + info = kmalloc(sizeof(struct virtio_pci_vq_info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->queue_index = index; + info->num = num; + + info->queue = kzalloc(PAGE_ALIGN(vring_size(num,PAGE_SIZE)), GFP_KERNEL); + if (info->queue == NULL) { + err = -ENOMEM; + goto out_info; + } + + /* activate the queue */ + iowrite32(virt_to_phys(info->queue) >> PAGE_SHIFT, + vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + /* create the vring */ + vq = vring_new_virtqueue(info->num, vdev, info->queue, + vp_notify, callback); + if (!vq) { + err = -ENOMEM; + goto out_activate_queue; + } + + vq->priv = info; + info->vq = vq; + + spin_lock(&vp_dev->lock); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock(&vp_dev->lock); + + return vq; + +out_activate_queue: + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + kfree(info->queue); +out_info: + kfree(info); + return ERR_PTR(err); +} + +/* the config->del_vq() implementation */ +static void vp_del_vq(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_vq_info *info = vq->priv; + + spin_lock(&vp_dev->lock); + list_del(&info->node); + spin_unlock(&vp_dev->lock); + + vring_del_virtqueue(vq); + + /* Select and deactivate the queue */ + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); + + kfree(info->queue); + kfree(info); +} + +static struct virtio_config_ops virtio_pci_config_ops = { + .feature = vp_feature, + .get = vp_get, + .set = vp_set, + .get_status = vp_get_status, + .set_status = vp_set_status, + .reset = vp_reset, + .find_vq = vp_find_vq, + .del_vq = vp_del_vq, +}; + +/* the PCI probing function */ +static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, + const struct pci_device_id *id) +{ + struct virtio_pci_device *vp_dev; + int err; + + /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) + return -ENODEV; + + if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) { + printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n", + VIRTIO_PCI_ABI_VERSION, pci_dev->revision); + return -ENODEV; + } + + /* allocate our structure and fill it out */ + vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); + if (vp_dev == NULL) + return -ENOMEM; + + snprintf(vp_dev->vdev.dev.bus_id, BUS_ID_SIZE, "virtio%d", dev_index); + vp_dev->vdev.index = dev_index; + dev_index++; + + vp_dev->vdev.dev.parent = &virtio_pci_root; + vp_dev->vdev.config = &virtio_pci_config_ops; + vp_dev->pci_dev = pci_dev; + INIT_LIST_HEAD(&vp_dev->virtqueues); + spin_lock_init(&vp_dev->lock); + + /* enable the device */ + err = pci_enable_device(pci_dev); + if (err) + goto out; + + err = pci_request_regions(pci_dev, "virtio-pci"); + if (err) + goto out_enable_device; + + vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); + if (vp_dev->ioaddr == NULL) + goto out_req_regions; + + pci_set_drvdata(pci_dev, vp_dev); + + /* we use the subsystem vendor/device id as the virtio vendor/device + * id. this allows us to use the same PCI vendor/device id for all + * virtio devices and to identify the particular virtio driver by + * the subsytem ids */ + vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; + vp_dev->vdev.id.device = pci_dev->subsystem_device; + + /* register a handler for the queue with the PCI device's interrupt */ + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, + vp_dev->vdev.dev.bus_id, vp_dev); + if (err) + goto out_set_drvdata; + + /* finally register the virtio device */ + err = register_virtio_device(&vp_dev->vdev); + if (err) + goto out_req_irq; + + return 0; + +out_req_irq: + free_irq(pci_dev->irq, vp_dev); +out_set_drvdata: + pci_set_drvdata(pci_dev, NULL); + pci_iounmap(pci_dev, vp_dev->ioaddr); +out_req_regions: + pci_release_regions(pci_dev); +out_enable_device: + pci_disable_device(pci_dev); +out: + kfree(vp_dev); + return err; +} + +static void __devexit virtio_pci_remove(struct pci_dev *pci_dev) +{ + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + + free_irq(pci_dev->irq, vp_dev); + pci_set_drvdata(pci_dev, NULL); + pci_iounmap(pci_dev, vp_dev->ioaddr); + pci_release_regions(pci_dev); + pci_disable_device(pci_dev); + kfree(vp_dev); +} + +#ifdef CONFIG_PM +static int virtio_pci_suspend(struct pci_dev *pci_dev, pm_message_t state) +{ + pci_save_state(pci_dev); + pci_set_power_state(pci_dev, PCI_D3hot); + return 0; +} + +static int virtio_pci_resume(struct pci_dev *pci_dev) +{ + pci_restore_state(pci_dev); + pci_set_power_state(pci_dev, PCI_D0); + return 0; +} +#endif + +static struct pci_driver virtio_pci_driver = { + .name = "virtio-pci", + .id_table = virtio_pci_id_table, + .probe = virtio_pci_probe, + .remove = virtio_pci_remove, +#ifdef CONFIG_PM + .suspend = virtio_pci_suspend, + .resume = virtio_pci_resume, +#endif +}; + +static int __init virtio_pci_init(void) +{ + int err; + + err = device_register(&virtio_pci_root); + if (err) + return err; + + err = pci_register_driver(&virtio_pci_driver); + if (err) + device_unregister(&virtio_pci_root); + + return err; +} + +module_init(virtio_pci_init); + +static void __exit virtio_pci_exit(void) +{ + device_unregister(&virtio_pci_root); + pci_unregister_driver(&virtio_pci_driver); +} + +module_exit(virtio_pci_exit); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 1dc04b6684e..3a28c138213 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -87,6 +87,8 @@ static int vring_add_buf(struct virtqueue *_vq, if (vq->num_free < out + in) { pr_debug("Can't add buf len %i - avail = %i\n", out + in, vq->num_free); + /* We notify *even if* VRING_USED_F_NO_NOTIFY is set here. */ + vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } @@ -97,16 +99,14 @@ static int vring_add_buf(struct virtqueue *_vq, head = vq->free_head; for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { vq->vring.desc[i].flags = VRING_DESC_F_NEXT; - vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) - + sg->offset; + vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].len = sg->length; prev = i; sg++; } for (; in; i = vq->vring.desc[i].next, in--) { vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) - + sg->offset; + vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].len = sg->length; prev = i; sg++; @@ -171,16 +171,6 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) vq->num_free++; } -/* FIXME: We need to tell other side about removal, to synchronize. */ -static void vring_shutdown(struct virtqueue *_vq) -{ - struct vring_virtqueue *vq = to_vvq(_vq); - unsigned int i; - - for (i = 0; i < vq->vring.num; i++) - detach_buf(vq, i); -} - static inline bool more_used(const struct vring_virtqueue *vq) { return vq->last_used_idx != vq->vring.used->idx; @@ -220,7 +210,17 @@ static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) return ret; } -static bool vring_restart(struct virtqueue *_vq) +static void vring_disable_cb(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + START_USE(vq); + BUG_ON(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT); + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + END_USE(vq); +} + +static bool vring_enable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -253,26 +253,34 @@ irqreturn_t vring_interrupt(int irq, void *_vq) if (unlikely(vq->broken)) return IRQ_HANDLED; + /* Other side may have missed us turning off the interrupt, + * but we should preserve disable semantic for virtio users. */ + if (unlikely(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + pr_debug("virtqueue interrupt after disable for %p\n", vq); + return IRQ_HANDLED; + } + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); - if (vq->vq.callback && !vq->vq.callback(&vq->vq)) - vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + if (vq->vq.callback) + vq->vq.callback(&vq->vq); return IRQ_HANDLED; } +EXPORT_SYMBOL_GPL(vring_interrupt); static struct virtqueue_ops vring_vq_ops = { .add_buf = vring_add_buf, .get_buf = vring_get_buf, .kick = vring_kick, - .restart = vring_restart, - .shutdown = vring_shutdown, + .disable_cb = vring_disable_cb, + .enable_cb = vring_enable_cb, }; struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *), - bool (*callback)(struct virtqueue *)) + void (*callback)(struct virtqueue *)) { struct vring_virtqueue *vq; unsigned int i; @@ -311,9 +319,12 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, return &vq->vq; } +EXPORT_SYMBOL_GPL(vring_new_virtqueue); void vring_del_virtqueue(struct virtqueue *vq) { kfree(to_vvq(vq)); } +EXPORT_SYMBOL_GPL(vring_del_virtqueue); +MODULE_LICENSE("GPL"); diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 697104da91f..589be3e1f3a 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -23,7 +23,12 @@ struct lguest_device_desc { /* The device type: console, network, disk etc. Type 0 terminates. */ __u8 type; - /* The number of bytes of the config array. */ + /* The number of virtqueues (first in config array) */ + __u8 num_vq; + /* The number of bytes of feature bits. Multiply by 2: one for host + * features and one for guest acknowledgements. */ + __u8 feature_len; + /* The number of bytes of the config array after virtqueues. */ __u8 config_len; /* A status byte, written by the Guest. */ __u8 status; @@ -31,7 +36,7 @@ struct lguest_device_desc { }; /*D:135 This is how we expect the device configuration field for a virtqueue - * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */ + * to be laid out in config space. */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ __u16 num; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dfe975a9967..412672a79e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1810,5 +1810,6 @@ static inline void skb_forward_csum(struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; } +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 14e1379876d..260d1fcf29a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -11,15 +11,13 @@ /** * virtqueue - a queue to register buffers for sending or receiving. * @callback: the function to call when buffers are consumed (can be NULL). - * If this returns false, callbacks are suppressed until vq_ops->restart - * is called. * @vdev: the virtio device this queue was created for. * @vq_ops: the operations for this virtqueue (see below). * @priv: a pointer for the virtqueue implementation to use. */ struct virtqueue { - bool (*callback)(struct virtqueue *vq); + void (*callback)(struct virtqueue *vq); struct virtio_device *vdev; struct virtqueue_ops *vq_ops; void *priv; @@ -41,13 +39,12 @@ struct virtqueue * vq: the struct virtqueue we're talking about. * len: the length written into the buffer * Returns NULL or the "data" token handed to add_buf. - * @restart: restart callbacks after callback returned false. + * @disable_cb: disable callbacks + * vq: the struct virtqueue we're talking about. + * @enable_cb: restart callbacks after disable_cb. * vq: the struct virtqueue we're talking about. * This returns "false" (and doesn't re-enable) if there are pending * buffers in the queue, to avoid a race. - * @shutdown: "unadd" all buffers. - * vq: the struct virtqueue we're talking about. - * Remove everything from the queue. * * Locking rules are straightforward: the driver is responsible for * locking. No two operations may be invoked simultaneously. @@ -65,9 +62,8 @@ struct virtqueue_ops { void *(*get_buf)(struct virtqueue *vq, unsigned int *len); - bool (*restart)(struct virtqueue *vq); - - void (*shutdown)(struct virtqueue *vq); + void (*disable_cb)(struct virtqueue *vq); + bool (*enable_cb)(struct virtqueue *vq); }; /** @@ -97,12 +93,15 @@ void unregister_virtio_device(struct virtio_device *dev); * @probe: the function to call when a device is found. Returns a token for * remove, or PTR_ERR(). * @remove: the function when a device is removed. + * @config_changed: optional function to call when the device configuration + * changes; may be called in interrupt context. */ struct virtio_driver { struct device_driver driver; const struct virtio_device_id *id_table; int (*probe)(struct virtio_device *dev); void (*remove)(struct virtio_device *dev); + void (*config_changed)(struct virtio_device *dev); }; int register_virtio_driver(struct virtio_driver *drv); diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h new file mode 100644 index 00000000000..979524ee75b --- /dev/null +++ b/include/linux/virtio_balloon.h @@ -0,0 +1,18 @@ +#ifndef _LINUX_VIRTIO_BALLOON_H +#define _LINUX_VIRTIO_BALLOON_H +#include <linux/virtio_config.h> + +/* The ID for virtio_balloon */ +#define VIRTIO_ID_BALLOON 5 + +/* The feature bitmap for virtio balloon */ +#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ + +struct virtio_balloon_config +{ + /* Number of pages host wants Guest to give up. */ + __le32 num_pages; + /* Number of pages we've actually got in balloon. */ + __le32 actual; +}; +#endif /* _LINUX_VIRTIO_BALLOON_H */ diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 7bd2bce0cfd..bca0b10d794 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -6,15 +6,19 @@ #define VIRTIO_ID_BLOCK 2 /* Feature bits */ -#define VIRTIO_CONFIG_BLK_F 0x40 -#define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */ +#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ +#define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */ +#define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */ -/* The capacity (in 512-byte sectors). */ -#define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41 -/* The maximum segment size. */ -#define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42 -/* The maximum number of segments. */ -#define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43 +struct virtio_blk_config +{ + /* The capacity (in 512-byte sectors). */ + __le64 capacity; + /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ + __le32 size_max; + /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ + __le32 seg_max; +} __attribute__((packed)); /* These two define direction. */ #define VIRTIO_BLK_T_IN 0 @@ -35,8 +39,6 @@ struct virtio_blk_outhdr __u32 ioprio; /* Sector (ie. 512 byte offset) */ __u64 sector; - /* Where to put reply. */ - __u64 id; }; #define VIRTIO_BLK_S_OK 0 diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index bcc01888df7..d581b2914b3 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -5,7 +5,7 @@ * store and access that space differently. */ #include <linux/types.h> -/* Status byte for guest to report progress, and synchronize config. */ +/* Status byte for guest to report progress, and synchronize features. */ /* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */ #define VIRTIO_CONFIG_S_ACKNOWLEDGE 1 /* We have found a driver for the device. */ @@ -15,34 +15,27 @@ /* We've given up on this device. */ #define VIRTIO_CONFIG_S_FAILED 0x80 -/* Feature byte (actually 7 bits availabe): */ -/* Requirements/features of the virtio implementation. */ -#define VIRTIO_CONFIG_F_VIRTIO 1 -/* Requirements/features of the virtqueue (may have more than one). */ -#define VIRTIO_CONFIG_F_VIRTQUEUE 2 - #ifdef __KERNEL__ struct virtio_device; /** * virtio_config_ops - operations for configuring a virtio device - * @find: search for the next configuration field of the given type. + * @feature: search for a feature in this config * vdev: the virtio_device - * type: the feature type - * len: the (returned) length of the field if found. - * Returns a token if found, or NULL. Never returnes the same field twice - * (ie. it's used up). - * @get: read the value of a configuration field after find(). + * bit: the feature bit + * Returns true if the feature is supported. Acknowledges the feature + * so the host can see it. + * @get: read the value of a configuration field * vdev: the virtio_device - * token: the token returned from find(). + * offset: the offset of the configuration field * buf: the buffer to write the field value into. - * len: the length of the buffer (given by find()). + * len: the length of the buffer * Note that contents are conventionally little-endian. - * @set: write the value of a configuration field after find(). + * @set: write the value of a configuration field * vdev: the virtio_device - * token: the token returned from find(). + * offset: the offset of the configuration field * buf: the buffer to read the field value from. - * len: the length of the buffer (given by find()). + * len: the length of the buffer * Note that contents are conventionally little-endian. * @get_status: read the status byte * vdev: the virtio_device @@ -50,62 +43,67 @@ struct virtio_device; * @set_status: write the status byte * vdev: the virtio_device * status: the new status byte - * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue. + * @reset: reset the device + * vdev: the virtio device + * After this, status and feature negotiation must be done again + * @find_vq: find a virtqueue and instantiate it. * vdev: the virtio_device + * index: the 0-based virtqueue number in case there's more than one. * callback: the virqtueue callback - * Returns the new virtqueue or ERR_PTR(). + * Returns the new virtqueue or ERR_PTR() (eg. -ENOENT). * @del_vq: free a virtqueue found by find_vq(). */ struct virtio_config_ops { - void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len); - void (*get)(struct virtio_device *vdev, void *token, + bool (*feature)(struct virtio_device *vdev, unsigned bit); + void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); - void (*set)(struct virtio_device *vdev, void *token, + void (*set)(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len); u8 (*get_status)(struct virtio_device *vdev); void (*set_status)(struct virtio_device *vdev, u8 status); + void (*reset)(struct virtio_device *vdev); struct virtqueue *(*find_vq)(struct virtio_device *vdev, - bool (*callback)(struct virtqueue *)); + unsigned index, + void (*callback)(struct virtqueue *)); void (*del_vq)(struct virtqueue *vq); }; /** - * virtio_config_val - get a single virtio config and mark it used. - * @config: the virtio config space - * @type: the type to search for. + * virtio_config_val - look for a feature and get a single virtio config. + * @vdev: the virtio device + * @fbit: the feature bit + * @offset: the type to search for. * @val: a pointer to the value to fill in. * - * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't - * be found again. This version does endian conversion. */ -#define virtio_config_val(vdev, type, v) ({ \ - int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \ - \ - BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ - && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ - if (!_err) { \ - switch (sizeof(*(v))) { \ - case 2: le16_to_cpus((__u16 *) v); break; \ - case 4: le32_to_cpus((__u32 *) v); break; \ - case 8: le64_to_cpus((__u64 *) v); break; \ - } \ - } \ + * The return value is -ENOENT if the feature doesn't exist. Otherwise + * the value is endian-corrected and returned in v. */ +#define virtio_config_val(vdev, fbit, offset, v) ({ \ + int _err; \ + if ((vdev)->config->feature((vdev), (fbit))) { \ + __virtio_config_val((vdev), (offset), (v)); \ + _err = 0; \ + } else \ + _err = -ENOENT; \ _err; \ }) -int __virtio_config_val(struct virtio_device *dev, - u8 type, void *val, size_t size); - /** - * virtio_use_bit - helper to use a feature bit in a bitfield value. - * @dev: the virtio device - * @token: the token as returned from vdev->config->find(). - * @len: the length of the field. - * @bitnum: the bit to test. + * __virtio_config_val - get a single virtio config without feature check. + * @vdev: the virtio device + * @offset: the type to search for. + * @val: a pointer to the value to fill in. * - * If handed a NULL token, it returns false, otherwise returns bit status. - * If it's one, it sets the mirroring acknowledgement bit. */ -int virtio_use_bit(struct virtio_device *vdev, - void *token, unsigned int len, unsigned int bitnum); + * The value is endian-corrected and returned in v. */ +#define __virtio_config_val(vdev, offset, v) do { \ + BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ + && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ + (vdev)->config->get((vdev), (offset), (v), sizeof(*(v))); \ + switch (sizeof(*(v))) { \ + case 2: le16_to_cpus((__u16 *) v); break; \ + case 4: le32_to_cpus((__u32 *) v); break; \ + case 8: le64_to_cpus((__u64 *) v); break; \ + } \ +} while(0) #endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index ae469ae55d3..1ea3351df60 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -5,32 +5,32 @@ /* The ID for virtio_net */ #define VIRTIO_ID_NET 1 -/* The bitmap of config for virtio net */ -#define VIRTIO_CONFIG_NET_F 0x40 -#define VIRTIO_NET_F_NO_CSUM 0 -#define VIRTIO_NET_F_TSO4 1 -#define VIRTIO_NET_F_UFO 2 -#define VIRTIO_NET_F_TSO4_ECN 3 -#define VIRTIO_NET_F_TSO6 4 +/* The feature bitmap for virtio net */ +#define VIRTIO_NET_F_CSUM 0 /* Can handle pkts w/ partial csum */ +#define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */ +#define VIRTIO_NET_F_GSO 6 /* Can handle pkts w/ any GSO type */ -/* The config defining mac address. */ -#define VIRTIO_CONFIG_NET_MAC_F 0x41 +struct virtio_net_config +{ + /* The config defining mac address (if VIRTIO_NET_F_MAC) */ + __u8 mac[6]; +} __attribute__((packed)); /* This is the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset - __u8 flags; + __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame #define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) -/* FIXME: Do we need this? If they said they can handle ECN, do they care? */ -#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN #define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) #define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP - __u8 gso_type; - __u16 gso_size; - __u16 csum_start; - __u16 csum_offset; +#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set + __u8 gso_type; + __u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ + __u16 gso_size; /* Bytes to append to gso_hdr_len per frame */ + __u16 csum_start; /* Position to start checksumming from */ + __u16 csum_offset; /* Offset after that to place checksum */ }; #endif /* _LINUX_VIRTIO_NET_H */ diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h new file mode 100644 index 00000000000..b3151659cf4 --- /dev/null +++ b/include/linux/virtio_pci.h @@ -0,0 +1,57 @@ +/* + * Virtio PCI driver + * + * This module allows virtio devices to be used over a virtual PCI device. + * This can be used with QEMU based VMMs like KVM or Xen. + * + * Copyright IBM Corp. 2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef _LINUX_VIRTIO_PCI_H +#define _LINUX_VIRTIO_PCI_H + +#include <linux/virtio_config.h> + +/* A 32-bit r/o bitmask of the features supported by the host */ +#define VIRTIO_PCI_HOST_FEATURES 0 + +/* A 32-bit r/w bitmask of features activated by the guest */ +#define VIRTIO_PCI_GUEST_FEATURES 4 + +/* A 32-bit r/w PFN for the currently selected queue */ +#define VIRTIO_PCI_QUEUE_PFN 8 + +/* A 16-bit r/o queue size for the currently selected queue */ +#define VIRTIO_PCI_QUEUE_NUM 12 + +/* A 16-bit r/w queue selector */ +#define VIRTIO_PCI_QUEUE_SEL 14 + +/* A 16-bit r/w queue notifier */ +#define VIRTIO_PCI_QUEUE_NOTIFY 16 + +/* An 8-bit device status register. */ +#define VIRTIO_PCI_STATUS 18 + +/* An 8-bit r/o interrupt status register. Reading the value will return the + * current contents of the ISR and will also clear it. This is effectively + * a read-and-acknowledge. */ +#define VIRTIO_PCI_ISR 19 + +/* The bit of the ISR which indicates a device configuration change. */ +#define VIRTIO_PCI_ISR_CONFIG 0x2 + +/* The remaining space is defined by each driver as the per-driver + * configuration space */ +#define VIRTIO_PCI_CONFIG 20 + +/* Virtio ABI version, this must match exactly */ +#define VIRTIO_PCI_ABI_VERSION 0 +#endif diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 1a4ed49f647..abe481ed990 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -15,9 +15,13 @@ /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 -/* This means don't notify other side when buffer added. */ +/* The Host uses this in used->flags to advise the Guest: don't kick me when + * you add a buffer. It's unreliable, so it's simply an optimization. Guest + * will still kick if it's out of buffers. */ #define VRING_USED_F_NO_NOTIFY 1 -/* This means don't interrupt guest when buffer consumed. */ +/* The Guest uses this in avail->flags to advise the Host: don't interrupt me + * when you consume a buffer. It's unreliable, so it's simply an + * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ @@ -89,7 +93,7 @@ struct vring { * }; */ static inline void vring_init(struct vring *vr, unsigned int num, void *p, - unsigned int pagesize) + unsigned long pagesize) { vr->num = num; vr->desc = p; @@ -98,7 +102,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, & ~(pagesize - 1)); } -static inline unsigned vring_size(unsigned int num, unsigned int pagesize) +static inline unsigned vring_size(unsigned int num, unsigned long pagesize) { return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) + pagesize - 1) & ~(pagesize - 1)) @@ -114,7 +118,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *vq), - bool (*callback)(struct virtqueue *vq)); + void (*callback)(struct virtqueue *vq)); void vring_del_virtqueue(struct virtqueue *vq); irqreturn_t vring_interrupt(int irq, void *_vq); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 40b71a29fc3..42eea5fe262 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -199,14 +199,12 @@ static void p9_virtio_close(struct p9_trans *trans) kfree(trans); } -static bool p9_virtio_intr(struct virtqueue *q) +static void p9_virtio_intr(struct virtqueue *q) { struct virtio_chan *chan = q->vdev->priv; P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq); wake_up_interruptible(&chan->wq); - - return true; } static int p9_virtio_probe(struct virtio_device *dev) @@ -236,13 +234,13 @@ static int p9_virtio_probe(struct virtio_device *dev) /* Find the input queue. */ dev->priv = chan; - chan->in_vq = dev->config->find_vq(dev, p9_virtio_intr); + chan->in_vq = dev->config->find_vq(dev, 0, p9_virtio_intr); if (IS_ERR(chan->in_vq)) { err = PTR_ERR(chan->in_vq); goto free; } - chan->out_vq = dev->config->find_vq(dev, NULL); + chan->out_vq = dev->config->find_vq(dev, 1, NULL); if (IS_ERR(chan->out_vq)) { err = PTR_ERR(chan->out_vq); goto free_in_vq; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 98420f9c4b6..4e354221ec2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2461,6 +2461,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return elt; } +/** + * skb_partial_csum_set - set up and verify partial csum values for packet + * @skb: the skb to set + * @start: the number of bytes after skb->data to start checksumming. + * @off: the offset from start to place the checksum. + * + * For untrusted partially-checksummed packets, we need to make sure the values + * for skb->csum_start and skb->csum_offset are valid so we don't oops. + * + * This function checks and sets those values and skb->ip_summed: if this + * returns false you should drop the packet. + */ +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) +{ + if (unlikely(start > skb->len - 2) || + unlikely((int)start + off > skb->len - 2)) { + if (net_ratelimit()) + printk(KERN_WARNING + "bad partial csum: csum=%u/%u len=%u\n", + start, off, skb->len); + return false; + } + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + start; + skb->csum_offset = off; + return true; +} + EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(kfree_skb); @@ -2497,3 +2525,4 @@ EXPORT_SYMBOL(skb_append_datato_frags); EXPORT_SYMBOL_GPL(skb_to_sgvec); EXPORT_SYMBOL_GPL(skb_cow_data); +EXPORT_SYMBOL_GPL(skb_partial_csum_set); |