From eb5e89fc70bb3f115b3206ed0c57d3aba1fdd155 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Aug 2012 21:30:39 +0900 Subject: virtio/console: Add splice_write support Enable to use splice_write from pipe to virtio-console port. This steals pages from pipe and directly send it to host. Note that this may accelerate only the guest to host path. Changes in v2: - Use GFP_KERNEL instead of GFP_ATOMIC in syscall context function. Signed-off-by: Masami Hiramatsu Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 136 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index cdf2f5451c7..730816cdeb4 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -227,6 +229,7 @@ struct port { bool guest_connected; }; +#define MAX_SPLICE_PAGES 32 /* This is the very early arch-specified put chars function. */ static int (*early_put_chars)(u32, const char *, int); @@ -474,26 +477,52 @@ static ssize_t send_control_msg(struct port *port, unsigned int event, return 0; } +struct buffer_token { + union { + void *buf; + struct scatterlist *sg; + } u; + bool sgpages; +}; + +static void reclaim_sg_pages(struct scatterlist *sg) +{ + int i; + struct page *page; + + for (i = 0; i < MAX_SPLICE_PAGES; i++) { + page = sg_page(&sg[i]); + if (!page) + break; + put_page(page); + } + kfree(sg); +} + /* Callers must take the port->outvq_lock */ static void reclaim_consumed_buffers(struct port *port) { - void *buf; + struct buffer_token *tok; unsigned int len; if (!port->portdev) { /* Device has been unplugged. vqs are already gone. */ return; } - while ((buf = virtqueue_get_buf(port->out_vq, &len))) { - kfree(buf); + while ((tok = virtqueue_get_buf(port->out_vq, &len))) { + if (tok->sgpages) + reclaim_sg_pages(tok->u.sg); + else + kfree(tok->u.buf); + kfree(tok); port->outvq_full = false; } } -static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, - bool nonblock) +static ssize_t __send_to_port(struct port *port, struct scatterlist *sg, + int nents, size_t in_count, + struct buffer_token *tok, bool nonblock) { - struct scatterlist sg[1]; struct virtqueue *out_vq; ssize_t ret; unsigned long flags; @@ -505,8 +534,7 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, reclaim_consumed_buffers(port); - sg_init_one(sg, in_buf, in_count); - ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf, GFP_ATOMIC); + ret = virtqueue_add_buf(out_vq, sg, nents, 0, tok, GFP_ATOMIC); /* Tell Host to go! */ virtqueue_kick(out_vq); @@ -544,6 +572,37 @@ done: return in_count; } +static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, + bool nonblock) +{ + struct scatterlist sg[1]; + struct buffer_token *tok; + + tok = kmalloc(sizeof(*tok), GFP_ATOMIC); + if (!tok) + return -ENOMEM; + tok->sgpages = false; + tok->u.buf = in_buf; + + sg_init_one(sg, in_buf, in_count); + + return __send_to_port(port, sg, 1, in_count, tok, nonblock); +} + +static ssize_t send_pages(struct port *port, struct scatterlist *sg, int nents, + size_t in_count, bool nonblock) +{ + struct buffer_token *tok; + + tok = kmalloc(sizeof(*tok), GFP_ATOMIC); + if (!tok) + return -ENOMEM; + tok->sgpages = true; + tok->u.sg = sg; + + return __send_to_port(port, sg, nents, in_count, tok, nonblock); +} + /* * Give out the data that's requested from the buffer that we have * queued up. @@ -725,6 +784,66 @@ out: return ret; } +struct sg_list { + unsigned int n; + size_t len; + struct scatterlist *sg; +}; + +static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct sg_list *sgl = sd->u.data; + unsigned int len = 0; + + if (sgl->n == MAX_SPLICE_PAGES) + return 0; + + /* Try lock this page */ + if (buf->ops->steal(pipe, buf) == 0) { + /* Get reference and unlock page for moving */ + get_page(buf->page); + unlock_page(buf->page); + + len = min(buf->len, sd->len); + sg_set_page(&(sgl->sg[sgl->n]), buf->page, len, buf->offset); + sgl->n++; + sgl->len += len; + } + + return len; +} + +/* Faster zero-copy write by splicing */ +static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe, + struct file *filp, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct port *port = filp->private_data; + struct sg_list sgl; + ssize_t ret; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.data = &sgl, + }; + + sgl.n = 0; + sgl.len = 0; + sgl.sg = kmalloc(sizeof(struct scatterlist) * MAX_SPLICE_PAGES, + GFP_KERNEL); + if (unlikely(!sgl.sg)) + return -ENOMEM; + + sg_init_table(sgl.sg, MAX_SPLICE_PAGES); + ret = __splice_from_pipe(pipe, &sd, pipe_to_sg); + if (likely(ret > 0)) + ret = send_pages(port, sgl.sg, sgl.n, sgl.len, true); + + return ret; +} + static unsigned int port_fops_poll(struct file *filp, poll_table *wait) { struct port *port; @@ -856,6 +975,7 @@ static const struct file_operations port_fops = { .open = port_fops_open, .read = port_fops_read, .write = port_fops_write, + .splice_write = port_fops_splice_write, .poll = port_fops_poll, .release = port_fops_release, .fasync = port_fops_fasync, -- cgit v1.2.3 From ec8fc870156b2b144f55b6a5a7d135018f04b30e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Aug 2012 21:30:50 +0900 Subject: virtio/console: Add a failback for unstealable pipe buffer Add a failback memcpy path for unstealable pipe buffer. If buf->ops->steal() fails, virtio-serial tries to copy the page contents to an allocated page, instead of just failing splice(). Signed-off-by: Masami Hiramatsu Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 730816cdeb4..22b73735301 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -794,7 +794,7 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct sg_list *sgl = sd->u.data; - unsigned int len = 0; + unsigned int offset, len; if (sgl->n == MAX_SPLICE_PAGES) return 0; @@ -807,9 +807,31 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, len = min(buf->len, sd->len); sg_set_page(&(sgl->sg[sgl->n]), buf->page, len, buf->offset); - sgl->n++; - sgl->len += len; + } else { + /* Failback to copying a page */ + struct page *page = alloc_page(GFP_KERNEL); + char *src = buf->ops->map(pipe, buf, 1); + char *dst; + + if (!page) + return -ENOMEM; + dst = kmap(page); + + offset = sd->pos & ~PAGE_MASK; + + len = sd->len; + if (len + offset > PAGE_SIZE) + len = PAGE_SIZE - offset; + + memcpy(dst + offset, src + buf->offset, len); + + kunmap(page); + buf->ops->unmap(pipe, buf, src); + + sg_set_page(&(sgl->sg[sgl->n]), page, len, offset); } + sgl->n++; + sgl->len += len; return len; } -- cgit v1.2.3 From efe75d24a69fc39bb09d882ca2d5b90d4da02afe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Aug 2012 21:31:00 +0900 Subject: virtio/console: Wait until the port is ready on splice Wait if the port is not connected or full on splice like as write is doing. Signed-off-by: Masami Hiramatsu Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 22b73735301..b2fc2abedc7 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -724,6 +724,26 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf, return fill_readbuf(port, ubuf, count, true); } +static int wait_port_writable(struct port *port, bool nonblock) +{ + int ret; + + if (will_write_block(port)) { + if (nonblock) + return -EAGAIN; + + ret = wait_event_freezable(port->waitqueue, + !will_write_block(port)); + if (ret < 0) + return ret; + } + /* Port got hot-unplugged. */ + if (!port->guest_connected) + return -ENODEV; + + return 0; +} + static ssize_t port_fops_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *offp) { @@ -740,18 +760,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf, nonblock = filp->f_flags & O_NONBLOCK; - if (will_write_block(port)) { - if (nonblock) - return -EAGAIN; - - ret = wait_event_freezable(port->waitqueue, - !will_write_block(port)); - if (ret < 0) - return ret; - } - /* Port got hot-unplugged. */ - if (!port->guest_connected) - return -ENODEV; + ret = wait_port_writable(port, nonblock); + if (ret < 0) + return ret; count = min((size_t)(32 * 1024), count); @@ -851,6 +862,10 @@ static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe, .u.data = &sgl, }; + ret = wait_port_writable(port, filp->f_flags & O_NONBLOCK); + if (ret < 0) + return ret; + sgl.n = 0; sgl.len = 0; sgl.sg = kmalloc(sizeof(struct scatterlist) * MAX_SPLICE_PAGES, -- cgit v1.2.3 From d55cb6cf143ae16eaa415baab520b8eaf4a1012f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Aug 2012 21:31:10 +0900 Subject: ftrace: Allow stealing pages from pipe buffer Use generic steal operation on pipe buffer to allow stealing ring buffer's read page from pipe buffer. Note that this could reduce the performance of splice on the splice_write side operation without affinity setting. Since the ring buffer's read pages are allocated on the tracing-node, but the splice user does not always execute splice write side operation on the same node. In this case, the page will be accessed from the another node. Thus, it is strongly recommended to assign the splicing thread to corresponding node. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/trace/trace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496c..adde0994911 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4195,12 +4195,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, buf->private = 0; } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -4216,7 +4210,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = buffer_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; -- cgit v1.2.3 From 8ca84a50e5b39487ea1de8809d0ee1c8474f6a5c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Aug 2012 21:31:20 +0900 Subject: virtio/console: Allocate scatterlist according to the current pipe size Allocate scatterlist according to the current pipe size. This allows splicing bigger buffer if the pipe size has been changed by fcntl. Changes in v2: - Just a minor fix for avoiding a confliction with previous patch. Signed-off-by: Masami Hiramatsu Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index b2fc2abedc7..e88f8439042 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -229,7 +229,6 @@ struct port { bool guest_connected; }; -#define MAX_SPLICE_PAGES 32 /* This is the very early arch-specified put chars function. */ static int (*early_put_chars)(u32, const char *, int); @@ -482,15 +481,16 @@ struct buffer_token { void *buf; struct scatterlist *sg; } u; - bool sgpages; + /* If sgpages == 0 then buf is used, else sg is used */ + unsigned int sgpages; }; -static void reclaim_sg_pages(struct scatterlist *sg) +static void reclaim_sg_pages(struct scatterlist *sg, unsigned int nrpages) { int i; struct page *page; - for (i = 0; i < MAX_SPLICE_PAGES; i++) { + for (i = 0; i < nrpages; i++) { page = sg_page(&sg[i]); if (!page) break; @@ -511,7 +511,7 @@ static void reclaim_consumed_buffers(struct port *port) } while ((tok = virtqueue_get_buf(port->out_vq, &len))) { if (tok->sgpages) - reclaim_sg_pages(tok->u.sg); + reclaim_sg_pages(tok->u.sg, tok->sgpages); else kfree(tok->u.buf); kfree(tok); @@ -581,7 +581,7 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count, tok = kmalloc(sizeof(*tok), GFP_ATOMIC); if (!tok) return -ENOMEM; - tok->sgpages = false; + tok->sgpages = 0; tok->u.buf = in_buf; sg_init_one(sg, in_buf, in_count); @@ -597,7 +597,7 @@ static ssize_t send_pages(struct port *port, struct scatterlist *sg, int nents, tok = kmalloc(sizeof(*tok), GFP_ATOMIC); if (!tok) return -ENOMEM; - tok->sgpages = true; + tok->sgpages = nents; tok->u.sg = sg; return __send_to_port(port, sg, nents, in_count, tok, nonblock); @@ -797,6 +797,7 @@ out: struct sg_list { unsigned int n; + unsigned int size; size_t len; struct scatterlist *sg; }; @@ -807,7 +808,7 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct sg_list *sgl = sd->u.data; unsigned int offset, len; - if (sgl->n == MAX_SPLICE_PAGES) + if (sgl->n == sgl->size) return 0; /* Try lock this page */ @@ -868,12 +869,12 @@ static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe, sgl.n = 0; sgl.len = 0; - sgl.sg = kmalloc(sizeof(struct scatterlist) * MAX_SPLICE_PAGES, - GFP_KERNEL); + sgl.size = pipe->nrbufs; + sgl.sg = kmalloc(sizeof(struct scatterlist) * sgl.size, GFP_KERNEL); if (unlikely(!sgl.sg)) return -ENOMEM; - sg_init_table(sgl.sg, MAX_SPLICE_PAGES); + sg_init_table(sgl.sg, sgl.size); ret = __splice_from_pipe(pipe, &sd, pipe_to_sg); if (likely(ret > 0)) ret = send_pages(port, sgl.sg, sgl.n, sgl.len, true); -- cgit v1.2.3 From 108fc82596e3b66b819df9d28c1ebbc9ab5de14c Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Thu, 9 Aug 2012 21:31:30 +0900 Subject: tools: Add guest trace agent as a user tool This patch adds a user tool, "trace agent" for sending trace data of a guest to a Host in low overhead. This agent has the following functions: - splice a page of ring-buffer to read_pipe without memory copying - splice the page from write_pipe to virtio-console without memory copying - write trace data to stdout by using -o option - controlled by start/stop orders from a Host Changes in v2: - Cleanup (change fprintf() to pr_err() and an include guard) Signed-off-by: Yoshihiro YUNOMAE Acked-by: Amit Shah Signed-off-by: Rusty Russell --- tools/virtio/virtio-trace/Makefile | 14 ++ tools/virtio/virtio-trace/README | 118 ++++++++++++ tools/virtio/virtio-trace/trace-agent-ctl.c | 137 ++++++++++++++ tools/virtio/virtio-trace/trace-agent-rw.c | 192 ++++++++++++++++++++ tools/virtio/virtio-trace/trace-agent.c | 270 ++++++++++++++++++++++++++++ tools/virtio/virtio-trace/trace-agent.h | 75 ++++++++ 6 files changed, 806 insertions(+) create mode 100644 tools/virtio/virtio-trace/Makefile create mode 100644 tools/virtio/virtio-trace/README create mode 100644 tools/virtio/virtio-trace/trace-agent-ctl.c create mode 100644 tools/virtio/virtio-trace/trace-agent-rw.c create mode 100644 tools/virtio/virtio-trace/trace-agent.c create mode 100644 tools/virtio/virtio-trace/trace-agent.h diff --git a/tools/virtio/virtio-trace/Makefile b/tools/virtio/virtio-trace/Makefile new file mode 100644 index 00000000000..ef3adfce594 --- /dev/null +++ b/tools/virtio/virtio-trace/Makefile @@ -0,0 +1,14 @@ +CC = gcc +CFLAGS = -O2 -Wall +LFLAG = -lpthread + +all: trace-agent + +.c.o: + $(CC) $(CFLAGS) $(LFLAG) -c $^ -o $@ + +trace-agent: trace-agent.o trace-agent-ctl.o trace-agent-rw.o + $(CC) $(CFLAGS) $(LFLAG) -o $@ $^ + +clean: + rm -f *.o trace-agent diff --git a/tools/virtio/virtio-trace/README b/tools/virtio/virtio-trace/README new file mode 100644 index 00000000000..b64845b823a --- /dev/null +++ b/tools/virtio/virtio-trace/README @@ -0,0 +1,118 @@ +Trace Agent for virtio-trace +============================ + +Trace agent is a user tool for sending trace data of a guest to a Host in low +overhead. Trace agent has the following functions: + - splice a page of ring-buffer to read_pipe without memory copying + - splice the page from write_pipe to virtio-console without memory copying + - write trace data to stdout by using -o option + - controlled by start/stop orders from a Host + +The trace agent operates as follows: + 1) Initialize all structures. + 2) Create a read/write thread per CPU. Each thread is bound to a CPU. + The read/write threads hold it. + 3) A controller thread does poll() for a start order of a host. + 4) After the controller of the trace agent receives a start order from a host, + the controller wake read/write threads. + 5) The read/write threads start to read trace data from ring-buffers and + write the data to virtio-serial. + 6) If the controller receives a stop order from a host, the read/write threads + stop to read trace data. + + +Files +===== + +README: this file +Makefile: Makefile of trace agent for virtio-trace +trace-agent.c: includes main function, sets up for operating trace agent +trace-agent.h: includes all structures and some macros +trace-agent-ctl.c: includes controller function for read/write threads +trace-agent-rw.c: includes read/write threads function + + +Setup +===== + +To use this trace agent for virtio-trace, we need to prepare some virtio-serial +I/Fs. + +1) Make FIFO in a host + virtio-trace uses virtio-serial pipe as trace data paths as to the number +of CPUs and a control path, so FIFO (named pipe) should be created as follows: + # mkdir /tmp/virtio-trace/ + # mkfifo /tmp/virtio-trace/trace-path-cpu{0,1,2,...,X}.{in,out} + # mkfifo /tmp/virtio-trace/agent-ctl-path.{in,out} + +For example, if a guest use three CPUs, the names are + trace-path-cpu{0,1,2}.{in.out} +and + agent-ctl-path.{in,out}. + +2) Set up of virtio-serial pipe in a host + Add qemu option to use virtio-serial pipe. + + ##virtio-serial device## + -device virtio-serial-pci,id=virtio-serial0\ + ##control path## + -chardev pipe,id=charchannel0,path=/tmp/virtio-trace/agent-ctl-path\ + -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,\ + id=channel0,name=agent-ctl-path\ + ##data path## + -chardev pipe,id=charchannel1,path=/tmp/virtio-trace/trace-path-cpu0\ + -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel0,\ + id=channel1,name=trace-path-cpu0\ + ... + +If you manage guests with libvirt, add the following tags to domain XML files. +Then, libvirt passes the same command option to qemu. + + + + +
+ + + + +
+ + ... +Here, chardev names are restricted to trace-path-cpuX and agent-ctl-path. For +example, if a guest use three CPUs, chardev names should be trace-path-cpu0, +trace-path-cpu1, trace-path-cpu2, and agent-ctl-path. + +3) Boot the guest + You can find some chardev in /dev/virtio-ports/ in the guest. + + +Run +=== + +0) Build trace agent in a guest + $ make + +1) Enable ftrace in the guest + + # echo 1 > /sys/kernel/debug/tracing/events/sched/enable + +2) Run trace agent in the guest + This agent must be operated as root. + # ./trace-agent +read/write threads in the agent wait for start order from host. If you add -o +option, trace data are output via stdout in the guest. + +3) Open FIFO in a host + # cat /tmp/virtio-trace/trace-path-cpu0.out +If a host does not open these, trace data get stuck in buffers of virtio. Then, +the guest will stop by specification of chardev in QEMU. This blocking mode may +be solved in the future. + +4) Start to read trace data by ordering from a host + A host injects read start order to the guest via virtio-serial. + # echo 1 > /tmp/virtio-trace/agent-ctl-path.in + +5) Stop to read trace data by ordering from a host + A host injects read stop order to the guest via virtio-serial. + # echo 0 > /tmp/virtio-trace/agent-ctl-path.in diff --git a/tools/virtio/virtio-trace/trace-agent-ctl.c b/tools/virtio/virtio-trace/trace-agent-ctl.c new file mode 100644 index 00000000000..a2d0403c4f9 --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent-ctl.c @@ -0,0 +1,137 @@ +/* + * Controller of read/write threads for virtio-trace + * + * Copyright (C) 2012 Hitachi, Ltd. + * Created by Yoshihiro Yunomae + * Masami Hiramatsu + * + * Licensed under GPL version 2 only. + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "trace-agent.h" + +#define HOST_MSG_SIZE 256 +#define EVENT_WAIT_MSEC 100 + +static volatile sig_atomic_t global_signal_val; +bool global_sig_receive; /* default false */ +bool global_run_operation; /* default false*/ + +/* Handle SIGTERM/SIGINT/SIGQUIT to exit */ +static void signal_handler(int sig) +{ + global_signal_val = sig; +} + +int rw_ctl_init(const char *ctl_path) +{ + int ctl_fd; + + ctl_fd = open(ctl_path, O_RDONLY); + if (ctl_fd == -1) { + pr_err("Cannot open ctl_fd\n"); + goto error; + } + + return ctl_fd; + +error: + exit(EXIT_FAILURE); +} + +static int wait_order(int ctl_fd) +{ + struct pollfd poll_fd; + int ret = 0; + + while (!global_sig_receive) { + poll_fd.fd = ctl_fd; + poll_fd.events = POLLIN; + + ret = poll(&poll_fd, 1, EVENT_WAIT_MSEC); + + if (global_signal_val) { + global_sig_receive = true; + pr_info("Receive interrupt %d\n", global_signal_val); + + /* Wakes rw-threads when they are sleeping */ + if (!global_run_operation) + pthread_cond_broadcast(&cond_wakeup); + + ret = -1; + break; + } + + if (ret < 0) { + pr_err("Polling error\n"); + goto error; + } + + if (ret) + break; + }; + + return ret; + +error: + exit(EXIT_FAILURE); +} + +/* + * contol read/write threads by handling global_run_operation + */ +void *rw_ctl_loop(int ctl_fd) +{ + ssize_t rlen; + char buf[HOST_MSG_SIZE]; + int ret; + + /* Setup signal handlers */ + signal(SIGTERM, signal_handler); + signal(SIGINT, signal_handler); + signal(SIGQUIT, signal_handler); + + while (!global_sig_receive) { + + ret = wait_order(ctl_fd); + if (ret < 0) + break; + + rlen = read(ctl_fd, buf, sizeof(buf)); + if (rlen < 0) { + pr_err("read data error in ctl thread\n"); + goto error; + } + + if (rlen == 2 && buf[0] == '1') { + /* + * If host writes '1' to a control path, + * this controller wakes all read/write threads. + */ + global_run_operation = true; + pthread_cond_broadcast(&cond_wakeup); + pr_debug("Wake up all read/write threads\n"); + } else if (rlen == 2 && buf[0] == '0') { + /* + * If host writes '0' to a control path, read/write + * threads will wait for notification from Host. + */ + global_run_operation = false; + pr_debug("Stop all read/write threads\n"); + } else + pr_info("Invalid host notification: %s\n", buf); + } + + return NULL; + +error: + exit(EXIT_FAILURE); +} diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c new file mode 100644 index 00000000000..3aace5ea484 --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent-rw.c @@ -0,0 +1,192 @@ +/* + * Read/write thread of a guest agent for virtio-trace + * + * Copyright (C) 2012 Hitachi, Ltd. + * Created by Yoshihiro Yunomae + * Masami Hiramatsu + * + * Licensed under GPL version 2 only. + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "trace-agent.h" + +#define READ_WAIT_USEC 100000 + +void *rw_thread_info_new(void) +{ + struct rw_thread_info *rw_ti; + + rw_ti = zalloc(sizeof(struct rw_thread_info)); + if (rw_ti == NULL) { + pr_err("rw_thread_info zalloc error\n"); + exit(EXIT_FAILURE); + } + + rw_ti->cpu_num = -1; + rw_ti->in_fd = -1; + rw_ti->out_fd = -1; + rw_ti->read_pipe = -1; + rw_ti->write_pipe = -1; + rw_ti->pipe_size = PIPE_INIT; + + return rw_ti; +} + +void *rw_thread_init(int cpu, const char *in_path, const char *out_path, + bool stdout_flag, unsigned long pipe_size, + struct rw_thread_info *rw_ti) +{ + int data_pipe[2]; + + rw_ti->cpu_num = cpu; + + /* set read(input) fd */ + rw_ti->in_fd = open(in_path, O_RDONLY); + if (rw_ti->in_fd == -1) { + pr_err("Could not open in_fd (CPU:%d)\n", cpu); + goto error; + } + + /* set write(output) fd */ + if (!stdout_flag) { + /* virtio-serial output mode */ + rw_ti->out_fd = open(out_path, O_WRONLY); + if (rw_ti->out_fd == -1) { + pr_err("Could not open out_fd (CPU:%d)\n", cpu); + goto error; + } + } else + /* stdout mode */ + rw_ti->out_fd = STDOUT_FILENO; + + if (pipe2(data_pipe, O_NONBLOCK) < 0) { + pr_err("Could not create pipe in rw-thread(%d)\n", cpu); + goto error; + } + + /* + * Size of pipe is 64kB in default based on fs/pipe.c. + * To read/write trace data speedy, pipe size is changed. + */ + if (fcntl(*data_pipe, F_SETPIPE_SZ, pipe_size) < 0) { + pr_err("Could not change pipe size in rw-thread(%d)\n", cpu); + goto error; + } + + rw_ti->read_pipe = data_pipe[1]; + rw_ti->write_pipe = data_pipe[0]; + rw_ti->pipe_size = pipe_size; + + return NULL; + +error: + exit(EXIT_FAILURE); +} + +/* Bind a thread to a cpu */ +static void bind_cpu(int cpu_num) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu_num, &mask); + + /* bind my thread to cpu_num by assigning zero to the first argument */ + if (sched_setaffinity(0, sizeof(mask), &mask) == -1) + pr_err("Could not set CPU#%d affinity\n", (int)cpu_num); +} + +static void *rw_thread_main(void *thread_info) +{ + ssize_t rlen, wlen; + ssize_t ret; + struct rw_thread_info *ts = (struct rw_thread_info *)thread_info; + + bind_cpu(ts->cpu_num); + + while (1) { + /* Wait for a read order of trace data by Host OS */ + if (!global_run_operation) { + pthread_mutex_lock(&mutex_notify); + pthread_cond_wait(&cond_wakeup, &mutex_notify); + pthread_mutex_unlock(&mutex_notify); + } + + if (global_sig_receive) + break; + + /* + * Each thread read trace_pipe_raw of each cpu bounding the + * thread, so contention of multi-threads does not occur. + */ + rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL, + ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE); + + if (rlen < 0) { + pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num); + goto error; + } else if (rlen == 0) { + /* + * If trace data do not exist or are unreadable not + * for exceeding the page size, splice_read returns + * NULL. Then, this waits for being filled the data in a + * ring-buffer. + */ + usleep(READ_WAIT_USEC); + pr_debug("Read retry(cpu:%d)\n", ts->cpu_num); + continue; + } + + wlen = 0; + + do { + ret = splice(ts->write_pipe, NULL, ts->out_fd, NULL, + rlen - wlen, + SPLICE_F_MOVE | SPLICE_F_MORE); + + if (ret < 0) { + pr_err("Splice_write in rw-thread(%d)\n", + ts->cpu_num); + goto error; + } else if (ret == 0) + /* + * When host reader is not in time for reading + * trace data, guest will be stopped. This is + * because char dev in QEMU is not supported + * non-blocking mode. Then, writer might be + * sleep in that case. + * This sleep will be removed by supporting + * non-blocking mode. + */ + sleep(1); + wlen += ret; + } while (wlen < rlen); + } + + return NULL; + +error: + exit(EXIT_FAILURE); +} + + +pthread_t rw_thread_run(struct rw_thread_info *rw_ti) +{ + int ret; + pthread_t rw_thread_per_cpu; + + ret = pthread_create(&rw_thread_per_cpu, NULL, rw_thread_main, rw_ti); + if (ret != 0) { + pr_err("Could not create a rw thread(%d)\n", rw_ti->cpu_num); + exit(EXIT_FAILURE); + } + + return rw_thread_per_cpu; +} diff --git a/tools/virtio/virtio-trace/trace-agent.c b/tools/virtio/virtio-trace/trace-agent.c new file mode 100644 index 00000000000..0a0a7dd4eff --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent.c @@ -0,0 +1,270 @@ +/* + * Guest agent for virtio-trace + * + * Copyright (C) 2012 Hitachi, Ltd. + * Created by Yoshihiro Yunomae + * Masami Hiramatsu + * + * Licensed under GPL version 2 only. + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include "trace-agent.h" + +#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE)) +#define PIPE_DEF_BUFS 16 +#define PIPE_MIN_SIZE (PAGE_SIZE*PIPE_DEF_BUFS) +#define PIPE_MAX_SIZE (1024*1024) +#define READ_PATH_FMT \ + "/sys/kernel/debug/tracing/per_cpu/cpu%d/trace_pipe_raw" +#define WRITE_PATH_FMT "/dev/virtio-ports/trace-path-cpu%d" +#define CTL_PATH "/dev/virtio-ports/agent-ctl-path" + +pthread_mutex_t mutex_notify = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond_wakeup = PTHREAD_COND_INITIALIZER; + +static int get_total_cpus(void) +{ + int nr_cpus = (int)sysconf(_SC_NPROCESSORS_CONF); + + if (nr_cpus <= 0) { + pr_err("Could not read cpus\n"); + goto error; + } else if (nr_cpus > MAX_CPUS) { + pr_err("Exceed max cpus(%d)\n", (int)MAX_CPUS); + goto error; + } + + return nr_cpus; + +error: + exit(EXIT_FAILURE); +} + +static void *agent_info_new(void) +{ + struct agent_info *s; + int i; + + s = zalloc(sizeof(struct agent_info)); + if (s == NULL) { + pr_err("agent_info zalloc error\n"); + exit(EXIT_FAILURE); + } + + s->pipe_size = PIPE_INIT; + s->use_stdout = false; + s->cpus = get_total_cpus(); + s->ctl_fd = -1; + + /* read/write threads init */ + for (i = 0; i < s->cpus; i++) + s->rw_ti[i] = rw_thread_info_new(); + + return s; +} + +static unsigned long parse_size(const char *arg) +{ + unsigned long value, round; + char *ptr; + + value = strtoul(arg, &ptr, 10); + switch (*ptr) { + case 'K': case 'k': + value <<= 10; + break; + case 'M': case 'm': + value <<= 20; + break; + default: + break; + } + + if (value > PIPE_MAX_SIZE) { + pr_err("Pipe size must be less than 1MB\n"); + goto error; + } else if (value < PIPE_MIN_SIZE) { + pr_err("Pipe size must be over 64KB\n"); + goto error; + } + + /* Align buffer size with page unit */ + round = value & (PAGE_SIZE - 1); + value = value - round; + + return value; +error: + return 0; +} + +static void usage(char const *prg) +{ + pr_err("usage: %s [-h] [-o] [-s ]\n", prg); +} + +static const char *make_path(int cpu_num, bool this_is_write_path) +{ + int ret; + char *buf; + + buf = zalloc(PATH_MAX); + if (buf == NULL) { + pr_err("Could not allocate buffer\n"); + goto error; + } + + if (this_is_write_path) + /* write(output) path */ + ret = snprintf(buf, PATH_MAX, WRITE_PATH_FMT, cpu_num); + else + /* read(input) path */ + ret = snprintf(buf, PATH_MAX, READ_PATH_FMT, cpu_num); + + if (ret <= 0) { + pr_err("Failed to generate %s path(CPU#%d):%d\n", + this_is_write_path ? "read" : "write", cpu_num, ret); + goto error; + } + + return buf; + +error: + free(buf); + return NULL; +} + +static const char *make_input_path(int cpu_num) +{ + return make_path(cpu_num, false); +} + +static const char *make_output_path(int cpu_num) +{ + return make_path(cpu_num, true); +} + +static void *agent_info_init(struct agent_info *s) +{ + int cpu; + const char *in_path = NULL; + const char *out_path = NULL; + + /* init read/write threads */ + for (cpu = 0; cpu < s->cpus; cpu++) { + /* set read(input) path per read/write thread */ + in_path = make_input_path(cpu); + if (in_path == NULL) + goto error; + + /* set write(output) path per read/write thread*/ + if (!s->use_stdout) { + out_path = make_output_path(cpu); + if (out_path == NULL) + goto error; + } else + /* stdout mode */ + pr_debug("stdout mode\n"); + + rw_thread_init(cpu, in_path, out_path, s->use_stdout, + s->pipe_size, s->rw_ti[cpu]); + } + + /* init controller of read/write threads */ + s->ctl_fd = rw_ctl_init((const char *)CTL_PATH); + + return NULL; + +error: + exit(EXIT_FAILURE); +} + +static void *parse_args(int argc, char *argv[], struct agent_info *s) +{ + int cmd; + unsigned long size; + + while ((cmd = getopt(argc, argv, "hos:")) != -1) { + switch (cmd) { + /* stdout mode */ + case 'o': + s->use_stdout = true; + break; + /* size of pipe */ + case 's': + size = parse_size(optarg); + if (size == 0) + goto error; + s->pipe_size = size; + break; + case 'h': + default: + usage(argv[0]); + goto error; + } + } + + agent_info_init(s); + + return NULL; + +error: + exit(EXIT_FAILURE); +} + +static void agent_main_loop(struct agent_info *s) +{ + int cpu; + pthread_t rw_thread_per_cpu[MAX_CPUS]; + + /* Start all read/write threads */ + for (cpu = 0; cpu < s->cpus; cpu++) + rw_thread_per_cpu[cpu] = rw_thread_run(s->rw_ti[cpu]); + + rw_ctl_loop(s->ctl_fd); + + /* Finish all read/write threads */ + for (cpu = 0; cpu < s->cpus; cpu++) { + int ret; + + ret = pthread_join(rw_thread_per_cpu[cpu], NULL); + if (ret != 0) { + pr_err("pthread_join() error:%d (cpu %d)\n", ret, cpu); + exit(EXIT_FAILURE); + } + } +} + +static void agent_info_free(struct agent_info *s) +{ + int i; + + close(s->ctl_fd); + for (i = 0; i < s->cpus; i++) { + close(s->rw_ti[i]->in_fd); + close(s->rw_ti[i]->out_fd); + close(s->rw_ti[i]->read_pipe); + close(s->rw_ti[i]->write_pipe); + free(s->rw_ti[i]); + } + free(s); +} + +int main(int argc, char *argv[]) +{ + struct agent_info *s = NULL; + + s = agent_info_new(); + parse_args(argc, argv, s); + + agent_main_loop(s); + + agent_info_free(s); + + return 0; +} diff --git a/tools/virtio/virtio-trace/trace-agent.h b/tools/virtio/virtio-trace/trace-agent.h new file mode 100644 index 00000000000..8de79bfeaa7 --- /dev/null +++ b/tools/virtio/virtio-trace/trace-agent.h @@ -0,0 +1,75 @@ +#ifndef __TRACE_AGENT_H__ +#define __TRACE_AGENT_H__ +#include +#include + +#define MAX_CPUS 256 +#define PIPE_INIT (1024*1024) + +/* + * agent_info - structure managing total information of guest agent + * @pipe_size: size of pipe (default 1MB) + * @use_stdout: set to true when o option is added (default false) + * @cpus: total number of CPUs + * @ctl_fd: fd of control path, /dev/virtio-ports/agent-ctl-path + * @rw_ti: structure managing information of read/write threads + */ +struct agent_info { + unsigned long pipe_size; + bool use_stdout; + int cpus; + int ctl_fd; + struct rw_thread_info *rw_ti[MAX_CPUS]; +}; + +/* + * rw_thread_info - structure managing a read/write thread a cpu + * @cpu_num: cpu number operating this read/write thread + * @in_fd: fd of reading trace data path in cpu_num + * @out_fd: fd of writing trace data path in cpu_num + * @read_pipe: fd of read pipe + * @write_pipe: fd of write pipe + * @pipe_size: size of pipe (default 1MB) + */ +struct rw_thread_info { + int cpu_num; + int in_fd; + int out_fd; + int read_pipe; + int write_pipe; + unsigned long pipe_size; +}; + +/* use for stopping rw threads */ +extern bool global_sig_receive; + +/* use for notification */ +extern bool global_run_operation; +extern pthread_mutex_t mutex_notify; +extern pthread_cond_t cond_wakeup; + +/* for controller of read/write threads */ +extern int rw_ctl_init(const char *ctl_path); +extern void *rw_ctl_loop(int ctl_fd); + +/* for trace read/write thread */ +extern void *rw_thread_info_new(void); +extern void *rw_thread_init(int cpu, const char *in_path, const char *out_path, + bool stdout_flag, unsigned long pipe_size, + struct rw_thread_info *rw_ti); +extern pthread_t rw_thread_run(struct rw_thread_info *rw_ti); + +static inline void *zalloc(size_t size) +{ + return calloc(1, size); +} + +#define pr_err(format, ...) fprintf(stderr, format, ## __VA_ARGS__) +#define pr_info(format, ...) fprintf(stdout, format, ## __VA_ARGS__) +#ifdef DEBUG +#define pr_debug(format, ...) fprintf(stderr, format, ## __VA_ARGS__) +#else +#define pr_debug(format, ...) do {} while (0) +#endif + +#endif /*__TRACE_AGENT_H__*/ -- cgit v1.2.3 From 5b8fa822b71fa91b17c9fb38bcca31e771f7650d Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 4 Sep 2012 09:53:39 +0900 Subject: tools: Fix pthread flag for Makefile of trace-agent used by virtio-trace pthread flag should not be -lpthread but -pthread using gcc. The -lpthread links the external multithread library. On the other hand, the -pthread manages both the gcc's preprocessor and linker to be able to compile with pthread. Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Rusty Russell --- tools/virtio/virtio-trace/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/virtio/virtio-trace/Makefile b/tools/virtio/virtio-trace/Makefile index ef3adfce594..0d238163347 100644 --- a/tools/virtio/virtio-trace/Makefile +++ b/tools/virtio/virtio-trace/Makefile @@ -1,14 +1,13 @@ CC = gcc -CFLAGS = -O2 -Wall -LFLAG = -lpthread +CFLAGS = -O2 -Wall -pthread all: trace-agent .c.o: - $(CC) $(CFLAGS) $(LFLAG) -c $^ -o $@ + $(CC) $(CFLAGS) -c $^ -o $@ trace-agent: trace-agent.o trace-agent-ctl.o trace-agent-rw.o - $(CC) $(CFLAGS) $(LFLAG) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ clean: rm -f *.o trace-agent -- cgit v1.2.3 From 33e1afc3d82697599ccc8dc8f2fa44ffff5ae329 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 1 Sep 2012 23:49:37 +0400 Subject: virtio: console: fix error handling in init() function If register_virtio_driver() fails, virtio-ports class is not destroyed. The patch adds error handling of register_virtio_driver(). Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Acked-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index e88f8439042..8ab9c3d4bf1 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -2099,7 +2099,17 @@ static int __init init(void) INIT_LIST_HEAD(&pdrvdata.consoles); INIT_LIST_HEAD(&pdrvdata.portdevs); - return register_virtio_driver(&virtio_console); + err = register_virtio_driver(&virtio_console); + if (err < 0) { + pr_err("Error %d registering virtio driver\n", err); + goto free; + } + return 0; +free: + if (pdrvdata.debugfs_dir) + debugfs_remove_recursive(pdrvdata.debugfs_dir); + class_destroy(pdrvdata.class); + return err; } static void __exit fini(void) -- cgit v1.2.3 From a98755c559e0e944a44174883b74a97019e3a367 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 8 Aug 2012 16:07:04 +0800 Subject: virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell Cc: Jens Axboe Cc: Christoph Hellwig Cc: Tejun Heo Cc: Shaohua Li Cc: "Michael S. Tsirkin" Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig Signed-off-by: Minchan Kim Signed-off-by: Asias He Acked-by: Rusty Russell Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 203 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 163 insertions(+), 40 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c0bbeb47075..95cfeeda4f3 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -14,6 +14,9 @@ #define PART_BITS 4 +static bool use_bio; +module_param(use_bio, bool, S_IRUGO); + static int major; static DEFINE_IDA(vd_index_ida); @@ -23,6 +26,7 @@ struct virtio_blk { struct virtio_device *vdev; struct virtqueue *vq; + wait_queue_head_t queue_wait; /* The disk structure for the kernel. */ struct gendisk *disk; @@ -51,53 +55,87 @@ struct virtio_blk struct virtblk_req { struct request *req; + struct bio *bio; struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; u8 status; + struct scatterlist sg[]; }; -static void blk_done(struct virtqueue *vq) +static inline int virtblk_result(struct virtblk_req *vbr) +{ + switch (vbr->status) { + case VIRTIO_BLK_S_OK: + return 0; + case VIRTIO_BLK_S_UNSUPP: + return -ENOTTY; + default: + return -EIO; + } +} + +static inline void virtblk_request_done(struct virtio_blk *vblk, + struct virtblk_req *vbr) +{ + struct request *req = vbr->req; + int error = virtblk_result(vbr); + + if (req->cmd_type == REQ_TYPE_BLOCK_PC) { + req->resid_len = vbr->in_hdr.residual; + req->sense_len = vbr->in_hdr.sense_len; + req->errors = vbr->in_hdr.errors; + } else if (req->cmd_type == REQ_TYPE_SPECIAL) { + req->errors = (error != 0); + } + + __blk_end_request_all(req, error); + mempool_free(vbr, vblk->pool); +} + +static inline void virtblk_bio_done(struct virtio_blk *vblk, + struct virtblk_req *vbr) +{ + bio_endio(vbr->bio, virtblk_result(vbr)); + mempool_free(vbr, vblk->pool); +} + +static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; + unsigned long bio_done = 0, req_done = 0; struct virtblk_req *vbr; - unsigned int len; unsigned long flags; + unsigned int len; spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { - int error; - - switch (vbr->status) { - case VIRTIO_BLK_S_OK: - error = 0; - break; - case VIRTIO_BLK_S_UNSUPP: - error = -ENOTTY; - break; - default: - error = -EIO; - break; - } - - switch (vbr->req->cmd_type) { - case REQ_TYPE_BLOCK_PC: - vbr->req->resid_len = vbr->in_hdr.residual; - vbr->req->sense_len = vbr->in_hdr.sense_len; - vbr->req->errors = vbr->in_hdr.errors; - break; - case REQ_TYPE_SPECIAL: - vbr->req->errors = (error != 0); - break; - default: - break; + if (vbr->bio) { + virtblk_bio_done(vblk, vbr); + bio_done++; + } else { + virtblk_request_done(vblk, vbr); + req_done++; } - - __blk_end_request_all(vbr->req, error); - mempool_free(vbr, vblk->pool); } /* In case queue is stopped waiting for more buffers. */ - blk_start_queue(vblk->disk->queue); + if (req_done) + blk_start_queue(vblk->disk->queue); spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); + + if (bio_done) + wake_up(&vblk->queue_wait); +} + +static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, + gfp_t gfp_mask) +{ + struct virtblk_req *vbr; + + vbr = mempool_alloc(vblk->pool, gfp_mask); + if (vbr && use_bio) + sg_init_table(vbr->sg, vblk->sg_elems); + + return vbr; } static bool do_req(struct request_queue *q, struct virtio_blk *vblk, @@ -106,13 +144,13 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, unsigned long num, out = 0, in = 0; struct virtblk_req *vbr; - vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); + vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); if (!vbr) /* When another request finishes we'll try again. */ return false; vbr->req = req; - + vbr->bio = NULL; if (req->cmd_flags & REQ_FLUSH) { vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; @@ -172,7 +210,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) { + if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, + GFP_ATOMIC) < 0) { mempool_free(vbr, vblk->pool); return false; } @@ -180,7 +219,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, return true; } -static void do_virtblk_request(struct request_queue *q) +static void virtblk_request(struct request_queue *q) { struct virtio_blk *vblk = q->queuedata; struct request *req; @@ -203,6 +242,82 @@ static void do_virtblk_request(struct request_queue *q) virtqueue_kick(vblk->vq); } +static void virtblk_add_buf_wait(struct virtio_blk *vblk, + struct virtblk_req *vbr, + unsigned long out, + unsigned long in) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&vblk->queue_wait, &wait, + TASK_UNINTERRUPTIBLE); + + spin_lock_irq(vblk->disk->queue->queue_lock); + if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, + GFP_ATOMIC) < 0) { + spin_unlock_irq(vblk->disk->queue->queue_lock); + io_schedule(); + } else { + virtqueue_kick(vblk->vq); + spin_unlock_irq(vblk->disk->queue->queue_lock); + break; + } + + } + + finish_wait(&vblk->queue_wait, &wait); +} + +static void virtblk_make_request(struct request_queue *q, struct bio *bio) +{ + struct virtio_blk *vblk = q->queuedata; + unsigned int num, out = 0, in = 0; + struct virtblk_req *vbr; + + BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); + BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + + vbr = virtblk_alloc_req(vblk, GFP_NOIO); + if (!vbr) { + bio_endio(bio, -ENOMEM); + return; + } + + vbr->bio = bio; + vbr->req = NULL; + vbr->out_hdr.type = 0; + vbr->out_hdr.sector = bio->bi_sector; + vbr->out_hdr.ioprio = bio_prio(bio); + + sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); + + num = blk_bio_map_sg(q, bio, vbr->sg + out); + + sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, + sizeof(vbr->status)); + + if (num) { + if (bio->bi_rw & REQ_WRITE) { + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + out += num; + } else { + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + in += num; + } + } + + spin_lock_irq(vblk->disk->queue->queue_lock); + if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, + GFP_ATOMIC) < 0)) { + spin_unlock_irq(vblk->disk->queue->queue_lock); + virtblk_add_buf_wait(vblk, vbr, out, in); + return; + } + virtqueue_kick(vblk->vq); + spin_unlock_irq(vblk->disk->queue->queue_lock); +} + /* return id (s/n) string for *disk to *id_str */ static int virtblk_get_id(struct gendisk *disk, char *id_str) @@ -360,7 +475,7 @@ static int init_vq(struct virtio_blk *vblk) int err = 0; /* We expect one virtqueue, for output. */ - vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests"); + vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests"); if (IS_ERR(vblk->vq)) err = PTR_ERR(vblk->vq); @@ -414,7 +529,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) u8 writeback = virtblk_get_cache_mode(vdev); struct virtio_blk *vblk = vdev->priv; - if (writeback) + if (writeback && !use_bio) blk_queue_flush(vblk->disk->queue, REQ_FLUSH); else blk_queue_flush(vblk->disk->queue, 0); @@ -477,6 +592,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) struct virtio_blk *vblk; struct request_queue *q; int err, index; + int pool_size; + u64 cap; u32 v, blk_size, sg_elems, opt_io_size; u16 min_io_size; @@ -506,10 +623,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_free_index; } + init_waitqueue_head(&vblk->queue_wait); vblk->vdev = vdev; vblk->sg_elems = sg_elems; sg_init_table(vblk->sg, vblk->sg_elems); mutex_init(&vblk->config_lock); + INIT_WORK(&vblk->config_work, virtblk_config_changed_work); vblk->config_enable = true; @@ -517,7 +636,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) if (err) goto out_free_vblk; - vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); + pool_size = sizeof(struct virtblk_req); + if (use_bio) + pool_size += sizeof(struct scatterlist) * sg_elems; + vblk->pool = mempool_create_kmalloc_pool(1, pool_size); if (!vblk->pool) { err = -ENOMEM; goto out_free_vq; @@ -530,12 +652,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_mempool; } - q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL); + q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL); if (!q) { err = -ENOMEM; goto out_put_disk; } + if (use_bio) + blk_queue_make_request(q, virtblk_make_request); q->queuedata = vblk; virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); @@ -620,7 +744,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) if (!err && opt_io_size) blk_queue_io_opt(q, blk_size * opt_io_size); - add_disk(vblk->disk); err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); if (err) -- cgit v1.2.3 From c85a1f91b393a6c0c2ad382ba59d7618b29ab758 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 8 Aug 2012 16:07:05 +0800 Subject: virtio-blk: Add REQ_FLUSH and REQ_FUA support to bio path We need to support both REQ_FLUSH and REQ_FUA for bio based path since it does not get the sequencing of REQ_FUA into REQ_FLUSH that request based drivers can request. REQ_FLUSH is emulated by: A) If the bio has no data to write: 1. Send VIRTIO_BLK_T_FLUSH to device, 2. In the flush I/O completion handler, finish the bio B) If the bio has data to write: 1. Send VIRTIO_BLK_T_FLUSH to device 2. In the flush I/O completion handler, send the actual write data to device 3. In the write I/O completion handler, finish the bio REQ_FUA is emulated by: 1. Send the actual write data to device 2. In the write I/O completion handler, send VIRTIO_BLK_T_FLUSH to device 3. In the flush I/O completion handler, finish the bio Changes in v7: - Using vbr->flags to trace request type - Dropped unnecessary struct virtio_blk *vblk parameter - Reuse struct virtblk_req in bio done function Cahnges in v6: - Reworked REQ_FLUSH and REQ_FUA emulatation order Cc: Rusty Russell Cc: Jens Axboe Cc: Christoph Hellwig Cc: Tejun Heo Cc: Shaohua Li Cc: "Michael S. Tsirkin" Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Asias He Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 272 +++++++++++++++++++++++++++++++-------------- 1 file changed, 188 insertions(+), 84 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 95cfeeda4f3..2edfb5cef4f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -58,10 +58,20 @@ struct virtblk_req struct bio *bio; struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; + struct work_struct work; + struct virtio_blk *vblk; + int flags; u8 status; struct scatterlist sg[]; }; +enum { + VBLK_IS_FLUSH = 1, + VBLK_REQ_FLUSH = 2, + VBLK_REQ_DATA = 4, + VBLK_REQ_FUA = 8, +}; + static inline int virtblk_result(struct virtblk_req *vbr) { switch (vbr->status) { @@ -74,9 +84,133 @@ static inline int virtblk_result(struct virtblk_req *vbr) } } -static inline void virtblk_request_done(struct virtio_blk *vblk, - struct virtblk_req *vbr) +static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, + gfp_t gfp_mask) +{ + struct virtblk_req *vbr; + + vbr = mempool_alloc(vblk->pool, gfp_mask); + if (vbr && use_bio) + sg_init_table(vbr->sg, vblk->sg_elems); + + vbr->vblk = vblk; + + return vbr; +} + +static void virtblk_add_buf_wait(struct virtio_blk *vblk, + struct virtblk_req *vbr, + unsigned long out, + unsigned long in) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&vblk->queue_wait, &wait, + TASK_UNINTERRUPTIBLE); + + spin_lock_irq(vblk->disk->queue->queue_lock); + if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, + GFP_ATOMIC) < 0) { + spin_unlock_irq(vblk->disk->queue->queue_lock); + io_schedule(); + } else { + virtqueue_kick(vblk->vq); + spin_unlock_irq(vblk->disk->queue->queue_lock); + break; + } + + } + + finish_wait(&vblk->queue_wait, &wait); +} + +static inline void virtblk_add_req(struct virtblk_req *vbr, + unsigned int out, unsigned int in) +{ + struct virtio_blk *vblk = vbr->vblk; + + spin_lock_irq(vblk->disk->queue->queue_lock); + if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, + GFP_ATOMIC) < 0)) { + spin_unlock_irq(vblk->disk->queue->queue_lock); + virtblk_add_buf_wait(vblk, vbr, out, in); + return; + } + virtqueue_kick(vblk->vq); + spin_unlock_irq(vblk->disk->queue->queue_lock); +} + +static int virtblk_bio_send_flush(struct virtblk_req *vbr) +{ + unsigned int out = 0, in = 0; + + vbr->flags |= VBLK_IS_FLUSH; + vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = 0; + sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); + sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); + + virtblk_add_req(vbr, out, in); + + return 0; +} + +static int virtblk_bio_send_data(struct virtblk_req *vbr) { + struct virtio_blk *vblk = vbr->vblk; + unsigned int num, out = 0, in = 0; + struct bio *bio = vbr->bio; + + vbr->flags &= ~VBLK_IS_FLUSH; + vbr->out_hdr.type = 0; + vbr->out_hdr.sector = bio->bi_sector; + vbr->out_hdr.ioprio = bio_prio(bio); + + sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); + + num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); + + sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, + sizeof(vbr->status)); + + if (num) { + if (bio->bi_rw & REQ_WRITE) { + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + out += num; + } else { + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + in += num; + } + } + + virtblk_add_req(vbr, out, in); + + return 0; +} + +static void virtblk_bio_send_data_work(struct work_struct *work) +{ + struct virtblk_req *vbr; + + vbr = container_of(work, struct virtblk_req, work); + + virtblk_bio_send_data(vbr); +} + +static void virtblk_bio_send_flush_work(struct work_struct *work) +{ + struct virtblk_req *vbr; + + vbr = container_of(work, struct virtblk_req, work); + + virtblk_bio_send_flush(vbr); +} + +static inline void virtblk_request_done(struct virtblk_req *vbr) +{ + struct virtio_blk *vblk = vbr->vblk; struct request *req = vbr->req; int error = virtblk_result(vbr); @@ -92,17 +226,47 @@ static inline void virtblk_request_done(struct virtio_blk *vblk, mempool_free(vbr, vblk->pool); } -static inline void virtblk_bio_done(struct virtio_blk *vblk, - struct virtblk_req *vbr) +static inline void virtblk_bio_flush_done(struct virtblk_req *vbr) { - bio_endio(vbr->bio, virtblk_result(vbr)); - mempool_free(vbr, vblk->pool); + struct virtio_blk *vblk = vbr->vblk; + + if (vbr->flags & VBLK_REQ_DATA) { + /* Send out the actual write data */ + INIT_WORK(&vbr->work, virtblk_bio_send_data_work); + queue_work(virtblk_wq, &vbr->work); + } else { + bio_endio(vbr->bio, virtblk_result(vbr)); + mempool_free(vbr, vblk->pool); + } +} + +static inline void virtblk_bio_data_done(struct virtblk_req *vbr) +{ + struct virtio_blk *vblk = vbr->vblk; + + if (unlikely(vbr->flags & VBLK_REQ_FUA)) { + /* Send out a flush before end the bio */ + vbr->flags &= ~VBLK_REQ_DATA; + INIT_WORK(&vbr->work, virtblk_bio_send_flush_work); + queue_work(virtblk_wq, &vbr->work); + } else { + bio_endio(vbr->bio, virtblk_result(vbr)); + mempool_free(vbr, vblk->pool); + } +} + +static inline void virtblk_bio_done(struct virtblk_req *vbr) +{ + if (unlikely(vbr->flags & VBLK_IS_FLUSH)) + virtblk_bio_flush_done(vbr); + else + virtblk_bio_data_done(vbr); } static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; - unsigned long bio_done = 0, req_done = 0; + bool bio_done = false, req_done = false; struct virtblk_req *vbr; unsigned long flags; unsigned int len; @@ -110,11 +274,11 @@ static void virtblk_done(struct virtqueue *vq) spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { if (vbr->bio) { - virtblk_bio_done(vblk, vbr); - bio_done++; + virtblk_bio_done(vbr); + bio_done = true; } else { - virtblk_request_done(vblk, vbr); - req_done++; + virtblk_request_done(vbr); + req_done = true; } } /* In case queue is stopped waiting for more buffers. */ @@ -126,18 +290,6 @@ static void virtblk_done(struct virtqueue *vq) wake_up(&vblk->queue_wait); } -static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, - gfp_t gfp_mask) -{ - struct virtblk_req *vbr; - - vbr = mempool_alloc(vblk->pool, gfp_mask); - if (vbr && use_bio) - sg_init_table(vbr->sg, vblk->sg_elems); - - return vbr; -} - static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { @@ -242,41 +394,12 @@ static void virtblk_request(struct request_queue *q) virtqueue_kick(vblk->vq); } -static void virtblk_add_buf_wait(struct virtio_blk *vblk, - struct virtblk_req *vbr, - unsigned long out, - unsigned long in) -{ - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - - spin_lock_irq(vblk->disk->queue->queue_lock); - if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - } else { - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); - break; - } - - } - - finish_wait(&vblk->queue_wait, &wait); -} - static void virtblk_make_request(struct request_queue *q, struct bio *bio) { struct virtio_blk *vblk = q->queuedata; - unsigned int num, out = 0, in = 0; struct virtblk_req *vbr; BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems); - BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA)); vbr = virtblk_alloc_req(vblk, GFP_NOIO); if (!vbr) { @@ -285,37 +408,18 @@ static void virtblk_make_request(struct request_queue *q, struct bio *bio) } vbr->bio = bio; - vbr->req = NULL; - vbr->out_hdr.type = 0; - vbr->out_hdr.sector = bio->bi_sector; - vbr->out_hdr.ioprio = bio_prio(bio); - - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - num = blk_bio_map_sg(q, bio, vbr->sg + out); - - sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - - if (num) { - if (bio->bi_rw & REQ_WRITE) { - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } - } - - spin_lock_irq(vblk->disk->queue->queue_lock); - if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0)) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - virtblk_add_buf_wait(vblk, vbr, out, in); - return; - } - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); + vbr->flags = 0; + if (bio->bi_rw & REQ_FLUSH) + vbr->flags |= VBLK_REQ_FLUSH; + if (bio->bi_rw & REQ_FUA) + vbr->flags |= VBLK_REQ_FUA; + if (bio->bi_size) + vbr->flags |= VBLK_REQ_DATA; + + if (unlikely(vbr->flags & VBLK_REQ_FLUSH)) + virtblk_bio_send_flush(vbr); + else + virtblk_bio_send_data(vbr); } /* return id (s/n) string for *disk to *id_str @@ -529,7 +633,7 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) u8 writeback = virtblk_get_cache_mode(vdev); struct virtio_blk *vblk = vdev->priv; - if (writeback && !use_bio) + if (writeback) blk_queue_flush(vblk->disk->queue, REQ_FLUSH); else blk_queue_flush(vblk->disk->queue, 0); -- cgit v1.2.3 From f22cf8eb485260ac6e32a614121d44998d83a69a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Sep 2012 15:32:53 +0300 Subject: virtio-blk: fix NULL checking in virtblk_alloc_req() Smatch complains about the inconsistent NULL checking here. Fix it to return NULL on failure. Signed-off-by: Dan Carpenter Signed-off-by: Rusty Russell (fixed accidental deletion) --- drivers/block/virtio_blk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2edfb5cef4f..53b81d59059 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -90,10 +90,12 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, struct virtblk_req *vbr; vbr = mempool_alloc(vblk->pool, gfp_mask); - if (vbr && use_bio) - sg_init_table(vbr->sg, vblk->sg_elems); + if (!vbr) + return NULL; vbr->vblk = vblk; + if (use_bio) + sg_init_table(vbr->sg, vblk->sg_elems); return vbr; } -- cgit v1.2.3 From 04679f34c141a0591e525d392929efc249440a7c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 28 Sep 2012 15:05:14 +0930 Subject: virtio-balloon: dependency fix Devices should depend on virtio, not select it. It's supposed to be selected by the particular driver, e.g. VIRTIO_PCI. Make balloon depend on VIRTIO and EXPERIMENTAL (to match description). Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index f38b17a86c3..271be8059b9 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -27,8 +27,7 @@ config VIRTIO_PCI config VIRTIO_BALLOON tristate "Virtio balloon driver (EXPERIMENTAL)" - select VIRTIO - select VIRTIO_RING + depends on EXPERIMENTAL && VIRTIO ---help--- This driver supports increasing and decreasing the amount of memory within a KVM guest. -- cgit v1.2.3 From 7a23eb28fa645f1f0c2ec38274c11bc78c50c047 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Sep 2012 15:05:14 +0930 Subject: virtio_balloon: not EXPERIMENTAL any more. It is not experimental in any vaguely-sane sense. Reported-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 271be8059b9..8e048518eeb 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -26,8 +26,8 @@ config VIRTIO_PCI If unsure, say M. config VIRTIO_BALLOON - tristate "Virtio balloon driver (EXPERIMENTAL)" - depends on EXPERIMENTAL && VIRTIO + tristate "Virtio balloon driver" + depends on VIRTIO ---help--- This driver supports increasing and decreasing the amount of memory within a KVM guest. -- cgit v1.2.3 From 17bb6d40880d4178f5f8a75900ed8c9ff47d3fb2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 28 Aug 2012 13:54:13 +0200 Subject: virtio-ring: move queue_index to vring_virtqueue Instead of storing the queue index in transport-specific virtio structs, this patch moves them to vring_virtqueue and introduces an helper to get the value. This lets drivers simplify their management and tracing of virtqueues. Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini Signed-off-by: Rusty Russell --- drivers/lguest/lguest_device.c | 2 +- drivers/remoteproc/remoteproc_virtio.c | 2 +- drivers/s390/kvm/kvm_virtio.c | 2 +- drivers/virtio/virtio_mmio.c | 12 ++++-------- drivers/virtio/virtio_pci.c | 13 +++++-------- drivers/virtio/virtio_ring.c | 14 +++++++++++++- include/linux/virtio.h | 2 ++ include/linux/virtio_ring.h | 3 ++- 8 files changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 9e8388efd88..ccb7dfb028f 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -296,7 +296,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu * barriers. */ - vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, + vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev, true, lvq->pages, lg_notify, callback, name); if (!vq) { err = -ENOMEM; diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index 3541b4492f6..343c1941c12 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -103,7 +103,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev, * Create the new vq, and tell virtio we're not interested in * the 'weak' smp barriers, since we're talking with a real device. */ - vq = vring_new_virtqueue(len, rvring->align, vdev, false, addr, + vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, addr, rproc_virtio_notify, callback, name); if (!vq) { dev_err(dev, "vring_new_virtqueue %s failed\n", name); diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index 47cccd52aae..5565af20592 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -198,7 +198,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, if (err) goto out; - vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN, + vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN, vdev, true, (void *) config->address, kvm_notify, callback, name); if (!vq) { diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 453db0c403d..008bf58bdaa 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -131,9 +131,6 @@ struct virtio_mmio_vq_info { /* the number of entries in the queue */ unsigned int num; - /* the index of the queue */ - int queue_index; - /* the virtual address of the ring queue */ void *queue; @@ -225,11 +222,10 @@ static void vm_reset(struct virtio_device *vdev) static void vm_notify(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); - struct virtio_mmio_vq_info *info = vq->priv; /* We write the queue's selector into the notification register to * signal the other end */ - writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + writel(virtqueue_get_queue_index(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); } /* Notify all virtqueues on an interrupt. */ @@ -270,6 +266,7 @@ static void vm_del_vq(struct virtqueue *vq) struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); struct virtio_mmio_vq_info *info = vq->priv; unsigned long flags, size; + unsigned int index = virtqueue_get_queue_index(vq); spin_lock_irqsave(&vm_dev->lock, flags); list_del(&info->node); @@ -278,7 +275,7 @@ static void vm_del_vq(struct virtqueue *vq) vring_del_virtqueue(vq); /* Select and deactivate the queue */ - writel(info->queue_index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); + writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); @@ -324,7 +321,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, err = -ENOMEM; goto error_kmalloc; } - info->queue_index = index; /* Allocate pages for the queue - start with a queue as big as * possible (limited by maximum size allowed by device), drop down @@ -356,7 +352,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); /* Create the vring */ - vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, + vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, true, info->queue, vm_notify, callback, name); if (!vq) { err = -ENOMEM; diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 2e03d416b9a..d902464b89c 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -79,9 +79,6 @@ struct virtio_pci_vq_info /* the number of entries in the queue */ int num; - /* the index of the queue */ - int queue_index; - /* the virtual address of the ring queue */ void *queue; @@ -202,11 +199,11 @@ static void vp_reset(struct virtio_device *vdev) static void vp_notify(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - struct virtio_pci_vq_info *info = vq->priv; /* we write the queue's selector into the notification register to * signal the other end */ - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); + iowrite16(virtqueue_get_queue_index(vq), + vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY); } /* Handle a configuration change: Tell driver if it wants to know. */ @@ -402,7 +399,6 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, if (!info) return ERR_PTR(-ENOMEM); - info->queue_index = index; info->num = num; info->msix_vector = msix_vec; @@ -418,7 +414,7 @@ static struct virtqueue *setup_vq(struct virtio_device *vdev, unsigned index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); /* create the vring */ - vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, vdev, + vq = vring_new_virtqueue(index, info->num, VIRTIO_PCI_VRING_ALIGN, vdev, true, info->queue, vp_notify, callback, name); if (!vq) { err = -ENOMEM; @@ -467,7 +463,8 @@ static void vp_del_vq(struct virtqueue *vq) list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + iowrite16(virtqueue_get_queue_index(vq), + vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); if (vp_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 5aa43c3392a..e639584b2db 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -106,6 +106,9 @@ struct vring_virtqueue /* How to notify other side. FIXME: commonalize hcalls! */ void (*notify)(struct virtqueue *vq); + /* Index of the queue */ + int queue_index; + #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; @@ -171,6 +174,13 @@ static int vring_add_indirect(struct vring_virtqueue *vq, return head; } +int virtqueue_get_queue_index(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + return vq->queue_index; +} +EXPORT_SYMBOL_GPL(virtqueue_get_queue_index); + /** * virtqueue_add_buf - expose buffer to other end * @vq: the struct virtqueue we're talking about. @@ -616,7 +626,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq) } EXPORT_SYMBOL_GPL(vring_interrupt); -struct virtqueue *vring_new_virtqueue(unsigned int num, +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, @@ -647,6 +658,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, vq->broken = false; vq->last_used_idx = 0; vq->num_added = 0; + vq->queue_index = index; list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index a1ba8bbd9fb..533b1157f22 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -50,6 +50,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq); unsigned int virtqueue_get_vring_size(struct virtqueue *vq); +int virtqueue_get_queue_index(struct virtqueue *vq); + /** * virtio_device - representation of a device using virtio * @index: unique position on the virtio bus diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index e338730c266..c2d793a06ad 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -165,7 +165,8 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old) struct virtio_device; struct virtqueue; -struct virtqueue *vring_new_virtqueue(unsigned int num, +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, -- cgit v1.2.3 From 75a0a52be3c27b58654fbed2c8f2ff401482b9a4 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 28 Aug 2012 13:54:14 +0200 Subject: virtio: introduce an API to set affinity for a virtqueue Sometimes, virtio device need to configure irq affinity hint to maximize the performance. Instead of just exposing the irq of a virtqueue, this patch introduce an API to set the affinity for a virtqueue. The api is best-effort, the affinity hint may not be set as expected due to platform support, irq sharing or irq type. Currently, only pci method were implemented and we set the affinity according to: - if device uses INTX, we just ignore the request - if device has per vq vector, we force the affinity hint - if the virtqueues share MSI, make the affinity OR over all affinities requested Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 46 +++++++++++++++++++++++++++++++++++++++++++ include/linux/virtio_config.h | 21 ++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index d902464b89c..f5dfe6bdb95 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -48,6 +48,7 @@ struct virtio_pci_device int msix_enabled; int intx_enabled; struct msix_entry *msix_entries; + cpumask_var_t *msix_affinity_masks; /* Name strings for interrupts. This size should be enough, * and I'm too lazy to allocate each name separately. */ char (*msix_names)[256]; @@ -276,6 +277,10 @@ static void vp_free_vectors(struct virtio_device *vdev) for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(vp_dev->msix_entries[i].vector, vp_dev); + for (i = 0; i < vp_dev->msix_vectors; i++) + if (vp_dev->msix_affinity_masks[i]) + free_cpumask_var(vp_dev->msix_affinity_masks[i]); + if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ iowrite16(VIRTIO_MSI_NO_VECTOR, @@ -293,6 +298,8 @@ static void vp_free_vectors(struct virtio_device *vdev) vp_dev->msix_names = NULL; kfree(vp_dev->msix_entries); vp_dev->msix_entries = NULL; + kfree(vp_dev->msix_affinity_masks); + vp_dev->msix_affinity_masks = NULL; } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, @@ -311,6 +318,15 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, GFP_KERNEL); if (!vp_dev->msix_names) goto error; + vp_dev->msix_affinity_masks + = kzalloc(nvectors * sizeof *vp_dev->msix_affinity_masks, + GFP_KERNEL); + if (!vp_dev->msix_affinity_masks) + goto error; + for (i = 0; i < nvectors; ++i) + if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], + GFP_KERNEL)) + goto error; for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; @@ -606,6 +622,35 @@ static const char *vp_bus_name(struct virtio_device *vdev) return pci_name(vp_dev->pci_dev); } +/* Setup the affinity for a virtqueue: + * - force the affinity for per vq vector + * - OR over all affinities for shared MSI + * - ignore the affinity request if we're using INTX + */ +static int vp_set_vq_affinity(struct virtqueue *vq, int cpu) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_vq_info *info = vq->priv; + struct cpumask *mask; + unsigned int irq; + + if (!vq->callback) + return -EINVAL; + + if (vp_dev->msix_enabled) { + mask = vp_dev->msix_affinity_masks[info->msix_vector]; + irq = vp_dev->msix_entries[info->msix_vector].vector; + if (cpu == -1) + irq_set_affinity_hint(irq, NULL); + else { + cpumask_set_cpu(cpu, mask); + irq_set_affinity_hint(irq, mask); + } + } + return 0; +} + static struct virtio_config_ops virtio_pci_config_ops = { .get = vp_get, .set = vp_set, @@ -617,6 +662,7 @@ static struct virtio_config_ops virtio_pci_config_ops = { .get_features = vp_get_features, .finalize_features = vp_finalize_features, .bus_name = vp_bus_name, + .set_vq_affinity = vp_set_vq_affinity, }; static void virtio_pci_release_dev(struct device *_d) diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index fc457f452f6..2c4a9895379 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -98,6 +98,7 @@ * vdev: the virtio_device * This returns a pointer to the bus name a la pci_name from which * the caller can then copy. + * @set_vq_affinity: set the affinity for a virtqueue. */ typedef void vq_callback_t(struct virtqueue *); struct virtio_config_ops { @@ -116,6 +117,7 @@ struct virtio_config_ops { u32 (*get_features)(struct virtio_device *vdev); void (*finalize_features)(struct virtio_device *vdev); const char *(*bus_name)(struct virtio_device *vdev); + int (*set_vq_affinity)(struct virtqueue *vq, int cpu); }; /* If driver didn't advertise the feature, it will never appear. */ @@ -190,5 +192,24 @@ const char *virtio_bus_name(struct virtio_device *vdev) return vdev->config->bus_name(vdev); } +/** + * virtqueue_set_affinity - setting affinity for a virtqueue + * @vq: the virtqueue + * @cpu: the cpu no. + * + * Pay attention the function are best-effort: the affinity hint may not be set + * due to config support, irq type and sharing. + * + */ +static inline +int virtqueue_set_affinity(struct virtqueue *vq, int cpu) +{ + struct virtio_device *vdev = vq->vdev; + if (vdev->config->set_vq_affinity) + return vdev->config->set_vq_affinity(vq, cpu); + return 0; +} + + #endif /* __KERNEL__ */ #endif /* _LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3 From 6457f126c888b3481fdae6f702e616cd0c79646e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 5 Sep 2012 21:47:45 +0300 Subject: virtio: support reserved vqs virtio network device multiqueue support reserves vq 3 for future use (useful both for future extensions and to make it pretty - this way receive vqs have even and transmit - odd numbers). Make it possible to skip initialization for specific vq numbers by specifying NULL for name. Document this usage as well as (existing) NULL callback. Drivers using this not coded up yet, so I simply tested with virtio-pci and verified that this patch does not break existing drivers. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/lguest/lguest_device.c | 3 +++ drivers/remoteproc/remoteproc_virtio.c | 3 +++ drivers/s390/kvm/kvm_virtio.c | 3 +++ drivers/virtio/virtio_mmio.c | 3 +++ drivers/virtio/virtio_pci.c | 5 ++++- include/linux/virtio_config.h | 2 ++ 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index ccb7dfb028f..fc92ccbd71d 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -263,6 +263,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, struct virtqueue *vq; int err; + if (!name) + return NULL; + /* We must have this many virtqueues. */ if (index >= ldev->desc->num_vq) return ERR_PTR(-ENOENT); diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index 343c1941c12..e7a4780e93d 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -84,6 +84,9 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev, if (id >= ARRAY_SIZE(rvdev->vring)) return ERR_PTR(-EINVAL); + if (!name) + return NULL; + ret = rproc_alloc_vring(rvdev, id); if (ret) return ERR_PTR(ret); diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c index 5565af20592..7dabef624da 100644 --- a/drivers/s390/kvm/kvm_virtio.c +++ b/drivers/s390/kvm/kvm_virtio.c @@ -190,6 +190,9 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, if (index >= kdev->desc->num_vq) return ERR_PTR(-ENOENT); + if (!name) + return NULL; + config = kvm_vq_config(kdev->desc)+index; err = vmem_add_mapping(config->address, diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 008bf58bdaa..5d7fee385b7 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -306,6 +306,9 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, unsigned long flags, size; int err; + if (!name) + return NULL; + /* Select the queue we're interested in */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index f5dfe6bdb95..42b20769d12 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -555,7 +555,10 @@ static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { - if (!callbacks[i] || !vp_dev->msix_enabled) + if (!names[i]) { + vqs[i] = NULL; + continue; + } else if (!callbacks[i] || !vp_dev->msix_enabled) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vp_dev->per_vq_vectors) msix_vec = allocated_vectors++; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 2c4a9895379..e2850a7ea27 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -84,7 +84,9 @@ * nvqs: the number of virtqueues to find * vqs: on success, includes new virtqueues * callbacks: array of callbacks, for each virtqueue + * include a NULL entry for vqs that do not need a callback * names: array of virtqueue names (mainly for debugging) + * include a NULL entry for vqs unused by driver * Returns 0 on success or error status * @del_vqs: free virtqueues found by find_vqs(). * @get_features: get the array of feature bits for this device. -- cgit v1.2.3 From 387daf1716da579fc1d198414d890967cd624a5b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Sep 2012 15:05:15 +0930 Subject: virtio: add help to CONFIG_VIRTIO option. Trying to enable a virtio driver (eg CONFIG_VIRTIO_BLK) is painful because it depends on CONFIG_VIRTIO. CONFIG_VIRTIO doesn't tell you how to turn it on (it's selected from anything which provides a virtio bus). This patch at least adds some documentation, visible in menuconfig, as a hint. Reported-by: Kent Overstreet Signed-off-by: Rusty Russell --- drivers/virtio/Kconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 8e048518eeb..eabc2a0cd98 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -1,8 +1,10 @@ -# Virtio always gets selected by whoever wants it. config VIRTIO tristate + ---help--- + This option is selected by any driver which implements the virtio + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST, + CONFIG_RPMSG or CONFIG_S390_GUEST. -# Similarly the virtio ring implementation. config VIRTIO_RING tristate depends on VIRTIO -- cgit v1.2.3 From eccbb05a64fef867362ff05b5d266757e3c82b36 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 28 Sep 2012 15:05:15 +0930 Subject: virtio: remove CONFIG_VIRTIO_RING Everyone who selects VIRTIO is also made to select VIRTIO_RING; just make them synonymous, since we removed the indirection layer some time ago. Signed-off-by: Rusty Russell --- arch/s390/Kconfig | 1 - arch/x86/lguest/Kconfig | 1 - drivers/rpmsg/Kconfig | 1 - drivers/virtio/Kconfig | 6 ------ drivers/virtio/Makefile | 3 +-- 5 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 107610e01a2..9faf3d9b26c 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -587,7 +587,6 @@ config S390_GUEST depends on 64BIT && EXPERIMENTAL select VIRTUALIZATION select VIRTIO - select VIRTIO_RING select VIRTIO_CONSOLE help Select this option if you want to run the kernel as a guest under diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 6e121a2a49e..7872a3330fb 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -4,7 +4,6 @@ config LGUEST_GUEST depends on X86_32 select VIRTUALIZATION select VIRTIO - select VIRTIO_RING select VIRTIO_CONSOLE help Lguest is a tiny in-kernel hypervisor. Selecting this will diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index 32aead65735..2bd911f1257 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -4,7 +4,6 @@ menu "Rpmsg drivers (EXPERIMENTAL)" config RPMSG tristate select VIRTIO - select VIRTIO_RING depends on EXPERIMENTAL endmenu diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index eabc2a0cd98..8d5bddb56cb 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -5,17 +5,12 @@ config VIRTIO bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST, CONFIG_RPMSG or CONFIG_S390_GUEST. -config VIRTIO_RING - tristate - depends on VIRTIO - menu "Virtio drivers" config VIRTIO_PCI tristate "PCI driver for virtio devices (EXPERIMENTAL)" depends on PCI && EXPERIMENTAL select VIRTIO - select VIRTIO_RING ---help--- This drivers provides support for virtio based paravirtual device drivers over PCI. This requires that your VMM has appropriate PCI @@ -40,7 +35,6 @@ config VIRTIO_BALLOON tristate "Platform bus driver for memory mapped virtio devices (EXPERIMENTAL)" depends on HAS_IOMEM && EXPERIMENTAL select VIRTIO - select VIRTIO_RING ---help--- This drivers provides support for memory mapped virtio platform device driver. diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 5a4c63cfd38..9076635697b 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,5 +1,4 @@ -obj-$(CONFIG_VIRTIO) += virtio.o -obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o +obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o -- cgit v1.2.3 From 5543a6ac31eb4bfd8d938db6b234ce833d14e04e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 28 Sep 2012 15:05:16 +0930 Subject: virtio: don't crash when device is buggy Because of a sanity check in virtio_dev_remove, a buggy device can crash kernel. And in case of rproc it's userspace so it's not a good idea. We are unloading a driver so how bad can it be? Be less aggressive in handling this error: if it's a driver bug, warning once should be enough. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index c3b3f7f0d9d..1e8659ca27e 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -159,7 +159,7 @@ static int virtio_dev_remove(struct device *_d) drv->remove(dev); /* Driver should have reset device. */ - BUG_ON(dev->config->get_status(dev)); + WARN_ON_ONCE(dev->config->get_status(dev)); /* Acknowledge the device's existence again. */ add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); -- cgit v1.2.3 From 74a74b376c997645b32a4fdf8e76705a00ae097a Mon Sep 17 00:00:00 2001 From: Peter Senna Tschudin Date: Mon, 17 Sep 2012 19:31:17 +0200 Subject: drivers/virtio/virtio_pci.c: fix error return code Convert a nonnegative error return code to a negative one, as returned elsewhere in the function. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Peter Senna Tschudin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index 42b20769d12..c33aea36598 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -719,8 +719,10 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev, goto out_enable_device; vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); - if (vp_dev->ioaddr == NULL) + if (vp_dev->ioaddr == NULL) { + err = -ENOMEM; goto out_req_regions; + } pci_set_drvdata(pci_dev, vp_dev); pci_set_master(pci_dev); -- cgit v1.2.3 From 3850d29fc40f3494a3e9c3aac45b6afe53526449 Mon Sep 17 00:00:00 2001 From: Brian Foley Date: Mon, 24 Sep 2012 14:33:41 +0100 Subject: virtio_mmio: fix off by one error allocating queue vm_setup_vq fails to allow VirtQueues needing only 2 pages of storage, as it should. Found with a kernel using 64kB pages, but can be provoked if a virtio device reports QueueNumMax where the descriptor table and available ring fit in one page, and the used ring on the second (<= 227 descriptors with 4kB pages and <= 3640 with 64kB pages.) Signed-off-by: Brian Foley Signed-off-by: Pawel Moll Signed-off-by: Rusty Russell --- drivers/virtio/virtio_mmio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 5d7fee385b7..09edeecd42a 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -334,8 +334,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, while (1) { size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); - /* Already smallest possible allocation? */ - if (size <= VIRTIO_MMIO_VRING_ALIGN * 2) { + /* Did the last iter shrink the queue below minimum size? */ + if (size < VIRTIO_MMIO_VRING_ALIGN * 2) { err = -ENOMEM; goto error_alloc_pages; } -- cgit v1.2.3 From d78b519f6b945aef6202bbb5b56f928572e15165 Mon Sep 17 00:00:00 2001 From: Brian Foley Date: Mon, 24 Sep 2012 14:33:42 +0100 Subject: virtio_mmio: Don't attempt to create empty virtqueues If a virtio device reports a QueueNumMax of 0, vring_new_virtqueue() doesn't check this, and thanks to an unsigned (i < num - 1) loop guard, scribbles over memory when initialising the free list. Avoid by not trying to create zero-descriptor queues, as there's no way to do any I/O with one. Signed-off-by: Brian Foley Signed-off-by: Pawel Moll Signed-off-by: Rusty Russell --- drivers/virtio/virtio_mmio.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 09edeecd42a..6b1b7e18493 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -331,6 +331,16 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, * and two rings (which makes it "alignment_size * 2") */ info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); + + /* If the device reports a 0 entry queue, we won't be able to + * use it to perform I/O, and vring_new_virtqueue() can't create + * empty queues anyway, so don't bother to set up the device. + */ + if (info->num == 0) { + err = -ENOENT; + goto error_alloc_pages; + } + while (1) { size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); -- cgit v1.2.3 From bb8111086c12ebdadc0544ba04dccd3aad212ad2 Mon Sep 17 00:00:00 2001 From: Asias He Date: Tue, 25 Sep 2012 10:36:17 +0800 Subject: virtio-blk: Disable callback in virtblk_done() This reduces unnecessary interrupts that host could send to guest while guest is in the progress of irq handling. If one vcpu is handling the irq, while another interrupt comes, in handle_edge_irq(), the guest will mask the interrupt via mask_msi_irq() which is a very heavy operation that goes all the way down to host. Here are some performance numbers on qemu: Before: ------------------------------------- seq-read : io=0 B, bw=269730KB/s, iops=67432 , runt= 62200msec seq-write : io=0 B, bw=339716KB/s, iops=84929 , runt= 49386msec rand-read : io=0 B, bw=270435KB/s, iops=67608 , runt= 62038msec rand-write: io=0 B, bw=354436KB/s, iops=88608 , runt= 47335msec clat (usec): min=101 , max=138052 , avg=14822.09, stdev=11771.01 clat (usec): min=96 , max=81543 , avg=11798.94, stdev=7735.60 clat (usec): min=128 , max=140043 , avg=14835.85, stdev=11765.33 clat (usec): min=109 , max=147207 , avg=11337.09, stdev=5990.35 cpu : usr=15.93%, sys=60.37%, ctx=7764972, majf=0, minf=54 cpu : usr=32.73%, sys=120.49%, ctx=7372945, majf=0, minf=1 cpu : usr=18.84%, sys=58.18%, ctx=7775420, majf=0, minf=1 cpu : usr=24.20%, sys=59.85%, ctx=8307886, majf=0, minf=0 vdb: ios=8389107/8368136, merge=0/0, ticks=19457874/14616506, in_queue=34206098, util=99.68% 43: interrupt in total: 887320 fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting --ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16 --ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0 --filename=/dev/vdb --name=seq-read --stonewall --rw=read --name=seq-write --stonewall --rw=write --name=rnd-read --stonewall --rw=randread --name=rnd-write --stonewall --rw=randwrite After: ------------------------------------- seq-read : io=0 B, bw=309503KB/s, iops=77375 , runt= 54207msec seq-write : io=0 B, bw=448205KB/s, iops=112051 , runt= 37432msec rand-read : io=0 B, bw=311254KB/s, iops=77813 , runt= 53902msec rand-write: io=0 B, bw=377152KB/s, iops=94287 , runt= 44484msec clat (usec): min=81 , max=90588 , avg=12946.06, stdev=9085.94 clat (usec): min=57 , max=72264 , avg=8967.97, stdev=5951.04 clat (usec): min=29 , max=101046 , avg=12889.95, stdev=9067.91 clat (usec): min=52 , max=106152 , avg=10660.56, stdev=4778.19 cpu : usr=15.05%, sys=57.92%, ctx=7710941, majf=0, minf=54 cpu : usr=26.78%, sys=101.40%, ctx=7387891, majf=0, minf=2 cpu : usr=19.03%, sys=58.17%, ctx=7681976, majf=0, minf=8 cpu : usr=24.65%, sys=58.34%, ctx=8442632, majf=0, minf=4 vdb: ios=8389086/8361888, merge=0/0, ticks=17243780/12742010, in_queue=30078377, util=99.59% 43: interrupt in total: 1259639 fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting --ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16 --ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0 --filename=/dev/vdb --name=seq-read --stonewall --rw=read --name=seq-write --stonewall --rw=write --name=rnd-read --stonewall --rw=randread --name=rnd-write --stonewall --rw=randwrite Signed-off-by: Asias He Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 53b81d59059..0bdde8fba39 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -274,15 +274,18 @@ static void virtblk_done(struct virtqueue *vq) unsigned int len; spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); - while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { - if (vbr->bio) { - virtblk_bio_done(vbr); - bio_done = true; - } else { - virtblk_request_done(vbr); - req_done = true; + do { + virtqueue_disable_cb(vq); + while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { + if (vbr->bio) { + virtblk_bio_done(vbr); + bio_done = true; + } else { + virtblk_request_done(vbr); + req_done = true; + } } - } + } while (!virtqueue_enable_cb(vq)); /* In case queue is stopped waiting for more buffers. */ if (req_done) blk_start_queue(vblk->disk->queue); -- cgit v1.2.3 From ca16f580a5db7e60bfafe59a50bb133bd3347491 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 4 Oct 2012 12:03:25 +0930 Subject: lguest: fix occasional crash in example launcher. We usually got away with ->next on the final entry being NULL, but it finally bit me. Signed-off-by: Rusty Russell Cc: stable@kernel.org --- tools/lguest/lguest.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index f759f4f097c..fd2f9221b24 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -1299,6 +1299,7 @@ static struct device *new_device(const char *name, u16 type) dev->feature_len = 0; dev->num_vq = 0; dev->running = false; + dev->next = NULL; /* * Append to device list. Prepending to a single-linked list is -- cgit v1.2.3