diff options
Diffstat (limited to 'util')
-rw-r--r-- | util/Makefile.objs | 18 | ||||
-rw-r--r-- | util/bitmap.c | 2 | ||||
-rw-r--r-- | util/buffer.c | 171 | ||||
-rw-r--r-- | util/coroutine-gthread.c | 198 | ||||
-rw-r--r-- | util/coroutine-sigaltstack.c | 293 | ||||
-rw-r--r-- | util/coroutine-ucontext.c | 194 | ||||
-rw-r--r-- | util/coroutine-win32.c | 104 | ||||
-rw-r--r-- | util/cutils.c | 203 | ||||
-rw-r--r-- | util/error.c | 180 | ||||
-rw-r--r-- | util/event_notifier-posix.c | 2 | ||||
-rw-r--r-- | util/id.c | 37 | ||||
-rw-r--r-- | util/memfd.c | 162 | ||||
-rw-r--r-- | util/mmap-alloc.c | 110 | ||||
-rw-r--r-- | util/osdep.c | 17 | ||||
-rw-r--r-- | util/oslib-posix.c | 115 | ||||
-rw-r--r-- | util/oslib-win32.c | 17 | ||||
-rw-r--r-- | util/qemu-coroutine-io.c | 91 | ||||
-rw-r--r-- | util/qemu-coroutine-lock.c | 186 | ||||
-rw-r--r-- | util/qemu-coroutine-sleep.c | 41 | ||||
-rw-r--r-- | util/qemu-coroutine.c | 150 | ||||
-rw-r--r-- | util/qemu-error.c | 2 | ||||
-rw-r--r-- | util/qemu-option.c | 45 | ||||
-rw-r--r-- | util/qemu-sockets.c | 212 | ||||
-rw-r--r-- | util/qemu-thread-posix.c | 13 | ||||
-rw-r--r-- | util/qemu-thread-win32.c | 66 | ||||
-rw-r--r-- | util/rcu.c | 48 | ||||
-rw-r--r-- | util/throttle.c | 15 | ||||
-rw-r--r-- | util/timed-average.c | 231 |
28 files changed, 2709 insertions, 214 deletions
diff --git a/util/Makefile.objs b/util/Makefile.objs index 114d6578c4..89dd80ef86 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -1,13 +1,20 @@ util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o -util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o event_notifier-win32.o -util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o event_notifier-posix.o qemu-openpty.o +util-obj-$(CONFIG_POSIX) += compatfd.o +util-obj-$(CONFIG_POSIX) += event_notifier-posix.o +util-obj-$(CONFIG_POSIX) += mmap-alloc.o +util-obj-$(CONFIG_POSIX) += oslib-posix.o +util-obj-$(CONFIG_POSIX) += qemu-openpty.o +util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o +util-obj-$(CONFIG_WIN32) += event_notifier-win32.o +util-obj-$(CONFIG_POSIX) += memfd.o +util-obj-$(CONFIG_WIN32) += oslib-win32.o +util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o util-obj-y += envlist.o path.o module.o util-obj-$(call lnot,$(CONFIG_INT128)) += host-utils.o util-obj-y += bitmap.o bitops.o hbitmap.o util-obj-y += fifo8.o util-obj-y += acl.o util-obj-y += error.o qemu-error.o -util-obj-$(CONFIG_POSIX) += compatfd.o util-obj-y += id.o util-obj-y += iov.o qemu-config.o qemu-sockets.o uri.o notify.o util-obj-y += qemu-option.o qemu-progress.o @@ -18,3 +25,8 @@ util-obj-y += getauxval.o util-obj-y += readline.o util-obj-y += rfifolock.o util-obj-y += rcu.o +util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o +util-obj-y += qemu-coroutine-sleep.o +util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o +util-obj-y += buffer.o +util-obj-y += timed-average.o diff --git a/util/bitmap.c b/util/bitmap.c index 300a68e38c..44f0f481be 100644 --- a/util/bitmap.c +++ b/util/bitmap.c @@ -14,7 +14,7 @@ #include "qemu/atomic.h" /* - * bitmaps provide an array of bits, implemented using an an + * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. diff --git a/util/buffer.c b/util/buffer.c new file mode 100644 index 0000000000..8b27c08aac --- /dev/null +++ b/util/buffer.c @@ -0,0 +1,171 @@ +/* + * QEMU generic buffers + * + * Copyright (c) 2015 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/buffer.h" +#include "trace.h" + +#define BUFFER_MIN_INIT_SIZE 4096 +#define BUFFER_MIN_SHRINK_SIZE 65536 + +/* define the factor alpha for the expentional smoothing + * that is used in the average size calculation. a shift + * of 7 results in an alpha of 1/2^7. */ +#define BUFFER_AVG_SIZE_SHIFT 7 + +static size_t buffer_req_size(Buffer *buffer, size_t len) +{ + return MAX(BUFFER_MIN_INIT_SIZE, + pow2ceil(buffer->offset + len)); +} + +static void buffer_adj_size(Buffer *buffer, size_t len) +{ + size_t old = buffer->capacity; + buffer->capacity = buffer_req_size(buffer, len); + buffer->buffer = g_realloc(buffer->buffer, buffer->capacity); + trace_buffer_resize(buffer->name ?: "unnamed", + old, buffer->capacity); + + /* make it even harder for the buffer to shrink, reset average size + * to currenty capacity if it is larger than the average. */ + buffer->avg_size = MAX(buffer->avg_size, + buffer->capacity << BUFFER_AVG_SIZE_SHIFT); +} + +void buffer_init(Buffer *buffer, const char *name, ...) +{ + va_list ap; + + va_start(ap, name); + buffer->name = g_strdup_vprintf(name, ap); + va_end(ap); +} + +static uint64_t buffer_get_avg_size(Buffer *buffer) +{ + return buffer->avg_size >> BUFFER_AVG_SIZE_SHIFT; +} + +void buffer_shrink(Buffer *buffer) +{ + size_t new; + + /* Calculate the average size of the buffer as + * avg_size = avg_size * ( 1 - a ) + required_size * a + * where a is 1 / 2 ^ BUFFER_AVG_SIZE_SHIFT. */ + buffer->avg_size *= (1 << BUFFER_AVG_SIZE_SHIFT) - 1; + buffer->avg_size >>= BUFFER_AVG_SIZE_SHIFT; + buffer->avg_size += buffer_req_size(buffer, 0); + + /* And then only shrink if the average size of the buffer is much + * too big, to avoid bumping up & down the buffers all the time. + * realloc() isn't exactly cheap ... */ + new = buffer_req_size(buffer, buffer_get_avg_size(buffer)); + if (new < buffer->capacity >> 3 && + new >= BUFFER_MIN_SHRINK_SIZE) { + buffer_adj_size(buffer, buffer_get_avg_size(buffer)); + } + + buffer_adj_size(buffer, 0); +} + +void buffer_reserve(Buffer *buffer, size_t len) +{ + if ((buffer->capacity - buffer->offset) < len) { + buffer_adj_size(buffer, len); + } +} + +gboolean buffer_empty(Buffer *buffer) +{ + return buffer->offset == 0; +} + +uint8_t *buffer_end(Buffer *buffer) +{ + return buffer->buffer + buffer->offset; +} + +void buffer_reset(Buffer *buffer) +{ + buffer->offset = 0; + buffer_shrink(buffer); +} + +void buffer_free(Buffer *buffer) +{ + trace_buffer_free(buffer->name ?: "unnamed", buffer->capacity); + g_free(buffer->buffer); + g_free(buffer->name); + buffer->offset = 0; + buffer->capacity = 0; + buffer->buffer = NULL; + buffer->name = NULL; +} + +void buffer_append(Buffer *buffer, const void *data, size_t len) +{ + memcpy(buffer->buffer + buffer->offset, data, len); + buffer->offset += len; +} + +void buffer_advance(Buffer *buffer, size_t len) +{ + memmove(buffer->buffer, buffer->buffer + len, + (buffer->offset - len)); + buffer->offset -= len; + buffer_shrink(buffer); +} + +void buffer_move_empty(Buffer *to, Buffer *from) +{ + trace_buffer_move_empty(to->name ?: "unnamed", + from->offset, + from->name ?: "unnamed"); + assert(to->offset == 0); + + g_free(to->buffer); + to->offset = from->offset; + to->capacity = from->capacity; + to->buffer = from->buffer; + + from->offset = 0; + from->capacity = 0; + from->buffer = NULL; +} + +void buffer_move(Buffer *to, Buffer *from) +{ + if (to->offset == 0) { + buffer_move_empty(to, from); + return; + } + + trace_buffer_move(to->name ?: "unnamed", + from->offset, + from->name ?: "unnamed"); + buffer_reserve(to, from->offset); + buffer_append(to, from->buffer, from->offset); + + g_free(from->buffer); + from->offset = 0; + from->capacity = 0; + from->buffer = NULL; +} diff --git a/util/coroutine-gthread.c b/util/coroutine-gthread.c new file mode 100644 index 0000000000..0bcd77867d --- /dev/null +++ b/util/coroutine-gthread.c @@ -0,0 +1,198 @@ +/* + * GThread coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> + * Copyright (C) 2011 Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <glib.h> +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +typedef struct { + Coroutine base; + GThread *thread; + bool runnable; + bool free_on_thread_exit; + CoroutineAction action; +} CoroutineGThread; + +static CompatGMutex coroutine_lock; +static CompatGCond coroutine_cond; + +/* GLib 2.31 and beyond deprecated various parts of the thread API, + * but the new interfaces are not available in older GLib versions + * so we have to cope with both. + */ +#if GLIB_CHECK_VERSION(2, 31, 0) +/* Awkwardly, the GPrivate API doesn't provide a way to update the + * GDestroyNotify handler for the coroutine key dynamically. So instead + * we track whether or not the CoroutineGThread should be freed on + * thread exit / coroutine key update using the free_on_thread_exit + * field. + */ +static void coroutine_destroy_notify(gpointer data) +{ + CoroutineGThread *co = data; + if (co && co->free_on_thread_exit) { + g_free(co); + } +} + +static GPrivate coroutine_key = G_PRIVATE_INIT(coroutine_destroy_notify); + +static inline CoroutineGThread *get_coroutine_key(void) +{ + return g_private_get(&coroutine_key); +} + +static inline void set_coroutine_key(CoroutineGThread *co, + bool free_on_thread_exit) +{ + /* Unlike g_static_private_set() this does not call the GDestroyNotify + * if the previous value of the key was NULL. Fortunately we only need + * the GDestroyNotify in the non-NULL key case. + */ + co->free_on_thread_exit = free_on_thread_exit; + g_private_replace(&coroutine_key, co); +} + +static inline GThread *create_thread(GThreadFunc func, gpointer data) +{ + return g_thread_new("coroutine", func, data); +} + +#else + +/* Handle older GLib versions */ + +static GStaticPrivate coroutine_key = G_STATIC_PRIVATE_INIT; + +static inline CoroutineGThread *get_coroutine_key(void) +{ + return g_static_private_get(&coroutine_key); +} + +static inline void set_coroutine_key(CoroutineGThread *co, + bool free_on_thread_exit) +{ + g_static_private_set(&coroutine_key, co, + free_on_thread_exit ? (GDestroyNotify)g_free : NULL); +} + +static inline GThread *create_thread(GThreadFunc func, gpointer data) +{ + return g_thread_create_full(func, data, 0, TRUE, TRUE, + G_THREAD_PRIORITY_NORMAL, NULL); +} + +#endif + + +static void __attribute__((constructor)) coroutine_init(void) +{ +#if !GLIB_CHECK_VERSION(2, 31, 0) + if (!g_thread_supported()) { + g_thread_init(NULL); + } +#endif +} + +static void coroutine_wait_runnable_locked(CoroutineGThread *co) +{ + while (!co->runnable) { + g_cond_wait(&coroutine_cond, &coroutine_lock); + } +} + +static void coroutine_wait_runnable(CoroutineGThread *co) +{ + g_mutex_lock(&coroutine_lock); + coroutine_wait_runnable_locked(co); + g_mutex_unlock(&coroutine_lock); +} + +static gpointer coroutine_thread(gpointer opaque) +{ + CoroutineGThread *co = opaque; + + set_coroutine_key(co, false); + coroutine_wait_runnable(co); + co->base.entry(co->base.entry_arg); + qemu_coroutine_switch(&co->base, co->base.caller, COROUTINE_TERMINATE); + return NULL; +} + +Coroutine *qemu_coroutine_new(void) +{ + CoroutineGThread *co; + + co = g_malloc0(sizeof(*co)); + co->thread = create_thread(coroutine_thread, co); + if (!co->thread) { + g_free(co); + return NULL; + } + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineGThread *co = DO_UPCAST(CoroutineGThread, base, co_); + + g_thread_join(co->thread); + g_free(co); +} + +CoroutineAction qemu_coroutine_switch(Coroutine *from_, + Coroutine *to_, + CoroutineAction action) +{ + CoroutineGThread *from = DO_UPCAST(CoroutineGThread, base, from_); + CoroutineGThread *to = DO_UPCAST(CoroutineGThread, base, to_); + + g_mutex_lock(&coroutine_lock); + from->runnable = false; + from->action = action; + to->runnable = true; + to->action = action; + g_cond_broadcast(&coroutine_cond); + + if (action != COROUTINE_TERMINATE) { + coroutine_wait_runnable_locked(from); + } + g_mutex_unlock(&coroutine_lock); + return from->action; +} + +Coroutine *qemu_coroutine_self(void) +{ + CoroutineGThread *co = get_coroutine_key(); + if (!co) { + co = g_malloc0(sizeof(*co)); + co->runnable = true; + set_coroutine_key(co, true); + } + + return &co->base; +} + +bool qemu_in_coroutine(void) +{ + CoroutineGThread *co = get_coroutine_key(); + + return co && co->base.caller; +} diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c new file mode 100644 index 0000000000..39842a4a90 --- /dev/null +++ b/util/coroutine-sigaltstack.c @@ -0,0 +1,293 @@ +/* + * sigaltstack coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> + * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com> + * Copyright (C) 2012 Alex Barcelo <abarcelo@ac.upc.edu> +** This file is partly based on pth_mctx.c, from the GNU Portable Threads +** Copyright (c) 1999-2006 Ralf S. Engelschall <rse@engelschall.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */ +#ifdef _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif +#include <stdlib.h> +#include <setjmp.h> +#include <stdint.h> +#include <pthread.h> +#include <signal.h> +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +typedef struct { + Coroutine base; + void *stack; + sigjmp_buf env; +} CoroutineUContext; + +/** + * Per-thread coroutine bookkeeping + */ +typedef struct { + /** Currently executing coroutine */ + Coroutine *current; + + /** The default coroutine */ + CoroutineUContext leader; + + /** Information for the signal handler (trampoline) */ + sigjmp_buf tr_reenter; + volatile sig_atomic_t tr_called; + void *tr_handler; +} CoroutineThreadState; + +static pthread_key_t thread_state_key; + +static CoroutineThreadState *coroutine_get_thread_state(void) +{ + CoroutineThreadState *s = pthread_getspecific(thread_state_key); + + if (!s) { + s = g_malloc0(sizeof(*s)); + s->current = &s->leader.base; + pthread_setspecific(thread_state_key, s); + } + return s; +} + +static void qemu_coroutine_thread_cleanup(void *opaque) +{ + CoroutineThreadState *s = opaque; + + g_free(s); +} + +static void __attribute__((constructor)) coroutine_init(void) +{ + int ret; + + ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup); + if (ret != 0) { + fprintf(stderr, "unable to create leader key: %s\n", strerror(errno)); + abort(); + } +} + +/* "boot" function + * This is what starts the coroutine, is called from the trampoline + * (from the signal handler when it is not signal handling, read ahead + * for more information). + */ +static void coroutine_bootstrap(CoroutineUContext *self, Coroutine *co) +{ + /* Initialize longjmp environment and switch back the caller */ + if (!sigsetjmp(self->env, 0)) { + siglongjmp(*(sigjmp_buf *)co->entry_arg, 1); + } + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +/* + * This is used as the signal handler. This is called with the brand new stack + * (thanks to sigaltstack). We have to return, given that this is a signal + * handler and the sigmask and some other things are changed. + */ +static void coroutine_trampoline(int signal) +{ + CoroutineUContext *self; + Coroutine *co; + CoroutineThreadState *coTS; + + /* Get the thread specific information */ + coTS = coroutine_get_thread_state(); + self = coTS->tr_handler; + coTS->tr_called = 1; + co = &self->base; + + /* + * Here we have to do a bit of a ping pong between the caller, given that + * this is a signal handler and we have to do a return "soon". Then the + * caller can reestablish everything and do a siglongjmp here again. + */ + if (!sigsetjmp(coTS->tr_reenter, 0)) { + return; + } + + /* + * Ok, the caller has siglongjmp'ed back to us, so now prepare + * us for the real machine state switching. We have to jump + * into another function here to get a new stack context for + * the auto variables (which have to be auto-variables + * because the start of the thread happens later). Else with + * PIC (i.e. Position Independent Code which is used when PTH + * is built as a shared library) most platforms would + * horrible core dump as experience showed. + */ + coroutine_bootstrap(self, co); +} + +Coroutine *qemu_coroutine_new(void) +{ + const size_t stack_size = 1 << 20; + CoroutineUContext *co; + CoroutineThreadState *coTS; + struct sigaction sa; + struct sigaction osa; + stack_t ss; + stack_t oss; + sigset_t sigs; + sigset_t osigs; + sigjmp_buf old_env; + + /* The way to manipulate stack is with the sigaltstack function. We + * prepare a stack, with it delivering a signal to ourselves and then + * put sigsetjmp/siglongjmp where needed. + * This has been done keeping coroutine-ucontext as a model and with the + * pth ideas (GNU Portable Threads). See coroutine-ucontext for the basics + * of the coroutines and see pth_mctx.c (from the pth project) for the + * sigaltstack way of manipulating stacks. + */ + + co = g_malloc0(sizeof(*co)); + co->stack = g_malloc(stack_size); + co->base.entry_arg = &old_env; /* stash away our jmp_buf */ + + coTS = coroutine_get_thread_state(); + coTS->tr_handler = co; + + /* + * Preserve the SIGUSR2 signal state, block SIGUSR2, + * and establish our signal handler. The signal will + * later transfer control onto the signal stack. + */ + sigemptyset(&sigs); + sigaddset(&sigs, SIGUSR2); + pthread_sigmask(SIG_BLOCK, &sigs, &osigs); + sa.sa_handler = coroutine_trampoline; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_ONSTACK; + if (sigaction(SIGUSR2, &sa, &osa) != 0) { + abort(); + } + + /* + * Set the new stack. + */ + ss.ss_sp = co->stack; + ss.ss_size = stack_size; + ss.ss_flags = 0; + if (sigaltstack(&ss, &oss) < 0) { + abort(); + } + + /* + * Now transfer control onto the signal stack and set it up. + * It will return immediately via "return" after the sigsetjmp() + * was performed. Be careful here with race conditions. The + * signal can be delivered the first time sigsuspend() is + * called. + */ + coTS->tr_called = 0; + pthread_kill(pthread_self(), SIGUSR2); + sigfillset(&sigs); + sigdelset(&sigs, SIGUSR2); + while (!coTS->tr_called) { + sigsuspend(&sigs); + } + + /* + * Inform the system that we are back off the signal stack by + * removing the alternative signal stack. Be careful here: It + * first has to be disabled, before it can be removed. + */ + sigaltstack(NULL, &ss); + ss.ss_flags = SS_DISABLE; + if (sigaltstack(&ss, NULL) < 0) { + abort(); + } + sigaltstack(NULL, &ss); + if (!(oss.ss_flags & SS_DISABLE)) { + sigaltstack(&oss, NULL); + } + + /* + * Restore the old SIGUSR2 signal handler and mask + */ + sigaction(SIGUSR2, &osa, NULL); + pthread_sigmask(SIG_SETMASK, &osigs, NULL); + + /* + * Now enter the trampoline again, but this time not as a signal + * handler. Instead we jump into it directly. The functionally + * redundant ping-pong pointer arithmetic is necessary to avoid + * type-conversion warnings related to the `volatile' qualifier and + * the fact that `jmp_buf' usually is an array type. + */ + if (!sigsetjmp(old_env, 0)) { + siglongjmp(coTS->tr_reenter, 1); + } + + /* + * Ok, we returned again, so now we're finished + */ + + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_); + + g_free(co->stack); + g_free(co); +} + +CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_); + CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_); + CoroutineThreadState *s = coroutine_get_thread_state(); + int ret; + + s->current = to_; + + ret = sigsetjmp(from->env, 0); + if (ret == 0) { + siglongjmp(to->env, action); + } + return ret; +} + +Coroutine *qemu_coroutine_self(void) +{ + CoroutineThreadState *s = coroutine_get_thread_state(); + + return s->current; +} + +bool qemu_in_coroutine(void) +{ + CoroutineThreadState *s = pthread_getspecific(thread_state_key); + + return s && s->current->caller; +} + diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c new file mode 100644 index 0000000000..26cbebb7a7 --- /dev/null +++ b/util/coroutine-ucontext.c @@ -0,0 +1,194 @@ +/* + * ucontext coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> + * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */ +#ifdef _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif +#include <stdlib.h> +#include <setjmp.h> +#include <stdint.h> +#include <ucontext.h> +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +#ifdef CONFIG_VALGRIND_H +#include <valgrind/valgrind.h> +#endif + +typedef struct { + Coroutine base; + void *stack; + sigjmp_buf env; + +#ifdef CONFIG_VALGRIND_H + unsigned int valgrind_stack_id; +#endif + +} CoroutineUContext; + +/** + * Per-thread coroutine bookkeeping + */ +static __thread CoroutineUContext leader; +static __thread Coroutine *current; + +/* + * va_args to makecontext() must be type 'int', so passing + * the pointer we need may require several int args. This + * union is a quick hack to let us do that + */ +union cc_arg { + void *p; + int i[2]; +}; + +static void coroutine_trampoline(int i0, int i1) +{ + union cc_arg arg; + CoroutineUContext *self; + Coroutine *co; + + arg.i[0] = i0; + arg.i[1] = i1; + self = arg.p; + co = &self->base; + + /* Initialize longjmp environment and switch back the caller */ + if (!sigsetjmp(self->env, 0)) { + siglongjmp(*(sigjmp_buf *)co->entry_arg, 1); + } + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +Coroutine *qemu_coroutine_new(void) +{ + const size_t stack_size = 1 << 20; + CoroutineUContext *co; + ucontext_t old_uc, uc; + sigjmp_buf old_env; + union cc_arg arg = {0}; + + /* The ucontext functions preserve signal masks which incurs a + * system call overhead. sigsetjmp(buf, 0)/siglongjmp() does not + * preserve signal masks but only works on the current stack. + * Since we need a way to create and switch to a new stack, use + * the ucontext functions for that but sigsetjmp()/siglongjmp() for + * everything else. + */ + + if (getcontext(&uc) == -1) { + abort(); + } + + co = g_malloc0(sizeof(*co)); + co->stack = g_malloc(stack_size); + co->base.entry_arg = &old_env; /* stash away our jmp_buf */ + + uc.uc_link = &old_uc; + uc.uc_stack.ss_sp = co->stack; + uc.uc_stack.ss_size = stack_size; + uc.uc_stack.ss_flags = 0; + +#ifdef CONFIG_VALGRIND_H + co->valgrind_stack_id = + VALGRIND_STACK_REGISTER(co->stack, co->stack + stack_size); +#endif + + arg.p = co; + + makecontext(&uc, (void (*)(void))coroutine_trampoline, + 2, arg.i[0], arg.i[1]); + + /* swapcontext() in, siglongjmp() back out */ + if (!sigsetjmp(old_env, 0)) { + swapcontext(&old_uc, &uc); + } + return &co->base; +} + +#ifdef CONFIG_VALGRIND_H +#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE +/* Work around an unused variable in the valgrind.h macro... */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif +static inline void valgrind_stack_deregister(CoroutineUContext *co) +{ + VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id); +} +#ifdef CONFIG_PRAGMA_DIAGNOSTIC_AVAILABLE +#pragma GCC diagnostic pop +#endif +#endif + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_); + +#ifdef CONFIG_VALGRIND_H + valgrind_stack_deregister(co); +#endif + + g_free(co->stack); + g_free(co); +} + +/* This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * the sigsetjmp() call may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. + */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_); + CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_); + int ret; + + current = to_; + + ret = sigsetjmp(from->env, 0); + if (ret == 0) { + siglongjmp(to->env, action); + } + return ret; +} + +Coroutine *qemu_coroutine_self(void) +{ + if (!current) { + current = &leader.base; + } + return current; +} + +bool qemu_in_coroutine(void) +{ + return current && current->caller; +} diff --git a/util/coroutine-win32.c b/util/coroutine-win32.c new file mode 100644 index 0000000000..990a3b3b0c --- /dev/null +++ b/util/coroutine-win32.c @@ -0,0 +1,104 @@ +/* + * Win32 coroutine initialization code + * + * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +typedef struct +{ + Coroutine base; + + LPVOID fiber; + CoroutineAction action; +} CoroutineWin32; + +static __thread CoroutineWin32 leader; +static __thread Coroutine *current; + +/* This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * the SwitchToFiber() call may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. + */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_); + CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_); + + current = to_; + + to->action = action; + SwitchToFiber(to->fiber); + return from->action; +} + +static void CALLBACK coroutine_trampoline(void *co_) +{ + Coroutine *co = co_; + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +Coroutine *qemu_coroutine_new(void) +{ + const size_t stack_size = 1 << 20; + CoroutineWin32 *co; + + co = g_malloc0(sizeof(*co)); + co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base); + + g_assert(co->fiber); + + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_); + + DeleteFiber(co->fiber); + g_free(co); +} + +Coroutine *qemu_coroutine_self(void) +{ + if (!current) { + current = &leader.base; + leader.fiber = ConvertThreadToFiber(NULL); + } + return current; +} + +bool qemu_in_coroutine(void) +{ + return current && current->caller; +} diff --git a/util/cutils.c b/util/cutils.c index 5d1c9ebe05..cfeb848d19 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -145,11 +145,6 @@ time_t mktimegm(struct tm *tm) return t; } -int qemu_fls(int i) -{ - return 32 - clz32(i); -} - /* * Make sure data goes on disk, but if possible do not bother to * write out the inode just for timestamp updates. @@ -281,19 +276,19 @@ int fcntl_setfl(int fd, int flag) static int64_t suffix_mul(char suffix, int64_t unit) { switch (qemu_toupper(suffix)) { - case STRTOSZ_DEFSUFFIX_B: + case QEMU_STRTOSZ_DEFSUFFIX_B: return 1; - case STRTOSZ_DEFSUFFIX_KB: + case QEMU_STRTOSZ_DEFSUFFIX_KB: return unit; - case STRTOSZ_DEFSUFFIX_MB: + case QEMU_STRTOSZ_DEFSUFFIX_MB: return unit * unit; - case STRTOSZ_DEFSUFFIX_GB: + case QEMU_STRTOSZ_DEFSUFFIX_GB: return unit * unit * unit; - case STRTOSZ_DEFSUFFIX_TB: + case QEMU_STRTOSZ_DEFSUFFIX_TB: return unit * unit * unit * unit; - case STRTOSZ_DEFSUFFIX_PB: + case QEMU_STRTOSZ_DEFSUFFIX_PB: return unit * unit * unit * unit * unit; - case STRTOSZ_DEFSUFFIX_EB: + case QEMU_STRTOSZ_DEFSUFFIX_EB: return unit * unit * unit * unit * unit * unit; } return -1; @@ -305,7 +300,7 @@ static int64_t suffix_mul(char suffix, int64_t unit) * in *end, if not NULL. Return -ERANGE on overflow, Return -EINVAL on * other error. */ -int64_t strtosz_suffix_unit(const char *nptr, char **end, +int64_t qemu_strtosz_suffix_unit(const char *nptr, char **end, const char default_suffix, int64_t unit) { int64_t retval = -EINVAL; @@ -348,14 +343,165 @@ fail: return retval; } -int64_t strtosz_suffix(const char *nptr, char **end, const char default_suffix) +int64_t qemu_strtosz_suffix(const char *nptr, char **end, + const char default_suffix) { - return strtosz_suffix_unit(nptr, end, default_suffix, 1024); + return qemu_strtosz_suffix_unit(nptr, end, default_suffix, 1024); } -int64_t strtosz(const char *nptr, char **end) +int64_t qemu_strtosz(const char *nptr, char **end) +{ + return qemu_strtosz_suffix(nptr, end, QEMU_STRTOSZ_DEFSUFFIX_MB); +} + +/** + * Helper function for qemu_strto*l() functions. + */ +static int check_strtox_error(const char *p, char *endptr, const char **next, + int err) +{ + /* If no conversion was performed, prefer BSD behavior over glibc + * behavior. + */ + if (err == 0 && endptr == p) { + err = EINVAL; + } + if (!next && *endptr) { + return -EINVAL; + } + if (next) { + *next = endptr; + } + return -err; +} + +/** + * QEMU wrappers for strtol(), strtoll(), strtoul(), strotull() C functions. + * + * Convert ASCII string @nptr to a long integer value + * from the given @base. Parameters @nptr, @endptr, @base + * follows same semantics as strtol() C function. + * + * Unlike from strtol() function, if @endptr is not NULL, this + * function will return -EINVAL whenever it cannot fully convert + * the string in @nptr with given @base to a long. This function returns + * the result of the conversion only through the @result parameter. + * + * If NULL is passed in @endptr, then the whole string in @ntpr + * is a number otherwise it returns -EINVAL. + * + * RETURN VALUE + * Unlike from strtol() function, this wrapper returns either + * -EINVAL or the errno set by strtol() function (e.g -ERANGE). + * If the conversion overflows, -ERANGE is returned, and @result + * is set to the max value of the desired type + * (e.g. LONG_MAX, LLONG_MAX, ULONG_MAX, ULLONG_MAX). If the case + * of underflow, -ERANGE is returned, and @result is set to the min + * value of the desired type. For strtol(), strtoll(), @result is set to + * LONG_MIN, LLONG_MIN, respectively, and for strtoul(), strtoull() it + * is set to 0. + */ +int qemu_strtol(const char *nptr, const char **endptr, int base, + long *result) { - return strtosz_suffix(nptr, end, STRTOSZ_DEFSUFFIX_MB); + char *p; + int err = 0; + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + err = -EINVAL; + } else { + errno = 0; + *result = strtol(nptr, &p, base); + err = check_strtox_error(nptr, p, endptr, errno); + } + return err; +} + +/** + * Converts ASCII string to an unsigned long integer. + * + * If string contains a negative number, value will be converted to + * the unsigned representation of the signed value, unless the original + * (nonnegated) value would overflow, in this case, it will set @result + * to ULONG_MAX, and return ERANGE. + * + * The same behavior holds, for qemu_strtoull() but sets @result to + * ULLONG_MAX instead of ULONG_MAX. + * + * See qemu_strtol() documentation for more info. + */ +int qemu_strtoul(const char *nptr, const char **endptr, int base, + unsigned long *result) +{ + char *p; + int err = 0; + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + err = -EINVAL; + } else { + errno = 0; + *result = strtoul(nptr, &p, base); + /* Windows returns 1 for negative out-of-range values. */ + if (errno == ERANGE) { + *result = -1; + } + err = check_strtox_error(nptr, p, endptr, errno); + } + return err; +} + +/** + * Converts ASCII string to a long long integer. + * + * See qemu_strtol() documentation for more info. + */ +int qemu_strtoll(const char *nptr, const char **endptr, int base, + int64_t *result) +{ + char *p; + int err = 0; + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + err = -EINVAL; + } else { + errno = 0; + *result = strtoll(nptr, &p, base); + err = check_strtox_error(nptr, p, endptr, errno); + } + return err; +} + +/** + * Converts ASCII string to an unsigned long long integer. + * + * See qemu_strtol() documentation for more info. + */ +int qemu_strtoull(const char *nptr, const char **endptr, int base, + uint64_t *result) +{ + char *p; + int err = 0; + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + err = -EINVAL; + } else { + errno = 0; + *result = strtoull(nptr, &p, base); + /* Windows returns 1 for negative out-of-range values. */ + if (errno == ERANGE) { + *result = -1; + } + err = check_strtox_error(nptr, p, endptr, errno); + } + return err; } /** @@ -474,29 +620,6 @@ int qemu_parse_fd(const char *param) return fd; } -/* round down to the nearest power of 2*/ -int64_t pow2floor(int64_t value) -{ - if (!is_power_of_2(value)) { - value = 0x8000000000000000ULL >> clz64(value); - } - return value; -} - -/* round up to the nearest power of 2 (0 if overflow) */ -uint64_t pow2ceil(uint64_t value) -{ - uint8_t nlz = clz64(value); - - if (is_power_of_2(value)) { - return value; - } - if (!nlz) { - return 0; - } - return 1ULL << (64 - nlz); -} - /* * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128) * Input is limited to 14-bit numbers diff --git a/util/error.c b/util/error.c index 14f4351879..80c89a2079 100644 --- a/util/error.c +++ b/util/error.c @@ -2,9 +2,11 @@ * QEMU Error Objects * * Copyright IBM, Corp. 2011 + * Copyright (C) 2011-2015 Red Hat, Inc. * * Authors: * Anthony Liguori <aliguori@us.ibm.com> + * Markus Armbruster <armbru@redhat.com>, * * This work is licensed under the terms of the GNU LGPL, version 2. See * the COPYING.LIB file in the top-level directory. @@ -18,14 +20,33 @@ struct Error { char *msg; ErrorClass err_class; + const char *src, *func; + int line; + GString *hint; }; Error *error_abort; +Error *error_fatal; -void error_set(Error **errp, ErrorClass err_class, const char *fmt, ...) +static void error_handle_fatal(Error **errp, Error *err) +{ + if (errp == &error_abort) { + fprintf(stderr, "Unexpected error in %s() at %s:%d:\n", + err->func, err->src, err->line); + error_report_err(err); + abort(); + } + if (errp == &error_fatal) { + error_report_err(err); + exit(1); + } +} + +static void error_setv(Error **errp, + const char *src, int line, const char *func, + ErrorClass err_class, const char *fmt, va_list ap) { Error *err; - va_list ap; int saved_errno = errno; if (errp == NULL) { @@ -34,99 +55,120 @@ void error_set(Error **errp, ErrorClass err_class, const char *fmt, ...) assert(*errp == NULL); err = g_malloc0(sizeof(*err)); - - va_start(ap, fmt); err->msg = g_strdup_vprintf(fmt, ap); - va_end(ap); err->err_class = err_class; + err->src = src; + err->line = line; + err->func = func; - if (errp == &error_abort) { - error_report_err(err); - abort(); - } - + error_handle_fatal(errp, err); *errp = err; errno = saved_errno; } -void error_set_errno(Error **errp, int os_errno, ErrorClass err_class, - const char *fmt, ...) +void error_set_internal(Error **errp, + const char *src, int line, const char *func, + ErrorClass err_class, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_setv(errp, src, line, func, err_class, fmt, ap); + va_end(ap); +} + +void error_setg_internal(Error **errp, + const char *src, int line, const char *func, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap); + va_end(ap); +} + +void error_setg_errno_internal(Error **errp, + const char *src, int line, const char *func, + int os_errno, const char *fmt, ...) { - Error *err; - char *msg1; va_list ap; + char *msg; int saved_errno = errno; if (errp == NULL) { return; } - assert(*errp == NULL); - - err = g_malloc0(sizeof(*err)); va_start(ap, fmt); - msg1 = g_strdup_vprintf(fmt, ap); - if (os_errno != 0) { - err->msg = g_strdup_printf("%s: %s", msg1, strerror(os_errno)); - g_free(msg1); - } else { - err->msg = msg1; - } + error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap); va_end(ap); - err->err_class = err_class; - if (errp == &error_abort) { - error_report_err(err); - abort(); + if (os_errno != 0) { + msg = (*errp)->msg; + (*errp)->msg = g_strdup_printf("%s: %s", msg, strerror(os_errno)); + g_free(msg); } - *errp = err; - errno = saved_errno; } -void error_setg_file_open(Error **errp, int os_errno, const char *filename) +void error_setg_file_open_internal(Error **errp, + const char *src, int line, const char *func, + int os_errno, const char *filename) { - error_setg_errno(errp, os_errno, "Could not open '%s'", filename); + error_setg_errno_internal(errp, src, line, func, os_errno, + "Could not open '%s'", filename); +} + +void error_append_hint(Error **errp, const char *fmt, ...) +{ + va_list ap; + int saved_errno = errno; + Error *err; + + if (!errp) { + return; + } + err = *errp; + assert(err && errp != &error_abort); + + if (!err->hint) { + err->hint = g_string_new(NULL); + } + va_start(ap, fmt); + g_string_append_vprintf(err->hint, fmt, ap); + va_end(ap); + + errno = saved_errno; } #ifdef _WIN32 -void error_set_win32(Error **errp, int win32_err, ErrorClass err_class, - const char *fmt, ...) +void error_setg_win32_internal(Error **errp, + const char *src, int line, const char *func, + int win32_err, const char *fmt, ...) { - Error *err; - char *msg1; va_list ap; + char *msg1, *msg2; if (errp == NULL) { return; } - assert(*errp == NULL); - - err = g_malloc0(sizeof(*err)); va_start(ap, fmt); - msg1 = g_strdup_vprintf(fmt, ap); + error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap); + va_end(ap); + if (win32_err != 0) { - char *msg2 = g_win32_error_message(win32_err); - err->msg = g_strdup_printf("%s: %s (error: %x)", msg1, msg2, - (unsigned)win32_err); + msg1 = (*errp)->msg; + msg2 = g_win32_error_message(win32_err); + (*errp)->msg = g_strdup_printf("%s: %s (error: %x)", msg1, msg2, + (unsigned)win32_err); g_free(msg2); g_free(msg1); - } else { - err->msg = msg1; - } - va_end(ap); - err->err_class = err_class; - - if (errp == &error_abort) { - error_report_err(err); - abort(); } - - *errp = err; } #endif @@ -138,6 +180,12 @@ Error *error_copy(const Error *err) err_new = g_malloc0(sizeof(*err)); err_new->msg = g_strdup(err->msg); err_new->err_class = err->err_class; + err_new->src = err->src; + err_new->line = err->line; + err_new->func = err->func; + if (err->hint) { + err_new->hint = g_string_new(err->hint->str); + } return err_new; } @@ -155,6 +203,9 @@ const char *error_get_pretty(Error *err) void error_report_err(Error *err) { error_report("%s", error_get_pretty(err)); + if (err->hint) { + error_printf_unless_qmp("%s\n", err->hint->str); + } error_free(err); } @@ -162,18 +213,29 @@ void error_free(Error *err) { if (err) { g_free(err->msg); + if (err->hint) { + g_string_free(err->hint, true); + } g_free(err); } } +void error_free_or_abort(Error **errp) +{ + assert(errp && *errp); + error_free(*errp); + *errp = NULL; +} + void error_propagate(Error **dst_errp, Error *local_err) { - if (local_err && dst_errp == &error_abort) { - error_report_err(local_err); - abort(); - } else if (dst_errp && !*dst_errp) { + if (!local_err) { + return; + } + error_handle_fatal(dst_errp, local_err); + if (dst_errp && !*dst_errp) { *dst_errp = local_err; - } else if (local_err) { + } else { error_free(local_err); } } diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c index ed4ca2b01e..d4a0c63e12 100644 --- a/util/event_notifier-posix.c +++ b/util/event_notifier-posix.c @@ -77,7 +77,7 @@ void event_notifier_cleanup(EventNotifier *e) close(e->wfd); } -int event_notifier_get_fd(EventNotifier *e) +int event_notifier_get_fd(const EventNotifier *e) { return e->rfd; } @@ -26,3 +26,40 @@ bool id_wellformed(const char *id) } return true; } + +#define ID_SPECIAL_CHAR '#' + +static const char *const id_subsys_str[ID_MAX] = { + [ID_QDEV] = "qdev", + [ID_BLOCK] = "block", +}; + +/* + * Generates an ID of the form PREFIX SUBSYSTEM NUMBER + * where: + * + * - PREFIX is the reserved character '#' + * - SUBSYSTEM identifies the subsystem creating the ID + * - NUMBER is a decimal number unique within SUBSYSTEM. + * + * Example: "#block146" + * + * Note that these IDs do not satisfy id_wellformed(). + * + * The caller is responsible for freeing the returned string with g_free() + */ +char *id_generate(IdSubSystems id) +{ + static uint64_t id_counters[ID_MAX]; + uint32_t rnd; + + assert(id < ARRAY_SIZE(id_subsys_str)); + assert(id_subsys_str[id]); + + rnd = g_random_int_range(0, 100); + + return g_strdup_printf("%c%s%" PRIu64 "%02" PRId32, ID_SPECIAL_CHAR, + id_subsys_str[id], + id_counters[id]++, + rnd); +} diff --git a/util/memfd.c b/util/memfd.c new file mode 100644 index 0000000000..7c406914c5 --- /dev/null +++ b/util/memfd.c @@ -0,0 +1,162 @@ +/* + * memfd.c + * + * Copyright (c) 2015 Red Hat, Inc. + * + * QEMU library functions on POSIX which are shared between QEMU and + * the QEMU tools. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" + +#include <glib.h> +#include <glib/gprintf.h> + +#include <sys/mman.h> + +#include "qemu/memfd.h" + +#ifdef CONFIG_MEMFD +#include <sys/memfd.h> +#elif defined CONFIG_LINUX +#include <sys/syscall.h> +#include <asm/unistd.h> + +static int memfd_create(const char *name, unsigned int flags) +{ +#ifdef __NR_memfd_create + return syscall(__NR_memfd_create, name, flags); +#else + return -1; +#endif +} +#endif + +#ifndef MFD_CLOEXEC +#define MFD_CLOEXEC 0x0001U +#endif + +#ifndef MFD_ALLOW_SEALING +#define MFD_ALLOW_SEALING 0x0002U +#endif + +/* + * This is a best-effort helper for shared memory allocation, with + * optional sealing. The helper will do his best to allocate using + * memfd with sealing, but may fallback on other methods without + * sealing. + */ +void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, + int *fd) +{ + void *ptr; + int mfd = -1; + + *fd = -1; + +#ifdef CONFIG_LINUX + if (seals) { + mfd = memfd_create(name, MFD_ALLOW_SEALING | MFD_CLOEXEC); + } + + if (mfd == -1) { + /* some systems have memfd without sealing */ + mfd = memfd_create(name, MFD_CLOEXEC); + seals = 0; + } +#endif + + if (mfd != -1) { + if (ftruncate(mfd, size) == -1) { + perror("ftruncate"); + close(mfd); + return NULL; + } + + if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) { + perror("fcntl"); + close(mfd); + return NULL; + } + } else { + const char *tmpdir = g_get_tmp_dir(); + gchar *fname; + + fname = g_strdup_printf("%s/memfd-XXXXXX", tmpdir); + mfd = mkstemp(fname); + unlink(fname); + g_free(fname); + + if (mfd == -1) { + perror("mkstemp"); + return NULL; + } + + if (ftruncate(mfd, size) == -1) { + perror("ftruncate"); + close(mfd); + return NULL; + } + } + + ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + close(mfd); + return NULL; + } + + *fd = mfd; + return ptr; +} + +void qemu_memfd_free(void *ptr, size_t size, int fd) +{ + if (ptr) { + munmap(ptr, size); + } + + if (fd != -1) { + close(fd); + } +} + +enum { + MEMFD_KO, + MEMFD_OK, + MEMFD_TODO +}; + +bool qemu_memfd_check(void) +{ + static int memfd_check = MEMFD_TODO; + + if (memfd_check == MEMFD_TODO) { + int fd; + void *ptr; + + ptr = qemu_memfd_alloc("test", 4096, 0, &fd); + memfd_check = ptr ? MEMFD_OK : MEMFD_KO; + qemu_memfd_free(ptr, 4096, fd); + } + + return memfd_check == MEMFD_OK; +} diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c new file mode 100644 index 0000000000..54793a5dcf --- /dev/null +++ b/util/mmap-alloc.c @@ -0,0 +1,110 @@ +/* + * Support for RAM backed by mmaped host memory. + * + * Copyright (c) 2015 Red Hat, Inc. + * + * Authors: + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ +#include <qemu/mmap-alloc.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <assert.h> + +#define HUGETLBFS_MAGIC 0x958458f6 + +#ifdef CONFIG_LINUX +#include <sys/vfs.h> +#endif + +size_t qemu_fd_getpagesize(int fd) +{ +#ifdef CONFIG_LINUX + struct statfs fs; + int ret; + + if (fd != -1) { + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) { + return fs.f_bsize; + } + } +#endif + + return getpagesize(); +} + +void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) +{ + /* + * Note: this always allocates at least one extra page of virtual address + * space, even if size is already aligned. + */ + size_t total = size + align; +#if defined(__powerpc64__) && defined(__linux__) + /* On ppc64 mappings in the same segment (aka slice) must share the same + * page size. Since we will be re-allocating part of this segment + * from the supplied fd, we should make sure to use the same page size, + * unless we are using the system page size, in which case anonymous memory + * is OK. Use align as a hint for the page size. + * In this case, set MAP_NORESERVE to avoid allocating backing store memory. + */ + int anonfd = fd == -1 || qemu_fd_getpagesize(fd) == getpagesize() ? -1 : fd; + int flags = anonfd == -1 ? MAP_ANONYMOUS : MAP_NORESERVE; + void *ptr = mmap(0, total, PROT_NONE, flags | MAP_PRIVATE, anonfd, 0); +#else + void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +#endif + size_t offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; + void *ptr1; + + if (ptr == MAP_FAILED) { + return MAP_FAILED; + } + + /* Make sure align is a power of 2 */ + assert(!(align & (align - 1))); + /* Always align to host page size */ + assert(align >= getpagesize()); + + ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE, + MAP_FIXED | + (fd == -1 ? MAP_ANONYMOUS : 0) | + (shared ? MAP_SHARED : MAP_PRIVATE), + fd, 0); + if (ptr1 == MAP_FAILED) { + munmap(ptr, total); + return MAP_FAILED; + } + + ptr += offset; + total -= offset; + + if (offset > 0) { + munmap(ptr - offset, offset); + } + + /* + * Leave a single PROT_NONE page allocated after the RAM block, to serve as + * a guard page guarding against potential buffer overflows. + */ + if (total > size + getpagesize()) { + munmap(ptr + size + getpagesize(), total - size - getpagesize()); + } + + return ptr; +} + +void qemu_ram_munmap(void *ptr, size_t size) +{ + if (ptr) { + /* Unmap both the RAM block and the guard page */ + munmap(ptr, size + getpagesize()); + } +} diff --git a/util/osdep.c b/util/osdep.c index 0092bb61b9..534b51147c 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -52,7 +52,14 @@ extern int madvise(caddr_t, size_t, int); static bool fips_enabled = false; -static const char *qemu_version = QEMU_VERSION; +/* Starting on QEMU 2.5, qemu_hw_version() returns "2.5+" by default + * instead of QEMU_VERSION, so setting hw_version on MachineClass + * is no longer mandatory. + * + * Do NOT change this string, or it will break compatibility on all + * machine classes that don't set hw_version. + */ +static const char *hw_version = "2.5+"; int socket_set_cork(int fd, int v) { @@ -311,14 +318,14 @@ int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen) return ret; } -void qemu_set_version(const char *version) +void qemu_set_hw_version(const char *version) { - qemu_version = version; + hw_version = version; } -const char *qemu_get_version(void) +const char *qemu_hw_version(void) { - return qemu_version; + return hw_version; } void fips_set_state(bool requested) diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 997f24d21b..6f18866cad 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -46,7 +46,6 @@ extern int daemon(int, int); #else # define QEMU_VMALLOC_ALIGN getpagesize() #endif -#define HUGETLBFS_MAGIC 0x958458f6 #include <termios.h> #include <unistd.h> @@ -66,13 +65,14 @@ extern int daemon(int, int); #ifdef CONFIG_LINUX #include <sys/syscall.h> -#include <sys/vfs.h> #endif #ifdef __FreeBSD__ #include <sys/sysctl.h> #endif +#include <qemu/mmap-alloc.h> + #ifdef CONFIG_MARU #include "../../tizen/src/emulator_common.h" #endif @@ -148,10 +148,7 @@ void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment) } #endif size_t align = QEMU_VMALLOC_ALIGN; - size_t total = size + align - getpagesize(); - void *ptr = mmap(0, total, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - size_t offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; + void *ptr = qemu_ram_mmap(-1, size, align, false); if (ptr == MAP_FAILED) { return NULL; @@ -160,15 +157,6 @@ void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment) if (alignment) { *alignment = align; } - ptr += offset; - total -= offset; - - if (offset > 0) { - munmap(ptr - offset, offset); - } - if (total > size) { - munmap(ptr + size, total - size); - } trace_qemu_anon_ram_alloc(size, ptr); return ptr; @@ -183,9 +171,7 @@ void qemu_vfree(void *ptr) void qemu_anon_ram_free(void *ptr, size_t size) { trace_qemu_anon_ram_free(ptr, size); - if (ptr) { - munmap(ptr, size); - } + qemu_ram_munmap(ptr, size); } void qemu_set_block(int fd) @@ -372,26 +358,6 @@ static void sigbus_handler(int signal) siglongjmp(sigjump, 1); } -static size_t fd_getpagesize(int fd) -{ -#ifdef CONFIG_LINUX - struct statfs fs; - int ret; - - if (fd != -1) { - do { - ret = fstatfs(fd, &fs); - } while (ret != 0 && errno == EINTR); - - if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) { - return fs.f_bsize; - } - } -#endif - - return getpagesize(); -} - void os_mem_prealloc(int fd, char *area, size_t memory) { int ret; @@ -419,7 +385,7 @@ void os_mem_prealloc(int fd, char *area, size_t memory) exit(1); } else { int i; - size_t hpagesize = fd_getpagesize(fd); + size_t hpagesize = qemu_fd_getpagesize(fd); size_t numpages = DIV_ROUND_UP(memory, hpagesize); /* MAP_POPULATE silently ignores failures */ @@ -502,3 +468,74 @@ int qemu_read_password(char *buf, int buf_size) printf("\n"); return ret; } + + +pid_t qemu_fork(Error **errp) +{ + sigset_t oldmask, newmask; + struct sigaction sig_action; + int saved_errno; + pid_t pid; + + /* + * Need to block signals now, so that child process can safely + * kill off caller's signal handlers without a race. + */ + sigfillset(&newmask); + if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) { + error_setg_errno(errp, errno, + "cannot block signals"); + return -1; + } + + pid = fork(); + saved_errno = errno; + + if (pid < 0) { + /* attempt to restore signal mask, but ignore failure, to + * avoid obscuring the fork failure */ + (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL); + error_setg_errno(errp, saved_errno, + "cannot fork child process"); + errno = saved_errno; + return -1; + } else if (pid) { + /* parent process */ + + /* Restore our original signal mask now that the child is + * safely running. Only documented failures are EFAULT (not + * possible, since we are using just-grabbed mask) or EINVAL + * (not possible, since we are using correct arguments). */ + (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL); + } else { + /* child process */ + size_t i; + + /* Clear out all signal handlers from parent so nothing + * unexpected can happen in our child once we unblock + * signals */ + sig_action.sa_handler = SIG_DFL; + sig_action.sa_flags = 0; + sigemptyset(&sig_action.sa_mask); + + for (i = 1; i < NSIG; i++) { + /* Only possible errors are EFAULT or EINVAL The former + * won't happen, the latter we expect, so no need to check + * return value */ + (void)sigaction(i, &sig_action, NULL); + } + + /* Unmask all signals in child, since we've no idea what the + * caller's done with their signal mask and don't want to + * propagate that to children */ + sigemptyset(&newmask); + if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) { + Error *local_err = NULL; + error_setg_errno(&local_err, errno, + "cannot unblock signals"); + error_report_err(local_err); + _exit(1); + } + } + return pid; +} diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 217f28a201..11f528a4df 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -110,9 +110,7 @@ void qemu_anon_ram_free(void *ptr, size_t size) } } -// CONFIG_MARU MODIFICATION -// recent MinGW-w64 provides gmtime_r(), localtime_r(). we use them. -#if 0 +#ifndef CONFIG_LOCALTIME_R /* FIXME: add proper locking */ struct tm *gmtime_r(const time_t *timep, struct tm *result) { @@ -136,7 +134,7 @@ struct tm *localtime_r(const time_t *timep, struct tm *result) } return p; } -#endif +#endif /* CONFIG_LOCALTIME_R */ void qemu_set_block(int fd) { @@ -471,7 +469,7 @@ gint g_poll(GPollFD *fds, guint nfds, gint timeout) return retval; } -size_t getpagesize(void) +int getpagesize(void) { SYSTEM_INFO system_info; @@ -513,3 +511,12 @@ int qemu_read_password(char *buf, int buf_size) buf[i] = '\0'; return 0; } + + +pid_t qemu_fork(Error **errp) +{ + errno = ENOSYS; + error_setg_errno(errp, errno, + "cannot fork child process"); + return -1; +} diff --git a/util/qemu-coroutine-io.c b/util/qemu-coroutine-io.c new file mode 100644 index 0000000000..e1eae7331e --- /dev/null +++ b/util/qemu-coroutine-io.c @@ -0,0 +1,91 @@ +/* + * Coroutine-aware I/O functions + * + * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. + * Copyright (c) 2011, Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/sockets.h" +#include "qemu/coroutine.h" +#include "qemu/iov.h" +#include "qemu/main-loop.h" + +ssize_t coroutine_fn +qemu_co_sendv_recvv(int sockfd, struct iovec *iov, unsigned iov_cnt, + size_t offset, size_t bytes, bool do_send) +{ + size_t done = 0; + ssize_t ret; + int err; + while (done < bytes) { + ret = iov_send_recv(sockfd, iov, iov_cnt, + offset + done, bytes - done, do_send); + if (ret > 0) { + done += ret; + } else if (ret < 0) { + err = socket_error(); + if (err == EAGAIN || err == EWOULDBLOCK) { + qemu_coroutine_yield(); + } else if (done == 0) { + return -err; + } else { + break; + } + } else if (ret == 0 && !do_send) { + /* write (send) should never return 0. + * read (recv) returns 0 for end-of-file (-data). + * In both cases there's little point retrying, + * but we do for write anyway, just in case */ + break; + } + } + return done; +} + +ssize_t coroutine_fn +qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send) +{ + struct iovec iov = { .iov_base = buf, .iov_len = bytes }; + return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send); +} + +typedef struct { + Coroutine *co; + int fd; +} FDYieldUntilData; + +static void fd_coroutine_enter(void *opaque) +{ + FDYieldUntilData *data = opaque; + qemu_set_fd_handler(data->fd, NULL, NULL, NULL); + qemu_coroutine_enter(data->co, NULL); +} + +void coroutine_fn yield_until_fd_readable(int fd) +{ + FDYieldUntilData data; + + assert(qemu_in_coroutine()); + data.co = qemu_coroutine_self(); + data.fd = fd; + qemu_set_fd_handler(fd, fd_coroutine_enter, NULL, &data); + qemu_coroutine_yield(); +} diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c new file mode 100644 index 0000000000..130ee19d17 --- /dev/null +++ b/util/qemu-coroutine-lock.c @@ -0,0 +1,186 @@ +/* + * coroutine queues and locks + * + * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "qemu/coroutine.h" +#include "qemu/coroutine_int.h" +#include "qemu/queue.h" +#include "trace.h" + +void qemu_co_queue_init(CoQueue *queue) +{ + QTAILQ_INIT(&queue->entries); +} + +void coroutine_fn qemu_co_queue_wait(CoQueue *queue) +{ + Coroutine *self = qemu_coroutine_self(); + QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + qemu_coroutine_yield(); + assert(qemu_in_coroutine()); +} + +/** + * qemu_co_queue_run_restart: + * + * Enter each coroutine that was previously marked for restart by + * qemu_co_queue_next() or qemu_co_queue_restart_all(). This function is + * invoked by the core coroutine code when the current coroutine yields or + * terminates. + */ +void qemu_co_queue_run_restart(Coroutine *co) +{ + Coroutine *next; + + trace_qemu_co_queue_run_restart(co); + while ((next = QTAILQ_FIRST(&co->co_queue_wakeup))) { + QTAILQ_REMOVE(&co->co_queue_wakeup, next, co_queue_next); + qemu_coroutine_enter(next, NULL); + } +} + +static bool qemu_co_queue_do_restart(CoQueue *queue, bool single) +{ + Coroutine *self = qemu_coroutine_self(); + Coroutine *next; + + if (QTAILQ_EMPTY(&queue->entries)) { + return false; + } + + while ((next = QTAILQ_FIRST(&queue->entries)) != NULL) { + QTAILQ_REMOVE(&queue->entries, next, co_queue_next); + QTAILQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next); + trace_qemu_co_queue_next(next); + if (single) { + break; + } + } + return true; +} + +bool coroutine_fn qemu_co_queue_next(CoQueue *queue) +{ + assert(qemu_in_coroutine()); + return qemu_co_queue_do_restart(queue, true); +} + +void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue) +{ + assert(qemu_in_coroutine()); + qemu_co_queue_do_restart(queue, false); +} + +bool qemu_co_enter_next(CoQueue *queue) +{ + Coroutine *next; + + next = QTAILQ_FIRST(&queue->entries); + if (!next) { + return false; + } + + QTAILQ_REMOVE(&queue->entries, next, co_queue_next); + qemu_coroutine_enter(next, NULL); + return true; +} + +bool qemu_co_queue_empty(CoQueue *queue) +{ + return QTAILQ_FIRST(&queue->entries) == NULL; +} + +void qemu_co_mutex_init(CoMutex *mutex) +{ + memset(mutex, 0, sizeof(*mutex)); + qemu_co_queue_init(&mutex->queue); +} + +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + trace_qemu_co_mutex_lock_entry(mutex, self); + + while (mutex->locked) { + qemu_co_queue_wait(&mutex->queue); + } + + mutex->locked = true; + + trace_qemu_co_mutex_lock_return(mutex, self); +} + +void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + trace_qemu_co_mutex_unlock_entry(mutex, self); + + assert(mutex->locked == true); + assert(qemu_in_coroutine()); + + mutex->locked = false; + qemu_co_queue_next(&mutex->queue); + + trace_qemu_co_mutex_unlock_return(mutex, self); +} + +void qemu_co_rwlock_init(CoRwlock *lock) +{ + memset(lock, 0, sizeof(*lock)); + qemu_co_queue_init(&lock->queue); +} + +void qemu_co_rwlock_rdlock(CoRwlock *lock) +{ + while (lock->writer) { + qemu_co_queue_wait(&lock->queue); + } + lock->reader++; +} + +void qemu_co_rwlock_unlock(CoRwlock *lock) +{ + assert(qemu_in_coroutine()); + if (lock->writer) { + lock->writer = false; + qemu_co_queue_restart_all(&lock->queue); + } else { + lock->reader--; + assert(lock->reader >= 0); + /* Wakeup only one waiting writer */ + if (!lock->reader) { + qemu_co_queue_next(&lock->queue); + } + } +} + +void qemu_co_rwlock_wrlock(CoRwlock *lock) +{ + while (lock->writer || lock->reader) { + qemu_co_queue_wait(&lock->queue); + } + lock->writer = true; +} diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c new file mode 100644 index 0000000000..b35db56356 --- /dev/null +++ b/util/qemu-coroutine-sleep.c @@ -0,0 +1,41 @@ +/* + * QEMU coroutine sleep + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/coroutine.h" +#include "qemu/timer.h" +#include "block/aio.h" + +typedef struct CoSleepCB { + QEMUTimer *ts; + Coroutine *co; +} CoSleepCB; + +static void co_sleep_cb(void *opaque) +{ + CoSleepCB *sleep_cb = opaque; + + qemu_coroutine_enter(sleep_cb->co, NULL); +} + +void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type, + int64_t ns) +{ + CoSleepCB sleep_cb = { + .co = qemu_coroutine_self(), + }; + sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb); + timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns); + qemu_coroutine_yield(); + timer_del(sleep_cb.ts); + timer_free(sleep_cb.ts); +} diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c new file mode 100644 index 0000000000..9a04f28865 --- /dev/null +++ b/util/qemu-coroutine.c @@ -0,0 +1,150 @@ +/* + * QEMU coroutines + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Kevin Wolf <kwolf@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "qemu-common.h" +#include "qemu/thread.h" +#include "qemu/atomic.h" +#include "qemu/coroutine.h" +#include "qemu/coroutine_int.h" + +enum { +#if defined(_WIN32) && !defined(_WIN64) + POOL_BATCH_SIZE = 32, +#else + POOL_BATCH_SIZE = 64, +#endif +}; + +/** Free list to speed up creation */ +static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool); +static unsigned int release_pool_size; +static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool); +static __thread unsigned int alloc_pool_size; +static __thread Notifier coroutine_pool_cleanup_notifier; + +static void coroutine_pool_cleanup(Notifier *n, void *value) +{ + Coroutine *co; + Coroutine *tmp; + + QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) { + QSLIST_REMOVE_HEAD(&alloc_pool, pool_next); + qemu_coroutine_delete(co); + } +} + +Coroutine *qemu_coroutine_create(CoroutineEntry *entry) +{ + Coroutine *co = NULL; + + if (CONFIG_COROUTINE_POOL) { + co = QSLIST_FIRST(&alloc_pool); + if (!co) { + if (release_pool_size > POOL_BATCH_SIZE) { + /* Slow path; a good place to register the destructor, too. */ + if (!coroutine_pool_cleanup_notifier.notify) { + coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup; + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); + } + + /* This is not exact; there could be a little skew between + * release_pool_size and the actual size of release_pool. But + * it is just a heuristic, it does not need to be perfect. + */ + alloc_pool_size = atomic_xchg(&release_pool_size, 0); + QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); + co = QSLIST_FIRST(&alloc_pool); + } + } + if (co) { + QSLIST_REMOVE_HEAD(&alloc_pool, pool_next); + alloc_pool_size--; + } + } + + if (!co) { + co = qemu_coroutine_new(); + } + + co->entry = entry; + QTAILQ_INIT(&co->co_queue_wakeup); + return co; +} + +static void coroutine_delete(Coroutine *co) +{ + co->caller = NULL; + + if (CONFIG_COROUTINE_POOL) { + if (release_pool_size < POOL_BATCH_SIZE * 2) { + QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); + atomic_inc(&release_pool_size); + return; + } + if (alloc_pool_size < POOL_BATCH_SIZE) { + QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); + alloc_pool_size++; + return; + } + } + + qemu_coroutine_delete(co); +} + +void qemu_coroutine_enter(Coroutine *co, void *opaque) +{ + Coroutine *self = qemu_coroutine_self(); + CoroutineAction ret; + + trace_qemu_coroutine_enter(self, co, opaque); + + if (co->caller) { + fprintf(stderr, "Co-routine re-entered recursively\n"); + abort(); + } + + co->caller = self; + co->entry_arg = opaque; + ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER); + + qemu_co_queue_run_restart(co); + + switch (ret) { + case COROUTINE_YIELD: + return; + case COROUTINE_TERMINATE: + trace_qemu_coroutine_terminate(co); + coroutine_delete(co); + return; + default: + abort(); + } +} + +void coroutine_fn qemu_coroutine_yield(void) +{ + Coroutine *self = qemu_coroutine_self(); + Coroutine *to = self->caller; + + trace_qemu_coroutine_yield(self, to); + + if (!to) { + fprintf(stderr, "Co-routine is yielding to no one\n"); + abort(); + } + + self->caller = NULL; + qemu_coroutine_switch(self, to, COROUTINE_YIELD); +} diff --git a/util/qemu-error.c b/util/qemu-error.c index 050d5804c8..d9cc6337c0 100644 --- a/util/qemu-error.c +++ b/util/qemu-error.c @@ -238,7 +238,7 @@ void error_vreport(const char *fmt, va_list ap) GTimeVal tv; gchar *timestr; - if (enable_timestamp_msg) { + if (enable_timestamp_msg && !cur_mon) { g_get_current_time(&tv); timestr = g_time_val_to_iso8601(&tv); error_printf("%s ", timestr); diff --git a/util/qemu-option.c b/util/qemu-option.c index efe9d279c4..a50eceae4a 100644 --- a/util/qemu-option.c +++ b/util/qemu-option.c @@ -180,6 +180,11 @@ void parse_option_size(const char *name, const char *value, if (value != NULL) { sizef = strtod(value, &postfix); + if (sizef < 0 || sizef > UINT64_MAX) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name, + "a non-negative number below 2^64"); + return; + } switch (*postfix) { case 'T': sizef *= 1024; @@ -200,10 +205,8 @@ void parse_option_size(const char *name, const char *value, break; default: error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name, "a size"); -#if 0 /* conversion from qerror_report() to error_set() broke this: */ - error_printf_unless_qmp("You may use k, M, G or T suffixes for " - "kilobytes, megabytes, gigabytes and terabytes.\n"); -#endif + error_append_hint(errp, "You may use k, M, G or T suffixes for " + "kilobytes, megabytes, gigabytes and terabytes."); return; } } else { @@ -643,9 +646,8 @@ QemuOpts *qemu_opts_create(QemuOptsList *list, const char *id, if (!id_wellformed(id)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id", "an identifier"); -#if 0 /* conversion from qerror_report() to error_set() broke this: */ - error_printf_unless_qmp("Identifiers consist of letters, digits, '-', '.', '_', starting with a letter.\n"); -#endif + error_append_hint(errp, "Identifiers consist of letters, digits, " + "'-', '.', '_', starting with a letter."); return NULL; } opts = qemu_opts_find(list, id); @@ -730,14 +732,35 @@ void qemu_opts_del(QemuOpts *opts) g_free(opts); } -void qemu_opts_print(QemuOpts *opts, const char *sep) +/* print value, escaping any commas in value */ +static void escaped_print(const char *value) +{ + const char *ptr; + + for (ptr = value; *ptr; ++ptr) { + if (*ptr == ',') { + putchar(','); + } + putchar(*ptr); + } +} + +void qemu_opts_print(QemuOpts *opts, const char *separator) { QemuOpt *opt; QemuOptDesc *desc = opts->list->desc; + const char *sep = ""; + + if (opts->id) { + printf("id=%s", opts->id); /* passed id_wellformed -> no commas */ + sep = separator; + } if (desc[0].name == NULL) { QTAILQ_FOREACH(opt, &opts->head, next) { - printf("%s%s=\"%s\"", sep, opt->name, opt->str); + printf("%s%s=", sep, opt->name); + escaped_print(opt->str); + sep = separator; } return; } @@ -750,13 +773,15 @@ void qemu_opts_print(QemuOpts *opts, const char *sep) continue; } if (desc->type == QEMU_OPT_STRING) { - printf("%s%s='%s'", sep, desc->name, value); + printf("%s%s=", sep, desc->name); + escaped_print(value); } else if ((desc->type == QEMU_OPT_SIZE || desc->type == QEMU_OPT_NUMBER) && opt) { printf("%s%s=%" PRId64, sep, desc->name, opt->value.uint); } else { printf("%s%s=%s", sep, desc->name, value); } + sep = separator; } } diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index a3d0be6d28..a96a8d7960 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -25,6 +25,9 @@ #include "monitor/monitor.h" #include "qemu/sockets.h" #include "qemu/main-loop.h" +#include "qapi/qmp-input-visitor.h" +#include "qapi/qmp-output-visitor.h" +#include "qapi-visit.h" #ifndef AI_ADDRCONFIG # define AI_ADDRCONFIG 0 @@ -125,12 +128,15 @@ int inet_listen_opts(QemuOpts *opts, int port_offset, Error **errp) ai.ai_family = PF_UNSPEC; ai.ai_socktype = SOCK_STREAM; - if ((qemu_opt_get(opts, "host") == NULL) || - (qemu_opt_get(opts, "port") == NULL)) { - error_setg(errp, "host and/or port not specified"); + if ((qemu_opt_get(opts, "host") == NULL)) { + error_setg(errp, "host not specified"); return -1; } - pstrcpy(port, sizeof(port), qemu_opt_get(opts, "port")); + if (qemu_opt_get(opts, "port") != NULL) { + pstrcpy(port, sizeof(port), qemu_opt_get(opts, "port")); + } else { + port[0] = '\0'; + } addr = qemu_opt_get(opts, "host"); to = qemu_opt_get_number(opts, "to", 0); @@ -142,6 +148,10 @@ int inet_listen_opts(QemuOpts *opts, int port_offset, Error **errp) /* lookup */ if (port_offset) { unsigned long long baseport; + if (strlen(port) == 0) { + error_setg(errp, "port not specified"); + return -1; + } if (parse_uint_full(port, &baseport, 10) < 0) { error_setg(errp, "can't convert to a number: %s", port); return -1; @@ -153,7 +163,8 @@ int inet_listen_opts(QemuOpts *opts, int port_offset, Error **errp) } snprintf(port, sizeof(port), "%d", (int)baseport + port_offset); } - rc = getaddrinfo(strlen(addr) ? addr : NULL, port, &ai, &res); + rc = getaddrinfo(strlen(addr) ? addr : NULL, + strlen(port) ? port : NULL, &ai, &res); if (rc != 0) { error_setg(errp, "address resolution failed for %s:%s: %s", addr, port, gai_strerror(rc)); @@ -594,12 +605,15 @@ fail: static void inet_addr_to_opts(QemuOpts *opts, const InetSocketAddress *addr) { - bool ipv4 = addr->ipv4 || !addr->has_ipv4; - bool ipv6 = addr->ipv6 || !addr->has_ipv6; + bool ipv4 = addr->has_ipv4 && addr->ipv4; + bool ipv6 = addr->has_ipv6 && addr->ipv6; - if (!ipv4 || !ipv6) { + if (ipv4 || ipv6) { qemu_opt_set_bool(opts, "ipv4", ipv4, &error_abort); qemu_opt_set_bool(opts, "ipv6", ipv6, &error_abort); + } else if (addr->has_ipv4 || addr->has_ipv6) { + qemu_opt_set_bool(opts, "ipv4", !addr->has_ipv4, &error_abort); + qemu_opt_set_bool(opts, "ipv6", !addr->has_ipv6, &error_abort); } if (addr->has_to) { qemu_opt_set_number(opts, "to", addr->to, &error_abort); @@ -745,8 +759,7 @@ int unix_listen_opts(QemuOpts *opts, Error **errp) qemu_opt_set(opts, "path", un.sun_path, &error_abort); } - if ((access(un.sun_path, F_OK) == 0) && - unlink(un.sun_path) < 0) { + if (unlink(un.sun_path) < 0 && errno != ENOENT) { error_setg_errno(errp, errno, "Failed to unlink socket %s", un.sun_path); goto err; @@ -912,23 +925,23 @@ SocketAddress *socket_parse(const char *str, Error **errp) error_setg(errp, "invalid Unix socket address"); goto fail; } else { - addr->kind = SOCKET_ADDRESS_KIND_UNIX; - addr->q_unix = g_new(UnixSocketAddress, 1); - addr->q_unix->path = g_strdup(str + 5); + addr->type = SOCKET_ADDRESS_KIND_UNIX; + addr->u.q_unix = g_new(UnixSocketAddress, 1); + addr->u.q_unix->path = g_strdup(str + 5); } } else if (strstart(str, "fd:", NULL)) { if (str[3] == '\0') { error_setg(errp, "invalid file descriptor address"); goto fail; } else { - addr->kind = SOCKET_ADDRESS_KIND_FD; - addr->fd = g_new(String, 1); - addr->fd->str = g_strdup(str + 3); + addr->type = SOCKET_ADDRESS_KIND_FD; + addr->u.fd = g_new(String, 1); + addr->u.fd->str = g_strdup(str + 3); } } else { - addr->kind = SOCKET_ADDRESS_KIND_INET; - addr->inet = inet_parse(str, errp); - if (addr->inet == NULL) { + addr->type = SOCKET_ADDRESS_KIND_INET; + addr->u.inet = inet_parse(str, errp); + if (addr->u.inet == NULL) { goto fail; } } @@ -946,19 +959,19 @@ int socket_connect(SocketAddress *addr, Error **errp, int fd; opts = qemu_opts_create(&socket_optslist, NULL, 0, &error_abort); - switch (addr->kind) { + switch (addr->type) { case SOCKET_ADDRESS_KIND_INET: - inet_addr_to_opts(opts, addr->inet); + inet_addr_to_opts(opts, addr->u.inet); fd = inet_connect_opts(opts, errp, callback, opaque); break; case SOCKET_ADDRESS_KIND_UNIX: - qemu_opt_set(opts, "path", addr->q_unix->path, &error_abort); + qemu_opt_set(opts, "path", addr->u.q_unix->path, &error_abort); fd = unix_connect_opts(opts, errp, callback, opaque); break; case SOCKET_ADDRESS_KIND_FD: - fd = monitor_get_fd(cur_mon, addr->fd->str, errp); + fd = monitor_get_fd(cur_mon, addr->u.fd->str, errp); if (fd >= 0 && callback) { qemu_set_nonblock(fd); callback(fd, NULL, opaque); @@ -978,19 +991,19 @@ int socket_listen(SocketAddress *addr, Error **errp) int fd; opts = qemu_opts_create(&socket_optslist, NULL, 0, &error_abort); - switch (addr->kind) { + switch (addr->type) { case SOCKET_ADDRESS_KIND_INET: - inet_addr_to_opts(opts, addr->inet); + inet_addr_to_opts(opts, addr->u.inet); fd = inet_listen_opts(opts, 0, errp); break; case SOCKET_ADDRESS_KIND_UNIX: - qemu_opt_set(opts, "path", addr->q_unix->path, &error_abort); + qemu_opt_set(opts, "path", addr->u.q_unix->path, &error_abort); fd = unix_listen_opts(opts, errp); break; case SOCKET_ADDRESS_KIND_FD: - fd = monitor_get_fd(cur_mon, addr->fd->str, errp); + fd = monitor_get_fd(cur_mon, addr->u.fd->str, errp); break; default: @@ -1006,12 +1019,12 @@ int socket_dgram(SocketAddress *remote, SocketAddress *local, Error **errp) int fd; opts = qemu_opts_create(&socket_optslist, NULL, 0, &error_abort); - switch (remote->kind) { + switch (remote->type) { case SOCKET_ADDRESS_KIND_INET: - inet_addr_to_opts(opts, remote->inet); + inet_addr_to_opts(opts, remote->u.inet); if (local) { - qemu_opt_set(opts, "localaddr", local->inet->host, &error_abort); - qemu_opt_set(opts, "localport", local->inet->port, &error_abort); + qemu_opt_set(opts, "localaddr", local->u.inet->host, &error_abort); + qemu_opt_set(opts, "localport", local->u.inet->port, &error_abort); } fd = inet_dgram_opts(opts, errp); break; @@ -1023,3 +1036,140 @@ int socket_dgram(SocketAddress *remote, SocketAddress *local, Error **errp) qemu_opts_del(opts); return fd; } + + +static SocketAddress * +socket_sockaddr_to_address_inet(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + char host[NI_MAXHOST]; + char serv[NI_MAXSERV]; + SocketAddress *addr; + int ret; + + ret = getnameinfo((struct sockaddr *)sa, salen, + host, sizeof(host), + serv, sizeof(serv), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) { + error_setg(errp, "Cannot format numeric socket address: %s", + gai_strerror(ret)); + return NULL; + } + + addr = g_new0(SocketAddress, 1); + addr->type = SOCKET_ADDRESS_KIND_INET; + addr->u.inet = g_new0(InetSocketAddress, 1); + addr->u.inet->host = g_strdup(host); + addr->u.inet->port = g_strdup(serv); + if (sa->ss_family == AF_INET) { + addr->u.inet->has_ipv4 = addr->u.inet->ipv4 = true; + } else { + addr->u.inet->has_ipv6 = addr->u.inet->ipv6 = true; + } + + return addr; +} + + +#ifndef WIN32 +static SocketAddress * +socket_sockaddr_to_address_unix(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + SocketAddress *addr; + struct sockaddr_un *su = (struct sockaddr_un *)sa; + + addr = g_new0(SocketAddress, 1); + addr->type = SOCKET_ADDRESS_KIND_UNIX; + addr->u.q_unix = g_new0(UnixSocketAddress, 1); + if (su->sun_path[0]) { + addr->u.q_unix->path = g_strndup(su->sun_path, + sizeof(su->sun_path)); + } + + return addr; +} +#endif /* WIN32 */ + +static SocketAddress * +socket_sockaddr_to_address(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + switch (sa->ss_family) { + case AF_INET: + case AF_INET6: + return socket_sockaddr_to_address_inet(sa, salen, errp); + +#ifndef WIN32 + case AF_UNIX: + return socket_sockaddr_to_address_unix(sa, salen, errp); +#endif /* WIN32 */ + + default: + error_setg(errp, "socket family %d unsupported", + sa->ss_family); + return NULL; + } + return 0; +} + + +SocketAddress *socket_local_address(int fd, Error **errp) +{ + struct sockaddr_storage ss; + socklen_t sslen = sizeof(ss); + + if (getsockname(fd, (struct sockaddr *)&ss, &sslen) < 0) { + error_setg_errno(errp, socket_error(), "%s", + "Unable to query local socket address"); + return NULL; + } + + return socket_sockaddr_to_address(&ss, sslen, errp); +} + + +SocketAddress *socket_remote_address(int fd, Error **errp) +{ + struct sockaddr_storage ss; + socklen_t sslen = sizeof(ss); + + if (getpeername(fd, (struct sockaddr *)&ss, &sslen) < 0) { + error_setg_errno(errp, socket_error(), "%s", + "Unable to query remote socket address"); + return NULL; + } + + return socket_sockaddr_to_address(&ss, sslen, errp); +} + + +void qapi_copy_SocketAddress(SocketAddress **p_dest, + SocketAddress *src) +{ + QmpOutputVisitor *qov; + QmpInputVisitor *qiv; + Visitor *ov, *iv; + QObject *obj; + + *p_dest = NULL; + + qov = qmp_output_visitor_new(); + ov = qmp_output_get_visitor(qov); + visit_type_SocketAddress(ov, &src, NULL, &error_abort); + obj = qmp_output_get_qobject(qov); + qmp_output_visitor_cleanup(qov); + if (!obj) { + return; + } + + qiv = qmp_input_visitor_new(obj); + iv = qmp_input_get_visitor(qiv); + visit_type_SocketAddress(iv, p_dest, NULL, &error_abort); + qmp_input_visitor_cleanup(qiv); + qobject_decref(obj); +} diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c index 23f93b73b1..dde2a213cd 100644 --- a/util/qemu-thread-posix.c +++ b/util/qemu-thread-posix.c @@ -298,7 +298,16 @@ static inline void futex_wake(QemuEvent *ev, int n) static inline void futex_wait(QemuEvent *ev, unsigned val) { - futex(ev, FUTEX_WAIT, (int) val, NULL, NULL, 0); + while (futex(ev, FUTEX_WAIT, (int) val, NULL, NULL, 0)) { + switch (errno) { + case EWOULDBLOCK: + return; + case EINTR: + break; /* get out of switch and retry */ + default: + abort(); + } + } } #else static inline void futex_wake(QemuEvent *ev, int n) @@ -389,7 +398,7 @@ void qemu_event_wait(QemuEvent *ev) /* * Leave the event reset and tell qemu_event_set that there * are waiters. No need to retry, because there cannot be - * a concurent busy->free transition. After the CAS, the + * a concurrent busy->free transition. After the CAS, the * event will be either set or busy. */ if (atomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c index 406b52f91d..6cdd553e9a 100644 --- a/util/qemu-thread-win32.c +++ b/util/qemu-thread-win32.c @@ -238,10 +238,34 @@ void qemu_sem_wait(QemuSemaphore *sem) } } +/* Wrap a Win32 manual-reset event with a fast userspace path. The idea + * is to reset the Win32 event lazily, as part of a test-reset-test-wait + * sequence. Such a sequence is, indeed, how QemuEvents are used by + * RCU and other subsystems! + * + * Valid transitions: + * - free->set, when setting the event + * - busy->set, when setting the event, followed by futex_wake + * - set->free, when resetting the event + * - free->busy, when waiting + * + * set->busy does not happen (it can be observed from the outside but + * it really is set->free->busy). + * + * busy->free provably cannot happen; to enforce it, the set->free transition + * is done with an OR, which becomes a no-op if the event has concurrently + * transitioned to free or busy (and is faster than cmpxchg). + */ + +#define EV_SET 0 +#define EV_FREE 1 +#define EV_BUSY -1 + void qemu_event_init(QemuEvent *ev, bool init) { /* Manual reset. */ - ev->event = CreateEvent(NULL, TRUE, init, NULL); + ev->event = CreateEvent(NULL, TRUE, TRUE, NULL); + ev->value = (init ? EV_SET : EV_FREE); } void qemu_event_destroy(QemuEvent *ev) @@ -251,17 +275,51 @@ void qemu_event_destroy(QemuEvent *ev) void qemu_event_set(QemuEvent *ev) { - SetEvent(ev->event); + if (atomic_mb_read(&ev->value) != EV_SET) { + if (atomic_xchg(&ev->value, EV_SET) == EV_BUSY) { + /* There were waiters, wake them up. */ + SetEvent(ev->event); + } + } } void qemu_event_reset(QemuEvent *ev) { - ResetEvent(ev->event); + if (atomic_mb_read(&ev->value) == EV_SET) { + /* If there was a concurrent reset (or even reset+wait), + * do nothing. Otherwise change EV_SET->EV_FREE. + */ + atomic_or(&ev->value, EV_FREE); + } } void qemu_event_wait(QemuEvent *ev) { - WaitForSingleObject(ev->event, INFINITE); + unsigned value; + + value = atomic_mb_read(&ev->value); + if (value != EV_SET) { + if (value == EV_FREE) { + /* qemu_event_set is not yet going to call SetEvent, but we are + * going to do another check for EV_SET below when setting EV_BUSY. + * At that point it is safe to call WaitForSingleObject. + */ + ResetEvent(ev->event); + + /* Tell qemu_event_set that there are waiters. No need to retry + * because there cannot be a concurent busy->free transition. + * After the CAS, the event will be either set or busy. + */ + if (atomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { + value = EV_SET; + } else { + value = EV_BUSY; + } + } + if (value == EV_BUSY) { + WaitForSingleObject(ev->event, INFINITE); + } + } } struct QemuThreadData { diff --git a/util/rcu.c b/util/rcu.c index cdcad678b4..8ba304dc44 100644 --- a/util/rcu.c +++ b/util/rcu.c @@ -47,7 +47,8 @@ unsigned long rcu_gp_ctr = RCU_GP_LOCKED; QemuEvent rcu_gp_event; -static QemuMutex rcu_gp_lock; +static QemuMutex rcu_registry_lock; +static QemuMutex rcu_sync_lock; /* * Check whether a quiescent state was crossed between the beginning of @@ -66,7 +67,7 @@ static inline int rcu_gp_ongoing(unsigned long *ctr) */ __thread struct rcu_reader_data rcu_reader; -/* Protected by rcu_gp_lock. */ +/* Protected by rcu_registry_lock. */ typedef QLIST_HEAD(, rcu_reader_data) ThreadList; static ThreadList registry = QLIST_HEAD_INITIALIZER(registry); @@ -114,10 +115,26 @@ static void wait_for_readers(void) break; } - /* Wait for one thread to report a quiescent state and - * try again. + /* Wait for one thread to report a quiescent state and try again. + * Release rcu_registry_lock, so rcu_(un)register_thread() doesn't + * wait too much time. + * + * rcu_register_thread() may add nodes to ®istry; it will not + * wake up synchronize_rcu, but that is okay because at least another + * thread must exit its RCU read-side critical section before + * synchronize_rcu is done. The next iteration of the loop will + * move the new thread's rcu_reader from ®istry to &qsreaders, + * because rcu_gp_ongoing() will return false. + * + * rcu_unregister_thread() may remove nodes from &qsreaders instead + * of ®istry if it runs during qemu_event_wait. That's okay; + * the node then will not be added back to ®istry by QLIST_SWAP + * below. The invariant is that the node is part of one list when + * rcu_registry_lock is released. */ + qemu_mutex_unlock(&rcu_registry_lock); qemu_event_wait(&rcu_gp_event); + qemu_mutex_lock(&rcu_registry_lock); } /* put back the reader list in the registry */ @@ -126,7 +143,8 @@ static void wait_for_readers(void) void synchronize_rcu(void) { - qemu_mutex_lock(&rcu_gp_lock); + qemu_mutex_lock(&rcu_sync_lock); + qemu_mutex_lock(&rcu_registry_lock); if (!QLIST_EMPTY(®istry)) { /* In either case, the atomic_mb_set below blocks stores that free @@ -149,7 +167,8 @@ void synchronize_rcu(void) wait_for_readers(); } - qemu_mutex_unlock(&rcu_gp_lock); + qemu_mutex_unlock(&rcu_registry_lock); + qemu_mutex_unlock(&rcu_sync_lock); } @@ -273,23 +292,24 @@ void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node)) void rcu_register_thread(void) { assert(rcu_reader.ctr == 0); - qemu_mutex_lock(&rcu_gp_lock); + qemu_mutex_lock(&rcu_registry_lock); QLIST_INSERT_HEAD(®istry, &rcu_reader, node); - qemu_mutex_unlock(&rcu_gp_lock); + qemu_mutex_unlock(&rcu_registry_lock); } void rcu_unregister_thread(void) { - qemu_mutex_lock(&rcu_gp_lock); + qemu_mutex_lock(&rcu_registry_lock); QLIST_REMOVE(&rcu_reader, node); - qemu_mutex_unlock(&rcu_gp_lock); + qemu_mutex_unlock(&rcu_registry_lock); } static void rcu_init_complete(void) { QemuThread thread; - qemu_mutex_init(&rcu_gp_lock); + qemu_mutex_init(&rcu_registry_lock); + qemu_mutex_init(&rcu_sync_lock); qemu_event_init(&rcu_gp_event, true); qemu_event_init(&rcu_call_ready_event, false); @@ -306,12 +326,14 @@ static void rcu_init_complete(void) #ifdef CONFIG_POSIX static void rcu_init_lock(void) { - qemu_mutex_lock(&rcu_gp_lock); + qemu_mutex_lock(&rcu_sync_lock); + qemu_mutex_lock(&rcu_registry_lock); } static void rcu_init_unlock(void) { - qemu_mutex_unlock(&rcu_gp_lock); + qemu_mutex_unlock(&rcu_registry_lock); + qemu_mutex_unlock(&rcu_sync_lock); } #endif diff --git a/util/throttle.c b/util/throttle.c index 706c13111e..1113671ecf 100644 --- a/util/throttle.c +++ b/util/throttle.c @@ -300,6 +300,21 @@ bool throttle_is_valid(ThrottleConfig *cfg) return !invalid; } +/* check if bps_max/iops_max is used without bps/iops + * @cfg: the throttling configuration to inspect + */ +bool throttle_max_is_missing_limit(ThrottleConfig *cfg) +{ + int i; + + for (i = 0; i < BUCKETS_COUNT; i++) { + if (cfg->buckets[i].max && !cfg->buckets[i].avg) { + return true; + } + } + return false; +} + /* fix bucket parameters */ static void throttle_fix_bucket(LeakyBucket *bkt) { diff --git a/util/timed-average.c b/util/timed-average.c new file mode 100644 index 0000000000..a2dfb4834d --- /dev/null +++ b/util/timed-average.c @@ -0,0 +1,231 @@ +/* + * QEMU timed average computation + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 2015 + * + * Authors: + * BenoĆ®t Canet <benoit.canet@nodalink.com> + * Alberto Garcia <berto@igalia.com> + * + * This program is free sofware: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Sofware Foundation, either version 2 of the License, or + * (at your option) version 3 or any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <string.h> + +#include "qemu/timed-average.h" + +/* This module computes an average of a set of values within a time + * window. + * + * Algorithm: + * + * - Create two windows with a certain expiration period, and + * offsetted by period / 2. + * - Each time you want to account a new value, do it in both windows. + * - The minimum / maximum / average values are always returned from + * the oldest window. + * + * Example: + * + * t=0 |t=0.5 |t=1 |t=1.5 |t=2 + * wnd0: [0,0.5)|wnd0: [0.5,1.5) | |wnd0: [1.5,2.5) | + * wnd1: [0,1) | |wnd1: [1,2) | | + * + * Values are returned from: + * + * wnd0---------|wnd1------------|wnd0---------|wnd1-------------| + */ + +/* Update the expiration of a time window + * + * @w: the window used + * @now: the current time in nanoseconds + * @period: the expiration period in nanoseconds + */ +static void update_expiration(TimedAverageWindow *w, int64_t now, + int64_t period) +{ + /* time elapsed since the last theoretical expiration */ + int64_t elapsed = (now - w->expiration) % period; + /* time remaininging until the next expiration */ + int64_t remaining = period - elapsed; + /* compute expiration */ + w->expiration = now + remaining; +} + +/* Reset a window + * + * @w: the window to reset + */ +static void window_reset(TimedAverageWindow *w) +{ + w->min = UINT64_MAX; + w->max = 0; + w->sum = 0; + w->count = 0; +} + +/* Get the current window (that is, the one with the earliest + * expiration time). + * + * @ta: the TimedAverage structure + * @ret: a pointer to the current window + */ +static TimedAverageWindow *current_window(TimedAverage *ta) +{ + return &ta->windows[ta->current]; +} + +/* Initialize a TimedAverage structure + * + * @ta: the TimedAverage structure + * @clock_type: the type of clock to use + * @period: the time window period in nanoseconds + */ +void timed_average_init(TimedAverage *ta, QEMUClockType clock_type, + uint64_t period) +{ + int64_t now = qemu_clock_get_ns(clock_type); + + /* Returned values are from the oldest window, so they belong to + * the interval [ta->period/2,ta->period). By adjusting the + * requested period by 4/3, we guarantee that they're in the + * interval [2/3 period,4/3 period), closer to the requested + * period on average */ + ta->period = (uint64_t) period * 4 / 3; + ta->clock_type = clock_type; + ta->current = 0; + + window_reset(&ta->windows[0]); + window_reset(&ta->windows[1]); + + /* Both windows are offsetted by half a period */ + ta->windows[0].expiration = now + ta->period / 2; + ta->windows[1].expiration = now + ta->period; +} + +/* Check if the time windows have expired, updating their counters and + * expiration time if that's the case. + * + * @ta: the TimedAverage structure + * @elapsed: if non-NULL, the elapsed time (in ns) within the current + * window will be stored here + */ +static void check_expirations(TimedAverage *ta, uint64_t *elapsed) +{ + int64_t now = qemu_clock_get_ns(ta->clock_type); + int i; + + assert(ta->period != 0); + + /* Check if the windows have expired */ + for (i = 0; i < 2; i++) { + TimedAverageWindow *w = &ta->windows[i]; + if (w->expiration <= now) { + window_reset(w); + update_expiration(w, now, ta->period); + } + } + + /* Make ta->current point to the oldest window */ + if (ta->windows[0].expiration < ta->windows[1].expiration) { + ta->current = 0; + } else { + ta->current = 1; + } + + /* Calculate the elapsed time within the current window */ + if (elapsed) { + int64_t remaining = ta->windows[ta->current].expiration - now; + *elapsed = ta->period - remaining; + } +} + +/* Account a value + * + * @ta: the TimedAverage structure + * @value: the value to account + */ +void timed_average_account(TimedAverage *ta, uint64_t value) +{ + int i; + check_expirations(ta, NULL); + + /* Do the accounting in both windows at the same time */ + for (i = 0; i < 2; i++) { + TimedAverageWindow *w = &ta->windows[i]; + + w->sum += value; + w->count++; + + if (value < w->min) { + w->min = value; + } + + if (value > w->max) { + w->max = value; + } + } +} + +/* Get the minimum value + * + * @ta: the TimedAverage structure + * @ret: the minimum value + */ +uint64_t timed_average_min(TimedAverage *ta) +{ + TimedAverageWindow *w; + check_expirations(ta, NULL); + w = current_window(ta); + return w->min < UINT64_MAX ? w->min : 0; +} + +/* Get the average value + * + * @ta: the TimedAverage structure + * @ret: the average value + */ +uint64_t timed_average_avg(TimedAverage *ta) +{ + TimedAverageWindow *w; + check_expirations(ta, NULL); + w = current_window(ta); + return w->count > 0 ? w->sum / w->count : 0; +} + +/* Get the maximum value + * + * @ta: the TimedAverage structure + * @ret: the maximum value + */ +uint64_t timed_average_max(TimedAverage *ta) +{ + check_expirations(ta, NULL); + return current_window(ta)->max; +} + +/* Get the sum of all accounted values + * @ta: the TimedAverage structure + * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here + * @ret: the sum of all accounted values + */ +uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed) +{ + TimedAverageWindow *w; + check_expirations(ta, elapsed); + w = current_window(ta); + return w->sum; +} |