diff options
author | Yonghee Han <onstudy@samsung.com> | 2016-07-27 16:40:17 +0900 |
---|---|---|
committer | Yonghee Han <onstudy@samsung.com> | 2016-07-27 00:53:56 -0700 |
commit | 3158f4a51894e46ecb593bffbfd12824e1d6534a (patch) | |
tree | 2bef7f0238e687c5de65f48b5995ee124a95d157 /docs | |
parent | a3b133b0ea0696e42fd876b9a803e28bc6ef5299 (diff) | |
download | qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.tar.gz qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.tar.bz2 qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.zip |
Imported Upstream version 2.4.1upstream/2.4.1
Change-Id: I0b584f569cb0e0f4eac13cdb79e110c2dbc34bfc
Diffstat (limited to 'docs')
-rw-r--r-- | docs/aio_notify.promela | 77 | ||||
-rw-r--r-- | docs/aio_notify_accept.promela | 152 | ||||
-rw-r--r-- | docs/aio_notify_bug.promela | 140 | ||||
-rw-r--r-- | docs/atomics.txt | 4 | ||||
-rw-r--r-- | docs/bitmaps.md | 352 | ||||
-rw-r--r-- | docs/memory-hotplug.txt | 23 | ||||
-rw-r--r-- | docs/migration.txt | 11 | ||||
-rw-r--r-- | docs/multi-thread-compression.txt | 149 | ||||
-rw-r--r-- | docs/multiseat.txt | 37 | ||||
-rw-r--r-- | docs/pci_expander_bridge.txt | 58 | ||||
-rw-r--r-- | docs/qapi-code-gen.txt | 497 | ||||
-rw-r--r-- | docs/qmp/qmp-events.txt | 53 | ||||
-rw-r--r-- | docs/qmp/qmp-spec.txt | 115 | ||||
-rw-r--r-- | docs/specs/acpi_mem_hotplug.txt | 58 | ||||
-rw-r--r-- | docs/specs/fw_cfg.txt | 21 | ||||
-rw-r--r-- | docs/specs/pci-ids.txt | 2 | ||||
-rw-r--r-- | docs/specs/ppc-spapr-hotplug.txt | 305 | ||||
-rw-r--r-- | docs/specs/rocker.txt | 1014 | ||||
-rw-r--r-- | docs/writing-qmp-commands.txt | 2 |
19 files changed, 2856 insertions, 214 deletions
diff --git a/docs/aio_notify.promela b/docs/aio_notify.promela index ad3f6f08b..fccc7ee1c 100644 --- a/docs/aio_notify.promela +++ b/docs/aio_notify.promela @@ -1,5 +1,5 @@ /* - * This model describes the interaction between aio_set_dispatching() + * This model describes the interaction between ctx->notify_me * and aio_notify(). * * Author: Paolo Bonzini <pbonzini@redhat.com> @@ -14,57 +14,53 @@ * spin -a docs/aio_notify.promela * gcc -O2 pan.c * ./a.out -a + * + * To verify it (with a bug planted in the model): + * spin -a -DBUG docs/aio_notify.promela + * gcc -O2 pan.c + * ./a.out -a */ #define MAX 4 #define LAST (1 << (MAX - 1)) #define FINAL ((LAST << 1) - 1) -bool dispatching; +bool notify_me; bool event; -int req, done; +int req; +int done; active proctype waiter() { - int fetch, blocking; + int fetch; - do - :: done != FINAL -> { - // Computing "blocking" is separate from execution of the - // "bottom half" - blocking = (req == 0); - - // This is our "bottom half" - atomic { fetch = req; req = 0; } - done = done | fetch; - - // Wait for a nudge from the other side - do - :: event == 1 -> { event = 0; break; } - :: !blocking -> break; - od; + do + :: true -> { + notify_me++; - dispatching = 1; + if +#ifndef BUG + :: (req > 0) -> skip; +#endif + :: else -> + // Wait for a nudge from the other side + do + :: event == 1 -> { event = 0; break; } + od; + fi; - // If you are simulating this model, you may want to add - // something like this here: - // - // int foo; foo++; foo++; foo++; - // - // This only wastes some time and makes it more likely - // that the notifier process hits the "fast path". + notify_me--; - dispatching = 0; + atomic { fetch = req; req = 0; } + done = done | fetch; } - :: else -> break; od } active proctype notifier() { int next = 1; - int sets = 0; do :: next <= LAST -> { @@ -74,8 +70,8 @@ active proctype notifier() // aio_notify if - :: dispatching == 0 -> sets++; event = 1; - :: else -> skip; + :: notify_me == 1 -> event = 1; + :: else -> printf("Skipped event_notifier_set\n"); skip; fi; // Test both synchronous and asynchronous delivery @@ -86,19 +82,12 @@ active proctype notifier() :: 1 -> skip; fi; } - :: else -> break; od; - printf("Skipped %d event_notifier_set\n", MAX - sets); } -#define p (done == FINAL) - -never { - do - :: 1 // after an arbitrarily long prefix - :: p -> break // p becomes true - od; - do - :: !p -> accept: break // it then must remains true forever after - od +never { /* [] done < FINAL */ +accept_init: + do + :: done < FINAL -> skip; + od; } diff --git a/docs/aio_notify_accept.promela b/docs/aio_notify_accept.promela new file mode 100644 index 000000000..9cef2c955 --- /dev/null +++ b/docs/aio_notify_accept.promela @@ -0,0 +1,152 @@ +/* + * This model describes the interaction between ctx->notified + * and ctx->notifier. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify the buggy version: + * spin -a -DBUG1 docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * (or -DBUG2) + * + * To verify the fixed version: + * spin -a docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * Add -DCHECK_REQ to test an alternative invariant and the + * "notify_me" optimization. + */ + +int notify_me; +bool notified; +bool event; +bool req; +bool notifier_done; + +#ifdef CHECK_REQ +#define USE_NOTIFY_ME 1 +#else +#define USE_NOTIFY_ME 0 +#endif + +#ifdef BUG +#error Please define BUG1 or BUG2 instead. +#endif + +active proctype notifier() +{ + do + :: true -> { + req = 1; + if + :: !USE_NOTIFY_ME || notify_me -> +#if defined BUG1 + /* CHECK_REQ does not detect this bug! */ + notified = 1; + event = 1; +#elif defined BUG2 + if + :: !notified -> event = 1; + :: else -> skip; + fi; + notified = 1; +#else + event = 1; + notified = 1; +#endif + :: else -> skip; + fi + } + :: true -> break; + od; + notifier_done = 1; +} + +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + atomic { old = notified; notified = 0; } \ + if \ + :: old -> event = 0; \ + :: else -> skip; \ + fi; \ + \ + req = 0; + +active proctype waiter() +{ + bool old; + + do + :: true -> AIO_POLL; + od; +} + +/* Same as waiter(), but disappears after a while. */ +active proctype temporary_waiter() +{ + bool old; + + do + :: true -> AIO_POLL; + :: true -> break; + od; +} + +#ifdef CHECK_REQ +never { + do + :: req -> goto accept_if_req_not_eventually_false; + :: true -> skip; + od; + +accept_if_req_not_eventually_false: + if + :: req -> goto accept_if_req_not_eventually_false; + fi; + assert(0); +} + +#else +/* There must be infinitely many transitions of event as long + * as the notifier does not exit. + * + * If event stayed always true, the waiters would be busy looping. + * If event stayed always false, the waiters would be sleeping + * forever. + */ +never { + do + :: !event -> goto accept_if_event_not_eventually_true; + :: event -> goto accept_if_event_not_eventually_false; + :: true -> skip; + od; + +accept_if_event_not_eventually_true: + if + :: !event && notifier_done -> do :: true -> skip; od; + :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; + fi; + assert(0); + +accept_if_event_not_eventually_false: + if + :: event -> goto accept_if_event_not_eventually_false; + fi; + assert(0); +} +#endif diff --git a/docs/aio_notify_bug.promela b/docs/aio_notify_bug.promela new file mode 100644 index 000000000..b3bfca1ca --- /dev/null +++ b/docs/aio_notify_bug.promela @@ -0,0 +1,140 @@ +/* + * This model describes a bug in aio_notify. If ctx->notifier is + * cleared too late, a wakeup could be lost. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify the buggy version: + * spin -a -DBUG docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * To verify the fixed version: + * spin -a docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * Add -DCHECK_REQ to test an alternative invariant and the + * "notify_me" optimization. + */ + +int notify_me; +bool event; +bool req; +bool notifier_done; + +#ifdef CHECK_REQ +#define USE_NOTIFY_ME 1 +#else +#define USE_NOTIFY_ME 0 +#endif + +active proctype notifier() +{ + do + :: true -> { + req = 1; + if + :: !USE_NOTIFY_ME || notify_me -> event = 1; + :: else -> skip; + fi + } + :: true -> break; + od; + notifier_done = 1; +} + +#ifdef BUG +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + req = 0; \ + event = 0; +#else +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + event = 0; \ + req = 0; +#endif + +active proctype waiter() +{ + do + :: true -> AIO_POLL; + od; +} + +/* Same as waiter(), but disappears after a while. */ +active proctype temporary_waiter() +{ + do + :: true -> AIO_POLL; + :: true -> break; + od; +} + +#ifdef CHECK_REQ +never { + do + :: req -> goto accept_if_req_not_eventually_false; + :: true -> skip; + od; + +accept_if_req_not_eventually_false: + if + :: req -> goto accept_if_req_not_eventually_false; + fi; + assert(0); +} + +#else +/* There must be infinitely many transitions of event as long + * as the notifier does not exit. + * + * If event stayed always true, the waiters would be busy looping. + * If event stayed always false, the waiters would be sleeping + * forever. + */ +never { + do + :: !event -> goto accept_if_event_not_eventually_true; + :: event -> goto accept_if_event_not_eventually_false; + :: true -> skip; + od; + +accept_if_event_not_eventually_true: + if + :: !event && notifier_done -> do :: true -> skip; od; + :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; + fi; + assert(0); + +accept_if_event_not_eventually_false: + if + :: event -> goto accept_if_event_not_eventually_false; + fi; + assert(0); +} +#endif diff --git a/docs/atomics.txt b/docs/atomics.txt index 6f2997bc6..ef285e3c2 100644 --- a/docs/atomics.txt +++ b/docs/atomics.txt @@ -281,7 +281,7 @@ note that the other barrier may actually be in a driver that runs in the guest! For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() -both count as read barriers. A read barriers shall pair with a write +both count as read barriers. A read barrier shall pair with a write barrier or a full barrier; a write barrier shall pair with a read barrier or a full barrier. A full barrier can pair with anything. For example: @@ -294,7 +294,7 @@ For example: smp_rmb(); y = a; -Note that the "writing" thread are accessing the variables in the +Note that the "writing" thread is accessing the variables in the opposite order as the "reading" thread. This is expected: stores before the write barrier will normally match the loads after the read barrier, and vice versa. The same is true for more than 2 diff --git a/docs/bitmaps.md b/docs/bitmaps.md new file mode 100644 index 000000000..fa87f077f --- /dev/null +++ b/docs/bitmaps.md @@ -0,0 +1,352 @@ +<!-- +Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc. +All rights reserved. + +This file is licensed via The FreeBSD Documentation License, the full text of +which is included at the end of this document. +--> + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) + +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. + +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. + +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. + +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions (Not yet implemented) + +* Transactional commands are forthcoming in a future version, + and are not yet available for use. This section serves as + documentation of intent for their design and usage. + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. + +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of an event being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +3. Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental, and re-create the image. + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive1", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +<!-- +The FreeBSD Documentation License + +Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML, +PDF, PostScript, RTF and so forth) with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code (Markdown) must retain the above copyright +notice, this list of conditions and the following disclaimer of this file +unmodified. + +Redistributions in compiled form (transformed to other DTDs, converted to PDF, +PostScript, RTF and other formats) must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> diff --git a/docs/memory-hotplug.txt b/docs/memory-hotplug.txt index f70571df0..56bdd0a47 100644 --- a/docs/memory-hotplug.txt +++ b/docs/memory-hotplug.txt @@ -4,9 +4,7 @@ QEMU memory hotplug This document explains how to use the memory hotplug feature in QEMU, which is present since v2.1.0. -Please, note that memory hotunplug is not supported yet. This means -that you're able to add memory, but you're not able to remove it. -Also, proper guest support is required for memory hotplug to work. +Guest support is required for memory hotplug to work. Basic RAM hotplug ----------------- @@ -74,3 +72,22 @@ comes from regular RAM, 1GB is a 1GB hugepage page and 256MB is from -device pc-dimm,id=dimm1,memdev=mem1 \ -object memory-backend-file,id=mem2,size=256M,mem-path=/mnt/hugepages-2MB \ -device pc-dimm,id=dimm2,memdev=mem2 + + +RAM hot-unplug +--------------- + +In order to be able to hot unplug pc-dimm device, QEMU has to be told the ids +of pc-dimm device and memory backend object. The ids were assigned when you hot +plugged memory. + +Two monitor commands are used to hot unplug memory: + + - "device_del": deletes a front-end pc-dimm device + - "object_del": deletes a memory backend object + +For example, assuming that the pc-dimm device with id "dimm1" exists, and its memory +backend is "mem1", the following commands tries to remove it. + + (qemu) device_del dimm1 + (qemu) object_del mem1 diff --git a/docs/migration.txt b/docs/migration.txt index 0492a4547..f6df4beb2 100644 --- a/docs/migration.txt +++ b/docs/migration.txt @@ -257,6 +257,7 @@ const VMStateDescription vmstate_ide_drive_pio_state = { .minimum_version_id = 1, .pre_save = ide_drive_pio_pre_save, .post_load = ide_drive_pio_post_load, + .needed = ide_drive_pio_state_needed, .fields = (VMStateField[]) { VMSTATE_INT32(req_nb_sectors, IDEState), VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, @@ -279,13 +280,9 @@ const VMStateDescription vmstate_ide_drive = { .... several fields .... VMSTATE_END_OF_LIST() }, - .subsections = (VMStateSubsection []) { - { - .vmsd = &vmstate_ide_drive_pio_state, - .needed = ide_drive_pio_state_needed, - }, { - /* empty */ - } + .subsections = (const VMStateDescription*[]) { + &vmstate_ide_drive_pio_state, + NULL } }; diff --git a/docs/multi-thread-compression.txt b/docs/multi-thread-compression.txt new file mode 100644 index 000000000..3d477c3bd --- /dev/null +++ b/docs/multi-thread-compression.txt @@ -0,0 +1,149 @@ +Use multiple thread (de)compression in live migration +===================================================== +Copyright (C) 2015 Intel Corporation +Author: Liang Li <liang.z.li@intel.com> + +This work is licensed under the terms of the GNU GPLv2 or later. See +the COPYING file in the top-level directory. + +Contents: +========= +* Introduction +* When to use +* Performance +* Usage +* TODO + +Introduction +============ +Instead of sending the guest memory directly, this solution will +compress the RAM page before sending; after receiving, the data will +be decompressed. Using compression in live migration can help +to reduce the data transferred about 60%, this is very useful when the +bandwidth is limited, and the total migration time can also be reduced +about 70% in a typical case. In addition to this, the VM downtime can be +reduced about 50%. The benefit depends on data's compressibility in VM. + +The process of compression will consume additional CPU cycles, and the +extra CPU cycles will increase the migration time. On the other hand, +the amount of data transferred will decrease; this factor can reduce +the total migration time. If the process of the compression is quick +enough, then the total migration time can be reduced, and multiple +thread compression can be used to accelerate the compression process. + +The decompression speed of Zlib is at least 4 times as quick as +compression, if the source and destination CPU have equal speed, +keeping the compression thread count 4 times the decompression +thread count can avoid resource waste. + +Compression level can be used to control the compression speed and the +compression ratio. High compression ratio will take more time, level 0 +stands for no compression, level 1 stands for the best compression +speed, and level 9 stands for the best compression ratio. Users can +select a level number between 0 and 9. + + +When to use the multiple thread compression in live migration +============================================================= +Compression of data will consume extra CPU cycles; so in a system with +high overhead of CPU, avoid using this feature. When the network +bandwidth is very limited and the CPU resource is adequate, use of +multiple thread compression will be very helpful. If both the CPU and +the network bandwidth are adequate, use of multiple thread compression +can still help to reduce the migration time. + +Performance +=========== +Test environment: + +CPU: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz +Socket Count: 2 +RAM: 128G +NIC: Intel I350 (10/100/1000Mbps) +Host OS: CentOS 7 64-bit +Guest OS: RHEL 6.5 64-bit +Parameter: qemu-system-x86_64 -enable-kvm -smp 4 -m 4096 + /share/ia32e_rhel6u5.qcow -monitor stdio + +There is no additional application is running on the guest when doing +the test. + + +Speed limit: 1000Gb/s +--------------------------------------------------------------- + | original | compress thread: 8 + | way | decompress thread: 2 + | | compression level: 1 +--------------------------------------------------------------- +total time(msec): | 3333 | 1833 +--------------------------------------------------------------- +downtime(msec): | 100 | 27 +--------------------------------------------------------------- +transferred ram(kB):| 363536 | 107819 +--------------------------------------------------------------- +throughput(mbps): | 893.73 | 482.22 +--------------------------------------------------------------- +total ram(kB): | 4211524 | 4211524 +--------------------------------------------------------------- + +There is an application running on the guest which write random numbers +to RAM block areas periodically. + +Speed limit: 1000Gb/s +--------------------------------------------------------------- + | original | compress thread: 8 + | way | decompress thread: 2 + | | compression level: 1 +--------------------------------------------------------------- +total time(msec): | 37369 | 15989 +--------------------------------------------------------------- +downtime(msec): | 337 | 173 +--------------------------------------------------------------- +transferred ram(kB):| 4274143 | 1699824 +--------------------------------------------------------------- +throughput(mbps): | 936.99 | 870.95 +--------------------------------------------------------------- +total ram(kB): | 4211524 | 4211524 +--------------------------------------------------------------- + +Usage +===== +1. Verify both the source and destination QEMU are able +to support the multiple thread compression migration: + {qemu} info_migrate_capabilities + {qemu} ... compress: off ... + +2. Activate compression on the source: + {qemu} migrate_set_capability compress on + +3. Set the compression thread count on source: + {qemu} migrate_set_parameter compress_threads 12 + +4. Set the compression level on the source: + {qemu} migrate_set_parameter compress_level 1 + +5. Set the decompression thread count on destination: + {qemu} migrate_set_parameter decompress_threads 3 + +6. Start outgoing migration: + {qemu} migrate -d tcp:destination.host:4444 + {qemu} info migrate + Capabilities: ... compress: on + ... + +The following are the default settings: + compress: off + compress_threads: 8 + decompress_threads: 2 + compress_level: 1 (which means best speed) + +So, only the first two steps are required to use the multiple +thread compression in migration. You can do more if the default +settings are not appropriate. + +TODO +==== +Some faster (de)compression method such as LZ4 and Quicklz can help +to reduce the CPU consumption when doing (de)compression. If using +these faster (de)compression method, less (de)compression threads +are needed when doing the migration. diff --git a/docs/multiseat.txt b/docs/multiseat.txt index b963665ef..ebf244693 100644 --- a/docs/multiseat.txt +++ b/docs/multiseat.txt @@ -2,8 +2,8 @@ multiseat howto (with some multihead coverage) ============================================== -host side ---------- +host devices +------------ First you must compile qemu with a user interface supporting multihead/multiseat and input event routing. Right now this @@ -41,6 +41,19 @@ The "display=video2" sets up the input routing. Any input coming from the window which belongs to the video.2 display adapter will be routed to these input devices. +Starting with qemu 2.4 and linux kernel 4.1 you can also use virtio +for the input devices, using this ... + + -device pci-bridge,addr=12.0,chassis_nr=2,id=head.2 \ + -device secondary-vga,bus=head.2,addr=02.0,id=video.2 \ + -device virtio-keyboard-pci,bus=head.2,addr=03.0,display=video.2 \ + -device virtio-tablet-pci,bus=head.2,addr=03.0,display=video.2 + +... instead of xhci and usb hid devices. + +host ui +------- + The sdl2 ui will start up with two windows, one for each display device. The gtk ui will start with a single window and each display in a separate tab. You can either simply switch tabs to switch heads, @@ -106,6 +119,26 @@ the devices attached to the seat. Background info is here: http://www.freedesktop.org/wiki/Software/systemd/multiseat/ + +guest side with pci-bridge-seat +------------------------------- + +Qemu version 2.4 and newer has a new pci-bridge-seat device which +can be used instead of pci-bridge. Just swap the device name in the +qemu command line above. The only difference between the two devices +is the pci id. We can match the pci id instead of the device path +with a nice generic rule now, which simplifies the guest +configuration: + + [root@fedora ~]# cat /etc/udev/rules.d/70-qemu-pci-bridge-seat.rules + SUBSYSTEM=="pci", ATTR{vendor}=="0x1b36", ATTR{device}=="0x000a", \ + TAG+="seat", ENV{ID_AUTOSEAT}="1" + +Patch with this rule has been submitted to upstream udev/systemd, was +accepted and and should be included in the next systemd release (222). +So, if your guest has this or a newer version, multiseat will work just +fine without any manual guest configuration. + Enjoy! -- diff --git a/docs/pci_expander_bridge.txt b/docs/pci_expander_bridge.txt new file mode 100644 index 000000000..d7913fb4a --- /dev/null +++ b/docs/pci_expander_bridge.txt @@ -0,0 +1,58 @@ +PCI EXPANDER BRIDGE (PXB) +========================= + +Description +=========== +PXB is a "light-weight" host bridge in the same PCI domain +as the main host bridge whose purpose is to enable +the main host bridge to support multiple PCI root buses. +It is implemented only for i440fx and can be placed only +on bus 0 (pci.0). + +As opposed to PCI-2-PCI bridge's secondary bus, PXB's bus +is a primary bus and can be associated with a NUMA node +(different from the main host bridge) allowing the guest OS +to recognize the proximity of a pass-through device to +other resources as RAM and CPUs. + +Usage +===== +A detailed command line would be: + +[qemu-bin + storage options] +-m 2G +-object memory-backend-ram,size=1024M,policy=bind,host-nodes=0,id=ram-node0 -numa node,nodeid=0,cpus=0,memdev=ram-node0 +-object memory-backend-ram,size=1024M,policy=bind,host-nodes=1,id=ram-node1 -numa node,nodeid=1,cpus=1,memdev=ram-node1 +-device pxb,id=bridge1,bus=pci.0,numa_node=1,bus_nr=4 -netdev user,id=nd-device e1000,bus=bridge1,addr=0x4,netdev=nd +-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8,bus=pci.0 -device e1000,bus=bridge2,addr=0x3 +-device pxb,id=bridge3,bus=pci.0,bus_nr=40,bus=pci.0 -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1 + +Here you have: + - 2 NUMA nodes for the guest, 0 and 1. (both mapped to the same NUMA node in host, but you can and should put it in different host NUMA nodes) + - a pxb host bridge attached to NUMA 1 with an e1000 behind it + - a pxb host bridge attached to NUMA 0 with an e1000 behind it + - a pxb host bridge not attached to any NUMA with a hard drive behind it. + +Limitations +=========== +Please observe that we specified the bus "pci.0" for the second and third pxb. +This is because when no bus is given, another pxb can be selected by QEMU as default bus, +however, PXBs can be placed only under the root bus. + +Implementation +============== +The PXB is composed by: +- HostBridge (TYPE_PXB_HOST) + The host bridge allows to register and query the PXB's rPCI root bus in QEMU. +- PXBDev(TYPE_PXB_DEVICE) + It is a regular PCI Device that resides on the piix host-bridge bus and its bus uses the same PCI domain. + However, the bus behind is exposed through ACPI as a primary PCI bus and starts a new PCI hierarchy. + The interrupts from devices behind the PXB are routed through this device the same as if it were a + PCI-2-PCI bridge. The _PRT follows the i440fx model. +- PCIBridgeDev(TYPE_PCI_BRIDGE_DEV) + Created automatically as part of init sequence. + When adding a device to PXB it is attached to the bridge for two reasons: + - Using the bridge will enable hotplug support + - All the devices behind the bridge will use bridge's IO/MEM windows compacting + the PCI address space. + diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt index 8313ba6af..61b5be47f 100644 --- a/docs/qapi-code-gen.txt +++ b/docs/qapi-code-gen.txt @@ -1,61 +1,193 @@ = How to use the QAPI code generator = -QAPI is a native C API within QEMU which provides management-level -functionality to internal/external users. For external -users/processes, this interface is made available by a JSON-based -QEMU Monitor protocol that is provided by the QMP server. - -To map QMP-defined interfaces to the native C QAPI implementations, -a JSON-based schema is used to define types and function -signatures, and a set of scripts is used to generate types/signatures, -and marshaling/dispatch code. The QEMU Guest Agent also uses these -scripts, paired with a separate schema, to generate -marshaling/dispatch code for the guest agent server running in the -guest. +Copyright IBM Corp. 2011 +Copyright (C) 2012-2015 Red Hat, Inc. -This document will describe how the schemas, scripts, and resulting -code are used. +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. +== Introduction == -== QMP/Guest agent schema == - -This file defines the types, commands, and events used by QMP. It should -fully describe the interface used by QMP. +QAPI is a native C API within QEMU which provides management-level +functionality to internal and external users. For external +users/processes, this interface is made available by a JSON-based wire +format for the QEMU Monitor Protocol (QMP) for controlling qemu, as +well as the QEMU Guest Agent (QGA) for communicating with the guest. +The remainder of this document uses "Client JSON Protocol" when +referring to the wire contents of a QMP or QGA connection. -This file is designed to be loosely based on JSON although it's technically -executable Python. While dictionaries are used, they are parsed as -OrderedDicts so that ordering is preserved. +To map Client JSON Protocol interfaces to the native C QAPI +implementations, a JSON-based schema is used to define types and +function signatures, and a set of scripts is used to generate types, +signatures, and marshaling/dispatch code. This document will describe +how the schemas, scripts, and resulting code are used. -There are two basic syntaxes used, type definitions and command definitions. -The first syntax defines a type and is represented by a dictionary. There are -three kinds of user-defined types that are supported: complex types, -enumeration types and union types. +== QMP/Guest agent schema == -Generally speaking, types definitions should always use CamelCase for the type -names. Command names should be all lower case with words separated by a hyphen. +A QAPI schema file is designed to be loosely based on JSON +(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style +and the use of comments; a QAPI schema file is then parsed by a python +code generation program. A valid QAPI schema consists of a series of +top-level expressions, with no commas between them. Where +dictionaries (JSON objects) are used, they are parsed as python +OrderedDicts so that ordering is preserved (for predictable layout of +generated C structs and parameter lists). Ordering doesn't matter +between top-level expressions or the keys within an expression, but +does matter within dictionary values for 'data' and 'returns' members +of a single expression. QAPI schema input is written using 'single +quotes' instead of JSON's "double quotes" (in contrast, Client JSON +Protocol uses no comments, and while input accepts 'single quotes' as +an extension, output is strict JSON using only "double quotes"). As +in JSON, trailing commas are not permitted in arrays or dictionaries. +Input must be ASCII (although QMP supports full Unicode strings, the +QAPI parser does not). At present, there is no place where a QAPI +schema requires the use of JSON numbers or null. + +Comments are allowed; anything between an unquoted # and the following +newline is ignored. Although there is not yet a documentation +generator, a form of stylized comments has developed for consistently +documenting details about an expression and when it was added to the +schema. The documentation is delimited between two lines of ##, then +the first line names the expression, an optional overview is provided, +then individual documentation about each member of 'data' is provided, +and finally, a 'Since: x.y.z' tag lists the release that introduced +the expression. Optional fields are tagged with the phrase +'#optional', often with their default value; and extensions added +after the expression was first released are also given a '(since +x.y.z)' comment. For example: + + ## + # @BlockStats: + # + # Statistics of a virtual block device or a block backing device. + # + # @device: #optional If the stats are for a virtual block device, the name + # corresponding to the virtual block device. + # + # @stats: A @BlockDeviceStats for the device. + # + # @parent: #optional This describes the file block device if it has one. + # + # @backing: #optional This describes the backing block device if it has one. + # (Since 2.0) + # + # Since: 0.14.0 + ## + { 'struct': 'BlockStats', + 'data': {'*device': 'str', 'stats': 'BlockDeviceStats', + '*parent': 'BlockStats', + '*backing': 'BlockStats'} } + +The schema sets up a series of types, as well as commands and events +that will use those types. Forward references are allowed: the parser +scans in two passes, where the first pass learns all type names, and +the second validates the schema and generates the code. This allows +the definition of complex structs that can have mutually recursive +types, and allows for indefinite nesting of Client JSON Protocol that +satisfies the schema. A type name should not be defined more than +once. It is permissible for the schema to contain additional types +not used by any commands or events in the Client JSON Protocol, for +the side effect of generated C code used internally. + +There are seven top-level expressions recognized by the parser: +'include', 'command', 'struct', 'enum', 'union', 'alternate', and +'event'. There are several groups of types: simple types (a number of +built-in types, such as 'int' and 'str'; as well as enumerations), +complex types (structs and two flavors of unions), and alternate types +(a choice between other types). The 'command' and 'event' expressions +can refer to existing types by name, or list an anonymous type as a +dictionary. Listing a type name inside an array refers to a +single-dimension array of that type; multi-dimension arrays are not +directly supported (although an array of a complex struct that +contains an array member is possible). + +Types, commands, and events share a common namespace. Therefore, +generally speaking, type definitions should always use CamelCase for +user-defined type names, while built-in types are lowercase. Type +definitions should not end in 'Kind', as this namespace is used for +creating implicit C enums for visiting union types. Command names, +and field names within a type, should be all lower case with words +separated by a hyphen. However, some existing older commands and +complex types use underscore; when extending such expressions, +consistency is preferred over blindly avoiding underscore. Event +names should be ALL_CAPS with words separated by underscore. The +special string '**' appears for some commands that manually perform +their own type checking rather than relying on the type-safe code +produced by the qapi code generators. + +Any name (command, event, type, field, or enum value) beginning with +"x-" is marked experimental, and may be withdrawn or changed +incompatibly in a future release. Downstream vendors may add +extensions; such extensions should begin with a prefix matching +"__RFQDN_" (for the reverse-fully-qualified-domain-name of the +vendor), even if the rest of the name uses dash (example: +__com.redhat_drive-mirror). Other than downstream extensions (with +leading underscore and the use of dots), all names should begin with a +letter, and contain only ASCII letters, digits, dash, and underscore. +It is okay to reuse names that match C keywords; the generator will +rename a field named "default" in the QAPI to "q_default" in the +generated C code. + +In the rest of this document, usage lines are given for each +expression type, with literal strings written in lower case and +placeholders written in capitals. If a literal string includes a +prefix of '*', that key/value pair can be omitted from the expression. +For example, a usage statement that includes '*base':STRUCT-NAME +means that an expression has an optional key 'base', which if present +must have a value that forms a struct name. + + +=== Built-in Types === + +The following types are built-in to the parser: + 'str' - arbitrary UTF-8 string + 'int' - 64-bit signed integer (although the C code may place further + restrictions on acceptable range) + 'number' - floating point number + 'bool' - JSON value of true or false + 'int8', 'int16', 'int32', 'int64' - like 'int', but enforce maximum + bit size + 'uint8', 'uint16', 'uint32', 'uint64' - unsigned counterparts + 'size' - like 'uint64', but allows scaled suffix from command line + visitor === Includes === +Usage: { 'include': STRING } + The QAPI schema definitions can be modularized using the 'include' directive: - { 'include': 'path/to/file.json'} + { 'include': 'path/to/file.json' } The directive is evaluated recursively, and include paths are relative to the -file using the directive. Multiple includes of the same file are safe. +file using the directive. Multiple includes of the same file are +safe. No other keys should appear in the expression, and the include +value should be a string. + +As a matter of style, it is a good idea to have all files be +self-contained, but at the moment, nothing prevents an included file +from making a forward reference to a type that is only introduced by +an outer file. The parser may be made stricter in the future to +prevent incomplete include files. -=== Complex types === +=== Struct types === -A complex type is a dictionary containing a single key whose value is a -dictionary. This corresponds to a struct in C or an Object in JSON. An -example of a complex type is: +Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME } - { 'type': 'MyType', +A struct is a dictionary containing a single 'data' key whose +value is a dictionary. This corresponds to a struct in C or an Object +in JSON. Each value of the 'data' dictionary must be the name of a +type, or a one-element array containing a type name. An example of a +struct is: + + { 'struct': 'MyType', 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } } -The use of '*' as a prefix to the name means the member is optional. +The use of '*' as a prefix to the name means the member is optional in +the corresponding JSON protocol usage. The default initialization value of an optional argument should not be changed between versions of QEMU unless the new default maintains backward @@ -84,13 +216,13 @@ A structure that is used in both input and output of various commands must consider the backwards compatibility constraints of both directions of use. -A complex type definition can specify another complex type as its base. +A struct definition can specify another struct as its base. In this case, the fields of the base type are included as top-level fields -of the new complex type's dictionary in the QMP wire format. An example -definition is: +of the new struct's dictionary in the Client JSON Protocol wire +format. An example definition is: - { 'type': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } - { 'type': 'BlockdevOptionsGenericCOWFormat', + { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } + { 'struct': 'BlockdevOptionsGenericCOWFormat', 'base': 'BlockdevOptionsGenericFormat', 'data': { '*backing': 'str' } } @@ -100,97 +232,158 @@ both fields like this: { "file": "/some/place/my-image", "backing": "/some/place/my-backing-file" } + === Enumeration types === -An enumeration type is a dictionary containing a single key whose value is a -list of strings. An example enumeration is: +Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING } + +An enumeration type is a dictionary containing a single 'data' key +whose value is a list of strings. An example enumeration is: { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] } +Nothing prevents an empty enumeration, although it is probably not +useful. The list of strings should be lower case; if an enum name +represents multiple words, use '-' between words. The string 'max' is +not allowed as an enum value, and values should not be repeated. + +The enumeration values are passed as strings over the Client JSON +Protocol, but are encoded as C enum integral values in generated code. +While the C code starts numbering at 0, it is better to use explicit +comparisons to enum values than implicit comparisons to 0; the C code +will also include a generated enum member ending in _MAX for tracking +the size of the enum, useful when using common functions for +converting between strings and enum values. Since the wire format +always passes by name, it is acceptable to reorder or add new +enumeration members in any location without breaking clients of Client +JSON Protocol; however, removing enum values would break +compatibility. For any struct that has a field that will only contain +a finite set of string values, using an enum type for that field is +better than open-coding the field to be type 'str'. + + === Union types === -Union types are used to let the user choose between several different data -types. A union type is defined using a dictionary as explained in the -following paragraphs. +Usage: { 'union': STRING, 'data': DICT } +or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME, + 'discriminator': ENUM-MEMBER-OF-BASE } +Union types are used to let the user choose between several different +variants for an object. There are two flavors: simple (no +discriminator or base), flat (both discriminator and base). A union +type is defined using a data dictionary as explained in the following +paragraphs. -A simple union type defines a mapping from discriminator values to data types -like in this example: +A simple union type defines a mapping from automatic discriminator +values to data types like in this example: - { 'type': 'FileOptions', 'data': { 'filename': 'str' } } - { 'type': 'Qcow2Options', + { 'struct': 'FileOptions', 'data': { 'filename': 'str' } } + { 'struct': 'Qcow2Options', 'data': { 'backing-file': 'str', 'lazy-refcounts': 'bool' } } { 'union': 'BlockdevOptions', 'data': { 'file': 'FileOptions', 'qcow2': 'Qcow2Options' } } -In the QMP wire format, a simple union is represented by a dictionary that -contains the 'type' field as a discriminator, and a 'data' field that is of the -specified data type corresponding to the discriminator value: +In the Client JSON Protocol, a simple union is represented by a +dictionary that contains the 'type' field as a discriminator, and a +'data' field that is of the specified data type corresponding to the +discriminator value, as in these examples: + { "type": "file", "data" : { "filename": "/some/place/my-image" } } { "type": "qcow2", "data" : { "backing-file": "/some/place/my-image", "lazy-refcounts": true } } +The generated C code uses a struct containing a union. Additionally, +an implicit C enum 'NameKind' is created, corresponding to the union +'Name', for accessing the various branches of the union. No branch of +the union can be named 'max', as this would collide with the implicit +enum. The value for each branch can be of any type. -A union definition can specify a complex type as its base. In this case, the -fields of the complex type are included as top-level fields of the union -dictionary in the QMP wire format. An example definition is: - { 'type': 'BlockdevCommonOptions', 'data': { 'readonly': 'bool' } } - { 'union': 'BlockdevOptions', - 'base': 'BlockdevCommonOptions', - 'data': { 'raw': 'RawOptions', - 'qcow2': 'Qcow2Options' } } +A flat union definition specifies a struct as its base, and +avoids nesting on the wire. All branches of the union must be +complex types, and the top-level fields of the union dictionary on +the wire will be combination of fields from both the base type and the +appropriate branch type (when merging two dictionaries, there must be +no keys in common). The 'discriminator' field must be the name of an +enum-typed member of the base struct. -And it looks like this on the wire: - - { "type": "qcow2", - "readonly": false, - "data" : { "backing-file": "/some/place/my-image", - "lazy-refcounts": true } } - - -Flat union types avoid the nesting on the wire. They are used whenever a -specific field of the base type is declared as the discriminator ('type' is -then no longer generated). The discriminator must be of enumeration type. -The above example can then be modified as follows: +The following example enhances the above simple union example by +adding a common field 'readonly', renaming the discriminator to +something more applicable, and reducing the number of {} required on +the wire: { 'enum': 'BlockdevDriver', 'data': [ 'raw', 'qcow2' ] } - { 'type': 'BlockdevCommonOptions', + { 'struct': 'BlockdevCommonOptions', 'data': { 'driver': 'BlockdevDriver', 'readonly': 'bool' } } { 'union': 'BlockdevOptions', 'base': 'BlockdevCommonOptions', 'discriminator': 'driver', - 'data': { 'raw': 'RawOptions', + 'data': { 'file': 'FileOptions', 'qcow2': 'Qcow2Options' } } -Resulting in this JSON object: +Resulting in these JSON objects: + + { "driver": "file", "readonly": true, + "filename": "/some/place/my-image" } + { "driver": "qcow2", "readonly": false, + "backing-file": "/some/place/my-image", "lazy-refcounts": true } + +Notice that in a flat union, the discriminator name is controlled by +the user, but because it must map to a base member with enum type, the +code generator can ensure that branches exist for all values of the +enum (although the order of the keys need not match the declaration of +the enum). In the resulting generated C data types, a flat union is +represented as a struct with the base member fields included directly, +and then a union of structures for each branch of the struct. + +A simple union can always be re-written as a flat union where the base +class has a single member named 'type', and where each branch of the +union has a struct with a single member named 'data'. That is, - { "driver": "qcow2", - "readonly": false, - "backing-file": "/some/place/my-image", - "lazy-refcounts": true } + { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } } +is identical on the wire to: -A special type of unions are anonymous unions. They don't form a dictionary in -the wire format but allow the direct use of different types in their place. As -they aren't structured, they don't have any explicit discriminator but use -the (QObject) data type of their value as an implicit discriminator. This means -that they are restricted to using only one discriminator value per QObject -type. For example, you cannot have two different complex types in an anonymous -union, or two different integer types. + { 'enum': 'Enum', 'data': ['one', 'two'] } + { 'struct': 'Base', 'data': { 'type': 'Enum' } } + { 'struct': 'Branch1', 'data': { 'data': 'str' } } + { 'struct': 'Branch2', 'data': { 'data': 'int' } } + { 'union': 'Flat': 'base': 'Base', 'discriminator': 'type', + 'data': { 'one': 'Branch1', 'two': 'Branch2' } } -Anonymous unions are declared using an empty dictionary as their discriminator. -The discriminator values never appear on the wire, they are only used in the -generated C code. Anonymous unions cannot have a base type. - { 'union': 'BlockRef', - 'discriminator': {}, +=== Alternate types === + +Usage: { 'alternate': STRING, 'data': DICT } + +An alternate type is one that allows a choice between two or more JSON +data types (string, integer, number, or object, but currently not +array) on the wire. The definition is similar to a simple union type, +where each branch of the union names a QAPI type. For example: + + { 'alternate': 'BlockRef', 'data': { 'definition': 'BlockdevOptions', 'reference': 'str' } } -This example allows using both of the following example objects: +Just like for a simple union, an implicit C enum 'NameKind' is created +to enumerate the branches for the alternate 'Name'. + +Unlike a union, the discriminator string is never passed on the wire +for the Client JSON Protocol. Instead, the value's JSON type serves +as an implicit discriminator, which in turn means that an alternate +can only express a choice between types represented differently in +JSON. If a branch is typed as the 'bool' built-in, the alternate +accepts true and false; if it is typed as any of the various numeric +built-ins, it accepts a JSON number; if it is typed as a 'str' +built-in or named enum type, it accepts a JSON string; and if it is +typed as a complex type (struct or union), it accepts a JSON object. +Two different complex types, for instance, aren't permitted, because +both are represented as a JSON object. + +The example alternate declaration above allows using both of the +following example objects: { "file": "my_existing_block_device_id" } { "file": { "driver": "file", @@ -200,23 +393,95 @@ This example allows using both of the following example objects: === Commands === -Commands are defined by using a list containing three members. The first -member is the command name, the second member is a dictionary containing -arguments, and the third member is the return type. - -An example command is: +Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, + '*returns': TYPE-NAME-OR-DICT, + '*gen': false, '*success-response': false } + +Commands are defined by using a dictionary containing several members, +where three members are most common. The 'command' member is a +mandatory string, and determines the "execute" value passed in a +Client JSON Protocol command exchange. + +The 'data' argument maps to the "arguments" dictionary passed in as +part of a Client JSON Protocol command. The 'data' member is optional +and defaults to {} (an empty dictionary). If present, it must be the +string name of a complex type, a one-element array containing the name +of a complex type, or a dictionary that declares an anonymous type +with the same semantics as a 'struct' expression, with one exception +noted below when 'gen' is used. + +The 'returns' member describes what will appear in the "return" field +of a Client JSON Protocol reply on successful completion of a command. +The member is optional from the command declaration; if absent, the +"return" field will be an empty dictionary. If 'returns' is present, +it must be the string name of a complex or built-in type, a +one-element array containing the name of a complex or built-in type, +or a dictionary that declares an anonymous type with the same +semantics as a 'struct' expression, with one exception noted below +when 'gen' is used. Although it is permitted to have the 'returns' +member name a built-in type or an array of built-in types, any command +that does this cannot be extended to return additional information in +the future; thus, new commands should strongly consider returning a +dictionary-based type or an array of dictionaries, even if the +dictionary only contains one field at the present. + +All commands in Client JSON Protocol use a dictionary to report +failure, with no way to specify that in QAPI. Where the error return +is different than the usual GenericError class in order to help the +client react differently to certain error conditions, it is worth +documenting this in the comments before the command declaration. + +Some example commands: + + { 'command': 'my-first-command', + 'data': { 'arg1': 'str', '*arg2': 'str' } } + { 'struct': 'MyType', 'data': { '*value': 'str' } } + { 'command': 'my-second-command', + 'returns': [ 'MyType' ] } + +which would validate this Client JSON Protocol transaction: + + => { "execute": "my-first-command", + "arguments": { "arg1": "hello" } } + <= { "return": { } } + => { "execute": "my-second-command" } + <= { "return": [ { "value": "one" }, { } ] } + +In rare cases, QAPI cannot express a type-safe representation of a +corresponding Client JSON Protocol command. In these cases, if the +command expression includes the key 'gen' with boolean value false, +then the 'data' or 'returns' member that intends to bypass generated +type-safety and do its own manual validation should use an inline +dictionary definition, with a value of '**' rather than a valid type +name for the keys that the generated code will not validate. Please +try to avoid adding new commands that rely on this, and instead use +type-safe unions. For an example of bypass usage: + + { 'command': 'netdev_add', + 'data': {'type': 'str', 'id': 'str', '*props': '**'}, + 'gen': false } + +Normally, the QAPI schema is used to describe synchronous exchanges, +where a response is expected. But in some cases, the action of a +command is expected to change state in a way that a successful +response is not possible (although the command will still return a +normal dictionary error on failure). When a successful reply is not +possible, the command expression should include the optional key +'success-response' with boolean value false. So far, only QGA makes +use of this field. - { 'command': 'my-command', - 'data': { 'arg1': 'str', '*arg2': 'str' }, - 'returns': 'str' } === Events === -Events are defined with the keyword 'event'. When 'data' is also specified, -additional info will be included in the event. Finally there will be C API -generated in qapi-event.h; when called by QEMU code, a message with timestamp -will be emitted on the wire. If timestamp is -1, it means failure to retrieve -host time. +Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT } + +Events are defined with the keyword 'event'. It is not allowed to +name an event 'MAX', since the generator also produces a C enumeration +of all event names with a generated _MAX value at the end. When +'data' is also specified, additional info will be included in the +event, with similar semantics to a 'struct' expression. Finally there +will be C API generated in qapi-event.h; when called by QEMU code, a +message with timestamp will be emitted on the wire. An example event is: @@ -234,9 +499,9 @@ Resulting in this JSON object: Schemas are fed into 3 scripts to generate all the code/files that, paired with the core QAPI libraries, comprise everything required to take JSON -commands read in by a QMP/guest agent server, unmarshal the arguments into +commands read in by a Client JSON Protocol server, unmarshal the arguments into the underlying C types, call into the corresponding C function, and map the -response back to a QMP/guest agent response to be returned to the user. +response back to a Client JSON Protocol response to be returned to the user. As an example, we'll use the following schema, which describes a single complex user-defined type (which will produce a C struct, along with a list @@ -245,7 +510,7 @@ case we want to accept/return a list of this type with a command), and a command which takes that type as a parameter and returns the same type: $ cat example-schema.json - { 'type': 'UserDefOne', + { 'struct': 'UserDefOne', 'data': { 'integer': 'int', 'string': 'str' } } { 'command': 'my-command', @@ -271,7 +536,7 @@ created code. Example: $ python scripts/qapi-types.py --output-dir="qapi-generated" \ - --prefix="example-" --input-file=example-schema.json + --prefix="example-" example-schema.json $ cat qapi-generated/example-qapi-types.c [Uninteresting stuff omitted...] @@ -311,7 +576,7 @@ Example: #ifndef EXAMPLE_QAPI_TYPES_H #define EXAMPLE_QAPI_TYPES_H -[Builtin types omitted...] +[Built-in types omitted...] typedef struct UserDefOne UserDefOne; @@ -324,7 +589,7 @@ Example: struct UserDefOneList *next; } UserDefOneList; -[Functions on builtin types omitted...] +[Functions on built-in types omitted...] struct UserDefOne { @@ -358,7 +623,7 @@ $(prefix)qapi-visit.h: declarations for previously mentioned visitor Example: $ python scripts/qapi-visit.py --output-dir="qapi-generated" - --prefix="example-" --input-file=example-schema.json + --prefix="example-" example-schema.json $ cat qapi-generated/example-qapi-visit.c [Uninteresting stuff omitted...] @@ -415,15 +680,13 @@ Example: out: error_propagate(errp, err); } - $ python scripts/qapi-commands.py --output-dir="qapi-generated" \ - --prefix="example-" --input-file=example-schema.json $ cat qapi-generated/example-qapi-visit.h [Uninteresting stuff omitted...] #ifndef EXAMPLE_QAPI_VISIT_H #define EXAMPLE_QAPI_VISIT_H -[Visitors for builtin types omitted...] +[Visitors for built-in types omitted...] void visit_type_UserDefOne(Visitor *m, UserDefOne **obj, const char *name, Error **errp); void visit_type_UserDefOneList(Visitor *m, UserDefOneList **obj, const char *name, Error **errp); @@ -450,7 +713,7 @@ $(prefix)qmp-commands.h: Function prototypes for the QMP commands Example: $ python scripts/qapi-commands.py --output-dir="qapi-generated" - --prefix="example-" --input-file=example-schema.json + --prefix="example-" example-schema.json $ cat qapi-generated/example-qmp-marshal.c [Uninteresting stuff omitted...] @@ -541,7 +804,7 @@ $(prefix)qapi-event.c - Implementation of functions to send an event Example: $ python scripts/qapi-event.py --output-dir="qapi-generated" - --prefix="example-" --input-file=example-schema.json + --prefix="example-" example-schema.json $ cat qapi-generated/example-qapi-event.c [Uninteresting stuff omitted...] diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index d759d1974..d92cc4833 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -31,21 +31,27 @@ Example: BLOCK_IMAGE_CORRUPTED --------------------- -Emitted when a disk image is being marked corrupt. +Emitted when a disk image is being marked corrupt. The image can be +identified by its device or node name. The 'device' field is always +present for compatibility reasons, but it can be empty ("") if the +image does not have a device name associated. Data: -- "device": Device name (json-string) -- "msg": Informative message (e.g., reason for the corruption) (json-string) -- "offset": If the corruption resulted from an image access, this is the access - offset into the image (json-int) -- "size": If the corruption resulted from an image access, this is the access - size (json-int) +- "device": Device name (json-string) +- "node-name": Node name (json-string, optional) +- "msg": Informative message (e.g., reason for the corruption) + (json-string) +- "offset": If the corruption resulted from an image access, this + is the host's access offset into the image + (json-int, optional) +- "size": If the corruption resulted from an image access, this + is the access size (json-int, optional) Example: { "event": "BLOCK_IMAGE_CORRUPTED", - "data": { "device": "ide0-hd0", + "data": { "device": "ide0-hd0", "node-name": "node0", "msg": "Prevented active L1 table overwrite", "offset": 196608, "size": 65536 }, "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } @@ -226,6 +232,23 @@ Example: { "event": "GUEST_PANICKED", "data": { "action": "pause" } } +MEM_UNPLUG_ERROR +-------------------- +Emitted when memory hot unplug error occurs. + +Data: + +- "device": device name (json-string) +- "msg": Informative message (e.g., reason for the error) (json-string) + +Example: + +{ "event": "MEM_UNPLUG_ERROR" + "data": { "device": "dimm1", + "msg": "acpi: device unplug for unsupported device" + }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + NIC_RX_FILTER_CHANGED --------------------- @@ -450,6 +473,20 @@ Example: { "timestamp": {"seconds": 1290688046, "microseconds": 417172}, "event": "SPICE_MIGRATE_COMPLETED" } +MIGRATION +--------- + +Emitted when a migration event happens + +Data: None. + + - "status": migration status + See MigrationStatus in ~/qapi-schema.json for possible values + +Example: + +{"timestamp": {"seconds": 1432121972, "microseconds": 744001}, + "event": "MIGRATION", "data": {"status": "completed"}} STOP ---- diff --git a/docs/qmp/qmp-spec.txt b/docs/qmp/qmp-spec.txt index 22568c644..4c28cd943 100644 --- a/docs/qmp/qmp-spec.txt +++ b/docs/qmp/qmp-spec.txt @@ -1,10 +1,21 @@ QEMU Machine Protocol Specification +0. About This Document +====================== + +Copyright (C) 2009-2015 Red Hat, Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + 1. Introduction =============== -This document specifies the QEMU Machine Protocol (QMP), a JSON-based protocol -which is available for applications to operate QEMU at the machine-level. +This document specifies the QEMU Machine Protocol (QMP), a JSON-based +protocol which is available for applications to operate QEMU at the +machine-level. It is also in use by the QEMU Guest Agent (QGA), which +is available for host applications to interact with the guest +operating system. 2. Protocol Specification ========================= @@ -18,14 +29,27 @@ following format: json-DATA-STRUCTURE-NAME -Where DATA-STRUCTURE-NAME is any valid JSON data structure, as defined by -the JSON standard: +Where DATA-STRUCTURE-NAME is any valid JSON data structure, as defined +by the JSON standard: + +http://www.ietf.org/rfc/rfc7159.txt -http://www.ietf.org/rfc/rfc4627.txt +The protocol is always encoded in UTF-8 except for synchronization +bytes (documented below); although thanks to json-string escape +sequences, the server will reply using only the strict ASCII subset. -For convenience, json-object members and json-array elements mentioned in -this document will be in a certain order. However, in real protocol usage -they can be in ANY order, thus no particular order should be assumed. +For convenience, json-object members mentioned in this document will +be in a certain order. However, in real protocol usage they can be in +ANY order, thus no particular order should be assumed. On the other +hand, use of json-array elements presumes that preserving order is +important unless specifically documented otherwise. Repeating a key +within a json-object gives unpredictable results. + +Also for convenience, the server will accept an extension of +'single-quoted' strings in place of the usual "double-quoted" +json-string, and both input forms of strings understand an additional +escape sequence of "\'" for a single quote. The server will only use +double quoting on output. 2.1 General Definitions ----------------------- @@ -52,7 +76,16 @@ The greeting message format is: - The "version" member contains the Server's version information (the format is the same of the query-version command) - The "capabilities" member specify the availability of features beyond the - baseline specification + baseline specification; the order of elements in this array has no + particular significance, so a client must search the entire array + when looking for a particular capability + +2.2.1 Capabilities +------------------ + +As of the date this document was last revised, no server or client +capability strings have been defined. + 2.3 Issuing Commands -------------------- @@ -65,10 +98,14 @@ The format for command execution is: - The "execute" member identifies the command to be executed by the Server - The "arguments" member is used to pass any arguments required for the - execution of the command, it is optional when no arguments are required + execution of the command, it is optional when no arguments are + required. Each command documents what contents will be considered + valid when handling the json-argument - The "id" member is a transaction identification associated with the command execution, it is optional and will be part of the response if - provided + provided. The "id" member can be any json-value, although most + clients merely use a json-number incremented for each successive + command 2.4 Commands Responses ---------------------- @@ -81,13 +118,15 @@ of a command execution: success or error. The format of a success response is: -{ "return": json-object, "id": json-value } +{ "return": json-value, "id": json-value } Where, -- The "return" member contains the command returned data, which is defined - in a per-command basis or an empty json-object if the command does not - return data +- The "return" member contains the data returned by the command, which + is defined on a per-command basis (usually a json-object or + json-array of json-objects, but sometimes a json-number, json-string, + or json-array of json-strings); it is an empty json-object if the + command does not return data - The "id" member contains the transaction identification associated with the command execution if issued by the Client @@ -114,7 +153,8 @@ if provided by the client. ----------------------- As a result of state changes, the Server may send messages unilaterally -to the Client at any time. They are called "asynchronous events". +to the Client at any time, when not in the middle of any other +response. They are called "asynchronous events". The format of asynchronous events is: @@ -126,13 +166,27 @@ The format of asynchronous events is: - The "event" member contains the event's name - The "data" member contains event specific data, which is defined in a per-event basis, it is optional -- The "timestamp" member contains the exact time of when the event occurred - in the Server. It is a fixed json-object with time in seconds and - microseconds +- The "timestamp" member contains the exact time of when the event + occurred in the Server. It is a fixed json-object with time in + seconds and microseconds relative to the Unix Epoch (1 Jan 1970); if + there is a failure to retrieve host time, both members of the + timestamp will be set to -1. For a listing of supported asynchronous events, please, refer to the qmp-events.txt file. +2.5 QGA Synchronization +----------------------- + +When using QGA, an additional synchronization feature is built into +the protocol. If the Client sends a raw 0xFF sentinel byte (not valid +JSON), then the Server will reset its state and discard all pending +data prior to the sentinel. Conversely, if the Client makes use of +the 'guest-sync-delimited' command, the Server will send a raw 0xFF +sentinel byte prior to its response, to aid the Client in discarding +any data prior to the sentinel. + + 3. QMP Examples =============== @@ -145,32 +199,37 @@ This section provides some examples of real QMP usage, in all of them S: { "QMP": { "version": { "qemu": { "micro": 50, "minor": 6, "major": 1 }, "package": ""}, "capabilities": []}} -3.2 Simple 'stop' execution +3.2 Client QMP negotiation +-------------------------- +C: { "execute": "qmp_capabilities" } +S: { "return": {}} + +3.3 Simple 'stop' execution --------------------------- C: { "execute": "stop" } S: { "return": {} } -3.3 KVM information +3.4 KVM information ------------------- C: { "execute": "query-kvm", "id": "example" } S: { "return": { "enabled": true, "present": true }, "id": "example"} -3.4 Parsing error +3.5 Parsing error ------------------ C: { "execute": } S: { "error": { "class": "GenericError", "desc": "Invalid JSON syntax" } } -3.5 Powerdown event +3.6 Powerdown event ------------------- S: { "timestamp": { "seconds": 1258551470, "microseconds": 802384 }, "event": "POWERDOWN" } 4. Capabilities Negotiation ----------------------------- +=========================== When a Client successfully establishes a connection, the Server is in Capabilities Negotiation mode. @@ -189,7 +248,7 @@ effect, all commands (except qmp_capabilities) are allowed and asynchronous messages are delivered. 5 Compatibility Considerations ------------------------------- +============================== All protocol changes or new features which modify the protocol format in an incompatible way are disabled by default and will be advertised by the @@ -213,12 +272,16 @@ However, Clients must not assume any particular: - Amount of errors generated by a command, that is, new errors can be added to any existing command in newer versions of the Server +Any command or field name beginning with "x-" is deemed experimental, +and may be withdrawn or changed in an incompatible manner in a future +release. + Of course, the Server does guarantee to send valid JSON. But apart from this, a Client should be "conservative in what they send, and liberal in what they accept". 6. Downstream extension of QMP ------------------------------- +============================== We recommend that downstream consumers of QEMU do *not* modify QMP. Management tools should be able to support both upstream and downstream diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt index 12909940c..3df3620ce 100644 --- a/docs/specs/acpi_mem_hotplug.txt +++ b/docs/specs/acpi_mem_hotplug.txt @@ -2,7 +2,7 @@ QEMU<->ACPI BIOS memory hotplug interface -------------------------------------- ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add -events. +and hot-remove events. Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access): --------------------------------------------------------------- @@ -19,7 +19,9 @@ Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access): 1: Device insert event, used to distinguish device for which no device check event to OSPM was issued. It's valid only when bit 1 is set. - 2-7: reserved and should be ignored by OSPM + 2: Device remove event, used to distinguish device for which + no device eject request to OSPM was issued. + 3-7: reserved and should be ignored by OSPM [0x15-0x17] reserved write access: @@ -31,14 +33,62 @@ Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access): [0xc-0x13] reserved, writes into it are ignored [0x14] Memory device control fields bits: - 0: reserved, OSPM must clear it before writing to register + 0: reserved, OSPM must clear it before writing to register. + Due to BUG in versions prior 2.4 that field isn't cleared + when other fields are written. Keep it reserved and don't + try to reuse it. 1: if set to 1 clears device insert event, set by OSPM after it has emitted device check event for the selected memory device - 2-7: reserved, OSPM must clear them before writing to register + 2: if set to 1 clears device remove event, set by OSPM + after it has emitted device eject request for the + selected memory device + 3: if set to 1 initiates device eject, set by OSPM when it + triggers memory device removal and calls _EJ0 method + 4-7: reserved, OSPM must clear them before writing to register Selecting memory device slot beyond present range has no effect on platform: - write accesses to memory hot-plug registers not documented above are ignored - read accesses to memory hot-plug registers not documented above return all bits set to 1. + +Memory hot remove process diagram: +---------------------------------- + +-------------+ +-----------------------+ +------------------+ + | 1. QEMU | | 2. QEMU | |3. QEMU | + | device_del +---->+ device unplug request +----->+Send SCI to guest,| + | | | cb | |return control to | + +-------------+ +-----------------------+ |management | + +------------------+ + + +---------------------------------------------------------------------+ + + +---------------------+ +-------------------------+ + | OSPM: | remove event | OSPM: | + | send Eject Request, | | Scan memory devices | + | clear remove event +<-------------+ for event flags | + | | | | + +---------------------+ +-------------------------+ + | + | + +---------v--------+ +-----------------------+ + | Guest OS: | success | OSPM: | + | process Ejection +----------->+ Execute _EJ0 method, | + | request | | set eject bit in flags| + +------------------+ +-----------------------+ + |failure | + v v + +------------------------+ +-----------------------+ + | OSPM: | | QEMU: | + | set OST event & status | | call device unplug cb | + | fields | | | + +------------------------+ +-----------------------+ + | | + v v + +------------------+ +-------------------+ + |QEMU: | |QEMU: | + |Send OST QMP event| |Send device deleted| + | | |QMP event | + +------------------+ | | + +-------------------+ diff --git a/docs/specs/fw_cfg.txt b/docs/specs/fw_cfg.txt index 6accd924b..74351dd18 100644 --- a/docs/specs/fw_cfg.txt +++ b/docs/specs/fw_cfg.txt @@ -203,3 +203,24 @@ completes fully overwriting the item's data. NOTE: This function is deprecated, and will be completely removed starting with QEMU v2.4. + +== Externally Provided Items == + +As of v2.4, "file" fw_cfg items (i.e., items with selector keys above +FW_CFG_FILE_FIRST, and with a corresponding entry in the fw_cfg file +directory structure) may be inserted via the QEMU command line, using +the following syntax: + + -fw_cfg [name=]<item_name>,file=<path> + +where <item_name> is the fw_cfg item name, and <path> is the location +on the host file system of a file containing the data to be inserted. + +NOTE: Users *SHOULD* choose item names beginning with the prefix "opt/" +when using the "-fw_cfg" command line option, to avoid conflicting with +item names used internally by QEMU. For instance: + + -fw_cfg name=opt/my_item_name,file=./my_blob.bin + +Similarly, QEMU developers *SHOULD NOT* use item names prefixed with +"opt/" when inserting items programmatically, e.g. via fw_cfg_add_file(). diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt index c6732fe00..0adcb89aa 100644 --- a/docs/specs/pci-ids.txt +++ b/docs/specs/pci-ids.txt @@ -45,7 +45,9 @@ PCI devices (other than virtio): 1b36:0003 PCI Dual-port 16550A adapter (docs/specs/pci-serial.txt) 1b36:0004 PCI Quad-port 16550A adapter (docs/specs/pci-serial.txt) 1b36:0005 PCI test device (docs/specs/pci-testdev.txt) +1b36:0006 PCI Rocker Ethernet switch device 1b36:0007 PCI SD Card Host Controller Interface (SDHCI) +1b36:000a PCI-PCI bridge (multiseat) All these devices are documented in docs/specs. diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt new file mode 100644 index 000000000..46e07196b --- /dev/null +++ b/docs/specs/ppc-spapr-hotplug.txt @@ -0,0 +1,305 @@ += sPAPR Dynamic Reconfiguration = + +sPAPR/"pseries" guests make use of a facility called dynamic-reconfiguration +to handle hotplugging of dynamic "physical" resources like PCI cards, or +"logical"/paravirtual resources like memory, CPUs, and "physical" +host-bridges, which are generally managed by the host/hypervisor and provided +to guests as virtualized resources. The specifics of dynamic-reconfiguration +are documented extensively in PAPR+ v2.7, Section 13.1. This document +provides a summary of that information as it applies to the implementation +within QEMU. + +== Dynamic-reconfiguration Connectors == + +To manage hotplug/unplug of these resources, a firmware abstraction known as +a Dynamic Resource Connector (DRC) is used to assign a particular dynamic +resource to the guest, and provide an interface for the guest to manage +configuration/removal of the resource associated with it. + +== Device-tree description of DRCs == + +A set of 4 Open Firmware device tree array properties are used to describe +the name/index/power-domain/type of each DRC allocated to a guest at +boot-time. There may be multiple sets of these arrays, rooted at different +paths in the device tree depending on the type of resource the DRCs manage. + +In some cases, the DRCs themselves may be provided by a dynamic resource, +such as the DRCs managing PCI slots on a hotplugged PHB. In this case the +arrays would be fetched as part of the device tree retrieval interfaces +for hotplugged resources described under "Guest->Host interface". + +The array properties are described below. Each entry/element in an array +describes the DRC identified by the element in the corresponding position +of ibm,drc-indexes: + +ibm,drc-names: + first 4-bytes: BE-encoded integer denoting the number of entries + each entry: a NULL-terminated <name> string encoded as a byte array + + <name> values for logical/virtual resources are defined in PAPR+ v2.7, + Section 13.5.2.4, and basically consist of the type of the resource + followed by a space and a numerical value that's unique across resources + of that type. + + <name> values for "physical" resources such as PCI or VIO devices are + defined as being "location codes", which are the "location labels" of + each encapsulating device, starting from the chassis down to the + individual slot for the device, concatenated by a hyphen. This provides + a mapping of resources to a physical location in a chassis for debugging + purposes. For QEMU, this mapping is less important, so we assign a + location code that conforms to naming specifications, but is simply a + location label for the slot by itself to simplify the implementation. + The naming convention for location labels is documented in detail in + PAPR+ v2.7, Section 12.3.1.5, and in our case amounts to using "C<n>" + for PCI/VIO device slots, where <n> is unique across all PCI/VIO + device slots. + +ibm,drc-indexes: + first 4-bytes: BE-encoded integer denoting the number of entries + each 4-byte entry: BE-encoded <index> integer that is unique across all DRCs + in the machine + + <index> is arbitrary, but in the case of QEMU we try to maintain the + convention used to assign them to pSeries guests on pHyp: + + bit[31:28]: integer encoding of <type>, where <type> is: + 1 for CPU resource + 2 for PHB resource + 3 for VIO resource + 4 for PCI resource + 8 for Memory resource + bit[27:0]: integer encoding of <id>, where <id> is unique across + all resources of specified type + +ibm,drc-power-domains: + first 4-bytes: BE-encoded integer denoting the number of entries + each 4-byte entry: 32-bit, BE-encoded <index> integer that specifies the + power domain the resource will be assigned to. In the case of QEMU + we associated all resources with a "live insertion" domain, where the + power is assumed to be managed automatically. The integer value for + this domain is a special value of -1. + + +ibm,drc-types: + first 4-bytes: BE-encoded integer denoting the number of entries + each entry: a NULL-terminated <type> string encoded as a byte array + + <type> is assigned as follows: + "CPU" for a CPU + "PHB" for a physical host-bridge + "SLOT" for a VIO slot + "28" for a PCI slot + "MEM" for memory resource + +== Guest->Host interface to manage dynamic resources == + +Each DRC is given a globally unique DRC Index, and resources associated with +a particular DRC are configured/managed by the guest via a number of RTAS +calls which reference individual DRCs based on the DRC index. This can be +considered the guest->host interface. + +rtas-set-power-level: + arg[0]: integer identifying power domain + arg[1]: new power level for the domain, 0-100 + output[0]: status, 0 on success + output[1]: power level after command + + Set the power level for a specified power domain + +rtas-get-power-level: + arg[0]: integer identifying power domain + output[0]: status, 0 on success + output[1]: current power level + + Get the power level for a specified power domain + +rtas-set-indicator: + arg[0]: integer identifying sensor/indicator type + arg[1]: index of sensor, for DR-related sensors this is generally the + DRC index + arg[2]: desired sensor value + output[0]: status, 0 on success + + Set the state of an indicator or sensor. For the purpose of this document we + focus on the indicator/sensor types associated with a DRC. The types are: + + 9001: isolation-state, controls/indicates whether a device has been made + accessible to a guest + + supported sensor values: + 0: isolate, device is made unaccessible by guest OS + 1: unisolate, device is made available to guest OS + + 9002: dr-indicator, controls "visual" indicator associated with device + + supported sensor values: + 0: inactive, resource may be safely removed + 1: active, resource is in use and cannot be safely removed + 2: identify, used to visually identify slot for interactive hotplug + 3: action, in most cases, used in the same manner as identify + + 9003: allocation-state, generally only used for "logical" DR resources to + request the allocation/deallocation of a resource prior to acquiring + it via isolation-state->unisolate, or after releasing it via + isolation-state->isolate, respectively. for "physical" DR (like PCI + hotplug/unplug) the pre-allocation of the resource is implied and + this sensor is unused. + + supported sensor values: + 0: unusable, tell firmware/system the resource can be + unallocated/reclaimed and added back to the system resource pool + 1: usable, request the resource be allocated/reserved for use by + guest OS + 2: exchange, used to allocate a spare resource to use for fail-over + in certain situations. unused in QEMU + 3: recover, used to reclaim a previously allocated resource that's + not currently allocated to the guest OS. unused in QEMU + +rtas-get-sensor-state: + arg[0]: integer identifying sensor/indicator type + arg[1]: index of sensor, for DR-related sensors this is generally the + DRC index + output[0]: status, 0 on success + + Used to read an indicator or sensor value. + + For DR-related operations, the only noteworthy sensor is dr-entity-sense, + which has a type value of 9003, as allocation-state does in the case of + rtas-set-indicator. The semantics/encodings of the sensor values are distinct + however: + + supported sensor values for dr-entity-sense (9003) sensor: + 0: empty, + for physical resources: DRC/slot is empty + for logical resources: unused + 1: present, + for physical resources: DRC/slot is populated with a device/resource + for logical resources: resource has been allocated to the DRC + 2: unusable, + for physical resources: unused + for logical resources: DRC has no resource allocated to it + 3: exchange, + for physical resources: unused + for logical resources: resource available for exchange (see + allocation-state sensor semantics above) + 4: recovery, + for physical resources: unused + for logical resources: resource available for recovery (see + allocation-state sensor semantics above) + +rtas-ibm-configure-connector: + arg[0]: guest physical address of 4096-byte work area buffer + arg[1]: 0, or address of additional 4096-byte work area buffer. only non-zero + if a prior RTAS response indicated a need for additional memory + output[0]: status: + 0: completed transmittal of device-tree node + 1: instruct guest to prepare for next DT sibling node + 2: instruct guest to prepare for next DT child node + 3: instruct guest to prepare for next DT property + 4: instruct guest to ascend to parent DT node + 5: instruct guest to provide additional work-area buffer + via arg[1] + 990x: instruct guest that operation took too long and to try + again later + + Used to fetch an OF device-tree description of the resource associated with + a particular DRC. The DRC index is encoded in the first 4-bytes of the first + work area buffer. + + Work area layout, using 4-byte offsets: + wa[0]: DRC index of the DRC to fetch device-tree nodes from + wa[1]: 0 (hard-coded) + wa[2]: for next-sibling/next-child response: + wa offset of null-terminated string denoting the new node's name + for next-property response: + wa offset of null-terminated string denoting new property's name + wa[3]: for next-property response (unused otherwise): + byte-length of new property's value + wa[4]: for next-property response (unused otherwise): + new property's value, encoded as an OFDT-compatible byte array + +== hotplug/unplug events == + +For most DR operations, the hypervisor will issue host->guest add/remove events +using the EPOW/check-exception notification framework, where the host issues a +check-exception interrupt, then provides an RTAS event log via an +rtas-check-exception call issued by the guest in response. This framework is +documented by PAPR+ v2.7, and already use in by QEMU for generating powerdown +requests via EPOW events. + +For DR, this framework has been extended to include hotplug events, which were +previously unneeded due to direct manipulation of DR-related guest userspace +tools by host-level management such as an HMC. This level of management is not +applicable to PowerKVM, hence the reason for extending the notification +framework to support hotplug events. + +Note that these events are not yet formally part of the PAPR+ specification, +but support for this format has already been implemented in DR-related +guest tools such as powerpc-utils/librtas, as well as kernel patches that have +been submitted to handle in-kernel processing of memory/cpu-related hotplug +events[1], and is planned for formal inclusion is PAPR+ specification. The +hotplug-specific payload is QEMU implemented as follows (with all values +encoded in big-endian format): + +struct rtas_event_log_v6_hp { +#define SECTION_ID_HOTPLUG 0x4850 /* HP */ + struct section_header { + uint16_t section_id; /* set to SECTION_ID_HOTPLUG */ + uint16_t section_length; /* sizeof(rtas_event_log_v6_hp), + * plus the length of the DRC name + * if a DRC name identifier is + * specified for hotplug_identifier + */ + uint8_t section_version; /* version 1 */ + uint8_t section_subtype; /* unused */ + uint16_t creator_component_id; /* unused */ + } hdr; +#define RTAS_LOG_V6_HP_TYPE_CPU 1 +#define RTAS_LOG_V6_HP_TYPE_MEMORY 2 +#define RTAS_LOG_V6_HP_TYPE_SLOT 3 +#define RTAS_LOG_V6_HP_TYPE_PHB 4 +#define RTAS_LOG_V6_HP_TYPE_PCI 5 + uint8_t hotplug_type; /* type of resource/device */ +#define RTAS_LOG_V6_HP_ACTION_ADD 1 +#define RTAS_LOG_V6_HP_ACTION_REMOVE 2 + uint8_t hotplug_action; /* action (add/remove) */ +#define RTAS_LOG_V6_HP_ID_DRC_NAME 1 +#define RTAS_LOG_V6_HP_ID_DRC_INDEX 2 +#define RTAS_LOG_V6_HP_ID_DRC_COUNT 3 + uint8_t hotplug_identifier; /* type of the resource identifier, + * which serves as the discriminator + * for the 'drc' union field below + */ + uint8_t reserved; + union { + uint32_t index; /* DRC index of resource to take action + * on + */ + uint32_t count; /* number of DR resources to take + * action on (guest chooses which) + */ + char name[1]; /* string representing the name of the + * DRC to take action on + */ + } drc; +} QEMU_PACKED; + +== ibm,lrdr-capacity == + +ibm,lrdr-capacity is a property in the /rtas device tree node that identifies +the dynamic reconfiguration capabilities of the guest. It consists of a triple +consisting of <phys>, <size> and <maxcpus>. + + <phys>, encoded in BE format represents the maximum address in bytes and + hence the maximum memory that can be allocated to the guest. + + <size>, encoded in BE format represents the size increments in which + memory can be hot-plugged to the guest. + + <maxcpus>, a BE-encoded integer, represents the maximum number of + processors that the guest can have. + +pseries guests use this property to note the maximum allowed CPUs for the +guest. + +[1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867 diff --git a/docs/specs/rocker.txt b/docs/specs/rocker.txt new file mode 100644 index 000000000..1c743515c --- /dev/null +++ b/docs/specs/rocker.txt @@ -0,0 +1,1014 @@ +Rocker Network Switch Register Programming Guide +Copyright (c) Scott Feldman <sfeldma@gmail.com> +Copyright (c) Neil Horman <nhorman@tuxdriver.com> +Version 0.11, 12/29/2014 + +LICENSE +======= + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +SECTION 1: Introduction +======================= + +Overview +-------- + +This document describes the hardware/software interface for the Rocker switch +device. The intended audience is authors of OS drivers and device emulation +software. + +Notations and Conventions +------------------------- + +o In register descriptions, [n:m] indicates a range from bit n to bit m, +inclusive. +o Use of leading 0x indicates a hexadecimal number. +o Use of leading 0b indicates a binary number. +o The use of RSVD or Reserved indicates that a bit or field is reserved for +future use. +o Field width is in bytes, unless otherwise noted. +o Register are (R) read-only, (R/W) read/write, (W) write-only, or (COR) clear +on read +o TLV values in network-byte-order are designated with (N). + + +SECTION 2: PCI Configuration Registers +====================================== + +PCI Configuration Space +----------------------- + +Each switch instance registers as a PCI device with PCI configuration space: + + offset width description value + --------------------------------------------- + 0x0 2 Vendor ID 0x1b36 + 0x2 2 Device ID 0x0006 + 0x4 4 Command/Status + 0x8 1 Revision ID 0x01 + 0x9 3 Class code 0x2800 + 0xC 1 Cache line size + 0xD 1 Latency timer + 0xE 1 Header type + 0xF 1 Built-in self test + 0x10 4 Base address low + 0x14 4 Base address high + 0x18-28 Reserved + 0x2C 2 Subsystem vendor ID * + 0x2E 2 Subsystem ID * + 0x30-38 Reserved + 0x3C 1 Interrupt line + 0x3D 1 Interrupt pin 0x00 + 0x3E 1 Min grant 0x00 + 0x3D 1 Max latency 0x00 + 0x40 1 TRDY timeout + 0x41 1 Retry count + 0x42 2 Reserved + + +* Assigned by sub-system implementation + +SECTION 3: Memory-Mapped Register Space +======================================= + +There are two memory-mapped BARs. BAR0 maps device register space and is +0x2000 in size. BAR1 maps MSI-X vector and PBA tables and is also 0x2000 in +size, allowing for 256 MSI-X vectors. + +All registers are 4 or 8 bytes long. It is assumed host software will access 4 +byte registers with one 4-byte access, and 8 byte registers with either two +4-byte accesses or a single 8-byte access. In the case of two 4-byte accesses, +access must be lower and then upper 4-bytes, in that order. + +BAR0 device register space is organized as follows: + + offset description + ------------------------------------------------------ + 0x0000-0x000f Bogus registers to catch misbehaving + drivers. Writes do nothing. Reads + back as 0xDEADBABE. + 0x0010-0x00ff Test registers + 0x0300-0x03ff General purpose registers + 0x1000-0x1fff Descriptor control + +Holes in register space are reserved. Writes to reserved registers do nothing. +Reads to reserved registers read back as 0. + +No fancy stuff like write-combining is enabled on any of the registers. + +BAR1 MSI-X register space is organized as follows: + + offset description + ------------------------------------------------------ + 0x0000-0x0fff MSI-X vector table (256 vectors total) + 0x1000-0x1fff MSI-X PBA table + + +SECTION 4: Interrupts, DMA, and Endianness +========================================== + +PCI Interrupts +-------------- + +The device supports only MSI-X interrupts. BAR1 memory-mapped region contains +the MSI-X vector and PBA tables, with support for up to 256 MSI-X vectors. + +The vector assignment is: + + vector description + ----------------------------------------------------- + 0 Command descriptor ring completion + 1 Event descriptor ring completion + 2 Test operation completion + 3 RSVD + 4-255 Tx and Rx descriptor ring completion + Tx vector is even + Rx vector is odd + +A MSI-X vector table entry is 16 bytes: + + field offset width description + ------------------------------------------------------------- + lower_addr 0x0 4 [31:2] message address[31:2] + [1:0] Rsvd (4 byte alignment + required) + upper_addr 0x4 4 [31:19] Rsvd + [14:0] message address[46:32] + data 0x8 4 message data[31:0] + control 0xc 4 [31:1] Rsvd + [0] mask (0 = enable, + 1 = masked) + +Software should install the Interrupt Service Routine (ISR) before any ports +are enabled or any commands are issued on the command ring. + +DMA Operations +-------------- + +DMA operations are used for packet DMA to/from the CPU, command and event +processing. Command processing includes statistical counters and table dumps, +table insertion/deletion, and more. Event processing provides an async +notification method for device-originating events. Each DMA operation has a +set of control registers to manage a descriptor ring. The descriptor rings are +allocated from contiguous host DMA-able memory and registers specify the rings +base address, size and current head and tail indices. Software always writes +the head, and hardware always writes the tail. + +The higher-order bit of DMA_DESC_COMP_ERR is used to mark hardware completion +of a descriptor. Software will clear this bit when posting a descriptor to the +ring, and hardware will set this bit when the descriptor is complete. + +Descriptor ring sizes must be a power of 2 and range from 2 to 64K entries. +Descriptor rings' base address must be 8-byte aligned. Descriptors must be +packed within ring. Each descriptor in each ring must also be aligned on an 8 +byte boundary. Each descriptor ring will have these registers: + + DMA_DESC_xxx_BASE_ADDR, offset 0x1000 + (x * 32), 64-bit, (R/W) + DMA_DESC_xxx_SIZE, offset 0x1008 + (x * 32), 32-bit, (R/W) + DMA_DESC_xxx_HEAD, offset 0x100c + (x * 32), 32-bit, (R/W) + DMA_DESC_xxx_TAIL, offset 0x1010 + (x * 32), 32-bit, (R) + DMA_DESC_xxx_CTRL, offset 0x1014 + (x * 32), 32-bit, (W) + DMA_DESC_xxx_CREDITS, offset 0x1018 + (x * 32), 32-bit, (R/W) + DMA_DESC_xxx_RSVD1, offset 0x101c + (x * 32), 32-bit, (R/W) + +Where x is descriptor ring index: + + index ring + -------------------- + 0 CMD + 1 EVENT + 2 TX (port 0) + 3 RX (port 0) + 4 TX (port 1) + 5 RX (port 1) + . + . + . + 124 TX (port 61) + 125 RX (port 61) + 126 Resv + 127 Resv + +Writing BASE_ADDR or SIZE will reset HEAD and TAIL to zero. HEAD cannot be +written past TAIL. To do so would wrap the ring. An empty ring is when HEAD +== TAIL. A full ring is when HEAD is one position behind TAIL. Both HEAD and +TAIL increment and modulo wrap at the ring size. + +CTRL register bits: + + bit name description + ------------------------------------------------------------------------ + [0] CTRL_RESET Reset the descriptor ring + [1:31] Reserved + +All descriptor types share some common fields: + + field width description + ------------------------------------------------------------------- + DMA_DESC_BUF_ADDR 8 Phys addr of desc payload, 8-byte + aligned + DMA_DESC_COOKIE 8 Desc cookie for completion matching, + upper-most bit is reserved + DMA_DESC_BUF_SIZE 2 Desc payload size in bytes + DMA_DESC_TLV_SIZE 2 Desc payload total size in bytes + used for TLVs. Must be <= + DMA_DESC_BUF_SIZE. + DMA_DESC_COMP_ERR 2 Completion status of associated + desc payload. High order bit is + clear on new descs, toggled by + hw for completed items. + +To support forward- and backward-compatibility, descriptor and completion +payloads are specified in TLV format. Fields are packed with Type=field name, +Length=field length, and Value=field value. Software will ignore unknown fields +filled in by the switch. Likewise, the switch will ignore unknown fields +filled in by software. + +Descriptor payload buffer is 8-byte aligned and TLVs are 8-byte aligned. The +value within a TLV is also 8-byte aligned. The (packed, 8 byte) TLV header is: + + field width description + ----------------------------- + type 4 TLV type + len 2 TLV value length + pad 2 Reserved + +The alignment requirements for descriptors and TLVs are to avoid unaligned +access exceptions in software. Note that the payload for each TLV is also +8 byte aligned. + +Figure 1 shows an example descriptor buffer with two TLVs. + + <------- 8 bytes -------> + + 8-byte +––––+ +–––––––––––+–––––+–––––+ +–+ + align | type | len | pad | TLV#1 hdr | + +–––––––––––+–––––+–––––+ (len=22) | + | | | + | value | TVL#1 value | + | | (padded to 8-byte | + | +–––––+ alignment) | + | |/////| | + 8-byte +––––+ +–––––––––––+–––––––––––+ | + align | type | len | pad | TLV#2 hdr DESC_BUF_SIZE + +–––––+–––––+–––––+–––––+ (len=2) | + |value|/////////////////| TLV#2 value | + +–––––+/////////////////| | + |///////////////////////| | + |///////////////////////| | + |///////////////////////| | + |////////unused/////////| | + |////////space//////////| | + |///////////////////////| | + |///////////////////////| | + |///////////////////////| | + +–––––––––––––––––––––––+ +–+ + + fig. 1 + +TLVs can be nested within the NEST TLV type. + +Interrupt credits +^^^^^^^^^^^^^^^^^ + +MSI-X vectors used for descriptor ring completions use a credit mechanism for +efficient device, PCIe bus, OS and driver operations. Each descriptor ring has +a credit count which represents the number of outstanding descriptors to be +processed by the driver. As the device marks descriptors complete, the credit +count is incremented. As the driver processes those outstanding descriptors, +it returns credits back to the device. This way, the device knows the driver's +progress and can make decisions about when to fire the next interrupt or not. +When the credit count is zero, and the first descriptors are posted for the +driver, a single interrupt is fired. Once the interrupt is fired, the +interrupt is disabled (auto-masked*). In response to the interrupt, the driver +will process descriptors and PIO write a returned credit value for that +descriptor ring. If the driver returns all credits (the driver caught up with +the device and there is no outstanding work), then the interrupt is unmasked, +but not fired. If only partial credits are returned, the interrupt remains +masked but the device generates an interrupt, signaling the driver that more +outstanding work is available. + +(* this masking is unrelated to to the MSI-X interrupt mask register) + +Endianness +---------- + +Device registers are hard-coded to little-endian (LE). The driver should +convert to/from host endianess to LE for device register accesses. + +Descriptors are LE. Descriptor buffer TLVs will have LE type and length +fields, but the value field can either be LE or network-byte-order, depending +on context. TLV values containing network packet data will be in network-byte +order. A TLV value containing a field or mask used to compare against network +packet data is network-byte order. For example, flow match fields (and masks) +are network-byte-order since they're matched directly, byte-by-byte, against +network packet data. All non-network-packet TLV multi-byte values will be LE. + +TLV values in network-byte-order are designated with (N). + + +SECTION 5: Test Registers +========================= + +Rocker has several test registers to support troubleshooting register access, +interrupt generation, and DMA operations: + + TEST_REG, offset 0x0010, 32-bit (R/W) + TEST_REG64, offset 0x0018, 64-bit (R/W) + TEST_IRQ, offset 0x0020, 32-bit (R/W) + TEST_DMA_ADDR, offset 0x0028, 64-bit (R/W) + TEST_DMA_SIZE, offset 0x0030, 32-bit (R/W) + TEST_DMA_CTRL, offset 0x0034, 32-bit (R/W) + +Reads to TEST_REG and TEST_REG64 will read a value equal to twice the last +value written to the register. The 32-bit and 64-bit versions are for testing +32-bit and 64-bit host accesses. + +A vector can be written to TEST_IRQ and the device will generate an interrupt +for that vector. + +To test basic DMA operations, allocate a DMA-able host buffer and put the +buffer address into TEST_DMA_ADDR and size into TEST_DMA_SIZE. Then, write to +TEST_DMA_CTRL to manipulate the buffer contents. TEST_DMA_CTRL operations are: + + operation value description + ----------------------------------------------------------- + TEST_DMA_CTRL_CLEAR 1 clear buffer + TEST_DMA_CTRL_FILL 2 fill buffer bytes with 0x96 + TEST_DMA_CTRL_INVERT 4 invert bytes in buffer + +Various buffer address and sizes should be tested to verify no address boundary +issue exists. In particular, buffers that start on odd-8-byte boundary and/or +span multiple PAGE sizes should be tested. + + +SECTION 6: Ports +================ + +Physical and Logical Ports +------------------------------------ + +The switch supports up to 62 physical (front-panel) ports. Register +PORT_PHYS_COUNT returns the actual number of physical ports available: + + PORT_PHYS_COUNT, offset 0x0304, 32-bit, (R) + +In addition to front-panel ports, the switch supports logical ports for +tunnels. + +Front-panel ports and logical tunnel ports are mapped into a single 32-bit port +space. A special CPU port is assigned port 0. The front-panel ports are +mapped to ports 1-62. A special loopback port is assigned port 63. Logical +tunnel ports are assigned ports 0x0001000-0x0001ffff. +To summarize the port assignments: + + port mapping + ------------------------------------------------------- + 0 CPU port (for packets to/from host CPU) + 1-62 front-panel physical ports + 63 loopback port + 64-0x0000ffff RSVD + 0x00010000-0x0001ffff logical tunnel ports + 0x00020000-0xffffffff RSVD + +Physical Port Mode +------------------ + +Switch front-panel ports operate in a mode. Currently, the only mode is +OF-DPA. OF-DPA[1] mode is based on OpenFlow Data Plane Abstraction (OF-DPA) +Abstract Switch Specification, Version 1.0, from Broadcom Corporation. To +set/get the mode for front-panel ports, see port settings, below. + +Port Settings +------------- + +Link status for all front-panel ports is available via PORT_PHYS_LINK_STATUS: + + PORT_PHYS_LINK_STATUS, offset 0x0310, 64-bit, (R) + + Value is port bitmap. Bits 0 and 63 always read 0. Bits 1-62 + read 1 for link UP and 0 for link DOWN for respective front-panel ports. + +Other properties for front-panel ports are available via DMA CMD descriptors: + + Get PORT_SETTINGS descriptor: + + field width description + ---------------------------------------------- + PORT_SETTINGS 2 CMD_GET + PPORT 4 Physical port # + + Get PORT_SETTINGS completion: + + field width description + ---------------------------------------------- + PPORT 4 Physical port # + SPEED 4 Current port interface speed, in Mbps + DUPLEX 1 1 = Full, 0 = Half + AUTONEG 1 1 = enabled, 0 = disabled + MACADDR 6 Port MAC address + MODE 1 0 = OF-DPA + LEARNING 1 MAC address learning on port + 1 = enabled + 0 = disabled + PHYS_NAME <var> Physical port name (string) + + Set PORT_SETTINGS descriptor: + + field width description + ---------------------------------------------- + PORT_SETTINGS 2 CMD_SET + PPORT 4 Physical port # + SPEED 4 Port interface speed, in Mbps + DUPLEX 1 1 = Full, 0 = Half + AUTONEG 1 1 = enabled, 0 = disabled + MACADDR 6 Port MAC address + MODE 1 0 = OF-DPA + +Port Enable +----------- + +Front-panel ports are initially disabled, which means port ingress and egress +packets will be dropped. To enable or disable a port, use PORT_PHYS_ENABLE: + + PORT_PHYS_ENABLE: offset 0x0318, 64-bit, (R/W) + + Value is bitmap of first 64 ports. Bits 0 and 63 are ignored + and always read as 0. Write 1 to enable port; write 0 to disable it. + Default is 0. + + +SECTION 7: Switch Control +========================= + +This section covers switch-wide register settings. + +Control +------- + +This register is used for low level control of the switch. + + CONTROL: offset 0x0300, 32-bit, (W) + + bit name description + ------------------------------------------------------------------------ + [0] CONTROL_RESET If set, device will perform reset + [1:31] Reserved + +Switch ID +--------- + +The switch has a SWITCH_ID to be used by software to uniquely identify the +switch: + + SWITCH_ID: offset 0x0320, 64-bit, (R) + + Value is opaque to switch software and no special encoding is implied. + + +SECTION 8: Events +================= + +Non-I/O asynchronous events from the device are notified to the host using the +event ring. The TLV structure for events is: + + field width description + --------------------------------------------------- + TYPE 4 Event type, one of: + 1: LINK_CHANGED + 2: MAC_VLAN_SEEN + INFO <nest> Event info (details below) + +Link Changed Event +------------------ + +When link status changes on a physical port, this event is generated. + + field width description + --------------------------------------------------- + INFO <nest> + PPORT 4 Physical port + LINKUP 1 Link status: + 0: down + 1: up + +MAC VLAN Seen Event +------------------- + +When a packet ingresses on a port and the source MAC/VLAN isn't known to the +device, the device will generate this event. In response to the event, the +driver should install to the device the MAC/VLAN on the port into the bridge +table. Once installed, the MAC/VLAN is known on the port and this event will +no longer be generated. + + field width description + --------------------------------------------------- + INFO <nest> + PPORT 4 Physical port + MAC 6 MAC address + VLAN 2 VLAN ID + + +SECTION 9: CPU Packet Processing +================================ + +Ingress packets directed to the host CPU for further processing are delivered +in the DMA RX ring. Likewise, host CPU originating packets destined to egress +on switch ports are scheduled by software using the DMA TX ring. + +Tx Packet Processing +-------------------- + +Software schedules packets for egress on switch ports using the DMA TX ring. A +TX descriptor buffer describes the packet location and size in host DMA-able +memory, the destination port, and any hardware-offload functions (such as L3 +payload checksum offload). Software then bumps the descriptor head to signal +hardware of new Tx work. In response, hardware will DMA read Tx descriptors up +to head, DMA read descriptor buffer and packet data, perform offloading +functions, and finally frame packet on wire (network). Once packet processing +is complete, hardware will writeback status to descriptor(s) to signal to +software that Tx is complete and software resources (e.g. skb) backing packet +can be released. + +Figure 2 shows an example 3-fragment packet queued with one Tx descriptor. A +TLV is used for each packet fragment. + + pkt frag 1 + +–––––––+ +–+ + +–––+ | | + desc buf | | | | + +––––––––+ | | | | + Tx ring +–––+ +–––––+ | | | + +–––––––––+ | | TLVs | +–––––––+ | + | +–––+ +––––––––+ pkt frag 2 | + | desc 0 | | +–––––+ +–––––––+ | + +–––––––––+ | TLVs | +–––+ | | + head+–+ | +––––––––+ | | | + | desc 1 | | +–––––+ +–––––––+ |pkt + +–––––––––+ | TLVs | | | + | | +––––––––+ | pkt frag 3 | + | | | +–––––––+ | + +–––––––––+ +–––+ | | + | | | | | + | | | | | + +–––––––––+ | | | + | | | | | + | | | | | + +–––––––––+ | | | + | | +–––––––+ +–+ + | | + +–––––––––+ + + fig 2. + +The TLVs for Tx descriptor buffer are: + + field width description + --------------------------------------------------------------------- + PPORT 4 Destination physical port # + TX_OFFLOAD 1 Hardware offload modes: + 0: no offload + 1: insert IP csum (ipv4 only) + 2: insert TCP/UDP csum + 3: L3 csum calc and insert + into csum offset (TX_L3_CSUM_OFF) + 16-bit 1's complement csum value. + IPv4 pseudo-header and IP + already calculated by OS + and inserted. + 4: TSO (TCP Segmentation Offload) + TX_L3_CSUM_OFF 2 For L3 csum offload mode, the offset, + from the beginning of the packet, + of the csum field in the L3 header + TX_TSO_MSS 2 For TSO offload mode, the + Maximum Segment Size in bytes + TX_TSO_HDR_LEN 2 For TSO offload mode, the + length of ethernet, IP, and + TCP/UDP headers, including IP + and TCP options. + TX_FRAGS <array> Packet fragments + TX_FRAG <nest> Packet fragment + TX_FRAG_ADDR 8 DMA address of packet fragment + TX_FRAG_LEN 2 Packet fragment length + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR reason + -------------------------------------------------------------------- + 0 OK + -ROCKER_ENXIO address or data read err on desc buf or packet + fragment + -ROCKER_EINVAL bad pport or TSO or csum offloading error + -ROCKER_ENOMEM no memory for internal staging tx fragment + +Rx Packet Processing +-------------------- + +For packets ingressing on switch ports that are not forwarded by the switch but +rather directed to the host CPU for further processing are delivered in the DMA +RX ring. Rx descriptor buffers are allocated by software and placed on the +ring. Hardware will fill Rx descriptor buffers with packet data, write the +completion, and signal to software that a new packet is ready. Since Rx packet +size is not known a-priori, the Rx descriptor buffer must be allocated for +worst-case packet size. A single Rx descriptor will contain the entire Rx +packet data in one RX_FRAG. Other Rx TLVs describe and hardware offloads +performed on the packet, such as checksum validation. + +The TLVs for Rx descriptor buffer are: + + field width description + --------------------------------------------------- + PPORT 4 Source physical port # + RX_FLAGS 2 Packet parsing flags: + (1 << 0): IPv4 packet + (1 << 1): IPv6 packet + (1 << 2): csum calculated + (1 << 3): IPv4 csum good + (1 << 4): IP fragment + (1 << 5): TCP packet + (1 << 6): UDP packet + (1 << 7): TCP/UDP csum good + (1 << 8): Offload forward + RX_CSUM 2 IP calculated checksum: + IPv4: IP payload csum + IPv6: header and payload csum + (Only valid is RX_FLAGS:csum calc is set) + RX_FRAG_ADDR 8 DMA address of packet fragment + RX_FRAG_MAX_LEN 2 Packet maximum fragment length + RX_FRAG_LEN 2 Actual packet fragment length after receive + +Offload forward RX_FLAG indicates the device has already forwarded the packet +so the host CPU should not also forward the packet. + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR reason + -------------------------------------------------------------------- + 0 OK + -ROCKER_ENXIO address or data read err on desc buf + -ROCKER_ENOMEM no memory for internal staging desc buf + -ROCKER_EMSGSIZE Rx descriptor buffer wasn't big enough to contain + packet data TLV and other TLVs. + + +SECTION 10: OF-DPA Mode +====================== + +OF-DPA mode allows the switch to offload flow packet processing functions to +hardware. An OpenFlow controller would communicate with an OpenFlow agent +installed on the switch. The OpenFlow agent would (directly or indirectly) +communicate with the Rocker switch driver, which in turn would program switch +hardware with flow functionality, as defined in OF-DPA. The block diagram is: + + +–––––––––––––––----–––+ + | OF | + | Remote Controller | + +––––––––+––----–––––––+ + | + | + +––––––––+–––––––––+ + | OF | + | Local Agent | + +––––––––––––––––––+ + | | + | Rocker Driver | + +––––––––––––––––––+ + <this spec> + +––––––––––––––––––+ + | | + | Rocker Switch | + +––––––––––––––––––+ + +To participate in flow functions, ports must be configure for OF-DPA mode +during switch initialization. + +OF-DPA Flow Table Interface +--------------------------- + +There are commands to add, modify, delete, and get stats of flow table entries. +The commands are issued using the DMA CMD descriptor ring. The following +commands are defined: + + CMD_ADD: add an entry to flow table + CMD_MOD: modify an entry in flow table + CMD_DEL: delete an entry from flow table + CMD_GET_STATS: get stats for flow entry + +TLVs for add and modify commands are: + + field width description + ---------------------------------------------------- + OF_DPA_CMD 2 CMD_[ADD|MOD] + OF_DPA_TBL 2 Flow table ID + 0: ingress port + 10: vlan + 20: termination mac + 30: unicast routing + 40: multicast routing + 50: bridging + 60: ACL policy + OF_DPA_PRIORITY 4 Flow priority + OF_DPA_HARDTIME 4 Hard timeout for flow + OF_DPA_IDLETIME 4 Idle timeout for flow + OF_DPA_COOKIE 8 Cookie + +Additional TLVs based on flow table ID: + +Table ID 0: ingress port + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + +Table ID 10: vlan + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_VLAN_ID_MASK 2 (N) vlan ID mask + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_NEW_VLAN_ID 2 (N) new vlan ID + +Table ID 20: termination mac + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_IN_PPORT_MASK 4 ingress physical port number mask + OF_DPA_ETHERTYPE 2 (N) must be either 0x0800 or 0x86dd + OF_DPA_DST_MAC 6 (N) destination MAC + OF_DPA_DST_MAC_MASK 6 (N) destination MAC mask + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_VLAN_ID_MASK 2 (N) vlan ID mask + OF_DPA_GOTO_TBL 2 only acceptable values are + unicast or multicast routing + table IDs + OF_DPA_OUT_PPORT 2 if specified, must be + controller, set zero otherwise + +Table ID 30: unicast routing + + field width description + ---------------------------------------------------- + OF_DPA_ETHERTYPE 2 (N) must be either 0x0800 or 0x86dd + OF_DPA_DST_IP 4 (N) destination IPv4 address. + Must be unicast address + OF_DPA_DST_IP_MASK 4 (N) IP mask. Must be prefix mask + OF_DPA_DST_IPV6 16 (N) destination IPv6 address. + Must be unicast address + OF_DPA_DST_IPV6_MASK 16 (N) IPv6 mask. Must be prefix mask + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_GROUP_ID 4 data for GROUP action must + be an L3 Unicast group entry + +Table ID 40: multicast routing + + field width description + ---------------------------------------------------- + OF_DPA_ETHERTYPE 2 (N) must be either 0x0800 or 0x86dd + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_SRC_IP 4 (N) source IPv4. Optional, + can contain IPv4 address, + must be completely masked + if not used + OF_DPA_SRC_IP_MASK 4 (N) IP Mask + OF_DPA_DST_IP 4 (N) destination IPv4 address. + Must be multicast address + OF_DPA_SRC_IPV6 16 (N) source IPv6 Address. Optional. + Can contain IPv6 address, + must be completely masked + if not used + OF_DPA_SRC_IPV6_MASK 16 (N) IPv6 mask. + OF_DPA_DST_IPV6 16 (N) destination IPv6 Address. Must + be multicast address + Must be multicast address + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_GROUP_ID 4 data for GROUP action must + be an L3 multicast group entry + +Table ID 50: bridging + + field width description + ---------------------------------------------------- + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_TUNNEL_ID 4 tunnel ID + OF_DPA_DST_MAC 6 (N) destination MAC + OF_DPA_DST_MAC_MASK 6 (N) destination MAC mask + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_GROUP_ID 4 data for GROUP action must + be a L2 Interface, L2 + Multicast, L2 Flood, + or L2 Overlay group entry + as appropriate + OF_DPA_TUNNEL_LPORT 4 unicast Tenant Bridging + flows specify a tunnel + logical port ID + OF_DPA_OUT_PPORT 2 data for OUTPUT action, + restricted to CONTROLLER, + set to 0 otherwise + +Table ID 60: acl policy + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_IN_PPORT_MASK 4 ingress physical port number mask + OF_DPA_ETHERTYPE 2 (N) ethertype + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_VLAN_ID_MASK 2 (N) vlan ID mask + OF_DPA_VLAN_PCP 2 (N) vlan Priority Code Point + OF_DPA_VLAN_PCP_MASK 2 (N) vlan Priority Code Point mask + OF_DPA_SRC_MAC 6 (N) source MAC + OF_DPA_SRC_MAC_MASK 6 (N) source MAC mask + OF_DPA_DST_MAC 6 (N) destination MAC + OF_DPA_DST_MAC_MASK 6 (N) destination MAC mask + OF_DPA_TUNNEL_ID 4 tunnel ID + OF_DPA_SRC_IP 4 (N) source IPv4. Optional, + can contain IPv4 address, + must be completely masked + if not used + OF_DPA_SRC_IP_MASK 4 (N) IP Mask + OF_DPA_DST_IP 4 (N) destination IPv4 address. + Must be multicast address + OF_DPA_DST_IP_MASK 4 (N) IP Mask + OF_DPA_SRC_IPV6 16 (N) source IPv6 Address. Optional. + Can contain IPv6 address, + must be completely masked + if not used + OF_DPA_SRC_IPV6_MASK 16 (N) IPv6 mask + OF_DPA_DST_IPV6 16 (N) destination IPv6 Address. Must + be multicast address. + OF_DPA_DST_IPV6_MASK 16 (N) IPv6 mask + OF_DPA_SRC_ARP_IP 4 (N) source IPv4 address in the ARP + payload. Only used if ethertype + == 0x0806. + OF_DPA_SRC_ARP_IP_MASK 4 (N) IP Mask + OF_DPA_IP_PROTO 1 IP protocol + OF_DPA_IP_PROTO_MASK 1 IP protocol mask + OF_DPA_IP_DSCP 1 DSCP + OF_DPA_IP_DSCP_MASK 1 DSCP mask + OF_DPA_IP_ECN 1 ECN + OF_DPA_IP_ECN_MASK 1 ECN mask + OF_DPA_L4_SRC_PORT 2 (N) L4 source port, only for + TCP, UDP, or SCTP + OF_DPA_L4_SRC_PORT_MASK 2 (N) L4 source port mask + OF_DPA_L4_DST_PORT 2 (N) L4 source port, only for + TCP, UDP, or SCTP + OF_DPA_L4_DST_PORT_MASK 2 (N) L4 source port mask + OF_DPA_ICMP_TYPE 1 ICMP type, only if IP + protocol is 1 + OF_DPA_ICMP_TYPE_MASK 1 ICMP type mask + OF_DPA_ICMP_CODE 1 ICMP code + OF_DPA_ICMP_CODE_MASK 1 ICMP code mask + OF_DPA_IPV6_LABEL 4 (N) IPv6 flow label + OF_DPA_IPV6_LABEL_MASK 4 (N) IPv6 flow label mask + OF_DPA_GROUP_ID 4 data for GROUP action + OF_DPA_QUEUE_ID_ACTION 1 write the queue ID + OF_DPA_NEW_QUEUE_ID 1 queue ID + OF_DPA_VLAN_PCP_ACTION 1 write the VLAN priority + OF_DPA_NEW_VLAN_PCP 1 VLAN priority + OF_DPA_IP_DSCP_ACTION 1 write the DSCP + OF_DPA_NEW_IP_DSCP 1 new DSCP + OF_DPA_TUNNEL_LPORT 4 restrct to valid tunnel + logical port, set to 0 + otherwise. + OF_DPA_OUT_PPORT 2 data for OUTPUT action, + restricted to CONTROLLER, + set to 0 otherwise + OF_DPA_CLEAR_ACTIONS 4 if 1 packets matching flow are + dropped (all other instructions + ignored) + +TLVs for flow delete and get stats command are: + + field width description + --------------------------------------------------- + OF_DPA_CMD 2 CMD_[DEL|GET_STATS] + OF_DPA_COOKIE 8 Cookie + +On completion of get stats command, the descriptor buffer is written back with +the following TLVs: + + field width description + --------------------------------------------------- + OF_DPA_STAT_DURATION 4 Flow duration + OF_DPA_STAT_RX_PKTS 8 Received packets + OF_DPA_STAT_TX_PKTS 8 Transmit packets + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR command reason + -------------------------------------------------------------------- + 0 all OK + -ROCKER_EFAULT all head or tail index outside + of ring + -ROCKER_ENXIO all address or data read err on + desc buf + -ROCKER_EMSGSIZE GET_STATS cmd descriptor buffer wasn't + big enough to contain write-back + TLVs + -ROCKER_EINVAL all invalid parameters passed in + -ROCKER_EEXIST ADD entry already exists + -ROCKER_ENOSPC ADD no space left in flow table + -ROCKER_ENOENT MOD|DEL|GET_STATS cookie invalid + +Group Table Interface +--------------------- + +There are commands to add, modify, delete, and get stats of group table +entries. The commands are issued using the DMA CMD descriptor ring. The +following commands are defined: + + CMD_ADD: add an entry to group table + CMD_MOD: modify an entry in group table + CMD_DEL: delete an entry from group table + CMD_GET_STATS: get stats for group entry + +TLVs for add and modify commands are: + + field width description + ----------------------------------------------------------- + FLOW_GROUP_CMD 2 CMD_[ADD|MOD] + FLOW_GROUP_ID 2 Flow group ID + FLOW_GROUP_TYPE 1 Group type: + 0: L2 interface + 1: L2 rewrite + 2: L3 unicast + 3: L2 multicast + 4: L2 flood + 5: L3 interface + 6: L3 multicast + 7: L3 ECMP + 8: L2 overlay + FLOW_VLAN_ID 2 Vlan ID (types 0, 3, 4, 6) + FLOW_L2_PORT 2 Port (types 0) + FLOW_INDEX 4 Index (all types but 0) + FLOW_OVERLAY_TYPE 1 Overlay sub-type (type 8): + 0: Flood unicast tunnel + 1: Flood multicast tunnel + 2: Multicast unicast tunnel + 3: Multicast multicast tunnel + FLOW_GROUP_ACTION nest + FLOW_GROUP_ID 2 next group ID in chain (all + types except 0) + FLOW_OUT_PORT 4 egress port (types 0, 8) + FLOW_POP_VLAN_TAG 1 strip outer VLAN tag (type 1 + only) + FLOW_VLAN_ID 2 (types 1, 5) + FLOW_SRC_MAC 6 (types 1, 2, 5) + FLOW_DST_MAC 6 (types 1, 2) + +TLVs for flow delete and get stats command are: + + field width description + ----------------------------------------------------------- + FLOW_GROUP_CMD 2 CMD_[DEL|GET_STATS] + FLOW_GROUP_ID 2 Flow group ID + +On completion of get stats command, the descriptor buffer is written back with +the following TLVs: + + field width description + --------------------------------------------------- + FLOW_GROUP_ID 2 Flow group ID + FLOW_STAT_DURATION 4 Flow duration + FLOW_STAT_REF_COUNT 4 Flow reference count + FLOW_STAT_BUCKET_COUNT 4 Flow bucket count + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR command reason + -------------------------------------------------------------------- + 0 all OK + -ROCKER_EFAULT all head or tail index outside + of ring + -ROCKER_ENXIO all address or data read err on + desc buf + -ROCKER_ENOSPC GET_STATS cmd descriptor buffer wasn't + big enough to contain write-back + TLVs + -ROCKER_EINVAL ADD|MOD invalid parameters passed in + -ROCKER_EEXIST ADD entry already exists + -ROCKER_ENOSPC ADD no space left in flow table + -ROCKER_ENOENT MOD|DEL|GET_STATS group ID invalid + -ROCKER_EBUSY DEL group reference count non-zero + -ROCKER_ENODEV ADD next group ID doesn't exist + + + +References +========== + +[1] OpenFlow Data Plane Abstraction (OF-DPA) Abstract Switch Specification, +Version 1.0, from Broadcom Corporation, February 21, 2014. diff --git a/docs/writing-qmp-commands.txt b/docs/writing-qmp-commands.txt index f3df2066a..ab1fdd36b 100644 --- a/docs/writing-qmp-commands.txt +++ b/docs/writing-qmp-commands.txt @@ -598,7 +598,7 @@ stored in its "value" member. In our example, the "value" member is a pointer to an TimerAlarmMethod instance. Notice that the "current" variable is used as "true" only in the first -interation of the loop. That's because the alarm timer method in use is the +iteration of the loop. That's because the alarm timer method in use is the first element of the alarm_timers array. Also notice that QAPI lists are handled by hand and we return the head of the list. |